def main(argv=sys.argv[1:]):
    """Command-line program

    Count the features of each type in a GFF3 file and, for every feature
    type, tabulate the types of its parent features. The result is written
    as a tab-delimited table to the output file.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()])
    parser.add_argument("--exclude",
                        nargs="+",
                        default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile",
                        metavar="infile.gff",
                        type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile",
                        metavar="outfile.txt",
                        type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    excluded = set(args.exclude)

    # "-" selects standard input, which must not be closed by this function
    read_stdin = args.infile == "-"
    fin = sys.stdin if read_stdin else opener(args.infile)

    feature_counts = Counter()  # feature type -> number of features seen
    features_with_parents = []  # features carrying a "Parent" attribute
    feature_types = {}          # feature type -> Counter of parent categories
    name_type = {}              # feature name -> feature type

    printer.write("Opening %s..." % args.infile)
    try:
        reader = GFF3_Reader(fin, return_stopfeatures=False)
        for c, feature in enumerate(reader):
            if c % 10000 == 0:
                printer.write("Processed %s features..." % c)
            ftype = feature.attr["type"]
            fname = feature.get_name()
            if ftype in excluded:
                continue
            if ftype not in feature_types:
                feature_types[ftype] = Counter()
            feature_counts[ftype] += 1
            if fname is not None:
                name_type[fname] = ftype
            if "Parent" in feature.attr:
                # parent types can only be resolved once every feature
                # name has been seen, so defer these to a second pass
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1
    finally:
        if not read_stdin:
            fin.close()

    printer.write("Sorting parents...")
    for c, feature in enumerate(features_with_parents):
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)
        pnames = feature.attr["Parent"]
        parent_counts = feature_types[feature.attr["type"]]
        if not pnames:
            # Covers an empty string as well as an empty sequence of
            # parents; the latter would otherwise fail on pnames[0] below.
            parent_counts["parent unspecified"] += 1
        elif len(pnames) > 1:
            parent_counts["multiple parents"] += 1
        else:
            ptype = name_type.get(pnames[0], "parent not in database")
            parent_counts[ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + [
        "parent unspecified", "parent not in database", "multiple parents"
    ]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        fh.write("#feature_type\tcount\t" + "\t".join(cols) + "\n")
        for r in rows:
            fields = [r, feature_counts[r]]
            fields.extend(feature_types[r].get(col, 0) for col in cols)
            fh.write("\t".join(str(X) for X in fields) + "\n")

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Tally features in a GFF3 file by type, then report for each feature
    type how many of its features have parents of each type (or have no
    parent, several parents, or a parent not found in the file). Output
    is a tab-delimited table.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()])
    parser.add_argument("--exclude", nargs="+", default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile", metavar="infile.gff", type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile", metavar="outfile.txt", type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    excluded = set(args.exclude)

    # standard input ("-") is borrowed, not owned, so it is never closed here
    from_stdin = args.infile == "-"
    fin = sys.stdin if from_stdin else opener(args.infile)

    feature_counts = Counter()  # feature type -> total features of that type
    features_with_parents = []  # features that declare a "Parent" attribute
    feature_types = {}          # feature type -> Counter of parent categories
    name_type = {}              # feature name -> feature type

    printer.write("Opening %s..." % args.infile)
    try:
        for c, feature in enumerate(GFF3_Reader(fin, return_stopfeatures=False)):
            if c % 10000 == 0:
                printer.write("Processed %s features..." % c)
            ftype = feature.attr["type"]
            fname = feature.get_name()
            if ftype in excluded:
                continue
            if ftype not in feature_types:
                feature_types[ftype] = Counter()
            feature_counts[ftype] += 1
            if fname is not None:
                name_type[fname] = ftype
            if "Parent" in feature.attr:
                # resolved in a second pass, after all names are known
                features_with_parents.append(feature)
            else:
                feature_types[ftype]["parent unspecified"] += 1
    finally:
        if not from_stdin:
            fin.close()

    printer.write("Sorting parents...")
    for c, feature in enumerate(features_with_parents):
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)
        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if not pnames:
            # An empty string or an empty sequence both mean "no parent";
            # an empty sequence previously fell through to pnames[0].
            feature_types[ftype]["parent unspecified"] += 1
        elif len(pnames) > 1:
            feature_types[ftype]["multiple parents"] += 1
        else:
            ptype = name_type.get(pnames[0], "parent not in database")
            feature_types[ftype][ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + ["parent unspecified", "parent not in database", "multiple parents"]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        fh.write("#feature_type\tcount\t" + "\t".join(cols) + "\n")
        for r in rows:
            values = [r, feature_counts[r]]
            values += [feature_types[r].get(col, 0) for col in cols]
            fh.write("\t".join(str(X) for X in values) + "\n")

    printer.write("Done.")
Exemple #3
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Estimate a 5' P-site offset for each read length from metagene
    profiles of read counts around start codons. Writes a table of
    per-length metagene profiles, a table of P-site offsets, and a plot
    of the profiles annotated with the chosen offsets. Raw and normalized
    count matrices are also saved if ``--keep`` is given.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AlignmentParser(allow_mapping=False, input_choices=["BAM"],
                         disabled=["normalize", "big_genome"])
    bp = BaseParser()
    alignment_file_parser = ap.get_parser()
    base_parser = bp.get_parser()

    pp = PlottingParser()
    plotting_parser = pp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_parser, alignment_file_parser, plotting_parser])

    parser.add_argument("--min_counts", type=int, default=10, metavar="N",
                        help="Minimum counts required in normalization region "
                             "to be included in metagene average (Default: 10)")
    parser.add_argument("--normalize_over", type=int, nargs=2, metavar="N",
                        default=None,
                        help="Portion of each window against which its individual raw count profile"
                             " will be normalized. Specify two integers, in nucleotide"
                             " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)")
    parser.add_argument("--norm_region", type=int, nargs=2, metavar="N",
                        default=None,
                        help="Deprecated. Use ``--normalize_over`` instead. "
                             "Formerly, Portion of each window against which its individual raw count profile"
                             " will be normalized. Specify two integers, in nucleotide"
                             " distance, from 5' end of window. (Default: 70 100)")
    parser.add_argument("--require_upstream", default=False, action="store_true",
                        help="If supplied, the P-site offset is taken to be the distance "
                             "between the largest peak upstream of the start codon and "
                             "the start codon itself. Otherwise, the P-site offset is taken "
                             "to be the distance between the largest peak in the entire ROI "
                             "and the start codon. Ignored if ``--constrain`` is used.")
    parser.add_argument("--constrain", type=int, nargs=2, default=None, metavar="X",
                        help="Constrain P-site offset to be between specified distance from "
                             "start codon. Useful for noisy data. "
                             "(Reasonable set: 10 15; default: not constrained)")
    parser.add_argument("--aggregate", default=False, action="store_true",
                        help="Estimate P-site from aggregate reads at each position, instead "
                             "of median normalized read density. Noisier, but helpful for "
                             "lower-count data or read lengths with few counts. (Default: False)")
    parser.add_argument("--keep", default=False, action="store_true",
                        help="Save intermediate count files. Useful for additional computations (Default: False)")
    parser.add_argument("--default", type=int, default=13,
                        help="Default 5' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)")

    parser.add_argument("roi_file", type=str,
                        help="ROI file surrounding start codons, from ``metagene generate`` subprogram")

    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # Force 5' end mapping with no offset or nibble, regardless of what
    # was parsed; offsets are what this program is trying to estimate.
    args.mapping = "fiveprime"
    args.offset = 0
    args.nibble = 0

    # process arguments
    min_len = args.min_length
    max_len = args.max_length
    profiles = max_len + 1 - min_len
    lengths = list(range(min_len, max_len + 1))
    outbase = args.outbase
    title = "Fiveprime read offsets by length" if args.title is None else args.title

    pp.set_style_from_args(args)
    colors = pp.get_colors_from_args(args, profiles)

    printer.write("Opening ROI file %s ..." % args.roi_file)
    with opener(args.roi_file) as roi_fh:
        roi_table = pd.read_table(roi_fh, sep="\t", comment="#", index_col=None, header=0)

    printer.write("Opening count files %s ..." % ",".join(args.count_files))
    ga = ap.get_genome_array_from_args(args, printer=printer)

    # Remove default size filters. Iterate over a snapshot of the keys:
    # remove_filter() presumably deletes from ga._filters, and mutating a
    # dict while iterating its live key view raises RuntimeError.
    for f in list(ga._filters.keys()):
        ga.remove_filter(f)

    norm_start, norm_end = _get_norm_region(roi_table, args)

    # count
    count_dict, norm_count_dict, metagene_profile = do_count(roi_table,
                                                             ga,
                                                             norm_start,
                                                             norm_end,
                                                             args.min_counts,
                                                             min_len,
                                                             max_len,
                                                             aggregate=args.aggregate,
                                                             printer=printer)

    # save counts
    profile_fn = "%s_metagene_profiles.txt" % outbase
    with argsopener(profile_fn, args, "w") as metagene_out:
        metagene_profile.to_csv(metagene_out,
                                sep="\t",
                                header=True,
                                index=False,
                                na_rep="nan",
                                columns=["x"] + ["%s-mers" % X for X in lengths])

    if args.keep:
        printer.write("Saving raw and normalized counts ...")
        for k in count_dict:
            count_fn = "%s_%s_rawcounts.txt.gz" % (outbase, k)
            normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase, k)
            mask_fn = "%s_%s_mask.txt.gz" % (outbase, k)
            numpy.savetxt(count_fn, count_dict[k], delimiter="\t")
            numpy.savetxt(normcount_fn, norm_count_dict[k], delimiter="\t")
            numpy.savetxt(mask_fn, norm_count_dict[k].mask, delimiter="\t")

    # plotting & offsets
    printer.write("Plotting and determining offsets ...")
    offset_dict = OrderedDict()

    # Determine scaling factor for plotting metagene profiles
    max_y = numpy.nan
    with warnings.catch_warnings():
        # ignore warnings for slices that contain only NaNs
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for k in lengths:
            max_y = numpy.nanmax([max_y,
                                  numpy.nanmax(metagene_profile["%s-mers" % k].values)])

    # avoid dividing by zero or NaN when scaling profiles below
    if numpy.isnan(max_y) or max_y == 0:
        max_y = 1.0

    # parse arguments & set styles
    mplrc = matplotlib.rcParams
    plt_incr = 1.2  # vertical spacing between stacked profiles

    # use this figsize if not specified on command line
    figheight = 1.0 + 0.25 * (profiles - 1) + 0.75 * (profiles)
    default_figsize = (7.5, figheight)

    fig = pp.get_figure_from_args(args, figsize=default_figsize)

    ax = plt.gca()
    plt.title(title)
    plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
    if args.aggregate:
        plt.ylabel("Aggregate read counts (au)")
    else:
        plt.ylabel("Median normalized read density (au)")

    plt.axvline(0.0, color=mplrc["axes.edgecolor"], dashes=[3, 2])

    x = metagene_profile["x"].values
    xmin = x.min()
    xmax = x.max()

    # `mask` is True at positions that may NOT be chosen as the peak
    if args.constrain is not None:
        mask = numpy.tile(True, len(x))

        zp = (x == 0).argmax()  # index of the landmark (x == 0)
        l, r = args.constrain
        if l == r:
            warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." % l, ArgumentWarning)

        mindist = min(l, r)
        maxdist = max(l, r)

        # NOTE(review): if maxdist exceeds zp the slice start goes negative
        # and wraps from the end of the array -- confirm inputs prevent this.
        mask[zp - maxdist:zp - mindist + 1] = False
    elif args.require_upstream:
        mask = x >= 0
    else:
        mask = numpy.tile(False, len(x))

    # defined up front so the y-limit below is valid with no read lengths
    baseline = 0.0
    for n, k in enumerate(lengths):
        color = colors[n]
        baseline = plt_incr * n
        y = metagene_profile["%s-mers" % k].values
        ymask = numpy.ma.MaskedArray(y, mask=mask)

        if numpy.isnan(y).all():
            plot_y = numpy.zeros_like(x)
        elif not args.aggregate:
            plot_y = y / max_y
        else:
            plot_y = y.astype(float) / numpy.nanmax(y) * 0.9

        # plot metagene profiles on common scale, offset by baseline from bottom to top
        ax.plot(x, baseline + plot_y, color=color)
        ax.text(xmin, baseline, "%s-mers" % k,
                ha="left",
                va="bottom",
                color=color,
                transform=matplotlib.transforms.offset_copy(ax.transData, fig,
                                                            x=6.0, y=3.0, units="points"))

        ymax = baseline + numpy.nanmax(plot_y)

        # if all valid positions are nan, or if all valid positions are <= 0
        if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
            offset = args.default
            usedefault = True
        else:
            offset = -x[numpy.ma.argmax(ymask)]
            usedefault = False

        offset_dict[k] = offset
        if not usedefault:
            yadj = ymax - 0.2 * plt_incr

            ax.plot([-offset, 0], [yadj, yadj], color=color, dashes=[3, 2])
            ax.text(-offset / 2.0,
                    yadj,
                    "%s nt" % (offset),
                    color=color,
                    ha="center",
                    va="bottom",
                    transform=matplotlib.transforms.offset_copy(ax.transData, fig,
                                                                x=0.0, y=3.0, units="points"))

    plt.xlim(xmin, xmax)
    plt.ylim(-0.1, plt_incr + baseline)
    ax.yaxis.set_ticks([])

    # save data as p-site offset table
    fn = "%s_p_offsets.txt" % outbase
    with argsopener(fn, args) as fout:
        printer.write("Writing offset table to %s ..." % fn)
        fout.write("length\tp_offset\n")
        for k in offset_dict:
            fout.write("%s\t%s\n" % (k, offset_dict[k]))

        fout.write("default\t%s" % args.default)

    # save plot
    plot_fn = "%s_p_offsets.%s" % (outbase, args.figformat)
    printer.write("Saving plot to %s ..." % plot_fn)
    plt.savefig(plot_fn, dpi=args.dpi, bbox_inches="tight")

    printer.write("Done.")
def main(argv=sys.argv[1:], verbose=False):
    """Command-line program

    Read two tab-delimited tables and test whether they contain
    equivalent data, within a floating-point tolerance.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line

    verbose : bool, optional
        If `True`, return the comparison messages along with the exit
        code. Has the same effect as passing ``-v`` in `argv`.


    Returns
    -------
    int
        `0` if files are identical, `1` otherwise

    str
        Only returned if `verbose` is selected. String describing how
        tables are unequal (e.g. which columns failed, et c).

    Notes
    -----
    When the enclosing module is run as ``__main__``, this function does
    not return; it calls :py:func:`sys.exit` with the exit code instead.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("file1", type=str)
    parser.add_argument("file2", type=str)
    parser.add_argument("-v", dest="verbose", default=False,
                        action="store_true", help="Give verbose output")
    parser.add_argument(
        "--sort_keys", default=None, metavar="key", nargs="+",
        help="If specified, values will be sorted by the column(s) corresponding to these name or numbers (0-indexed) before comparison",
    )
    parser.add_argument("--exclude", type=str, default=[], nargs="+",
                        metavar="key",
                        help="Key or number (0-indexed) of columns to exclude")
    parser.add_argument("--no_header", default=False, action="store_true",
                        help="If specified, no header row is present. Columns "
                             "for all other command-line flags "
                             "must be referenced by number (starting at zero) "
                             "rather than name, and will be assumed to be in "
                             "the same order in both files.")
    parser.add_argument(
        "--tol", type=float, default=1e-8,
        help="Tolerance by which floats are allowed to differ (Default: 1e-8)",
    )

    args = parser.parse_args(argv)

    read_kwargs = dict(sep="\t", index_col=args.sort_keys, comment="#")
    excluded_cols = args.exclude

    if args.no_header is True:
        # without a header, columns can only be addressed by position
        header_row = None
        if args.sort_keys is not None:
            read_kwargs["index_col"] = [int(X) for X in args.sort_keys]
        excluded_cols = [int(X) for X in excluded_cols]
    else:
        header_row = 0

    frames = []
    for table_fn in (args.file1, args.file2):
        with opener(table_fn) as fh:
            frames.append(pd.read_table(fh, header=header_row, **read_kwargs))
    df1, df2 = frames

    if args.exclude:
        printer.write("Excluding columns %s: " % ", ".join(args.exclude))

    # drop excluded columns from whichever tables contain them
    for col in excluded_cols:
        for frame in (df1, df2):
            if col in frame:
                frame.pop(col)

    test_result, messages = test_dataframe_equality(df1,
                                                    df2,
                                                    printer=printer,
                                                    print_verbose=True,
                                                    return_verbose=True,
                                                    tol=args.tol)

    equivalent = test_result == True
    if equivalent:
        printer.write("Files contain equivalent data.")
    else:
        printer.write("Files non-equivalent.")
    exit_code = 0 if equivalent else 1

    if __name__ == "__main__":
        sys.exit(exit_code)

    if args.verbose == True or verbose == True:
        return exit_code, messages
    return exit_code
Exemple #5
0
def do_chart(args, plot_parser):
    """Produce a set of charts comparing multiple samples pairwise.

    Charts include MA plots and scatter plots with correlation
    coefficients for every requested region and metric, plus plots of
    correlation coefficients by count bin. Summary statistics are written
    to ``<outbase>_bigtable.txt`` and ``<outbase>_bintable.txt``.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``chart`` subprogram

    plot_parser : object
        Parser helper used for plot setup. Must provide
        `set_style_from_args`, `get_colors_from_args` and
        `get_figure_from_args`.
    """
    plot_parser.set_style_from_args(args)

    outbase = args.outbase
    bins = numpy.array(args.bins)
    figformat = args.figformat

    # read input files
    printer.write("Reading input files: %s ..." % ", ".join(args.infiles))

    # NOTE(review): handles returned by opener() are not closed here;
    # confirm whether read_count_file() consumes and closes them.
    samples = {
        get_short_samplename(X): read_count_file(opener(X), args.list_of_regions)
        for X in args.infiles
    }

    # Define some variables for later
    sample_names = sorted(samples.keys())
    comparisons = [
        X for X in itertools.combinations(sample_names, 2) if X[0] != X[1]
    ]
    colors = plot_parser.get_colors_from_args(args, len(comparisons))
    binkeys = tuple(["%s_reads" % k for k in args.regions])

    comparison_labels = sorted(["%s_vs_%s" % (X, Y) for X, Y in comparisons])
    bigtable = dict.fromkeys(comparison_labels)
    corrcoef_by_bin_table = dict.fromkeys(comparison_labels)

    # ki, kj = names of samples i and j
    # vi, vj = data of samples i and j
    for ki, kj in comparisons:
        vi = samples[ki]
        vj = samples[kj]

        # explicit check rather than `assert`, which is stripped under -O
        if not (vi["region"] == vj["region"]).all():
            printer.write(
                "Mismatched line entries for samples %s and %s. Were different gene lists used for counting?"
                % (ki, kj))

        printer.write("Comparing %s to %s:" % (ki, kj))

        label = "%s_vs_%s" % (ki, kj)

        bigtable[label] = {}
        corrcoef_by_bin_table[label] = {}

        for binkey in binkeys:
            bigtable[label][binkey] = {K: {} for K in args.metrics}
            corrcoef_by_bin_table[label][binkey] = {K: {} for K in args.metrics}

        for region in args.regions:
            region_counts = "%s_reads" % region
            count_mask = (vi[region_counts].values + vj[region_counts].values
                          >= 128)

            for metric in args.metrics:
                region_metric = "%s_%s" % (region, metric)
                printer.write("    -%s across %s for all >=128 ..." %
                              (metric, region))

                # divide into regions >=128 and <128 and plot
                viover = vi[region_metric][count_mask].values
                vjover = vj[region_metric][count_mask].values

                viunder = vi[region_metric][~count_mask].values
                vjunder = vj[region_metric][~count_mask].values

                pearsonr = scipy.stats.pearsonr(viover, vjover)[0]
                spearmanr = scipy.stats.spearmanr(viover, vjover)[0]

                # log2 fold change stats
                log2_ratios = numpy.log2(
                    numpy.ma.masked_invalid(vjover / viover))
                log2_mean = log2_ratios.mean()
                log2_std = log2_ratios.std()
                min_diff = 2**(3 * log2_std)
                num_genes_log2 = (~log2_ratios.mask).sum()

                # ma plot
                try:
                    ma_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    plot_parser.get_figure_from_args(args)
                    _, axdict = ma_plot(viunder,
                                        vjunder,
                                        color=process_black,
                                        label="< 128 counts",
                                        axes=plt.gca(),
                                        kdalpha=0.2,
                                        title=ma_title)
                    ma_plot(viover,
                            vjover,
                            axes=axdict,
                            label=">= 128 counts",
                            kdalpha=0.8)

                    mainax = axdict["main"]
                    mainax.set_xlabel("%s %s" % (ki, metric))
                    mainax.legend(loc="upper right", frameon=False)

                    text_kwargs = {
                        "horizontalalignment": "right",
                        "verticalalignment": "baseline",
                        "linespacing": 1.6,
                        "transform": mainax.transAxes,
                    }

                    plot_text = "\n".join([
                        "sample mean: %0.3e" % log2_mean,
                        "sample stdev: %0.3e" % log2_std,
                        "3-sigma fold change: %0.2f" % min_diff,
                        "regions_counted: %s" % count_mask.sum(),
                    ])

                    mainax.text(0.96, 0.04, plot_text, **text_kwargs)

                    plt.savefig("%s_ma_%s_%s_%s.%s" %
                                (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except Exception as e:
                    # format the exception itself: `e.message` does not
                    # exist on Python 3 exceptions
                    warnings.warn(
                        "Could not make MA plot for samples %s and %s. Error message:\n    %s"
                        % (ki, kj, e), DataWarning)

                # scatter plot
                try:
                    scatter_title = "%s vs %s (%s %s)" % (ki, kj, region,
                                                          metric)
                    do_scatter(vi[region_metric].values,
                               vj[region_metric].values,
                               count_mask,
                               plot_parser,
                               args,
                               pearsonr=pearsonr,
                               xlabel=ki,
                               ylabel=kj,
                               title=scatter_title)

                    plt.savefig("%s_scatter_%s_%s_%s.%s" %
                                (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except ValueError as e:
                    warnings.warn(
                        "Could not make scatter plot for samples %s and %s. Error message:\n    %s"
                        % (ki, kj, e), DataWarning)

                # TODO: make these tables into dataframes. this scheme is insane

                # add entries to bigtable for export later
                bigtable[label][region_counts][metric].update({
                    "pearsonr": pearsonr,
                    "spearmanr": spearmanr,
                    "log2_mean": log2_mean,
                    "log2_std": log2_std,
                    "2**3sigma": min_diff,
                    "num_genes_128plus": count_mask.sum(),
                    "num_genes_log2": num_genes_log2,
                })

                # do bin-by-bin counting
                printer.write("    -%s across %s by bin ..." %
                              (metric, region))
                bin_masks = get_bin_mask_by_summed_key(vi,
                                                       vj,
                                                       bins=args.bins,
                                                       key=region_counts)
                for my_bin, bin_mask in sorted(bin_masks.items()):

                    bin_vec_i = vi[region_metric][bin_mask]
                    bin_vec_j = vj[region_metric][bin_mask]

                    # make sure there are genes in bin before attempting calculations
                    if len(bin_vec_i) > 0:
                        nonzero_binmask = get_nonzero_either_mask(
                            bin_vec_i, bin_vec_j)
                        bin_vi_log2 = bin_vec_i[nonzero_binmask]
                        bin_vj_log2 = bin_vec_j[nonzero_binmask]
                        my_logs = numpy.log2(bin_vj_log2 / bin_vi_log2)
                        my_logmean = numpy.mean(my_logs)
                        my_logstd = numpy.std(my_logs)
                        if len(bin_vec_i) > 2:
                            my_pearsonr = scipy.stats.pearsonr(
                                bin_vec_i, bin_vec_j)[0]
                            my_spearmanr = scipy.stats.spearmanr(
                                bin_vec_i, bin_vec_j)[0]
                        else:
                            my_spearmanr = numpy.nan
                            my_pearsonr = numpy.nan
                    else:
                        # Fill with dummy values. The mask must be reset
                        # too; otherwise the count below would use a stale
                        # mask from a previous bin (or fail if none exists).
                        nonzero_binmask = numpy.array([], dtype=bool)
                        my_logmean = numpy.nan
                        my_logstd = numpy.nan
                        my_spearmanr = numpy.nan
                        my_pearsonr = numpy.nan

                    corrcoef_by_bin_table[label][region_counts][metric][my_bin] = {
                        "pearsonr": my_pearsonr,
                        "spearmanr": my_spearmanr,
                        "genes_in_bin": sum(bin_mask),
                        "log2_genes_in_bin": sum(nonzero_binmask),
                        "log2_mean": my_logmean,
                        "log2_std": my_logstd,
                    }

    # export big (non-binned) table
    printer.write("Writing tables ...")

    stats = (
        "num_genes_128plus",
        "pearsonr",
        "spearmanr",
        "num_genes_log2",
        "log2_mean",
        "log2_std",
        "2**3sigma",
    )

    header = ["#region", "metric", "statistic"] + list(comparison_labels)

    with argsopener("%s_bigtable.txt" % args.outbase, args, "w") as bigtable_out:
        bigtable_out.write("\t".join(header) + "\n")
        for region in binkeys:
            for metric in args.metrics:
                for stat in stats:
                    ltmp = [region, metric, stat]
                    ltmp.extend(bigtable[key][region][metric][stat]
                                for key in comparison_labels)
                    bigtable_out.write("\t".join(str(X) for X in ltmp) + "\n")

    # export binned data table and make bin-by-bin plots
    # empty strings are spacer columns in the output table
    stats = [
        "genes_in_bin",
        "pearsonr",
        "spearmanr",
        "",
        "log2_genes_in_bin",
        "log2_mean",
        "log2_std",
        "",
    ]
    with argsopener("%s_bintable.txt" % args.outbase, args, "w") as bintable_out:
        # Iterate over (region, metric) pairs directly instead of splitting
        # the joined name on "_", which fails if either part has an underscore.
        for region, metric in itertools.product(args.regions, args.metrics):
            region_metric = "%s_%s" % (region, metric)
            region_counts = "%s_reads" % region

            plt.close()
            plot_parser.get_figure_from_args(args)
            # NOTE(review): newer matplotlib releases spell this keyword
            # `base` rather than `basex` -- confirm the supported version.
            plt.semilogx(basex=2)
            plt.title("Correlation coefficients by bin for %s" % region_metric)
            plt.xlabel("Bin")
            plt.ylabel("Spearman rho")

            bintable_out.write("%s\n\n" % region_metric)

            try:
                for n, label in enumerate(comparison_labels):
                    corrcoefs = []
                    bintable_out.write("%s\t\t\t\t\t\t\t\t" % label)
                    bintable_out.write("\n")
                    bintable_out.write("bin\t" + "\t".join(stats) + "\n")
                    for my_bin in bins:
                        bin_stats = corrcoef_by_bin_table[label][region_counts][metric][my_bin]
                        ltmp = [my_bin] + [bin_stats.get(stat, "") for stat in stats]
                        bintable_out.write("\t".join(str(X) for X in ltmp) + "\n\n")
                        corrcoefs.append(bin_stats["spearmanr"])
                    corrcoefs = numpy.ma.masked_invalid(corrcoefs)
                    plt.plot(bins[~corrcoefs.mask],
                             corrcoefs[~corrcoefs.mask],
                             label=label,
                             color=colors[n])
                plt.legend(loc="lower right")
                # NOTE(review): the file name uses whichever `label` was
                # iterated last, although the plot holds every comparison.
                plt.savefig("%s_corrcoef_by_bin_%s_%s.%s" %
                            (outbase, region_metric, label, figformat),
                            bbox_inches="tight")
            except Exception:
                warnings.warn(
                    "Could not plot correlation-by-bin plot for '%s', '%s'" %
                    (outbase, region_metric), DataWarning)

            bintable_out.write("\n\n")

    printer.write("Done.")
def main(argv=sys.argv[1:], verbose=False):
    """Command-line program

    Reads two tab-delimited tables, drops any excluded columns from both,
    and compares the remainder with :func:`test_dataframe_equality`.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line

    verbose : bool, optional
        If `True`, return the comparison messages in addition to the
        exit code. Passing `-v` in `argv` has the same effect.
        (Default: `False`)


    Returns
    -------
    int
        `0` if files are identical, `1` otherwise

    str
        Only returned if `verbose` is selected. String describing how
        tables are unequal (e.g. which columns failed, et c).

    Notes
    -----
    When the enclosing module is executed as ``__main__``, this function
    does not return: it calls :func:`sys.exit` with the exit code instead.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("file1", type=str)
    parser.add_argument("file2", type=str)
    parser.add_argument(
        "-v",
        dest="verbose",
        default=False,
        action="store_true",
        help="Give verbose output",
    )
    parser.add_argument(
        "--sort_keys",
        default=None,
        metavar="key",
        nargs="+",
        help="If specified, values will be sorted by the column(s) corresponding to these name or numbers (0-indexed) before comparison",
    )
    parser.add_argument(
        "--exclude",
        type=str,
        default=[],
        nargs="+",
        metavar="key",
        help="Key or number (0-indexed) of columns to exclude",
    )
    parser.add_argument(
        "--no_header",
        default=False,
        action="store_true",
        help=(
            "If specified, no header row is present. Columns "
            "for all other command-line flags "
            "must be referenced by number (starting at zero) "
            "rather than name, and will be assumed to be in "
            "the same order in both files."
        ),
    )
    parser.add_argument(
        "--tol",
        type=float,
        default=1e-8,
        help="Tolerance by which floats are allowed to differ (Default: 1e-8)",
    )

    args = parser.parse_args(argv)

    read_opts = {"sep": "\t", "index_col": args.sort_keys, "comment": "#"}
    exclude = args.exclude

    if args.no_header:
        # Without a header row, columns can only be addressed by position,
        # so every key given on the command line is coerced to an integer.
        header_row = None
        if args.sort_keys is not None:
            read_opts["index_col"] = [int(X) for X in args.sort_keys]
        exclude = [int(X) for X in exclude]
    else:
        header_row = 0

    frames = []
    for table_file in (args.file1, args.file2):
        with opener(table_file) as fh:
            frames.append(pd.read_table(fh, header=header_row, **read_opts))
    df1, df2 = frames

    if args.exclude:
        printer.write("Excluding columns %s: " % ", ".join(args.exclude))

    # Excluded keys that are absent from a table are silently ignored
    for key in exclude:
        for frame in (df1, df2):
            if key in frame:
                frame.pop(key)

    test_result, messages = test_dataframe_equality(
        df1,
        df2,
        printer=printer,
        print_verbose=True,
        return_verbose=True,
        tol=args.tol,
    )

    # Equality against `True` (rather than truthiness) is kept deliberately
    equivalent = test_result == True  # noqa: E712
    if equivalent:
        printer.write("Files contain equivalent data.")
    else:
        printer.write("Files non-equivalent.")
    exit_code = 0 if equivalent else 1

    if __name__ == "__main__":
        sys.exit(exit_code)

    wants_messages = args.verbose == True or verbose == True  # noqa: E712
    if wants_messages:
        return exit_code, messages
    return exit_code
# Example #7
# 0
def main(argv=sys.argv[1:]):
    """Command-line program

    Sorts each junction read from the input BED file into exactly one of
    four classes, tested in this order, and writes per-class BED files:

    1. *repetitive*: :func:`covered_by_repetitive` reports a hit against the
       mask hash; written to ``<outbase>_repetitive.bed``
    2. *known*: only if ``--ref`` is given; one or more reference junctions
       are found by :func:`find_known_in_range`. The reference junctions
       (not the query) are written to ``<outbase>_shifted_known.bed``
    3. *canonical*: only if ``--slide_canonical`` is given; junctions
       returned by :func:`find_canonicals_in_range` are written to
       ``<outbase>_shifted_canonical.bed``
    4. *untouched*: everything else; the query is written unchanged to
       ``<outbase>_untouched.bed``

    Known and canonical junctions are each written at most once across the
    whole run, even if several queries map to them.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser(), mp.get_parser()],
    )
    parser.add_argument("--maxslide", type=int, default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron" +
                             " boundaries (Default: 10)")
    parser.add_argument("--ref", type=str, metavar="ref.bed", default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical", action="store_true", default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile", type=str, metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase", type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # The reader is fully consumed by list() inside the `with`, so the
        # handle can be closed as soon as the hash is built.
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)), do_copy=False)
    else:
        known_hash = GenomeHash()

    # Donor/acceptor dinucleotide pairs, as passed to
    # find_canonicals_in_range() for plus- and minus-strand junctions
    canonicals_plus = [("GT", "AG"),
                       ("GC", "AG"),
                       ]

    canonicals_minus = [("CT", "AC"),
                        ("CT", "GC"),
                        ]

    known_in_range = 0
    canonical_in_range = 0
    repetitive = 0
    untouched = 0
    c = 0

    # Junction tuples already written to the known/canonical outputs.
    # A set gives O(1) membership tests; this assumes get_junction_tuple()
    # returns a hashable tuple -- TODO confirm against its definition.
    seen_already = set()

    stats_fields = "%s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s"

    outfiles = {
        "repetitive": "%s_repetitive.bed" % args.outbase,
        "known": "%s_shifted_known.bed" % args.outbase,
        "canonical": "%s_shifted_canonical.bed" % args.outbase,
        "untouched": "%s_untouched.bed" % args.outbase,
    }
    outfiles = {K: argsopener(V, args, "w") for K, V in outfiles.items()}

    try:
        # process data
        printer.write("Opening junctions from %s..." % args.infile)
        with opener(args.infile) as fin:
            for ivc in BED_Reader(CommentReader(fin)):
                processed = False

                if c % 1000 == 0 and c > 0:
                    printer.write(("Processed: " + stats_fields) %
                                  (c, known_in_range, canonical_in_range, repetitive, untouched))

                # NOTE(review): input sanity check only; it is skipped when
                # Python runs with -O
                assert len(ivc) == 2
                strand = ivc.strand

                minus_range, plus_range = find_match_range(ivc, genome, args.maxslide)

                # see if either end of splice junction +- match_range lands
                # in repetitive areas of genome
                if covered_by_repetitive(ivc, minus_range, plus_range, cross_hash):
                    repetitive += 1
                    outfiles["repetitive"].write(ivc.as_bed())
                    processed = True

                # see if one or more known junctions in range
                if not processed and args.ref is not None:
                    known_juncs = find_known_in_range(ivc, minus_range, plus_range,
                                                      known_hash.get_nearby_features(ivc))
                    if len(known_juncs) > 0:
                        known_in_range += 1
                        for my_known in known_juncs:
                            tup = get_junction_tuple(my_known)
                            if tup not in seen_already:
                                outfiles["known"].write(my_known.as_bed())
                                seen_already.add(tup)

                        processed = True

                # see if one or more canonical junctions in range
                if not processed and args.slide_canonical:
                    canonicals = canonicals_plus if strand == "+" else canonicals_minus
                    canonical_juncs = find_canonicals_in_range(ivc, minus_range, plus_range,
                                                               genome, canonicals)
                    if len(canonical_juncs) > 0:
                        canonical_in_range += 1
                        for can in canonical_juncs:
                            tup = get_junction_tuple(can)
                            if tup not in seen_already:
                                outfiles["canonical"].write(can.as_bed())
                                seen_already.add(tup)

                        processed = True

                if not processed:
                    outfiles["untouched"].write(ivc.as_bed())
                    untouched += 1

                c += 1

        # report totals
        printer.write(("Totals: " + stats_fields) %
                      (c, known_in_range, canonical_in_range, repetitive, untouched))
    finally:
        # Always flush and close the outputs, even if processing failed
        # part-way through the input
        for fh in outfiles.values():
            fh.close()

    printer.write("Done.")