def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :py:func:`main` is called directly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ bp = BaseParser() parser = argparse.ArgumentParser( description=format_module_docstring(__doc__), formatter_class=argparse.RawDescriptionHelpFormatter, parents=[bp.get_parser()]) parser.add_argument("--exclude", nargs="+", default=[], help="Feature types to exclude from consideration") parser.add_argument("infile", metavar="infile.gff", type=str, help="Input GFF3 file") parser.add_argument("outfile", metavar="outfile.txt", type=str, help="Name of output file") args = parser.parse_args(argv) bp.get_base_ops_from_args(args) excluded = set(args.exclude) fin = sys.stdin if args.infile == "-" else opener(args.infile) feature_counts = Counter() features_with_parents = [] feature_types = {} name_type = {} printer.write("Opening %s..." % args.infile) c = 0 for feature in GFF3_Reader(fin, return_stopfeatures=False): if c % 10000 == 0: printer.write("Processed %s features..." % c) c += 1 ftype = feature.attr["type"] fname = feature.get_name() if ftype not in excluded: if ftype not in feature_types: feature_types[ftype] = Counter() feature_counts[ftype] += 1 if fname is not None: name_type[fname] = ftype if "Parent" in feature.attr: features_with_parents.append(feature) else: feature_types[ftype]["parent unspecified"] += 1 printer.write("Sorting parents...") c = 0 for feature in features_with_parents: if c % 10000 == 0: printer.write("Processed %s parents..." % c) c += 1 pnames = feature.attr["Parent"] ftype = feature.attr["type"] if pnames == "": feature_types[ftype]["parent unspecified"] += 1 else: if len(pnames) > 1: feature_types[ftype]["multiple parents"] += 1 else: ptype = name_type.get(pnames[0], "parent not in database") feature_types[ftype][ptype] += 1 rows = sorted(feature_types.keys()) cols = rows + [ "parent unspecified", "parent not in database", "multiple parents" ] with argsopener(args.outfile, args, "w") as fh: printer.write("Writing %s..." % args.outfile) header = "#feature_type\tcount\t" + "\t".join(cols) + "\n" fh.write(header) for r in rows: sout = "%s\t%s" % (r, feature_counts[r]) for i in cols: sout += "\t%s" % feature_types[r].get(i, 0) fh.write("%s\n" % sout) printer.write("Done.")
def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :py:func:`main` is called directly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ bp = BaseParser() parser = argparse.ArgumentParser(description=format_module_docstring(__doc__), formatter_class=argparse.RawDescriptionHelpFormatter, parents=[bp.get_parser()]) parser.add_argument("--exclude",nargs="+",default=[], help="Feature types to exclude from consideration") parser.add_argument("infile",metavar="infile.gff",type=str, help="Input GFF3 file") parser.add_argument("outfile",metavar="outfile.txt",type=str, help="Name of output file") args = parser.parse_args(argv) bp.get_base_ops_from_args(args) excluded = set(args.exclude) fin = sys.stdin if args.infile == "-" else opener(args.infile) feature_counts = Counter() features_with_parents = [] feature_types = {} name_type = {} printer.write("Opening %s..." % args.infile) c = 0 for feature in GFF3_Reader(fin,return_stopfeatures=False): if c % 10000 == 0: printer.write("Processed %s features..." % c) c += 1 ftype = feature.attr["type"] fname = feature.get_name() if ftype not in excluded: if ftype not in feature_types: feature_types[ftype] = Counter() feature_counts[ftype] += 1 if fname is not None: name_type[fname] = ftype if "Parent" in feature.attr: features_with_parents.append(feature) else: feature_types[ftype]["parent unspecified"] += 1 printer.write("Sorting parents...") c = 0 for feature in features_with_parents: if c % 10000 == 0: printer.write("Processed %s parents..." % c) c += 1 pnames = feature.attr["Parent"] ftype = feature.attr["type"] if pnames == "": feature_types[ftype]["parent unspecified"] += 1 else: if len(pnames) > 1: feature_types[ftype]["multiple parents"] += 1 else: ptype = name_type.get(pnames[0],"parent not in database") feature_types[ftype][ptype] += 1 rows = sorted(feature_types.keys()) cols = rows + ["parent unspecified","parent not in database","multiple parents"] with argsopener(args.outfile,args,"w") as fh: printer.write("Writing %s..." % args.outfile) header = "#feature_type\tcount\t" + "\t".join(cols) + "\n" fh.write(header) for r in rows: sout = "%s\t%s" % (r, feature_counts[r]) for i in cols: sout += "\t%s" % feature_types[r].get(i,0) fh.write("%s\n" % sout) printer.write("Done.")
def main(argv=sys.argv[1:]): """Command-line program Parameters ---------- argv : list, optional A list of command-line arguments, which will be processed as if the script were called from the command line if :py:func:`main` is called directrly. Default: `sys.argv[1:]`. The command-line arguments, if the script is invoked from the command line """ ap = AlignmentParser(allow_mapping=False,input_choices=["BAM"], disabled=["normalize","big_genome",]) bp = BaseParser() alignment_file_parser = ap.get_parser() base_parser = bp.get_parser() pp = PlottingParser() plotting_parser = pp.get_parser() parser = argparse.ArgumentParser(description=format_module_docstring(__doc__), formatter_class=argparse.RawDescriptionHelpFormatter, parents=[base_parser, alignment_file_parser, plotting_parser]) parser.add_argument("--min_counts",type=int,default=10,metavar="N", help="Minimum counts required in normalization region "+ "to be included in metagene average (Default: 10)") parser.add_argument("--normalize_over",type=int,nargs=2,metavar="N", default=None, #default=(20,50), help="Portion of each window against which its individual raw count profile"+ " will be normalized. Specify two integers, in nucleotide"+ " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)") parser.add_argument("--norm_region",type=int,nargs=2,metavar="N", default=None, help="Deprecated. Use ``--normalize_over`` instead. "+ "Formerly, Portion of each window against which its individual raw count profile"+ " will be normalized. Specify two integers, in nucleotide"+ " distance, from 5\' end of window. (Default: 70 100)") parser.add_argument("--require_upstream",default=False,action="store_true", help="If supplied, the P-site offset is taken to be the distance "+ "between the largest peak upstream of the start codon and "+ "the start codon itself. Otherwise, the P-site offset is taken "+ "to be the distance between the largest peak in the entire ROI "+ "and the start codon. Ignored if ``--constrain`` is used." ) parser.add_argument("--constrain",type=int,nargs=2,default=None,metavar="X", help="Constrain P-site offset to be between specified distance from "+ "start codon. Useful for noisy data. "+ "(Reasonable set: 10 15; default: not constrained)") parser.add_argument("--aggregate",default=False,action="store_true", help="Estimate P-site from aggregate reads at each position, instead "+ "of median normalized read density. Noisier, but helpful for "+ "lower-count data or read lengths with few counts. (Default: False)" ), parser.add_argument("--keep",default=False,action="store_true", help="Save intermediate count files. Useful for additional computations (Default: False)") parser.add_argument("--default",type=int,default=13, help="Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. 
Unaffected by ``--constrain`` (Default: 13)") parser.add_argument("roi_file",type=str, help="ROI file surrounding start codons, from ``metagene generate`` subprogram") parser.add_argument("outbase",type=str,help="Basename for output files") # set manual options args = parser.parse_args(argv) bp.get_base_ops_from_args(args) # set defaults args.mapping = "fiveprime" args.offset = 0 args.nibble = 0 # process arguments min_len = args.min_length max_len = args.max_length profiles = max_len + 1 - min_len lengths = list(range(min_len,max_len+1)) outbase = args.outbase title = "Fiveprime read offsets by length" if args.title is None else args.title pp.set_style_from_args(args) colors = pp.get_colors_from_args(args,profiles) printer.write("Opening ROI file %s ..." % args.roi_file) with opener(args.roi_file) as roi_fh: roi_table = pd.read_table(roi_fh,sep="\t",comment="#",index_col=None,header=0) roi_fh.close() printer.write("Opening count files %s ..." % ",".join(args.count_files)) ga = ap.get_genome_array_from_args(args,printer=printer) # remove default size filters my_filters = ga._filters.keys() for f in my_filters: ga.remove_filter(f) norm_start, norm_end = _get_norm_region(roi_table,args) # count count_dict, norm_count_dict, metagene_profile = do_count(roi_table, ga, norm_start, norm_end, args.min_counts, min_len, max_len, aggregate=args.aggregate, printer=printer) # save counts profile_fn = "%s_metagene_profiles.txt" % outbase with argsopener(profile_fn,args,"w") as metagene_out: metagene_profile.to_csv(metagene_out, sep="\t", header=True, index=False, na_rep="nan", columns=["x"]+["%s-mers" % X for X in lengths]) metagene_out.close() if args.keep == True: printer.write("Saving raw and normalized counts ...") for k in count_dict: count_fn = "%s_%s_rawcounts.txt.gz" % (outbase,k) normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase,k) mask_fn = "%s_%s_mask.txt.gz" % (outbase,k) numpy.savetxt(count_fn,count_dict[k],delimiter="\t") numpy.savetxt(normcount_fn,norm_count_dict[k],delimiter="\t") numpy.savetxt(mask_fn,norm_count_dict[k].mask,delimiter="\t") # plotting & offsets printer.write("Plotting and determining offsets ...") offset_dict = OrderedDict() # Determine scaling factor for plotting metagene profiles max_y = numpy.nan with warnings.catch_warnings(): # ignore warnings for slices that contain only NaNs warnings.simplefilter("ignore",category=RuntimeWarning) for k in lengths: max_y = numpy.nanmax([max_y, numpy.nanmax(metagene_profile["%s-mers"% k].values)]) if numpy.isnan(max_y) or max_y == 0: max_y = 1.0 # parse arguments & set styles mplrc = matplotlib.rcParams plt_incr = 1.2 # use this figsize if not specified on command line figheight = 1.0 + 0.25*(profiles-1) + 0.75*(profiles) default_figsize = (7.5,figheight) fig = pp.get_figure_from_args(args,figsize=default_figsize) ax = plt.gca() plt.title(title) plt.xlabel("Distance from CDS start, (nt; 5' end mapping)") if args.aggregate == True: plt.ylabel("Aggregate read counts (au)") else: plt.ylabel("Median normalized read density (au)") plt.axvline(0.0,color=mplrc["axes.edgecolor"],dashes=[3,2]) x = metagene_profile["x"].values xmin = x.min() xmax = x.max() if args.constrain is not None: mask = numpy.tile(True,len(x)) zp = (x==0).argmax() l,r = args.constrain if l == r: warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." 
% l,ArgumentWarning) mindist = min(l,r) maxdist = max(l,r) mask[zp-maxdist:zp-mindist+1] = False elif args.require_upstream == True: mask = x >= 0 else: mask = numpy.tile(False,len(x)) for n,k in enumerate(lengths): color = colors[n] baseline = plt_incr*n y = metagene_profile["%s-mers" % k].values #ymask = y[mask] ymask = numpy.ma.MaskedArray(y,mask=mask) if numpy.isnan(y).all(): plot_y = numpy.zeros_like(x) else: if args.aggregate == False: plot_y = y / max_y else: plot_y = y.astype(float) / numpy.nanmax(y) * 0.9 # plot metagene profiles on common scale, offset by baseline from bottom to top ax.plot(x,baseline + plot_y,color=color) ax.text(xmin,baseline,"%s-mers" % k, ha="left", va="bottom", color=color, transform=matplotlib.transforms.offset_copy(ax.transData,fig, x=6.0,y=3.0,units="points")) ymax = baseline + numpy.nanmax(plot_y) # if all valid positions are nan, or if all valid positions are <= 0 if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0: offset = args.default usedefault = True else: offset = -x[numpy.ma.argmax(ymask)] usedefault = False offset_dict[k] = offset if usedefault == False: yadj = ymax - 0.2 * plt_incr ax.plot([-offset,0],[yadj,yadj],color=color,dashes=[3,2]) ax.text(-offset / 2.0, yadj, "%s nt" % (offset), color=color, ha="center", va="bottom", transform=matplotlib.transforms.offset_copy(ax.transData,fig, x=0.0,y=3.0,units="points") ) plt.xlim(xmin,xmax) plt.ylim(-0.1,plt_incr+baseline) ax.yaxis.set_ticks([]) # save data as p-site offset table fn = "%s_p_offsets.txt" % outbase fout = argsopener(fn,args) printer.write("Writing offset table to %s ..." % fn) fout.write("length\tp_offset\n") for k in offset_dict: fout.write("%s\t%s\n" % (k,offset_dict[k])) fout.write("default\t%s" % args.default) fout.close() # save plot plot_fn ="%s_p_offsets.%s" % (outbase,args.figformat) printer.write("Saving plot to %s ..." % plot_fn) plt.savefig(plot_fn,dpi=args.dpi,bbox_inches="tight") printer.write("Done.")
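# Illustrative sketch (hypothetical data, not part of the program above): how
# the ``--constrain`` mask and masked argmax in main() pick a P-site offset.
# True entries in a MaskedArray mask *hide* values, so positions outside the
# constraint window are masked out before taking the argmax. The helper name
# `_demo_constrained_offset` is invented for this example.
import numpy

def _demo_constrained_offset(x, y, mindist, maxdist):
    """Return the P-site offset: -x at the largest peak within the window."""
    mask = numpy.tile(True, len(x))
    zp = (x == 0).argmax()                        # index of the start codon
    mask[zp - maxdist:zp - mindist + 1] = False   # unmask the constraint window
    ymask = numpy.ma.MaskedArray(y, mask=mask)
    return -x[numpy.ma.argmax(ymask)]

# x = numpy.arange(-50, 51)     # nucleotide positions relative to start codon
# y = numpy.zeros(101); y[38] = 100.0   # a peak 12 nt upstream of the start
# _demo_constrained_offset(x, y, 10, 15)  # -> 12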
def main(argv=sys.argv[1:], verbose=False):
    """Command-line program

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script
        is invoked from the command line

    verbose : bool, optional
        If `True`, return a string describing how the tables differ
        in addition to the exit code (Default: `False`)

    Returns
    -------
    int
        `0` if files are identical, `1` otherwise

    str
        Only returned if `verbose` is selected. String describing
        how tables are unequal (e.g. which columns failed, et c).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("file1", type=str)
    parser.add_argument("file2", type=str)
    parser.add_argument("-v", dest="verbose", default=False, action="store_true",
                        help="Give verbose output")
    parser.add_argument("--sort_keys", default=None, metavar="key", nargs="+",
                        help="If specified, values will be sorted by the column(s) "
                             "corresponding to these names or numbers (0-indexed) "
                             "before comparison")
    parser.add_argument("--exclude", type=str, default=[], nargs="+", metavar="key",
                        help="Key or number (0-indexed) of columns to exclude")
    parser.add_argument("--no_header", default=False, action="store_true",
                        help="If specified, no header row is present. Columns "
                             "for all other command-line flags "
                             "must be referenced by number (starting at zero) "
                             "rather than name, and will be assumed to be in "
                             "the same order in both files.")
    parser.add_argument("--tol", type=float, default=1e-8,
                        help="Tolerance by which floats are allowed to differ "
                             "(Default: 1e-8)")

    args = parser.parse_args(argv)

    kwargs = {
        "sep":       "\t",
        "index_col": args.sort_keys,
        "comment":   "#",
    }

    exclude = args.exclude
    if args.no_header is True:
        if args.sort_keys is not None:
            kwargs["index_col"] = [int(X) for X in args.sort_keys]
        exclude = [int(X) for X in exclude]

        with opener(args.file1) as fh:
            df1 = pd.read_table(fh, header=None, **kwargs)
        with opener(args.file2) as fh:
            df2 = pd.read_table(fh, header=None, **kwargs)
    else:
        with opener(args.file1) as fh:
            df1 = pd.read_table(fh, header=0, **kwargs)
        with opener(args.file2) as fh:
            df2 = pd.read_table(fh, header=0, **kwargs)

    if len(args.exclude) > 0:
        printer.write("Excluding columns %s: " % ", ".join(args.exclude))

    for k in exclude:
        if k in df1:
            df1.pop(k)
        if k in df2:
            df2.pop(k)

    test_result, messages = test_dataframe_equality(df1, df2,
                                                    printer=printer,
                                                    print_verbose=True,
                                                    return_verbose=True,
                                                    tol=args.tol)

    if test_result == True:
        printer.write("Files contain equivalent data.")
        exit_code = 0
    else:
        printer.write("Files non-equivalent.")
        exit_code = 1

    if __name__ == "__main__":
        sys.exit(exit_code)
    else:
        if args.verbose == True or verbose == True:
            return exit_code, messages
        else:
            return exit_code
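# Illustrative sketch (not part of the program above): the spirit of the
# ``--tol`` comparison. Two numeric columns count as equal when they agree
# elementwise within an absolute tolerance; the real work here is done by
# test_dataframe_equality, whose internals are not shown. The helper name
# `_demo_columns_equal` is invented for this example.
import numpy

def _demo_columns_equal(col1, col2, tol=1e-8):
    """True if two numeric columns agree elementwise within ``tol``."""
    a = numpy.asarray(col1, dtype=float)
    b = numpy.asarray(col2, dtype=float)
    return a.shape == b.shape and bool(numpy.all(numpy.abs(a - b) <= tol))

# >>> _demo_columns_equal([1.0, 2.0], [1.0, 2.0 + 1e-10])
# True
# >>> _demo_columns_equal([1.0, 2.0], [1.0, 2.1])
# False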
def do_chart(args, plot_parser):
    """Produce a set of charts comparing multiple samples pairwise.

    Charts include histograms of log2 fold changes and scatter plots
    with correlation coefficients, both generated for raw count and
    RPKM data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``chart`` subprogram
    """
    plot_parser.set_style_from_args(args)
    outbase = args.outbase
    bins = numpy.array(args.bins)
    figformat = args.figformat

    # read input files
    printer.write("Reading input files: %s ..." % ", ".join(args.infiles))
    samples = {get_short_samplename(X): read_count_file(opener(X), args.list_of_regions)
               for X in args.infiles}

    # Define some variables for later
    sample_names = sorted(samples.keys())
    comparisons = [X for X in itertools.combinations(sample_names, 2) if X[0] != X[1]]
    colors = plot_parser.get_colors_from_args(args, len(comparisons))
    binkeys = tuple(["%s_reads" % k for k in args.regions])

    comparison_labels     = sorted(["%s_vs_%s" % (X, Y) for X, Y in comparisons])
    bigtable              = {}.fromkeys(comparison_labels)
    corrcoef_by_bin_table = {}.fromkeys(comparison_labels)

    # ki, kj = names of samples i and j
    # vi, vj = data of samples i and j
    for ki, kj in comparisons:
        try:
            assert (samples[ki]["region"] == samples[kj]["region"]).all()
        except AssertionError:
            printer.write("Mismatched line entries for samples %s and %s. "
                          "Were different gene lists used for counting?" % (ki, kj))

        vi = samples[ki]
        vj = samples[kj]
        printer.write("Comparing %s to %s:" % (ki, kj))

        label = "%s_vs_%s" % (ki, kj)
        bigtable[label] = {}
        corrcoef_by_bin_table[label] = {}

        for binkey in binkeys:
            bigtable[label][binkey] = {K: copy.deepcopy({}) for K in args.metrics}
            corrcoef_by_bin_table[label][binkey] = {K: copy.deepcopy({}) for K in args.metrics}

        for region in args.regions:
            region_counts = "%s_reads" % region
            count_mask = (vi[region_counts].values + vj[region_counts].values >= 128)

            for metric in args.metrics:
                region_metric = "%s_%s" % (region, metric)
                printer.write(" -%s across %s for all >=128 ..." % (metric, region))

                # divide into regions >=128 and <128 and plot
                viover  = vi[region_metric][count_mask].values
                vjover  = vj[region_metric][count_mask].values
                viunder = vi[region_metric][~count_mask].values
                vjunder = vj[region_metric][~count_mask].values

                pearsonr  = scipy.stats.pearsonr(viover, vjover)[0]
                spearmanr = scipy.stats.spearmanr(viover, vjover)[0]

                # log2 fold change stats
                log2_ratios = numpy.log2(numpy.ma.masked_invalid(vjover / viover))
                log2_mean = log2_ratios.mean()
                log2_std  = log2_ratios.std()
                min_diff  = 2**(3 * log2_std)
                num_genes_log2 = (~log2_ratios.mask).sum()

                # ma plot
                try:
                    ma_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    fig = plot_parser.get_figure_from_args(args)
                    _, axdict = ma_plot(viunder, vjunder,
                                        color=process_black,
                                        label="< 128 counts",
                                        axes=plt.gca(),
                                        kdalpha=0.2,
                                        title=ma_title)
                    _, _ = ma_plot(viover, vjover,
                                   axes=axdict,
                                   label=">= 128 counts",
                                   kdalpha=0.8)

                    mainax = axdict["main"]
                    mainax.set_xlabel("%s %s" % (ki, metric))
                    mainax.legend(loc="upper right", frameon=False)

                    text_kwargs = {
                        "horizontalalignment": "right",
                        "verticalalignment":   "baseline",
                        "linespacing":         1.6,
                        "transform":           mainax.transAxes,
                    }
                    plot_text = "\n".join([
                        "sample mean: %0.3e" % log2_mean,
                        "sample stdev: %0.3e" % log2_std,
                        "3-sigma fold change: %0.2f" % min_diff,
                        "regions_counted: %s" % count_mask.sum(),
                    ])
                    mainax.text(0.96, 0.04, plot_text, **text_kwargs)

                    plt.savefig("%s_ma_%s_%s_%s.%s" % (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except Exception as e:
                    # `e.message` is Python 2-only; use str(e) instead
                    warnings.warn("Could not make MA plot for samples %s and %s. "
                                  "Error message:\n %s" % (ki, kj, str(e)),
                                  DataWarning)

                # scatter plot
                try:
                    scatter_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    do_scatter(vi[region_metric].values,
                               vj[region_metric].values,
                               count_mask,
                               plot_parser,
                               args,
                               pearsonr=pearsonr,
                               xlabel=ki,
                               ylabel=kj,
                               title=scatter_title)
                    plt.savefig("%s_scatter_%s_%s_%s.%s" % (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except ValueError as e:
                    warnings.warn("Could not make scatter plot for samples %s and %s. "
                                  "Error message:\n %s" % (ki, kj, str(e)),
                                  DataWarning)

                # TODO: make these tables into dataframes. this scheme is insane
                # add entries to bigtable for export later
                bigtable[label][region_counts][metric]["pearsonr"]  = pearsonr
                bigtable[label][region_counts][metric]["spearmanr"] = spearmanr
                bigtable[label][region_counts][metric]["log2_mean"] = log2_mean
                bigtable[label][region_counts][metric]["log2_std"]  = log2_std
                bigtable[label][region_counts][metric]["2**3sigma"] = 2**(3 * log2_std)
                bigtable[label][region_counts][metric]["num_genes_128plus"] = count_mask.sum()
                bigtable[label][region_counts][metric]["num_genes_log2"]    = num_genes_log2

                # do bin-by-bin counting
                printer.write(" -%s across %s by bin ..." % (metric, region))
                bin_masks = get_bin_mask_by_summed_key(vi, vj, bins=args.bins, key=region_counts)
                for my_bin, bin_mask in sorted(bin_masks.items()):
                    bin_vec_i = vi[region_metric][bin_mask]
                    bin_vec_j = vj[region_metric][bin_mask]

                    # make sure there are genes in bin before attempting calculations
                    if len(bin_vec_i) > 0:
                        nonzero_binmask = get_nonzero_either_mask(bin_vec_i, bin_vec_j)
                        bin_vi_log2 = bin_vec_i[nonzero_binmask]
                        bin_vj_log2 = bin_vec_j[nonzero_binmask]
                        my_logs    = numpy.log2(bin_vj_log2 / bin_vi_log2)
                        my_logmean = numpy.mean(my_logs)
                        my_logstd  = numpy.std(my_logs)

                        if len(bin_vec_i) > 2:
                            my_pearsonr  = scipy.stats.pearsonr(bin_vec_i, bin_vec_j)[0]
                            my_spearmanr = scipy.stats.spearmanr(bin_vec_i, bin_vec_j)[0]
                        else:
                            my_spearmanr = numpy.nan
                            my_pearsonr  = numpy.nan
                    else:
                        # fill with dummy values; also define `nonzero_binmask`
                        # here, so the table entry below never reads a stale
                        # value from a previous iteration
                        nonzero_binmask = numpy.array([], dtype=bool)
                        my_logs      = numpy.array([])
                        my_logmean   = numpy.nan
                        my_logstd    = numpy.nan
                        my_spearmanr = numpy.nan
                        my_pearsonr  = numpy.nan

                    corrcoef_by_bin_table[label][region_counts][metric][my_bin] = {}
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["pearsonr"]  = my_pearsonr
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["spearmanr"] = my_spearmanr
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["genes_in_bin"] = sum(bin_mask)
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["log2_genes_in_bin"] = sum(nonzero_binmask)
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["log2_mean"] = my_logmean
                    corrcoef_by_bin_table[label][region_counts][metric][my_bin]["log2_std"]  = my_logstd

    # export big (non-binned) table
    printer.write("Writing tables ...")
    bigtable_out = argsopener("%s_bigtable.txt" % args.outbase, args, "w")
    stats = ("num_genes_128plus",  # 0
             "pearsonr",           # 1
             "spearmanr",          # 2
             "num_genes_log2",     # 3
             "log2_mean",          # 4
             "log2_std",           # 5
             "2**3sigma",          # 6
             )
    header = ["#region", "metric", "statistic"]
    header += [X for X in comparison_labels]
    bigtable_out.write("\t".join(header) + "\n")

    for region in binkeys:
        for metric in args.metrics:
            for stat in stats:
                ltmp = [region, metric, stat]
                for key in comparison_labels:
                    ltmp.append(bigtable[key][region][metric][stat])
                ltmp = [str(X) for X in ltmp]
                bigtable_out.write("\t".join(ltmp) + "\n")
    bigtable_out.close()

    # export binned data table and make bin-by-bin plots
    bintable_out = argsopener("%s_bintable.txt" % args.outbase, args, "w")
    region_metrics = ["%s_%s" % (X, Y) for X in args.regions for Y in args.metrics]
    stats = ["genes_in_bin",       # 0
             "pearsonr",           # 1
             "spearmanr",          # 2
             "",                   # 3
             "log2_genes_in_bin",  # 4
             "log2_mean",          # 5
             "log2_std",           # 6
             "",                   # 7
             ]

    for region_metric in region_metrics:
        plt.close()
        fig = plot_parser.get_figure_from_args(args)
        plt.semilogx(basex=2)
        plt.title("Correlation coefficients by bin for %s" % region_metric)
        plt.xlabel("Bin")
        plt.ylabel("Spearman rho")

        region, metric = region_metric.split("_")
        region_counts = "%s_reads" % region
        bintable_out.write("%s\n\n" % region_metric)

        try:
            for n, label in enumerate(comparison_labels):
                corrcoefs = []
                bintable_out.write("%s\t\t\t\t\t\t\t\t" % label)
                bintable_out.write("\n")
                bintable_out.write("bin\t" + "\t".join(stats) + "\n")
                for my_bin in bins:
                    ltmp = [my_bin]
                    for stat in stats:
                        ltmp.append(corrcoef_by_bin_table[label][region_counts][metric][my_bin].get(stat, ""))
                    ltmp = [str(X) for X in ltmp]
                    bintable_out.write("\t".join(ltmp) + "\n\n")
                    corrcoefs.append(corrcoef_by_bin_table[label][region_counts][metric][my_bin]["spearmanr"])

                corrcoefs = numpy.ma.masked_invalid(corrcoefs)
                plt.plot(bins[~corrcoefs.mask],
                         corrcoefs[~corrcoefs.mask],
                         label=label,
                         color=colors[n])

            plt.legend(loc="lower right")
            plt.savefig("%s_corrcoef_by_bin_%s_%s.%s" % (outbase, region_metric, label, figformat),
                        bbox_inches="tight")
        except Exception as e:
            warnings.warn("Could not plot correlation-by-bin plot for '%s', '%s'" % (outbase, region_metric),
                          DataWarning)

        bintable_out.write("\n\n")

    bintable_out.close()
    printer.write("Done.")
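# Illustrative sketch (hypothetical data, not part of the program above): the
# log2 fold-change statistics computed inside do_chart(). Invalid ratios
# (e.g. from zero denominators) are masked out, and the "3-sigma fold change"
# 2**(3 * std) estimates the smallest fold change distinguishable from sample
# noise. The helper name `_demo_log2_stats` is invented for this example.
import numpy

def _demo_log2_stats(vi, vj):
    """Return (mean, std, 3-sigma fold change) of log2(vj / vi)."""
    with numpy.errstate(divide="ignore", invalid="ignore"):
        log2_ratios = numpy.log2(numpy.ma.masked_invalid(vj / vi))
    return (log2_ratios.mean(),
            log2_ratios.std(),
            2**(3 * log2_ratios.std()))

# vi = numpy.array([100.0, 200.0, 0.0, 400.0])
# vj = numpy.array([110.0, 190.0, 50.0, 410.0])
# _demo_log2_stats(vi, vj)
# the third pair is masked (division by zero); stats use the remaining three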
def main(argv=sys.argv[1:]):
    """Command-line program

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script
        is invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser(), mp.get_parser()],
    )
    parser.add_argument("--maxslide", type=int, default=10,
                        help="Maximum number of nt to search 5' and 3' of intron "
                             "boundaries (Default: 10)")
    parser.add_argument("--ref", type=str, metavar="ref.bed", default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical", action="store_true", default=False,
                        help="Slide junctions to canonical junctions if present "
                             "within equal support region")
    parser.add_argument("infile", type=str, metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase", type=str,
                        help="Basename for output files")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        known_hash = GenomeHash(list(BED_Reader(open(args.ref))), do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables
    canonicals_plus = [("GT", "AG"),
                       ("GC", "AG"),
                       ]
    canonicals_minus = [("CT", "AC"),
                        ("CT", "GC"),
                        ]

    known_in_range     = 0
    canonical_in_range = 0
    repetitive         = 0
    untouched          = 0
    c = 0

    seen_already = []

    outfiles = {
        "repetitive": "%s_repetitive.bed" % args.outbase,
        "known":      "%s_shifted_known.bed" % args.outbase,
        "canonical":  "%s_shifted_canonical.bed" % args.outbase,
        "untouched":  "%s_untouched.bed" % args.outbase,
    }
    outfiles = {K: argsopener(V, args, "w") for K, V in outfiles.items()}

    # process data
    printer.write("Opening junctions from %s..." % args.infile)
    for ivc in BED_Reader(CommentReader(opener(args.infile))):
        processed = False
        tup = None
        if c % 1000 == 0 and c > 0:
            printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" %
                          (c, known_in_range, canonical_in_range, repetitive, untouched))
        assert len(ivc) == 2
        strand = ivc.strand

        minus_range, plus_range = find_match_range(ivc, genome, args.maxslide)

        # see if either end of splice junction +- match_range lands in repetitive areas of genome
        if covered_by_repetitive(ivc, minus_range, plus_range, cross_hash):
            repetitive += 1
            outfiles["repetitive"].write(ivc.as_bed())
            processed = True

        # see if one or more known junctions in range
        if processed == False and args.ref is not None:
            # find_known_in_range(query_ivc,minus_range,plus_range,knownjunctions)
            known_juncs = find_known_in_range(ivc, minus_range, plus_range,
                                              known_hash.get_nearby_features(ivc))
            if len(known_juncs) > 0:
                known_in_range += 1
                for my_known in known_juncs:
                    tup = get_junction_tuple(my_known)
                    if tup not in seen_already:
                        outfiles["known"].write(my_known.as_bed())
                        seen_already.append(tup)
                processed = True

        # see if one or more canonical junctions in range
        if processed == False and args.slide_canonical == True:
            canonicals = canonicals_plus if strand == "+" else canonicals_minus
            # find_canonicals_in_range(query_ivc,minus_range,plus_range,genome,canonicals)
            canonical_juncs = find_canonicals_in_range(ivc, minus_range, plus_range,
                                                       genome, canonicals)
            if len(canonical_juncs) > 0:
                canonical_in_range += 1
                for can in canonical_juncs:
                    tup = get_junction_tuple(can)
                    if tup not in seen_already:
                        outfiles["canonical"].write(can.as_bed())
                        seen_already.append(tup)
                processed = True

        if processed == False:
            outfiles["untouched"].write(ivc.as_bed())
            untouched += 1

        c += 1

    # save output
    printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" %
                  (c, known_in_range, canonical_in_range, repetitive, untouched))
    for v in outfiles.values():
        v.close()

    printer.write("Done.")
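# Illustrative sketch (hypothetical helper callables, not part of the program
# above): the classification cascade in main(), in miniature. Each junction
# is routed to the first bucket whose test passes -- repetitive, then known,
# then canonical -- falling through to untouched.
def _demo_classify(junction, is_repetitive, known_nearby, canonical_nearby):
    """Return the output bucket a junction would be routed to."""
    if is_repetitive(junction):
        return "repetitive"
    if known_nearby(junction):
        return "known"
    if canonical_nearby(junction):
        return "canonical"
    return "untouched"

# >>> _demo_classify(None, lambda j: False, lambda j: True, lambda j: False)
# 'known'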