def write_output_files(table, title, args):
    """Write gene info table from :py:func:`do_generate` to several output files:

    OUTBASE_TITLE.positions
        Tab-delimited text file. Each line is a row of `table` (a merged gene
        or a transcript, depending on `title`), and columns indicate the
        genomic coordinates of each position set.

    OUTBASE_TITLE_REGION.bed
        `BED`_ files showing position sets for `REGION`, where `REGION` is
        one of *exon*, *utr5*, *cds*, *utr3*, or *masked*. These contain the
        same information as ``OUTBASE_TITLE.positions``, but can be visualized
        easily in a :term:`genome browser`

    Parameters
    ----------
    table : :class:`pandas.DataFrame`
        Gene info table made in :py:func:`do_generate`

    title : str
        Title ("gene" or "transcript")

    args : :py:class:`argparse.Namespace`
        Command-line arguments
    """
    keys = ("utr5", "utr3", "cds", "masked", "exon")
    bed_columns = ["%s_bed" % K for K in keys]

    # Open handles one at a time inside the `try`, so that whatever was
    # opened is closed again even if a later open or write fails.
    bedfiles = {}
    try:
        for k in keys:
            bedfiles[k] = argsopener(
                "%s_%s_%s.bed" % (args.outbase, title, k), args
            )

        for _, row in table[bed_columns].iterrows():
            for k in keys:
                bedfiles[k].write(row["%s_bed" % k])
    finally:
        for fh in bedfiles.values():
            fh.close()

    positions_fn = "%s_%s.positions" % (args.outbase, title)
    with argsopener(positions_fn, args) as pos_out:
        table.to_csv(
            pos_out,
            sep="\t",
            header=True,
            index=False,
            na_rep="nan",
            float_format="%.8f",
            columns=[
                "region",
                "exon",
                "utr5",
                "cds",
                "utr3",
                "masked",
                "exon_unmasked",
                "transcript_ids",
            ],
        )
def do_count(args, alignment_parser):
    """Count the number and density of reads covering each merged gene in an
    annotation made using the `generate` subcommand.

    For every row of the position file and every region type (*exon*, *utr5*,
    *cds*, *utr3*), the raw counts, region length, and RPKM are tabulated and
    written as a tab-delimited table to ``OUTBASE.txt``. RPKM is reported as
    `nan` for zero-length regions.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``count`` subprogram

    alignment_parser : object
        Parser whose `get_genome_array_from_args` method builds the
        count data from `args`
    """
    # we expect many zero-length segmentchains, so turn these warnings off
    warnings.filterwarnings(
        "ignore",
        ".*zero-length SegmentChain.*",
    )

    keys = ("exon", "utr5", "cds", "utr3")
    column_order = ["region"]

    gene_positions = read_pl_table(args.position_file)

    # read count files
    ga = alignment_parser.get_genome_array_from_args(args, printer=printer)
    total_counts = ga.sum()

    # scale factor converting counts/nt to reads per kilobase per million
    normconst = 1000.0 * 1e6 / total_counts

    printer.write("Dataset has %s counts in it." % total_counts)
    printer.write("Tallying genes ...")

    dtmp = {"region": []}
    for x in keys:
        for y in ("reads", "length", "rpkm"):
            label = "%s_%s" % (x, y)
            dtmp[label] = []
            column_order.append(label)

    for i, name in enumerate(gene_positions["region"]):
        dtmp["region"].append(name)
        if i % 500 == 0:
            printer.write("Processed %s genes ..." % i)

        for k in keys:
            ivc = SegmentChain.from_str(gene_positions[k][i])
            total = sum(ivc.get_counts(ga))
            length = ivc.length
            rpkm = (normconst * total / length) if length > 0 else numpy.nan
            dtmp["%s_reads" % k].append(total)
            dtmp["%s_length" % k].append(length)
            dtmp["%s_rpkm" % k].append(rpkm)

    dtmp = pd.DataFrame(dtmp)

    # `with` guarantees the handle is closed even if serialization fails
    with argsopener("%s.txt" % args.outbase, args, "w") as fout:
        dtmp.to_csv(
            fout,
            sep="\t",
            header=True,
            index=False,
            columns=column_order,
            na_rep="nan",
            float_format="%.8f",
        )

    printer.write("Done.")
def chrom_worker(chrom_seq,args=None):
    """Align k-mers from one chromosome with bowtie and convert the reads
    that multimap into a crossmap `BED`_ file.

    Parameters
    ----------
    chrom_seq : tuple
        Pair of `(name, seq_or_kmers)`. If `args.have_kmers` is `False`,
        `seq_or_kmers` is handed to :func:`simulate_reads` to generate a
        k-mer file; otherwise it is used directly as the k-mer filename.

    args : :py:class:`argparse.Namespace`
        Command-line arguments. Reads `outbase`, `read_length`, `mismatches`,
        `have_kmers`, `save_kmers`, `bowtie`, `ebwt`, and `offset`.
        Despite the `None` default, this is required.

    Returns
    -------
    str
        Name of the crossmap BED file for this chromosome. The name is
        returned even if alignment failed and the file was not written.
    """
    name, seq_or_kmers = chrom_seq
    printer.write("Processing chromosome %s..." % name)
    base         = "%s_%s_%s_%s" % (args.outbase, args.read_length, args.mismatches, name)
    kmer_file    = "%s_kmers.fa"     % base
    toomany_file = "%s_multimap.fa"  % base
    bed_file     = "%s_crossmap.bed" % base

    if args.have_kmers == False:
        # k-mers were not supplied: simulate them from the sequence
        with open(kmer_file,"w") as kmer:
            simulate_reads(seq_or_kmers,kmer,args.read_length)
    else:
        kmer_file = seq_or_kmers

    argdict = { "mismatches" : args.mismatches,
                "processors" : 1,
                "bowtie"     : args.bowtie,
                "toomany"    : toomany_file,
                "kmers"      : kmer_file,
                "ebwt"       : args.ebwt,
                "null"       : os.devnull,
                }

    # NOTE(review): the command is built by string interpolation and run
    # through the shell; paths containing spaces or shell metacharacters
    # will break it. Left as-is to preserve the existing invocation.
    cmd = "%(bowtie)s -m1 -a --best -f -v %(mismatches)s -p %(processors)s %(ebwt)s %(kmers)s --max %(toomany)s >%(null)s" % argdict

    printer.write("Aligning %s-mers for chromosome '%s' :\n\t'%s'" % (args.read_length,name,cmd))
    try:
        retcode = subprocess.call(cmd,shell=True)
        if retcode < 0 or retcode == 2:
            printer.write("Alignment for chromosome '%s' terminated with status %s" % (name,retcode))
        elif os.path.exists(toomany_file):
            printer.write("Assembling multimappers from chromosome '%s' into crossmap..."% name)
            # both handles are closed on exit, including the multimapper input
            with open(toomany_file) as toomany_fh, argsopener(bed_file,args,"w") as bed_out:
                for plus_chain, minus_chain in fa_to_bed(toomany_fh,
                                                         args.read_length,
                                                         offset=args.offset):
                    bed_out.write(plus_chain.as_bed())
                    bed_out.write(minus_chain.as_bed())
        else:
            printer.write("Could not find multimapper source file '%s' ." % toomany_file)
    except OSError as e:
        printer.write("Alignment failed for chromosome '%s': %s" % (name,e))

    printer.write("Cleaning up chromosome '%s'..." % name)

    # The multimapper file is absent when bowtie failed or found no
    # multimappers; only remove it if it is actually there.
    if os.path.exists(toomany_file):
        os.remove(toomany_file)

    if args.have_kmers == False and args.save_kmers == False:
        os.remove(kmer_file)

    return bed_file
def main(argv=sys.argv[1:]):
    """Command-line program

    Tabulates, for each region of interest in an annotation, the masked read
    counts, counts per nucleotide, RPKM, and masked length, and writes them
    as a tab-delimited table to `outfile`.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              annotation_file_parser,
                                              mask_file_parser],
                                     )
    parser.add_argument("outfile",type=str,help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args,printer=printer)
    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args,printer=printer)

    ga_sum = ga.sum()
    # scale factor converting counts/nt to reads per kilobase per million
    normconst = 1000.0*1e6 / ga_sum

    # Tracked separately from the loop index so the final tally is correct
    # (index + 1) and is defined even if the annotation yields no regions.
    n_regions = 0

    with argsopener(args.outfile,args,"w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n,ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable((X for X in masks)))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            # fully-masked regions have no defined density
            rpnt = numpy.nan if length == 0 else float(counts)/length
            rpkm = numpy.nan if length == 0 else rpnt * normconst
            ltmp = [name,
                    str(ivc),
                    "%.8e" % counts,
                    "%.8e" % rpnt,
                    "%.8e" % rpkm,
                    "%d" % length]
            fout.write("%s\n" % "\t".join(ltmp))
            n_regions = n + 1

    printer.write("Processed %s regions total." % n_regions)
    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads transcripts from an annotation and rewrites them to `outfile` in
    either GTF2 or (optionally extended) BED format. For BED output, a
    message built from `MAKE_BIGBED_MESSAGE` is printed at the end.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     parents=[base_parser,annotation_parser],
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # `store_false` with default True: the stored value is passed straight
    # through as `escape=`, so giving the flag turns escaping OFF.
    parser.add_argument("--no_escape",default=True,action="store_false",
                        help="If specified and output format is GTF2, special characters in column 9 will NOT be escaped (default: characters are escaped)")
    parser.add_argument("--output_format",choices=["BED","GTF2"],default="GTF2",
                        help="Format of output file. (default: GTF2)")
    parser.add_argument("--extra_columns",nargs="+",default=[],type=str,
                        help="Attributes (e.g. 'gene_id' to output as extra columns in extended BED format (BED output only).")
    parser.add_argument("--empty_value",default="na",type=str,
                        help="Value to use of an attribute in `extra_columns` is not defined for a particular record (Default: 'na'")
    parser.add_argument("outfile",metavar="outfile.[ bed | gtf ]",type=str,
                        help="Output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""
    extra_cols = args.extra_columns
    if extra_cols is not None:
        if args.output_format == "BED":
            # avoid name clashes with reserved BED12 column names
            names_used = copy.copy(BED12_RESERVED_NAMES)
            asql_names = [fix_name(X,names_used) for X in extra_cols]
            autosql_str = "\n".join(AUTOSQL_ROW_FMT_STR % (X," "*max(15-len(X),2)) for X in asql_names)

            file_info = {
                "outbase" : args.outfile.replace(".bed","").replace(".gtf",""),
                "numcols" : len(extra_cols),
                "autosql" : DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]),autosql_str),
            }
            end_message = MAKE_BIGBED_MESSAGE % file_info
        elif len(extra_cols) > 0:
            # Only warn when the user actually asked for extra columns;
            # the default is an empty list, which is never `None`.
            warn("`--extra_columns` is ignored for %s-formatted output." % (args.output_format),ArgumentWarning)

    with argsopener(args.outfile,args,"w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args,printer=printer)

        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(transcript.as_bed(extra_columns=extra_cols,empty_value=args.empty_value))
            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1

    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)
def main(argv=sys.argv[1:]):
    """Command-line program

    Extracts every unique exon-exon junction from the multi-exon transcripts
    of an annotation and writes each as a two-segment chain to
    ``OUTBASE.bed``. With ``--export_tophat``, junctions are additionally
    written to ``OUTBASE.juncs`` as tab-delimited
    `chrom, seg1.end - 1, seg2.start, strand` rows.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser(input_choices=_ANNOTATION_INPUT_CHOICES)
    annotation_file_parser = ap.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,annotation_file_parser])
    parser.add_argument("--export_tophat",default=False,action="store_true",
                        help="Export tophat `.juncs` file in addition to BED output")
    parser.add_argument("outbase",type=str,help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)

    tophat_out = None
    try:
        with argsopener("%s.bed" % args.outbase,args,"w") as bed_out:
            if args.export_tophat == True:
                tophat_out = open("%s.juncs" % args.outbase,"w")

            printer.write("params: " +" ".join(argv))
            printer.write("Detecting & comparing junctions...")

            # (chrom, strand) -> set of (donor end, acceptor start) seen so
            # far; a set keeps the uniqueness test O(1) per junction
            ex_pairs = {}

            c = 0  # junctions examined
            u = 0  # unique junctions written
            for chain in transcripts:
                if len(chain) > 1: # if multi-exon
                    chrom = chain.chrom
                    strand = chain.strand
                    ep = ex_pairs.setdefault((chrom,strand),set())

                    for i in range(0,len(chain)-1):
                        seg1 = chain[i]
                        seg2 = chain[i+1]
                        if c % 1000 == 0 and c > 0:
                            printer.write("Processed %s junctions. Found %s unique..." % (c,u) )
                        c += 1

                        key = (seg1.end,seg2.start)
                        if key not in ep:
                            ep.add(key)
                            u += 1
                            new_chain = SegmentChain(seg1,seg2)
                            bed_out.write(new_chain.as_bed())
                            if tophat_out is not None:
                                my_junc = (chrom,seg1.end-1,seg2.start,strand)
                                tophat_out.write("%s\t%s\t%s\t%s\n" % my_junc)

            printer.write("Processed %s total junctions. Found %s unique." % (c,u) )
    finally:
        # close the optional tophat file even if processing failed
        if tophat_out is not None:
            tophat_out.close()

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Tabulates, for each feature type in a GFF3 file, how many features of
    that type exist and what the types of their parents are. Output is a
    tab-delimited table with one row per feature type and one column per
    parent type, plus columns for features whose parent is unspecified, not
    found in the file, or multiple.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser()])
    parser.add_argument("--exclude",
                        nargs="+",
                        default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile",
                        metavar="infile.gff",
                        type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile",
                        metavar="outfile.txt",
                        type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    excluded = set(args.exclude)

    fin = sys.stdin if args.infile == "-" else opener(args.infile)

    feature_counts = Counter()   # feature type -> number of features
    features_with_parents = []   # features carrying a `Parent` attribute
    feature_types = {}           # feature type -> Counter of parent types
    name_type = {}               # feature name -> feature type

    printer.write("Opening %s..." % args.infile)
    c = 0

    # First pass: count features by type and remember each name's type, so
    # parent names can be resolved to types in the second pass.
    try:
        for feature in GFF3_Reader(fin, return_stopfeatures=False):
            if c % 10000 == 0:
                printer.write("Processed %s features..." % c)
            c += 1
            ftype = feature.attr["type"]
            fname = feature.get_name()
            if ftype not in excluded:
                if ftype not in feature_types:
                    feature_types[ftype] = Counter()
                feature_counts[ftype] += 1
                if fname is not None:
                    name_type[fname] = ftype
                if "Parent" in feature.attr:
                    features_with_parents.append(feature)
                else:
                    feature_types[ftype]["parent unspecified"] += 1
    finally:
        # close what we opened, but never close the process' stdin
        if fin is not sys.stdin:
            fin.close()

    printer.write("Sorting parents...")
    c = 0
    for feature in features_with_parents:
        if c % 10000 == 0:
            printer.write("Processed %s parents..." % c)
        c += 1
        pnames = feature.attr["Parent"]
        ftype = feature.attr["type"]
        if not pnames:
            # covers both an empty string and an empty list of parents
            feature_types[ftype]["parent unspecified"] += 1
        elif len(pnames) > 1:
            feature_types[ftype]["multiple parents"] += 1
        else:
            ptype = name_type.get(pnames[0], "parent not in database")
            feature_types[ftype][ptype] += 1

    rows = sorted(feature_types.keys())
    cols = rows + [
        "parent unspecified", "parent not in database", "multiple parents"
    ]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        header = "#feature_type\tcount\t" + "\t".join(cols) + "\n"
        fh.write(header)
        for r in rows:
            sout = "%s\t%s" % (r, feature_counts[r])
            for i in cols:
                sout += "\t%s" % feature_types[r].get(i, 0)
            fh.write("%s\n" % sout)

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Estimates a P-site offset for each read length from metagene profiles
    around start codons (reads mapped at their 5' ends), then writes:

    ``OUTBASE_metagene_profiles.txt``
        Metagene profile for each read length

    ``OUTBASE_p_offsets.txt``
        Table of read length and estimated P-site offset, ending with a
        `default` row

    ``OUTBASE_p_offsets.FIGFORMAT``
        Plot of all profiles, stacked, with estimated offsets annotated

    With ``--keep``, raw counts, normalized counts, and masks for each read
    length are also saved as gzipped text.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AlignmentParser(allow_mapping=False,input_choices=["BAM"],
                         disabled=["normalize","big_genome",])
    bp = BaseParser()
    alignment_file_parser = ap.get_parser()
    base_parser = bp.get_parser()

    pp = PlottingParser()
    plotting_parser = pp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[base_parser,
                                              alignment_file_parser,
                                              plotting_parser])

    parser.add_argument("--min_counts",type=int,default=10,metavar="N",
                        help="Minimum counts required in normalization region "+
                             "to be included in metagene average (Default: 10)")
    parser.add_argument("--normalize_over",type=int,nargs=2,metavar="N",
                        default=None,
                        help="Portion of each window against which its individual raw count profile"+
                             " will be normalized. Specify two integers, in nucleotide"+
                             " distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)")
    parser.add_argument("--norm_region",type=int,nargs=2,metavar="N",
                        default=None,
                        help="Deprecated. Use ``--normalize_over`` instead. "+
                             "Formerly, Portion of each window against which its individual raw count profile"+
                             " will be normalized. Specify two integers, in nucleotide"+
                             " distance, from 5\' end of window. (Default: 70 100)")
    parser.add_argument("--require_upstream",default=False,action="store_true",
                        help="If supplied, the P-site offset is taken to be the distance "+
                             "between the largest peak upstream of the start codon and "+
                             "the start codon itself. Otherwise, the P-site offset is taken "+
                             "to be the distance between the largest peak in the entire ROI "+
                             "and the start codon. Ignored if ``--constrain`` is used."
                        )
    parser.add_argument("--constrain",type=int,nargs=2,default=None,metavar="X",
                        help="Constrain P-site offset to be between specified distance from "+
                             "start codon. Useful for noisy data. "+
                             "(Reasonable set: 10 15; default: not constrained)")
    parser.add_argument("--aggregate",default=False,action="store_true",
                        help="Estimate P-site from aggregate reads at each position, instead "+
                             "of median normalized read density. Noisier, but helpful for "+
                             "lower-count data or read lengths with few counts. (Default: False)"
                        )
    parser.add_argument("--keep",default=False,action="store_true",
                        help="Save intermediate count files. Useful for additional computations (Default: False)")
    parser.add_argument("--default",type=int,default=13,
                        help="Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)")

    parser.add_argument("roi_file",type=str,
                        help="ROI file surrounding start codons, from ``metagene generate`` subprogram")
    parser.add_argument("outbase",type=str,help="Basename for output files")

    # set manual options
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    # set defaults: raw 5' end mapping, so offsets are measured from read ends
    args.mapping = "fiveprime"
    args.offset = 0
    args.nibble = 0

    # process arguments
    min_len = args.min_length
    max_len = args.max_length
    profiles = max_len + 1 - min_len
    lengths = list(range(min_len,max_len+1))
    outbase = args.outbase
    title = "Fiveprime read offsets by length" if args.title is None else args.title

    pp.set_style_from_args(args)
    colors = pp.get_colors_from_args(args,profiles)

    printer.write("Opening ROI file %s ..." % args.roi_file)
    with opener(args.roi_file) as roi_fh:
        roi_table = pd.read_table(roi_fh,sep="\t",comment="#",index_col=None,header=0)

    printer.write("Opening count files %s ..." % ",".join(args.count_files))
    ga = ap.get_genome_array_from_args(args,printer=printer)

    # Remove default size filters. Snapshot the keys first: removing entries
    # while iterating a live dict view raises RuntimeError on Python 3.
    my_filters = list(ga._filters.keys())
    for f in my_filters:
        ga.remove_filter(f)

    norm_start, norm_end = _get_norm_region(roi_table,args)

    # count
    count_dict, norm_count_dict, metagene_profile = do_count(roi_table,
                                                             ga,
                                                             norm_start,
                                                             norm_end,
                                                             args.min_counts,
                                                             min_len,
                                                             max_len,
                                                             aggregate=args.aggregate,
                                                             printer=printer)

    # save counts
    profile_fn = "%s_metagene_profiles.txt" % outbase
    with argsopener(profile_fn,args,"w") as metagene_out:
        metagene_profile.to_csv(metagene_out,
                                sep="\t",
                                header=True,
                                index=False,
                                na_rep="nan",
                                columns=["x"]+["%s-mers" % X for X in lengths])

    if args.keep == True:
        printer.write("Saving raw and normalized counts ...")
        for k in count_dict:
            count_fn     = "%s_%s_rawcounts.txt.gz"  % (outbase,k)
            normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase,k)
            mask_fn      = "%s_%s_mask.txt.gz" % (outbase,k)
            numpy.savetxt(count_fn,count_dict[k],delimiter="\t")
            numpy.savetxt(normcount_fn,norm_count_dict[k],delimiter="\t")
            numpy.savetxt(mask_fn,norm_count_dict[k].mask,delimiter="\t")

    # plotting & offsets
    printer.write("Plotting and determining offsets ...")
    offset_dict = OrderedDict()

    # Determine scaling factor for plotting metagene profiles
    max_y = numpy.nan
    with warnings.catch_warnings():
        # ignore warnings for slices that contain only NaNs
        warnings.simplefilter("ignore",category=RuntimeWarning)
        for k in lengths:
            max_y = numpy.nanmax([max_y,
                                  numpy.nanmax(metagene_profile["%s-mers"% k].values)])

    if numpy.isnan(max_y) or max_y == 0:
        max_y = 1.0

    # parse arguments & set styles
    mplrc = matplotlib.rcParams
    plt_incr = 1.2  # vertical spacing between stacked profiles

    # use this figsize if not specified on command line
    figheight = 1.0 + 0.25*(profiles-1) + 0.75*(profiles)
    default_figsize = (7.5,figheight)

    fig = pp.get_figure_from_args(args,figsize=default_figsize)

    ax = plt.gca()
    plt.title(title)
    plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
    if args.aggregate == True:
        plt.ylabel("Aggregate read counts (au)")
    else:
        plt.ylabel("Median normalized read density (au)")

    plt.axvline(0.0,color=mplrc["axes.edgecolor"],dashes=[3,2])

    x = metagene_profile["x"].values
    xmin = x.min()
    xmax = x.max()

    # `mask` is True at positions EXCLUDED from the peak search
    if args.constrain is not None:
        mask = numpy.tile(True,len(x))

        zp = (x==0).argmax()  # index of the start codon position
        l,r = args.constrain
        if l == r:
            warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." % l,ArgumentWarning)

        mindist = min(l,r)
        maxdist = max(l,r)

        mask[zp-maxdist:zp-mindist+1] = False
    elif args.require_upstream == True:
        mask = x >= 0
    else:
        mask = numpy.tile(False,len(x))

    for n,k in enumerate(lengths):
        color = colors[n]
        baseline = plt_incr*n
        y = metagene_profile["%s-mers" % k].values
        ymask = numpy.ma.MaskedArray(y,mask=mask)

        if numpy.isnan(y).all():
            plot_y = numpy.zeros_like(x)
        else:
            if args.aggregate == False:
                plot_y = y / max_y
            else:
                plot_y = y.astype(float) / numpy.nanmax(y) * 0.9

        # plot metagene profiles on common scale, offset by baseline from bottom to top
        ax.plot(x,baseline + plot_y,color=color)
        ax.text(xmin,baseline,"%s-mers" % k,
                ha="left",
                va="bottom",
                color=color,
                transform=matplotlib.transforms.offset_copy(ax.transData,fig,
                                                            x=6.0,y=3.0,units="points"))

        ymax = baseline + numpy.nanmax(plot_y)

        # if all valid positions are nan, or if all valid positions are <= 0
        if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
            offset = args.default
            usedefault = True
        else:
            offset = -x[numpy.ma.argmax(ymask)]
            usedefault = False

        offset_dict[k] = offset
        if usedefault == False:
            yadj = ymax - 0.2 * plt_incr

            ax.plot([-offset,0],[yadj,yadj],color=color,dashes=[3,2])
            ax.text(-offset / 2.0,
                    yadj,
                    "%s nt" % (offset),
                    color=color,
                    ha="center",
                    va="bottom",
                    transform=matplotlib.transforms.offset_copy(ax.transData,fig,
                                                                x=0.0,y=3.0,units="points")
                    )

    plt.xlim(xmin,xmax)
    plt.ylim(-0.1,plt_incr+baseline)
    ax.yaxis.set_ticks([])

    # save data as p-site offset table
    fn = "%s_p_offsets.txt" % outbase
    with argsopener(fn,args) as fout:
        printer.write("Writing offset table to %s ..." % fn)
        fout.write("length\tp_offset\n")
        for k in offset_dict:
            fout.write("%s\t%s\n" % (k,offset_dict[k]))

        fout.write("default\t%s" % args.default)

    # save plot
    plot_fn ="%s_p_offsets.%s" % (outbase,args.figformat)
    printer.write("Saving plot to %s ..." % plot_fn)
    plt.savefig(plot_fn,dpi=args.dpi,bbox_inches="tight")

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Estimates sub-codon phasing for each read length by summing read counts
    at each of the three codon positions over coding regions, excluding
    codons near the start and end of each region. Writes a phasing table to
    ``OUTBASE_phasing.txt`` and a plot to ``OUTBASE_phasing.FIGFORMAT``.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(disabled=["normalize","big_genome","spliced_bowtie_files"],
                         input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()
    plotting_parser = pp.get_parser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     conflict_handler="resolve",
                                     parents=[base_parser,
                                              annotation_file_parser,
                                              alignment_file_parser,
                                              plotting_parser])

    parser.add_argument("roi_file",type=str,nargs="?",default=None,
                        help="Optional. ROI file of maximal spanning windows surrounding start codons, "+
                             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "+
                             "prevents double-counting of codons when multiple transcript isoforms exist "+
                             "for a gene. See the documentation for `metagene` for more info about ROI files."+
                             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",type=str,help="Required. Basename for output files")
    parser.add_argument("--codon_buffer",type=int,default=5,
                        help="Codons before and after start codon to ignore (Default: 5)")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args,printer=printer)

    read_lengths = list(range(args.min_length,args.max_length+1))
    codon_buffer = args.codon_buffer

    dtmp = { "read_length"   : numpy.array(read_lengths),
             "reads_counted" : numpy.zeros_like(read_lengths,dtype=int),
             }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn("If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'" % (args.roi_file,
                                                                                                                            ", ".join(args.annotation_files)),
                          ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write("Either an ROI file or at least annotation file must be given.")
            sys.exit(1)
        else:
            warnings.warn("Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                          ArgumentWarning)
            regions = an.get_transcripts_from_args(args,printer=printer)
            back_buffer = -codon_buffer
            transform_fn = lambda x: x.get_cds()

    # read length -> summed counts at codon positions 0, 1, 2
    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)

    # Tracked separately from the loop index so the final tally is defined
    # even when there are no regions at all.
    n_rois = 0
    for n, roi in enumerate(regions):
        n_rois = n + 1
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)

        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)

        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict     = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k]     = []
                count_vectors[k] = []

            # for each seg, fetch reads, sort them, and create individual count vectors
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict,reads):
                    read_dict[len(read.positions)].append(read)

                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(gnd.map_fn(read_dict[read_length],seg)[1])
                    count_vectors[read_length].extend(count_vector)

            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]

                if len(counts) % 3 == 0:
                    # floor division: reshape() rejects a float dimension,
                    # which true division yields on Python 3
                    counts = counts.reshape((len(counts)//3,3))
                else:
                    if using_roi == False:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (roi.get_name(),len(counts))
                        warnings.warn(message,DataWarning)
                    newlen = len(counts)//3
                    counts = counts[:3*newlen]
                    counts = counts.reshape(newlen,3)

                # drop buffer codons at both ends, then sum per codon position
                phase_sums[k] += counts[codon_buffer:back_buffer,:].sum(0)

    printer.write("Counted %s ROIs total." % n_rois)
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])

    # total reads counted for each size
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()

    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(float) / dtmp["reads_counted"].sum()

    # phase vectors
    phase_vectors = { K : V.astype(float)/V.astype(float).sum() for K,V in phase_sums.items() }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]

    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    with argsopener(fn,args) as fh:
        dtmp.to_csv(fh,columns=["read_length",
                                "reads_counted",
                                "fraction_reads_counted",
                                "phase0",
                                "phase1",
                                "phase2",
                                ],
                    float_format="%.6f",
                    na_rep="nan",
                    sep="\t",
                    index=False,
                    header=True
                    )

    # keyword arguments for the figure; rebound to the figure object below
    fig = {}
    if args.figsize is not None:
        fig["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args,len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase,args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_,V) in sorted(phase_sums.items())])
    fig, (ax1,_) = phase_plot(plot_counts,labels=read_lengths,lighten_by=0.3,
                              cmap=None,color=colors,fig=fig)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn,dpi=args.dpi,bbox_inches="tight")
def main(argv=sys.argv[1:]):
    """Command-line program

    Sorts discovered splice junctions into four `BED`_ files according to
    what lies within their range of equal sequence support:

    ``OUTBASE_repetitive.bed``
        Junctions for which :func:`covered_by_repetitive` is true

    ``OUTBASE_shifted_known.bed``
        Known junctions (from ``--ref``) found in range of a query junction

    ``OUTBASE_shifted_canonical.bed``
        Canonical junctions found in range, if ``--slide_canonical`` is given

    ``OUTBASE_untouched.bed``
        Junctions matching none of the above, written unchanged

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     parents=[bp.get_parser(),sp.get_parser(),mp.get_parser()],
                                     )
    parser.add_argument("--maxslide",type=int,default=10,
                        help="Maximum number of nt to search 5\' and 3\' of intron"+
                             " boundaries (Default: 10)")
    parser.add_argument("--ref",type=str,metavar="ref.bed",default=None,
                        help="Reference file describing known splice junctions")
    parser.add_argument("--slide_canonical",action="store_true",default=False,
                        help="Slide junctions to canonical junctions if present within equal support region")
    parser.add_argument("infile",type=str,metavar="input.bed",
                        help="BED file describing discovered junctions")
    parser.add_argument("outbase",type=str,
                        help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # the reader is fully consumed inside the `with`, so the handle can
        # be closed as soon as the hash is built
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)),do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables: (donor, acceptor) dinucleotides on each strand
    canonicals_plus = [("GT","AG"),
                       ("GC","AG")
                       ]

    canonicals_minus = [("CT","AC"),
                        ("CT","GC")
                        ]

    known_in_range = 0
    canonical_in_range = 0
    repetitive = 0
    untouched = 0
    c = 0

    # junctions already written; only used for membership tests, so a set
    # avoids a linear scan per junction
    seen_already = set()

    outfiles = {
        "repetitive" : "%s_repetitive.bed" % args.outbase,
        "known"      : "%s_shifted_known.bed" % args.outbase,
        "canonical"  : "%s_shifted_canonical.bed" % args.outbase,
        "untouched"  : "%s_untouched.bed" % args.outbase,
    }
    outfiles = { K : argsopener(V,args,"w") for K,V in outfiles.items() }

    junction_fh = None
    try:
        # process data
        printer.write("Opening junctions from %s..." % args.infile)
        junction_fh = opener(args.infile)
        for ivc in BED_Reader(CommentReader(junction_fh)):
            processed = False
            tup = None

            if c % 1000 == 0 and c > 0:
                printer.write("Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                              (c, known_in_range, canonical_in_range, repetitive, untouched))

            # NOTE(review): input validation via `assert` is stripped under
            # `python -O`; kept to preserve the exception type callers see.
            assert len(ivc) == 2
            strand = ivc.strand

            minus_range, plus_range = find_match_range(ivc,genome,args.maxslide)

            # see if either end of splice junction +- match_range lands in repetitive areas of genome
            if covered_by_repetitive(ivc,minus_range,plus_range,cross_hash):
                repetitive += 1
                outfiles["repetitive"].write(ivc.as_bed())
                processed = True

            # see if one or more known junctions in range
            if processed == False and args.ref is not None:
                known_juncs = find_known_in_range(ivc,minus_range,plus_range,known_hash.get_nearby_features(ivc))
                if len(known_juncs) > 0:
                    known_in_range += 1
                    for my_known in known_juncs:
                        tup = get_junction_tuple(my_known)
                        if tup not in seen_already:
                            outfiles["known"].write(my_known.as_bed())
                            seen_already.add(tup)

                    processed = True

            # see if one or more canonical junctions in range
            if processed == False and args.slide_canonical == True:
                canonicals = canonicals_plus if strand == "+" else canonicals_minus
                canonical_juncs = find_canonicals_in_range(ivc,minus_range,plus_range,genome,canonicals)
                if len(canonical_juncs) > 0:
                    canonical_in_range += 1
                    for can in canonical_juncs:
                        tup = get_junction_tuple(can)
                        if tup not in seen_already:
                            outfiles["canonical"].write(can.as_bed())
                            seen_already.add(tup)

                    processed = True

            if processed == False:
                outfiles["untouched"].write(ivc.as_bed())
                untouched += 1

            c += 1

        # save output
        printer.write("Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s" % \
                      (c, known_in_range, canonical_in_range, repetitive, untouched))
    finally:
        # close every handle we opened, even if processing failed midway
        if junction_fh is not None:
            junction_fh.close()
        for v in outfiles.values():
            v.close()

    printer.write("Done.")
def do_chart(args, plot_parser):
    """Produce a set of charts comparing multiple samples pairwise.

    For every pair of samples and every requested region/metric combination
    an MA plot and a scatter plot are saved. Summary statistics are then
    exported to ``OUTBASE_bigtable.txt`` (whole-dataset statistics) and
    ``OUTBASE_bintable.txt`` (statistics per count bin), and the Spearman
    correlation coefficients are plotted bin-by-bin.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``chart`` subprogram

    plot_parser : object
        Plotting helper; used here for ``set_style_from_args``,
        ``get_colors_from_args`` and ``get_figure_from_args``
    """
    plot_parser.set_style_from_args(args)

    outbase = args.outbase
    bins = numpy.array(args.bins)
    figformat = args.figformat

    # read input files
    printer.write("Reading input files: %s ..." % ", ".join(args.infiles))
    samples = {
        get_short_samplename(X): read_count_file(opener(X), args.list_of_regions)
        for X in args.infiles
    }

    # Define some variables for later
    sample_names = sorted(samples.keys())
    comparisons = [
        X for X in itertools.combinations(sample_names, 2) if X[0] != X[1]
    ]
    colors = plot_parser.get_colors_from_args(args, len(comparisons))
    binkeys = tuple(["%s_reads" % k for k in args.regions])

    comparison_labels = sorted(["%s_vs_%s" % (X, Y) for X, Y in comparisons])
    bigtable = {}.fromkeys(comparison_labels)
    corrcoef_by_bin_table = {}.fromkeys(comparison_labels)

    # ki, kj = names of samples i and j
    # vi, vj = data of samples i and j
    for ki, kj in comparisons:
        # Explicit check rather than `assert`, so it still runs under `-O`.
        # A mismatch is reported but processing continues, as before.
        if not (samples[ki]["region"] == samples[kj]["region"]).all():
            printer.write(
                "Mismatched line entries for samples %s and %s. Were different gene lists used for counting?"
                % (ki, kj))

        vi = samples[ki]
        vj = samples[kj]
        printer.write("Comparing %s to %s:" % (ki, kj))

        label = "%s_vs_%s" % (ki, kj)

        bigtable[label] = {}
        corrcoef_by_bin_table[label] = {}

        for binkey in binkeys:
            bigtable[label][binkey] = {K: {} for K in args.metrics}
            corrcoef_by_bin_table[label][binkey] = {K: {} for K in args.metrics}

        for region in args.regions:
            region_counts = "%s_reads" % region
            count_mask = (vi[region_counts].values + vj[region_counts].values >= 128)

            for metric in args.metrics:
                region_metric = "%s_%s" % (region, metric)
                printer.write(" -%s across %s for all >=128 ..." % (metric, region))

                # divide into regions >=128 and <128 and plot
                viover = vi[region_metric][count_mask].values
                vjover = vj[region_metric][count_mask].values

                viunder = vi[region_metric][~count_mask].values
                vjunder = vj[region_metric][~count_mask].values

                pearsonr = scipy.stats.pearsonr(viover, vjover)[0]
                spearmanr = scipy.stats.spearmanr(viover, vjover)[0]

                # log2 fold change stats
                log2_ratios = numpy.log2(numpy.ma.masked_invalid(vjover / viover))
                log2_mean = log2_ratios.mean()
                log2_std = log2_ratios.std()
                min_diff = 2**(3 * log2_std)
                num_genes_log2 = (~log2_ratios.mask).sum()

                # ma plot
                try:
                    ma_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    plot_parser.get_figure_from_args(args)
                    _, axdict = ma_plot(viunder,
                                        vjunder,
                                        color=process_black,
                                        label="< 128 counts",
                                        axes=plt.gca(),
                                        kdalpha=0.2,
                                        title=ma_title)
                    ma_plot(viover,
                            vjover,
                            axes=axdict,
                            label=">= 128 counts",
                            kdalpha=0.8)

                    mainax = axdict["main"]
                    mainax.set_xlabel("%s %s" % (ki, metric))
                    mainax.legend(loc="upper right", frameon=False)

                    text_kwargs = {
                        "horizontalalignment": "right",
                        "verticalalignment": "baseline",
                        "linespacing": 1.6,
                        "transform": mainax.transAxes,
                    }

                    plot_text = "\n".join([
                        "sample mean: %0.3e" % log2_mean,
                        "sample stdev: %0.3e" % log2_std,
                        "3-sigma fold change: %0.2f" % min_diff,
                        "regions_counted: %s" % count_mask.sum(),
                    ])

                    mainax.text(0.96, 0.04, plot_text, **text_kwargs)

                    plt.savefig("%s_ma_%s_%s_%s.%s" %
                                (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except Exception as e:
                    # Python 3 exceptions have no `.message`; format the
                    # exception itself so the handler cannot raise.
                    warnings.warn(
                        "Could not make MA plot for samples %s and %s. Error message:\n %s"
                        % (ki, kj, e), DataWarning)

                # scatter plot
                try:
                    scatter_title = "%s vs %s (%s %s)" % (ki, kj, region, metric)
                    do_scatter(vi[region_metric].values,
                               vj[region_metric].values,
                               count_mask,
                               plot_parser,
                               args,
                               pearsonr=pearsonr,
                               xlabel=ki,
                               ylabel=kj,
                               title=scatter_title)

                    plt.savefig("%s_scatter_%s_%s_%s.%s" %
                                (outbase, label, region, metric, figformat),
                                bbox_inches="tight")
                    plt.close()
                except ValueError as e:
                    warnings.warn(
                        "Could not make scatter plot for samples %s and %s. Error message:\n %s"
                        % (ki, kj, e), DataWarning)

                # TODO: make these tables into dataframes. this scheme is insane

                # add entries to bigtable for export later
                bigtable[label][region_counts][metric].update({
                    "pearsonr": pearsonr,
                    "spearmanr": spearmanr,
                    "log2_mean": log2_mean,
                    "log2_std": log2_std,
                    "2**3sigma": min_diff,
                    "num_genes_128plus": count_mask.sum(),
                    "num_genes_log2": num_genes_log2,
                })

                # do bin-by-bin counting
                printer.write(" -%s across %s by bin ..." % (metric, region))
                bin_masks = get_bin_mask_by_summed_key(vi,
                                                       vj,
                                                       bins=args.bins,
                                                       key=region_counts)
                for my_bin, bin_mask in sorted(bin_masks.items()):
                    bin_vec_i = vi[region_metric][bin_mask]
                    bin_vec_j = vj[region_metric][bin_mask]

                    # make sure there are genes in bin before attempting calculations
                    if len(bin_vec_i) > 0:
                        nonzero_binmask = get_nonzero_either_mask(bin_vec_i, bin_vec_j)
                        bin_vi_log2 = bin_vec_i[nonzero_binmask]
                        bin_vj_log2 = bin_vec_j[nonzero_binmask]
                        my_logs = numpy.log2(bin_vj_log2 / bin_vi_log2)
                        my_logmean = numpy.mean(my_logs)
                        my_logstd = numpy.std(my_logs)
                        if len(bin_vec_i) > 2:
                            my_pearsonr = scipy.stats.pearsonr(bin_vec_i, bin_vec_j)[0]
                            my_spearmanr = scipy.stats.spearmanr(bin_vec_i, bin_vec_j)[0]
                        else:
                            my_spearmanr = numpy.nan
                            my_pearsonr = numpy.nan
                    else:
                        # fill with dummy values; the mask must be reset too,
                        # otherwise `log2_genes_in_bin` below would reuse the
                        # previous bin's mask (or fail on the first bin)
                        nonzero_binmask = numpy.array([], dtype=bool)
                        my_logmean = numpy.nan
                        my_logstd = numpy.nan
                        my_spearmanr = numpy.nan
                        my_pearsonr = numpy.nan

                    corrcoef_by_bin_table[label][region_counts][metric][my_bin] = {
                        "pearsonr": my_pearsonr,
                        "spearmanr": my_spearmanr,
                        "genes_in_bin": sum(bin_mask),
                        "log2_genes_in_bin": sum(nonzero_binmask),
                        "log2_mean": my_logmean,
                        "log2_std": my_logstd,
                    }

    # export big (non-binned) table
    printer.write("Writing tables ...")

    bigtable_out = argsopener("%s_bigtable.txt" % args.outbase, args, "w")
    stats = (
        "num_genes_128plus",  # 0
        "pearsonr",  # 1
        "spearmanr",  # 2
        "num_genes_log2",  # 3
        "log2_mean",  # 4
        "log2_std",  # 5
        "2**3sigma",  # 6
    )

    header = ["#region", "metric", "statistic"]
    header += [X for X in comparison_labels]

    bigtable_out.write("\t".join(header) + "\n")
    for region in binkeys:
        for metric in args.metrics:
            for stat in stats:
                ltmp = [region, metric, stat]
                for key in comparison_labels:
                    ltmp.append(bigtable[key][region][metric][stat])
                ltmp = [str(X) for X in ltmp]
                bigtable_out.write("\t".join(ltmp) + "\n")

    bigtable_out.close()

    # export binned data table and make bin-by-bin plots
    bintable_out = argsopener("%s_bintable.txt" % args.outbase, args, "w")

    region_metrics = [
        "%s_%s" % (X, Y) for X in args.regions for Y in args.metrics
    ]
    # empty strings produce blank spacer columns in the exported table
    stats = [
        "genes_in_bin",  # 0
        "pearsonr",  # 1
        "spearmanr",  # 2
        "",  # 3
        "log2_genes_in_bin",  # 4
        "log2_mean",  # 5
        "log2_std",  # 6
        "",  # 7
    ]

    for region_metric in region_metrics:
        plt.close()
        plot_parser.get_figure_from_args(args)
        # NOTE(review): `basex` is the keyword older matplotlib releases
        # accept here; confirm against the matplotlib version in use.
        plt.semilogx(basex=2)
        plt.title("Correlation coefficients by bin for %s" % region_metric)
        plt.xlabel("Bin")
        plt.ylabel("Spearman rho")

        region, metric = region_metric.split("_")
        region_counts = "%s_reads" % region
        bintable_out.write("%s\n\n" % region_metric)

        try:
            for n, label in enumerate(comparison_labels):
                corrcoefs = []
                bintable_out.write("%s\t\t\t\t\t\t\t\t" % label)
                bintable_out.write("\n")
                bintable_out.write("bin\t" + "\t".join(stats) + "\n")
                for my_bin in bins:
                    bin_stats = corrcoef_by_bin_table[label][region_counts][metric][my_bin]
                    ltmp = [my_bin]
                    for stat in stats:
                        ltmp.append(bin_stats.get(stat, ""))
                    ltmp = [str(X) for X in ltmp]
                    bintable_out.write("\t".join(ltmp) + "\n\n")
                    corrcoefs.append(bin_stats["spearmanr"])

                corrcoefs = ma.masked_invalid(corrcoefs)
                plt.plot(bins[~corrcoefs.mask],
                         corrcoefs[~corrcoefs.mask],
                         label=label,
                         color=colors[n])

            plt.legend(loc="lower right")
            plt.savefig("%s_corrcoef_by_bin_%s_%s.%s" %
                        (outbase, region_metric, label, figformat),
                        bbox_inches="tight")
        except Exception:
            warnings.warn(
                "Could not plot correlation-by-bin plot for '%s', '%s'" %
                (outbase, region_metric), DataWarning)

        bintable_out.write("\n\n")

    bintable_out.close()

    printer.write("Done.")
def do_generate(args, annotation_parser, mask_parser):
    """Generate gene position files from gene annotations.

     1. Genes whose transcripts share exons are first collapsed into merged
        genes.

     2. Within merged genes, all positions are classified. All positions are
        included in a set called *exon*. All positions that appear as coding
        regions in all transcripts (i.e. are never part of a 5'UTR or 3'UTR)
        included in a set called *CDS*. Similarly, all positions that appear
        as 5' UTR or 3' UTR in all transcripts are included in sets called
        *UTR5* or *UTR3*, respectively.

     3. Genomic positions that are overlapped by multiple merged genes are
        excluded from the position sets for those genes.

     4. If a :term:`mask file` is supplied, positions annotated in the mask
        file are also excluded

     5. Output is given as a series of `BED`_ files and a `positions` file
        containing the same data.

    Parameters
    ----------
    args : :py:class:`argparse.Namespace`
        command-line arguments for ``generate`` subprogram

    annotation_parser : object
        Supplies the transcript iterator via ``get_transcripts_from_args``

    mask_parser : object
        Supplies the mask lookup via ``get_genome_hash_from_args``
    """
    # variables for transcript <-> merged gene mapping
    transcripts = {}
    merged_genes = {}

    # data table for merged genes
    gene_table = pd.DataFrame({
        "region": [],
        "transcript_ids": [],
        "exon_unmasked": [],
        "exon": [],
        "masked": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked_bed": [],
    })

    # data table for transcripts
    transcript_table = pd.DataFrame({
        "region": [],
        "exon": [],
        "utr5": [],
        "cds": [],
        "utr3": [],
        "exon_bed": [],
        "utr5_bed": [],
        "cds_bed": [],
        "utr3_bed": [],
        "masked": [],
        "exon_unmasked": [],
        "transcript_ids": [],
        "masked_bed": [],
    })

    # input counts as sorted if the user says so, or if the format implies it
    is_sorted = bool(args.sorted or args.tabix or args.annotation_format == "BigBed")

    # Built from adjacent literals so that no whitespace post-processing is
    # needed; stripping characters from the text afterwards is what used to
    # squash the words of this message together.
    annotation_message = (
        "`cs` relies upon relationships between transcripts and genes "
        "to collapse transcripts to genes for quantitation. "
        "Gene-transcript relationships are not generally preserved in BED "
        "or BigBed files, and a `gene_id` column could not be found in the "
        "input data. This may yield nonsensical results in the output. "
        "Consider either (1) using a GTF2 or GFF3 file or (2) creating an "
        "extended BED or BigBed file with a `gene_id` column."
    )

    if args.annotation_format == "BED":
        if not isinstance(args.bed_extra_columns, list) \
                or 'gene_id' not in args.bed_extra_columns:
            warnings.warn(annotation_message, FileFormatWarning)
    elif args.annotation_format == "BigBed":
        reader = BigBedReader(args.annotation_files[0])
        if 'gene_id' not in reader.extension_fields:
            warnings.warn(annotation_message, FileFormatWarning)

    source = annotation_parser.get_transcripts_from_args(args, printer=printer)
    mask_hash = mask_parser.get_genome_hash_from_args(args)

    # loop conditions
    last_chrom = None
    do_loop = True

    # to save memory, we process one chromosome at a time if input file is sorted
    # knowing that at that moment all transcript parts are assembled
    while do_loop:
        try:
            tx = next(source)
        except StopIteration:
            # EOF: `tx` keeps its previous value (or stays unbound if the
            # source was empty) and the final group is flushed below
            do_loop = False

        try:
            # if chromosome is completely processed or EOF
            if (is_sorted and tx.spanning_segment.chrom != last_chrom) or not do_loop:
                if do_loop:
                    # `tx` belongs to the next chromosome: push it back so it
                    # is stored on the following iteration
                    source = itertools.chain([tx], source)

                if last_chrom is not None or not do_loop:
                    printer.write("Merging genes on chromosome/contig '%s'" % last_chrom)
                    my_gene_table, my_transcript_table, my_merged_genes = process_partial_group(
                        transcripts, mask_hash, printer)
                    gene_table = pd.concat((gene_table, my_gene_table), axis=0)
                    transcript_table = pd.concat(
                        (transcript_table, my_transcript_table), axis=0)
                    merged_genes.update(my_merged_genes)

                    # release the finished chromosome before starting the next
                    del transcripts
                    gc.collect()
                    del gc.garbage[:]
                    transcripts = {}

                # reset last chrom
                last_chrom = tx.spanning_segment.chrom

            # otherwise, remember transcript
            else:
                transcripts[tx.get_name()] = tx

        # exit gracefully if no transcripts found (`tx` was never assigned)
        except UnboundLocalError:
            pass

    # write output
    printer.write("Writing output ...")

    merged_fn = "%s_merged.txt" % args.outbase
    number_merged = len(set(merged_genes.values()))
    printer.write("Collapsed %s genes to %s merged groups. Writing to %s" %
                  (len(merged_genes), number_merged, merged_fn))
    fout = argsopener(merged_fn, args, "w")
    for gene, merged_name in sorted(merged_genes.items()):
        fout.write("%s\t%s\n" % (gene, merged_name))

    fout.close()

    printer.write("Writing gene table and BED files ...")
    write_output_files(gene_table, "gene", args)
    printer.write("Writing transcript summary table and BED files ...")
    write_output_files(transcript_table, "transcript", args)
    printer.write("Done!")
def chrom_worker(chrom_seq, args=None):
    """Align the k-mers of one chromosome and write its multimappers as BED.

    Unless k-mers are supplied (``args.have_kmers``), reads are first
    simulated from the sequence into a FASTA file. Bowtie is then run on the
    k-mers; if it produced a multimapper file, that file is converted to
    ``<base>_crossmap.bed``. Temporary files are removed afterwards.

    Parameters
    ----------
    chrom_seq : tuple
        ``(name, seq_or_kmers)``: chromosome name, and either the sequence to
        simulate reads from or, when ``args.have_kmers`` is set, the name of
        an existing k-mer file

    args : :py:class:`argparse.Namespace`
        Command-line arguments. Despite the ``None`` default, a namespace is
        required: its attributes are read unconditionally.

    Returns
    -------
    str
        Name of the BED file for this chromosome. It is returned even when
        alignment failed and the file was not written.
    """
    name, seq_or_kmers = chrom_seq
    printer.write("Processing chromosome %s..." % name)
    base = "%s_%s_%s_%s" % (args.outbase, args.read_length, args.mismatches, name)
    kmer_file = "%s_kmers.fa" % base
    toomany_file = "%s_multimap.fa" % base
    bed_file = "%s_crossmap.bed" % base

    if not args.have_kmers:
        # only simulate reads when k-mers were not supplied by the caller
        with open(kmer_file, "w") as kmer:
            simulate_reads(seq_or_kmers, kmer, args.read_length)
    else:
        kmer_file = seq_or_kmers

    argdict = {
        "mismatches": args.mismatches,
        "processors": 1,
        "bowtie": args.bowtie,
        "toomany": toomany_file,
        "kmers": kmer_file,
        "ebwt": args.ebwt,
        "null": os.devnull,
    }

    # NOTE(review): the command is a shell string (needed for the `>`
    # redirect), so file names containing shell metacharacters will break it.
    cmd = "%(bowtie)s -m1 -a --best -f -v %(mismatches)s -p %(processors)s %(ebwt)s %(kmers)s --max %(toomany)s >%(null)s" % argdict

    printer.write("Aligning %s-mers for chromosome '%s' :\n\t'%s'" %
                  (args.read_length, name, cmd))
    try:
        retcode = subprocess.call(cmd, shell=True)
        if retcode < 0 or retcode == 2:
            printer.write(
                "Alignment for chromosome '%s' terminated with status %s" %
                (name, retcode))
        elif os.path.exists(toomany_file):
            printer.write(
                "Assembling multimappers from chromosome '%s' into crossmap..."
                % name)
            # both handles are closed by the context manager, including the
            # multimapper input, which was previously left open
            with argsopener(bed_file, args, "w") as bed_out, \
                    open(toomany_file) as multimappers:
                for plus_chain, minus_chain in fa_to_bed(multimappers,
                                                         args.read_length,
                                                         offset=args.offset):
                    bed_out.write(plus_chain.as_bed())
                    bed_out.write(minus_chain.as_bed())
        else:
            printer.write("Could not find multimapper source file '%s' ." %
                          toomany_file)
    except OSError as e:
        printer.write("Alignment failed for chromosome '%s': %s" % (name, e))

    printer.write("Cleaning up chromosome '%s'..." % name)
    # The multimapper file is absent when bowtie failed or found no
    # multimappers (both reported above); removing it blindly would raise.
    if os.path.exists(toomany_file):
        os.remove(toomany_file)
    if not args.have_kmers and not args.save_kmers:
        os.remove(kmer_file)

    return bed_file
def main(argv=sys.argv[1:]):
    """Command-line program

    Loads a genome array from the alignment arguments and writes one browser
    track per strand: ``OUTBASE_fw.wig`` (forward) and ``OUTBASE_rc.wig``
    (reverse), in either bedgraph or variable-step format.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AlignmentParser()
    bp = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), ap.get_parser()],
    )
    parser.add_argument("-o", "--out", dest="outbase", type=str, required=True,
                        metavar="FILENAME",
                        help="Base name for output files")
    parser.add_argument("--window_size", default=100000, metavar="N", type=int,
                        help="Size of nucleotides to fetch at once for export. "
                             "Large values are faster but require more memory "
                             "(Default: 100000)")

    track_opts = parser.add_argument_group(title="Browser track options")
    # Adjacent literals instead of a backslash continuation inside the
    # string, which put a stray escape and padding into the help text.
    track_opts.add_argument("--color", type=str, default=None,
                            help="An RGB hex string (`'#NNNNNN'`, `N` in `[0-9,A-F]`) "
                                 "specifying the track color.")
    track_opts.add_argument("-t", "--track_name", dest="track_name", type=str,
                            help="Name to give browser track",
                            default=None)
    track_opts.add_argument("--output_format", choices=("bedgraph", "variable_step"),
                            default="bedgraph",
                            help="Format of output file (Default: bedgraph)")

    args = parser.parse_args(argv)
    gnd = ap.get_genome_array_from_args(args, printer=printer)
    bp.get_base_ops_from_args(args)

    # track name defaults to the output basename
    name = args.outbase if args.track_name is None else args.track_name

    if args.color is not None:
        fw_color = rc_color = "%s,%s,%s" % tuple(get_rgb255(args.color))
    else:
        fw_color = rc_color = "0,0,0"

    # argparse `choices` guarantees one of these two branches is taken
    if args.output_format == "bedgraph":
        outfn = gnd.to_bedgraph
    elif args.output_format == "variable_step":
        outfn = gnd.to_variable_step

    strand_tracks = (
        ("fw", "+", fw_color, "forward"),
        ("rc", "-", rc_color, "reverse"),
    )
    for suffix, strand, color, description in strand_tracks:
        track_fn = "%s_%s.wig" % (args.outbase, suffix)
        # the context manager closes the file; no explicit close() is needed
        with argsopener(track_fn, args, "w") as track_out:
            printer.write("Writing %s strand track to %s ..." % (description, track_fn))
            outfn(track_out,
                  "%s_%s" % (name, suffix),
                  strand,
                  window_size=args.window_size,
                  color=color,
                  printer=printer)

    printer.write("Done!")
def main(argv=sys.argv[1:]):
    """Command-line program

    Tallies read counts at each of the three codon positions, separately for
    every read length between ``args.min_length`` and ``args.max_length``,
    over coding regions taken from an ROI file or a transcript annotation.
    The result is saved as ``OUTBASE_phasing.txt`` and plotted to
    ``OUTBASE_phasing.<figformat>``.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    al = AlignmentParser(
        disabled=["normalize", "big_genome", "spliced_bowtie_files"],
        input_choices=["BAM"])
    an = AnnotationParser()
    pp = PlottingParser()
    bp = BaseParser()
    plotting_parser = pp.get_parser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser, annotation_file_parser, alignment_file_parser,
            plotting_parser
        ])

    parser.add_argument(
        "roi_file", type=str, nargs="?", default=None,
        help="Optional. ROI file of maximal spanning windows surrounding start codons, "
             "from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "
             "prevents double-counting of codons when multiple transcript isoforms exist "
             "for a gene. See the documentation for `metagene` for more info about ROI files. "
             "If an ROI file is not given, supply an annotation with ``--annotation_files``")
    parser.add_argument("outbase",
                        type=str,
                        help="Required. Basename for output files")
    parser.add_argument(
        "--codon_buffer",
        type=int,
        default=5,
        help="Codons before and after start codon to ignore (Default: 5)")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)
    pp.set_style_from_args(args)
    gnd = al.get_genome_array_from_args(args, printer=printer)

    read_lengths = list(range(args.min_length, args.max_length + 1))
    codon_buffer = args.codon_buffer

    dtmp = {
        "read_length": numpy.array(read_lengths),
        "reads_counted": numpy.zeros_like(read_lengths, dtype=int),
    }

    if args.roi_file is not None:
        using_roi = True
        roi_table = read_pl_table(args.roi_file)
        regions = roi_table.iterrows()
        transform_fn = roi_row_to_cds
        back_buffer = -1
        if len(args.annotation_files) > 0:
            warnings.warn(
                "If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'"
                % (args.roi_file, ", ".join(args.annotation_files)),
                ArgumentWarning)
    else:
        using_roi = False
        if len(args.annotation_files) == 0:
            printer.write(
                "Either an ROI file or at least one annotation file must be given."
            )
            sys.exit(1)
        else:
            warnings.warn(
                "Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
                ArgumentWarning)

        regions = an.get_transcripts_from_args(args, printer=printer)
        # A stop index of -0 is 0 and would select nothing, so a buffer of
        # zero codons must leave the end of the slice open instead.
        back_buffer = -codon_buffer if codon_buffer > 0 else None
        transform_fn = lambda x: x.get_cds()

    phase_sums = {}
    for k in read_lengths:
        phase_sums[k] = numpy.zeros(3)

    # so the final tally reads 0 instead of failing when there are no regions
    n = -1
    for n, roi in enumerate(regions):
        if n % 1000 == 1:
            printer.write("Counted %s ROIs ..." % n)

        # transformation needed to extract CDS from transcript or from ROI file window
        cds_part = transform_fn(roi)

        # only calculate for coding genes
        if len(cds_part) > 0:

            read_dict = {}
            count_vectors = {}
            for k in read_lengths:
                read_dict[k] = []
                count_vectors[k] = []

            # for each seg, fetch reads, sort them, and create individual count vectors
            for seg in cds_part:
                reads = gnd.get_reads(seg)
                for read in filter(lambda x: len(x.positions) in read_dict, reads):
                    read_dict[len(read.positions)].append(read)

                # map and sort by length
                for read_length in read_dict:
                    count_vector = list(gnd.map_fn(read_dict[read_length], seg)[1])
                    count_vectors[read_length].extend(count_vector)

            # add each count vector for each length to total
            for k, vec in count_vectors.items():
                counts = numpy.array(vec)
                if cds_part.strand == "-":
                    counts = counts[::-1]

                if len(counts) % 3 == 0:
                    counts = counts.reshape((int(len(counts) / 3), 3))
                else:
                    if not using_roi:
                        message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (
                            roi.get_name(), len(counts))
                        warnings.warn(message, DataWarning)
                    newlen = int(len(counts) // 3)
                    counts = counts[:3 * newlen]
                    counts = counts.reshape(newlen, 3)

                # rows are codons, columns are codon positions
                phase_sums[k] += counts[codon_buffer:back_buffer, :].sum(0)

    printer.write("Counted %s ROIs total." % (n + 1))
    for k in dtmp:
        dtmp[k] = numpy.array(dtmp[k])

    # total reads counted for each size
    for k in read_lengths:
        dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()

    # read length distribution
    dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(
        float) / dtmp["reads_counted"].sum()

    # phase vectors
    phase_vectors = {
        K: V.astype(float) / V.astype(float).sum()
        for K, V in phase_sums.items()
    }
    for i in range(3):
        dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))

    for k, vec in phase_vectors.items():
        for i in range(3):
            dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]

    # phase table
    fn = "%s_phasing.txt" % args.outbase
    printer.write("Saving phasing table to %s ..." % fn)
    dtmp = pd.DataFrame(dtmp)
    # the context manager closes the file; no explicit close() is needed
    with argsopener(fn, args) as fh:
        dtmp.to_csv(fh,
                    columns=[
                        "read_length",
                        "reads_counted",
                        "fraction_reads_counted",
                        "phase0",
                        "phase1",
                        "phase2",
                    ],
                    float_format="%.6f",
                    na_rep="nan",
                    sep="\t",
                    index=False,
                    header=True)

    # keyword arguments handed to `phase_plot` through its `fig` parameter
    fig = {}
    if args.figsize is not None:
        fig["figsize"] = tuple(args.figsize)

    colors = pp.get_colors_from_args(args, len(read_lengths))

    fn = "%s_phasing.%s" % (args.outbase, args.figformat)
    printer.write("Plotting to %s ..." % fn)
    plot_counts = numpy.vstack([V for (_, V) in sorted(phase_sums.items())])
    fig, (ax1, _) = phase_plot(plot_counts,
                               labels=read_lengths,
                               lighten_by=0.3,
                               cmap=None,
                               color=colors,
                               fig=fig)

    if args.title is not None:
        ax1.set_title(args.title)
    else:
        ax1.set_title("Phasing stats for %s" % args.outbase)

    fig.savefig(fn, dpi=args.dpi, bbox_inches="tight")
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads a GFF3 file and, for every feature type that is not excluded,
    tallies how many features of that type there are and what types their
    parents have. The tally is written as a tab-delimited table.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    base_ops = BaseParser()
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_ops.get_parser()],
    )
    parser.add_argument("--exclude", nargs="+", default=[],
                        help="Feature types to exclude from consideration")
    parser.add_argument("infile", metavar="infile.gff", type=str,
                        help="Input GFF3 file")
    parser.add_argument("outfile", metavar="outfile.txt", type=str,
                        help="Name of output file")

    args = parser.parse_args(argv)
    base_ops.get_base_ops_from_args(args)

    skipped_types = set(args.exclude)
    if args.infile == "-":
        gff_in = sys.stdin
    else:
        gff_in = opener(args.infile)

    totals_by_type = Counter()  # feature type -> number of features
    children = []               # features that carry a Parent attribute
    parent_tallies = {}         # feature type -> Counter of parent types
    type_by_name = {}           # feature name -> feature type

    # first pass: count features and remember each named feature's type
    printer.write("Opening %s..." % args.infile)
    for seen, feature in enumerate(GFF3_Reader(gff_in, return_stopfeatures=False)):
        if seen % 10000 == 0:
            printer.write("Processed %s features..." % seen)

        ftype = feature.attr["type"]
        fname = feature.get_name()
        if ftype in skipped_types:
            continue

        parent_tallies.setdefault(ftype, Counter())
        totals_by_type[ftype] += 1
        if fname is not None:
            type_by_name[fname] = ftype

        if "Parent" in feature.attr:
            children.append(feature)
        else:
            parent_tallies[ftype]["parent unspecified"] += 1

    # second pass: resolve parents now that every feature's type is known
    printer.write("Sorting parents...")
    for seen, feature in enumerate(children):
        if seen % 10000 == 0:
            printer.write("Processed %s parents..." % seen)

        pnames = feature.attr["Parent"]
        tally = parent_tallies[feature.attr["type"]]
        if pnames == "":
            tally["parent unspecified"] += 1
        elif len(pnames) > 1:
            tally["multiple parents"] += 1
        else:
            tally[type_by_name.get(pnames[0], "parent not in database")] += 1

    # one row per feature type; one column per possible parent category
    rows = sorted(parent_tallies)
    cols = rows + ["parent unspecified", "parent not in database", "multiple parents"]

    with argsopener(args.outfile, args, "w") as fh:
        printer.write("Writing %s..." % args.outfile)
        fh.write("#feature_type\tcount\t" + "\t".join(cols) + "\n")
        for row in rows:
            fields = [str(row), str(totals_by_type[row])]
            fields.extend(str(parent_tallies[row].get(col, 0)) for col in cols)
            fh.write("%s\n" % "\t".join(fields))

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    For every region in the annotation, applies the overlapping masks and
    writes a row with the region's masked counts, counts per nucleotide,
    RPKM and masked length to the output file.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`. The command-line arguments, if the script is
        invoked from the command line
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser, alignment_file_parser, annotation_file_parser,
            mask_file_parser
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(args,
                                               printer=printer,
                                               return_type=SegmentChain)
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # reads per kilobase per million counts in the dataset
    normconst = 1000.0 * 1e6 / ga_sum

    # Tracked separately from the loop index so the final message reports the
    # true number of regions (the index is zero-based) and is defined even
    # when the annotation yields nothing.
    total_regions = 0

    # the context manager closes the file; no explicit close() is needed
    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable(masks))
            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length
            # fully masked regions have no defined density
            rpnt = numpy.nan if length == 0 else float(counts) / length
            rpkm = numpy.nan if length == 0 else rpnt * normconst
            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length,
            ]
            fout.write("%s\n" % "\t".join(ltmp))
            total_regions = n + 1

    printer.write("Processed %s regions total." % total_regions)
    printer.write("Done.")