def main(argv=sys.argv[1:]):
    """Command-line program

    Parses command-line arguments, then writes a tab-delimited table to
    `outfile` with one row per region: region name, region, counts,
    counts per nucleotide, RPKM, and masked length.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`, the command-line arguments given when
        the script is invoked from the command line.
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser,
            alignment_file_parser,
            annotation_file_parser,
            mask_file_parser,
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(
        args, printer=printer, return_type=SegmentChain
    )
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # Scale factor from counts-per-nucleotide to reads per kilobase per
    # million counts. With an empty dataset there is nothing to normalize
    # against, so report NaN instead of dividing by zero.
    normconst = numpy.nan if ga_sum == 0 else 1000.0 * 1e6 / ga_sum

    # Tracked separately from the loop index so the final message is correct
    # (and defined) even when no regions are supplied.
    n_regions = 0

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()

            # Exclude any masked positions overlapping this region
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable(masks))

            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length

            # A fully-masked region has no defined density
            if length == 0:
                rpnt = numpy.nan
                rpkm = numpy.nan
            else:
                rpnt = float(counts) / length
                rpkm = rpnt * normconst

            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length,
            ]
            fout.write("%s\n" % "\t".join(ltmp))
            n_regions = n + 1

    printer.write("Processed %s regions total." % n_regions)
    printer.write("Done.")
def main(args=sys.argv[1:]):
    """Command-line program

    Parses command-line arguments, then saves one text file per region of
    interest in `out_folder`. Each file is named
    ``<out_prefix><region name>.txt`` and holds that region's masked count
    vector, written with :func:`numpy.savetxt` using the ``--format`` string.

    Parameters
    ----------
    args : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`, the command-line arguments given when
        the script is invoked from the command line.
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=[
            base_parser,
            alignment_file_parser,
            annotation_file_parser,
            mask_file_parser,
        ],
    )
    parser.add_argument(
        "out_folder", type=str, help="Folder in which to save output vectors"
    )
    parser.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)",
    )
    parser.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')",
    )

    # NOTE: from here on `args` is the parsed namespace, not the raw list
    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # If the output folder doesn't exist, create it. `makedirs` also creates
    # missing parent folders, which `mkdir` would refuse to do.
    if not os.path.isdir(args.out_folder):
        os.makedirs(args.out_folder)

    # parse args
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    # evaluate
    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)

        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # mask out overlapping masked regions
        for feature in mask_hash.get_overlapping_features(tx):
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
def main(args=sys.argv[1:]):
    """Command-line program

    Parses command-line arguments, then, for every region of interest,
    fetches its masked count vector and saves it with :func:`numpy.savetxt`
    to ``<out_folder>/<out_prefix><region name>.txt`` using the ``--format``
    string.

    Parameters
    ----------
    args : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`, the command-line arguments given when
        the script is invoked from the command line.
    """
    al = AlignmentParser()
    an = AnnotationParser()
    mp = MaskParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser(conflict_handler="resolve")
    annotation_file_parser = an.get_parser(conflict_handler="resolve")
    mask_file_parser = mp.get_parser()
    base_parser = bp.get_parser()

    parent_parsers = [
        base_parser,
        alignment_file_parser,
        annotation_file_parser,
        mask_file_parser,
    ]
    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler="resolve",
        parents=parent_parsers,
    )
    parser.add_argument(
        "out_folder", type=str, help="Folder in which to save output vectors"
    )
    parser.add_argument(
        "--out_prefix",
        default="",
        type=str,
        help="Prefix to prepend to output files (default: no prefix)",
    )
    parser.add_argument(
        "--format",
        default="%.8f",
        type=str,
        help=r"printf-style format string for output (default: '%%.8f')",
    )

    # The raw argument list is replaced by the parsed namespace here
    args = parser.parse_args(args)
    bp.get_base_ops_from_args(args)

    # Create the output folder if needed. `os.makedirs` is used instead of
    # `os.mkdir` so that a nested path whose parents are missing also works.
    if not os.path.isdir(args.out_folder):
        os.makedirs(args.out_folder)

    # Load count data, regions of interest, and masks
    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = an.get_segmentchains_from_args(args, printer=printer)
    mask_hash = mp.get_genome_hash_from_args(args, printer=printer)

    for n, tx in enumerate(transcripts):
        if n % 1000 == 0:
            printer.write("Processed %s regions of interest" % n)

        filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
        full_filename = os.path.join(args.out_folder, filename)

        # Apply every mask that overlaps this region before counting
        for feature in mask_hash.get_overlapping_features(tx):
            tx.add_masks(*feature.segments)

        count_vec = tx.get_masked_counts(ga)
        numpy.savetxt(full_filename, count_vec, fmt=args.format)
def main(argv=sys.argv[1:]):
    """Command-line program

    Builds a parser with three subcommands -- ``generate``, ``count`` and
    ``chart`` -- and dispatches to :func:`do_generate`, :func:`do_count` or
    :func:`do_chart` respectively.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`, the command-line arguments given when
        the script is invoked from the command line.
    """
    al = AlignmentParser(disabled=["normalize"])
    an = AnnotationParser()
    mp = MaskParser()
    pl = PlottingParser()
    bp = BaseParser()

    alignment_file_parser = al.get_parser()
    annotation_file_parser = an.get_parser()
    mask_file_parser = mp.get_parser()
    plotting_parser = pl.get_parser()
    base_parser = bp.get_parser()

    generator_help = "Create unambiguous position file from GFF3 annotation"
    generator_desc = format_module_docstring(do_generate.__doc__)

    counter_help = "Count reads in unambiguous gene positions"
    counter_desc = format_module_docstring(do_count.__doc__)

    chart_help = "Produce charts comparing reads between samples"
    chart_desc = format_module_docstring(do_chart.__doc__)

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(
        title="subcommands",
        description="choose one of the following",
        dest="program",
    )
    gparser = subparsers.add_parser(
        "generate",
        help=generator_help,
        description=generator_desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[base_parser, annotation_file_parser, mask_file_parser],
    )
    cparser = subparsers.add_parser(
        "count",
        help=counter_help,
        description=counter_desc,
        parents=[base_parser, alignment_file_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    pparser = subparsers.add_parser(
        "chart",
        help=chart_help,
        description=chart_desc,
        parents=[base_parser, plotting_parser],
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # 'generate' arguments
    gparser.add_argument(
        "outbase", metavar="outbase", type=str, help="Basename for output files"
    )

    # 'count' arguments
    cparser.add_argument(
        "position_file",
        type=str,
        metavar="file.positions",
        help="File assigning positions to genes or transcripts (made using 'generate' subcommand)",
    )
    cparser.add_argument("outbase", type=str, help="Basename for output files")

    # 'chart' arguments
    pparser.add_argument(
        "-i",
        "--in",
        nargs="+",
        type=str,
        dest="infiles",
        help="input files, made by 'count' subprogram",
    )
    pparser.add_argument(
        "--bins",
        nargs="+",
        type=int,
        default=(0, 32, 64, 128, 256, 512, 1024, 2048, 4096),
        help="Bins into which features are partitioned based on counts",
    )
    pparser.add_argument(
        "--regions",
        nargs="+",
        type=str,
        default=("exon", "utr5", "cds", "utr3"),
        help="Regions to compare (default: exon, utr5, cds, utr3)",
    )
    pparser.add_argument(
        "--metrics",
        nargs="+",
        type=str,
        default=("rpkm", "reads"),
        help="Metrics to compare (default: rpkm, reads)",
    )
    pparser.add_argument(
        "list_of_regions",
        type=str,
        metavar='gene_list.txt',
        nargs="?",
        default=None,
        help="Optional. File listing regions (genes or transcripts), one per line, to include in comparisons. If not given, all genes are included.",
    )
    pparser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)

    # On Python 3, argparse treats sub-commands as optional, so a bare
    # invocation reaches this point with `program` unset and no dispatch
    # branch below would match. Fail with a usage message instead.
    if getattr(args, "program", None) is None:
        parser.error("no subcommand given; choose one of: generate, count, chart")

    bp.get_base_ops_from_args(args)

    if args.program == "generate":
        # generate position file
        do_generate(args, an, mp)
    elif args.program == "count":
        # use position file to count gene expression in infiles
        do_count(args, al)
    elif args.program == "chart":
        # use count files to generate a family of charts and tables
        do_chart(args, pl)
def main(argv=sys.argv[1:]):
    """Command-line program

    Reads two-segment junctions from a BED file and sorts each one into
    exactly one of four output BED files (``<outbase>_repetitive.bed``,
    ``<outbase>_shifted_known.bed``, ``<outbase>_shifted_canonical.bed``,
    ``<outbase>_untouched.bed``), testing the categories in that order.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: `sys.argv[1:]`, the command-line arguments given when
        the script is invoked from the command line.
    """
    sp = SequenceParser()
    mp = MaskParser()
    bp = BaseParser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[bp.get_parser(), sp.get_parser(), mp.get_parser()],
    )
    parser.add_argument(
        "--maxslide",
        type=int,
        default=10,
        help="Maximum number of nt to search 5' and 3' of intron"
             " boundaries (Default: 10)",
    )
    parser.add_argument(
        "--ref",
        type=str,
        metavar="ref.bed",
        default=None,
        help="Reference file describing known splice junctions",
    )
    parser.add_argument(
        "--slide_canonical",
        action="store_true",
        default=False,
        help="Slide junctions to canonical junctions if present within equal support region",
    )
    parser.add_argument(
        "infile",
        type=str,
        metavar="input.bed",
        help="BED file describing discovered junctions",
    )
    parser.add_argument("outbase", type=str, help="Basename for output files")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    printer.write("Opening genome from %s..." % args.sequence_file)
    genome = sp.get_seqdict_from_args(args)

    # load crossmap
    cross_hash = mp.get_genome_hash_from_args(args)

    # load ref junctions
    if args.ref is not None:
        printer.write("Loading reference junctions from %s" % args.ref)
        # Read everything into a list inside the `with` so the file handle
        # is released as soon as the junctions are loaded.
        with open(args.ref) as ref_fh:
            known_hash = GenomeHash(list(BED_Reader(ref_fh)), do_copy=False)
    else:
        known_hash = GenomeHash()

    # set up variables
    # (donor, acceptor) dinucleotide pairs; the minus-strand list holds the
    # same pairs as they read on the opposite strand.
    canonicals_plus = [("GT", "AG"), ("GC", "AG")]
    canonicals_minus = [("CT", "AC"), ("CT", "GC")]

    known_in_range = 0
    canonical_in_range = 0
    repetitive = 0
    untouched = 0
    c = 0

    # Junction tuples already written to the known/canonical outputs, so
    # the same shifted junction is not emitted twice.
    # NOTE(review): a set would make the membership test O(1), provided
    # get_junction_tuple() returns hashable values -- confirm before changing.
    seen_already = []

    outfile_names = {
        "repetitive": "%s_repetitive.bed" % args.outbase,
        "known": "%s_shifted_known.bed" % args.outbase,
        "canonical": "%s_shifted_canonical.bed" % args.outbase,
        "untouched": "%s_untouched.bed" % args.outbase,
    }
    outfiles = {K: argsopener(V, args, "w") for K, V in outfile_names.items()}

    try:
        # process data
        printer.write("Opening junctions from %s..." % args.infile)
        # NOTE(review): the handle returned by opener() is never explicitly
        # closed; left as-is because its type is not visible here.
        for ivc in BED_Reader(CommentReader(opener(args.infile))):
            processed = False

            if c % 1000 == 0 and c > 0:
                printer.write(
                    "Processed: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s"
                    % (c, known_in_range, canonical_in_range, repetitive, untouched)
                )

            assert len(ivc) == 2
            strand = ivc.strand

            minus_range, plus_range = find_match_range(ivc, genome, args.maxslide)

            # see if either end of splice junction +- match_range lands in
            # repetitive areas of genome
            if covered_by_repetitive(ivc, minus_range, plus_range, cross_hash):
                repetitive += 1
                outfiles["repetitive"].write(ivc.as_bed())
                processed = True

            # see if one or more known junctions in range
            if not processed and args.ref is not None:
                known_juncs = find_known_in_range(
                    ivc, minus_range, plus_range, known_hash.get_nearby_features(ivc)
                )
                if len(known_juncs) > 0:
                    known_in_range += 1
                    for my_known in known_juncs:
                        tup = get_junction_tuple(my_known)
                        if tup not in seen_already:
                            outfiles["known"].write(my_known.as_bed())
                            seen_already.append(tup)

                    processed = True

            # see if one or more canonical junctions in range
            if not processed and args.slide_canonical:
                canonicals = canonicals_plus if strand == "+" else canonicals_minus
                canonical_juncs = find_canonicals_in_range(
                    ivc, minus_range, plus_range, genome, canonicals
                )
                if len(canonical_juncs) > 0:
                    canonical_in_range += 1
                    for can in canonical_juncs:
                        tup = get_junction_tuple(can)
                        if tup not in seen_already:
                            outfiles["canonical"].write(can.as_bed())
                            seen_already.append(tup)

                    processed = True

            # nothing matched: pass the junction through unchanged
            if not processed:
                outfiles["untouched"].write(ivc.as_bed())
                untouched += 1

            c += 1

        # save output
        printer.write(
            "Totals: %s\tknown: %s\tshifted to canonical: %s\trepetitive: %s\tuntouched: %s"
            % (c, known_in_range, canonical_in_range, repetitive, untouched)
        )
    finally:
        # Close the outputs even if processing fails part-way through
        for fh in outfiles.values():
            fh.close()

    printer.write("Done.")
def main(argv=sys.argv[1:]):
    """Command-line program

    Parses command-line arguments, then writes to `outfile` a header
    followed by one tab-delimited row per region giving its name, the
    region itself, total counts, counts per nucleotide, RPKM, and masked
    length.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :func:`main` is called directly.

        Default: `sys.argv[1:]`, the command-line arguments given when
        the script is invoked from the command line.
    """
    ap = AnnotationParser()
    annotation_file_parser = ap.get_parser(conflict_handler="resolve")

    al = AlignmentParser(disabled=_DISABLED)
    alignment_file_parser = al.get_parser(conflict_handler="resolve")

    mp = MaskParser()
    mask_file_parser = mp.get_parser()

    bp = BaseParser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(
        description=format_module_docstring(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[
            base_parser,
            alignment_file_parser,
            annotation_file_parser,
            mask_file_parser,
        ],
    )
    parser.add_argument("outfile", type=str, help="Output filename")

    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    ga = al.get_genome_array_from_args(args, printer=printer)
    transcripts = ap.get_transcripts_from_args(
        args, printer=printer, return_type=SegmentChain
    )
    crossmap = mp.get_genome_hash_from_args(args, printer=printer)

    ga_sum = ga.sum()
    # Converts counts-per-nucleotide into reads per kilobase per million
    # counts. If the dataset holds no counts, use NaN so an empty dataset
    # does not trigger a division by zero.
    if ga_sum == 0:
        normconst = numpy.nan
    else:
        normconst = 1000.0 * 1e6 / ga_sum

    # Number of regions written so far. Kept apart from the loop index so
    # the closing message reports a true count and works for empty input.
    total_regions = 0

    with argsopener(args.outfile, args, "w") as fout:
        fout.write("## total_dataset_counts: %s\n" % ga_sum)
        fout.write(
            "region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n"
        )
        for n, ivc in enumerate(transcripts):
            name = ivc.get_name()

            # Mask out any overlapping masked positions before counting
            masks = crossmap.get_overlapping_features(ivc)
            ivc.add_masks(*itertools.chain.from_iterable(masks))

            if n % 1000 == 0:
                printer.write("Processed %s regions..." % n)

            counts = numpy.nansum(ivc.get_masked_counts(ga))
            length = ivc.masked_length

            # Density is undefined when every position is masked
            rpnt = numpy.nan if length == 0 else float(counts) / length
            rpkm = numpy.nan if length == 0 else rpnt * normconst

            ltmp = [
                name,
                str(ivc),
                "%.8e" % counts,
                "%.8e" % rpnt,
                "%.8e" % rpkm,
                "%d" % length,
            ]
            fout.write("%s\n" % "\t".join(ltmp))
            total_regions += 1

    printer.write("Processed %s regions total." % total_regions)
    printer.write("Done.")