def main(argv=None):
    """Script entry point: manipulate GFF/GTF records read from stdin.

    Parses the command line (``sys.argv`` unless *argv* is given), loads the
    optional resources requested on the command line (contig sizes, indexed
    genome, chromosome rename table, assembly report, AGP file), applies the
    transformation selected with ``--method`` to the records read from
    ``args.stdin`` and writes the resulting records to ``args.stdout``.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments. Defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If a required resource for the selected method is missing (contig
        sizes, genome file, assembly report, mapping file), if the rename
        table is malformed or empty, or if ``--filter-range`` is missing or
        cannot be parsed.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m", "--method", dest="method", type=str,
        choices=("add-flank",
                 "add-upstream-flank",
                 "add-downstream-flank",
                 "crop",
                 "crop-unique",
                 "complement-groups",
                 "combine-groups",
                 "filter-range",
                 "join-features",
                 "merge-features",
                 "sanitize",
                 "to-forward-coordinates",
                 "to-forward-strand",
                 "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand", dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c", "--contigs-tsv-file",
                        dest="input_filename_contigs", type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file", dest="input_filename_agp", type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file", dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    # The original help text was a triple-quoted string that contained stray
    # quote characters from a botched implicit concatenation.
    parser.add_argument(
        "--group-field", dest="group_field", type=str,
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .")

    parser.add_argument(
        "--filter-range", dest="filter_range", type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method", dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method", dest="flank_method", type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing", dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern", dest="contig_pattern", type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report", dest="assembly_report", type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    # The original help text duplicated the one of --assembly-report; this
    # one describes what the flag actually controls below.
    parser.add_argument(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs", type=int,
        help="set to 1 if the assembly report provides the ucsc/ensembl id "
        "columns used to build the contig mapping; with any other value "
        "no mapping is read from the report .")

    parser.add_argument(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras", dest="assembly_extras",
        type=str,
        help="additional mismatches between gtf and fasta to fix when "
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream", dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream", type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance", dest="min_distance", type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance", dest="max_distance", type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features", dest="min_features", type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features", dest="max_features", type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file", dest="rename_chr_file", type=str,
        help="mapping table between old and new chromosome names. "
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    args = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None
    assembly_dict = {}

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    # A genome file takes precedence over a contig size table.
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not chr_map:
            raise ValueError("Empty mapping dictionnary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report, comment="#",
                         header=None, sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            is_assembled = df[1] == "assembled-molecule"
            df.loc[is_assembled, ensemblcol] = df.loc[is_assembled, 0]
            # The direction of the mapping follows the sanitize method.
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(
                    ' When using assembly report, please specify sanitize '
                    'method as either "ucsc" or "ensembl" to specify '
                    'direction of conversion ')
        else:
            assembly_dict = {}

        # Extra mappings are given as comma separated "old-new" pairs.
        if args.assembly_extras is not None:
            for extra in args.assembly_extras.split(","):
                pair = extra.split("-")
                assembly_dict[pair[0]] = pair[1]

    flank_methods = ("add-flank",
                     "add-upstream-flank",
                     "add-downstream-flank")

    # These names must match the ``--method`` choices. The original code
    # tested "forward_coordinates"/"forward_strand", which argparse never
    # lets through, so both methods silently did nothing.
    methods_requiring_contigs = (
        "to-forward-coordinates", "to-forward-strand") + flank_methods

    if args.method in methods_requiring_contigs and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in flank_methods:

        add_upstream_flank = args.method in ("add-upstream-flank",
                                             "add-flank")
        add_downstream_flank = args.method in ("add-downstream-flank",
                                               "add-flank")

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            # Strand is taken from the first record as delivered, i.e.
            # before the chunk is sorted by position.
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # Grow the outermost features in place, clipped to the
                # contig boundaries.
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                # Add new flank features next to the outermost features.
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            # Output negative strand groups in descending position order.
            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            # Emit the gaps between consecutive features as "intron"s.
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            # One "segment" from the first start to the last record's end.
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method in ("join-features", "merge-features"):
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=args.method == "merge-features",
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        if args.filter_range is None:
            raise ValueError(
                "please specify --filter-range= when using "
                "--method=filter-range")

        contig, strand, interval = None, None, None
        start, end = None, None

        # Try the most specific notation first:
        # contig:strand:from..to, then contig:from..to, then from,to.
        match = re.match(r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                         args.filter_range)
        if match:
            contig, strand, start, _, end = match.groups()
        else:
            match = re.match(r"(\S+):(\d+)(\.\.|-)(\d+)",
                             args.filter_range)
            if match:
                contig, start, _, end = match.groups()
            else:
                match = re.match(r"(\d+)(\.\.|\,|\-)(\d+)",
                                 args.filter_range)
                if match is None:
                    # Raising a plain string is a TypeError in Python 3.
                    raise ValueError(
                        "can not parse range %s" % args.filter_range)
                # The pattern has three groups (start, separator, end);
                # the original unpacked only two and always failed here.
                start, _, end = match.groups()

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(contig_id):
            """Translate *contig_id* using the assembly report mapping."""
            if contig_id in assembly_dict:
                contig_id = assembly_dict[contig_id]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful for user
            # modified gff files that contain additional contigs
            elif args.sanitize_method == "ucsc":
                if not contig_id.startswith(("contig", "chr")):
                    contig_id = "chr%s" % contig_id
            elif args.sanitize_method == "ensembl":
                if contig_id.startswith("contig"):
                    return contig_id[len("contig"):]
                elif contig_id.startswith("chr"):
                    return contig_id[len("chr"):]
            return contig_id

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        # Compile the removal patterns once instead of once per record.
        if args.contig_pattern:
            to_remove = [re.compile(x)
                         for x in args.contig_pattern.split(",")]
        else:
            to_remove = []

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if any(x.search(gff.contig) for x in to_remove):
                filtered_contigs[gff.contig] += 1
                continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(skipped_contigs),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(outofrange_contigs),
                   str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(filtered_contigs),
                    str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:
        # Remaining methods (and no method at all) work record by record.
        for gff in gffs:
            if args.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            if args.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds a range counter for the input files (BAM, tabix-indexed bed or
    bigWig, chosen by the suffix of the first input file), creates one
    counter per requested ``--method``, fills the counters either from the
    GTF file or from a previously computed counts table, writes one
    ``.matrix.tsv.gz`` and one ``.lengths.tsv.gz`` file per counter and,
    unless plotting is disabled, saves the profile figures.

    Raises
    ------
    ValueError
        If no GTF file or no input file was specified.
    NotImplementedError
        If the input file type cannot be determined from its suffix, or if
        counts are read from a table while more than one counter is active.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=(
                          "geneprofile",
                          "tssprofile",
                          "utrprofile",
                          "intervalprofile",
                          "midpointprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "separateexonprofile",
                          "separateexonprofilewithintrons",
                      ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option'
                      '[%default].')

    parser.add_option("-b", "--bam-file", "--bedfile", "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string", action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c", "--control-bam-file", dest="controlfiles",
                      metavar="BAM",
                      type="string", action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g", "--gtf-file", dest="gtffile", type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max",
                               "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile", dest="profile_normalizations",
                      type="choice", action="append",
                      choices=("all", "none", "area", "counts",
                               "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r", "--reporter", dest="reporter", type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i", "--shift-size", dest="shifts", type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u", "--use-base-accuracy", dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extends", type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream", dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds", dest="resolution_cds", type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon", dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon", dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns", type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya", type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream", dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward", dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward", dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length", dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor", dest="control_factor", type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles", dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file", dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.add_option(
        "--output-res",
        dest="resolution_images", type="int",
        help="the output dpi for the figure plot - will default to "
        "[%default]")

    parser.add_option(
        "--image-format", dest="image_format", type="string",
        help="The output format for the figure plot - defaults to "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance of seeing long enough 3 prime bias and not
        # omitting too many genes. Tim 31th Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns is only for assessing the noise level, thus does not need
        # a long region; a long region has the side effect of omitting more
        # genes. Tim 31th Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
        resolution_images=None,
        image_format="png",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility: two positional arguments are
    # interpreted as <infile> <gtf>.
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = iotools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implemented any methods for which you do not want the
        # spliced out introns or exons to appear to be covered by
        # non-existent reads, it is better to let those methods imply
        # --use-base-accuracy by adding them here.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type. Only the suffix of the first
    # input file is inspected; options.infiles is known to be non-empty here.
    first_infile = options.infiles[0]

    if first_infile.endswith(".bam"):
        bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]

        if options.controlfiles:
            controlfiles = [pysam.AlignmentFile(x, "rb")
                            for x in options.controlfiles]
        else:
            controlfiles = None

        if options.merge_pairs:
            # The keyword was misspelt ``controfiles`` here; every other
            # branch hands the control files over as ``controlfiles``.
            range_counter = geneprofile.RangeCounterBAM(
                bamfiles,
                shifts=options.shifts,
                extends=options.extends,
                merge_pairs=options.merge_pairs,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.shifts or options.extends:
            range_counter = geneprofile.RangeCounterBAM(
                bamfiles,
                shifts=options.shifts,
                extends=options.extends,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.base_accuracy:
            range_counter = geneprofile.RangeCounterBAMBaseAccuracy(
                bamfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)
        else:
            range_counter = geneprofile.RangeCounterBAM(
                bamfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

    elif first_infile.endswith(".bed.gz"):
        bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

        if options.controlfiles:
            controlfiles = [pysam.Tabixfile(x)
                            for x in options.controlfiles]
        else:
            controlfiles = None

        range_counter = geneprofile.RangeCounterBed(
            bedfiles,
            controlfiles=controlfiles,
            control_factor=options.control_factor)

    elif first_infile.endswith(".bw"):
        # bigWig is a binary format, so the files must not be opened in
        # text mode (the original used a bare ``open(x)``).
        wigfiles = [BigWigFile(file=open(x, "rb")) for x in options.infiles]
        range_counter = geneprofile.RangeCounterBigWig(wigfiles)

    else:
        raise NotImplementedError("can't determine file type for %s" %
                                  str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                geneprofile.GeneCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                geneprofile.GeneCounterWithIntrons(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31th Aug 2013: a possible feature for the future, if five
            # prime bias is of interest (you need to create another class).
            # It is not very difficult to derive from this class, but is
            # not implemented yet. This future feature is slightly
            # different from the TSS profile already implemented, because
            # in this future feature introns are skipped.
            counters.append(
                geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream,
                    options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                geneprofile.TSSCounter(
                    range_counter,
                    options.extension_outward,
                    options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                geneprofile.RegionCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                geneprofile.MidpointCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        # add new method to split 1st and last exons out
        # requires a representative transcript for each gene
        # gtf should be sorted gene-position
        elif method == "separateexonprofile":
            counters.append(
                geneprofile.SeparateExonCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_first,
                    options.resolution_last,
                    options.resolution_cds,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                geneprofile.SeparateExonWithIntronCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_first,
                    options.resolution_last,
                    options.resolution_cds,
                    options.resolution_introns,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                iotools.open_file(
                    E.get_output_file(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(
            iotools.open_file(options.input_filename_counts),
            sep='\t', header=0, index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = ["none",
                                          "area",
                                          "counts",
                                          "background"]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output:
        # one row per bin, one column per normalization
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with iotools.open_file(
                E.get_output_file(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" %
                              (row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" %
                              ("\t".join([str(x) for x in cols[-1]])))

        with iotools.open_file(
                E.get_output_file(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile",
                          "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                # First figure: one panel per region on a common y-scale.
                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.get_output_file(figname) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

                # Second figure: all regions concatenated on one axis, with
                # a dashed line at each region boundary.
                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx,
                                color="r",
                                ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.get_output_file(figname) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(list(range(-options.extension_outward,
                                    options.extension_inward)),
                         counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(list(range(-options.extension_inward,
                                    options.extension_outward)),
                         counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(list(range(-options.extension_outward,
                                    options.extension_inward)),
                         counter.aggregate_counts[0])
                plt.plot(list(range(-options.extension_inward,
                                    options.extension_outward)),
                         counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.get_output_file(
                    counter.name) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.get_output_file(
                    counter.name) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

    # write footer and output benchmark information.
    E.stop()