def main(argv=None):
    """script main.

    Reads GTF records from stdin, groups them by the requested feature
    (gene, transcript or exon) and writes one tab-separated row per feature
    to stdout with the summed tag counts over its exons and over the
    complement of its exons (reported as introns).

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if only one of ``--plus-bw`` / ``--minus-bw`` is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true", default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            # Wrap every record in a list so that all three feature modes
            # yield "a list of GTF entries" to the loop below.
            for exon in gff_iterator:
                yield [exon]

        iterator = _exon_iterator(iterator)

    # Choose the signal source.  Precedence: unstranded bigwig, stranded
    # bigwig pair, bed file, and finally a BAM given as first positional arg.
    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig or options.minus_wig:
        # A lone --minus-bw used to be ignored silently; treat an
        # incomplete pair as an error whichever strand is missing.
        if not (options.plus_wig and options.minus_wig):
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    exon_mode = options.feature == "exon"

    # Rows are streamed as they are computed rather than buffered, so memory
    # stays flat and no stray blank line is emitted for empty input.
    options.stdout.write("\t".join(["gene_id",
                                    "transcript_id",
                                    "exon_id",
                                    "exon_count",
                                    "intron_count"]) + "\n")

    for feature in iterator:
        first = feature[0]

        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            first.contig,
                                            first.strand,
                                            dtype="uint32",
                                            use_centre=options.centre)
        exon_counts = exon_counts.sum()

        if exon_mode:
            # Intron counts are reported as "NA" for single exons, so the
            # counting pass is skipped entirely.
            try:
                exon_id = first.exon_id
            except AttributeError:
                try:
                    exon_id = first.exon_number
                except AttributeError:
                    exon_id = "missing"
            intron_counts = "NA"
        else:
            exon_id = "NA"
            introns = Intervals.complement(exons)
            intron_counts = iCLIP.count_intervals(bamfile,
                                                  introns,
                                                  first.contig,
                                                  first.strand,
                                                  dtype="uint32",
                                                  use_centre=options.centre)
            intron_counts = float(intron_counts.sum())

        # str() guards against attribute values that are not strings
        # (e.g. a numeric exon_number), which would break "\t".join.
        row = [str(first.gene_id),
               str(first.transcript_id),
               str(exon_id),
               str(float(exon_counts)),
               str(intron_counts)]
        options.stdout.write("\t".join(row) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Builds a meta-gene profile: for every transcript read (as GTF) from
    stdin, a binned profile over the requested regions is computed with
    ``transcript_region_meta`` and summed into an overall profile that is
    written to stdout.  With ``--output-matrix`` the per-transcript
    profiles are additionally written to the given file.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if ``--bins`` does not list one value per region.
    """

    if argv is None:
        argv = sys.argv

    # optparse only accepts a list/tuple for ``choices``; a dict view
    # (Python 3 ``.keys()``) is rejected when the option is defined.
    profile_choices = list(iCLIP.getters.profiles.keys())

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--output-matrix", dest="matrix", type="string",
                      default=None,
                      help="output full matrix to this file")
    parser.add_option("-f", "--flanks-length", dest="flanks", type="int",
                      default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("-u", "--upstreamflanks-length", dest="tssupflanks",
                      type="int", default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("--pseudo_count", dest="pseudo_count", type="float",
                      default=0,
                      help="add pseduo count to bins to mitiage effects of "
                           "low numbers of reads")
    parser.add_option("--normalised_profile", dest="normalize_profile",
                      action="store_true", default=False,
                      help="Normlize profile by profile sum")
    parser.add_option("--plus-wig", dest="plus_wig", default=None,
                      help="Use this wig file instead of a BAM file to get "
                           "clip density "
                           "may be used as only wig file, or may be provided "
                           "together with "
                           "--minus-wig for standed computation")
    # "--minus-wig" is what the help text above advertises; the original
    # underscore spelling is kept so existing command lines still work.
    parser.add_option("--minus_wig", "--minus-wig", dest="minus_wig",
                      default=None,
                      help="Use this to provide stranded wig data")
    parser.add_option("--bed", dest="bedfile", default=None,
                      help="Use bed file with signal instead of bam")
    parser.add_option("--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than end")
    parser.add_option("--no-gene-norm", dest="row_norm",
                      action="store_false", default=True,
                      help="Do not normalise profile from each gene")
    parser.add_option("--region-length-correction", dest="rlc",
                      action="store_true", default=False,
                      help="Correct for regions of different legnths. "
                           "Calculates something "
                           "akin to an FPKM for the region")
    # NOTE(review): the default names "flank5"/"flank3" while the code below
    # looks up "5flank"/"3flank" in regions_dict -- confirm that both
    # spellings exist in regions_dict and default_bins.
    parser.add_option("-r", "--regions", dest="regions", type="string",
                      default="flank5,exons,flank3",
                      help="Which regions to use. Choose from %s" %
                      ", ".join(regions_dict.keys()))
    parser.add_option("-b", "--bins", dest="bins", action="store",
                      default=None,
                      help="Bins to use. If not specified defaults for the "
                           "chosen regions will be used")
    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profile_choices,
                      help="Read profile for the experiment. Choose from %s"
                      % ", ".join(profile_choices))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Signal source: wig(s) take precedence over bed, bed over BAM.
    if options.plus_wig:
        bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                minus_wig=options.minus_wig)
    elif options.bedfile:
        bam = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bam = iCLIP.make_getter(bamfile=args[0], profile=options.profile,
                                centre=options.centre)

    # Bind the user-supplied flank sizes into the region functions.
    regions_dict['5flank'] = partial(regions_dict['5flank'],
                                     length=options.flanks)
    regions_dict['3flank'] = partial(regions_dict['3flank'],
                                     length=options.flanks)
    regions_dict['tss'] = partial(regions_dict['tss'],
                                  upstream=options.tssupflanks,
                                  downstream=options.flanks)
    regions_dict['tts'] = partial(regions_dict['tts'],
                                  upstream=options.flanks,
                                  downstream=options.flanks)

    names = options.regions.split(",")
    regions = [regions_dict[r] for r in names]

    if options.bins:
        bins = [int(b) for b in options.bins.split(",")]
        if len(bins) != len(regions):
            raise ValueError("Bins and regions not same length")
    else:
        bins = [default_bins[r] for r in names]

    # One (region, bin) entry for every bin of every requested region.
    index = [(n, i) for n, b in zip(names, bins) for i in range(b)]
    index = pandas.MultiIndex.from_tuples(index,
                                          names=["region", "region_bin"])

    # Explicit dtype: an all-NaN Series without one is not guaranteed to be
    # float across pandas versions.
    profile = pandas.Series(index=index, dtype="float64")
    accumulator = list()

    transcript_interator = GTF.transcript_iterator(
        GTF.iterator(options.stdin))

    for transcript in transcript_interator:
        this_profile = transcript_region_meta(transcript, bam, regions,
                                              names, bins,
                                              length_norm=options.rlc)

        if options.pseudo_count:
            # The pseudo count is added to *this* transcript's profile
            # (previously the running total was used, which discarded the
            # transcript's own counts).
            this_profile = this_profile.reindex(index, fill_value=0) + \
                options.pseudo_count

        if options.row_norm:
            this_profile = this_profile / this_profile.sum()

        profile = profile.add(this_profile, fill_value=0)

        if options.matrix:
            # Store the per-transcript profile, not the cumulative sum.
            this_profile.name = transcript[0].transcript_id
            accumulator.append(this_profile)

    if options.normalize_profile:
        profile = profile / profile.sum()

    profile = profile.reindex(names, level="region")
    profile.name = "density"
    profile = profile.reset_index()
    profile.index.name = "bin"
    profile.to_csv(options.stdout, sep="\t", index_label="bin")

    if options.matrix:
        counts_matrix = pandas.concat(accumulator, axis=1)
        # NOTE(review): the transpose / reset_index(drop=True) / transpose
        # round trip replaces the transcript ids with 0..n-1 column labels
        # although index_label says "transcript_id" -- kept as is, confirm
        # this is the intended layout.
        counts_matrix = counts_matrix.transpose()
        counts_matrix = counts_matrix.reset_index(drop=True)
        counts_matrix = counts_matrix.transpose()

        # Close explicitly so buffered (possibly compressed) output is
        # flushed to disk.
        matrix_file = IOTools.openFile(options.matrix, "w")
        try:
            counts_matrix.to_csv(matrix_file,
                                 sep="\t",
                                 index=True,
                                 index_label="transcript_id")
        finally:
            matrix_file.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    For every gene read (as GTF) from stdin, per-base p-values are computed
    with ``calculateProbabilities`` over the exons (optionally split into
    5' UTR / CDS / 3' UTR) and over the introns of each transcript.  The
    p-values of a gene's transcripts are averaged per position and handed
    to the output writer.  The BAM file is the first positional argument.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    grouping_choices = ["exons", "utrs", "all"]

    parser.add_option("-g", "--grouping", dest="grouping", type="choice",
                      choices=grouping_choices,
                      help="How to group transcript regions choices are [%s]"
                      % ",".join(grouping_choices))
    parser.add_option("-p", "--pipeout", dest="pipeout",
                      action="store_true",
                      help="Output continuously to the pipe rather than in a "
                           "chunk at the end")
    parser.add_option("-d", "--dtype", dest="dtype", type="string",
                      default="int32",
                      help="Numpy dtype for storing counts")
    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      default=15,
                      help="Size of window either size of crosslinked base to "
                           "consider")
    parser.add_option("-f", "--fdr", dest="fdr", action="store_true",
                      default=False,
                      help="perform BH fdr correction on p-values, implies "
                           "not "
                           "--pipeout")
    parser.add_option("-o", "--output-windows", dest="output_windows",
                      action="store_true", default=False,
                      help="Output consolidated windows isntead of bases")
    parser.add_option("-b", "--output-both", type="string",
                      dest="output_both", default=None,
                      help="Output both bases bedGraph (stdout) and windows "
                           "bed12 (specified file).")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="p-value threshold under which to merge windows")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than -1 base when no "
                           "mutaiton is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Standard in contains the transcripts
    gffs = GTF.gene_iterator(GTF.iterator(options.stdin))

    # bam file is the first positional argument
    bamfile = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    # Decide where per-base and per-window results go.
    if options.output_both:
        outfile_bases = options.stdout
        outfile_windows = IOTools.openFile(options.output_both, "w")
    elif options.output_windows:
        outfile_bases = None
        outfile_windows = options.stdout
    else:
        outfile_bases = options.stdout
        outfile_windows = None

    if options.fdr and options.pipeout:
        E.warning("--fdr implies not --pipeout, instant output disabled")
        options.pipeout = False

    if options.pipeout:
        output = InstantOutput(outfile_bases=outfile_bases,
                               outfile_windows=outfile_windows,
                               window_size=options.window_size,
                               threshold=options.threshold)
    else:
        output = DeferredOutput(outfile_bases=outfile_bases,
                                outfile_windows=outfile_windows,
                                correct=options.fdr,
                                window_size=options.window_size,
                                threshold=options.threshold)

    E.info("Counting accross transcripts ...")

    for gene in gffs:

        if options.grouping == "all":
            # NOTE(review): ``gene`` is indexed (gene[0][0]) after this loop;
            # confirm merged_gene_iterator returns an indexable sequence
            # rather than a one-shot generator.
            gene = GTF.merged_gene_iterator(gene)

        transcript_ps = {}

        for transcript in gene:

            # E.debug("Transcript is %s" % transcript[0].transcript_id)
            coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)

            exons = GTF.asRanges(transcript, "exon")
            counts = iCLIP.count_intervals(bamfile,
                                           exons,
                                           strand=transcript[0].strand,
                                           contig=transcript[0].contig,
                                           dtype=options.dtype)

            # Work in transcript coordinates while computing p-values.
            counts.index = coords_converter.genome2transcript(
                counts.index.values)
            counts = counts.sort_index()

            cds = GTF.asRanges(transcript, "CDS")

            if options.grouping == "utrs" and len(cds) > 0:
                cds_interval = (cds[0][0], cds[-1][1])
                cds_interval = coords_converter.genome2transcript(
                    cds_interval)
                cds_interval.sort()
                cds_length = cds_interval[1] - cds_interval[0]

                # (start, length) pairs: 5' side, CDS, 3' side.
                p_intervals = [
                    (0, cds_interval[0]),
                    (cds_interval[0], cds_length),
                    (cds_interval[1],
                     coords_converter.length - cds_interval[1])]
            else:
                # do not group by cds or there is no cds
                p_intervals = [(0, coords_converter.length)]

            p_values = [calculateProbabilities(counts, options.window_size,
                                               length=length, start=start)
                        for start, length in p_intervals
                        if length > 0]

            if len(p_values) > 1:
                p_values = pd.concat(p_values)
            else:
                p_values = p_values[0]

            p_values.index = coords_converter.transcript2genome(
                p_values.index.values)

            intron_intervals = GTF.toIntronIntervals(transcript)

            if len(intron_intervals) > 0:
                intron_coords = iCLIP.TranscriptCoordInterconverter(
                    transcript, introns=True)
                intron_counts = iCLIP.count_intervals(
                    bamfile,
                    intron_intervals,
                    strand=transcript[0].strand,
                    contig=transcript[0].contig,
                    dtype=options.dtype)

                intron_counts.index = intron_coords.genome2transcript(
                    intron_counts.index.values)
                intron_counts = intron_counts.sort_index()

                intron_pvalues = calculateProbabilities(
                    intron_counts,
                    options.window_size,
                    intron_coords.length)
                intron_pvalues.index = intron_coords.transcript2genome(
                    intron_pvalues.index.values)

                # Series.append was removed in pandas 2.0; pd.concat is the
                # equivalent and works on old and new versions alike.
                p_values = pd.concat([p_values, intron_pvalues])

            transcript_ps[transcript[0].transcript_id] = p_values

        # One column per transcript, rows indexed by genomic position.
        transcript_df = pd.DataFrame(transcript_ps)
        transcript_df.index.rename("position", inplace=True)
        transcript_df["contig"] = gene[0][0].contig
        transcript_df["strand"] = gene[0][0].strand
        transcript_df["gene_id"] = gene[0][0].gene_id

        transcript_df.set_index("contig", append=True, inplace=True)
        transcript_df.set_index("strand", append=True, inplace=True)
        transcript_df.set_index("gene_id", append=True, inplace=True)

        # Average across the gene's transcripts at each position.
        gene_ps = transcript_df.mean(axis=1)
        gene_ps = gene_ps.reorder_levels(
            ["gene_id", "contig", "strand", "position"])

        output.write(gene_ps, gene)

    output.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Reads genes (GTF) from stdin and, for each transcript, computes
    per-base p-values with ``calculateProbabilities`` over its exons
    (optionally split around the CDS) and its introns.  P-values are
    averaged over the transcripts of a gene and passed to an
    ``InstantOutput`` or ``DeferredOutput`` writer.  The BAM file is taken
    from the first positional argument.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    grouping_choices = ["exons", "utrs", "all"]

    parser.add_option("-g", "--grouping", dest="grouping", type="choice",
                      choices=grouping_choices,
                      help="How to group transcript regions choices are [%s]"
                      % ",".join(grouping_choices))
    parser.add_option("-p", "--pipeout", dest="pipeout",
                      action="store_true",
                      help="Output continuously to the pipe rather than in a "
                           "chunk at the end")
    parser.add_option("-d", "--dtype", dest="dtype", type="string",
                      default="int32",
                      help="Numpy dtype for storing counts")
    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      default=15,
                      help="Size of window either size of crosslinked base to "
                           "consider")
    parser.add_option("-f", "--fdr", dest="fdr", action="store_true",
                      default=False,
                      help="perform BH fdr correction on p-values, implies "
                           "not "
                           "--pipeout")
    parser.add_option("-o", "--output-windows", dest="output_windows",
                      action="store_true", default=False,
                      help="Output consolidated windows isntead of bases")
    parser.add_option("-b", "--output-both", type="string",
                      dest="output_both", default=None,
                      help="Output both bases bedGraph (stdout) and windows "
                           "bed12 (specified file).")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="p-value threshold under which to merge windows")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than -1 base when no "
                           "mutaiton is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Standard in contains the transcripts
    gffs = GTF.gene_iterator(GTF.iterator(options.stdin))

    # bam file is the first positional argument
    bamfile = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    # Route bases and windows: both (bases to stdout, windows to a file),
    # windows only, or bases only.
    if options.output_both:
        outfile_bases = options.stdout
        outfile_windows = IOTools.openFile(options.output_both, "w")
    elif options.output_windows:
        outfile_bases = None
        outfile_windows = options.stdout
    else:
        outfile_bases = options.stdout
        outfile_windows = None

    if options.fdr and options.pipeout:
        E.warning("--fdr implies not --pipeout, instant output disabled")
        options.pipeout = False

    if options.pipeout:
        output = InstantOutput(outfile_bases=outfile_bases,
                               outfile_windows=outfile_windows,
                               window_size=options.window_size,
                               threshold=options.threshold)
    else:
        output = DeferredOutput(outfile_bases=outfile_bases,
                                outfile_windows=outfile_windows,
                                correct=options.fdr,
                                window_size=options.window_size,
                                threshold=options.threshold)

    E.info("Counting accross transcripts ...")

    for gene in gffs:

        if options.grouping == "all":
            # NOTE(review): gene[0][0] is read below; verify that
            # merged_gene_iterator yields something indexable.
            gene = GTF.merged_gene_iterator(gene)

        transcript_ps = {}

        for transcript in gene:
            head = transcript[0]

            # E.debug("Transcript is %s" % transcript[0].transcript_id)
            coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)

            exons = GTF.asRanges(transcript, "exon")
            counts = iCLIP.count_intervals(bamfile,
                                           exons,
                                           strand=head.strand,
                                           contig=head.contig,
                                           dtype=options.dtype)

            # p-values are computed in transcript coordinates.
            counts.index = coords_converter.genome2transcript(
                counts.index.values)
            counts = counts.sort_index()

            cds = GTF.asRanges(transcript, "CDS")

            if options.grouping == "utrs" and len(cds) > 0:
                cds_interval = (cds[0][0], cds[-1][1])
                cds_interval = coords_converter.genome2transcript(
                    cds_interval)
                cds_interval.sort()
                cds_length = cds_interval[1] - cds_interval[0]

                # Each entry is (start, length) in transcript space.
                p_intervals = [
                    (0, cds_interval[0]),
                    (cds_interval[0], cds_length),
                    (cds_interval[1],
                     coords_converter.length - cds_interval[1])]
            else:
                # do not group by cds or there is no cds
                p_intervals = [(0, coords_converter.length)]

            p_values = [
                calculateProbabilities(counts,
                                       options.window_size,
                                       length=length,
                                       start=start)
                for start, length in p_intervals if length > 0
            ]

            if len(p_values) > 1:
                p_values = pd.concat(p_values)
            else:
                p_values = p_values[0]

            # Back to genomic coordinates before merging with introns.
            p_values.index = coords_converter.transcript2genome(
                p_values.index.values)

            intron_intervals = GTF.toIntronIntervals(transcript)

            if len(intron_intervals) > 0:
                intron_coords = iCLIP.TranscriptCoordInterconverter(
                    transcript, introns=True)
                intron_counts = iCLIP.count_intervals(
                    bamfile,
                    intron_intervals,
                    strand=head.strand,
                    contig=head.contig,
                    dtype=options.dtype)

                intron_counts.index = intron_coords.genome2transcript(
                    intron_counts.index.values)
                intron_counts = intron_counts.sort_index()

                intron_pvalues = calculateProbabilities(
                    intron_counts,
                    options.window_size,
                    intron_coords.length)
                intron_pvalues.index = intron_coords.transcript2genome(
                    intron_pvalues.index.values)

                # pd.concat replaces Series.append, which no longer exists
                # in pandas >= 2.0; the result is the same.
                p_values = pd.concat([p_values, intron_pvalues])

            transcript_ps[head.transcript_id] = p_values

        # Columns are transcripts, rows are genomic positions.
        transcript_df = pd.DataFrame(transcript_ps)
        transcript_df.index.rename("position", inplace=True)
        transcript_df["contig"] = gene[0][0].contig
        transcript_df["strand"] = gene[0][0].strand
        transcript_df["gene_id"] = gene[0][0].gene_id

        transcript_df.set_index("contig", append=True, inplace=True)
        transcript_df.set_index("strand", append=True, inplace=True)
        transcript_df.set_index("gene_id", append=True, inplace=True)

        # Per-position mean over all transcripts of the gene.
        gene_ps = transcript_df.mean(axis=1)
        gene_ps = gene_ps.reorder_levels(
            ["gene_id", "contig", "strand", "position"])

        output.write(gene_ps, gene)

    output.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Computes a binned meta-gene profile over the requested regions for
    each transcript read (as GTF) from stdin using
    ``transcript_region_meta``, sums the profiles and writes the result to
    stdout.  With ``--output-matrix`` the individual transcript profiles
    are also written to the named file.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if ``--bins`` does not list one value per region.
    """

    if argv is None:
        argv = sys.argv

    # optparse requires ``choices`` to be a list or tuple; on Python 3
    # ``dict.keys()`` is a view and is rejected.
    profile_choices = list(iCLIP.getters.profiles.keys())

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--output-matrix", dest="matrix", type="string",
                      default=None,
                      help="output full matrix to this file")
    parser.add_option("-f", "--flanks-length", dest="flanks", type="int",
                      default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("-u", "--upstreamflanks-length", dest="tssupflanks",
                      type="int", default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("--pseudo_count", dest="pseudo_count", type="float",
                      default=0,
                      help="add pseduo count to bins to mitiage effects of "
                           "low numbers of reads")
    parser.add_option("--normalised_profile", dest="normalize_profile",
                      action="store_true", default=False,
                      help="Normlize profile by profile sum")
    parser.add_option("--plus-wig", dest="plus_wig", default=None,
                      help="Use this wig file instead of a BAM file to get "
                           "clip density "
                           "may be used as only wig file, or may be provided "
                           "together with "
                           "--minus-wig for standed computation")
    # The help above refers to "--minus-wig"; accept it as an alias while
    # keeping the original "--minus_wig" spelling working.
    parser.add_option("--minus_wig", "--minus-wig", dest="minus_wig",
                      default=None,
                      help="Use this to provide stranded wig data")
    parser.add_option("--bed", dest="bedfile", default=None,
                      help="Use bed file with signal instead of bam")
    parser.add_option("--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than end")
    parser.add_option("--no-gene-norm", dest="row_norm",
                      action="store_false", default=True,
                      help="Do not normalise profile from each gene")
    parser.add_option("--region-length-correction", dest="rlc",
                      action="store_true", default=False,
                      help="Correct for regions of different legnths. "
                           "Calculates something "
                           "akin to an FPKM for the region")
    # NOTE(review): default uses "flank5"/"flank3" but "5flank"/"3flank"
    # are the keys rebound below -- check regions_dict/default_bins define
    # both spellings.
    parser.add_option("-r", "--regions", dest="regions", type="string",
                      default="flank5,exons,flank3",
                      help="Which regions to use. Choose from %s" %
                      ", ".join(regions_dict.keys()))
    parser.add_option("-b", "--bins", dest="bins", action="store",
                      default=None,
                      help="Bins to use. If not specified defaults for the "
                           "chosen regions will be used")
    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profile_choices,
                      help="Read profile for the experiment. Choose from %s"
                      % ", ".join(profile_choices))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Pick the signal source; wig beats bed, bed beats BAM.
    if options.plus_wig:
        bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                minus_wig=options.minus_wig)
    elif options.bedfile:
        bam = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bam = iCLIP.make_getter(bamfile=args[0], profile=options.profile,
                                centre=options.centre)

    # Fix the flank lengths chosen on the command line into the region
    # functions before they are looked up.
    regions_dict['5flank'] = partial(regions_dict['5flank'],
                                     length=options.flanks)
    regions_dict['3flank'] = partial(regions_dict['3flank'],
                                     length=options.flanks)
    regions_dict['tss'] = partial(regions_dict['tss'],
                                  upstream=options.tssupflanks,
                                  downstream=options.flanks)
    regions_dict['tts'] = partial(regions_dict['tts'],
                                  upstream=options.flanks,
                                  downstream=options.flanks)

    names = options.regions.split(",")
    regions = [regions_dict[r] for r in names]

    if options.bins:
        bins = [int(b) for b in options.bins.split(",")]
        if len(bins) != len(regions):
            raise ValueError("Bins and regions not same length")
    else:
        bins = [default_bins[r] for r in names]

    # Index of (region name, bin number) covering every requested bin.
    index = [(name, bin_no)
             for name, nbins in zip(names, bins)
             for bin_no in range(nbins)]
    index = pandas.MultiIndex.from_tuples(index,
                                          names=["region", "region_bin"])

    # dtype given explicitly so the all-NaN accumulator is float on every
    # pandas version.
    profile = pandas.Series(index=index, dtype="float64")
    accumulator = list()

    transcript_interator = GTF.transcript_iterator(
        GTF.iterator(options.stdin))

    for transcript in transcript_interator:
        this_profile = transcript_region_meta(transcript, bam, regions,
                                              names, bins,
                                              length_norm=options.rlc)

        if options.pseudo_count:
            # Apply the pseudo count to the current transcript's profile;
            # the earlier code reindexed the running total here and so
            # threw the transcript's own signal away.
            this_profile = this_profile.reindex(index, fill_value=0) + \
                options.pseudo_count

        if options.row_norm:
            this_profile = this_profile / this_profile.sum()

        profile = profile.add(this_profile, fill_value=0)

        if options.matrix:
            # Keep each transcript's own profile for the matrix rather
            # than the cumulative sum so far.
            this_profile.name = transcript[0].transcript_id
            accumulator.append(this_profile)

    if options.normalize_profile:
        profile = profile / profile.sum()

    profile = profile.reindex(names, level="region")
    profile.name = "density"
    profile = profile.reset_index()
    profile.index.name = "bin"
    profile.to_csv(options.stdout, sep="\t", index_label="bin")

    if options.matrix:
        counts_matrix = pandas.concat(accumulator, axis=1)
        # NOTE(review): this round trip swaps transcript ids for positional
        # column labels even though index_label is "transcript_id"; layout
        # left untouched pending confirmation.
        counts_matrix = counts_matrix.transpose()
        counts_matrix = counts_matrix.reset_index(drop=True)
        counts_matrix = counts_matrix.transpose()

        # Explicit close guarantees buffered output reaches the disk.
        matrix_file = IOTools.openFile(options.matrix, "w")
        try:
            counts_matrix.to_csv(matrix_file,
                                 sep="\t",
                                 index=True,
                                 index_label="transcript_id")
        finally:
            matrix_file.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Builds (or loads with ``--use-matrix``) a gene-by-position signal
    matrix, optionally divides it by a normalisation matrix, normalises,
    sorts and compresses it, writes the result as a tab-separated table
    and, unless ``--no-plot`` is given, draws it as a heatmap through rpy2.

    parses command line options in sys.argv, unless *argv* is given.

    Returns:
        1 if the BAM file cannot be opened, 0 if plotting was requested but
        rpy2 could not be imported, otherwise None.

    Raises:
        ValueError: if the normalisation matrix does not match the signal
            matrix, or if a matrix has to be computed but no GTF was given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--gtf-file", dest="gtf", type="string",
                      help="GTF containing gene annotations")
    parser.add_option("-s", "--sort", dest="sort", type="choice",
                      default="length", choices=sort_choices,
                      help="Property to sort rows by. Choices are %s"
                      % ", ".join(sort_choices))
    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      default=25,
                      help="Size of window over which to sum reads")
    parser.add_option("-u", "--upstream-window", dest="us_win", type="int",
                      default=500,
                      help="Amount of sequence upstream of alignment point "
                           "(less introns)")
    parser.add_option("-d", "--downstream-window", dest="ds_win", type="int",
                      default=None,
                      help="Amount of sequence downstream of alignment point "
                           "(default longest segment)")
    parser.add_option("-a", "--align-at", dest="align_at", type="choice",
                      default="start", choices=align_choices,
                      help="Where to align genes/transcripts at. "
                           "Choices are %s"
                      % ", ".join(align_choices))
    parser.add_option("-H", "--height", dest="height", type="int",
                      default=None,
                      help="Number of rows in output matrix/heigh of plot "
                           "in px")
    parser.add_option("-w", "--width", dest="width", type="int",
                      default=None,
                      help="Number of columns in output/width of plot in px "
                           "default based on bin size")
    parser.add_option("-n", "--normalize", dest="normalize", type="choice",
                      default="none", choices=norm_choices,
                      help="Row normalization to apply. Choices are: %s"
                      % ", ".join(norm_choices))
    parser.add_option("-r", "--renormalize", dest="renormalize",
                      type="choice", default="none", choices=norm_choices,
                      help="Row normalization to apply after row/column "
                           "compression")
    parser.add_option("--no-plot", dest="plot", action="store_false",
                      default=True,
                      help="Do not output plot - compute matrix only")
    parser.add_option("--use-matrix", dest="use_matrix", type="string",
                      default=None,
                      help="Use existing matrix")
    parser.add_option("--annotations", dest="annotations", type="choice",
                      action="append", choices=annotation_choices,
                      help="Add annotations to the output plot")
    parser.add_option("--reverse-strand", dest="rstrand",
                      action="store_true", default=False,
                      help="Find reads on reverse strand")
    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript"], default="gene",
                      help="use genes or transcripts")
    parser.add_option("--quantile", dest="quantile", type="float",
                      default=0.99,
                      help="Quantile to use in quantile normalization")
    parser.add_option("-o", "--outfile-prefix", dest="outfile_pattern",
                      type="string", default=None,
                      help="base of names for output files")
    parser.add_option("-c", "--crop", dest="crop", type="string",
                      default=None,
                      help="crop view to a certain range on the xaxis. "
                           "Specify like "
                           "-500:1000")
    parser.add_option("--format", dest="format", type="string",
                      default="png",
                      help="Output format, use valid R graphics device")
    parser.add_option("--plus-wig", dest="plus_wig", type="string",
                      help="Use this wig for plus strand info rather than "
                           "bam file")
    parser.add_option("--minus-wig", dest="minus_wig", type="string",
                      help="Use this wig for minus strand info rather than "
                           "bam file")
    parser.add_option("--bed", dest="bed", type="string",
                      help="Use this bed for signal(must be indexed)")
    parser.add_option("--norm-mat", dest="norm_mat", type="string",
                      help="Use this matrix for normalizing (e.g. RNA data)")
    parser.add_option("--sort-order-file", dest="sort_file", type="string",
                      default=None,
                      help="Two column file containing gene names in the "
                           "first column and a numeric value to sort on in "
                           "the second")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plot and (options.height is None):
        options.height = 100

    # Defined up front so the no-GTF path fails with a clear message
    # instead of a NameError further down.
    lengths = None
    utr3_lengths = None
    utr5_lengths = None
    first_exon_lengths = None

    if options.gtf:

        lengths = dict()
        utr3_lengths = dict()
        utr5_lengths = dict()
        first_exon_lengths = dict()

        f = IOTools.openFile(options.gtf)
        try:
            if options.feature == "gene":
                gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(f))
            else:
                gtf_iterator = GTF.transcript_iterator(GTF.iterator(f))

            for transcript in gtf_iterator:
                transcript_id = transcript[0].transcript_id
                strand = transcript[0].strand

                exons = GTF.asRanges(transcript, "exon")
                utrs = GTF.asRanges(transcript, "UTR")

                lengths[transcript_id] = sum(e[1] - e[0] for e in exons)

                # Exonic sequence left after removing the UTRs.
                coding = Intervals.truncate(exons, utrs)
                coding.sort()

                if coding:
                    utr5 = [utr for utr in utrs
                            if utr[1] <= coding[0][0]]
                    utr3 = [utr for utr in utrs
                            if utr[0] >= coding[-1][-1]]
                else:
                    # No non-UTR sequence: the UTRs cannot be assigned to
                    # an end, so report zero length rather than crash.
                    utr5, utr3 = [], []

                # Intervals are in genomic order; swap for minus strand.
                if strand == "-":
                    utr3, utr5 = utr5, utr3

                if strand == "+" or len(exons) == 1:
                    first_exon = exons[0]
                else:
                    first_exon = exons[-1]
                first_exon_lengths[transcript_id] = \
                    first_exon[1] - first_exon[0]

                utr3_lengths[transcript_id] = sum(
                    e[1] - e[0] for e in utr3)
                utr5_lengths[transcript_id] = sum(
                    e[1] - e[0] for e in utr5)
        finally:
            f.close()

        lengths = pandas.Series(lengths)
        utr3_lengths = pandas.Series(utr3_lengths)
        utr5_lengths = pandas.Series(utr5_lengths)
        first_exon_lengths = pandas.Series(first_exon_lengths)

    else:
        # Without annotations there is nothing to sort by or annotate with.
        options.sort = "none"
        options.annotations = None

    if options.plus_wig:
        getter = iCLIP.make_getter(plus_wig=options.plus_wig,
                                   minus_wig=options.minus_wig)
    elif options.bed:
        getter = iCLIP.make_getter(bedfile=options.bed)
    else:
        try:
            getter = iCLIP.make_getter(bamfile=args[0])
        except IOError:
            E.error("Cannot open bamfile %s" % args[0])
            return 1
        except IndexError:
            # No positional argument: acceptable when --use-matrix is given.
            getter = None

    if options.use_matrix:
        raw_matrix = pandas.read_csv(options.use_matrix,
                                     sep="\t",
                                     index_col=0)
        raw_matrix.columns = raw_matrix.columns.astype("int")
    else:
        if lengths is None:
            raise ValueError(
                "A GTF file (--gtf-file) is required unless --use-matrix "
                "is given")
        raw_matrix = get_matrix(getter, lengths, options)

    if options.crop:
        crop_from, crop_to = map(int, options.crop.split(":"))
        raw_matrix = raw_matrix.loc[:, crop_from:crop_to]

    if options.norm_mat:
        norm_matrix = pandas.read_csv(options.norm_mat,
                                      sep="\t",
                                      index_col=0)
        norm_matrix.columns = norm_matrix.columns.astype("int")

        if options.crop:
            norm_matrix = norm_matrix.loc[:, crop_from:crop_to]

        if all(norm_matrix.columns == raw_matrix.columns) and \
           all(raw_matrix.index.isin(norm_matrix.index.values)):
            norm_matrix = norm_matrix.loc[raw_matrix.index]
            # Zeros are replaced by the smallest positive value so the
            # division below never divides by zero.
            norm_matrix = norm_matrix.replace(
                0, norm_matrix[norm_matrix > 0].min().min())
            raw_matrix = raw_matrix / norm_matrix
            norm_matrix = None
        else:
            raise ValueError("Incompatible normalisation matrix")

    normalized_matrix = normalize(raw_matrix, options.normalize,
                                  quantile=options.quantile)

    if options.sort == "length":
        sorter = lengths
    elif options.sort == "3utr":
        sorter = utr3_lengths
    elif options.sort == "5utr":
        sorter = utr5_lengths
    elif options.sort == "first-exon":
        sorter = first_exon_lengths
    elif options.sort == "manual":
        sorter = pandas.read_csv(options.sort_file, sep="\t",
                                 index_col=0, usecols=[0, 1])
        sorter = sorter[sorter.columns[0]]
    elif options.sort == "none":
        sorter = pandas.Series(range(raw_matrix.shape[0]),
                               index=raw_matrix.index[::-1])

    sorter = sorter[sorter.index.isin(normalized_matrix.index)]
    sorter = sorter.sort_values(ascending=False)
    sorted_matrix = normalized_matrix.loc[sorter.index.values]

    compress_matrix = iCLIP.compress_matrix(sorted_matrix,
                                            ncols=options.width,
                                            nrows=options.height)

    renormalized_matrix = normalize(compress_matrix, options.renormalize,
                                    quantile=options.quantile)

    if renormalized_matrix is raw_matrix and options.use_matrix is not None:
        E.info("Input and output matrices are identical, no matrix output")
    elif options.outfile_pattern:
        # Close explicitly: the compressed stream must be finalised or the
        # .gz file may be left truncated.
        mat_outfile = IOTools.openFile(
            options.outfile_pattern + ".matrix.tsv.gz", "w")
        try:
            renormalized_matrix.to_csv(mat_outfile, sep="\t")
        finally:
            mat_outfile.close()
    else:
        renormalized_matrix.to_csv(options.stdout, sep="\t")

    if options.plot:

        try:
            from rpy2.robjects import r as R
            from rpy2 import robjects as ro
        except Exception:
            # Plotting is best effort: any failure to load rpy2 disables
            # it, but KeyboardInterrupt/SystemExit are no longer swallowed.
            E.info("No rpy2. Not plotting image")
            E.Stop()
            return 0

        from rpy2.robjects.numpy2ri import numpy2ri
        ro.conversion.py2ri = numpy2ri
        ro.numpy2ri.activate()

        # File extension follows the chosen graphics device instead of
        # always being ".png".
        if options.outfile_pattern:
            plot_outfile = options.outfile_pattern + "." + options.format
        else:
            plot_outfile = "bam2heatmap_out." + options.format

        c = R["c"]

        # 72 extra pixels leave room for the margins set through par(mai=).
        R[options.format](plot_outfile,
                          width=renormalized_matrix.shape[1] + 72,
                          height=renormalized_matrix.shape[0] + 72,
                          unit="px",
                          res=72)
        R.par(mai=c(1, 0.5, 0, 0.5))
        cols = R["colorRampPalette"](c("white", "blue"))(50)
        bases = renormalized_matrix.columns.values.astype("int")
        groups = renormalized_matrix.index.values.astype("int")
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent accessor.
        mat = renormalized_matrix.values
        # Cap at the top of the colour scale (zlim below is 0..1).
        mat[mat >= 1] = 1

        R.image(bases, groups, R.t(mat),
                zlim=c(0, 1),
                raster=True,
                col=cols,
                xlab="Base",
                yaxt="n")

        def _sort_and_compress_annotation(anno):
            # NOTE(review): the row count is passed positionally here while
            # the matrix call above uses ncols=/nrows= keywords -- confirm
            # the second positional parameter of compress_matrix.
            sorted_anno = anno.loc[sorter.index]
            comp_anno = iCLIP.compress_matrix(
                sorted_anno, renormalized_matrix.shape[0])
            return comp_anno

        if options.annotations:

            ends = _sort_and_compress_annotation(lengths)
            starts = pandas.Series(0, index=renormalized_matrix.index)

            if options.align_at == "end":
                starts, ends = -1 * ends, starts

            if "start" in options.annotations:
                R.lines(starts.values, starts.index.values,
                        col="black", pch=".")

            if "end" in options.annotations:
                R.lines(ends.values, ends.index.values,
                        pch=".", col="black")

            if "5utr" in options.annotations:
                utr5s = _sort_and_compress_annotation(utr5_lengths)
                utr5s = starts + utr5s
                R.lines(utr5s.values, utr5s.index.values,
                        col="orange", pch=".")

            if "3utr" in options.annotations:
                utr3s = _sort_and_compress_annotation(utr3_lengths)
                utr3s = ends - utr3s
                R.lines(utr3s.values, utr3s.index.values,
                        col="orange", pch=".")

        R["dev.off"]()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Count tags over the exons and introns of each feature in a GTF.

    Parses command line options in sys.argv, unless *argv* is given.

    The GTF is read from ``options.stdin`` and grouped into genes,
    transcripts or single exons according to ``--feature``. The tag source
    is chosen in this order of precedence: ``--unstranded-bw``,
    ``--plus-bw`` together with ``--minus-bw``, ``--bed``, and finally the
    BAM file named by the first positional argument. One tab-separated row
    per feature is written to ``options.stdout`` after a header line.

    Raises:
        ValueError: if ``--plus-bw`` is given without ``--minus-bw``, or if
            no tag source is given at all.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig",
                      type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true", default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        # Wrap every GTF entry in a one-element list so that the loop below
        # can treat all feature types as "a list of entries".
        iterator = ([exon] for exon in iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        if not args:
            raise ValueError(
                "Please provide a BAM file, or one of --unstranded-bw, "
                "--plus-bw/--minus-bw or --bed")
        bamfile = pysam.AlignmentFile(args[0])

    # Intron counts are reported as "NA" for single exons, so there is no
    # point in computing them in that mode.
    count_introns = options.feature != "exon"

    # Rows are written as they are produced rather than buffered, so memory
    # use does not grow with the size of the GTF.
    out = options.stdout
    out.write("\t".join(["gene_id", "transcript_id", "exon_id",
                         "exon_count", "intron_count"]) + "\n")

    for feature in iterator:
        first = feature[0]

        exons = GTF.asRanges(feature, "exon")
        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            first.contig,
                                            first.strand,
                                            dtype="uint32",
                                            use_centre=options.centre).sum()

        if count_introns:
            introns = Intervals.complement(exons)
            intron_counts = float(
                iCLIP.count_intervals(bamfile,
                                      introns,
                                      first.contig,
                                      first.strand,
                                      dtype="uint32",
                                      use_centre=options.centre).sum())
            exon_id = "NA"
        else:
            intron_counts = "NA"
            # Prefer exon_id, fall back to exon_number, else flag as missing.
            try:
                exon_id = first.exon_id
            except AttributeError:
                try:
                    exon_id = first.exon_number
                except AttributeError:
                    exon_id = "missing"

        # str() guards against a non-string exon identifier (presumably
        # exon_number can be parsed as an integer -- TODO confirm).
        out.write("\t".join([first.gene_id,
                             first.transcript_id,
                             str(exon_id),
                             str(float(exon_counts)),
                             str(intron_counts)]) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Compute a meta-gene profile of signal over transcripts.

    Parses command line options in sys.argv, unless *argv* is given.

    Signal is taken from ``--plus-wig`` (optionally with ``--minus-wig``),
    else from ``--bed``, else from the BAM file named by the first
    positional argument. The GTF is read from ``options.stdin`` and handed
    to ``iCLIP.meta_gene``. The summed profile is written to
    ``options.stdout``; if ``--output-matrix`` is given the per-transcript
    matrix is additionally written to that file.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--output-matrix", dest="matrix", type="string",
                      default=None,
                      help="output full matrix to this file")
    parser.add_option("-f", "--flanks", dest="flanks", type="int",
                      default=100,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("-b", "--exon-bins", dest="exon_bins", type="int",
                      default=1000,
                      help="number of bins to divide transcripts into")
    parser.add_option("--flank-bins", dest="flank_bins", type="int",
                      default=10,
                      help="number of bins to divide flanks into")
    parser.add_option("--scale-flanks", dest="scale_flanks",
                      action="store_true", default=False,
                      help="Scale the size of the flank bins to match the "
                      "size of the exon bins for each transcript")
    parser.add_option("--pseudo_count", dest="pseudo_count", type="float",
                      default=0,
                      help="add pseudo count to bins to mitigate effects of "
                      "low numbers of reads")
    parser.add_option("--normalised_profile", dest="normalize_profile",
                      action="store_true", default=False,
                      help="Normalize profile by profile sum")
    parser.add_option("--plus-wig", dest="plus_wig", default=None,
                      help="Use this wig file instead of a BAM file to get "
                      "clip density. May be used as only wig file, or may be "
                      "provided together with --minus-wig for stranded "
                      "computation")
    # "--minus-wig" is accepted as an alias because the --plus-wig help text
    # refers to it; the historical "--minus_wig" spelling keeps working.
    parser.add_option("--minus_wig", "--minus-wig", dest="minus_wig",
                      default=None,
                      help="Use this to provide stranded wig data")
    parser.add_option("--bed", dest="bedfile", default=None,
                      help="Use bed file with signal instead of bam")
    parser.add_option("--use-centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than end")
    parser.add_option("--no-gene-norm", dest="row_norm",
                      action="store_false", default=True,
                      help="Do not normalise profile from each gene")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plus_wig:
        bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                minus_wig=options.minus_wig)
    elif options.bedfile:
        bam = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bam = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    # With flanks the bins are given per region (5' flank, exons, 3' flank);
    # without flanks a single bin count for the exons is passed.
    if options.flanks > 0:
        bins = [options.flank_bins, options.exon_bins, options.flank_bins]
    else:
        bins = options.exon_bins

    summed_matrix, counts_matrix = iCLIP.meta_gene(
        options.stdin,
        bam,
        bins,
        options.flanks,
        output_matrix=(options.matrix is not None),
        calculate_flanks=options.scale_flanks,
        pseudo_count=options.pseudo_count,
        row_norm=options.row_norm)

    if options.flanks > 0:
        # Put the regions in transcript order.
        summed_matrix = summed_matrix[["flank5", "exons", "flank3"]]

    summed_matrix = summed_matrix.reset_index()

    if options.normalize_profile:
        summed_matrix["density"] = (summed_matrix["density"] /
                                    summed_matrix["density"].sum())

    summed_matrix.to_csv(options.stdout, sep="\t", index=True,
                         index_label="bin")

    if options.matrix:
        # Reorder the regions and replace the column labels with a plain
        # running bin number.
        # NOTE(review): unlike the summed profile above, this selection is
        # not guarded by options.flanks > 0 -- confirm that the flank labels
        # exist when --flanks is 0.
        counts_matrix = counts_matrix.transpose()
        counts_matrix = counts_matrix.loc[["flank5", "exons", "flank3"], :]
        counts_matrix = counts_matrix.reset_index(drop=True)
        counts_matrix = counts_matrix.transpose()

        # Close the handle explicitly so that buffered (possibly compressed)
        # output is flushed to disk.
        matrix_file = IOTools.openFile(options.matrix, "w")
        try:
            counts_matrix.to_csv(matrix_file, sep="\t", index=True,
                                 index_label="transcript_id")
        finally:
            matrix_file.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Call significant crosslinked bases by randomisation-based FDR.

    Parses command line options in sys.argv, unless *argv* is given.

    Signal comes from ``--bam-file``, else ``--plus-wig`` (optionally with
    ``--minus-wig``), else ``--bed``; if none is given an error is logged
    and the script exits with status 1. The GTF read from ``options.stdin``
    is grouped by gene or transcript and passed to
    ``iCLIP.get_crosslink_fdr_by_randomisation``. Bases with an FDR at or
    below ``--threshold`` are written to ``options.stdout`` as headerless,
    tab-separated rows of contig, start, end, -log10(FDR), depth, strand.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam", type="string",
                      help="BAM file containing iCLIP reads", default=None)
    parser.add_option("-w", "--wig", "--plus-wig", dest="plus_wig",
                      type="string",
                      help="BigWig file containing signal for already "
                      "processed sample",
                      default=None)
    parser.add_option("--minus-wig", dest="minus_wig", type="string",
                      help="BigWig file containing signal for sample on "
                      "minus strand",
                      default=None)
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="Bed file containing signal for sample")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=15,
                      help="Number of bases each side of each base "
                      "to use when calculating height")
    parser.add_option("-r", "--randomisations", dest="rands", type="int",
                      default=100,
                      help="Number of randomisations to use when "
                      "calculating FDR")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="FDR threshold on which to select bases")
    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["transcript", "gene"],
                      default="gene",
                      help="GTF feature to use. Gene or transcript")
    parser.add_option("-p", "--processes", dest="proc", type="int",
                      default=None,
                      help="Number of processes to use for multiprocessing")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read instead of -1 when no "
                      "mutation is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    pool = None
    if options.proc is not None:
        try:
            import multiprocessing
            pool = multiprocessing.Pool(options.proc)
            E.debug("Operating in multiprocessing mode")
        except ImportError:
            E.warn("Failed to setup multiprocessing, using single processor")
    else:
        E.debug("Operating in single processor mode")

    try:
        if options.feature == "gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
        elif options.feature == "transcript":
            iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
        else:
            raise ValueError("Unknown feature type %s" % options.feature)

        if options.bam:
            bam = iCLIP.make_getter(options.bam, centre=options.centre)
        elif options.plus_wig:
            bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
        elif options.bedfile:
            bam = iCLIP.make_getter(bedfile=options.bedfile)
        else:
            E.error("Please specifiy one of bam file, bed file or wig file")
            sys.exit(1)

        results = iCLIP.get_crosslink_fdr_by_randomisation(
            iterator, bam, options.rands, options.spread, pool)
    finally:
        # Release the worker processes whether or not the computation
        # succeeded; the original never shut the pool down.
        if pool is not None:
            pool.close()
            pool.join()

    results = results[results.fdr <= options.threshold]
    results = results.reset_index()
    results.columns = ["contig", "start", "FDR", "depth", "strand"]

    # Deal with case where there is more than one value on a base. Keep the
    # one with the lowest FDR (ties broken by highest depth).
    results = results.sort_values(by=["FDR", "depth"],
                                  ascending=[True, False])
    results = results.drop_duplicates(["contig", "start", "strand"])
    results = results.sort_values(["contig", "start", "strand"])

    # Each reported interval is a single base: [start, start + 1).
    results["start"] = results["start"].astype("int")
    results["end"] = results.start + 1

    results = results.loc[:, ["contig", "start", "end", "FDR", "depth",
                              "strand"]]
    # The score column holds -log10(FDR).
    results["FDR"] = -numpy.log10(results["FDR"])
    results.to_csv(options.stdout, header=False, index=False, sep="\t")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Convert a BAM file into per-base depth tracks.

    Parses command line options in sys.argv, unless *argv* is given.

    The BAM is read from ``options.stdin`` (reopened by name through pysam
    when it is not the process's standard input). Depth is obtained per
    contig through ``iCLIP.make_getter`` and written, depending on
    ``--format``, as one bed file named by the first positional argument,
    or as two stranded bedGraph or bigWig files whose names start with
    ``<arg>_plus`` and ``<arg>_minus``. Minus-strand depths are negated.
    With ``--cpm`` depths are scaled to counts per million mapped reads.

    Raises:
        ValueError: if no output name is given, or if ``--cpm`` is
            requested for a BAM without mapped reads.
    """
    if argv is None:
        argv = sys.argv

    # optparse only accepts a real list or tuple for ``choices``; a dict
    # view (what .keys() returns on Python 3) is rejected.
    profiles = list(iCLIP.getters.profiles.keys())

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profiles,
                      default="iclip",
                      help="Experiment profile to use. Sets various things "
                      "about obtaining 1-bp position from read. Options are"
                      " %s" % ", ".join(profiles))
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true", default=None,
                      help="Use centre of read rather than first base. "
                      "Overrides profile")
    parser.add_option("-f", "--format", dest="format",
                      choices=["bigWig", "bigwig", "BigWig",
                               "bedGraph", "bg", "bedgraph",
                               "bed", "Bed", "BED"],
                      help="Output format. Either bigWig (2 files, + and - "
                      "strand), bedGraph (2 files), or bed (1 file, depth in "
                      "column 5, strand in column 6)",
                      default="bigWig")
    # NOTE(review): ``output_wig`` is parsed but never read below -- confirm
    # whether -w/--wig is meant to be an alias for --format=bedGraph.
    parser.add_option("-w", "--wig", dest="output_wig", action="store_true",
                      default=False,
                      help="Write output to bedgraph file rather than bigwig")
    parser.add_option("--dtype", dest="dtype", type="string",
                      default="uint32",
                      help="dtype for storing depths")
    parser.add_option("--cpm", dest="cpm", action="store_true",
                      default=False,
                      help="Normalize output depths to number of mapped "
                      "reads (in millions) in BAM")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if not args:
        # Fail before doing any work rather than with an IndexError later.
        raise ValueError("Please provide an output file name or prefix")
    out_name = args[0]

    options.format = options.format.lower()
    if options.format == "bg":
        options.format = "bedgraph"

    profile = iCLIP.getters.profiles[options.profile]

    # --use-centre defaults to None, so any explicit value overrides the
    # profile's own setting.
    centre = True if options.centre is not None else profile.centre

    if options.stdin == sys.stdin:
        in_bam = pysam.Samfile("-", "rb")
    else:
        fn = options.stdin.name
        options.stdin.close()
        in_bam = pysam.Samfile(fn, "rb")

    getter = iCLIP.make_getter(in_bam, profile=profile, centre=centre)

    if options.cpm:
        mapped = sum(contig.mapped
                     for contig in in_bam.get_index_statistics())
        if mapped == 0:
            raise ValueError(
                "Cannot normalise to CPM: BAM contains no mapped reads")
        scale_factor = 1000000.0 / mapped

    if options.format == "bed":
        bedfile = IOTools.openFile(out_name, "w")
    else:
        plus_wig = tempfile.NamedTemporaryFile(delete=False)
        minus_wig = tempfile.NamedTemporaryFile(delete=False)

    contig_sizes = []

    for chrom, chrom_length in zip(in_bam.references, in_bam.lengths):

        # get depths over chromosome
        pos_depth, neg_depth, counter = getter(chrom, strand="both",
                                               dtype=options.dtype)
        pos_depth_sorted = pos_depth.sort_index()
        del pos_depth
        neg_depth_sorted = neg_depth.sort_index()
        del neg_depth
        neg_depth_sorted = -1 * neg_depth_sorted

        # Scale exactly once. The original had a second ``if options.cpm``
        # block that read the already deleted unsorted series and therefore
        # raised a NameError whenever --cpm was used.
        if options.cpm:
            pos_depth_sorted = pos_depth_sorted * scale_factor
            neg_depth_sorted = neg_depth_sorted * scale_factor

        # output to temporary wig file
        if options.format == "bed":
            output2Bed(pos_depth_sorted, neg_depth_sorted, chrom, bedfile)
        else:
            outputToBG(pos_depth_sorted, chrom, chrom_length, plus_wig)
            outputToBG(neg_depth_sorted, chrom, chrom_length, minus_wig)
            contig_sizes.append([chrom, chrom_length])

        del pos_depth_sorted
        del neg_depth_sorted

    if options.format == "bed":
        bedfile.close()
    else:
        plus_wig_name = plus_wig.name
        minus_wig_name = minus_wig.name
        plus_wig.close()
        minus_wig.close()

        outname_plus = out_name + "_plus"
        outname_minus = out_name + "_minus"

        if options.format == "bedgraph":
            E.debug("Outputting to bedGraph")
            shutil.move(plus_wig_name, outname_plus + ".bg")
            shutil.move(minus_wig_name, outname_minus + ".bg")

        elif options.format == "bigwig":
            # Text mode: the default "w+b" rejects str on Python 3.
            chrom_sizes_file = tempfile.NamedTemporaryFile(mode="w",
                                                           delete=False,
                                                           dir=".")
            contig_sizes = ["\t".join(map(str, row)) for row in contig_sizes]
            contig_sizes = "\n".join(contig_sizes) + "\n"
            chrom_sizes_file.write(contig_sizes)
            chrom_sizes_filename = chrom_sizes_file.name
            chrom_sizes_file.close()

            # NOTE(review): the temporary bedGraph and chrom-sizes files are
            # created with delete=False and are not removed here -- confirm
            # whether outputToBW cleans them up.
            outputToBW(plus_wig_name, outname_plus, chrom_sizes_filename)
            outputToBW(minus_wig_name, outname_minus, chrom_sizes_filename)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Test k-mers for enrichment around crosslink signal.

    Parses command line options in sys.argv, unless *argv* is given.

    Signal comes from ``--bed``, else ``-w/--plus-bw`` (optionally with
    ``--minus-bw``), else ``--bam-file``. The GTF read from
    ``options.stdin`` is grouped by gene or transcript and passed, together
    with the indexed genome from ``--fasta-file``, to
    ``iCLIP.pentamer_enrichment``. The result is written to
    ``options.stdout`` as a tab-separated table with the columns ``Kmer``
    and ``Z``.

    Raises:
        ValueError: if no genome fasta or no signal source is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam", type="string",
                      help="BAM file with iCLIP reads")
    parser.add_option("-f", "--fasta-file", dest="fasta", type="string",
                      help="CGAT indexed Fasta file with genome sequence")
    parser.add_option("-k", "--kmer", dest="kmer", type="int",
                      default=5,
                      help="Size of kmer to test default=[%default]")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=15,
                      help="Amount of sequence around each read to consider "
                      "default=[%default]")
    parser.add_option("-n", "--num-randomizations", dest="randomisations",
                      type="int", default=100,
                      help="Number of times to permute profiles to assess "
                      "significance of enrichment")
    parser.add_option("-p", "--processes", dest="proc", type="int",
                      default=None,
                      help="Use this many processors for multiprocessing")
    parser.add_option("--bed", dest="bedfile", type="string",
                      default=None,
                      help="Use signal from bedfile rather than BAM. File "
                      "must be compressed with bgzip and indexed with tabix")
    parser.add_option("-w", "--bigwig", "--plus-bw", dest="plus_wig",
                      default=None,
                      help="Use signal from bigwig rather than BAM, "
                      "to use stranded sequence pass plus strand to this "
                      "option and minus strand to --minus-bw")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      default=None,
                      help="Use minus signal from this bigwig instead of "
                      "BAM. Must pass plus signal to -w/--plus-bw")
    parser.add_option("--feature", dest="feature", type="choice",
                      choices=["transcript", "gene"],
                      default="gene",
                      help="Treat transcripts separately or merge genes")
    parser.add_option("--use-centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read for XL location")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Validate required inputs up front so that the failure is explicit
    # rather than an opaque error from pysam or the fasta reader.
    if not options.fasta:
        raise ValueError("Please provide a genome with -f/--fasta-file")
    if not (options.bedfile or options.plus_wig or options.bam):
        raise ValueError(
            "Please provide one of --bam-file, --bed or -w/--plus-bw")

    pool = None
    if options.proc:
        try:
            import multiprocessing as mp
            pool = mp.Pool(options.proc)
        except ImportError:
            E.warn("Multiprocessing setup failed."
                   " Falling back to single processor mode")

    try:
        if options.bedfile:
            bam = iCLIP.make_getter(bedfile=options.bedfile)
        elif options.plus_wig:
            bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
        else:
            bam = iCLIP.make_getter(
                bamfile=pysam.AlignmentFile(options.bam),
                centre=options.centre)

        fasta = IndexedFasta(options.fasta)
        fasta.setConverter(getConverter("zero-both-open"))

        if options.feature == "gene":
            gtf_iterator = GTF.flat_gene_iterator(
                GTF.iterator(options.stdin))
        else:
            gtf_iterator = GTF.transcript_iterator(
                GTF.iterator(options.stdin))

        results = iCLIP.pentamer_enrichment(gtf_iterator,
                                            bam,
                                            fasta,
                                            options.kmer,
                                            options.randomisations,
                                            spread=options.spread,
                                            pool=pool)
    finally:
        # Release the worker processes whether or not the computation
        # succeeded; the original never shut the pool down.
        if pool is not None:
            pool.close()
            pool.join()

    # The series name becomes the header of the value column.
    results.name = "Z"
    results.to_csv(options.stdout, header=True, index_label="Kmer",
                   sep="\t")

    # write footer and output benchmark information.
    E.Stop()
#!/usr/bin/env python import iCLIP import pandas from CGAT import GTF, IOTools my_getter = iCLIP.make_getter( plus_wig= "/fastdata/mbp15jdp/MRC5_polII_ChIP/mp/bwa.dir/MRC5_polII_ChIP-seq.bw") gtffile = IOTools.openFile("geneset_1000_extension_filtered.gtf.gz") gtf_iterator = GTF.iterator(gtffile) gene_iterator = GTF.flat_gene_iterator(gtf_iterator) def truncate_exon(exon, start, l, strand): if strand == "+": exon.end = min(exon.end, start + l) else: exon.start = max(exon.start, start - l) return exon def get_pausing_ratio(transcript, getter, promoter_up_extension=100, promoter_down_extension=300, gb_start=500, gb_down_extension=2000): strand = transcript[0].strand
def main(argv=None):
    """Convert a BAM file into per-base depth tracks.

    Parses command line options in sys.argv, unless *argv* is given.

    The BAM is read from ``options.stdin`` (reopened by name through pysam
    when it is not the process's standard input). Depth is obtained per
    contig through ``iCLIP.make_getter`` and written, depending on
    ``--format``, as one bed file named by the first positional argument,
    or as two stranded bedGraph or bigWig files whose names start with
    ``<arg>_plus`` and ``<arg>_minus``. Minus-strand depths are negated.
    With ``--cpm`` depths are scaled to counts per million mapped reads.

    Raises:
        ValueError: if no output name is given, or if ``--cpm`` is
            requested for a BAM without mapped reads.
    """
    if argv is None:
        argv = sys.argv

    # optparse only accepts a real list or tuple for ``choices``; a dict
    # view (what .keys() returns on Python 3) is rejected.
    profiles = list(iCLIP.getters.profiles.keys())

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profiles,
                      default="iclip",
                      help="Experiment profile to use. Sets various things "
                      "about obtaining 1-bp position from read. Options are"
                      " %s" % ", ".join(profiles))
    parser.add_option("-c", "--use-centre", dest="centre",
                      action="store_true", default=None,
                      help="Use centre of read rather than first base. "
                      "Overrides profile")
    parser.add_option("-f", "--format", dest="format",
                      choices=["bigWig", "bigwig", "BigWig",
                               "bedGraph", "bg", "bedgraph",
                               "bed", "Bed", "BED"],
                      help="Output format. Either bigWig (2 files, + and - "
                      "strand), bedGraph (2 files), or bed (1 file, depth in "
                      "column 5, strand in column 6)",
                      default="bigWig")
    # NOTE(review): ``output_wig`` is parsed but never read below -- confirm
    # whether -w/--wig is meant to be an alias for --format=bedGraph.
    parser.add_option("-w", "--wig", dest="output_wig", action="store_true",
                      default=False,
                      help="Write output to bedgraph file rather than bigwig")
    parser.add_option("--dtype", dest="dtype", type="string",
                      default="uint32",
                      help="dtype for storing depths")
    parser.add_option("--cpm", dest="cpm", action="store_true",
                      default=False,
                      help="Normalize output depths to number of mapped "
                      "reads (in millions) in BAM")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if not args:
        # Fail before doing any work rather than with an IndexError later.
        raise ValueError("Please provide an output file name or prefix")
    out_name = args[0]

    options.format = options.format.lower()
    if options.format == "bg":
        options.format = "bedgraph"

    profile = iCLIP.getters.profiles[options.profile]

    # --use-centre defaults to None, so any explicit value overrides the
    # profile's own setting.
    centre = True if options.centre is not None else profile.centre

    if options.stdin == sys.stdin:
        in_bam = pysam.Samfile("-", "rb")
    else:
        fn = options.stdin.name
        options.stdin.close()
        in_bam = pysam.Samfile(fn, "rb")

    getter = iCLIP.make_getter(in_bam, profile=profile, centre=centre)

    if options.cpm:
        mapped = sum(contig.mapped
                     for contig in in_bam.get_index_statistics())
        if mapped == 0:
            raise ValueError(
                "Cannot normalise to CPM: BAM contains no mapped reads")
        scale_factor = 1000000.0 / mapped

    if options.format == "bed":
        bedfile = IOTools.openFile(out_name, "w")
    else:
        plus_wig = tempfile.NamedTemporaryFile(delete=False)
        minus_wig = tempfile.NamedTemporaryFile(delete=False)

    contig_sizes = []

    for chrom, chrom_length in zip(in_bam.references, in_bam.lengths):

        # get depths over chromosome
        pos_depth, neg_depth, counter = getter(chrom, strand="both",
                                               dtype=options.dtype)
        pos_depth_sorted = pos_depth.sort_index()
        del pos_depth
        neg_depth_sorted = neg_depth.sort_index()
        del neg_depth
        neg_depth_sorted = -1 * neg_depth_sorted

        # Scale exactly once. The original had a second ``if options.cpm``
        # block that read the already deleted unsorted series and therefore
        # raised a NameError whenever --cpm was used.
        if options.cpm:
            pos_depth_sorted = pos_depth_sorted * scale_factor
            neg_depth_sorted = neg_depth_sorted * scale_factor

        # output to temporary wig file
        if options.format == "bed":
            output2Bed(pos_depth_sorted, neg_depth_sorted, chrom, bedfile)
        else:
            outputToBG(pos_depth_sorted, chrom, chrom_length, plus_wig)
            outputToBG(neg_depth_sorted, chrom, chrom_length, minus_wig)
            contig_sizes.append([chrom, chrom_length])

        del pos_depth_sorted
        del neg_depth_sorted

    if options.format == "bed":
        bedfile.close()
    else:
        plus_wig_name = plus_wig.name
        minus_wig_name = minus_wig.name
        plus_wig.close()
        minus_wig.close()

        outname_plus = out_name + "_plus"
        outname_minus = out_name + "_minus"

        if options.format == "bedgraph":
            E.debug("Outputting to bedGraph")
            shutil.move(plus_wig_name, outname_plus + ".bg")
            shutil.move(minus_wig_name, outname_minus + ".bg")

        elif options.format == "bigwig":
            # Text mode: the default "w+b" rejects str on Python 3.
            chrom_sizes_file = tempfile.NamedTemporaryFile(mode="w",
                                                           delete=False,
                                                           dir=".")
            contig_sizes = ["\t".join(map(str, row)) for row in contig_sizes]
            contig_sizes = "\n".join(contig_sizes) + "\n"
            chrom_sizes_file.write(contig_sizes)
            chrom_sizes_filename = chrom_sizes_file.name
            chrom_sizes_file.close()

            # NOTE(review): the temporary bedGraph and chrom-sizes files are
            # created with delete=False and are not removed here -- confirm
            # whether outputToBW cleans them up.
            outputToBW(plus_wig_name, outname_plus, chrom_sizes_filename)
            outputToBW(minus_wig_name, outname_minus, chrom_sizes_filename)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Call significant crosslinked bases by randomisation-based FDR.

    Parses command line options in sys.argv, unless *argv* is given.

    Signal comes from ``--bam-file``, else ``--plus-wig`` (optionally with
    ``--minus-wig``), else ``--bed``; if none is given an error is logged
    and the script exits with status 1. The GTF read from ``options.stdin``
    is grouped by gene or transcript and passed to
    ``iCLIP.get_crosslink_fdr_by_randomisation``. Bases with an FDR at or
    below ``--threshold`` are written to ``options.stdout`` as headerless,
    tab-separated rows of contig, start, end, -log10(FDR), depth, strand.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam", type="string",
                      help="BAM file containing iCLIP reads", default=None)
    parser.add_option("-w", "--wig", "--plus-wig", dest="plus_wig",
                      type="string",
                      help="BigWig file containing signal for already "
                      "processed sample",
                      default=None)
    parser.add_option("--minus-wig", dest="minus_wig", type="string",
                      help="BigWig file containing signal for sample on "
                      "minus strand",
                      default=None)
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="Bed file containing signal for sample")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=15,
                      help="Number of bases each side of each base "
                      "to use when calculating height")
    parser.add_option("-r", "--randomisations", dest="rands", type="int",
                      default=100,
                      help="Number of randomisations to use when "
                      "calculating FDR")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="FDR threshold on which to select bases")
    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["transcript", "gene"],
                      default="gene",
                      help="GTF feature to use. Gene or transcript")
    parser.add_option("-p", "--processes", dest="proc", type="int",
                      default=None,
                      help="Number of processes to use for multiprocessing")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read instead of -1 when no "
                      "mutation is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    pool = None
    if options.proc is not None:
        try:
            import multiprocessing
            pool = multiprocessing.Pool(options.proc)
            E.debug("Operating in multiprocessing mode")
        except ImportError:
            E.warn("Failed to setup multiprocessing, using single processor")
    else:
        E.debug("Operating in single processor mode")

    try:
        if options.feature == "gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
        elif options.feature == "transcript":
            iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
        else:
            raise ValueError("Unknown feature type %s" % options.feature)

        if options.bam:
            bam = iCLIP.make_getter(options.bam, centre=options.centre)
        elif options.plus_wig:
            bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
        elif options.bedfile:
            bam = iCLIP.make_getter(bedfile=options.bedfile)
        else:
            E.error("Please specifiy one of bam file, bed file or wig file")
            sys.exit(1)

        results = iCLIP.get_crosslink_fdr_by_randomisation(
            iterator, bam, options.rands, options.spread, pool)
    finally:
        # Release the worker processes whether or not the computation
        # succeeded; the original never shut the pool down.
        if pool is not None:
            pool.close()
            pool.join()

    results = results[results.fdr <= options.threshold]
    results = results.reset_index()
    results.columns = ["contig", "start", "FDR", "depth", "strand"]

    # Deal with case where there is more than one value on a base. Keep the
    # one with the lowest FDR (ties broken by highest depth).
    results = results.sort_values(by=["FDR", "depth"],
                                  ascending=[True, False])
    results = results.drop_duplicates(["contig", "start", "strand"])
    results = results.sort_values(["contig", "start", "strand"])

    # Each reported interval is a single base: [start, start + 1).
    results["start"] = results["start"].astype("int")
    results["end"] = results.start + 1

    results = results.loc[:, ["contig", "start", "end", "FDR", "depth",
                              "strand"]]
    # The score column holds -log10(FDR).
    results["FDR"] = -numpy.log10(results["FDR"])
    results.to_csv(options.stdout, header=False, index=False, sep="\t")

    # write footer and output benchmark information.
    E.Stop()