Esempio n. 1
0
def main(argv=None):
    """Script entry point: count tags in the exons and introns of GTF features.

    A GTF is read from ``options.stdin`` and grouped into genes, transcripts
    or single exons according to ``--feature``.  For every feature the counts
    returned by ``iCLIP.count_intervals`` are summed over its exons and, for
    gene/transcript features, over the complement of its exons (the introns).
    One tab-separated row per feature is written to ``options.stdout`` after
    a header line.  For exon features the intron column is ``NA``.

    The tag source is chosen in this order of precedence:
    ``--unstranded-bw``, ``--plus-bw`` together with ``--minus-bw``,
    ``--bed``, or a BAM file given as the first positional argument.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if only one of the two stranded BigWigs is supplied, or
            if no tag source is supplied at all.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--feature",
                      dest="feature",
                      type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw",
                      dest="unstranded_wig",
                      type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw",
                      dest="plus_wig",
                      type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw",
                      dest="minus_wig",
                      type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed",
                      dest="bedfile",
                      type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c",
                      "--use-centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":

        def _exon_iterator(gff_iterator):
            # Wrap every exon in a list so it looks like the multi-entry
            # features produced by the gene/transcript iterators.
            for exon in gff_iterator:
                yield [exon]

        iterator = _exon_iterator(iterator)

    # Select where the tag counts come from.
    if options.unstranded_wig:
        tag_source = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig or options.minus_wig:
        # Stranded BigWigs only make sense as a pair; do not silently ignore
        # a lone --minus-bw.
        if not (options.plus_wig and options.minus_wig):
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        tag_source = iCLIP.make_getter(plus_wig=options.plus_wig,
                                       minus_wig=options.minus_wig)
    elif options.bedfile:
        tag_source = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        if not args:
            raise ValueError(
                "No tag source given: supply a BAM file as the first "
                "positional argument or use --unstranded-bw, "
                "--plus-bw/--minus-bw or --bed")
        tag_source = pysam.AlignmentFile(args[0])

    per_exon = options.feature == "exon"

    outlines = []
    for feature in iterator:
        first = feature[0]
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(tag_source,
                                            exons,
                                            first.contig,
                                            first.strand,
                                            dtype="uint32",
                                            use_centre=options.centre).sum()

        if per_exon:
            # Prefer exon_id, fall back to exon_number, then a placeholder.
            try:
                exon_id = first.exon_id
            except AttributeError:
                try:
                    exon_id = first.exon_number
                except AttributeError:
                    exon_id = "missing"

            # A single exon has no introns: nothing to count.
            intron_field = "NA"
        else:
            exon_id = "NA"
            introns = Intervals.complement(exons)
            intron_counts = iCLIP.count_intervals(
                tag_source,
                introns,
                first.contig,
                first.strand,
                dtype="uint32",
                use_centre=options.centre).sum()
            intron_field = str(float(intron_counts))

        outlines.append([
            first.gene_id,
            first.transcript_id,
            # exon_number may not be a string; join() below needs strings.
            str(exon_id),
            str(float(exon_counts)),
            intron_field,
        ])

    options.stdout.write("\t".join([
        "gene_id", "transcript_id", "exon_id", "exon_count", "intron_count"
    ]) + "\n")

    options.stdout.write(
        "\n".join("\t".join(outline) for outline in outlines) + "\n")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point: build a meta-gene profile over transcript regions.

    Transcripts are read as GTF from ``options.stdin``.  For each transcript
    ``transcript_region_meta`` returns a binned profile over the regions named
    in ``--regions``; the per-transcript profiles are optionally offset by a
    pseudo count, optionally normalised to sum to one, and summed into a
    single profile that is written tab-separated to ``options.stdout``.  With
    ``--output-matrix`` the individual per-transcript profiles are also
    written to the given file.

    The signal comes from ``--plus-wig`` (optionally with ``--minus_wig``),
    from ``--bed``, or from a BAM file given as the first positional argument.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if the number of ``--bins`` differs from the number of
            ``--regions``, or if no signal source is supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--output-matrix",
                      dest="matrix",
                      type="string",
                      default=None,
                      help="output full matrix to this file")
    parser.add_option("-f",
                      "--flanks-length",
                      dest="flanks",
                      type="int",
                      default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("-u",
                      "--upstreamflanks-length",
                      dest="tssupflanks",
                      type="int",
                      default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option(
        "--pseudo_count",
        dest="pseudo_count",
        type="float",
        default=0,
        help=
        "add pseduo count to bins to mitiage effects of low numbers of reads")
    parser.add_option("--normalised_profile",
                      dest="normalize_profile",
                      action="store_true",
                      default=False,
                      help="Normlize profile by profile sum")
    # The help fragments below are joined by implicit concatenation, so each
    # fragment carries its own trailing space.
    parser.add_option(
        "--plus-wig",
        dest="plus_wig",
        default=None,
        help="Use this wig file instead of a BAM file to get clip density "
        "may be used as only wig file, or may be provided together with "
        "--minus-wig for standed computation")
    parser.add_option("--minus_wig",
                      dest="minus_wig",
                      default=None,
                      help="Use this to provide stranded wig data")
    parser.add_option("--bed",
                      dest="bedfile",
                      default=None,
                      help="Use bed file with signal instead of bam")
    parser.add_option("--centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than end")
    parser.add_option("--no-gene-norm",
                      dest="row_norm",
                      action="store_false",
                      default=True,
                      help="Do not normalise profile from each gene")
    parser.add_option(
        "--region-length-correction",
        dest="rlc",
        action="store_true",
        default=False,
        help="Correct for regions of different legnths. Calculates something "
        "akin to an FPKM for the region")
    parser.add_option("-r",
                      "--regions",
                      dest="regions",
                      type="string",
                      default="flank5,exons,flank3",
                      help="Which regions to use. Choose from %s" %
                      ", ".join(regions_dict.keys()))
    parser.add_option("-b",
                      "--bins",
                      dest="bins",
                      action="store",
                      default=None,
                      help="Bins to use. If not specified defaults for the "
                      "chosen regions will be used")
    # optparse only accepts a list or tuple for ``choices``; a dict view
    # (what .keys() returns on Python 3) is rejected when the option is added.
    profile_names = list(iCLIP.getters.profiles.keys())
    parser.add_option("-p",
                      "--profile",
                      dest="profile",
                      type="choice",
                      choices=profile_names,
                      help="Read profile for the experiment. Choose from %s" %
                      ", ".join(profile_names))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plus_wig:
        bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                minus_wig=options.minus_wig)
    elif options.bedfile:
        bam = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        if not args:
            raise ValueError(
                "No signal source given: supply a BAM file as the first "
                "positional argument or use --plus-wig or --bed")
        bam = iCLIP.make_getter(bamfile=args[0],
                                profile=options.profile,
                                centre=options.centre)

    # Bind the requested flank sizes into the region extractors.
    regions_dict['5flank'] = partial(regions_dict['5flank'],
                                     length=options.flanks)
    regions_dict['3flank'] = partial(regions_dict['3flank'],
                                     length=options.flanks)
    regions_dict['tss'] = partial(regions_dict['tss'],
                                  upstream=options.tssupflanks,
                                  downstream=options.flanks)
    regions_dict['tts'] = partial(regions_dict['tts'],
                                  upstream=options.flanks,
                                  downstream=options.flanks)

    names = options.regions.split(",")
    regions = [regions_dict[r] for r in names]

    if options.bins:
        bins = [int(b) for b in options.bins.split(",")]
        if len(bins) != len(regions):
            raise ValueError("Bins and regions not same length")
    else:
        bins = [default_bins[r] for r in names]

    # One (region, bin) entry per bin of every requested region, in order.
    index = pandas.MultiIndex.from_tuples(
        [(name, i) for name, nbins in zip(names, bins) for i in range(nbins)],
        names=["region", "region_bin"])

    # Starts as all-NaN; Series.add(..., fill_value=0) below treats the NaN
    # as zero whenever the transcript profile has a value for that bin.
    profile = pandas.Series(index=index, dtype="float64")
    accumulator = []

    transcript_iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    for transcript in transcript_iterator:
        this_profile = transcript_region_meta(transcript,
                                              bam,
                                              regions,
                                              names,
                                              bins,
                                              length_norm=options.rlc)

        if options.pseudo_count:
            # Offset this transcript's own bins (not the running total);
            # bins it does not cover count as zero before the offset.
            this_profile = (this_profile.reindex(index, fill_value=0) +
                            options.pseudo_count)

        if options.row_norm:
            this_profile = this_profile / this_profile.sum()

        profile = profile.add(this_profile, fill_value=0)

        if options.matrix:
            # Keep the per-transcript profile, labelled with its id.
            accumulator.append(
                this_profile.rename(transcript[0].transcript_id))

    if options.normalize_profile:
        profile = profile / profile.sum()

    profile = profile.reindex(names, level="region")
    profile.name = "density"
    profile = profile.reset_index()
    profile.index.name = "bin"

    profile.to_csv(options.stdout, sep="\t", index_label="bin")

    if options.matrix:
        counts_matrix = pandas.concat(accumulator, axis=1)
        counts_matrix = counts_matrix.transpose()

        # NOTE(review): after the transpose the row index holds the
        # transcript ids, so reset_index(drop=True) replaces them with
        # 0..n-1 before transposing back. Confirm this output layout is
        # what the "transcript_id" index label is meant to describe.
        counts_matrix = counts_matrix.reset_index(drop=True)
        counts_matrix = counts_matrix.transpose()

        # Close the matrix file deterministically once it is written.
        with IOTools.openFile(options.matrix, "w") as matrix_file:
            counts_matrix.to_csv(matrix_file,
                                 sep="\t",
                                 index=True,
                                 index_label="transcript_id")

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 3
0
def main(argv=None):
    """Script entry point: compute per-position p-values across transcripts.

    Genes are read as GTF from ``options.stdin`` and the BAM file is taken
    from the first positional argument.  For every transcript the counts over
    its exons are converted to transcript coordinates and passed to
    ``calculateProbabilities`` (once for the whole transcript, or once per
    5' UTR / CDS / 3' UTR segment with ``--grouping utrs``); introns are
    handled the same way in intron coordinates.  The per-transcript results
    are averaged per gene and handed to an ``InstantOutput`` or
    ``DeferredOutput`` writer.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if no BAM file is given on the command line.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # Multi-line help texts are joined by implicit concatenation, so every
    # fragment carries its own trailing space.
    grouping_choices = ["exons",
                        "utrs",
                        "all"]
    parser.add_option("-g", "--grouping", dest="grouping", type="choice",
                      choices=grouping_choices,
                      help="How to group transcript regions choices are [%s]"
                            % ",".join(grouping_choices))
    parser.add_option("-p", "--pipeout", dest="pipeout", action="store_true",
                      default=False,
                      help="Output continuously to the pipe rather than in a "
                           "chunk at the end")
    parser.add_option("-d", "--dtype", dest="dtype", type="string",
                      default="int32",
                      help="Numpy dtype for storing counts")
    parser.add_option("-w", "--window-size", dest="window_size",
                      type="int", default=15,
                      help="Size of window either size of crosslinked base to "
                           "consider")
    parser.add_option("-f", "--fdr", dest="fdr", action="store_true",
                      default=False,
                      help="perform BH fdr correction on p-values, implies not "
                           "--pipeout")
    parser.add_option("-o", "--output-windows", dest="output_windows",
                      action="store_true",
                      default=False,
                      help="Output consolidated windows isntead of bases")
    parser.add_option("-b", "--output-both", type="string", dest="output_both",
                      default=None,
                      help="Output both bases bedGraph (stdout) and windows "
                           "bed12 (specified file).")
    parser.add_option("-t", "--threshold", dest="threshold", type="float",
                      default=0.05,
                      help="p-value threshold under which to merge windows")
    parser.add_option("-c", "--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than -1 base when no "
                           "mutaiton is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Standard in contains the transcripts
    gffs = GTF.gene_iterator(GTF.iterator(options.stdin))

    # bam file is the first positional argument
    if not args:
        raise ValueError(
            "No BAM file given: supply it as the first positional argument")
    bamfile = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    # Decide which of the two outputs (bases / windows) goes where.
    if options.output_both:
        outfile_bases = options.stdout
        outfile_windows = IOTools.openFile(options.output_both, "w")
    elif options.output_windows:
        outfile_bases = None
        outfile_windows = options.stdout
    else:
        outfile_bases = options.stdout
        outfile_windows = None

    if options.fdr and options.pipeout:
        E.warning("--fdr implies not --pipeout, instant output disabled")
        options.pipeout = False

    if options.pipeout:
        output = InstantOutput(outfile_bases=outfile_bases,
                               outfile_windows=outfile_windows,
                               window_size=options.window_size,
                               threshold=options.threshold)
    else:
        output = DeferredOutput(outfile_bases=outfile_bases,
                                outfile_windows=outfile_windows,
                                correct=options.fdr,
                                window_size=options.window_size,
                                threshold=options.threshold)

    E.info("Counting accross transcripts ...")
    for gene in gffs:

        if options.grouping == "all":
            # NOTE(review): ``gene`` is indexed as gene[0][0] further down,
            # so this must return an indexable sequence - confirm.
            gene = GTF.merged_gene_iterator(gene)

        transcript_ps = {}

        for transcript in gene:

            # E.debug("Transcript is %s" % transcript[0].transcript_id)
            coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)
            exons = GTF.asRanges(transcript, "exon")
            counts = iCLIP.count_intervals(bamfile,
                                           exons,
                                           strand=transcript[0].strand,
                                           contig=transcript[0].contig,
                                           dtype=options.dtype)

            # Work in transcript coordinates from here on.
            counts.index = coords_converter.genome2transcript(
                counts.index.values)
            counts = counts.sort_index()
            cds = GTF.asRanges(transcript, "CDS")

            if options.grouping == "utrs" and len(cds) > 0:

                cds_interval = (cds[0][0], cds[-1][1])
                cds_interval = coords_converter.genome2transcript(cds_interval)
                cds_interval.sort()
                cds_length = cds_interval[1] - cds_interval[0]

                # (start, length) pairs: before the CDS, the CDS itself,
                # and everything after the CDS.
                p_intervals = [(0, cds_interval[0]),
                               (cds_interval[0], cds_length),
                               (cds_interval[1],
                                coords_converter.length - cds_interval[1])]

            else:  # do not group by cds or there is no cds
                p_intervals = [(0, coords_converter.length)]

            # Zero-length segments (e.g. a missing UTR) are skipped.
            p_values = [calculateProbabilities(counts, options.window_size,
                                               length=length, start=start)
                        for start, length in p_intervals
                        if length > 0]

            if len(p_values) > 1:
                p_values = pd.concat(p_values)
            else:
                p_values = p_values[0]

            # Back to genome coordinates for output.
            p_values.index = coords_converter.transcript2genome(
                p_values.index.values)

            intron_intervals = GTF.toIntronIntervals(transcript)

            if len(intron_intervals) > 0:
                intron_coords = iCLIP.TranscriptCoordInterconverter(
                    transcript, introns=True)
                intron_counts = iCLIP.count_intervals(
                    bamfile,
                    intron_intervals,
                    strand=transcript[0].strand,
                    contig=transcript[0].contig,
                    dtype=options.dtype)

                intron_counts.index = intron_coords.genome2transcript(
                    intron_counts.index.values)
                intron_counts = intron_counts.sort_index()
                intron_pvalues = calculateProbabilities(intron_counts,
                                                        options.window_size,
                                                        intron_coords.length)

                intron_pvalues.index = intron_coords.transcript2genome(
                    intron_pvalues.index.values)
                # Series.append was removed in pandas 2.0; concat is the
                # equivalent that works on old and new versions.
                p_values = pd.concat([p_values, intron_pvalues])

            transcript_ps[transcript[0].transcript_id] = p_values

        # One column per transcript, one row per genomic position.
        transcript_df = pd.DataFrame(transcript_ps)

        transcript_df.index.rename("position", inplace=True)
        transcript_df["contig"] = gene[0][0].contig
        transcript_df["strand"] = gene[0][0].strand
        transcript_df["gene_id"] = gene[0][0].gene_id
        transcript_df.set_index("contig", append=True, inplace=True)
        transcript_df.set_index("strand", append=True, inplace=True)
        transcript_df.set_index("gene_id", append=True, inplace=True)

        # Average across the gene's transcripts at every position.
        gene_ps = transcript_df.mean(axis=1)
        gene_ps = gene_ps.reorder_levels(["gene_id", "contig",
                                          "strand", "position"])

        output.write(gene_ps, gene)

    output.close()

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 4
0
def main(argv=None):
    """Script entry point: compute per-position p-values across transcripts.

    Genes are read as GTF from ``options.stdin`` and the BAM file is taken
    from the first positional argument.  For every transcript the counts over
    its exons are converted to transcript coordinates and passed to
    ``calculateProbabilities`` (once for the whole transcript, or once per
    5' UTR / CDS / 3' UTR segment with ``--grouping utrs``); introns are
    handled the same way in intron coordinates.  The per-transcript results
    are averaged per gene and handed to an ``InstantOutput`` or
    ``DeferredOutput`` writer.

    Parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if no BAM file is given on the command line.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # Multi-line help texts are joined by implicit concatenation, so every
    # fragment carries its own trailing space.
    grouping_choices = ["exons", "utrs", "all"]
    parser.add_option("-g",
                      "--grouping",
                      dest="grouping",
                      type="choice",
                      choices=grouping_choices,
                      help="How to group transcript regions choices are [%s]" %
                      ",".join(grouping_choices))
    parser.add_option("-p",
                      "--pipeout",
                      dest="pipeout",
                      action="store_true",
                      default=False,
                      help="Output continuously to the pipe rather than in a "
                      "chunk at the end")
    parser.add_option("-d",
                      "--dtype",
                      dest="dtype",
                      type="string",
                      default="int32",
                      help="Numpy dtype for storing counts")
    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      default=15,
                      help="Size of window either size of crosslinked base to "
                      "consider")
    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      action="store_true",
                      default=False,
                      help="perform BH fdr correction on p-values, implies not "
                      "--pipeout")
    parser.add_option("-o",
                      "--output-windows",
                      dest="output_windows",
                      action="store_true",
                      default=False,
                      help="Output consolidated windows isntead of bases")
    parser.add_option("-b",
                      "--output-both",
                      type="string",
                      dest="output_both",
                      default=None,
                      help="Output both bases bedGraph (stdout) and windows "
                      "bed12 (specified file).")
    parser.add_option("-t",
                      "--threshold",
                      dest="threshold",
                      type="float",
                      default=0.05,
                      help="p-value threshold under which to merge windows")
    parser.add_option("-c",
                      "--centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than -1 base when no "
                      "mutaiton is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Standard in contains the transcripts
    gffs = GTF.gene_iterator(GTF.iterator(options.stdin))

    # bam file is the first positional argument
    if not args:
        raise ValueError(
            "No BAM file given: supply it as the first positional argument")
    bamfile = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    # Decide which of the two outputs (bases / windows) goes where.
    if options.output_both:
        outfile_bases = options.stdout
        outfile_windows = IOTools.openFile(options.output_both, "w")
    elif options.output_windows:
        outfile_bases = None
        outfile_windows = options.stdout
    else:
        outfile_bases = options.stdout
        outfile_windows = None

    if options.fdr and options.pipeout:
        E.warning("--fdr implies not --pipeout, instant output disabled")
        options.pipeout = False

    if options.pipeout:
        output = InstantOutput(outfile_bases=outfile_bases,
                               outfile_windows=outfile_windows,
                               window_size=options.window_size,
                               threshold=options.threshold)
    else:
        output = DeferredOutput(outfile_bases=outfile_bases,
                                outfile_windows=outfile_windows,
                                correct=options.fdr,
                                window_size=options.window_size,
                                threshold=options.threshold)

    E.info("Counting accross transcripts ...")
    for gene in gffs:

        if options.grouping == "all":
            # NOTE(review): ``gene`` is indexed as gene[0][0] further down,
            # so this must return an indexable sequence - confirm.
            gene = GTF.merged_gene_iterator(gene)

        transcript_ps = {}

        for transcript in gene:

            # E.debug("Transcript is %s" % transcript[0].transcript_id)
            coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)
            exons = GTF.asRanges(transcript, "exon")
            counts = iCLIP.count_intervals(bamfile,
                                           exons,
                                           strand=transcript[0].strand,
                                           contig=transcript[0].contig,
                                           dtype=options.dtype)

            # Work in transcript coordinates from here on.
            counts.index = coords_converter.genome2transcript(
                counts.index.values)
            counts = counts.sort_index()
            cds = GTF.asRanges(transcript, "CDS")

            if options.grouping == "utrs" and len(cds) > 0:

                cds_interval = (cds[0][0], cds[-1][1])
                cds_interval = coords_converter.genome2transcript(cds_interval)
                cds_interval.sort()
                cds_length = cds_interval[1] - cds_interval[0]

                # (start, length) pairs: before the CDS, the CDS itself,
                # and everything after the CDS.
                p_intervals = [(0, cds_interval[0]),
                               (cds_interval[0], cds_length),
                               (cds_interval[1],
                                coords_converter.length - cds_interval[1])]

            else:  # do not group by cds or there is no cds
                p_intervals = [(0, coords_converter.length)]

            # Zero-length segments (e.g. a missing UTR) are skipped.
            p_values = [
                calculateProbabilities(counts,
                                       options.window_size,
                                       length=length,
                                       start=start)
                for start, length in p_intervals if length > 0
            ]

            if len(p_values) > 1:
                p_values = pd.concat(p_values)
            else:
                p_values = p_values[0]

            # Back to genome coordinates for output.
            p_values.index = coords_converter.transcript2genome(
                p_values.index.values)

            intron_intervals = GTF.toIntronIntervals(transcript)

            if len(intron_intervals) > 0:
                intron_coords = iCLIP.TranscriptCoordInterconverter(
                    transcript, introns=True)
                intron_counts = iCLIP.count_intervals(
                    bamfile,
                    intron_intervals,
                    strand=transcript[0].strand,
                    contig=transcript[0].contig,
                    dtype=options.dtype)

                intron_counts.index = intron_coords.genome2transcript(
                    intron_counts.index.values)
                intron_counts = intron_counts.sort_index()
                intron_pvalues = calculateProbabilities(
                    intron_counts, options.window_size, intron_coords.length)

                intron_pvalues.index = intron_coords.transcript2genome(
                    intron_pvalues.index.values)
                # Series.append was removed in pandas 2.0; concat is the
                # equivalent that works on old and new versions.
                p_values = pd.concat([p_values, intron_pvalues])

            transcript_ps[transcript[0].transcript_id] = p_values

        # One column per transcript, one row per genomic position.
        transcript_df = pd.DataFrame(transcript_ps)

        transcript_df.index.rename("position", inplace=True)
        transcript_df["contig"] = gene[0][0].contig
        transcript_df["strand"] = gene[0][0].strand
        transcript_df["gene_id"] = gene[0][0].gene_id
        transcript_df.set_index("contig", append=True, inplace=True)
        transcript_df.set_index("strand", append=True, inplace=True)
        transcript_df.set_index("gene_id", append=True, inplace=True)

        # Average across the gene's transcripts at every position.
        gene_ps = transcript_df.mean(axis=1)
        gene_ps = gene_ps.reorder_levels(
            ["gene_id", "contig", "strand", "position"])

        output.write(gene_ps, gene)

    output.close()

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main: binned metagene profile over selected transcript regions.

    Parses command line options in sys.argv, unless *argv* is given.

    Transcripts are read as GTF from ``options.stdin``. For each transcript
    ``transcript_region_meta`` is called with the requested regions and bin
    counts; the resulting per-transcript profiles are (optionally) given a
    pseudo count, (optionally) normalised to sum to one, and summed. The
    summed profile is written tab-separated to ``options.stdout``. If
    ``--output-matrix`` is given, the individual per-transcript profiles are
    also written to that file, one row per transcript.

    The signal source is picked in this order: ``--plus-wig`` (optionally
    with ``--minus_wig``), then ``--bed``, otherwise the BAM file named by
    the first positional argument.

    Raises:
        ValueError: if ``--bins`` does not give exactly one value per region.
    """

    if argv is None:
        argv = sys.argv

    # optparse only accepts a list or tuple for ``choices``; a dict view
    # (what ``.keys()`` returns on Python 3) is rejected.
    profile_names = list(iCLIP.getters.profiles.keys())

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--output-matrix", dest="matrix", type="string",
                      default=None,
                      help="output full matrix to this file")
    parser.add_option("-f", "--flanks-length", dest="flanks", type="int",
                      default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("-u", "--upstreamflanks-length", dest="tssupflanks", type="int",
                      default=1000,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("--pseudo_count", dest="pseudo_count", type="float",
                      default=0,
                      help="add pseduo count to bins to mitiage effects of low numbers of reads")
    parser.add_option("--normalised_profile", dest="normalize_profile", action="store_true",
                      default=False,
                      help="Normlize profile by profile sum")
    parser.add_option("--plus-wig", dest="plus_wig",
                      default=None,
                      help="Use this wig file instead of a BAM file to get clip density "
                      "may be used as only wig file, or may be provided together with "
                      "--minus_wig for standed computation")
    parser.add_option("--minus_wig", dest="minus_wig", default=None,
                      help="Use this to provide stranded wig data")
    parser.add_option("--bed", dest="bedfile", default=None,
                      help="Use bed file with signal instead of bam")
    parser.add_option("--centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than end")
    parser.add_option("--no-gene-norm", dest="row_norm", action="store_false",
                      default=True,
                      help="Do not normalise profile from each gene")
    parser.add_option("--region-length-correction", dest="rlc", action="store_true",
                      default=False,
                      help="Correct for regions of different legnths. Calculates something "
                      "akin to an FPKM for the region")
    parser.add_option("-r", "--regions", dest="regions", type="string",
                      default="flank5,exons,flank3",
                      help="Which regions to use. Choose from %s" %
                      ", ".join(regions_dict.keys()))
    parser.add_option("-b", "--bins", dest="bins", action="store",
                      default=None,
                      help="Bins to use. If not specified defaults for the "
                      "chosen regions will be used")
    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profile_names,
                      help="Read profile for the experiment. Choose from %s"
                      % ", ".join(profile_names))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plus_wig:
        bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                minus_wig=options.minus_wig)
    elif options.bedfile:
        bam = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bam = iCLIP.make_getter(bamfile=args[0], profile=options.profile, centre=options.centre)

    # Bind the flank sizes from the command line into the region extractors.
    regions_dict['5flank'] = partial(regions_dict['5flank'],
                                     length=options.flanks)
    regions_dict['3flank'] = partial(regions_dict['3flank'],
                                     length=options.flanks)
    regions_dict['tss'] = partial(regions_dict['tss'],
                                  upstream=options.tssupflanks,
                                  downstream=options.flanks)
    regions_dict['tts'] = partial(regions_dict['tts'],
                                  upstream=options.flanks,
                                  downstream=options.flanks)

    names = options.regions.split(",")
    regions = [regions_dict[r] for r in names]

    if options.bins:
        bins = [int(b) for b in options.bins.split(",")]
        if not len(bins) == len(regions):
            raise ValueError("Bins and regions not same length")
    else:
        bins = [default_bins[r] for r in names]

    # One (region, bin) entry for every bin of every requested region.
    index = [list(product([n], range(b))) for n, b in zip(names, bins)]
    index = sum(index, [])
    index = pandas.MultiIndex.from_tuples(index,
                                          names=["region", "region_bin"])

    # Explicit dtype: an empty Series without one is deprecated and would
    # otherwise become object dtype on recent pandas.
    profile = pandas.Series(index=index, dtype="float64")
    accumulator = list()

    transcript_iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    for transcript in transcript_iterator:
        this_profile = transcript_region_meta(transcript, bam, regions, names,
                                              bins, length_norm=options.rlc)

        if options.pseudo_count:
            # Add the pseudo count to *this* transcript's bins (filling any
            # bins it lacks with 0), not to the running total.
            this_profile = this_profile.reindex(index, fill_value=0) +\
                options.pseudo_count

        if options.row_norm:
            this_profile = this_profile/this_profile.sum()

        profile = profile.add(this_profile, fill_value=0)

        if options.matrix:
            # Keep the per-transcript profile; the running total is what
            # goes to stdout.
            this_profile.name = transcript[0].transcript_id
            accumulator.append(this_profile)

    if options.normalize_profile:
        profile = profile/profile.sum()

    profile = profile.reindex(names, level="region")
    profile.name = "density"
    profile = profile.reset_index()
    profile.index.name = "bin"

    profile.to_csv(options.stdout, sep="\t", index_label="bin")

    if options.matrix:
        if accumulator:
            # Columns are transcripts, rows are (region, bin).
            counts_matrix = pandas.concat(accumulator, axis=1)
            counts_matrix = counts_matrix.reindex(index)
        else:
            counts_matrix = pandas.DataFrame(index=index)

        # Flatten the (region, bin) rows to consecutive bin numbers while
        # they are still the row index, then transpose so that each row is
        # a transcript labelled by its transcript id.
        counts_matrix = counts_matrix.reset_index(drop=True)
        counts_matrix = counts_matrix.transpose()

        with IOTools.openFile(options.matrix, "w") as matrix_file:
            counts_matrix.to_csv(matrix_file,
                                 sep="\t",
                                 index=True,
                                 index_label="transcript_id")

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 6
0
def main(argv=None):
    """script main: build, normalise, sort and optionally plot a heatmap matrix.

    Parses command line options in sys.argv, unless *argv* is given.

    A matrix is either loaded from ``--use-matrix`` or computed with
    ``get_matrix`` from the signal source (``--plus-wig``/``--minus-wig``,
    ``--bed`` or the BAM named by the first positional argument) and the
    feature lengths read from ``--gtf-file``. The matrix is optionally
    cropped, optionally divided by ``--norm-mat``, normalised, sorted,
    compressed with ``iCLIP.compress_matrix`` and renormalised, then written
    as tab-separated text. Unless ``--no-plot`` is given the result is also
    drawn through rpy2 using the R graphics device named by ``--format``.

    Returns:
        1 if the BAM file cannot be opened, 0 if rpy2 cannot be imported
        when plotting was requested, otherwise None.

    Raises:
        ValueError: if a matrix has to be computed but no GTF was supplied,
            or if ``--norm-mat`` is incompatible with the data matrix.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--gtf-file", dest="gtf", type="string",
                      help="GTF containing gene annotations")
    parser.add_option("-s", "--sort", dest="sort", type="choice",
                      default="length",
                      choices=sort_choices,
                      help="Property to sort rows by. Choices are %s"
                           % ", ".join(sort_choices))
    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      default=25,
                      help="Size of window over which to sum reads")
    parser.add_option("-u", "--upstream-window", dest="us_win", type="int",
                      default=500,
                      help="Amount of sequence upstream of alignment point (less introns)")
    parser.add_option("-d", "--downstream-window", dest="ds_win", type="int",
                      default=None,
                      help="Amount of sequence downstream of alignment point (default longest segment)")
    parser.add_option("-a", "--align-at", dest="align_at", type="choice",
                      default="start",
                      choices=align_choices,
                      help="Where to align genes/transcripts at. Choices are %s"
                            % ", ".join(align_choices))
    parser.add_option("-H", "--height", dest="height", type="int",
                      default=None,
                      help="Number of rows in output matrix/heigh of plot in px")
    parser.add_option("-w", "--width", dest="width", type="int",
                      default=None,
                      help="Number of columns in output/width of plot in px "
                           "default based on bin size")
    parser.add_option("-n", "--normalize", dest="normalize", type="choice",
                      default="none",
                      choices=norm_choices,
                      help="Row normalization to apply. Choices are: %s"
                           % ", ".join(norm_choices))
    parser.add_option("-r", "--renormalize", dest="renormalize", type="choice",
                      default="none",
                      choices=norm_choices,
                      help="Row normalization to apply after row/column compression")
    parser.add_option("--no-plot", dest="plot", action="store_false",
                      default=True,
                      help="Do not output plot - compute matrix only")
    parser.add_option("--use-matrix", dest="use_matrix", type="string",
                      default=None,
                      help="Use existing matrix")
    parser.add_option("--annotations", dest="annotations", type="choice",
                      action="append",
                      choices=annotation_choices,
                      help="Add annotations to the output plot")
    parser.add_option("--reverse-strand", dest="rstrand", action="store_true",
                      default=False,
                      help="Find reads on reverse strand")
    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript"],
                      default="gene",
                      help="use genes or transcripts")
    parser.add_option("--quantile", dest="quantile", type="float",
                      default=0.99,
                      help="Quantile to use in quantile normalization")
    parser.add_option("-o", "--outfile-prefix", dest="outfile_pattern", type="string",
                      default=None,
                      help="base of names for output files")
    parser.add_option("-c", "--crop", dest="crop", type="string",
                      default=None,
                      help="crop view to a certain range on the xaxis. Specify like "
                      "-500:1000")
    parser.add_option("--format", dest="format", type="string",
                      default="png",
                      help="Output format, use valid R graphics device")
    parser.add_option("--plus-wig", dest="plus_wig", type="string",
                      help="Use this wig for plus strand info rather than bam file")
    parser.add_option("--minus-wig", dest="minus_wig", type="string",
                      help="Use this wig for minus strand info rather than bam file")
    parser.add_option("--bed", dest="bed", type="string",
                      help="Use this bed for signal(must be indexed)")
    parser.add_option("--norm-mat", dest="norm_mat", type="string",
                      help="Use this matrix for normalizing (e.g. RNA data")
    # The help text is split with implicit concatenation; a string literal
    # may not span physical lines.
    parser.add_option("--sort-order-file", dest="sort_file", type="string",
                      default=None,
                      help="Two column file containing gene names in the first "
                      "column and a numeric value to sort on in the second")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plot and (options.height is None):
        options.height = 100

    if options.gtf:

        lengths = dict()
        utr3_lengths = dict()
        utr5_lengths = dict()
        first_exon_lengths = dict()

        # The GTF iterators are lazy, so the whole loop has to run while the
        # file is still open.
        with IOTools.openFile(options.gtf) as gtf_file:
            if options.feature == "gene":
                gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(gtf_file))
            else:
                gtf_iterator = GTF.transcript_iterator(GTF.iterator(gtf_file))

            for transcript in gtf_iterator:
                transcript_id = transcript[0].transcript_id

                exons = GTF.asRanges(transcript, "exon")
                lengths[transcript_id] = sum(e[1] - e[0] for e in exons)

                utrs = GTF.asRanges(transcript, "UTR")
                coding = Intervals.truncate(exons, utrs)
                coding.sort()

                if coding:
                    # UTR segments entirely left/right of the coding part,
                    # in genomic coordinates.
                    utr5 = [utr for utr in utrs if utr[1] <= coding[0][0]]
                    utr3 = [utr for utr in utrs if utr[0] >= coding[-1][-1]]
                else:
                    # Nothing left after removing UTRs: the UTRs cannot be
                    # placed relative to a coding region.
                    utr5, utr3 = [], []

                if transcript[0].strand == "-":
                    utr3, utr5 = utr5, utr3

                # On the minus strand the first exon is the last range.
                if transcript[0].strand == "+" or len(exons) == 1:
                    first_exon_lengths[transcript_id] = \
                        exons[0][1] - exons[0][0]
                else:
                    first_exon_lengths[transcript_id] = \
                        exons[-1][1] - exons[-1][0]

                utr3_lengths[transcript_id] = sum(e[1] - e[0] for e in utr3)
                utr5_lengths[transcript_id] = sum(e[1] - e[0] for e in utr5)

        lengths = pandas.Series(lengths)
        utr3_lengths = pandas.Series(utr3_lengths)
        utr5_lengths = pandas.Series(utr5_lengths)
        first_exon_lengths = pandas.Series(first_exon_lengths)

    else:
        # Without annotations there is nothing to sort by or annotate with.
        options.sort = "none"
        options.annotations = None

    if options.plus_wig:
        getter = iCLIP.make_getter(plus_wig=options.plus_wig,
                                   minus_wig=options.minus_wig)
    elif options.bed:
        getter = iCLIP.make_getter(bedfile=options.bed)
    else:
        try:
            getter = iCLIP.make_getter(bamfile=args[0])
        except IOError:
            E.error("Cannot open bamfile %s" % args[0])
            return(1)
        except IndexError:
            # No positional argument: acceptable when --use-matrix is given.
            getter = None

    if options.use_matrix:
        raw_matrix = pandas.read_csv(options.use_matrix,
                                     sep="\t",
                                     index_col=0)
        raw_matrix.columns = raw_matrix.columns.astype("int")
    else:
        if not options.gtf:
            # ``lengths`` only exists when a GTF was read.
            raise ValueError(
                "A GTF file (--gtf-file) is required unless --use-matrix "
                "is given")
        raw_matrix = get_matrix(getter, lengths, options)

    if options.crop:
        crop_from, crop_to = map(int, options.crop.split(":"))
        raw_matrix = raw_matrix.loc[:, crop_from:crop_to]

    if options.norm_mat:
        norm_matrix = pandas.read_csv(options.norm_mat,
                                      sep="\t",
                                      index_col=0)
        norm_matrix.columns = norm_matrix.columns.astype("int")

        if options.crop:
            norm_matrix = norm_matrix.loc[:, crop_from:crop_to]

        if all(norm_matrix.columns == raw_matrix.columns) and \
           all(raw_matrix.index.isin(norm_matrix.index.values)):
            norm_matrix = norm_matrix.loc[raw_matrix.index]
            # Replace zeros by the smallest positive value so the division
            # below never divides by zero.
            norm_matrix = norm_matrix.replace(
                0, norm_matrix[norm_matrix > 0].min().min())
            raw_matrix = raw_matrix/norm_matrix
            norm_matrix = None

        else:
            raise ValueError("Incompatible normalisation matrix")

    normalized_matrix = normalize(raw_matrix, options.normalize,
                                  quantile=options.quantile)

    if options.sort == "length":
        sorter = lengths
    elif options.sort == "3utr":
        sorter = utr3_lengths
    elif options.sort == "5utr":
        sorter = utr5_lengths
    elif options.sort == "first-exon":
        sorter = first_exon_lengths
    elif options.sort == "manual":
        sorter = pandas.read_csv(options.sort_file, sep="\t",
                                 index_col=0, usecols=[0, 1])
        sorter = sorter[sorter.columns[0]]
    elif options.sort == "none":
        # Rank rows by reversed position so the descending sort below keeps
        # the original row order.
        sorter = pandas.Series(range(raw_matrix.shape[0]),
                               index=raw_matrix.index[::-1])

    sorter = sorter[sorter.index.isin(normalized_matrix.index)]
    sorter = sorter.sort_values(ascending=False)
    sorted_matrix = normalized_matrix.loc[sorter.index.values]

    compress_matrix = iCLIP.compress_matrix(sorted_matrix,
                                            ncols=options.width,
                                            nrows=options.height)

    renormalized_matrix = normalize(compress_matrix, options.renormalize,
                                    quantile=options.quantile)

    if renormalized_matrix is raw_matrix and options.use_matrix is not None:
        E.info("Input and output matrices are identical, no matrix output")
    elif options.outfile_pattern:
        # Close the handle explicitly so the compressed stream is flushed.
        with IOTools.openFile(
                options.outfile_pattern + ".matrix.tsv.gz", "w") as mat_outfile:
            renormalized_matrix.to_csv(mat_outfile, sep="\t")
    else:
        renormalized_matrix.to_csv(options.stdout, sep="\t")

    if options.plot:

        try:
            from rpy2.robjects import r as R
            from rpy2 import robjects as ro
        except Exception:
            # Plotting is best effort; do not swallow KeyboardInterrupt or
            # SystemExit as a bare except would.
            E.info("No rpy2. Not plotting image")
            return(0)

        from rpy2.robjects.numpy2ri import numpy2ri
        ro.conversion.py2ri = numpy2ri
        ro.numpy2ri.activate()

        # Name the file after the graphics device actually used.
        if options.outfile_pattern:
            plot_outfile = options.outfile_pattern + "." + options.format
        else:
            plot_outfile = "bam2heatmap_out." + options.format

        c = R["c"]

        R[options.format](plot_outfile,
                          width=renormalized_matrix.shape[1] + 72,
                          height=renormalized_matrix.shape[0] + 72,
                          unit="px",
                          res=72)
        R.par(mai=c(1, 0.5, 0, 0.5))
        cols = R["colorRampPalette"](c("white", "blue"))(50)
        bases = renormalized_matrix.columns.values.astype("int")
        groups = renormalized_matrix.index.values.astype("int")
        # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
        # equivalent available in all versions.
        mat = renormalized_matrix.values
        # Clip to the top of the colour scale.
        mat[mat >= 1] = 1

        R.image(bases, groups, R.t(mat),
                zlim=c(0, 1),
                raster=True,
                col=cols,
                xlab="Base",
                yaxt="n")

        def _sort_and_compress_annotation(anno):
            """Order *anno* like the matrix rows and compress it to match."""
            sorted_anno = anno.loc[sorter.index]
            comp_anno = iCLIP.compress_matrix(
                sorted_anno, renormalized_matrix.shape[0])
            return comp_anno

        if options.annotations:
            ends = _sort_and_compress_annotation(lengths)
            starts = pandas.Series(0, index=renormalized_matrix.index)

            if options.align_at == "end":
                starts, ends = -1 * ends, starts

            if "start" in options.annotations:
                R.lines(starts.values, starts.index.values, col="black", pch=".")
            if "end" in options.annotations:
                R.lines(ends.values, ends.index.values,
                        pch=".", col="black")
            if "5utr" in options.annotations:
                utr5s = _sort_and_compress_annotation(utr5_lengths)
                utr5s = starts + utr5s
                R.lines(utr5s.values, utr5s.index.values, col="orange", pch=".")
            if "3utr" in options.annotations:
                utr3s = _sort_and_compress_annotation(utr3_lengths)
                utr3s = ends - utr3s
                R.lines(utr3s.values, utr3s.index.values, col="orange", pch=".")

        R["dev.off"]()

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 7
0
def main(argv=None):
    """script main: count tags in the exons and introns of each feature.

    Parses command line options in sys.argv, unless *argv* is given.

    GTF records are read from ``options.stdin`` and grouped by ``--feature``
    (gene, transcript or single exon). For each group the tag counts
    returned by ``iCLIP.count_intervals`` are summed over its exons and,
    except in exon mode, over the gaps between them. One tab-separated row
    per feature is written to ``options.stdout`` after a header line; in
    exon mode the intron column is ``NA``, otherwise the exon id column is
    ``NA``.

    The signal source is picked in this order: ``--unstranded-bw``,
    ``--plus-bw`` together with ``--minus-bw``, ``--bed``, otherwise the BAM
    file named by the first positional argument.

    Raises:
        ValueError: if ``--plus-bw`` is given without ``--minus-bw``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts")
    parser.add_option("-c", "--use-centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            """Wrap every record in a list so it looks like a feature group."""
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            # Name the option as it is actually spelled on the command line.
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded-bw")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []
    for feature in iterator:
        first = feature[0]
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            contig=first.contig,
                                            strand=first.strand,
                                            dtype="uint32",
                                            use_centre=options.centre)
        exon_counts = exon_counts.sum()

        if options.feature == "exon":
            # A single exon has no introns, so nothing is counted and the
            # column is reported as NA.
            intron_counts = "NA"

            try:
                exon_id = first.exon_id
            except AttributeError:
                try:
                    exon_id = first.exon_number
                except AttributeError:
                    exon_id = "missing"
        else:
            exon_id = "NA"

            introns = Intervals.complement(exons)
            intron_counts = iCLIP.count_intervals(bamfile,
                                                  introns,
                                                  contig=first.contig,
                                                  strand=first.strand,
                                                  dtype="uint32",
                                                  use_centre=options.centre)
            intron_counts = float(intron_counts.sum())

        outlines.append([first.gene_id,
                         first.transcript_id,
                         exon_id,
                         str(float(exon_counts)),
                         str(intron_counts)])

    options.stdout.write("\t".join(["gene_id",
                                    "transcript_id",
                                    "exon_id",
                                    "exon_count",
                                    "intron_count"])+"\n")

    # One line per feature; with no features only the header is written.
    for outline in outlines:
        options.stdout.write("\t".join(outline) + "\n")

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 8
0
def main(argv=None):
    """script main: metagene profile over flanks and exons via iCLIP.meta_gene.

    Parses command line options in sys.argv, unless *argv* is given.

    GTF is read from ``options.stdin`` and handed to ``iCLIP.meta_gene``
    together with the signal source and the bin layout. The summed profile
    is written tab-separated to ``options.stdout``; with ``--output-matrix``
    the per-transcript matrix returned by ``iCLIP.meta_gene`` is also
    written to that file.

    The signal source is picked in this order: ``--plus-wig`` (optionally
    with ``--minus_wig``), then ``--bed``, otherwise the BAM file named by
    the first positional argument.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--output-matrix",
                      dest="matrix",
                      type="string",
                      default=None,
                      help="output full matrix to this file")
    parser.add_option("-f",
                      "--flanks",
                      dest="flanks",
                      type="int",
                      default=100,
                      help="number of basepairs to use for gene flanks")
    parser.add_option("-b",
                      "--exon-bins",
                      dest="exon_bins",
                      type="int",
                      default=1000,
                      help="number of bins to divide transcripts into")
    parser.add_option("--flank-bins",
                      dest="flank_bins",
                      type="int",
                      default=10,
                      help="number of bins to divide flanks into")
    parser.add_option(
        "--scale-flanks",
        dest="scale_flanks",
        action="store_true",
        default=False,
        help="Scale the size of the flank bins to match the size of the "
        "exon bins for each transcript")
    parser.add_option(
        "--pseudo_count",
        dest="pseudo_count",
        type="float",
        default=0,
        help=
        "add pseduo count to bins to mitiage effects of low numbers of reads")
    parser.add_option("--normalised_profile",
                      dest="normalize_profile",
                      action="store_true",
                      default=False,
                      help="Normlize profile by profile sum")
    parser.add_option(
        "--plus-wig",
        dest="plus_wig",
        default=None,
        help="Use this wig file instead of a BAM file to get clip density "
        "may be used as only wig file, or may be provided together with "
        "--minus_wig for standed computation")
    parser.add_option("--minus_wig",
                      dest="minus_wig",
                      default=None,
                      help="Use this to provide stranded wig data")
    parser.add_option("--bed",
                      dest="bedfile",
                      default=None,
                      help="Use bed file with signal instead of bam")
    parser.add_option("--use-centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than end")
    parser.add_option("--no-gene-norm",
                      dest="row_norm",
                      action="store_false",
                      default=True,
                      help="Do not normalise profile from each gene")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.plus_wig:
        bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                minus_wig=options.minus_wig)
    elif options.bedfile:
        bam = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bam = iCLIP.make_getter(bamfile=args[0], centre=options.centre)

    use_flanks = options.flanks > 0
    region_order = ["flank5", "exons", "flank3"]

    if use_flanks:
        bins = [options.flank_bins, options.exon_bins, options.flank_bins]
    else:
        bins = options.exon_bins

    summed_matrix, counts_matrix = iCLIP.meta_gene(
        options.stdin,
        bam,
        bins,
        options.flanks,
        output_matrix=(options.matrix is not None),
        calculate_flanks=options.scale_flanks,
        pseudo_count=options.pseudo_count,
        row_norm=options.row_norm)

    if use_flanks:
        summed_matrix = summed_matrix[region_order]

    summed_matrix = summed_matrix.reset_index()
    if options.normalize_profile:
        summed_matrix["density"] = summed_matrix["density"] / summed_matrix[
            "density"].sum()

    summed_matrix.to_csv(options.stdout,
                         sep="\t",
                         index=True,
                         index_label="bin")

    if options.matrix:
        counts_matrix = counts_matrix.transpose()

        # Only reorder by region when flanks were requested, mirroring the
        # guard used for the summed profile above; selecting flank labels
        # that are not present would fail.
        if use_flanks:
            counts_matrix = counts_matrix.loc[region_order, :]
        counts_matrix = counts_matrix.reset_index(drop=True)
        counts_matrix = counts_matrix.transpose()

        with IOTools.openFile(options.matrix, "w") as matrix_file:
            counts_matrix.to_csv(matrix_file,
                                 sep="\t",
                                 index=True,
                                 index_label="transcript_id")

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 9
0
def main(argv=None):
    """script main: report significant crosslinked bases by randomisation FDR.

    parses command line options in sys.argv, unless *argv* is given.

    GTF is read from ``options.stdin`` and grouped by ``--feature``. The
    groups and the signal source are passed to
    ``iCLIP.get_crosslink_fdr_by_randomisation``; rows with an FDR at or
    below ``--threshold`` are kept, de-duplicated per (contig, start,
    strand) keeping the lowest FDR (ties broken by highest depth), and
    written to ``options.stdout`` as headerless tab-separated rows of
    contig, start, end, -log10(FDR), depth and strand.

    Exits with status 1 if none of ``--bam-file``, ``--wig``/``--plus-wig``
    or ``--bed`` is given.

    Raises:
        ValueError: if ``--feature`` is neither "gene" nor "transcript".
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam",
                      type="string",
                      help="BAM file containing iCLIP reads",
                      default=None)
    parser.add_option(
        "-w",
        "--wig",
        "--plus-wig",
        dest="plus_wig",
        type="string",
        help="BigWig file containing signal for already processed sample",
        default=None)
    parser.add_option(
        "--minus-wig",
        dest="minus_wig",
        type="string",
        help="BigWig file containing signal for sample on minus strand",
        default=None)
    parser.add_option("--bed",
                      dest="bedfile",
                      type="string",
                      help="Bed file containing signal for sample")
    parser.add_option("-s",
                      "--spread",
                      dest="spread",
                      type="int",
                      default=15,
                      help="Number of bases each site of each bases "
                      "to use when calculating height")
    parser.add_option("-r",
                      "--randomisations",
                      dest="rands",
                      type="int",
                      default=100,
                      help="Number of randomisations to use when "
                      "calculating FDR")
    parser.add_option("-t",
                      "--threshold",
                      dest="threshold",
                      type="float",
                      default=0.05,
                      help="FDR threshold on which to select bases")
    parser.add_option("-f",
                      "--feature",
                      dest="feature",
                      type="choice",
                      choices=["transcript", "gene"],
                      default="gene",
                      help="GTF feature to use. Gene or transcript")
    parser.add_option("-p",
                      "--processes",
                      dest="proc",
                      type="int",
                      default=None,
                      help="Number of processes to use for multiprocessing")
    parser.add_option("-c",
                      "--centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read instead of -1 when no "
                      "mutation is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.proc is not None:
        try:
            import multiprocessing
            pool = multiprocessing.Pool(options.proc)
            E.debug("Operating in multiprocessing mode")
        except ImportError:
            E.warn("Failed to setup multiprocessing, using single processor")
            pool = None
    else:
        E.debug("Operating in single processor mode")
        pool = None

    # Everything that can fail while the pool is alive sits inside the
    # try block so the worker processes are always shut down.
    try:
        if options.feature == "gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
        elif options.feature == "transcript":
            iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
        else:
            raise ValueError("Unknown feature type %s" % options.feature)

        if options.bam:
            bam = iCLIP.make_getter(bamfile=options.bam,
                                    centre=options.centre)
        elif options.plus_wig:
            bam = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
        elif options.bedfile:
            bam = iCLIP.make_getter(bedfile=options.bedfile)
        else:
            E.error("Please specifiy one of bam file, bed file or wig file")
            sys.exit(1)

        results = iCLIP.get_crosslink_fdr_by_randomisation(iterator, bam,
                                                           options.rands,
                                                           options.spread,
                                                           pool)
    finally:
        if pool is not None:
            # Same shutdown the Pool context manager performs on exit.
            pool.terminate()
            pool.join()

    results = results[results.fdr <= options.threshold]
    results = results.reset_index()
    results.columns = ["contig", "start", "FDR", "depth", "strand"]

    # Deal with case where there is more than one value on a base. Keep one with
    # lowest FDR.
    results = results.sort_values(by=["FDR", "depth"], ascending=[True, False])
    results = results.drop_duplicates(["contig", "start", "strand"])
    results = results.sort_values(["contig", "start", "strand"])

    results["start"] = results["start"].astype("int")
    # Each reported site is a single base.
    results["end"] = results.start + 1
    results = results.loc[:,
                          ["contig", "start", "end", "FDR", "depth", "strand"]]
    results["FDR"] = -numpy.log10(results["FDR"])
    results.to_csv(options.stdout, header=False, index=False, sep="\t")
    # write footer and output benchmark information.
    E.Stop()
Esempio n. 10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Opens the BAM file supplied on stdin (or the file behind
    ``options.stdin``), asks an ``iCLIP`` getter for the per-base depth on
    each strand of every reference sequence and writes the depths in the
    requested format. The first positional argument names the output:

    * ``bed``: a single file with that name;
    * ``bedgraph``: ``<name>_plus.bg`` and ``<name>_minus.bg``;
    * ``bigwig``: the two temporary bedGraph files plus a chromosome
      sizes file are handed to ``outputToBW``.

    Minus strand depths are multiplied by -1 before output. With ``--cpm``
    all depths are scaled by 1e6 / (number of mapped reads).

    Raises
    ------
    ValueError
        If no output name is given, or ``--cpm`` is requested for a BAM
        file whose index reports no mapped reads.
    """

    if argv is None:
        argv = sys.argv

    # optparse only accepts a list or tuple for ``choices``; materialise
    # the keys so a Python 3 dict view is not rejected.
    profiles = list(iCLIP.getters.profiles.keys())
    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--profile",
                      dest="profile",
                      type="choice",
                      choices=profiles,
                      default="iclip",
                      help="Experiment profile to use. Sets various things"
                      "about obtaining 1-bp position from read. Options are"
                      " %s" % ", ".join(profiles))
    parser.add_option("-c",
                      "--use-centre",
                      dest="centre",
                      action="store_true",
                      default=None,
                      help="Use centre of read rather than frist base."
                      "Overrides profile")
    parser.add_option(
        "-f",
        "--format",
        dest="format",
        choices=[
            "bigWig", "bigwig", "BigWig", "bedGraph", "bg", "bedgraph", "bed",
            "Bed", "BED"
        ],
        help="Output format. Either bigWig (2 files, + and - strand)"
        ", bedGraph (2 files), or bed (1 file, depth in column 5,"
        "strand in column 6",
        default="bigWig")
    parser.add_option("-w",
                      "--wig",
                      dest="output_wig",
                      action="store_true",
                      default=False,
                      help="Write output to bedgraph file rather than bigwig")
    parser.add_option("--dtype",
                      dest="dtype",
                      type="string",
                      default="uint32",
                      help="dtype for storing depths")
    parser.add_option(
        "--cpm",
        dest="cpm",
        action="store_true",
        default=False,
        help=
        "Normalize output depths to number of mapped reads (in millions) in BAM"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a clear message instead of an IndexError on args[0]
    # after the whole BAM file has been processed.
    if not args:
        raise ValueError(
            "Please supply the output file name/prefix as first argument")

    # Normalise the format aliases to "bigwig", "bedgraph" or "bed".
    options.format = options.format.lower()
    if options.format == "bg":
        options.format = "bedgraph"

    profile = iCLIP.getters.profiles[options.profile]

    # -c is a store_true flag defaulting to None, so "not None" means the
    # user asked for it; otherwise the profile decides.
    if options.centre is not None:
        centre = True
    else:
        centre = profile.centre

    if options.stdin == sys.stdin:
        in_bam = pysam.Samfile("-", "rb")

    else:
        # Re-open by name so that pysam owns the file handle.
        fn = options.stdin.name
        options.stdin.close()
        in_bam = pysam.Samfile(fn, "rb")

    getter = iCLIP.make_getter(in_bam, profile=profile, centre=centre)

    if options.cpm:
        mapped_reads = sum(contig.mapped
                           for contig in in_bam.get_index_statistics())
        if mapped_reads == 0:
            raise ValueError(
                "Cannot normalise to CPM: no mapped reads reported for BAM")

        scale_factor = 1000000.0 / mapped_reads

    if options.format == "bed":
        bedfile = IOTools.openFile(args[0], "w")
    else:
        plus_wig = tempfile.NamedTemporaryFile(delete=False)
        minus_wig = tempfile.NamedTemporaryFile(delete=False)

    contig_sizes = []

    for chrom, chrom_length in zip(in_bam.references, in_bam.lengths):

        # get depths over chromosome
        pos_depth, neg_depth, _ = getter(chrom,
                                         strand="both",
                                         dtype=options.dtype)
        # Drop the unsorted series as soon as the sorted copy exists to
        # keep peak memory down.
        pos_depth_sorted = pos_depth.sort_index()
        del pos_depth
        neg_depth_sorted = neg_depth.sort_index()
        del neg_depth
        neg_depth_sorted = -1 * neg_depth_sorted

        # Scale exactly once; the unsorted series no longer exist here.
        if options.cpm:
            pos_depth_sorted = pos_depth_sorted * scale_factor
            neg_depth_sorted = neg_depth_sorted * scale_factor

        # output to temporary wig file
        if options.format == "bed":
            output2Bed(pos_depth_sorted, neg_depth_sorted, chrom, bedfile)
        else:
            outputToBG(pos_depth_sorted, chrom, chrom_length, plus_wig)
            outputToBG(neg_depth_sorted, chrom, chrom_length, minus_wig)

        contig_sizes.append([chrom, chrom_length])

        del pos_depth_sorted
        del neg_depth_sorted

    if options.format == "bed":
        bedfile.close()
    else:
        plus_wig_name = plus_wig.name
        minus_wig_name = minus_wig.name
        plus_wig.close()
        minus_wig.close()

    outname_plus = args[0] + "_plus"
    outname_minus = args[0] + "_minus"

    if options.format == "bedgraph":
        E.debug("Outputting to bedGraph")
        shutil.move(plus_wig_name, outname_plus + ".bg")
        shutil.move(minus_wig_name, outname_minus + ".bg")

    elif options.format == "bigwig":
        # Text mode: the sizes table is written as str, which the default
        # binary mode ("w+b") rejects on Python 3.
        chrom_sizes_file = tempfile.NamedTemporaryFile(mode="w",
                                                       delete=False,
                                                       dir=".")
        contig_sizes = ["\t".join(map(str, row)) for row in contig_sizes]
        contig_sizes = "\n".join(contig_sizes) + "\n"
        chrom_sizes_file.write(contig_sizes)
        chrom_sizes_filename = chrom_sizes_file.name
        chrom_sizes_file.close()

        outputToBW(plus_wig_name, outname_plus, chrom_sizes_filename)
        outputToBW(minus_wig_name, outname_minus, chrom_sizes_filename)

    # write footer and output benchmark information.
    E.Stop()
Esempio n. 11
0
def main(argv=None):
    """script main.

    Parses the command line (sys.argv unless *argv* is given), builds a
    signal getter from a bed file, bigwig file(s) or a BAM file (checked in
    that order), and writes the result of ``iCLIP.pentamer_enrichment`` for
    the GTF entries read from ``options.stdin`` to ``options.stdout`` as a
    tab-separated table.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(usage=globals()["__doc__"],
                            version="%prog version: $Id$")

    # input signal
    parser.add_option("-b", "--bam-file", dest="bam", type="string",
                      help="BAM file with iCLIP reads")
    parser.add_option("-f", "--fasta-file", dest="fasta", type="string",
                      help="CGAT indexed Fasta file with genome sequence")

    # analysis parameters
    parser.add_option("-k", "--kmer", dest="kmer", type="int", default=5,
                      help="Size of kmer to test default=[%default]")
    parser.add_option("-s", "--spread", dest="spread", type="int",
                      default=15,
                      help="Amount of sequence around each read to consider"
                      "default=[%default]")
    parser.add_option("-n", "--num-randomizations", dest="randomisations",
                      type="int", default=100,
                      help="Number of times to permute profiles to assess"
                      "significance of enrichment")
    parser.add_option("-p", "--processes", dest="proc", type="int",
                      default=None,
                      help="Use this many processesors for multiprocessing")

    # alternative signal sources
    parser.add_option("--bed", dest="bedfile", type="string", default=None,
                      help="Use signal from bedfile rather than BAM. File must"
                      "be compressed with bgzip and indexed with tabix")
    parser.add_option("-w", "--bigwig", "--plus-bw", dest="plus_wig",
                      default=None,
                      help="Use signal from bigwig rather than BAM"
                      ", to use stranded sequence pass plus strand to this option"
                      "and minus strand to --minus-bw")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      default=None,
                      help="Use minus signal from this bigwig instead of BAM"
                      "must pass plus signal to -w/--plus-bw")

    # feature handling
    parser.add_option("--feature", dest="feature", type="choice",
                      choices=["transcript", "gene"], default="gene",
                      help="Treat transcripts seperately or merge genes")
    parser.add_option("--use-centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read for XL location")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # A worker pool is only created on request; without one the
    # enrichment call receives pool=None.
    pool = None
    if options.proc:
        try:
            import multiprocessing as mp
            pool = mp.Pool(options.proc)
        except ImportError:
            E.warn("Multiprocessing setup failed."
                   " Falling back to single processor mode")

    # Signal source precedence: bed file, then bigwig, then BAM.
    if options.bedfile:
        getter = iCLIP.make_getter(bedfile=options.bedfile)
    elif options.plus_wig:
        getter = iCLIP.make_getter(plus_wig=options.plus_wig,
                                   minus_wig=options.minus_wig)
    else:
        alignments = pysam.AlignmentFile(options.bam)
        getter = iCLIP.make_getter(bamfile=alignments,
                                   centre=options.centre)

    genome = IndexedFasta(options.fasta)
    genome.setConverter(getConverter("zero-both-open"))

    # Group the GTF stream either into flattened genes or transcripts.
    if options.feature == "gene":
        group_entries = GTF.flat_gene_iterator
    else:
        group_entries = GTF.transcript_iterator
    features = group_entries(GTF.iterator(options.stdin))

    enrichment = iCLIP.pentamer_enrichment(features,
                                           getter,
                                           genome,
                                           options.kmer,
                                           options.randomisations,
                                           spread=options.spread,
                                           pool=pool)

    enrichment.name = "Z"
    enrichment.to_csv(options.stdout, header=True, index_label="Kmer",
                      sep="\t")

    # write footer and output benchmark information.
    E.Stop()
#!/usr/bin/env python
import iCLIP
import pandas
from CGAT import GTF, IOTools

# Input locations used by this script.
_SIGNAL_BIGWIG = (
    "/fastdata/mbp15jdp/MRC5_polII_ChIP/mp/bwa.dir/MRC5_polII_ChIP-seq.bw")
_GENESET_GTF = "geneset_1000_extension_filtered.gtf.gz"

# Getter built from a single bigWig, passed as the plus-strand track.
my_getter = iCLIP.make_getter(plus_wig=_SIGNAL_BIGWIG)

# Stream the gene set and group its entries into flattened genes.
gtffile = IOTools.openFile(_GENESET_GTF)
gtf_iterator = GTF.iterator(gtffile)
gene_iterator = GTF.flat_gene_iterator(gtf_iterator)


def truncate_exon(exon, start, l, strand):
    """Clip *exon* in place so it reaches at most *l* bases from *start*.

    For strand "+" the exon end is capped at ``start + l``; for any other
    strand the exon start is raised to at least ``start - l``. The same
    (mutated) exon object is returned.
    """
    if strand != "+":
        # Non-plus strand: the window extends towards lower coordinates.
        lower_bound = start - l
        exon.start = max(exon.start, lower_bound)
        return exon

    upper_bound = start + l
    exon.end = min(exon.end, upper_bound)
    return exon


def get_pausing_ratio(transcript,
                      getter,
                      promoter_up_extension=100,
                      promoter_down_extension=300,
                      gb_start=500,
                      gb_down_extension=2000):
    """Beginning of a pausing-ratio calculation for one transcript.

    Only the strand lookup is present in this listing; the rest of the
    body is not shown, so the return value is not documented here.

    NOTE(review): the parameter meanings below are inferred from their
    names and defaults only -- confirm against the full implementation.

    transcript
        Indexable collection whose first element has a ``strand``
        attribute (presumably the exons/GTF entries of one transcript).
    getter
        Not used in the visible code; presumably a signal getter such as
        the one returned by ``iCLIP.make_getter``.
    promoter_up_extension, promoter_down_extension
        Presumably the promoter window size around the start, in bases.
    gb_start, gb_down_extension
        Presumably the gene-body window offset and length, in bases.
    """

    # The strand is taken from the first entry of the transcript.
    strand = transcript[0].strand
Esempio n. 13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Opens the BAM file supplied on stdin (or the file behind
    ``options.stdin``), asks an ``iCLIP`` getter for the per-base depth on
    each strand of every reference sequence and writes the depths in the
    requested format. The first positional argument names the output:

    * ``bed``: a single file with that name;
    * ``bedgraph``: ``<name>_plus.bg`` and ``<name>_minus.bg``;
    * ``bigwig``: the two temporary bedGraph files plus a chromosome
      sizes file are handed to ``outputToBW``.

    Minus strand depths are multiplied by -1 before output. With ``--cpm``
    all depths are scaled by 1e6 / (number of mapped reads).

    Raises
    ------
    ValueError
        If no output name is given, or ``--cpm`` is requested for a BAM
        file whose index reports no mapped reads.
    """

    if argv is None:
        argv = sys.argv

    # optparse only accepts a list or tuple for ``choices``; materialise
    # the keys so a Python 3 dict view is not rejected.
    profiles = list(iCLIP.getters.profiles.keys())
    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--profile", dest="profile", type="choice",
                      choices=profiles,
                      default="iclip",
                      help="Experiment profile to use. Sets various things"
                      "about obtaining 1-bp position from read. Options are"
                      " %s" % ", ".join(profiles))
    parser.add_option("-c", "--use-centre", dest="centre", action="store_true",
                      default=None,
                      help="Use centre of read rather than frist base."
                      "Overrides profile")
    parser.add_option("-f", "--format", dest="format",
                      choices=["bigWig", "bigwig", "BigWig",
                               "bedGraph", "bg", "bedgraph",
                               "bed", "Bed", "BED"],
                      help="Output format. Either bigWig (2 files, + and - strand)"
                           ", bedGraph (2 files), or bed (1 file, depth in column 5,"
                           "strand in column 6",
                      default="bigWig")
    parser.add_option("-w", "--wig", dest="output_wig", action="store_true",
                      default=False,
                      help="Write output to bedgraph file rather than bigwig")
    parser.add_option("--dtype", dest="dtype", type="string",
                      default="uint32",
                      help="dtype for storing depths")
    parser.add_option("--cpm", dest="cpm", action="store_true",
                      default=False,
                      help="Normalize output depths to number of mapped reads (in millions) in BAM")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Fail early with a clear message instead of an IndexError on args[0]
    # after the whole BAM file has been processed.
    if not args:
        raise ValueError(
            "Please supply the output file name/prefix as first argument")

    # Normalise the format aliases to "bigwig", "bedgraph" or "bed".
    options.format = options.format.lower()
    if options.format == "bg":
        options.format = "bedgraph"

    profile = iCLIP.getters.profiles[options.profile]

    # -c is a store_true flag defaulting to None, so "not None" means the
    # user asked for it; otherwise the profile decides.
    if options.centre is not None:
        centre = True
    else:
        centre = profile.centre

    if options.stdin == sys.stdin:
        in_bam = pysam.Samfile("-", "rb")
    else:
        # Re-open by name so that pysam owns the file handle.
        fn = options.stdin.name
        options.stdin.close()
        in_bam = pysam.Samfile(fn, "rb")

    getter = iCLIP.make_getter(in_bam, profile=profile, centre=centre)

    if options.cpm:
        mapped_reads = sum(contig.mapped
                           for contig in in_bam.get_index_statistics())
        if mapped_reads == 0:
            raise ValueError(
                "Cannot normalise to CPM: no mapped reads reported for BAM")

        scale_factor = 1000000.0 / mapped_reads

    if options.format == "bed":
        bedfile = IOTools.openFile(args[0], "w")
    else:
        plus_wig = tempfile.NamedTemporaryFile(delete=False)
        minus_wig = tempfile.NamedTemporaryFile(delete=False)

    contig_sizes = []

    for chrom, chrom_length in zip(in_bam.references, in_bam.lengths):

        # get depths over chromosome
        pos_depth, neg_depth, _ = getter(chrom, strand="both",
                                         dtype=options.dtype)
        # Drop the unsorted series as soon as the sorted copy exists to
        # keep peak memory down.
        pos_depth_sorted = pos_depth.sort_index()
        del pos_depth
        neg_depth_sorted = neg_depth.sort_index()
        del neg_depth
        neg_depth_sorted = -1 * neg_depth_sorted

        # Scale exactly once; the unsorted series no longer exist here.
        if options.cpm:
            pos_depth_sorted = pos_depth_sorted * scale_factor
            neg_depth_sorted = neg_depth_sorted * scale_factor

        # output to temporary wig file
        if options.format == "bed":
            output2Bed(pos_depth_sorted, neg_depth_sorted, chrom, bedfile)
        else:
            outputToBG(pos_depth_sorted, chrom, chrom_length, plus_wig)
            outputToBG(neg_depth_sorted, chrom, chrom_length, minus_wig)

        contig_sizes.append([chrom, chrom_length])

        del pos_depth_sorted
        del neg_depth_sorted

    if options.format == "bed":
        bedfile.close()
    else:
        plus_wig_name = plus_wig.name
        minus_wig_name = minus_wig.name
        plus_wig.close()
        minus_wig.close()

    outname_plus = args[0] + "_plus"
    outname_minus = args[0] + "_minus"

    if options.format == "bedgraph":
        E.debug("Outputting to bedGraph")
        shutil.move(plus_wig_name, outname_plus + ".bg")
        shutil.move(minus_wig_name, outname_minus + ".bg")

    elif options.format == "bigwig":
        # Text mode: the sizes table is written as str, which the default
        # binary mode ("w+b") rejects on Python 3.
        chrom_sizes_file = tempfile.NamedTemporaryFile(mode="w", delete=False,
                                                       dir=".")
        contig_sizes = ["\t".join(map(str, row)) for row in contig_sizes]
        contig_sizes = "\n".join(contig_sizes) + "\n"
        chrom_sizes_file.write(contig_sizes)
        chrom_sizes_filename = chrom_sizes_file.name
        chrom_sizes_file.close()

        outputToBW(plus_wig_name, outname_plus, chrom_sizes_filename)
        outputToBW(minus_wig_name, outname_minus, chrom_sizes_filename)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Parses the command line (sys.argv unless *argv* is given), builds a
    signal getter from a BAM file, bigwig file(s) or a bed file (checked in
    that order) and runs ``iCLIP.get_crosslink_fdr_by_randomisation`` over
    the GTF entries read from ``options.stdin``. Bases whose FDR is at or
    below the threshold are written to ``options.stdout`` as headerless,
    tab-separated rows of contig, start, end, -log10(FDR), depth and strand.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(usage=globals()["__doc__"],
                            version="%prog version: $Id$")

    # signal sources
    parser.add_option("-b", "--bam-file",
                      dest="bam", type="string", default=None,
                      help="BAM file containing iCLIP reads")
    parser.add_option("-w", "--wig", "--plus-wig",
                      dest="plus_wig", type="string", default=None,
                      help="BigWig file containing signal for already processed sample")
    parser.add_option("--minus-wig",
                      dest="minus_wig", type="string", default=None,
                      help="BigWig file containing signal for sample on minus strand")
    parser.add_option("--bed",
                      dest="bedfile", type="string",
                      help="Bed file containing signal for sample")

    # analysis parameters
    parser.add_option("-s", "--spread",
                      dest="spread", type="int", default=15,
                      help="Number of bases each site of each bases"
                           "to use when calculating height")
    parser.add_option("-r", "--randomisations",
                      dest="rands", type="int", default=100,
                      help="Number of randomisations to use when"
                           "calculating FDR")
    parser.add_option("-t", "--threshold",
                      dest="threshold", type="float", default=0.05,
                      help="FDR threshold on which to select bases")
    parser.add_option("-f", "--feature",
                      dest="feature", type="choice",
                      choices=["transcript", "gene"], default="gene",
                      help="GTF feature to use. Gene or transcript")
    parser.add_option("-p", "--processes",
                      dest="proc", type="int", default=None,
                      help="Number of processes to use for multiprocessing")
    parser.add_option("-c", "--centre",
                      dest="centre", action="store_true", default=False,
                      help="Use centre of read instead of -1 when no"
                      "mutation is present")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Worker pool: only when a process count was given explicitly.
    pool = None
    if options.proc is None:
        E.debug("Operating in single processor mode")
    else:
        try:
            import multiprocessing
            pool = multiprocessing.Pool(options.proc)
            E.debug("Operating in multiprocessing mode")
        except ImportError:
            E.warn("Failed to setup multiprocessing, using single processor")
            pool = None

    # Choose how the GTF stream is grouped into features.
    if options.feature == "gene":
        group_entries = GTF.flat_gene_iterator
    elif options.feature == "transcript":
        group_entries = GTF.transcript_iterator
    else:
        raise ValueError("Unknown feature type %s" % options.feature)
    iterator = group_entries(GTF.iterator(options.stdin))

    # Signal source precedence: BAM, then bigwig, then bed.
    if options.bam:
        getter = iCLIP.make_getter(options.bam, centre=options.centre)
    elif options.plus_wig:
        getter = iCLIP.make_getter(plus_wig=options.plus_wig,
                                   minus_wig=options.minus_wig)
    elif options.bedfile:
        getter = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        E.error("Please specifiy one of bam file, bed file or wig file")
        sys.exit(1)

    fdrs = iCLIP.get_crosslink_fdr_by_randomisation(
        iterator, getter, options.rands, options.spread, pool)

    # Keep significant bases and flatten the index into columns.
    sites = fdrs[fdrs.fdr <= options.threshold].reset_index()
    sites.columns = ["contig", "start", "FDR", "depth", "strand"]

    # A base may carry more than one value: order so the lowest FDR
    # (ties broken by highest depth) comes first, keep that row only,
    # then restore positional order.
    position = ["contig", "start", "strand"]
    sites = (sites
             .sort_values(by=["FDR", "depth"], ascending=[True, False])
             .drop_duplicates(position)
             .sort_values(position))

    # Single-base intervals, with the FDR reported as -log10.
    sites["start"] = sites["start"].astype("int")
    sites["end"] = sites["start"] + 1
    output_columns = ["contig", "start", "end", "FDR", "depth", "strand"]
    sites = sites.loc[:, output_columns]
    sites["FDR"] = -numpy.log10(sites["FDR"])
    sites.to_csv(options.stdout, header=False, index=False, sep="\t")

    # write footer and output benchmark information.
    E.Stop()