def _get_counts_and_sequence(gtf_iterator, bam, fasta,
                             separate_UTRs=False):
    '''Called by pentamer_enrichment. This function will return an iterator
    that yields tuples of profiles across transcripts or introns and the
    sequence for which the profile is determined'''

    for transcript in gtf_iterator:

        E.debug("Counting transcript %s" % transcript[0].transcript_id)
        contig, strand = transcript[0].contig, transcript[0].strand

        # exons
        exons = GTF.asRanges(transcript, "exon")
        sequence = "".join(fasta.getSequence(contig, strand, exon[0], exon[1])
                           for exon in exons)
        exon_counts = count_transcript(transcript, bam)
        yield (exon_counts, sequence)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals, contig, strand)

        if intron_counts.sum() == 0:
            continue

        for intron in intron_intervals:
            
            seq = fasta.getSequence(contig, strand, intron[0], intron[1])
            # pandas .loc slicing is inclusive, but intron intervals are
            # half-open, so stop one base short of the interval end
            profile = intron_counts.loc[float(intron[0]):float(intron[1] - 1)]
            profile.index = profile.index - intron[0]
            yield (profile, seq)
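
# A minimal usage sketch (assumed, not from the original source): the
# generator takes a transcript iterator, an open BAM file and an indexed
# genome, and yields (profile, sequence) pairs. File names are hypothetical.
#
#     bam = pysam.AlignmentFile("clip.bam")
#     fasta = IndexedFasta.IndexedFasta("genome")
#     gtf_it = GTF.transcript_iterator(
#         GTF.iterator(IOTools.openFile("genes.gtf.gz")))
#     for profile, sequence in _get_counts_and_sequence(gtf_it, bam, fasta):
#         process(profile, sequence)  # hypothetical downstream step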
def last_exon_transcript(gff_file):
    '''Yield the 3'-most exon of each transcript in a gff file.'''
    for transcript in GTF.transcript_iterator(GTF.iterator(gff_file)):
        transcript = sorted(transcript)
        if transcript[0].strand == "-":
            yield transcript[0]
        else:
            yield transcript[-1]
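
# Hypothetical usage sketch: iterate over the 3'-most exon of every
# transcript in a gene set (file name is an assumption).
#
#     for exon in last_exon_transcript(IOTools.openFile("genes.gtf.gz")):
#         print exon.transcript_id, exon.start, exon.end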
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id, 
            transcript[0].gene_id, 
            transcript[0].source))
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA transcripts
    from the lincrna.gtf.gz
    '''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write(
                    "\t".join(map(str, [exon.contig, exon.source, exon.feature,
                                        exon.start, exon.end, ".", exon.strand,
                                        "."])) +
                    "\t" + exon.attributes + "\n")

        elif len(entry) == 1:
            for exon in entry:
                single.write(
                    "\t".join(map(str, [exon.contig, exon.source, exon.feature,
                                        exon.start, exon.end, ".",
                                        exon.strand, "."])) +
                    "\t" + exon.attributes + "\n")

    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def buildCodingGeneSet(abinitio_coding, reference, outfile):
    '''
    takes the output from cuffcompare of a transcript
    assembly and filters for annotated protein coding
    genes. 
    
    NB "pruned" refers to nomenclature in the transcript
    building pipeline - transcripts that appear in at least
    two samples.
    
    Because an abinitio assembly will often contain
    fragments of known transcripts and describe them as 
    novel, the default behaviour is to produce a set that
    is composed of 'complete' transcripts
    '''
    inf = IOTools.openFile(abinitio_coding)
    outf = gzip.open(outfile, "w")

    coding = {}
    coding["protein_coding"] = GTF.readAndIndex(
        GTF.iterator_filtered(GTF.iterator(IOTools.openFile(reference)),
                              source="protein_coding"),
        with_value=False)

    for gtf in GTF.iterator(inf):
        if coding["protein_coding"].contains(gtf.contig, gtf.start, gtf.end):
            if gtf.class_code == "=":
                outf.write("%s\n" % str(gtf))
    outf.close()
def UTR3(transcript):
    '''Return GTF entries covering the 3' UTR exons of a transcript.'''

    exons = GTF.asRanges(transcript, "exon")
    cds = GTF.asRanges(transcript, "CDS")

    if len(cds) == 0:
        return list()
    
    utrs = Intervals.truncate(exons, cds)

    if transcript[0].strand == "+":
        utr3 = [exon for exon in utrs
                if exon[0] >= cds[-1][1]]
    else:
        utr3 = [exon for exon in utrs
                if exon[1] <= cds[0][0]]

    for e in transcript:
        if e.feature == "exon":
            template_exon = e
            break
            
    returned_exons = []     
    for e in utr3:
        gtf = GTF.Entry().fromGTF(template_exon)
        gtf.start = e[0]
        gtf.end = e[1]
        returned_exons.append(gtf)
        
    return returned_exons
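
# Hypothetical usage sketch: write the 3' UTR exons of every transcript in
# a gene set back out as GTF (file names are assumptions).
#
#     with IOTools.openFile("utr3.gtf", "w") as outf:
#         for transcript in GTF.transcript_iterator(
#                 GTF.iterator(IOTools.openFile("genes.gtf.gz"))):
#             for gtf in UTR3(transcript):
#                 outf.write(str(gtf) + "\n")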
def cropGFF(gffs, options):
    """crop intervals in gff file."""

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    other_gffs = GTF.iterator(IOTools.openFile(options.crop, "r"))
    cropper = GTF.readAsIntervals(other_gffs)
    ntotal = 0
    for contig in cropper.keys():
        intersector = bx.intervals.intersection.Intersecter()
        for start, end in cropper[contig]:
            intersector.add_interval(bx.intervals.Interval(start, end))
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:
            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(start, end)

            if overlaps:
                l = end - start
                a = numpy.ones(l)
                for i in overlaps:
                    s = max(0, i.start - start)
                    e = min(l, i.end - start)
                    a[s:e] = 0

                segments = Intervals.fromArray(a)

                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    options.stdout.write("%s\n" % gff)

                continue

        noutput += 1
        options.stdout.write("%s\n" % gff)

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i\n" % (
            ninput, noutput, ncropped, ndeleted))
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", type="choice",
                      choices=["bed", "gtf", "gff"],
                      default="bed",
                      dest="format",
                      help="Format of the cleavage site definition"
                           " default [%default]")
    parser.add_option("--feature", type="choice",
                      choices=["gene", "transcript", "entry"],
                      default="gene",
                      dest="feature",
                      help="Which feature to use if using gtf")
    parser.add_option("-w", "--window-size", type="int",
                      default=50,
                      dest="window_size",
                      help="Number of bases to count upstream and downstream"
                           " of cleavage site. [%default]")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    try:
        bamfile = pysam.AlignmentFile(args[0])
    except IndexError:
        E.error("Please supply a bam file as the first positional arguement")
        return 1
    except IOError:
        E.error("Cannot open BAM file %s" % args[0])
        return 1

    interval_iterators = {"bed": Bed.iterator,
                          "gtf-gene": lambda x:
                          GTF.merged_gene_iterator(GTF.iterator(x)),
                          "gtf-transcript": last_exon_transcript}

    if options.format == "gtf":
        options.format += "-" + options.feature

    iterator = interval_iterators[options.format](options.stdin)

    pi = iCLIP.processing_index(iterator, bamfile, options.window_size)

    options.stdout.write("Processing Index\t%s\t%s\n" % (args[0], pi))

    # write footer and output benchmark information.
    E.Stop()
def calculateSplicingIndex(bamfile, gtffile, outfile):

    bamfile = pysam.AlignmentFile(bamfile)

    counts = E.Counter()

    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(gtffile))):

        introns = GTF.toIntronIntervals(transcript)
        E.debug("Gene %s (%s), Transcript: %s, %i introns" %
                (transcript[0].gene_id,
                 transcript[0].contig,
                 transcript[0].transcript_id,
                 len(introns)))

        for intron in introns:
            reads = bamfile.fetch(
                reference=transcript[0].contig,
                start=intron[0], end=intron[1])
            
            for read in reads:
                if 'N' in read.cigarstring:
                    blocks = read.get_blocks()
                    starts, ends = zip(*blocks)
                    if intron[0] in ends and intron[1] in starts:
                        counts["Exon_Exon"] += 1
                    else:
                        counts["spliced_uncounted"] += 1
                elif (read.reference_start <= intron[0] - 3
                      and read.reference_end >= intron[0] + 3):
                    if transcript[0].strand == "+":
                        counts["Exon_Intron"] += 1
                    else:
                        counts["Intron_Exon"] += 1
                elif (read.reference_start <= intron[1] - 3
                      and read.reference_end >= intron[1] + 3):
                    if transcript[0].strand == "+":
                        counts["Intron_Exon"] += 1
                    else:
                        counts["Exon_Intron"] += 1
                else:
                    counts["unspliced_uncounted"] += 1

        E.debug("Done, counts are: " + str(counts))
    header = ["Exon_Exon",
              "Exon_Intron",
              "Intron_Exon",
              "spliced_uncounted",
              "unspliced_uncounted"]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(header)+"\n")
        outf.write("\t".join(map(str, [counts[col] for col in header]))
                   + "\n")
    def __call__(self, track, slice=None):

        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict((("transcript", np.mean(c_transcript)), ("gene", np.mean(c_gene))))
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (options, args) = E.Start(parser, add_output_options=True)

    if len(args) == 0:
        files = [options.stdin]
    else:
        files = args

    options.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if options.is_gtf:
        options.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    options.stdout.write("\n")

    for f in files:
        if f == options.stdin:
            infile = f
            options.stdout.write("stdin")
        else:
            infile = IOTools.openFile(f)
            options.stdout.write(f)

        counters = []
        if options.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        # exhaust the iterator chain so that every counter sees all records
        c = counters[-1]
        for x in c:
            pass

        for c in counters:
            options.stdout.write("\t%s" % str(c))
        options.stdout.write("\n")

        if infile is not options.stdin:
            infile.close()

    E.Stop()
def summariseExonCountsAndLengthOfMultiExonicLincRNA(infile, outfile):
    '''
    summarizes some basic statistics on the length and number of exons 
    for each set of parameter values
    '''
    outf = open(outfile, "w")
    outf.write("transcript_id\tno_exons\ttranscriptlength\n")
    inf = GTF.iterator(IOTools.openFile(infile))
    for gtfs in GTF.transcript_iterator(inf):
        length = sum([x.end - x.start for x in gtfs])
        outf.write("\t".join((gtfs[0].transcript_id,
                              str(len(gtfs)), str(length))) + "\n")
    outf.close()
def bases_to_windows(pvalues, gene, window_size, threshold):

    contig = gene[0][0].contig
    strand = gene[0][0].strand
    gene_id = gene[0][0].gene_id
    try:
        gene_pvals = pvalues[gene_id][contig][strand]
    except KeyError:
        E.info("No Significant CLIP sites in gene %s" %
               (gene[0][0].gene_id))
        return []

    gene_pvals = gene_pvals.sort_index()
    outlist = []

    for transcript in gene:

        coords_converter = iCLIP.TranscriptCoordInterconverter(transcript)

        # first exons
        exons = GTF.asRanges(transcript, "exon")
       
        # pandas indexing is inclusive, but our exon intervals are
        # half closed
        exons_pvals_list = [gene_pvals.ix[float(start):float(end-1)]
                            for start, end in exons]

        exons_pvals = pd.concat(exons_pvals_list)

        exons_pvals.index = coords_converter.genome2transcript(
            exons_pvals.index.values)

        windows = get_windows(exons_pvals, window_size, threshold)
        windows = [(coords_converter.transcript_interval2genome_intervals(
            window), p) for window, p in windows]

        # now for introns
        for intron in GTF.toIntronIntervals(transcript):
            intron_pvals = gene_pvals.ix[float(intron[0]):float(intron[1]-1)]
            intron_windows = get_windows(intron_pvals, window_size, threshold)
            intron_windows = [((max(intron[0], start), min(intron[1], end)), p)
                              for (start, end), p in intron_windows]
            windows.extend([([window], p) for window, p in intron_windows])
        
        try:
            outlist.extend(
                [windows2bed12(window, contig, transcript[0].strand,
                               "%s_%s" % (transcript[0].transcript_id, n),
                               score=p)
                 for n, (window, p) in enumerate(windows)])
        except Exception:
            print [x for x in enumerate(windows)]
            raise

    return outlist

def removeFirstAndLastExon(infile, outfile):
    '''Write every exon of each transcript except the first and the last.'''

    transcripts = GTF.transcript_iterator(
        GTF.iterator(IOTools.openFile(infile)))
    outfile = IOTools.openFile(outfile, "w")

    for transcript in transcripts:

        for exon in transcript[1:-1]:
            outfile.write(str(exon) + "\n")

    outfile.close()
def buildFinalLncRNAGeneSet(filteredLncRNAGeneSet, cpc_table, outfile,
                            filter_cpc=None):
    '''
    filters lncRNA set based on the coding potential as output from 
    the CPC
    '''
    
    if filter_cpc:
           
        # get the transcripts that are designated as coding
        coding_set = set()
        dbh = sqlite3.connect("csvdb")
        cc = dbh.cursor()
        for transcript_id in cc.execute("SELECT transcript_id from %s WHERE CP_score > 1" % cpc_table):
            coding_set.add(transcript_id[0])

        remove = set()
        outf_coding = gzip.open("gtfs/cpc_removed.gtf.gz", "w")
        for gtf in GTF.iterator(IOTools.openFile(filteredLncRNAGeneSet)):
            if gtf.transcript_id in coding_set:
                remove.add(gtf.gene_id)
                outf_coding.write("%s\n" % gtf)
        outf_coding.close()
    else:
        # create empty set
        remove = set()
    
    # get temporary file for built lncrna
    temp = P.getTempFile(dir=".")
    
    # get temporary file for known lncrna
    temp2 = P.getTempFile(dir=".")
        
    for gtf in GTF.iterator(IOTools.openFile(filteredLncRNAGeneSet)):
        if gtf.gene_id in remove: continue
        if gtf.transcript_id.find("TCONS") != -1:
            # output known and built transcripts separately
            temp.write("%s\n" % gtf)
        else:
            temp2.write("%s\n" % gtf)
    temp.close()
    temp2.close()

    filename = temp.name
    filename2 = temp2.name
    statement = '''cat %(filename)s | python %(scriptsdir)s/gtf2gtf.py --sort=gene | 
                     python %(scriptsdir)s/gtf2gtf.py --renumber-genes=NONCO%%i 
                    --log=%(outfile)s.log | python %(scriptsdir)s/gtf2gtf.py 
                    --sort=gene --log=%(outfile)s.log > temp.gtf'''
    P.run()
    # recombine all transcripts with new ids
    statement = ('''cat %(filename2)s temp.gtf | python %(scriptsdir)s/gtf2gtf.py
                 --sort=contig+gene --log=%(outfile)s.log | gzip > %(outfile)s''')
    P.run()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-i", "--ignore-missing", dest="ignore_missing", action="store_true",
                      help="Ignore transcripts on contigs that are not in the genome-file [default=%default].")

    parser.add_option("--min-intron-length", dest="min_intron_length", type="int",
                      help="minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.Stop()
def get_matrix(getter, lengths, options):

    if getter is None:
        E.error("No bamfile or wigfile specified")
        print(globals()["__usage__"])
        return 1

    f = IOTools.openFile(options.gtf)
    if options.feature == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(f))
    else:
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(f))

    if options.ds_win is None:
        ds_win = lengths.max()
    else:
        ds_win = options.ds_win

    if options.align_at == "start":
        align_at = 0
        us_win, ds_win = options.us_win, ds_win
    elif options.align_at == "end":
        align_at = lengths
        ds_win, us_win = options.us_win, ds_win
            
    if options.rstrand:
        def _it_reverse(gtf):
            for transcript in gtf:
                if transcript[0].strand == "+":
                    transcript[0].strand = "-"
                else:
                    transcript[0].strand = "+"
                yield transcript

        gtf_iterator = _it_reverse(gtf_iterator)
        ds_win, us_win = us_win, ds_win
        align_at = lengths

    raw_matrix = iCLIP.get_binding_matrix(getter, gtf_iterator,
                                          align_at=align_at,
                                          bin_size=options.bin_size,
                                          left_margin=us_win,
                                          right_margin=ds_win)
    if options.rstrand:
        
        raw_matrix.columns = -1 * raw_matrix.columns.values
        raw_matrix = raw_matrix.sort_index(axis=1)

    return raw_matrix
def countMultiAndSingleExonLincRna(infile, outfile):
    '''
    outputs the transcript and gene counts for lincRNA transcripts
    '''
    outf = open(outfile, "w")
    outf.write("no_multi_exon_transcripts\tno_single_exon_transcripts\tproportion_single\n")
    inf = GTF.iterator(IOTools.openFile(infile))
    c_multi = 0
    c_single = 0
    for gtfs in GTF.transcript_iterator(inf):
        if len(gtfs) > 1:
            c_multi += 1
        elif len(gtfs) == 1:
            c_single += 1
    outf.write("\t".join(map(str, [c_multi, c_single,
                                   float(c_single) / (c_multi + c_single)])))
    outf.close()
def buildRefcodingGeneSet(coding_set, refcoding_set, outfile):
    '''
    takes genes from an ab initio assembly and filters a reference coding set
    for these genes. Allows for comparisons of known transcripts for those genes
    that are assembled ab initio. Does this by gene name
    '''
    keep_genes = set()
    for gtf in GTF.iterator(IOTools.openFile(coding_set)):
        keep_genes.add(gtf.gene_name)

    outf = gzip.open(outfile, "w")
    for gtf in GTF.iterator(IOTools.openFile(refcoding_set)):
        if gtf.gene_name in keep_genes:
            outf.write("%s\n" % gtf)
    outf.close()
    def __call__(self, track, slice=None):

        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(
                    GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            return np.mean(lengths_transcripts)

        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(
                    GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            return np.mean(lengths_genes)
def buildJunctions(infile, outfile):
    '''build file with splice junctions from gtf file.

    Identify the splice junctions from a gene set :term:`gtf`
    file. A junctions file is a better option than supplying a GTF
    file, as parsing the latter often fails. See:

    http://seqanswers.com/forums/showthread.php?t=7563

    Parameters
    ----------
    infile : str
       Input filename in :term:`gtf` format
    outfile: str
       Output filename

    '''

    outf = IOTools.openFile(outfile, "w")
    njunctions = 0
    for gffs in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(infile, "r"))):

        gffs.sort(key=lambda x: x.start)
        end = gffs[0].end
        for gff in gffs[1:]:
            # subtract one: these are not open/closed coordinates but
            # the 0-based coordinates
            # of first and last residue that are to be kept (i.e., within the
            # exon).
            outf.write("%s\t%i\t%i\t%s\n" %
                       (gff.contig, end - 1, gff.start, gff.strand))
            end = gff.end
            njunctions += 1

    outf.close()

    if njunctions == 0:
        E.warn('no junctions found in gene set')
        return
    else:
        E.info('found %i junctions before removing duplicates' % njunctions)

    # make unique
    statement = '''mv %(outfile)s %(outfile)s.tmp;
                   cat < %(outfile)s.tmp | sort | uniq > %(outfile)s;
                   rm -f %(outfile)s.tmp; '''
    P.run()
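
# Illustrative example (not from the source): for a "+" strand transcript
# with exons chr1:100-200 and chr1:300-400 (0-based, half-open), the loop
# writes the single junction line "chr1\t199\t300\t+": 199 is the 0-based
# position of the last base of the upstream exon and 300 the first base of
# the downstream exon, matching the coordinate comment in the code above.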
    def count(self):

        gene_ids = set()
        for transcript in GTF.transcript_iterator(self.gtffile):
            if len(transcript) == 1:
                gene_ids.add(transcript[0].gene_id)
        return len(gene_ids)
    def _iterator(iterator):
        """yield gene plus the locations of the end of the previous gene and
        start of next gene"""

        last_end, prev_end = 0, 0
        last_contig = None
        last = None
        for matches in GTF.iterator_overlaps(iterator):

            this_start = min([x.start for x in matches])
            this_end = max([x.end for x in matches])

            if method == "tss":
                # restrict to tss
                if matches[0].strand == "+":
                    this_end = this_start + 1
                else:
                    this_start = this_end - 1

            this_contig = matches[0].contig

            if last_contig != this_contig:
                if last:
                    yield prev_end, last, fasta.getLength(last_contig)
                last_end, prev_end = 0, 0
            else:
                yield prev_end, last, this_start

            prev_end = last_end
            last_end = this_end
            last = matches
            last_contig = this_contig

        if last:
            yield prev_end, last, fasta.getLength(last_contig)
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py 
                        --sort=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py 
                           --renumber-genes=%(gene_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --renumber-transcripts=%(transcript_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --sort=gene 
                           --log=%(outfile)s.log
                          | gzip > %(outfile)s'''
        # the first gtf entry is enough to decide which statement applies
        break

    P.run()
    def _count(self, filename, idx):

        overlapping_genes = set()
        genes = set()
        # iterate over exons
        infile = IOTools.openFile(filename, "r")
        it = GTF.iterator(infile)

        nexons, nexons_overlapping = 0, 0
        nbases, nbases_overlapping = 0, 0
        for this in it:
            nexons += 1
            nbases += this.end - this.start
            genes.add(this.gene_id)

            try:
                intervals = list(idx[this.contig].find(this.start, this.end))
            except KeyError:
                continue

            if len(intervals) == 0:
                continue

            overlapping_genes.add(this.gene_id)
            nexons_overlapping += 1
            start, end = this.start, this.end
            counts = numpy.zeros(end - start, numpy.int)
            for other_start, other_end, other_value in intervals:
                for x in range(max(start, other_start) - start, min(end, other_end) - start):
                    counts[x] += 1
            nbases_overlapping += sum([1 for x in counts if x > 0])

        infile.close()

        return (len(genes), len(overlapping_genes), nexons,
                nexons_overlapping, nbases, nbases_overlapping)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--min-chunk-size", dest="min_chunk_size", type="int",
        help="minimum chunk size [default=%default].")

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="do not create any files [default=%default].")

    parser.set_defaults(
        method="overlap",
        dry_run=False,
        min_chunk_size=2,
        output_filename_pattern="%06i.chunk",
    )

    (options, args) = E.Start(parser, add_output_options=True)

    gffs = GTF.iterator(options.stdin)

    ninput, noutput, nchunks = 0, 0, 0

    outputChunk = OutputChunk(options.output_filename_pattern,
                              dry_run=options.dry_run)

    if options.method == "overlap":

        last_contig, last_to = None, 0
        chunk = []
        for gff in gffs:
            ninput += 1
            if len(chunk) >= options.min_chunk_size and \
                    (gff.contig != last_contig or
                     gff.start > last_to):
                noutput += outputChunk(chunk)
                nchunks += 1
                chunk = []
                last_contig, last_to = gff.contig, gff.end

            chunk.append(gff)
            last_to = max(gff.end, last_to)

        noutput += outputChunk(chunk)
        nchunks += 1

    E.info("ninput=%i, noutput=%i, nchunks=%i" % (ninput, noutput, nchunks))

    E.Stop()
    def _run(self, filename, idx):

        # iterate over exons
        infile = IOTools.openFile(filename, "r")
        it = GTF.iterator(infile)

        keys = set()

        for this in it:

            try:
                intervals = idx[this.contig].find(this.start, this.end)
            except KeyError:
                continue

            if len(intervals) == 0:
                continue

            for i in intervals:
                key = "%s-%s" % (this.gene_id, i.value.gene_id)
                if key not in keys:
                    self.write(this.gene_id, i.value.gene_id)
                    keys.add(key)

        infile.close()
def collectExonIntronSequences(transcripts, fasta):
    '''collect all the wild type sequences for exons and introns

    exons and introns are indexed by their respective positions.

    The function changes coordinates in ``transcripts`` to reverse coordinates.
    '''

    contig = transcripts[0][0].contig
    strand = transcripts[0][0].strand
    lcontig = fasta.getLength(contig)

    all_exons, all_introns = {}, {}
    for exons in transcripts:
        for exon in exons:
            exon.invert(lcontig)
            start, end = exon.start, exon.end
            key = start, end
            if key not in all_exons:
                all_exons[key] = fasta.getSequence(
                    contig, strand, start, end).lower()

        intron_intervals = GTF.toIntronIntervals(exons)
        for start, end in intron_intervals:
            key = start, end
            if key not in all_introns:
                all_introns[key] = fasta.getSequence(
                    contig, strand, start, end).lower()
    return all_exons, all_introns
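
# Hypothetical usage sketch: collect wild-type exon and intron sequences
# for all transcripts of each gene (fasta is an IndexedFasta as elsewhere).
#
#     for gene in GTF.gene_iterator(
#             GTF.iterator(IOTools.openFile("genes.gtf.gz"))):
#         exons, introns = collectExonIntronSequences(gene, fasta)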
    def __call__(self, track, slice=None):

        classes = ["antisense", "antisense_upstream", "antisense_downstream", "sense_upstream",
                   "sense_downstream", "intergenic", "sense_intronic", "antisense_intronic"]

        coding_set = {}
        for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
            coding_set[gtf.transcript_id] = gtf.source

        result = {"noncoding": {}, "coding": collections.defaultdict(int)}
        total_nc = float(self.getValue(
            "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
        for c in classes:
            result["noncoding"][c] = (float(self.getValue("""SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' 
                                                              AND b.C_NC = 'noncoding' 
                                                              AND a.transcript_id = b.transcript_id""" % (track, c))) / total_nc) * 100

        total_c = len(coding_set)
        for c in classes:
            ids = self.getValues(
                "SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
            for i in ids:
                if i in coding_set:
                    if coding_set[i] == c:
                        result["coding"][c] += 1

        for x, y in result["coding"].items():
            result["coding"][x] = (float(y) / total_c) * 100

        return result
def buildRepeatTrack(infile, outfile):
    '''build a repeat track as negative control.'''

    nrepeats = 0
    for gff in GTF.iterator(gzip.open(infile, "r")):
        nrepeats += 1
    sample = set(random.sample(xrange(nrepeats),
                               PARAMS["ancestral_repeats_samplesize"]))

    outf = gzip.open(outfile, "w")
    gtf = GTF.Entry()
    for x, gff in enumerate(GTF.iterator(gzip.open(infile, "r"))):
        if x not in sample:
            continue
        gtf.fromGTF(gff, "%08i" % x, "%08i" % x)
        outf.write("%s\n" % str(gtf))
    outf.close()

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    extend from TSS a basal region.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    upstream, downstream = options.upstream, options.downstream
    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = min([x.start for x in transcript
                          ]), max([x.end for x in transcript])
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    regions.sort(key=lambda x: (x.contig, x.start))

    outf = IOTools.openFile("test.gff", "w")
    for x in regions:
        outf.write(str(x) + "\n")
    outf.close()

    ####################################################################
    # extend basal regions
    regions.sort(key=lambda x: (x.contig, x.start))

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = basal extension of previous group
        # next_start = basal_extension of next group

        # extend region to previous/next group
        # always extend downstream, but extend upstream only if the basal
        # region of an interval does not already overlap another basal
        # region within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
def readSegments(infile,
                 indexed_workspace,
                 truncate=False,
                 format="gtf",
                 keep_ambiguous=False,
                 remove_overhangs=False):
    """read segments from infile.

    segments not overlapping with indexed_workspace are removed.

    If *truncate* is set, segments extending beyond the workspace
    are truncated.

    returns a list of segments for each contig in a dictionary
    """
    counter = E.Counter()

    segments = collections.defaultdict(list)

    def addSegment(contig, start, end, counter):
        if contig in indexed_workspace:
            r = indexed_workspace[contig].find(start, end)
            if not r:
                counter.nskipped += 1
                return
            if len(r) > 1:
                counter.nambiguous += 1
                if not keep_ambiguous:
                    return
            if truncate:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.ntruncated += 1
                    segments[contig].append((rstart, rend))
                    counter.added += 1
            elif remove_overhangs:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.overhangs += 1
                        break
                else:
                    segments[contig].append((start, end))
            else:
                segments[contig].append((start, end))
                counter.added += 1

            counter.nkept += 1

    if format == "gtf":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(infile))

        for gene in gtf_iterator:
            # get start and end ignoring introns
            # contig, start, end = gene[0].contig, min( [x.start for x in gene] ), max( [x.end for x in gene] )

            contig, coords = gene[0].contig, [(x.start, x.end) for x in gene]
            counter.ninput += 1
            for start, end in coords:
                addSegment(contig, start, end, counter)

    elif format == "bed":
        bed_iterator = Bed.iterator(infile)
        for bed in bed_iterator:
            counter.ninput += 1
            addSegment(bed.contig, bed.start, bed.end, counter)

    E.info("read segments: %s" % str(counter))

    return segments
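
# Hypothetical usage sketch: build an indexed workspace with bx-python
# intersectors, then read the segments that fall inside it (file names
# are assumptions).
#
#     indexed_workspace = collections.defaultdict(
#         bx.intervals.intersection.Intersecter)
#     for bed in Bed.iterator(IOTools.openFile("workspace.bed.gz")):
#         indexed_workspace[bed.contig].add_interval(
#             bx.intervals.Interval(bed.start, bed.end))
#     segments = readSegments(IOTools.openFile("genes.gtf.gz"),
#                             indexed_workspace, truncate=True)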
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome (indexed).")

    parser.add_option("-w",
                      "--windows-bed-file",
                      dest="filename_windows",
                      type="string",
                      help="gff file with windows to use.")

    parser.add_option("-d",
                      "--filename-data",
                      dest="filename_data",
                      type="string",
                      help="gff file with data to use.")

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="filename-data is gtf file [default=%default.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="choice",
                      action="append",
                      choices=("GC", ),
                      help="features to compute.")

    parser.add_option("-c",
                      "--decorator",
                      dest="decorator",
                      type="choice",
                      choices=("counts", "gc", "gc3", "mean-length",
                               "median-length", "percent-coverage",
                               "median-score", "mean-score", "stddev-score",
                               "min-score", "max-score"),
                      help="decorators to use.")

    parser.add_option("-e",
                      "--skip-empty",
                      dest="skip_empty",
                      action="store_true",
                      help="skip empty windows.")

    parser.add_option(
        "-t",
        "--transform=",
        dest="transform",
        type="choice",
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (options, args) = E.start(parser)

    #    test_transform_third_codon()

    if not options.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if options.loglevel >= 1:
        options.stdlog.write("# reading windows...")
        options.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(IOTools.open_file(options.filename_windows, "r")))

    if options.loglevel >= 1:
        options.stdlog.write("done\n")
        options.stdlog.flush()

    if options.filename_data:
        if options.loglevel >= 1:
            options.stdlog.write("# reading data...")
            options.stdlog.flush()

        if options.is_gtf:
            gff_data = GTF.readFromFile(
                IOTools.open_file(options.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                IOTools.open_file(options.filename_data, "r"))

        if options.loglevel >= 1:
            options.stdlog.write("done\n")
            options.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        options.transform = "complement"

    map_contig2size = {}

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            map_contig2size[contig] = max([x[1] for x in values])
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    options.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta,
                options)
        else:
            annotateWindows(contig, windows[contig], [], fasta, options)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m",
        "--merge",
        dest="merge",
        action="store_true",
        help="merge adjacent intervals with the same attributes. "
        "[default=%default]")

    parser.add_option(
        "-e",
        "--feature",
        dest="feature",
        type="string",
        help="filter by a feature, for example 'exon', 'CDS'. If "
        "set to the empty string, all entries are output [%default].")

    parser.add_option(
        "-f",
        "--filename-masks",
        dest="filename_masks",
        type="string",
        metavar="gff",
        help="mask sequences with regions given in gff file [%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-length",
        dest="min_length",
        type="int",
        help="set minimum length for sequences output [%default]")

    parser.add_option(
        "--max-length",
        dest="max_length",
        type="int",
        help="set maximum length for sequences output [%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        masker=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(sys.stdin))
    else:
        gffs = GTF.iterator(sys.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GFF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    #    for item in iterator:
    #	print len(item) # 3, 2
    #	for i in item:
    #	   print len(i) # 9, 9, 9, 9, 9
    #	   print i.contig
    #	   print i.strand
    #	   print i.transcript_id

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = filter(lambda x: x.feature == feature, ichunk)
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from %s:%i..%i - %s" %
                   (ichunk[0].contig, ichunk[0].start, ichunk[0].end,
                    str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise "unimplemented"

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write( "# skipped because fully masked: %s: regions=%s masks=%s\n" %\
                                                  (name, str([ (x.start, x.end) for x in chunk ]), masked_regions) )
                    continue

        out = intervals

        if options.extend_at:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        #IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if l < options.min_length or (options.max_length
                                      and l > options.max_length):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write( "# skipped because length out of bounds %s: regions=%s len=%i\n" %\
                                          (name, str(intervals), l) )
            continue

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand, ";".join(["%i-%i" % x
                                             for x in out]), "\n".join(s)))

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, nskipped_masked=%i, nskipped_length=%i" %\
                (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked, nskipped_length ) )

    E.Stop()
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as does ENSEMBL).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicates : bool
       If True, duplicate mappings are removed.

    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.cursor()
        cc.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
    protAcc, chrom, strand, cdsStart, cdsEnd,
    exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
    AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = IOTools.openFile(outfile, "w")

    cc = dbhandle.cursor()
    cc.execute(statement)

    SQLResult = collections.namedtuple(
        'Result', '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)
            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
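            # e.g. UCSC exonFrame 1 (codon phase at the exon start) maps
            # to GTF frame 2 (bases to skip before the next full codon);
            # 0 maps to 0 and 2 maps to 1.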
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
Example #36
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2bed.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input file is gtf [default=%default] ")

    parser.add_option("--name",
                      dest="name",
                      type="choice",
                      help="field to use as the name field [%default]",
                      choices=("gene_id", "transcript_id", "class", "family",
                               "feature", "source", "repName"))

    parser.add_option(
        "--track",
        dest="track",
        type="choice",
        choices=("feature", "source", None),
        help="use feature/source field to define tracks [default=%default] ")

    parser.set_defaults(track=None, name="gene_id", is_gtf=False)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ninput, noutput = 0, 0

    is_gtf, name = options.is_gtf, options.name
    iterator = GTF.iterator(options.stdin)

    if options.track:
        all_input = list(iterator)

        if options.track == "feature":
            grouper = lambda x: x.feature
        elif options.track == "source":
            grouper = lambda x: x.source

        all_input.sort(key=grouper)

        bed = Bed.Bed()
        for key, vals in itertools.groupby(all_input, grouper):
            options.stdout.write("track name=%s\n" % key)
            for gff in vals:
                ninput += 1
                bed.fromGTF(gff, is_gtf=is_gtf, name=name)
                options.stdout.write(str(bed) + "\n")
                noutput += 1

    else:
        bed = Bed.Bed()
        for gff in iterator:
            ninput += 1
            bed.fromGTF(gff, is_gtf=is_gtf, name=name)
            options.stdout.write(str(bed) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.Stop()
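
# A minimal command-line sketch, assuming this script is saved as
# gff2bed.py; the file names are placeholders:
#
#   python gff2bed.py --is-gtf --name=transcript_id < in.gtf > out.bed
#   python gff2bed.py --track=source < in.gff > tracks.bed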
Example #37
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand"),
        help="method to apply [%default]")

    parser.add_option("--ignore-strand",
                      dest="ignore_strand",
                      help="ignore strand information.",
                      action="store_true")

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option("-c",
                      "--contigs-tsv-file",
                      dest="input_filename_contigs",
                      type="string",
                      help="filename with contig lengths.")

    parser.add_option(
        "--agp-file",
        dest="input_filename_agp",
        type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--crop-gff-file",
                      dest="filename_crop_gff",
                      type="string",
                      help="GFF/GTF file to crop against.")

    parser.add_option(
        "--group-field",
        dest="group_field",
        type="string",
        help="""gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].""")

    parser.add_option(
        "--filter-range",
        dest="filter_range",
        type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_option("--sanitize-method",
                      dest="sanitize_method",
                      type="choice",
                      choices=("ucsc", "ensembl", "genome"),
                      help="method to use for sanitizing chromosome names. "
                      "[%default].")

    parser.add_option(
        "--flank-method",
        dest="flank_method",
        type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option("--skip-missing",
                      dest="skip_missing",
                      action="store_true",
                      help="skip entries on missing contigs. Otherwise an "
                      "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern",
        dest="contig_pattern",
        type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report",
        dest="assembly_report",
        type="string",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type="int",
        help="set to 1 if the assembly report contains identifiers that "
        "allow mapping of ensembl to ucsc contigs [%default].")

    parser.add_option(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type="int",
        help="column in the assembly report containing ucsc contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type="int",
        help="column in the assembly report containing ensembl contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-extras",
        dest="assembly_extras",
        type="str",
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome [%default].")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="float",
                      help="extension for upstream end [%default].")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="float",
                      help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance",
        dest="min_distance",
        type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance",
        dest="max_distance",
        type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features",
        dest="min_features",
        type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features",
        dest="max_features",
        type="int",
        help="maximum number of features to merge/join [%default].")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (options, args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.open_file(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.assembly_report:
        df = pd.read_csv(options.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if options.assembly_report_hasIDs == 1:
            ucsccol = options.assembly_report_ucsccol
            ensemblcol = options.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if options.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif options.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(
                    "when using an assembly report, please specify "
                    "--sanitize-method as either 'ucsc' or 'ensembl' "
                    "to set the direction of conversion")
        else:
            assembly_dict = {}
        if options.assembly_extras is not None:
            assembly_extras = options.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if options.method in ("forward_coordinates", "forward_strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.open_file(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank", "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=options.group_field)

        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=options.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
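        # Accepted forms, matching the regexes below, e.g.:
        #   chr1:+:1000..2000   (contig, strand, interval)
        #   chr1:1000-2000      (contig, interval)
        #   1000,2000           (interval only)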
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)", options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match(r"(\d+)(\.\.|,|-)(\d+)",
                                           options.filter_range).groups()
            except AttributeError:
                raise ValueError("cannot parse range %s" %
                                 options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict:
                id = assembly_dict[id]
            # if not in the dict, the contig name is forced into the
            # desired convention; this is helpful for user-modified gff
            # files that contain additional contigs
            elif options.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif options.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id
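        # e.g. with --sanitize-method=ucsc, "1" maps to "chr1" (via the
        # assembly report when listed, otherwise by forcing the "chr"
        # prefix); with --sanitize-method=ensembl, "chr1" maps to "1".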

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if options.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [
                    re.compile(x) for x in options.contig_pattern.split(",")
                ]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(
                       outofrange_contigs.keys())), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())), str(filtered_contigs)))

    else:

        for gff in gffs:

            if options.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.stop()
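
# A minimal command-line sketch, assuming this script is saved as
# gff2gff.py; the file names are placeholders:
#
#   python gff2gff.py --method=sanitize --sanitize-method=ucsc \
#       --assembly-report=assembly_report.txt < in.gff > out.gff
#   python gff2gff.py --method=merge-features --max-distance=100 \
#       < in.gff > merged.gff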
Example #38
def cropGFF(gffs, filename_gff):
    """crop intervals in gff file."""

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    other_gffs = GTF.iterator(IOTools.open_file(filename_gff, "r"))

    cropper = GTF.readAsIntervals(other_gffs)

    ntotal = 0
    for contig in list(cropper.keys()):
        intersector = bx.intervals.intersection.Intersecter()
        for start, end in cropper[contig]:
            intersector.add_interval(bx.intervals.Interval(start, end))
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(start, end)

            if overlaps:

                l = end - start
                a = numpy.ones(l)
                for i in overlaps:
                    s = max(0, i.start - start)
                    e = min(l, i.end - start)
                    a[s:e] = 0

                segments = Intervals.fromArray(a)
                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    yield (gff)

                continue

        noutput += 1

        yield (gff)

    E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
           (ninput, noutput, ncropped, ndeleted))
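
# A minimal usage sketch, assuming the module's usual imports (GTF,
# IOTools, sys); the file names are placeholders. Every interval in
# regions.gff is cut out of the input features, which may split a
# single feature into several pieces.
if __name__ == "__main__":
    gffs = GTF.iterator(IOTools.open_file("input.gff", "r"))
    for cropped in cropGFF(gffs, "regions.gff"):
        sys.stdout.write(str(cropped) + "\n")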
Example #39
def combineGFF(gffs,
               min_distance,
               max_distance,
               min_features,
               max_features,
               merge=True,
               output_format="%06i"):
    """join intervals in gff file.

    Note: strandedness is ignored
    """

    E.info("joining features: min distance=%i, max_distance=%i, "
           "at least %i and at most %i features." %
           (min_distance, max_distance, min_features, max_features))

    def iterate_chunks(gffs):

        try:
            last = next(gffs)
        except StopIteration:
            return
        to_join = [last]

        for gff in gffs:
            d = gff.start - last.end
            if gff.contig == last.contig:
                assert gff.start >= last.start, "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" % (
                    d, last, gff)

            if gff.contig != last.contig or \
                    (max_distance and d > max_distance) or \
                    (min_distance and d < min_distance) or \
                    (max_features and len(to_join) >= max_features):

                if len(to_join) >= min_features:
                    yield to_join
                to_join = []

            last = gff
            to_join.append(gff)

        if len(to_join) >= min_features:
            yield to_join

    id = 1
    ninput, noutput, nfeatures = 0, 0, 0

    if merge:
        for to_join in iterate_chunks(gffs):

            ninput += 1
            y = GTF.Entry()
            t = output_format % id
            y.fromGTF(to_join[0], t, t)
            y.start = to_join[0].start
            y.end = to_join[-1].end

            yield (y)
            nfeatures += 1

            noutput += 1
            id += 1
    else:

        for to_join in iterate_chunks(gffs):

            ninput += 1
            for x in to_join:
                y = GTF.Entry()
                t = output_format % id
                y.fromGTF(x, t, t)
                yield (y)
                nfeatures += 1

            noutput += 1
            id += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i" %
           (ninput, noutput, nfeatures))
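
# A minimal usage sketch, assuming the module's usual imports (GTF, sys):
# merge runs of features at most 100 bp apart, requiring at least 2
# features per run (0 disables the maximum limits).
if __name__ == "__main__":
    for merged in combineGFF(GTF.iterator(sys.stdin),
                             min_distance=0,
                             max_distance=100,
                             min_features=2,
                             max_features=0,
                             merge=True):
        sys.stdout.write(str(merged) + "\n")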
Example #40
def buildTerritories(iterator, fasta, method, options):
    """build gene territories. 

    Exons in a gene are merged and the resulting 
    segments enlarged by --radius. Territories
    overlapping are divided in the midpoint between
    the two genes.

    If *method* is ``gene``, gene territories will be built.
    If *method* is ``tss``, tss territories will be built.

    """

    ninput, noutput, nambiguous = 0, 0, 0

    assert method in ("gene", "tss")

    dr = 2 * options.radius

    prev_pos = 0
    last_contig = None
    gff = None

    def _iterator(iterator):
        """yield gene plus the locations of the end of the previous gene and start of next gene"""

        last_end, prev_end = 0, 0
        last_contig = None
        last = None
        for matches in GTF.iterator_overlaps(iterator):

            this_start = min([x.start for x in matches])
            this_end = max([x.end for x in matches])

            if method == "tss":
                # restrict to tss
                if matches[0].strand == "+":
                    this_end = this_start + 1
                else:
                    this_start = this_end - 1

            this_contig = matches[0].contig

            if last_contig != this_contig:
                if last:
                    yield prev_end, last, fasta.getLength(last_contig)
                last_end, prev_end = 0, 0
            else:
                yield prev_end, last, this_start

            prev_end = last_end
            last_end = this_end
            last = matches
            last_contig = this_contig

        if last:
            yield prev_end, last, fasta.getLength(last_contig)

    for last_end, matches, next_start in _iterator(iterator):

        ninput += 1
        gff = GTF.Entry().copy(matches[0])

        start = min([x.start for x in matches])
        end = max([x.end for x in matches])

        if method == "tss":
            # restrict to tss
            if matches[0].strand == "+":
                end = start + 1
            else:
                start = end - 1

        d = start - last_end
        if d < dr:
            start -= d // 2
        else:
            start -= options.radius

        d = next_start - end
        if d < dr:
            end += d // 2
        else:
            end += options.radius

        gff.gene_id = ":".join(set([x.gene_id for x in matches]))
        gff.transcript_id = gff.gene_id
        gff.start, gff.end = start, end

        nsegments = len(matches)
        if nsegments > 1:
            gff.addAttribute("ambiguous", nsegments)
            nambiguous += 1

        assert gff.start < gff.end, "invalid segment: %s" % str(gff)
        options.stdout.write(str(gff) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nambiguous=%i" %
           (ninput, noutput, nambiguous))
Example #41
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help=
        "aggregate name for annotations if only single file is provided from STDIN [default=None]."
    )

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option(
        "--output-filename-synonyms",
        dest="output_filename_synonyms",
        type="string",
        help=
        "output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]."
    )

    parser.add_option("-m",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "annotations-genes",
                               "annotations-go", "workspace",
                               "annotations-gff"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."
    )

    parser.add_option(
        "--remove-regex",
        dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        output_filename_synonyms=None,
        input_format="gff",
        remove_regex=None,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith("annotations"):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile(options.remove_regex)

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered(GFF.iterator(options.stdin),
                                         feature=options.feature)

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals(iterator, with_records=with_records)
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments(options.stdout,
                                              intervals,
                                              options.section,
                                              outfile_synonyms=outfile_synonyms,
                                              max_length=options.max_length,
                                              remove_regex=options.remove_regex)

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         feature=options.feature)

        geneid2categories = IOTools.readMultiMap(
            open(options.input_filename_map, "r"))

        category2segments = collections.defaultdict(list)

        for contig, gffs in GTF.readAsIntervals(iterator,
                                                with_gene_id=True).items():
            if options.remove_regex and options.remove_regex.search(contig):
                continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories:
                    continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

        for category, segments in category2segments.items():
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (category, "\t".join(["%i" % x for x in segments])))
            E.info("set %s annotated with %i segments" %
                   (category, len(segments)))

    elif options.section == "annotations":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered(GFF.iterator(sys.stdin),
                                                 feature=options.feature)
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                # keep the file open: the iterator is consumed lazily below
                infile = IOTools.openFile(filename)
                iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                                 feature=options.feature)

            else:
                infile = IOTools.openFile(filename)
                iterator = GTF.iterator_filtered(GFF.iterator(infile),
                                                 feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                raise ValueError("don't know how to filter %s" % filename)

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator(sys.stdin)
            else:
                iterator = GTF.iterator_filtered(
                    GFF.iterator(open(filename, "r")))

            segments = collections.defaultdict(list)
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append(
                    (gff.contig, gff.start, gff.end))

            for feature, s in segments.items():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    options.stdout.write(
                        "%s\t%i\t%s\t(%i,%i)\n" %
                        (prefix, nsegments, contig, start, end))
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

        for feature, id_range in feature2segments.items():
            start, end = id_range
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (feature, "\t".join(["%i" % x for x in range(start, end)])))
            E.info("set %s annotated with %i segments" %
                   (feature, end - start))

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile(filename)
            iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                             feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                # output all
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label = collections.defaultdict(list)
                label2segments = {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1

                for label, segments in label2segments.items():
                    options.stdout.write(
                        "##Ann\t%s\t%s\n" %
                        (label, "\t".join(["%i" % x for x in segments])))
                    E.info("set %s (%s) annotated with %i segments" %
                           (label, filename, len(segments)))

    E.info("ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
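
# A minimal command-line sketch, assuming this script is saved as
# gff2annotator2tsv.py; the file names are placeholders:
#
#   python gff2annotator2tsv.py --section=segments \
#       --genome-file=genome < in.gff > segments.out
#   python gff2annotator2tsv.py --section=annotations-go \
#       --input-filename-map=gene2go.map in.gtf > annotations.out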
Example #42
def loadEnsemblTranscriptInformation(ensembl_gtf,
                                     geneset_gtf,
                                     outfile,
                                     csvdb,
                                     set_biotype=None,
                                     set_transcript_support=None):
    '''
    Parse and annotate a geneset_gtf using the original Ensembl
    GTF attributes.

    The ensembl GTF structure is not static, so this needs to maintain
    backwards compatibility.  Attributes that are used downstream may
    only be present in later GTF versions; these are set to
    default/missing values when not natively present.

    Specifically, gene_biotype and transcript_biotype default to
    *set_biotype* (or "NA"), and transcript_support_level defaults to
    *set_transcript_support* (or "NA") when missing.

    Arguments
    ---------
    ensembl_gtf: string
      PATH to ensemlb gtf containing all annotation information and
      attributes

    geneset_gtf: string
      PATH to the geneset GTF to annotate with ensembl attributes

    outfile: string
      PATH to output filtered, annotated and sorted by gene position

    csvdb: string
      PATH to the SQLite database to upload transcript information
      table

    set_biotype: string
      default value for the gene_ and transcript_biotype columns
      when they are not natively present; if not given, "NA" is used

    set_transcript_support: int
      default value for the transcript_support_level column; if not
      given, it is set to "NA"
    '''

    table = P.to_table(outfile)

    gtf_file = IOTools.open_file(geneset_gtf, "r")
    gtf_iterator = GTF.transcript_iterator(GTF.iterator(gtf_file))

    ensembl_file = IOTools.open_file(ensembl_gtf, "r")
    ensembl_iterator = GTF.transcript_iterator(GTF.iterator(ensembl_file))

    # parse the two gtfs, creating keys from the GTF entries
    parse_ensembl = {}
    for ens_gtf in ensembl_iterator:
        for ens_trans in ens_gtf:
            ens_att = ens_trans.asDict()
            ens_vals = {x: ens_trans[x] for x in ens_trans.keys()}
            ens_att.update(ens_vals)
            parse_ensembl[ens_trans.transcript_id] = ens_att
    ensembl_file.close()

    parse_gtf = {}
    for gtf in gtf_iterator:
        for trans in gtf:
            trans_atts = trans.asDict()
            trans_vals = {g: trans[g] for g in trans.keys()}
            trans_atts.update(trans_vals)
            parse_gtf[trans.transcript_id] = trans_atts
    gtf_file.close()

    # convert to dataframe for easier merging, annotating
    # and ultimately SQL database insertion
    # these are large dictionaries to parse, so might
    # be quite memory and compute heavy
    ensembl_df = pd.DataFrame(parse_ensembl).T
    gtf_df = pd.DataFrame(parse_gtf).T

    # check for presence of gene_biotype and
    # transcript_support_level
    merged_df = pd.merge(gtf_df,
                         ensembl_df,
                         left_on=[cx for cx in gtf_df.columns],
                         right_on=[rx for rx in gtf_df.columns],
                         how='left')

    if "transcript_support_level" in merged_df.columns:
        E.info("transcript_support_level is present")
    else:
        E.info("transcript_support_level is not present")
        if set_transcript_support:
            merged_df["transcript_support_level"] = set_transcript_support
        else:
            merged_df["transcript_support_level"] = "NA"

    if "gene_biotype" in merged_df.columns:
        E.info("gene biotype is present")
        if "transcript_biotype" in merged_df.columns:
            E.info("transcript biotype is present")
        else:
            E.info("transcript biotype is not present")
            if set_biotype:
                merged_df["transcript_biotype"] = set_biotype
            else:
                merged_df["transcript_biotype"] = "NA"
    else:
        E.info("gene biotype is not present")
        if set_biotype:
            merged_df["gene_biotype"] = set_biotype
            merged_df["transcript_biotype"] = set_biotype
        else:
            merged_df["gene_biotype"] = "NA"
            merged_df["transcript_biotype"] = "NA"

    # sort on gene then transcript id
    # remove exon_number and exon_id to maintain
    # compatibility with previous code
    merged_df.drop(["exon_id", "exon_number"], axis=1, inplace=True,
                   errors="ignore")

    # sort the output and load into the csvdb
    # add a multindex to use multiple SQL indices
    merged_df.sort_values(by=["gene_id", "transcript_id"], inplace=True)

    merged_df.set_index(
        ["gene_id", "gene_name", "protein_id", "transcript_id"],
        inplace=True,
        drop=True)

    merged_df.to_sql(
        name=table,
        con=sqlite3.connect(csvdb),
        if_exists='replace',
        index_label=["gene_id", "gene_name", "protein_id", "transcript_id"])
    return 1
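
# A minimal usage sketch; the paths are placeholders, and P.to_table is
# assumed to derive the table name from the outfile:
if __name__ == "__main__":
    loadEnsemblTranscriptInformation("ensembl.gtf.gz",
                                     "geneset.gtf.gz",
                                     "transcript_info.load",
                                     "csvdb")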
Example #43
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--j-gtf",
                      dest="j_gtf",
                      type="string",
                      help="gtf file of IgH J genes")

    parser.add_option("--c-gtf",
                      dest="c_gtf",
                      type="string",
                      help="gtf file of IgH constant genes")

    parser.add_option("--ig-coordinates",
                      dest="locus",
                      type="string",
                      help="reference coordinates for IgH locus in "
                      "aligned genome")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    # check for bam file index, otherwise make one
    if not os.path.exists(infile + ".bai"):
        E.info("No index file exists for %s" % infile)
        E.info("generating index for bam file %s" % infile)
        os.system("samtools index %s" % infile)

    j_pos = gtfPositions(options.j_gtf)
    c_pos = gtfPositions(options.c_gtf)

    coords = options.locus.split(":")
    contig = coords[0]
    start = int(coords[-1].split("-")[0])
    end = int(coords[-1].split("-")[1])

    samfile = pysam.AlignmentFile(infile, "rb")
    read_cache = {}
    for read in samfile.fetch(reference=contig, start=start, end=end):
        if read.is_proper_pair:
            read_cache.setdefault(read.query_name, []).append(read)

    switched_reads = set()
    for pair in read_cache.values():
        # select unique pairs
        if len(pair) != 2:
            continue
        read1, read2 = pair
        r1_ref = set(read1.get_reference_positions())
        r2_ref = set(read2.get_reference_positions())
        if (r1_ref.intersection(j_pos) and r2_ref.intersection(c_pos)) or \
           (r1_ref.intersection(c_pos) and r2_ref.intersection(j_pos)):
            switched_reads.add((read1, read2))

    # add Ig constant genes to dictionary
    ig_dict = {}
    with IOTools.openFile(options.c_gtf) as gfile:
        for gene in GTF.transcript_iterator(GTF.iterator(gfile)):
            for trans in gene:
                pos = set(range(trans.start, trans.end))
                symbol = trans.asDict()['transcript_name'].split("-")[0]
                ig_dict.setdefault(symbol, set()).update(pos)

    ig_count = {}
    for pair in switched_reads:
        all_refs = set()
        all_refs.update(pair[0].get_reference_positions())
        all_refs.update(pair[1].get_reference_positions())
        for gene, positions in ig_dict.items():
            inter = len(positions.intersection(all_refs))
            ig_count[gene] = ig_count.get(gene, 0) + (1 if inter else 0)

    options.stdout.write("Ig_isotype\tcounts\n")
    for each in ig_count.keys():
        options.stdout.write("%s\t%i\n" % (each, ig_count[each]))

    # write footer and output benchmark information.
    E.Stop()
Example #44
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2gff.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-i",
        "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help=
        "Ignore transcripts on contigs that are not in the genome-file [default=%default]."
    )

    parser.add_option("-s",
                      "--restrict-source",
                      dest="restrict_source",
                      type="choice",
                      choices=("protein_coding", "pseudogene", "lncRNA"),
                      help="restrict input by source [default=%default].")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=(
                          "full",
                          "genome",
                          "exons",
                          "promotors",
                          "tts",
                          "regulons",
                          "tts-regulons",
                          "genes",
                          "territories",
                          "tss-territories",
                          "great-domains",
                      ),
                      help="method for defining segments [default=%default].")

    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius of a territory [default=%default].")

    parser.add_option(
        "-f",
        "--flank",
        dest="flank",
        type="int",
        help="size of the flanking region next to a gene [default=%default].")

    parser.add_option(
        "--increment",
        dest="increment",
        type="int",
        help=
        "size of increment in flank in genestructure annotation [default=%default]."
    )

    parser.add_option("-p",
                      "--promotor",
                      dest="promotor",
                      type="int",
                      help="size of a promotor region [default=%default].")

    parser.add_option(
        "-u",
        "--upstream",
        dest="upstream",
        type="int",
        help="size of region upstream of tss [default=%default].")

    parser.add_option(
        "-d",
        "--downstream",
        dest="downstream",
        type="int",
        help="size of region downstream of tss [default=%default].")

    parser.add_option(
        "--detail",
        dest="detail",
        type="choice",
        choices=("introns+exons", "exons", "introns"),
        help="level of detail for gene structure annotation [default=%default]."
    )

    parser.add_option("--merge-promotors",
                      dest="merge_promotors",
                      action="store_true",
                      help="merge promotors [default=%default].")

    parser.add_option(
        "--min-intron-length",
        dest="min_intron_length",
        type="int",
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default]."
    )

    parser.add_option(
        "-o",
        "--sort",
        dest="sort",
        action="store_true",
        help="sort input before processing. Otherwise, the input is assumed "
        "to be sorted [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        increment=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="genome",
        radius=50000,
        promotor=5000,
        merge_promotors=False,
        upstream=5000,
        downstream=5000,
        detail="exons",
        sort=False,
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        raise ValueError("please specify a --genome-file")

    if options.restrict_source:
        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         source=options.restrict_source)
    else:
        iterator = GTF.iterator(options.stdin)

    # elif options.method in ("promotors", "tts", "regulons"):
    #     iterator = GTF.iterator_filtered( GTF.iterator(options.stdin), source = "protein_coding")
    # else:
    #     iterator = GTF.iterator(options.stdin)

    if options.sort:
        iterator = GTF.iterator_sorted(iterator, sort_order="position")

    if options.method == "full" or options.method == "genome":
        segmentor = annotateGenome(iterator, fasta, options)
    elif options.method == "territories":
        segmentor = buildTerritories(iterator, fasta, 'gene', options)
    elif options.method == "tss-territories":
        segmentor = buildTerritories(iterator, fasta, 'tss', options)
    elif options.method == "exons":
        segmentor = annotateExons(iterator, fasta, options)
    elif options.method == "promotors":
        segmentor = annotatePromoters(iterator, fasta, options)
    elif options.method == "regulons":
        segmentor = annotateRegulons(iterator, fasta, True, options)
    elif options.method == "tts-regulons":
        segmentor = annotateRegulons(iterator, fasta, False, options)
    elif options.method == "tts":
        segmentor = annotateTTS(iterator, fasta, options)
    elif options.method == "genes":
        segmentor = annotateGenes(iterator, fasta, options)
    elif options.method == "great-domains":
        segmentor = annotateGREATDomains(iterator, fasta, options)
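    # the annotate*/build* helpers do the actual output as a side
    # effect; the returned segmentor is not consumed further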

    E.Stop()
    def test_number_of_intervals_is_correct(self):

        with IOTools.openFile(self.filename) as inf:
            records = list(GTF.iterator(inf))

        self.assertEqual(len(records), 100)
Beispiel #46
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--filename-exons", "--filename-gtf", dest="filename_exons", type="string", metavar="gtf",
                      help="gtf formatted file with non-overlapping exon locations. [%default]")

    parser.set_defaults(
        filename_exons=None,
        read_length=200,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    exons = GTF.readAndIndex(
        GTF.iterator(IOTools.openFile(options.filename_exons)))

    pysam_in = pysam.Samfile("-", "rb")

    nspliced = 0
    nspliced_ignored = 0
    nspliced_nooverlap = 0
    nspliced_halfoverlap = 0
    nspliced_bothoverlap = 0
    nspliced_overrun = [0] * 2 * (options.read_length + 10)
    nspliced_exact = 0
    nspliced_inexact = 0
    nunspliced = 0
    nunspliced_overlap = 0
    nunspliced_ignored = 0
    nunspliced_nooverlap = 0
    nunspliced_overrun = [0] * (options.read_length + 10)
    overrun_offset = options.read_length + 10
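    # spliced over- and underruns share one histogram: an over/underrun
    # of r bases is counted at index overrun_offset + r, so underruns
    # occupy [0, overrun_offset] and overruns the upper half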
    ninput = 0
    nunmapped = 0

    c = E.Counter()

    def _splice_overrun(start, end, overlap):
        '''return splice-site over/underrun.

        positive values: overrun
        negative values: underrun
        0: no over/underrun
        '''
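        # worked example (hypothetical coordinates): with merged exon
        # boundaries (100, 200), a segment (95, 150) overruns the start
        # by r = 100 - 95 = 5, while a segment (120, 180) lying inside
        # underruns by r = -min(120 - 100, 200 - 180) = -20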

        exon_start = min([x[0] for x in overlap])
        exon_end = max([x[1] for x in overlap])

        if start <= exon_start and end > exon_start:
            # overrun at start or match
            r = exon_start - start
        elif start < exon_end and end >= exon_end:
            # overrun at end or match
            r = end - exon_end
        else:
            # underrun - distance to closest exon boundary
            r = -min(start - exon_start, exon_end - end)

        return r

    for read in pysam_in:
        ninput += 1
        if read.is_unmapped:
            nunmapped += 1
            continue

        # check for BAM_CREF_SKIP code in cigar string
        cigar = read.cigar
        is_spliced = 3 in [x[0] for x in cigar]

        contig = pysam_in.getrname(read.tid)
        start = read.pos
        end = read.aend
        if is_spliced:
            # count both ends
            nspliced += 1

            if len(cigar) != 3:
                nspliced_ignored += 1
                continue

            start5, end5 = start, start + cigar[0][1]
            start3, end3 = end - cigar[2][1], end
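            # with exactly three cigar operations (match, skip, match),
            # the first and last blocks flank the skipped intron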
            try:
                overlap3 = list(exons.get(contig, start3, end3))
                overlap5 = list(exons.get(contig, start5, end5))
            except KeyError:
                overlap3 = overlap5 = []

            ovl3 = len(overlap3)
            ovl5 = len(overlap5)
            o3 = o5 = None
            if not ovl3 and not ovl5:
                nspliced_nooverlap += 1
            elif ovl3 and not ovl5:
                nspliced_halfoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
            elif ovl5 and not ovl3:
                nspliced_halfoverlap += 1
                o5 = _splice_overrun(start5, end5, overlap5)
            else:
                # both overlap
                nspliced_bothoverlap += 1
                o3 = _splice_overrun(start3, end3, overlap3)
                o5 = _splice_overrun(start5, end5, overlap5)

            if o3 is not None:
                if o3 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o3)] += 1
            if o5 is not None:
                if o5 == 0:
                    nspliced_exact += 1
                else:
                    nspliced_inexact += 1
                nspliced_overrun[max(0, overrun_offset + o5)] += 1
        else:
            nunspliced += 1
            try:
                overlap = list(exons.get(contig, start, end))
            except KeyError:
                overlap = []

            if len(overlap) == 0:
                nunspliced_nooverlap += 1
            elif len(overlap) >= 1:
                nunspliced_overlap += 1
                # multiple overlap - merge exons (usually: small introns)
                exon_start = min([x[0] for x in overlap])
                exon_end = max([x[1] for x in overlap])
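                # overrun counts the read bases extending beyond the
                # merged exon on either side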
                ostart = max(0, exon_start - start)
                oend = max(0, end - exon_end)
                o = min(end, exon_end) - max(start, exon_start)
                overrun = ostart + oend
                nunspliced_overrun[overrun] += 1

    # output histograms
    outfile = E.openOutputFile("overrun")
    outfile.write(
        "bases\tunspliced_overrun_counts\tspliced_overrun_counts\tspliced_underrun_counts\n")
    _nspliced_overrun = nspliced_overrun[overrun_offset:]
    _nspliced_underrun = nspliced_overrun[:overrun_offset + 1]
    _nspliced_underrun.reverse()
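    # index 0 of each sliced histogram is the exact-match bin; the
    # underrun half is reversed so both halves run from 0 upwards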
    for x, v in enumerate(zip(nunspliced_overrun, _nspliced_overrun, _nspliced_underrun)):
        outfile.write("%i\t%s\n" % (x, "\t".join(map(str, v))))
    outfile.close()

    # output summary
    # convert to counter
    c.input = ninput
    c.unmapped = nunmapped
    c.mapped = ninput - nunmapped

    c.unspliced = nunspliced
    c.unspliced_nooverlap = nunspliced_nooverlap
    c.unspliced_nooverrun = nunspliced_overrun[0]
    c.unspliced_overlap = nunspliced_overlap
    c.unspliced_overrun = sum(nunspliced_overrun[1:])

    c.spliced = nspliced
    c.spliced_nooverlap = nspliced_nooverlap
    c.spliced_halfoverlap = nspliced_halfoverlap
    c.spliced_bothoverlap = nspliced_bothoverlap
    c.spliced_exact = nspliced_exact
    c.spliced_inexact = nspliced_inexact
    c.spliced_ignored = nspliced_ignored
    c.spliced_underrun = sum(_nspliced_underrun[1:])
    c.spliced_overrun = sum(_nspliced_overrun[1:])

    outfile = options.stdout
    outfile.write("category\tcounts\n")
    for k, v in c.items():
        outfile.write("%s\t%i\n" % (k, v))

    # write footer and output benchmark information.
    E.Stop()
Beispiel #47
0
def main(argv=None):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-s",
        "--filename-strand",
        dest="filename_strand",
        type="string",
        help="set strand information according to file [default=%DEFAULT].")

    parser.set_defaults(as_gtf=False, filename_strand=None, test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    ####################################
    if options.filename_strand:
        map_id2strand = IOTools.readMap(open(options.filename_strand, "r"))
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # both output modes are built from a GTF.Entry; for plain gff
    # output, attributes are added via addAttribute below
    gff = GTF.Entry()

    gff.source = "psl"
    gff.feature = "exon"

    ids = {}

    while True:

        if options.test and ninput >= options.test:
            break

        match = next(iterator)

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            id = match.mQueryId
        else:
            id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1
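        # repeated matches of the same query get a ":<n>" suffix so
        # ids remain unique in the output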

        if options.as_gtf:
            gff.contig = match.mSbjctId
            gff.gene_id = id
            gff.transcript_id = id
        else:
            gff.contig = match.mSbjctId
            gff.clearAttributes()
            gff.addAttribute("gene_id", id)

        if id in map_id2strand:
            gff.strand = map_id2strand[id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():

            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Beispiel #48
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: diff_gtf.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-e", "--write-equivalent", dest="write_equivalent",
                      help="write equivalent entries [default=%default].", action="store_true")

    parser.add_option("-f", "--write-full", dest="write_full",
                      help="write full gff entries [default=%default].", action="store_true")

    parser.add_option("-o", "--format=", dest="format",
                      help="output format [flat|multi-line] [default=%default]")

    parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s", "--ignore-strand", dest="ignore_strand", action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        format="flat",
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.Start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    idx, genes2 = {}, set()
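    # index every exon of the second file by contig so that exons of
    # the first file can be intersected against it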
    for e in GTF.readFromFile(IOTools.openFile(input_filename2, "r")):
        genes2.add(e.gene_id)
        if e.contig not in idx:
            idx[e.contig] = bx.intervals.intersection.Intersecter()
        idx[e.contig].add_interval(
            bx.intervals.Interval(e.start, e.end, value=e))

    overlaps_genes = []

    E.info("reading data finished: %i contigs" % len(idx))

    # outfile_diff and outfile_overlap not implemented
    # outfile_diff = getFile( options, "diff" )
    # outfile_overlap = getFile( options, "overlap" )
    overlapping_genes = set()

    genes1 = set()

    # iterate over exons
    with IOTools.openFile(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):

            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(this.start, this.end)
            except KeyError:
                continue

            others = [x.value for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
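            # '=' marks identical coordinates, '|' one shared boundary,
            # '~' overlap without a shared boundary; the classification
            # is computed for the --write-equivalent report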
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    # if outfile_diff != options.stdout: outfile_diff.close()
    # if outfile_overlap != options.stdout: outfile_overlap.close()

    outfile = None
    ##################################################################
    ##################################################################
    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in overlapping_genes:
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        outfile = getFile(options, "genes_uniq1")
        b = set([x[0] for x in overlapping_genes])
        d = genes1.difference(b)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(d) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write("%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" % (
            os.path.basename(input_filename1), len(genes1), len(b),
            100.0 * len(b) / len(genes1),
            len(d), 100.0 * len(d) / len(genes1)))

        outfile = getFile(options, "genes_uniq2")
        b = set([x[1] for x in overlapping_genes])
        d = genes2.difference(b)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(d) + "\n")
        if outfile != options.stdout:
            outfile.close()

        outfile_total.write("%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" % (
            os.path.basename(input_filename2), len(genes2), len(b),
            100.0 * len(b) / len(genes2),
            len(d), 100.0 * len(d) / len(genes2)))
        if outfile_total != options.stdout:
            outfile_total.close()

    E.Stop()
Beispiel #49
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header",
                      dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--queries-tsv-file",
                      dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option(
        "--allow-duplicates",
        dest="allow_duplicates",
        action="store_true",
        help=
        """permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""
    )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(
            GTF.iterator(sys.stdin), feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend
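            # each combined exon interval becomes one ungapped block;
            # the diagonal offset shifts transcript (row) coordinates
            # onto genome (column) coordinates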

        entry = Blat.Match()
        entry.mQueryId = gffs[0].transcript_id
        entry.mSbjctId = gffs[0].contig
        entry.strand = gffs[0].strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        # default to the aligned length; use the true query length when
        # the queries fasta provides it
        entry.mQueryLength = result.getRowTo()
        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
Beispiel #50
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t",
                      "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes",
                      dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j",
                      "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes",
                      dest="unset_genes",
                      type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u",
                      "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i",
                      "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g",
                      "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G",
                      "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r",
                      "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes",
                      dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a",
                      "--apply",
                      dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample_size",
                      type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end "
        "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping",
                      dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates",
                      dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates",
                      dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates",
                      dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                           strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin),
                                         strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                               strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons
                                      ]), max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(GTF.flat_gene_iterator(
            GTF.iterator(options.stdin)),
                                          sort_by="contig-strand-start")
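        # chunks arrive sorted by contig, strand and start, so genes
        # that overlap or abut appear consecutively and can be merged
        # in one pass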

        def iterate_chunks(gff_chunks):

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(
                options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]),
                               max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort(key=lambda x: x[0])
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript is the one that shares
                the largest number of exons with the other transcripts
                of the gene.
                '''
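                # e.g. with transcripts A=[e1, e2], B=[e1, e2, e3] and
                # C=[e1]: e1 occurs 3x, e2 2x and e3 1x, so B scores
                # 3 + 2 + 1 = 6 and is chosen; ties resolve in favour
                # of later input order via the stable sort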
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript
                                      if x.feature == "exon"])
                exon_counts = {}
                # groupby only merges adjacent items, so sort first
                for key, exons in itertools.groupby(sorted(all_exons)):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([
                        exon_counts[(x.start, x.end)] for x in transcript
                        if x.feature == "exon"
                    ])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort(key=lambda x: x[0])
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--apply) or a sample-size."

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)
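            # introns are the gaps between consecutive combined
            # exon/CDS intervals; genes with a single interval yield
            # none and are discarded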

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    min_length = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges
                        if x[1] - x[0] > min_length
                    ]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs
                                      ]), max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [
            item for item in set(transcript_ids)
            if transcript_ids.count(item) > 1
        ]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))
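        # per-id occurrence counters drive the ".1", ".2", ... suffix
        # appended to every duplicated id below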

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute(
                        'gene_id',
                        gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute(
                        'transcript_id', gtf.transcript_id + "." +
                        str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) // 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
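
# A minimal, standalone sketch of the duplicate-renaming strategy used by
# --rename-duplicates above: every occurrence of a repeated identifier gets
# a numeric suffix (".1", ".2", ...). The helper name is hypothetical and
# not part of this script.
import collections

def rename_duplicates(ids):
    """Return ids with repeated entries suffixed by their occurrence count."""
    totals = collections.Counter(ids)
    seen = collections.Counter()
    renamed = []
    for i in ids:
        if totals[i] > 1:
            seen[i] += 1
            renamed.append("%s.%i" % (i, seen[i]))
        else:
            renamed.append(i)
    return renamed

# rename_duplicates(["geneA", "geneB", "geneA"])
# -> ["geneA.1", "geneB", "geneA.2"]
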
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g",
        "--gtf-file",
        dest="filename_gtf",
        type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option("-m",
                      "--filename-mismapped",
                      dest="filename_mismapped",
                      type="string",
                      help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j",
        "--junctions-bed-file",
        dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r",
        "--filename-regions",
        dest="filename_regions",
        type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t",
        "--transcripts-gtf-file",
        dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option("-p",
                      "--map-tsv-file",
                      dest="filename_map",
                      type="string",
                      help="filename mapping transcript numbers (used by "
                      "--filename-transciptome) to transcript names "
                      "(used by --filename-gtf) [%default]")

    parser.add_option("-s",
                      "--filename-stats",
                      dest="filename_stats",
                      type="string",
                      help="filename to output stats to [%default]")

    parser.add_option(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i",
                      "--ignore-mismatches",
                      dest="ignore_mismatches",
                      action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option(
        "-c",
        "--remove-contigs",
        dest="remove_contigs",
        type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f",
                      "--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.readMap(IOTools.open_file(options.filename_map),
                                 has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-",
                                             "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-",
                                             "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = _bams2bam.filter(genome_samfile,
                         output_samfile,
                         output_mismapped,
                         transcripts_samfile,
                         junctions_samfile,
                         transcripts,
                         regions=regions_to_remove,
                         unique=options.unique,
                         remove_contigs=options.remove_contigs,
                         colour_mismatches=options.colour_mismatches,
                         ignore_mismatches=options.ignore_mismatches,
                         ignore_transcripts=transcripts_samfile is None,
                         ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
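
# A small sketch of the map inversion performed above for --map-tsv-file:
# the tsv file is read one way round and the script needs the reverse
# lookup. A plain dict inversion silently keeps only the last key for a
# duplicated value; the guarded variant below fails loudly instead.
# Hypothetical illustration, not part of this script.
def invert_map(mapping):
    inverted = {}
    for key, value in mapping.items():
        if value in inverted:
            raise ValueError("duplicate value %r - map is not invertible" % value)
        inverted[value] = key
    return inverted

# invert_map({"ENST0001": "1", "ENST0002": "2"})
# -> {"1": "ENST0001", "2": "ENST0002"}
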
Example #52
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron
    and flanking regions.

    This method annotates per transcript. In order to achieve a unique
    tiling, use only a single transcript per gene and remove any overlap
    between genes.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end) for x in transcript
                     if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1
                continue

            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

            results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
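
# Standalone sketch of the flank tiling in annotateGenes above: the region
# upstream and downstream of a transcript is cut into windows of size
# `increment` until `flank` is covered, then clipped to the contig bounds.
# The helper name is illustrative only.
def tile_flanks(start, end, flank, increment, lcontig):
    upstream, downstream = [], []
    s, e = start, end
    for _ in range(0, flank, increment):
        upstream.append((s - increment, s))
        s -= increment
        downstream.append((e, e + increment))
        e += increment
    upstream = [x for x in upstream if x[0] >= 0]
    downstream = [x for x in downstream if x[1] <= lcontig]
    return upstream, downstream

# tile_flanks(1000, 2000, 300, 100, 2150)
# -> ([(900, 1000), (800, 900), (700, 800)], [(2000, 2100)])
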
Example #53
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--mask-bed-file",
        dest="filename_rna",
        type="string",
        metavar='GFF',
        help="gff formatted file with masking locations. The number of "
        "reads overlapping the intervals in the given file will be "
        "computed. Note that the computation currently does not take "
        "into account indels, so it is an approximate count only. "
        "[%default]")

    parser.add_option(
        "-f",
        "--ignore-masked-reads",
        dest="remove_rna",
        action="store_true",
        help="as well as counting reads in the file given by --mask-bed-file, "
        "also remove these reads for duplicate and match statistics. "
        "[%default]")

    parser.add_option(
        "-i",
        "--num-reads",
        dest="input_reads",
        type="int",
        help="the number of reads - if given, used to provide percentages "
        "[%default]")

    parser.add_option(
        "-d",
        "--output-details",
        dest="output_details",
        action="store_true",
        help="output per-read details into a separate file. Read names are "
        "md5/base64 encoded [%default]")

    parser.add_option(
        "-q",
        "--fastq-file",
        dest="filename_fastq",
        help="filename with sequences and quality scores. This file is only "
        "used to collect sequence identifiers. Thus, for paired end data a "
        "single file is sufficient [%default]")

    parser.set_defaults(
        filename_rna=None,
        remove_rna=False,
        input_reads=0,
        force_output=False,
        filename_fastq=None,
        output_details=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if options.filename_rna:
        rna = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.filename_rna)))
    else:
        rna = None

    if len(args) > 0:
        pysam_in = pysam.AlignmentFile(args[0], "rb")
    elif options.stdin == sys.stdin:
        pysam_in = pysam.AlignmentFile("-", "rb")
    else:
        pysam_in = pysam.AlignmentFile(options.stdin, "rb")

    if options.output_details:
        outfile_details = E.openOutputFile("details", "w")
    else:
        outfile_details = None

    if options.filename_fastq and not os.path.exists(options.filename_fastq):
        raise IOError("file %s does not exist" % options.filename_fastq)

    (counter, flags_counts, nh_filtered, nh_all,
     nm_filtered, nm_all, mapq, mapq_all, max_hi) = \
        _bam2stats.count(pysam_in,
                         options.remove_rna,
                         rna,
                         filename_fastq=options.filename_fastq,
                         outfile_details=outfile_details)

    if max_hi > 0 and max_hi != max(nh_all.keys()):
        E.warn("max_hi(%i) is inconsistent with max_nh (%i) "
               "- counts will be corrected" % (max_hi, max(nh_all.keys())))

    outs = options.stdout
    outs.write("category\tcounts\tpercent\tof\n")

    def _write(outs, text, numerator, denominator, base):
        percent = IOTools.prettyPercent(numerator, denominator)
        outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base))

    ###############################
    ###############################
    ###############################
    # Output alignment information
    ###############################
    nalignments_unmapped = flags_counts["unmapped"]
    nalignments_mapped = counter.alignments_input - nalignments_unmapped

    _write(outs, "alignments_total", counter.alignments_input,
           counter.alignments_input, "alignments_total")

    if counter.alignments_input == 0:
        E.warn("no alignments in BAM file - no further output")
        E.Stop()
        return

    _write(outs, "alignments_mapped", nalignments_mapped,
           counter.alignments_input, 'alignments_total')
    _write(outs, "alignments_unmapped", nalignments_unmapped,
           counter.alignments_input, 'alignments_total')

    if nalignments_mapped == 0:
        E.warn("no mapped alignments - no further output")
        E.Stop()
        return

    for flag, counts in sorted(flags_counts.items()):
        if flag == "unmapped":
            continue
        _write(outs, 'alignments_' + flag, counts, nalignments_mapped,
               'alignments_mapped')

    if options.filename_rna:
        _write(outs, "alignments_rna", counter.alignments_rna,
               nalignments_mapped, 'alignments_mapped')
        _write(outs, "alignments_no_rna", counter.alignments_no_rna,
               nalignments_mapped, 'alignments_mapped')

    _write(outs, "alignments_filtered", counter.alignments_filtered,
           nalignments_mapped, "alignments_mapped")

    if counter.filtered == nalignments_mapped:
        normby = "alignments_mapped"
    else:
        normby = "alignments_filtered"

    if counter.filtered > 0:
        _write(outs, "alignments_duplicates", counter.alignments_duplicates,
               counter.alignments_filtered, normby)
        _write(outs, "alignments_unique",
               counter.alignments_filtered - counter.alignments_duplicates,
               counter.alignments_filtered, normby)

    ###############################
    ###############################
    ###############################
    # Output read based information
    ###############################

    # derive the number of mapped reads in file from alignment counts
    if options.filename_fastq:
        nreads_total = counter.total_read
        _write(outs, "reads_total", counter.total_read, nreads_total,
               'reads_total')
        _write(outs, "reads_unmapped", counter.total_read_is_unmapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped", counter.total_read_is_mapped,
               nreads_total, 'reads_total')
        _write(outs, "reads_missing", counter.total_read_is_missing,
               nreads_total, 'reads_total')
        _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq,
               counter.total_read_is_mapped, 'reads_mapped')
        _write(outs, "reads_multimapping", counter.total_read_is_mmap,
               counter.total_read_is_mapped, 'reads_mapped')
    else:
        E.warn('inferring read counts from alignments and NH tags')
        nreads_unmapped = flags_counts["unmapped"]
        nreads_mapped = computeMappedReadsFromAlignments(
            nalignments_mapped, nh_all, max_hi)

        nreads_missing = 0
        if options.input_reads:
            nreads_total = options.input_reads
            # unmapped reads in bam file?
            if nreads_unmapped:
                nreads_missing = nreads_total - nreads_unmapped - nreads_mapped
            else:
                nreads_unmapped = nreads_total - nreads_mapped

        elif nreads_unmapped:
            # if unmapped reads are in bam file, take those
            nreads_total = nreads_mapped + nreads_unmapped
        else:
            # otherwise normalize by mapped reads
            nreads_unmapped = 0
            nreads_total = nreads_mapped

        outs.write("reads_total\t%i\t%5.2f\treads_total\n" %
                   (nreads_total, 100.0))
        outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_mapped, 100.0 * nreads_mapped / nreads_total))
        outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" %
                   (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total))
        outs.write("reads_missing\t%i\t%5.2f\treads_total\n" %
                   (nreads_missing, 100.0 * nreads_missing / nreads_total))

        if len(nh_all) > 1:
            outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" %
                       (nh_all[1], 100.0 * nh_all[1] / nreads_mapped))

        # compute after filtering
        # note that these are rough guesses
        if options.filename_rna:
            nreads_norna = computeMappedReadsFromAlignments(
                counter.filtered, nh_filtered, max_hi)
            _write(outs, "reads_norna", nreads_norna, nreads_mapped,
                   "reads_mapped")
            if len(nh_filtered) > 1:
                _write(outs, "reads_norna_unique", nh_filtered[1],
                       nreads_norna, "reads_mapped")

    pysam_in.close()

    ###############################
    ###############################
    ###############################
    # Output pair information
    ###############################
    if flags_counts["read2"] > 0:
        if options.filename_fastq:
            pairs_mapped = counter.total_pair_is_mapped

            # sanity check
            assert counter.total_pair_is_mapped == \
                (counter.total_pair_is_proper_uniq +
                 counter.total_pair_is_incomplete_uniq +
                 counter.total_pair_is_incomplete_mmap +
                 counter.total_pair_is_proper_duplicate +
                 counter.total_pair_is_proper_mmap +
                 counter.total_pair_not_proper_uniq +
                 counter.total_pair_is_other)

            outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pairs,
                        100.0 * counter.total_pairs / counter.total_pairs))
            outs.write(
                "pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs))
            outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_unmapped, 100.0 *
                        counter.total_pair_is_unmapped / counter.total_pairs))
            outs.write(
                "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_uniq, 100.0 *
                 counter.total_pair_is_proper_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_uniq, 100.0 *
                 counter.total_pair_is_incomplete_uniq / counter.total_pairs))
            outs.write(
                "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_incomplete_mmap, 100.0 *
                 counter.total_pair_is_incomplete_mmap / counter.total_pairs))
            outs.write(
                "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_duplicate, 100.0 *
                 counter.total_pair_is_proper_duplicate / counter.total_pairs))
            outs.write(
                "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_is_proper_mmap, 100.0 *
                 counter.total_pair_is_proper_mmap / counter.total_pairs))
            outs.write(
                "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" %
                (counter.total_pair_not_proper_uniq, 100.0 *
                 counter.total_pair_not_proper_uniq / counter.total_pairs))
            outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" %
                       (counter.total_pair_is_other, 100.0 *
                        counter.total_pair_is_other / counter.total_pairs))

            nread1_total = counter.total_read1
            _write(outs, "read1_total", counter.total_read1, nread1_total,
                   'read1_total')
            _write(outs, "read1_unmapped", counter.total_read1_is_unmapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped", counter.total_read1_is_mapped,
                   nread1_total, 'read1_total')
            _write(outs, "read1_mapped_unique",
                   counter.total_read1_is_mapped_uniq,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "reads_multimapping", counter.total_read1_is_mmap,
                   counter.total_read1_is_mapped, 'read1_mapped')
            _write(outs, "read1_missing", counter.total_read1_is_missing,
                   counter.total_read1_is_mapped, 'read1_total')

            nread2_total = counter.total_read2
            _write(outs, "read2_total", counter.total_read2, nread2_total,
                   'read2_total')
            _write(outs, "read2_unmapped", counter.total_read2_is_unmapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped", counter.total_read2_is_mapped,
                   nread2_total, 'read2_total')
            _write(outs, "read2_mapped_unique",
                   counter.total_read2_is_mapped_uniq,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "reads_multimapping", counter.total_read2_is_mmap,
                   counter.total_read2_is_mapped, 'read2_mapped')
            _write(outs, "read2_missing", counter.total_read2_is_missing,
                   counter.total_read2_is_mapped, 'read2_total')

        else:
            # approximate counts
            pairs_total = nreads_total // 2
            pairs_mapped = flags_counts["proper_pair"] // 2
            _write(outs, "pairs_total", pairs_total, pairs_total,
                   "pairs_total")
            _write(outs, "pairs_mapped", pairs_mapped, pairs_total,
                   "pairs_total")
    else:
        # no paired end data
        pairs_total = pairs_mapped = 0
        outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_total, 0.0))
        outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" %
                   (pairs_mapped, 0.0))

    if options.force_output or len(nm_filtered) > 0:
        outfile = E.openOutputFile("nm", "w")
        outfile.write("NM\talignments\n")
        if len(nm_filtered) > 0:
            for x in range(0, max(nm_filtered.keys()) + 1):
                outfile.write("%i\t%i\n" % (x, nm_filtered[x]))
        else:
            outfile.write("0\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(nh_all) > 1:
        outfile = E.openOutputFile("nh_all", "w")
        outfile.write("NH\treads\n")
        if len(nh_all) > 0:
            writeNH(outfile, nh_all, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.mapped_reads))
        outfile.close()

    if options.force_output or len(nh_filtered) > 1:
        outfile = E.openOutputFile("nh", "w")
        outfile.write("NH\treads\n")
        if len(nh_filtered) > 0:
            writeNH(outfile, nh_filtered, max_hi)
        else:
            # assume all are unique if NH flag not set
            outfile.write("1\t%i\n" % (counter.filtered))
        outfile.close()

    if options.force_output or len(mapq_all) > 1:
        outfile = E.openOutputFile("mapq", "w")
        outfile.write("mapq\tall_reads\tfiltered_reads\n")
        for x in range(0, max(mapq_all.keys()) + 1):
            outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x]))
        outfile.close()

    # write footer and output benchmark information.
    E.Stop()
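
# A minimal pysam-based sketch of the kind of per-alignment counting that
# feeds the statistics above: tally flag categories and the NH (number of
# hits) tag. This is an independent illustration, not the _bam2stats
# implementation used by this script.
import collections
import pysam

def count_flags(bamfile):
    flags = collections.Counter()
    nh = collections.Counter()
    with pysam.AlignmentFile(bamfile, "rb") as inf:
        for read in inf:
            if read.is_unmapped:
                flags["unmapped"] += 1
                continue
            if read.is_duplicate:
                flags["duplicate"] += 1
            if read.is_secondary:
                flags["secondary"] += 1
            if read.has_tag("NH"):
                nh[read.get_tag("NH")] += 1
    return flags, nh

# flags, nh = count_flags("example.bam")  # hypothetical input file
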
Example #54
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        | awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    tmpfile1 = P.get_temp_filename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run(statement)

    if IOTools.is_empty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        IOTools.touch_file(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    os.unlink(tmpfile1)

    inf = IOTools.open_file("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([
        x[0] for x in cc.execute("""SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)
    ])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" %
           ((len(new_pseudos), len(known_pseudos),
             len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.open_file(outfile, "w")
    inf = GTF.iterator(IOTools.open_file(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
Example #55
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file.

    gtf-intergenic
       build workspace from intergenic segments in a gtf file.

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    returns a list of segments for each contig in a dictionary
    """

    if label == "none":
        label_f = lambda x, y: (("X", ), ("X", ))
        info_f = lambda x: None
    elif label == "direction":
        label_f = lambda x, y: ((("5", "3")[x], ), (("3", "5")[y], ))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GTF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the label
        # of the right-most entry is retained
        for contig in workspace.keys():
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)

                new_segs.append((last[1], this[0], label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append(
                    (start, end, label_f(info_f(r), info_f(r))))
    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append(
                (start, end, label_f(info_f(r), info_f(r))))

    else:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    return workspace
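
# A pure-Python sketch of the exon -> intron step used by the gtf-intronic
# builder above: merge overlapping exons, then take the gaps between
# consecutive merged intervals. A simplified stand-in for the
# Intervals.combine / Intervals.complement calls.
def introns_from_exons(exons):
    exons = sorted(exons)
    merged = [list(exons[0])]
    for start, end in exons[1:]:
        if start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [(a[1], b[0]) for a, b in zip(merged, merged[1:])]

# introns_from_exons([(10, 20), (15, 25), (40, 50)]) -> [(25, 40)]
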
Example #56
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    tmpfile_mito = P.get_temp_filename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run(statement)

    if IOTools.is_empty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        IOTools.touch_file(outfile)
        return

    format = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s", "pi",
              "C")

    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    # convert to gtf
    inf = IOTools.open_file("%s.links.gz" % outfile)
    outf = IOTools.open_file(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end, target_contig,
         target_strand, target_start, target_end, score, pid,
         alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
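
# Tiny sketch of the coordinate normalisation above: exonerate reports
# minus-strand target hits with start > end, while GTF entries always need
# start < end on the forward coordinate system. Illustrative helper only.
def normalise_interval(start, end, strand):
    if strand == "-":
        start, end = end, start
    assert start < end
    return start, end

# normalise_interval(5000, 4000, "-") -> (4000, 5000)
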
Example #57
def main(argv=None):
    '''
    main function
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2tsv.py 2887 2010-04-07 08:48:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--only-attributes",
        dest="only_attributes",
        action="store_true",
        help="output attributes as separate columns [default=%default].")
    parser.add_option(
        "-f",
        "--full",
        dest="full",
        action="store_true",
        help="output attributes as separate columns [default=%default].")
    parser.add_option(
        "-i",
        "--invert",
        dest="invert",
        action="store_true",
        help="convert tab-separated table back to gtf [default=%default].")
    parser.add_option(
        "-m",
        "--map",
        dest="map",
        type="choice",
        choices=("transcript2gene", "peptide2gene", "peptide2transcript"),
        help="output a map mapping transcripts to genes [default=%default].")

    parser.set_defaults(
        only_attributes=False,
        full=False,
        invert=False,
        map=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    if options.full:

        # output full table with column for each attribute
        attributes = set()
        data = []
        for gtf in GTF.iterator(options.stdin):
            data.append(gtf)
            attributes = attributes.union(set(gtf.keys()))

        # remove gene_id and transcript_id, as they are used
        # explicitly later
        attributes.difference_update(["gene_id", "transcript_id"])

        attributes = sorted(list(attributes))

        if options.only_attributes:
            header = ["gene_id", "transcript_id"] + attributes
        else:
            header = [
                "contig",
                "source",
                "feature",
                "start",
                "end",
                "score",
                "strand",
                "frame",
                "gene_id",
                "transcript_id",
            ] + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.only_attributes:
            for gtf in data:
                options.stdout.write("\t".join(
                    map(str, (
                        gtf.gene_id,
                        gtf.transcript_id,
                    ))))
                for a in attributes:
                    if a in ("gene_id", "transcript_id"):
                        continue
                    try:
                        val = getattr(gtf, a)
                    except AttributeError:
                        val = ""
                    options.stdout.write("\t%s" % val)
                options.stdout.write("\n")
        else:
            for gtf in data:
                options.stdout.write("\t".join(
                    map(str, (
                        gtf.contig,
                        gtf.source,
                        gtf.feature,
                        gtf.start,
                        gtf.end,
                        gtf.score,
                        gtf.strand,
                        gtf.frame,
                        gtf.gene_id,
                        gtf.transcript_id,
                    ))))
                for a in attributes:
                    try:
                        val = getattr(gtf, a)
                    except AttributeError:
                        val = ""
                    options.stdout.write("\t%s" % val)
                options.stdout.write("\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict([(y, x)
                                          for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # start is already 0-based in the tabular output
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))
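
# A minimal sketch of the tsv -> gtf inversion above: map column names to
# indices once from the header line, then address fields by name for each
# row. Hypothetical standalone illustration.
def parse_rows(lines):
    header = None
    for line in lines:
        if line.startswith("#"):
            continue
        fields = line.rstrip("\n").split("\t")
        if header is None:
            header = {name: i for i, name in enumerate(fields)}
            continue
        yield {name: fields[i] for name, i in header.items()}

# rows = parse_rows(["contig\tstart\tend\n", "chr1\t100\t200\n"])
# next(rows) -> {'contig': 'chr1', 'start': '100', 'end': '200'}
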
Example #58
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles, job_memory="4G"):
    '''output a bed file with functional annotations.

    The genomic region a gene covers is taken from the `gtffile`.
    There should only be one entry per gene, i.e. exons should
    have been combined into a gene territory.

    Each entry in the output bed file is a gene territory. Bed entries
    are labeled by functional annotations associated by that gene.

    Ambiguities in territories are resolved by outputting annotations
    for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.

    Arguments
    ---------
    gtffile : string
       ENSEMBL geneset in :term:`gtf` format.
    dbh : object
       Database handle to retrieve GO assignments for each gene
    outfiles : list
       Output filenames. The first is a :term:`bed` formatted file
       of gene territories. The second is a :term:`tsv` formatted
       table mapping GO terms to their description.

    '''
    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.open_file(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.get_temp_file(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write("\t".join(
                map(str, (contig, start, end, "%s:%s" %
                          (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name
    statement = '''sort -k1,1 -k2,2n  < %(tmpfname)s | uniq
    | gzip > %(outfile_bed)s'''

    P.run(statement, job_memory=job_memory)

    outf = IOTools.open_file(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.items():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
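
# Sketch of the join performed in buildGenomicFunctionalAnnotation above:
# gene territories keyed by gene_id are combined with (gene_id, go_id,
# description) assignments into bed-like rows, while counting genes that
# have no territory. Names are illustrative only.
def territories_to_bed(gene2region, assignments, prefix="go"):
    rows, notfound = [], 0
    for gene_id, go_id, description in assignments:
        if gene_id not in gene2region:
            notfound += 1
            continue
        contig, start, end, strand = gene2region[gene_id]
        rows.append((contig, start, end,
                     "%s:%s" % (prefix, go_id), 1, strand))
    return rows, notfound

# territories_to_bed({"g1": ("chr1", 0, 1000, "+")},
#                    [("g1", "GO:0008150", "biological_process")])
# -> ([("chr1", 0, 1000, "go:GO:0008150", 1, "+")], 0)
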
Example #59
def convert_hierarchy(first_gffs, second_gffs, options):
    ''' Converts GFF to GTF by parsing the hierarchy.

    First parses `first_gffs` to build the hierarchy, then iterates over
    `second_gffs`, calling the recursive function search_hierarchy to
    identify gene_ids and transcript_ids.

    If multiple gene_ids and transcript_ids are found, a record is output
    for each combination.

    If no definitive transcript_id is found and options.missing_gene is
    True, possible_transcript_id is used as the transcript_id; this is the
    ID one level below the entry used as gene_id. If that is also None
    (that is, there was only one level), transcript_id is set to gene_id.

    Raises ValueError if options.missing_gene is False and either no
    gene_id or no transcript_id was found for an entry.

    Raises RuntimeError if the recursion limit is reached because the
    input contains circular references. '''

    hierarchy = {}

    for gff in first_gffs:

        if not (options.parent == "Parent"):
            if options.parent in gff.asDict():
                gff['Parent'] = gff[options.parent].split(",")
            else:
                gff['Parent'] = []

        hierarchy[gff['ID']] = {
            "type": gff.feature,
            "Parent": gff.asDict().get("Parent", []),
            "gene_id": gff.attributes.get(
                options.gene_field_or_pattern, gff['ID']),
            "transcript_id": gff.attributes.get(
                options.transcript_field_or_pattern, gff['ID'])
        }

    for gff in second_gffs:

        if options.discard and (
            (options.missing_gene and options.parent not in gff) or
            (gff.feature in (options.gene_type, options.transcript_type))):

            continue

        gene_ids, transcript_ids, poss_transcript_ids = search_hierarchy(
            gff['ID'], hierarchy, options)

        assert len(gene_ids) > 0 and len(transcript_ids) > 0

        if options.missing_gene:

            transcript_ids = [
                poss if found is None else found
                for found, poss in zip(transcript_ids, poss_transcript_ids)
            ]

            transcript_ids = [
                gid if found is None else found
                for found, gid in zip(transcript_ids, gene_ids)
            ]

        elif None in transcript_ids:
            raise ValueError("failed to find transcript id for %s" % gff['ID'])

        for gene_id, transcript_id in zip(gene_ids, transcript_ids):

            gff.gene_id = gene_id
            gff.transcript_id = transcript_id

            gtf_entry = GTF.Entry()
            gtf_entry.copy(gff)
            if "Parent" in gtf_entry:
                gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

            options.stdout.write(str(gtf_entry) + "\n")
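
# A heavily simplified sketch of what the recursive search mentioned in the
# docstring might look like: walk "Parent" links upwards until an entry of
# the gene feature type is reached. search_hierarchy itself is not shown in
# this example, so this is an assumption-laden stand-in, not its actual
# implementation.
def find_gene_id(node_id, hierarchy, gene_type="gene", depth=0):
    if depth > 100:  # guard against circular Parent references
        raise RuntimeError("circular reference in hierarchy?")
    node = hierarchy[node_id]
    if node["type"] == gene_type:
        return node["gene_id"]
    for parent in node["Parent"]:
        found = find_gene_id(parent, hierarchy, gene_type, depth + 1)
        if found is not None:
            return found
    return None

# hierarchy = {"gene1": {"type": "gene", "Parent": [], "gene_id": "gene1"},
#              "tr1": {"type": "mRNA", "Parent": ["gene1"], "gene_id": "tr1"}}
# find_gene_id("tr1", hierarchy) -> "gene1"
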
Example #60
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % (x + 1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        for x, (start, end) in enumerate(regulons):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
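
# Standalone sketch of the strand-aware interval arithmetic above: extend a
# transcript's reference point by `upstream`/`downstream` bases respecting
# orientation, then clamp to the contig. Illustrative helper only.
def regulon_interval(mi, ma, upstream, downstream, negative_strand, lcontig):
    if negative_strand:
        interval = (ma - downstream, ma + upstream)
    else:
        interval = (mi - upstream, mi + downstream)
    return (min(lcontig, max(0, interval[0])),
            min(lcontig, max(0, interval[1])))

# regulon_interval(1000, 5000, 2000, 500, False, 100000) -> (0, 1500)
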