Esempio n. 1
0
def getGeneTable(reffile):
    E.info("Loading reference")
    table = defaultdict(dict)
    for ens_gene in GTF.gene_iterator(GTF.iterator(
            IOTools.open_file(reffile))):
        geneid = ens_gene[0][0].gene_id
        table[geneid]["models"] = dict()
        table[geneid]["start_codons"] = defaultdict(list)

        for transcript in ens_gene:

            transcript_id = transcript[0].transcript_id
            table[geneid]["models"][transcript_id] = transcript

            CDS = GTF.asRanges(transcript, "start_codon")
            if len(CDS) == 0:
                continue

            if transcript[0].strand == "-":
                start_codon = max(e[1] for e in CDS)
            else:
                start_codon = min(e[0] for e in CDS)

            table[geneid]["start_codons"][start_codon].append(transcript_id)

    E.info("Reference Loaded")
    return table
Esempio n. 2
0
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (args, unknown) = E.start(parser,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 0:
        files = [args.stdin]
    else:
        files = args

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        if f == args.stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        counters = []
        if args.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        c = counters[-1]
        for x in c:
            pass

        for c in counters:
            args.stdout.write("\t%s" % str(c))
        args.stdout.write("\n")

        if infile != sys.stdin:
            infile.close()

    E.stop()
Esempio n. 3
0
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.items():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, noverlapping=%i\n" %
                             (ninput, noutput, noverlapping))
Esempio n. 4
0
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source are annotated``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript
                          ]), max([x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if tts is directly at start/end of contig, the tss will
            # be within an exon.  otherwise, it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append((min(ma, lcontig - options.promotor),
                            min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
                             (ngenes, ntranscripts, npromotors))
def getTranscript2GeneMap(outfile):
    ''' Extract a 1:1 map of transcript_id to gene_id from the geneset '''

    iterator = GTF.iterator(IOTools.open_file(PARAMS['geneset']))
    transcript2gene_dict = {}

    for entry in iterator:

        try:
            gene_id = entry[PARAMS["gene_id_field"]]
        except KeyError:
            gene_id = entry.gene_id

        try:
            transcript_id = entry[PARAMS["transcript_id_field"]]
        except KeyError:
            transcript_id = entry.transcript_id

        # Check the same transcript_id is not mapped to multiple gene_ids!
        if transcript_id in transcript2gene_dict:
            if not gene_id == transcript2gene_dict[transcript_id]:
                raise ValueError(
                    '''multipe gene_ids associated with
                the same transcript_id %s %s''' %
                    (gene_id, transcript2gene_dict[transcript_id]))
        else:
            transcript2gene_dict[transcript_id] = gene_id

    with IOTools.open_file(outfile, "w") as outf:
        outf.write("transcript_id\tgene_id\n")
        for key, value in sorted(transcript2gene_dict.items()):
            outf.write("%s\t%s\n" % (key, value))
Esempio n. 6
0
def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in iotools.open_file(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = iotools.open_file(gtf)
    G = GTF.iterator(gfile)

    out = iotools.open_file(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))
    out.close()
    gfile.close()
Esempio n. 7
0
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = iotools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | cgat gtf2gtf
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s | cgat gtf2gtf
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()
Esempio n. 8
0
    def _iterator(iterator):
        """yield gene plus the locations of the end of the previous gene and
        start of next gene"""

        last_end, prev_end = 0, 0
        last_contig = None
        last = None
        for matches in GTF.iterator_overlaps(iterator):

            this_start = min([x.start for x in matches])
            this_end = max([x.end for x in matches])

            if method == "tss":
                # restrict to tss
                if matches[0].strand == "+":
                    this_end = this_start + 1
                else:
                    this_start = this_end - 1

            this_contig = matches[0].contig

            if last_contig != this_contig:
                if last:
                    yield prev_end, last, fasta.getLength(last_contig)
                last_end, prev_end = 0, 0
            else:
                yield prev_end, last, this_start

            prev_end = last_end
            last_end = this_end
            last = matches
            last_contig = this_contig

        if last:
            yield prev_end, last, fasta.getLength(last_contig)
Esempio n. 9
0
    def update(self, bed):

        # convert to a gtf entry
        gtf = GTF.Entry()

        gtf.fromBed(bed)
        gtf.feature = 'exon'
        GeneModelAnalysis.Classifier.update(self, [gtf])
Esempio n. 10
0
 def _add(interval, anno):
     gtf = GTF.Entry()
     gtf.contig = transcript[0].contig
     gtf.gene_id = transcript[0].gene_id
     gtf.transcript_id = transcript[0].transcript_id
     gtf.strand = transcript[0].strand
     gtf.feature = anno
     gtf.start, gtf.end = interval
     results.append(gtf)
Esempio n. 11
0
    def test_entry(frame, strand, xfrom, xto, start, end, ref):

        entry = GTF.Entry()
        entry.frame = frame
        entry.strand = strand
        entry.start = xfrom
        entry.end = xto

        intervals = transform_third_codon(start, end, [(xfrom, xto, entry)])
        if ref != intervals:
            print("failed:", ref != intervals)
Esempio n. 12
0
    def buildIndex(self, filename):
        """read and index."""

        idx = {}
        infile = iotools.open_file(filename, "r")
        for e in GTF.readFromFile(infile):
            if e.contig not in idx:
                idx[e.contig] = NCL.NCLSimple()
            idx[e.contig].add(e.start, e.end)
        infile.close()
        return idx
def buildRepeatTrack(infile, outfile):
    '''build a repeat track as negative control.'''

    nrepeats = 0
    for gff in GTF.iterator(gzip.open(infile, "r")):
        nrepeats += 1
    sample = set(
        random.sample(range(nrepeats), PARAMS["ancestral_repeats_samplesize"]))

    outf = gzip.open(outfile, "w")
    gtf = GTF.Entry()
    for x, gff in enumerate(GTF.iterator(gzip.open(infile, "r"))):
        if x not in sample:
            continue
        gtf.fromGTF(gff, "%08i" % x, "%08i" % x)
        outf.write("%s\n" % str(gtf))
    outf.close()

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
Esempio n. 14
0
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = iotools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" %
                   (transcript[0].transcript_id, transcript[0].gene_id,
                    transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
Esempio n. 15
0
def convert_set(gffs, gene_pattern, transcript_pattern, options):
    ''' creates the gene_id and transcript_id fields from a string format pattern using
    fields of the gff. '''

    for gff in gffs:

        gff.gene_id = str(gene_pattern) % gff.asDict()
        gff.transcript_id = str(gene_pattern) % gff.asDict()

        gtf_entry = GTF.Entry()

        gtf_entry.copy(gff)
        if "Parent" in gtf_entry:
            gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

        options.stdout.write(str(gtf_entry) + "\n")
Esempio n. 16
0
def extractEnsemblLincRNA(infile, outfile):
    tmpf = P.getTempFile("/ifs/scratch")
    for gtf in GTF.iterator(iotools.openFile(infile)):
        if gtf.source == "lincRNA":
            tmpf.write(str(gtf) + "\n")
        else:
            continue
    tmpf.close()
    tmpf = tmpf.name

    statement = ("cat %(tmpf)s |"
                 " cgat gtf2gtf"
                 "  --method=sort --sort-order=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip > %(outfile)s")
    P.run()

    os.unlink(tmpf)
Esempio n. 17
0
def transcript2bed12(transcript):

    new_entry = Bed.Bed()
    start = min(entry.start for entry in transcript)
    end = max(entry.end for entry in transcript)

    try:
        thickStart = min(entry.start for entry in transcript
                         if entry.feature == "CDS")
        thickEnd = max(entry.end for entry in transcript
                       if entry.feature == "CDS")
    except ValueError:

        # if there is no CDS, then set first base of transcript as
        # start

        if transcript[0].strand == "-":
            thickStart = end
            thickEnd = end
        else:
            thickStart = start
            thickEnd = start

    exons = GTF.asRanges(transcript, "exon")

    exon_starts = [es - start for (es, ee) in exons]
    exon_lengths = [ee - es for (es, ee) in exons]
    exon_count = len(exons)
    new_entry.contig = transcript[0].contig
    new_entry.start = start
    new_entry.end = end
    new_entry["strand"] = transcript[0].strand
    new_entry["name"] = transcript[0].transcript_id

    new_entry["thickStart"] = thickStart
    new_entry["thickEnd"] = thickEnd

    new_entry["blockCount"] = exon_count
    new_entry["blockStarts"] = ",".join(map(str, exon_starts))
    new_entry["blockSizes"] = ",".join(map(str, exon_lengths))

    return new_entry
def get_contigs(infile, outfile):
    '''Generate a pseudo-contigs file from the geneset, where the length of 
    each contigs is determined by the GTF entry with the highest end coordinate.
    Will not stop things going off the end on contigs, but that doesn't really
    matter for our purposes'''

    last_contig = None
    max_end = 0
    outlines = []
    for entry in GTF.iterator(iotools.open_file(infile)):

        if last_contig and entry.contig != last_contig:
            outlines.append([entry.contig, str(max_end)])
            max_end = 0

        max_end = max(max_end, entry.end)
        last_contig = entry.contig

    outlines.append([last_contig, str(max_end)])
    iotools.write_lines(outfile, outlines, header=None)
Esempio n. 19
0
    def _count(self, filename, idx):

        overlapping_genes = set()
        genes = set()
        # iterate over exons
        infile = iotools.open_file(filename, "r")
        it = GTF.iterator(infile)

        nexons, nexons_overlapping = 0, 0
        nbases, nbases_overlapping = 0, 0
        for this in it:
            nexons += 1
            nbases += this.end - this.start
            genes.add(this.gene_id)

            try:
                intervals = list(idx[this.contig].find(this.start, this.end))
            except KeyError:
                continue

            if len(intervals) == 0:
                continue

            overlapping_genes.add(this.gene_id)
            nexons_overlapping += 1
            start, end = this.start, this.end
            counts = numpy.zeros(end - start, numpy.int)
            for other_start, other_end, other_value in intervals:
                for x in range(
                        max(start, other_start) - start,
                        min(end, other_end) - start):
                    counts[x] += 1
            nbases_overlapping += sum([1 for x in counts if x > 0])

        infile.close()

        return len(genes), len(
            overlapping_genes
        ), nexons, nexons_overlapping, nbases, nbases_overlapping
Esempio n. 20
0
def addSegment(feature, start, end, template, options):
    """add a generic segment of type *feature*.
    """
    if start >= end:
        return 0

    entry = GTF.Entry()

    if isinstance(template, tuple):
        entry.copy(template[0])
        entry.clearAttributes()
        entry.addAttribute("downstream_gene_id", template[1].gene_id)
    else:
        entry.copy(template)
        entry.clearAttributes()

    entry.start, entry.end = start, end
    entry.feature = feature
    if feature not in ("exon", "CDS", "UTR", "UTR3", "UTR5"):
        entry.score = "."
    options.stdout.write(str(entry) + "\n")

    return 1
def annotate(infile, annotation_file, outfile):
    '''
    annotate infile with annotations from
    annotation gtf file
    '''
    inf = open(infile)
    header = inf.readline()
    include = set()

    E.info("reading genes to keep")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        include.add(gene_id)

    E.info("reading annotations file")
    annotations = {}
    for gtf in GTF.iterator(iotools.openFile(annotation_file)):
        if gtf.gene_id in include:
            annotations[gtf.gene_id] = \
                [gtf.gene_name, gtf.species, gtf.description]

    inf = open(infile)
    header = inf.readline()

    E.info("writing results with annotations")
    outf = open(outfile, "w")
    outf.write(
        header.strip("\n") + "\tgene_name\tspecies_centroid\tdescription\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        try:
            outf.write("\t".join(data + annotations[gene_id]) + "\n")
        except KeyError:
            outf.write("\t".join(data + ["NA", "NA", "NA"]) + "\n")
    outf.close()
Esempio n. 22
0
    def _count(self, filename, idx):

        overlapping_genes = set()
        genes = set()
        # iterate over exons
        infile = iotools.open_file(filename, "r")
        it = GTF.iterator(infile)

        for this in it:
            genes.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(this.start, this.end)
            except KeyError:
                continue

            if len(intervals) == 0:
                continue

            overlapping_genes.add(this.gene_id)

        infile.close()

        return genes, overlapping_genes
Esempio n. 23
0
def getRefSeqFromUCSC(dbhandle, outfile, remove_duplicates=False):
    '''get refseq gene set from UCSC database and save as :term:`gtf`
    formatted file.

    Matches to ``chr_random`` are ignored (as does ENSEMBL).

    Note that this approach does not work as a gene set, as refseq
    maps are not real gene builds and unalignable parts cause
    differences that are not reconcilable.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`gtf` format. The filename
       aims to be close to the ENSEMBL gtf format.
    remove_duplicate : bool
       If True, duplicate mappings are removed.

    '''

    duplicates = set()

    if remove_duplicates:
        cc = dbhandle.execute("""SELECT name, COUNT(*) AS c FROM refGene
        WHERE chrom NOT LIKE '%_random'
        GROUP BY name HAVING c > 1""")
        duplicates = set([x[0] for x in cc.fetchall()])
        E.info("removing %i duplicates" % len(duplicates))

    # these are forward strand coordinates
    statement = '''
    SELECT gene.name, link.geneName, link.name, gene.name2, product,
    protAcc, chrom, strand, cdsStart, cdsEnd,
    exonCount, exonStarts, exonEnds, exonFrames
    FROM refGene as gene, refLink as link
    WHERE gene.name = link.mrnaAcc
    AND chrom NOT LIKE '%_random'
    ORDER by chrom, cdsStart
    '''

    outf = iotools.open_file(outfile, "w")

    cc = dbhandle.execute(statement)

    SQLResult = collections.namedtuple(
        'Result', '''transcript_id, gene_id, gene_name, gene_id2, description,
        protein_id, contig, strand, start, end,
        nexons, starts, ends, frames''')

    counts = E.Counter()
    counts.duplicates = len(duplicates)

    for r in map(SQLResult._make, cc.fetchall()):

        if r.transcript_id in duplicates:
            continue

        starts = list(map(int, r.starts.split(",")[:-1]))
        ends = list(map(int, r.ends.split(",")[:-1]))
        frames = list(map(int, r.frames.split(",")[:-1]))

        gtf = GTF.Entry()
        gtf.contig = r.contig
        gtf.source = "protein_coding"
        gtf.strand = r.strand
        gtf.gene_id = r.gene_id
        gtf.transcript_id = r.transcript_id
        gtf.addAttribute("protein_id", r.protein_id)
        gtf.addAttribute("transcript_name", r.transcript_id)
        gtf.addAttribute("gene_name", r.gene_name)

        assert len(starts) == len(ends) == len(frames)

        if gtf.strand == "-":
            starts.reverse()
            ends.reverse()
            frames.reverse()

        counts.transcripts += 1
        i = 0
        for start, end, frame in zip(starts, ends, frames):
            gtf.feature = "exon"
            counts.exons += 1
            i += 1
            gtf.addAttribute("exon_number", i)
            # frame of utr exons is set to -1 in UCSC
            gtf.start, gtf.end, gtf.frame = start, end, "."
            outf.write("%s\n" % str(gtf))

            cds_start, cds_end = max(r.start, start), min(r.end, end)
            if cds_start >= cds_end:
                # UTR exons have no CDS
                # do not expect any in UCSC
                continue
            gtf.feature = "CDS"
            # invert the frame
            frame = (3 - frame % 3) % 3
            gtf.start, gtf.end, gtf.frame = cds_start, cds_end, frame
            outf.write("%s\n" % str(gtf))

    outf.close()

    E.info("%s" % str(counts))
Esempio n. 24
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--output-equivalent",
                      dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f",
                      "--output-full",
                      dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

    idx, genes2 = {}, set()
    for e in GTF.readFromFile(iotools.open_file(input_filename2, "r")):
        genes2.add(e.gene_id)
        if e.contig not in idx:
            idx[e.contig] = quicksect.IntervalTree()
        idx[e.contig].add(e.start, e.end, e)

    overlaps_genes = []

    E.info("reading data finished: %i contigs" % len(idx))

    # outfile_diff and outfile_overlap not implemented
    # outfile_diff = getFile( options, "diff" )
    # outfile_overlap = getFile( options, "overlap" )
    overlapping_genes = set()

    genes1 = set()

    # iterate over exons
    with iotools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):

            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(
                    quicksect.Interval(this.start, this.end))
            except KeyError:
                continue

            others = [x.data for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    # if outfile_diff != options.stdout: outfile_diff.close()
    # if outfile_overlap != options.stdout: outfile_overlap.close()

    outfile = None
    ##################################################################
    ##################################################################
    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        outfile = getFile(options, "genes_uniq1")
        b = set([x[0] for x in overlapping_genes])
        d = genes1.difference(b)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename1), len(genes1), len(b),
             100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(genes1)))

        outfile = getFile(options, "genes_uniq2")
        b = set([x[1] for x in overlapping_genes])
        d = genes2.difference(b)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()

        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename2), len(genes2), len(b),
             100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(genes2)))
        if outfile_total != options.stdout:
            outfile_total.close()

    E.stop()
Esempio n. 25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-i",
        "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help=
        "Ignore transcripts on contigs that are not in the genome-file [default=%default]."
    )

    parser.add_option(
        "--min-intron-length",
        dest="min_intron_length",
        type="int",
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default]."
    )

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.stop()
Esempio n. 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(
            iotools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = iotools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
Esempio n. 27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      action="append",
                      help="features to collect "
                      "[default=%default]")

    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size.  "
                      "[default=%default]")

    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=(
                          "genomic",
                          "histogram",
                      ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []

            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source,
                   entry.feature)] += entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        total_genome_size = sum(
            fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
Esempio n. 28
0
def processChunk(contig, chunk, options, fasta=None):
    """
    This function requires segments to be non-overlapping.
    """

    if len(chunk) == 0:
        return

    # check whether there are overlapping features or not
    checked = []
    for feature in chunk:
        checked.append(feature)
        others = [x for x in chunk if x not in checked]
        for otherFeature in others:
            if GTF.Overlap(feature, otherFeature):
                raise ValueError(" Histogram could not be created"
                                 " since the file contains overlapping "
                                 "features! \n%s\n%s  " %
                                 (feature, otherFeature))
    # clear auxiliary list
    del checked[:]

    # compute max_coordinate for the histogram
    max_coordinate = max([x.end for x in chunk])
    # compute window size
    if options.window_size:
        window_size = options.window_size
        num_bins = int(math.ceil((float(max_coordinate) / window_size)))
    elif options.num_bins and fasta:
        contig_length = fasta.getLength(contig)
        assert max_coordinate <= contig_length, (
            "maximum coordinate (%i) "
            "larger than contig size (%i)"
            " for contig %s" % (max_coordinate, contig_length, contig))
        max_coordinate = contig_length
        window_size = int(math.floor(float(contig_length) / options.num_bins))
        num_bins = options.num_bins
    else:
        raise ValueError("please specify a window size of provide "
                         "genomic sequence with number of bins.")

    values = [[] for x in range(num_bins)]

    # do several parses for each feature, slow, but easier to code
    # alternatively: sort by feature and location.
    for feature in options.features:
        total = 0
        bin = 0
        end = window_size
        for entry in chunk:
            if entry.feature != feature:
                continue

            while end < entry.start:
                values[bin].append(total)
                bin += 1
                end += window_size

            while entry.end > end:
                seg_start = max(entry.start, end - window_size)
                seg_end = min(entry.end, end)
                total += seg_end - seg_start
                values[bin].append(total)
                end += window_size
                bin += 1
            else:
                seg_start = max(entry.start, end - window_size)
                seg_end = min(entry.end, end)
                total += seg_end - seg_start

        while bin < num_bins:
            values[bin].append(total)
            bin += 1

    printValues(contig, max_coordinate, window_size, values, options)
Esempio n. 29
0
def main(argv=None):
    '''
    main function
    '''

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output-only-attributes", dest="only_attributes",
        action="store_true",
        help="output only attributes as separate columns "
        "[default=%default].")

    parser.add_option(
        "-f", "--attributes-as-columns", dest="output_full",
        action="store_true",
        help="output attributes as separate columns "
        "[default=%default].")

    parser.add_option("--is-gff3", dest="is_gtf", action="store_false",
                      help="input file is in gtf format [default=%default] ")

    parser.add_option(
        "-i", "--invert", dest="invert", action="store_true",
        help="convert tab-separated table back to gtf "
        "[default=%default].")

    parser.add_option(
        "-m", "--output-map", dest="output_map", type="choice",
        choices=(
            "transcript2gene",
            "peptide2gene",
            "peptide2transcript"),
        help="output a map mapping transcripts to genes "
        "[default=%default].")

    parser.set_defaults(
        only_attributes=False,
        output_full=False,
        invert=False,
        output_map=None,
        is_gtf=True
    )

    (options, args) = E.start(parser, argv=argv)

    if options.output_full:
        # output full table with column for each attribute

        attributes = set()
        data = []
        if options.is_gtf:
            for gtf in GTF.iterator(options.stdin):
                data.append(gtf)
                attributes = attributes.union(set(gtf.keys()))

        else:
            for gff in GFF3.iterator_from_gff(options.stdin):
                data.append(gff)
                attributes = attributes.union(set(gff.attributes))

        # remove gene_id and transcript_id, as they are used
        # explicitely later
        attributes.difference_update(["gene_id", "transcript_id"])

        attributes = sorted(list(attributes))

        # Select whether gtf of gff for output columns
        if options.is_gtf:
            if options.only_attributes:
                header = ["gene_id", "transcript_id"] + attributes
            else:
                header = ["contig", "source", "feature",
                          "start", "end", "score", "strand",
                          "frame", "gene_id",
                          "transcript_id", ] + attributes
        else:
            if options.only_attributes:
                header = attributes
            else:
                header = ["contig", "source", "feature",
                          "start", "end", "score", "strand",
                          "frame"] + attributes

        attributes_new = header

        options.stdout.write("\t".join(header) + "\n")

        if options.is_gtf:
            for gtf in data:
                first = True
                for a in attributes_new:
                    try:
                        val = getattr(gtf, a)
                    except (AttributeError, KeyError):
                        val = ""
                    if first:
                        options.stdout.write("%s" % val)
                        first = False
                    else:
                        options.stdout.write("\t%s" % val)
                options.stdout.write("\n")
        else:
            for gff in data:
                options.stdout.write(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t") % (gff.contig,
                                                                             gff.source, gff.feature, gff.start, gff.end,
                                                                             gff.score, gff.strand, gff.frame))

                first = True
                for a in attributes:
                    try:
                        val = (gff.attributes[a])
                    except (AttributeError, KeyError):
                        val = ''
                    if first:
                        options.stdout.write("%s" % val)
                        first = False
                    else:
                        options.stdout.write("\t%s" % val)
                options.stdout.write("\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"):
                continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict(
                    [(y, x) for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # subtract -1 to start for 0-based coordinates
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError as msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            if gtf.frame is None:
                gtf.frame = "."
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))

    elif options.output_map:

        if options.output_map == "transcript2gene":
            fr = lambda x: x.transcript_id
            to = lambda x: x.gene_id
            options.stdout.write("transcript_id\tgene_id\n")
        elif options.output_map == "peptide2gene":
            fr = lambda x: x.protein_id
            to = lambda x: x.gene_id
            options.stdout.write("peptide_id\tgene_id\n")
        elif options.output_map == "peptide2transcript":
            fr = lambda x: x.protein_id
            to = lambda x: x.transcript_id
            options.stdout.write("peptide_id\ttranscript_id\n")

        map_fr2to = {}
        for gtf in GTF.iterator(options.stdin):
            try:
                map_fr2to[fr(gtf)] = to(gtf)
            except (AttributeError, KeyError):
                pass

        for x, y in sorted(map_fr2to.items()):
            options.stdout.write("%s\t%s\n" % (x, y))
    else:
        header = ("contig", "source", "feature", "start", "end", "score",
                  "strand", "frame", "gene_id", "transcript_id", "attributes")
        options.stdout.write("\t".join(header) + "\n")

        for gtf in GTF.iterator(options.stdin):
            attributes = []
            for a in list(gtf.keys()):
                if a in ("gene_id", "transcript_id"):
                    continue
                attributes.append('%s %s' % (a, GTF.quote(gtf[a])))

            attributes = "; ".join(attributes)

            # Capture if None and set to . format
            if gtf.frame is None:
                gtf.frame = "."

            options.stdout.write(str(gtf) + "\n")

    E.stop()
Esempio n. 30
0
def convert_hierarchy(first_gffs, second_gffs, options):
    ''' Converts GFF to GTF by parsing the hierarchy.
    First parses :param:first_gffs to build the hierarchy then iterates over second_gffs
    using a call to the recursive function search_hierarchy to identify gene_ids and transcript_ids.

    If multiple gene and transcript_ids are found outputs a record for each combination.

    If no definitive transcript_id is found and options.missing_gene is True, it will use the
    possible_transcript_id as transcript_id, which is the ID one level below the entry used as gene_id.
    If this is also None (that is there was only on level), sets transcript_id to gene_id.

    Might raise ValueError if options.missing_gene is false and either no gene or no transcript_id
    was found for an entry.

    Might raise RuntimeError if the recursion limit was reached because the input contains circular
    references. '''

    hierarchy = {}

    for gff in first_gffs:

        if not (options.parent == "Parent"):
            if options.parent in gff.asDict():
                gff['Parent'] = gff[options.parent].split(",")
            else:
                gff['Parent'] = []

        hierarchy[gff['ID']] = {
            "type":
            gff.feature,
            "Parent":
            gff.asDict().get("Parent", []),
            "gene_id":
            gff.attributes.get(options.gene_field_or_pattern, gff['ID']),
            "transcript_id":
            gff.attributes.get(options.transcript_field_or_pattern, gff['ID'])
        }

    for gff in second_gffs:

        if options.discard and (
            (options.missing_gene and options.parent not in gff) or
            (gff.feature in (options.gene_type, options.transcript_type))):

            continue

        gene_ids, transcript_ids, poss_transcript_ids = search_hierarchy(
            gff['ID'], hierarchy, options)

        assert len(gene_ids) > 0 and len(transcript_ids) > 0

        if options.missing_gene:

            transcript_ids = [
                poss if found is None else found
                for found, poss in zip(transcript_ids, poss_transcript_ids)
            ]

            transcript_ids = [
                gid if found is None else found
                for found, gid in zip(transcript_ids, gene_ids)
            ]

        elif None in transcript_ids:
            raise ValueError("failed to find transcript id for %s" % gff['ID'])

        for gene_id, transcript_id in zip(gene_ids, transcript_ids):

            gff.gene_id = gene_id
            gff.transcript_id = transcript_id

            gtf_entry = GTF.Entry()
            gtf_entry.copy(gff)
            if "Parent" in gtf_entry:
                gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

            options.stdout.write(str(gtf_entry) + "\n")