def read_annotations(self):
        """Build an ``Annotations`` object from the file's ``/annotations`` group.

        Returns an empty ``Annotations`` when ``self.h5file`` has no
        ``/annotations`` entry.  Otherwise the result carries:

        * ``offset`` -- the group's ``offset`` attribute, or
          ``(0.0, 0.0, 0.0)`` when that attribute is absent;
        * one annotation per index of the ``ids`` / ``types`` / ``locations``
          datasets;
        * the stored comments, when a ``comments`` entry exists;
        * the pre-/post-synaptic partner pairs, when
          ``presynaptic_site/partners`` exists.
        """

        result = Annotations()

        if "/annotations" not in self.h5file:
            return result

        attributes = self.h5file["/annotations"].attrs
        if "offset" in attributes:
            result.offset = attributes["offset"]
        else:
            result.offset = (0.0, 0.0, 0.0)

        # The three datasets are read in parallel: the same index is used
        # for the id, the type and the location of one annotation.
        annotation_ids = self.h5file["/annotations/ids"]
        annotation_types = self.h5file["/annotations/types"]
        annotation_locations = self.h5file["/annotations/locations"]
        for index in range(len(annotation_ids)):
            result.add_annotation(
                annotation_ids[index],
                annotation_types[index],
                annotation_locations[index])

        if "comments" in self.h5file["/annotations"]:
            targets = self.h5file["/annotations/comments/target_ids"]
            texts = self.h5file["/annotations/comments/comments"]
            for target, text in zip(targets, texts):
                result.add_comment(target, text)

        if "presynaptic_site/partners" in self.h5file["/annotations"]:
            partners = self.h5file["/annotations/presynaptic_site/partners"]
            for pre_id, post_id in partners:
                result.set_pre_post_partners(pre_id, post_id)

        return result
Beispiel #2
0
def getAnnotations(opt):
    """Return the reference-transcript annotation list named by ``opt.gtf``.

    ``opt.format`` selects the loader: ``'pickle'`` reads a pickled list,
    ``'alt'`` parses the file with ``altFormat=True``, and any other value
    parses it in the standard format.  The whole list is kept in memory.
    """
    if opt.format == 'pickle':
        return anno.AnnotationList.fromPickle(opt.gtf)
    if opt.format == 'alt':
        return anno.AnnotationList(opt.gtf, altFormat=True)
    # standard format
    return anno.AnnotationList(opt.gtf)
Beispiel #3
0
def get_tokens(ns, root, annotation_type, text):
    """Return ``Annotations.Token`` objects for one kind of token annotation.

    ``annotation_type`` must be one of ``"word"``, ``"punctuation"``,
    ``"symbol"``, ``"number"`` or ``"contraction"``; it selects the element
    tag that is looked up with ``root.findall(tag, ns)``.  Each token is
    built from the element's ``begin``/``end`` attributes (converted to
    ``int``), the slice ``text[begin:end]`` and its ``partOfSpeech``
    attribute.

    Raises:
        ValueError: if ``annotation_type`` is not one of the names above.
    """
    tag_for_type = (
        ("word", 'syntax:WordToken'),
        ("punctuation", 'syntax:PunctuationToken'),
        ("symbol", 'syntax:SymbolToken'),
        ("number", 'syntax:NumToken'),
        ("contraction", 'syntax:ContractionToken'),
    )
    tag = next(
        (name for kind, name in tag_for_type if annotation_type == kind),
        None)
    if tag is None:
        raise ValueError("Invalid annotation type.")

    tokens = []
    for node in root.findall(tag, ns):
        start = int(node.get('begin'))
        stop = int(node.get('end'))
        part_of_speech = node.get('partOfSpeech')
        tokens.append(
            Annotations.Token(start, stop, text[start:stop], part_of_speech))
    return tokens
Beispiel #4
0
def getGeneFromAnnotation(opt, tranList, exonList):
    '''Add to lists of transcripts and exons: annotations for gene of interest.

    Reads the annotation file ``opt.gtf`` (loader chosen by ``opt.format``),
    looks up ``opt.gene`` and appends one ``Transcript`` per child of the
    first matching gene to ``tranList`` and one ``Exon`` per child of each
    transcript to ``exonList``.  Transcripts whose name appears in the
    comma-separated ``opt.omit`` are skipped.

    Both lists are modified in place and also returned as
    ``(tranList, exonList)``.  When ``opt.gtf`` is None the lists are
    returned untouched.

    Raises:
        RuntimeError: if ``opt.gene`` is not in the annotation file.
    '''

    if opt.gtf is None:
        return tranList, exonList

    # transcripts which must not be included
    omits = [] if opt.omit is None else opt.omit.split(',')

    if opt.format == 'pickle':
        annotList = anno.AnnotationList.fromPickle(opt.gtf)
    elif opt.format == 'alt':
        annotList = anno.AnnotationList(opt.gtf, altFormat=True)
    else:     # standard format
        annotList = anno.AnnotationList(opt.gtf)

    allGenes = annotList.getGeneDict()
    if opt.gene not in allGenes:
        raise RuntimeError('gene %s is not in the annotation file' % opt.gene)
    geneList = allGenes[opt.gene]       # a list of Annotation objects
    if len(geneList) > 1:
        logger.warning('gene %s appears %d times in annotations, first occurrence plotted'
                       % (opt.gene, len(geneList)))
    myGene = geneList[0]

    for tran in myGene.getChildren():       # tran is an Annotation object

        if tran.name in omits:              # in ignore list
            continue

        myTran = Transcript(tran.name, annot=True)

        # Codon positions are optional on the annotation; copy them only
        # when they are present.
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon

        for exon in tran.getChildren():     # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end, exon.strand)     # no Q score
            if hasattr(exon, 'polyAs'):
                # Diagnostic goes to the log rather than to stdout.
                logger.debug('exon %s has polyA annotations' % exon.name)
                myExon.polyAs = exon.polyAs
            exonList.append(myExon)
            myTran.exons.append(myExon)

        tranList.append(myTran)

    return tranList, exonList
Beispiel #5
0
def getGeneFromAnnotation(opt, tranList, exonList):
    """Add to lists of transcripts and exons: annotations for gene of interest.

    The annotation list is taken from ``opt.annotations`` when that is set,
    otherwise it is loaded from ``opt.gtf`` using the loader selected by
    ``opt.format``.  ``opt.gene`` is looked up by its exact name first; if
    that fails it is compared against the upper-cased gene names.  One
    ``Transcript`` per child of the first matching gene is appended to
    ``tranList`` and one ``Exon`` per child of each transcript to
    ``exonList``.

    Both lists are modified in place and also returned as
    ``(tranList, exonList)``.  When ``opt.gtf`` is None the lists are
    returned untouched.

    Raises:
        RuntimeError: if ``opt.gene`` is not in the annotation file.
    """
    if opt.gtf is None:
        return tranList, exonList
    if opt.annotations:
        annotList = opt.annotations
    else:
        if opt.format == 'pickle':
            annotList = anno.AnnotationList.fromPickle(opt.gtf)
        elif opt.format == 'alt':
            annotList = anno.AnnotationList(opt.gtf, altFormat=True)
        else:  # standard format
            annotList = anno.AnnotationList(opt.gtf)
    allGenes = annotList.getGeneDict()
    if opt.gene in allGenes:
        # An exact match always wins over an upper-cased alias.
        geneList = allGenes[opt.gene]  # a list of Annotation objects
    else:
        # Build the upper-cased aliases in a separate dict so the dict
        # handed out by the annotation list is not modified.
        upperGenes = {k.upper(): v for k, v in allGenes.items()}
        if opt.gene not in upperGenes:
            raise RuntimeError(
                'gene %s is not in the annotation file' % opt.gene)
        geneList = upperGenes[opt.gene]
    if len(geneList) > 1:
        logger.warning(
            'gene %s appears %d times in annotations, first occurrence plotted'
            % (opt.gene, len(geneList)))
    myGene = geneList[0]
    for tran in myGene.getChildren():  # tran is an Annotation object
        myTran = Transcript(tran.name,
                            start=tran.start,
                            end=tran.end,
                            annot=True,
                            ID=tran.ID,
                            source=(0, opt.gtf))
        # Codon positions are optional on the annotation; copy them only
        # when they are present.
        if hasattr(tran, 'startcodon'):
            myTran.startcodon = tran.startcodon
        if hasattr(tran, 'stopcodon'):
            myTran.stopcodon = tran.stopcodon
        for exon in tran.getChildren():  # exon is an Annotation object
            myExon = Exon(myTran, exon.name, exon.start, exon.end,
                          exon.strand)  # no Q score
            if hasattr(exon, 'polyAs'):
                myExon.polyAs = exon.polyAs
            exonList.append(myExon)
            myTran.exons.append(myExon)
        tranList.append(myTran)
    return tranList, exonList
Beispiel #6
0
def main():
    """Load an annotation file, optionally annotate polyA tracts, and pickle it.

    The input ``opt.gtf`` is read with the loader selected by ``opt.format``
    (``'pickle'``, ``'alt'`` or standard).  When ``opt.ref`` is given, the
    pickled reference is loaded and passed to ``annotatePolyA``.  The
    resulting annotation list is written to ``opt.output`` as a pickle.
    """
    logger.debug('version %s starting' % VERSION)

    opt, _args = getParms()

    # The input may itself be a pickled annotation file, e.g. so that
    # polyA annotations can be added before it is pickled again.
    if opt.format not in ('pickle', 'alt'):
        annotList = anno.AnnotationList(opt.gtf)
    elif opt.format == 'pickle':
        annotList = anno.AnnotationList.fromPickle(opt.gtf)
    else:
        annotList = anno.AnnotationList(opt.gtf, altFormat=True)

    # If a reference was specified, look for PolyA tracts in exons.
    if opt.ref is not None:
        annotList.annotatePolyA(ref.Reference.fromPickle(opt.ref))

    annotList.toPickle(opt.output)

    logger.debug('finished')
Beispiel #7
0
def main():
    """Plot a stacked histogram of transcript lengths and save it.

    Reads the annotation file named by the first positional argument and
    walks every transcript of every gene on every chromosome and strand.
    Each transcript's length is filed under the first entry of
    ``SIZE_BINS`` that is >= its ``numChildren()``; transcripts exceeding
    every bin are not plotted.  Transcripts longer than 100000 are also
    listed on stdout.  The figure is written to ``opt.output`` and the
    values returned by ``plt.hist`` are printed.
    """
    opt, args = getParms()

    annotList = anno.AnnotationList(args[0])

    # One list of transcript lengths per entry of SIZE_BINS.
    tranSizes = [list() for _ in SIZE_BINS]

    for chrom in annotList.chromosomes():
        for strand in annotList.strands(chrom):
            for gene in annotList.geneList(chrom, strand).getChildren():
                for tran in gene.getChildren():

                    if tran.length > 100000:
                        print('%-5s  %s  %2d  %5d  %s' % (
                            chrom, strand, tran.numChildren(), tran.length,
                            tran.name))

                    # Index of the first bin large enough for this
                    # transcript, or None when no bin fits.
                    binIx = next(
                        (ix for ix, size in enumerate(SIZE_BINS)
                         if tran.numChildren() <= size),
                        None)
                    if binIx is not None:
                        tranSizes[binIx].append(tran.length)

    plt.figure(figsize=(12, 6))
    histArgs = dict(bins=80,
                    range=(0, opt.xmax),
                    rwidth=0.8,
                    color=SIZE_COLORS,
                    histtype='barstacked',
                    label=SIZE_LEGENDS)
    counts, bins, patches = plt.hist(tranSizes, **histArgs)
    plt.legend(loc='best', prop={'size': 10})
    plt.xlabel('transcript length')
    plt.ylabel('number of transcripts')

    if opt.title is not None:
        plt.suptitle(opt.title)

    plt.savefig(opt.output)
    plt.close()

    # Parenthesised single-value prints behave the same as the bare
    # Python 2 print statement.
    print(counts)
    print(bins)
    print(patches)
Beispiel #8
0
def get_sentences(ns, root, token2idx):
    """Return an ``Annotations.Sentence`` for every sentence element.

    Elements are found with ``root.findall('textspan:Sentence', ns)``.  Each
    sentence is built from the element's ``begin``/``end`` attributes
    (converted to ``int``) and the tokens that ``match2span`` returns for
    that span.
    """

    def _to_sentence(node):
        start = int(node.get('begin'))
        stop = int(node.get('end'))
        return Annotations.Sentence(
            start, stop, match2span(token2idx, start, stop))

    return [
        _to_sentence(node)
        for node in root.findall('textspan:Sentence', ns)
    ]
Beispiel #9
0
def get_concepts(root, token2idx):
    """Return an ``Annotations.Concept`` for every concept element.

    Elements are found with ``root.findall('concepts_FILEUMLS')``.  Each
    concept is built from the element's ``begin``/``end`` attributes (as
    ``int``), its ``identifier``, its ``certainty`` (as ``float``) and the
    tokens that ``match2span`` returns for the span.
    """
    found = []
    for node in root.findall('concepts_FILEUMLS'):
        start, stop = int(node.get('begin')), int(node.get('end'))
        identifier = node.get('identifier')
        confidence = float(node.get('certainty'))
        span_tokens = match2span(token2idx, start, stop)
        found.append(
            Annotations.Concept(start, stop, identifier, confidence,
                                span_tokens))
    return found
Beispiel #10
0
def get_chunks(ns, root, token2idx):
    """Return an ``Annotations.Chunk`` for every chunk element.

    Elements are found with ``root.findall('syntax:Chunk', ns)``.  Each chunk
    is built from the element's ``begin``/``end`` attributes (as ``int``),
    the tokens that ``match2span`` returns for the span, and the element's
    ``chunkType`` attribute.
    """
    result = []
    for node in root.findall('syntax:Chunk', ns):
        start = int(node.get('begin'))
        stop = int(node.get('end'))
        kind = node.get('chunkType')
        # TODO: link chunk to token
        members = match2span(token2idx, start, stop)
        result.append(Annotations.Chunk(start, stop, members, kind))
    return result
Beispiel #11
0
def main():

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    if opt.gtfpickle is not None:
        handle = open(opt.gtfpickle, 'r')
        pk = pickle.Unpickler(handle)
        annotList = pk.load()
        handle.close()
    else:
        annotList = anno.AnnotationList(opt.gtf)

    geneList = annotList.getGene(opt.gene)
    if geneList is None:
        print 'gene %s not found in annotations' % opt.gene
    elif len(geneList) != 1:
        print 'there are %d occurrences of gene %s in annotations' % (
            len(geneList), opt.gene)
    else:
        geneEnt = geneList[0]

        print 'gene:    ',
        printEnt(geneEnt)

        for transEnt in geneEnt.getChildren():
            print '\ntr:      ',
            printTran(transEnt)

            for exonEnt in transEnt.getChildren():
                print 'exon:    ',
                printEnt(exonEnt)

    logger.debug('finished')

    return
Beispiel #12
0
    def __init__( self, path=None, show_window=True, time_machine=False, mem_access=False ):
        """Create the emulator, load the Robotron image and set up the helpers.

        ``path``, when given, becomes the process working directory before
        anything is loaded.  ``show_window`` is inverted into the emulator's
        ``no_display`` flag; ``time_machine`` and ``mem_access`` are passed
        through to the emulator unchanged.
        """
        if path is not None:
            import os
            os.chdir( path )

        sys.setrecursionlimit( 3000 )

        self.emulator = emulator = Emulator(
            no_display=not show_window,
            quiet=True,
            time_machine=time_machine,
            mem_access=mem_access,
        )
        # Alternative image: r"tmp\ROBOTRON#062DFD.BIN" at the same address.
        emulator.load_image( 0x2dfd, r'bin\ROBOTRON.BIN' )

        # Shortcuts to the emulator's components.
        self.apple2 = emulator.apple2
        self.display = self.apple2.display
        self.cpu = emulator.cpu
        self.mem = emulator.mem
        self.map = emulator.map

        self.labels = Labels( )
        self.dis = Disassembler( self.cpu, self.map, self.labels )  # type: Disassembler

        self.tile_factory = TileFactory( emulator.map )
        self.annotations = Annotations( )  # type: Annotations

        self.memlog_dialog = None  # type: MemLogDialog
Beispiel #13
0
def main():

    logger.debug('version %s starting' % VERSION)

    opt, args = getParms()

    if opt.clusters is not None:
        clusterList = clrep.ClusterList(
            opt.clusters)  # read the cluster_report.csv file, if supplied

    if opt.format == 'pickle' or opt.gtf.endswith('pickle'):
        annotList = anno.AnnotationList.fromPickle(opt.gtf)
    elif opt.format == 'alt':
        annotList = anno.AnnotationList(opt.gtf, altFormat=True)
    else:  # standard format
        annotList = anno.AnnotationList(opt.gtf)

    annotCursor = anno.AnnotationCursor(annotList)

    polyAFinder = PolyA.PolyA()

    regexAS = re.compile('(AS:i:\d+)')  # alignment score
    regexUT = re.compile(
        '(uT:A:\d+)')  # mismatch reason (ToDo: Translate this)
    regexMD = re.compile('MD:Z:(\S+)')  # MD string

    if len(args) > 0:
        logger.debug('reading SAM file %s' % args[0])
        handle = open(args[0], 'r')
    else:
        logger.debug('reading SAM data from stdin')
        handle = sys.stdin

    totReads = 0  # assorted counters
    totAlign = 0
    totWithGene = 0
    totMulti = 0
    totReverse = 0
    totByScore = [0, 0, 0, 0, 0, 0]  # indexed by score
    totSplice = [0, 0, 0, 0, 0, 0]
    skipped = 0

    clusterDict = cl.ClusterDict(
    )  # annotation matches saved for later pickling
    lastPos = dict()  # current position in SAM file by chr (for sort check)

    for line in handle:  # main loop: read the SAM file

        try:
            if line.startswith('@'):
                continue

            totReads += 1

            lineFields = line.split(
                '\t')  # just split it once, not 6 times in list comp
            if len(lineFields) < 10:
                raise RuntimeError('mis-formed SAM line: %s' % line)
            clusterName, flags, chr, start, cigarString, bases = [
                lineFields[i] for i in (0, 1, 2, 3, 5, 9)
            ]
            flags = int(flags)

            if flags & FLAG_NOT_ALIGNED:

                print '\nisoform:  %-16s' % (clusterName)

                if opt.clusters is not None:  # print cluster (cl:) lines
                    printClusterReads(clusterList, clusterName)

                print 'result:   %-50s no_alignment_found' % clusterName,  # no EOL yet

                alnReason = re.search(regexUT, line)  # mismatch reason
                if alnReason is not None:
                    print ' %s' % alnReason.group(1),
                alnScore = re.search(regexAS, line)  # alignment score
                if alnScore is not None:
                    print ' %s' % alnScore.group(1),
                print

                continue

            totAlign += 1

            start = int(start)
            if start < lastPos.get(chr, 0):
                raise RuntimeError('SAM file is not sorted by position')
            lastPos[chr] = start

            match = re.search(regexMD, line)
            if match is not None:  # if  MD string is present
                cigar = cs.CigarString(cigarString, start, match.group(1))
            else:
                cigar = cs.CigarString(cigarString, start)

            end = start + cigar.genomicLength() - 1
            # -1 to report last base, rather than last+1

            exons = cigar.exons()

            strand = '-' if (flags & FLAG_REVERSE) else '+'

            if opt.outpickle is not None:
                myCluster = cl.Cluster(clusterName, flags, chr, start, strand,
                                       cigar,
                                       bases)  # cigar is a CigarString object
                clusterDict.addCluster(myCluster)

            print '\nisoform:  %-16s    %9d                    %9d         %-5s  %s  %6d' \
                % (clusterName, start, end, chr, strand, end-start),
            if flags & FLAG_SECONDARY:
                print ' multimap',
                totMulti += 1
            print

            print 'cigar:    %s' % cigar.prettyPrint()
            if cigar.MD is not None:
                print 'MD:       %s' % cigar.MD

            if opt.vars is not None:  # print variant (var:) lines
                cigar.printVariantList(bases)

            if opt.clusters is not None:  # print cluster (cl:) lines
                printClusterReads(clusterList, clusterName)

            foundPolyA = False
            print 'polyA:  ',  # print 'polyA:' line
            for motif, offset in polyAFinder.findMotifs(
                    bases, strand, POLYA_REACH):
                print ' %s: %4d' % (motif, offset),
                foundPolyA = True
            print

            # Now let's do the genes...

            # Bump cursor past any genes which end prior to the start of
            # the current read. We're done looking at them.

            annotCursor.advance(chr, start)

            bestHit = best.Best()

            # Loop through genes this cluster overlaps. Try the aligned
            # strand first, but if no joy, try the other strand, since
            # IsoSeq can get it backwards sometimes.

            for str2try in (strand, '-' if strand == '+' else
                            '+'):  # try aligned strand first

                for curGene in annotCursor.getOverlappingGenes(
                        chr, start, end, str2try):

                    print 'gene:     %-16s    %9d            %6d             %9d  %5d     %s' \
                        % (curGene.name, curGene.start, curGene.start-start, curGene.end, curGene.end-end, curGene.strand),
                    if str2try != strand:
                        print '  rev',
                    print

                    bestTran, bestScore = matchTranscripts(
                        exons, curGene
                    )  # match this cluster to all transcripts of gene
                    bestHit.update(bestScore,
                                   [curGene, bestTran
                                    ])  # best transcript of best gene so far?

                if bestHit.which > 1:  # if decent match found, don't try the other strand
                    break

            if bestHit.value is None:
                print 'result:   %-50s no_genes_found' % (clusterName)
            else:

                bestGene, bestTran = bestHit.which
                print 'result:   %-50s  %-20s  %-24s  ex: %2d  sc: %d' \
                    % (clusterName, bestGene.name, bestTran.name, len(exons), bestHit.value),

                if bestGene.strand != strand:  # if best hit was on other strand from alignment
                    print 'rev',
                    totReverse += 1

                if bestHit.value >= 3:
                    delta5 = bestTran[0].start - exons[0].start  # 5' delta
                    delta3 = bestTran[-1].end - exons[-1].end  # 3' delta
                    print ' 5-3: %5d %5d' % (delta5, delta3),
                print

                totWithGene += 1
                totByScore[bestHit.value] += 1
                if foundPolyA:
                    totSplice[bestHit.value] += 1

                if opt.outpickle is not None:
                    myCluster.best(
                        bestGene, bestTran,
                        bestScore)  # keep track of best gene in pickle object
        except RuntimeError:
            skipped = skipped + 1
            continue

    if opt.outpickle is not None:
        clusterDict.toPickle(opt.outpickle)  # save matches as pickle file

    print '\nsummary: version %s\n' % VERSION

    if opt.clusters is not None:
        for cellNo, cell in clusterList.showCells():
            print 'summary:   cell %d = %s' % (cellNo, cell)
        print

    print 'summary: %7d isoforms skipped, due to MD tag parsing problems' % skipped
    print 'summary: %7d isoforms read' % totReads
    print 'summary: %7d isoforms aligned, of which %d were multiply mapped' % (
        totAlign, totMulti)
    print 'summary: %7d isoforms hit at least one gene, of which %d were on opposite strand' % (
        totWithGene, totReverse)
    print

    for score in xrange(5, -1, -1):
        print 'summary: %7d isoforms scored %d, of which %6d had splice termination motif' \
            % (totByScore[score], score, totSplice[score])

    logger.debug('finished')