Esempio n. 1
0
    def map2region(self, sg, type):
        """Get the annotation elements overlapping the region spanned by a SpliceGraph.

        sg   -- a SpliceGraph.SpliceGraph instance, or the path of an existing
                file that is loaded with SpliceGraph.SpliceGraph(filename=sg)
        type -- annotation type; must be a key of self.annotationFiles. The
                annotation is parsed on first use via self.parseAnnotation().

        The region is taken from sg.allExons(): it runs from the first item of
        the first entry to the second item of the last entry (presumably the
        start of the first exon and the end of the last one -- this relies on
        allExons() returning its entries in genomic order; TODO confirm).

        Returns a (genes, overlapExons) tuple where genes maps every gene id
        reported by self.overlappingGenesExons(..., detailed=1) to the number
        of times it was reported, and overlapExons is the second value
        returned by that call, passed through unchanged.

        Raises Errors.ArgumentError if sg is neither a SpliceGraph nor an
        existing file, or if the annotation type is unknown.

        --> (dict, overlapExons)
        """

        if not isinstance(sg, SpliceGraph.SpliceGraph) and os.path.exists(sg):
            sg = SpliceGraph.SpliceGraph(filename=sg)

        # Reject unusable input explicitly; falling through used to end in an
        # UnboundLocalError at the return statement.
        if not isinstance(sg, SpliceGraph.SpliceGraph):
            self.__inform(
                "ERROR in map2region: 1st argument must either be filename or instance of SpliceGraph"
            )
            raise Errors.ArgumentError(
                "map2region",
                "1st argument must either be filename or instance of SpliceGraph"
            )

        # get annotation
        if type not in self.annotationFiles:  # give up
            self.__inform(
                "ERROR in map2region: unknown annotation type [%s]" % type)
            raise Errors.ArgumentError(
                "map2region", "unknown annotation type [%s]" % type)
        if type not in self.exonInfo:  # parse it
            self.parseAnnotation(type)

        # get region
        ex = sg.allExons()
        regStart = ex[0][0]
        regEnd = ex[-1][1]

        # get overlap
        (overlapGns, overlapExons) = self.overlappingGenesExons(type,
                                                                regStart,
                                                                regEnd,
                                                                detailed=1)

        # count how often each gene was reported
        genes = {}
        for overlapGn in overlapGns:
            genes[overlapGn] = genes.get(overlapGn, 0) + 1

        return genes, overlapExons
Esempio n. 2
0
    def parseAnnotation(self, type):
        """Parse the gzipped annotation file registered for `type` and cache it.

        type -- annotation type; must be a key of self.annotationFiles, whose
                value is the path of a gzip-compressed, tab-separated file.

        Three line layouts are understood, selected by the type name:

          'ensembl', 'vega', 'genrate', 'HuExonStv2'
              nine columns: chr, strand, gene id, transcript id, external
              gene id, biotype, exon id, start, end. Lines starting with '#'
              or 'Chromosome' are skipped. A missing 'chr' prefix is added to
              the chromosome, strand '1' becomes '+' and any other value '-',
              and a trailing '.<digits>' suffix is removed from the exon id.
          'U133target', 'gnf1m', 'u133v2'
              five columns: probeset id, chr, start, end, strand. Lines
              starting with '#' are skipped. Only exonInfo is filled.
          'hgu95av2'
              only the first column is used; it has the form
              'probeId::chr:start:end:strand'.

        Empty lines are ignored in every layout. For a registered type that
        matches none of the layouts nothing is read and nothing is stored.

        On success the following are (re)set:

          self.geneInfo[type][gnId] = {'extGnId': ..., 'biotype': ...}
          self.ssInfo[type][chr][strand][coord] = (gnId, exId)
          self.exonInfo[type][chr][strand][start][end] = (gnId, exId)

        For the probe/probeset layouts both tuple members are the probe(set)
        id. Coordinates are kept as the strings read from the file. The three
        dictionaries are only published once the whole file has been read, so
        a parse error never leaves a partially filled annotation cached.

        Raises Errors.ArgumentError if no annotation file is registered for
        `type`; malformed lines raise ValueError (wrong column count) or
        KeyError (strand other than '+'/'-' in the probe/probeset layouts).
        """

        if type not in self.annotationFiles:
            raise Errors.ArgumentError(
                'Mapper.parseAnnotation',
                'no annotation file for type "%s"' % type)

        # Pick the line layout once instead of re-testing the type per line.
        if type in ('ensembl', 'vega', 'genrate', 'HuExonStv2'):
            layout = 'gene'
        elif type in ('U133target', 'gnf1m', 'u133v2'):
            layout = 'probeset'
        elif type == 'hgu95av2':
            layout = 'probe'
        else:
            # Registered type without a parser: nothing is read or stored.
            return

        versionPattern = re.compile(r'\.\d+$')
        geneInfo = {}
        ssInfo = {}
        exonInfo = {}
        nb = 0

        fh = gzip.GzipFile(self.annotationFiles[type], 'r')
        try:
            for line in fh:
                line = line.strip('\n')
                if not line:
                    continue
                # the 'probe' layout has no comment/header convention
                if layout != 'probe' and line.startswith('#'):
                    continue
                if layout == 'gene' and line.startswith('Chromosome'):
                    continue

                if layout == 'gene':
                    (chrom, strand, gnId, txId, extGnId, biotype, exId, start,
                     end) = line.split('\t')
                    if not chrom.startswith('chr'):
                        chrom = "chr%s" % chrom
                    strand = '+' if strand == '1' else '-'
                    exId = versionPattern.sub('', exId)
                    if gnId not in geneInfo:
                        geneInfo[gnId] = {
                            'extGnId': extGnId,
                            'biotype': biotype
                        }
                    ids = (gnId, exId)
                    storeSpliceSites = True
                elif layout == 'probeset':
                    (probesetId, chrom, start, end, strand) = line.split('\t')
                    ids = (probesetId, probesetId)
                    storeSpliceSites = False
                else:
                    (probeId, location) = line.split('\t')[0].split('::')
                    (chrom, start, end, strand) = location.split(':')
                    ids = (probeId, probeId)
                    storeSpliceSites = True

                # index both boundaries individually ...
                if storeSpliceSites:
                    sites = ssInfo.setdefault(chrom,
                                              {'+': {}, '-': {}})[strand]
                    sites[start] = ids
                    sites[end] = ids

                # ... and the (start, end) pair as a whole
                byStart = exonInfo.setdefault(chrom,
                                              {'+': {}, '-': {}})[strand]
                byStart.setdefault(start, {})[end] = ids

                nb += 1
                self.__inform(
                    "just stored %s -> %s : %s\n" % (start, end, ids[0]), 5)
                if layout == 'gene':
                    self.__inform(
                        "\t%s,%s,%s,%s\n" % (type, chrom, strand, start), 7)
        finally:
            # close the file even when a malformed line aborts the parse
            fh.close()

        # publish only complete results
        self.geneInfo[type] = geneInfo
        self.ssInfo[type] = ssInfo
        self.exonInfo[type] = exonInfo

        self.__inform(
            "parseAnnotation: successfully parsed gene association of %i %s exons\n"
            % (nb, type))
Esempio n. 3
0
    def mapEvents(self, event, event_type, type, outFile=None, mode="within"):
        """Map a list of splicing events onto an annotation and write the hits to a file.

        event      -- list of event lines (strings), or the path of an
                      existing file from which the lines are read
        event_type -- "SE", "CE", "CI" or "RI". Lines are tab-separated and
                      start with gene id, exon start, exon end, followed by
                      one (CE, CI) or two (SE, RI) coverage columns. Start and
                      end have the form 'chr:coord:strand'. "A5SS", "A3SS",
                      "MXE" and any other value are accepted but not
                      processed: only an empty output file is created.
        type       -- annotation type; must be a key of self.annotationFiles.
                      It is parsed on first use via self.parseAnnotation().
        outFile    -- output path; defaults to
                      "<event_type>.mapped_<type>" in the working directory
        mode       -- matching mode, one of "exon", "bothSS", "singleSS",
                      "overlap" or "within" (default, meant for microarray
                      probeset regions). Any other value is reported through
                      self.__inform for every event and otherwise ignored.

        Lines starting with '#' and empty lines are skipped. The output file
        starts with a header line; data lines are only written in the
        "overlap" mode (one line per overlapping exon) and the "within" mode
        (one line per event with at least one hit).

        Returns None.

        Raises Errors.ArgumentError if `event` is neither a list nor the path
        of an existing file, or if the annotation type is unknown. An event
        line with an unexpected number of columns raises ValueError.
        """

        if isinstance(event, basestring):
            if os.path.exists(event):
                f = open(event, "r")
                try:
                    event = f.readlines()
                finally:
                    f.close()
        if not isinstance(event, list):
            raise Errors.ArgumentError(
                "Mapper.mapEvents",
                "event argument must either be a filename or list of events (strings)"
            )

        # get annotation
        if type not in self.annotationFiles:  # give up
            self.__inform(
                "ERROR in Mapper.mapEvents: unknown annotation type '%s'" %
                type)
            raise Errors.ArgumentError("Mapper.mapEvents",
                                       "unknown annotation type '%s'" % type)
        if type not in self.exonInfo:  # parse it
            self.parseAnnotation(type)

        # NOTE(review): both dictionaries are filled below but never returned
        # or written out; map2gene returns the equivalent pair. Kept as-is so
        # the return value (None) does not change.
        genes = {}  #format: genes[gnId] = nb_of_exons
        exons = {}  #format: exons[(gnId, start:E:end)] = mapped annotation entry

        # open outFile fh
        if outFile is None:
            outFile = "%s.mapped_%s" % (event_type, type)
        fh = open(outFile, "w")

        try:
            if event_type in ("SE", "CE", "CI", "RI"):
                fh.write("#gnId\texStart\texEnd\tExon_id\tExon_coordinates\n")
                for ev in event:
                    if ev.startswith("#"):
                        continue
                    line = ev.strip().replace('\n', '')
                    if not line:
                        continue

                    # the coverage columns are validated by the unpacking
                    # but not used any further
                    if event_type in ("CE", "CI"):
                        gnId, exStart, exEnd, coverage = line.split('\t')
                    else:  # "SE" and "RI" carry two coverage columns
                        gnId, exStart, exEnd, coverageA, coverageB = line.split(
                            '\t')

                    # remember: exStart/exEnd are biological, not always exStart<exEnd --> flip
                    (exChromosome, exStartCoord, exStrand) = exStart.split(':')
                    exEndCoord = exEnd.split(':')[1]
                    if exStrand == '-':
                        exStartCoord, exEndCoord = exEndCoord, exStartCoord

                    eventKey = (gnId, exStart + ":E:" + exEnd)

                    # if correspondance is found with annotation, store gene and exon IDs
                    if mode == "exon":
                        # both boundaries must belong to one annotated exon
                        hit = self.exonInfo[type].get(exChromosome, {}).get(
                            exStrand, {}).get(exStartCoord,
                                              {}).get(exEndCoord)
                        if hit is not None:
                            genes[hit[0]] = genes.get(hit[0], 0) + 1
                            exons[eventKey] = hit

                    elif mode in ("bothSS", "singleSS"):
                        sites = self.ssInfo[type].get(exChromosome, {}).get(
                            exStrand, {})
                        if mode == "bothSS":
                            # both boundaries annotated with the same entry
                            ss = exStartCoord
                            matched = exStartCoord in sites and \
                                exEndCoord in sites and \
                                sites[exStartCoord] == sites[exEndCoord]
                        else:
                            # one annotated boundary suffices, start preferred
                            ss = exStartCoord if exStartCoord in sites else exEndCoord
                            matched = ss in sites
                        if matched:
                            hit = sites[ss]
                            genes[hit[0]] = genes.get(hit[0], 0) + 1
                            exons[eventKey] = hit

                    elif mode == "overlap":
                        (overlapGns, overlapExons) = self.overlappingGenesExons(
                            type, exStart, exEnd)
                        for overlapGn in overlapGns:
                            genes[overlapGn] = genes.get(overlapGn, 0) + 1
                        for overlapExon in overlapExons:
                            exons[eventKey] = (overlapExons[overlapExon],
                                               overlapExon)
                            fh.write("%s\t%s\t%s\t%s\t%s\n" %
                                     (gnId, exStart, exEnd,
                                      overlapExons[overlapExon], overlapExon))

                    elif mode == "within":
                        (withinGns, withinExons) = self.withinExons(type,
                                                                    exStart,
                                                                    exEnd,
                                                                    detailed=1)
                        for withinGn in withinGns:
                            genes[withinGn] = genes.get(withinGn, 0) + 1
                        for withinExon in withinExons:
                            exons[eventKey] = (withinExons[withinExon],
                                               withinExon)
                        if withinExons:
                            # the "%s:%s:%s" format requires each value to be
                            # a 3-tuple
                            fh.write("%s\t%s\t%s\t%s\t%s\n" % (
                                gnId, exStart, exEnd,
                                ",".join("%s:%s:%s" % withinExons[withinExon]
                                         for withinExon in withinExons),
                                ",".join(withinExons)))

                    else:
                        self.__inform("mapEvents: Ignoring unknown mode %s\n" %
                                      mode)

            # "A5SS", "A3SS" and "MXE" events are not implemented: the output
            # file is created but stays empty.
        finally:
            # close the output file even when an event line is malformed
            fh.close()
Esempio n. 4
0
    def map2gene(self, sg, type, mode="singleSS"):
        """Map the exons of a SpliceGraph to annotated genes using the given matching mode.

        sg   -- a SpliceGraph.SpliceGraph instance, or the path of an existing
                file that is loaded with SpliceGraph.SpliceGraph(filename=sg)
        type -- annotation type; must be a key of self.annotationFiles. The
                annotation is parsed on first use via self.parseAnnotation().
        mode -- how an exon is matched against the annotation:
                  "exon"     start and end both match one annotated exon
                  "bothSS"   start and end are both annotated splice sites
                             carrying the same annotation entry
                  "singleSS" start or end is an annotated splice site
                             (the start is preferred when both are)
                  "overlap"  delegated to self.overlappingGenesExons()
                  "within"   delegated to self.withinExons()
                Any other value is reported through self.__inform for every
                candidate exon and otherwise ignored.

        An exon is every pair (exStart, exEnd) where exStart is a 3' splice
        site or TSS element of the graph and exEnd is a 5' splice site or TER
        element directly downstream of it. Elements have the form
        'chr:coord:strand'.

        Returns a (gene-dict, exon-dict) tuple, where
          gene-dict maps each mapped gene id to the number of matching exons
          exon-dict maps (sg.name, "exStart:E:exEnd") to the matched
            annotation entry; in the "overlap" and "within" modes the value is
            (overlapExons[key], key) for the last key iterated, i.e. only one
            hit per exon is kept

        Raises Errors.ArgumentError if sg is neither a SpliceGraph nor an
        existing file, or if the annotation type is unknown.

        --> (dict, dict)
        """

        if not isinstance(sg, SpliceGraph.SpliceGraph) and os.path.exists(sg):
            sg = SpliceGraph.SpliceGraph(filename=sg)

        if not isinstance(sg, SpliceGraph.SpliceGraph):
            self.__inform(
                "ERROR in map2gene: 1st argument must either be filename or instance of SpliceGraph"
            )
            raise Errors.ArgumentError(
                "Mapper.map2gene",
                "1st argument must either be filename or instance of SpliceGraph"
            )

        # get annotation
        if type not in self.annotationFiles:  # give up
            self.__inform(
                "ERROR in Mapper.map2gene: unknown annotation type '%s'" %
                type)
            raise Errors.ArgumentError(
                "Mapper.map2gene", "unknown annotation type '%s'" % type)
        if type not in self.exonInfo:  # parse it
            self.parseAnnotation(type)

        genes = {}  #format: genes[gnId] = nb_of_exons
        exons = {}  #format: exons[(sg.name,sg.exon)] = (mapped.gene, mapped.exon) #where sg.exon=element1:E:element2

        # for each exon, get associated gene
        for exStart in sg.allElements():
            if not (sg.is3ss(exStart) or sg.isTSS(exStart)):
                continue

            # look for a corresponding exon end (5ss element)
            for exEnd in sg.downstreamConnectedElements(exStart):
                if not (sg.is5ss(exEnd) or sg.isTER(exEnd)):
                    continue

                # remember: exStart/exEnd are biological, not always exStart<exEnd --> flip
                (exChromosome, exStartCoord, exStrand) = exStart.split(':')
                exEndCoord = exEnd.split(':')[1]
                if exStrand == '-':
                    exStartCoord, exEndCoord = exEndCoord, exStartCoord

                exonKey = (sg.name, exStart + ":E:" + exEnd)

                # if correspondance is found with annotation, store gene and exon IDs
                if mode == "exon":
                    hit = self.exonInfo[type].get(exChromosome, {}).get(
                        exStrand, {}).get(exStartCoord, {}).get(exEndCoord)
                    if hit is not None:
                        genes[hit[0]] = genes.get(hit[0], 0) + 1
                        exons[exonKey] = hit

                elif mode in ("bothSS", "singleSS"):
                    sites = self.ssInfo[type].get(exChromosome, {}).get(
                        exStrand, {})
                    if mode == "bothSS":
                        ss = exStartCoord
                        matched = exStartCoord in sites and \
                            exEndCoord in sites and \
                            sites[exStartCoord] == sites[exEndCoord]
                    else:
                        ss = exStartCoord if exStartCoord in sites else exEndCoord
                        matched = ss in sites
                    if matched:
                        hit = sites[ss]
                        genes[hit[0]] = genes.get(hit[0], 0) + 1
                        exons[exonKey] = hit

                elif mode in ("overlap", "within"):
                    if mode == "overlap":
                        (overlapGns,
                         overlapExons) = self.overlappingGenesExons(
                             type, exStart, exEnd)
                    else:
                        (overlapGns, overlapExons) = self.withinExons(
                            type, exStart, exEnd)
                    for overlapGn in overlapGns:
                        genes[overlapGn] = genes.get(overlapGn, 0) + 1
                    # every hit overwrites the same key: the last one wins
                    for overlapExon in overlapExons:
                        exons[exonKey] = (overlapExons[overlapExon],
                                          overlapExon)

                else:
                    self.__inform(
                        "map2gene: Ignoring unknown mode %s\n" % mode)

        return genes, exons