def map2region(self, sg, type):
    """Get the annotation overlapping the region spanned by a SpliceGraph.

    Arguments:
        sg   -- a SpliceGraph.SpliceGraph instance, or the name of an
                existing file from which one is loaded
        type -- annotation type; must be a key of self.annotationFiles
                (its annotation is parsed on first use)

    The region runs from the start of the first exon to the end of the
    last exon returned by sg.allExons().

    Returns a (genes, overlapExons) tuple.  genes maps every gene id
    reported by self.overlappingGenesExons() to the number of times it
    was reported; overlapExons is the second value returned by that call
    (made with detailed=1).  A splice graph without exons yields ({}, {}).

    Raises Errors.ArgumentError if sg is neither a SpliceGraph nor an
    existing file, or if type is not a known annotation type.
    --> (dict, dict)
    """
    if not isinstance(sg, SpliceGraph.SpliceGraph) and os.path.exists(sg):
        sg = SpliceGraph.SpliceGraph(filename=sg)
    if not isinstance(sg, SpliceGraph.SpliceGraph):
        # same contract as map2gene: reject anything that is not a graph
        self.__inform(
            "ERROR in map2region: 1st argument must either be filename or instance of SpliceGraph"
        )
        raise Errors.ArgumentError(
            "map2region",
            "1st argument must either be filename or instance of SpliceGraph")

    # get annotation
    if type not in self.annotationFiles:
        # give up
        self.__inform(
            "ERROR in map2region: unknown annotation type [%s]" % type)
        raise Errors.ArgumentError(
            "map2region", "unknown annotation type [%s]" % type)
    elif type not in self.exonInfo:
        # parse it
        self.parseAnnotation(type)

    # get region
    ex = sg.allExons()
    if not ex:
        # nothing to overlap with; assumes the exon annotation is a dict,
        # as in the other mapping methods -- TODO confirm
        return {}, {}
    regStart = ex[0][0]
    regEnd = ex[-1][1]

    # get overlap
    genes = {}
    (overlapGns, overlapExons) = self.overlappingGenesExons(
        type, regStart, regEnd, detailed=1)
    for overlapGn in overlapGns:
        genes[overlapGn] = genes.get(overlapGn, 0) + 1
    return genes, overlapExons
def parseAnnotation(self, type):
    """Parse and store the gzipped annotation file registered for *type*.

    The file name is taken from self.annotationFiles[type].  Three
    tab-separated layouts are understood, selected by the type name:

    'ensembl', 'vega', 'genrate', 'HuExonStv2'
        columns: chr, strand, gnId, txId, extGnId, biotype, exId, start,
        end.  Lines starting with '#' or 'Chromosome' are skipped, "chr"
        is prepended to chromosome names lacking it, strand '1' becomes
        '+' (anything else '-') and a trailing ".<digits>" version is
        removed from exId.  Fills geneInfo, ssInfo and exonInfo.
    'U133target', 'gnf1m', 'u133v2'
        columns: probesetId, chr, start, end, strand.  Lines starting
        with '#' are skipped.  Fills exonInfo only.
    'hgu95av2'
        only the first column is used, in the form
        "probeId::chr:start:end:strand".  Fills ssInfo and exonInfo.

    Resulting structures (coordinates are kept as the strings read from
    the file):
        self.geneInfo[type][gnId] = {'extGnId': ..., 'biotype': ...}
        self.ssInfo[type][chr][strand][coord] = (gnId, exId)
        self.exonInfo[type][chr][strand][start][end] = (gnId, exId)
    For the probe(set) layouts both tuple members are the probe(set) id.

    The three dictionaries are only replaced once the whole file has been
    read, so a failed parse does not leave a half-filled (or empty)
    annotation behind that callers would mistake for a parsed one.

    Raises Errors.ArgumentError if no file is registered for type.  A type
    that is registered but has no parser is reported and left unparsed.
    """
    if type not in self.annotationFiles:
        raise Errors.ArgumentError(
            'Mapper.parseAnnotation',
            'no annotation file for type "%s"' % type)

    if type in ('ensembl', 'vega', 'genrate', 'HuExonStv2'):
        layout = 'gene'
    elif type in ('U133target', 'gnf1m', 'u133v2'):
        layout = 'probeset'
    elif type == 'hgu95av2':
        layout = 'probe'
    else:
        self.__inform(
            "parseAnnotation: no parser implemented for annotation type '%s'\n"
            % type)
        return

    versionPattern = re.compile(r'\.\d+$')
    geneInfo = {}
    ssInfo = {}
    exonInfo = {}

    def storeSpliceSites(chrom, strand, start, end, ids):
        # both exon boundaries point to the same (gene, exon) pair
        if chrom not in ssInfo:
            ssInfo[chrom] = {'+': {}, '-': {}}
        ssInfo[chrom][strand][start] = ids
        ssInfo[chrom][strand][end] = ids

    def storeExon(chrom, strand, start, end, ids):
        if chrom not in exonInfo:
            exonInfo[chrom] = {'+': {}, '-': {}}
        exonInfo[chrom][strand].setdefault(start, {})[end] = ids

    nb = 0
    fh = gzip.GzipFile(self.annotationFiles[type], 'r')
    try:
        for line in fh:
            if not isinstance(line, str):
                # GzipFile yields bytes on Python 3; the annotation files
                # are assumed to be ASCII/UTF-8 text
                line = line.decode('utf-8')
            line = line.strip('\n')
            if not line:
                continue

            if layout == 'gene':
                if line.startswith(('#', 'Chromosome')):
                    continue
                (chrom, strand, gnId, txId, extGnId, biotype, exId, start,
                 end) = line.split('\t')
                if not chrom.startswith('chr'):
                    chrom = "chr%s" % chrom
                strand = '+' if strand == '1' else '-'
                exId = versionPattern.sub('', exId)
                if gnId not in geneInfo:
                    geneInfo[gnId] = {
                        'extGnId': extGnId,
                        'biotype': biotype
                    }
                storeSpliceSites(chrom, strand, start, end, (gnId, exId))
                storeExon(chrom, strand, start, end, (gnId, exId))
                storedId = gnId
            elif layout == 'probeset':
                if line.startswith('#'):
                    continue
                (probesetId, chrom, start, end, strand) = line.split('\t')
                storeExon(chrom, strand, start, end,
                          (probesetId, probesetId))
                storedId = probesetId
            else:
                blocks = line.split('\t')
                (probeId, mapping) = blocks[0].split('::')
                (chrom, start, end, strand) = mapping.split(':')
                storeSpliceSites(chrom, strand, start, end,
                                 (probeId, probeId))
                storeExon(chrom, strand, start, end, (probeId, probeId))
                storedId = probeId

            nb += 1
            self.__inform(
                "just stored %s -> %s : %s\n" % (start, end, storedId), 5)
            if layout == 'gene':
                self.__inform(
                    "\t%s,%s,%s,%s\n" % (type, chrom, strand, start), 7)
    finally:
        fh.close()

    # publish only after a complete, successful read
    self.geneInfo[type] = geneInfo
    self.ssInfo[type] = ssInfo
    self.exonInfo[type] = exonInfo
    self.__inform(
        "parseAnnotation: successfully parsed gene association of %i %s exons\n"
        % (nb, type))
def mapEvents(self, event, event_type, type, outFile=None, mode="within"):
    """Map splicing events in raw *_events text format to an annotation.

    Arguments:
        event      -- list of event lines, or the name of an existing file
                      whose lines are read
        event_type -- "SE", "CE", "CI" or "RI" are processed; for any other
                      value (e.g. "A5SS", "A3SS", "MXE", which are not
                      implemented) only an empty output file is created
        type       -- annotation type; must be a key of
                      self.annotationFiles (parsed on first use)
        outFile    -- output file name, default
                      "<event_type>.mapped_<type>"
        mode       -- matching mode: "exon", "bothSS", "singleSS",
                      "overlap" or "within" (default, intended for
                      microarray probeset regions); other values are
                      reported and ignored

    Event lines are tab-separated: gnId, exStart, exEnd followed by one
    coverage column (CE, CI) or two (SE, RI).  exStart/exEnd have the form
    "chromosome:coordinate:strand".  Lines starting with '#' and blank
    lines are skipped.

    Only the "overlap" and "within" modes write result lines to outFile;
    every mode fills the returned dictionaries.

    Returns a (genes, exons) tuple in the same format as map2gene:
        genes[gnId] = number of matches
        exons[(event gnId, exStart + ":E:" + exEnd)] = matched annotation

    Raises Errors.ArgumentError if event is neither a list nor an existing
    file, or if type is not a known annotation type.
    --> (dict, dict)
    """
    try:
        stringTypes = basestring
    except NameError:
        # Python 3 has no basestring
        stringTypes = str
    if isinstance(event, stringTypes) and os.path.exists(event):
        f = open(event, "r")
        try:
            event = f.readlines()
        finally:
            f.close()
    if not isinstance(event, list):
        raise Errors.ArgumentError(
            "Mapper.mapEvents",
            "event argument must either be a filename or list of events (strings)"
        )

    # get annotation
    if type not in self.annotationFiles:
        # give up
        self.__inform(
            "ERROR in Mapper.mapEvents: unknown annotation type '%s'" % type)
        raise Errors.ArgumentError("Mapper.mapEvents",
                                   "unknown annotation type '%s'" % type)
    elif type not in self.exonInfo:
        # parse it
        self.parseAnnotation(type)

    genes = {}  # format: genes[gnId] = nb_of_exons
    exons = {}  # format: exons[(gnId, exStart:E:exEnd)] = mapped annotation

    if outFile is None:
        outFile = "%s.mapped_%s" % (event_type, type)
    fh = open(outFile, "w")
    try:
        # Other event types are not implemented: the file stays empty.
        if event_type in ("SE", "CE", "CI", "RI"):
            fh.write("#gnId\texStart\texEnd\tExon_id\tExon_coordinates\n")
            for ev in event:
                if ev.startswith("#"):
                    continue
                line = ev.strip().replace('\n', '')
                if not line:
                    continue
                fields = line.split('\t')
                # strict unpacking keeps rejecting lines with a wrong
                # number of columns
                if event_type in ("CE", "CI"):
                    gnId, exStart, exEnd, _coverage = fields
                else:
                    gnId, exStart, exEnd, _coverageA, _coverageB = fields

                # remember: exStart/exEnd are biological, not always
                # exStart<exEnd --> flip
                (exChromosome, exStartCoord, exStrand) = exStart.split(':')
                exEndCoord = exEnd.split(':')[1]
                if exStrand == '-':
                    exStartCoord, exEndCoord = exEndCoord, exStartCoord
                exonKey = (gnId, exStart + ":E:" + exEnd)

                if mode in ("exon", "bothSS", "singleSS"):
                    # direct lookup modes: at most one annotated
                    # (gene, exon) pair per event
                    hit = None
                    if mode == "exon":
                        byStart = self.exonInfo[type].get(
                            exChromosome, {}).get(exStrand, {})
                        hit = byStart.get(exStartCoord, {}).get(exEndCoord)
                    else:
                        sites = self.ssInfo[type].get(
                            exChromosome, {}).get(exStrand, {})
                        if mode == "bothSS":
                            if exStartCoord in sites and \
                               exEndCoord in sites and \
                               sites[exStartCoord] == sites[exEndCoord]:
                                hit = sites[exStartCoord]
                        elif exStartCoord in sites:
                            hit = sites[exStartCoord]
                        elif exEndCoord in sites:
                            hit = sites[exEndCoord]
                    if hit is not None:
                        genes[hit[0]] = genes.get(hit[0], 0) + 1
                        exons[exonKey] = hit
                elif mode == "overlap":
                    (overlapGns, overlapExons) = self.overlappingGenesExons(
                        type, exStart, exEnd)
                    for overlapGn in overlapGns:
                        genes[overlapGn] = genes.get(overlapGn, 0) + 1
                    for overlapExon in overlapExons:
                        exons[exonKey] = (overlapExons[overlapExon],
                                          overlapExon)
                        fh.write("%s\t%s\t%s\t%s\t%s\n" %
                                 (gnId, exStart, exEnd,
                                  overlapExons[overlapExon], overlapExon))
                elif mode == "within":
                    (withinGns, withinExons) = self.withinExons(
                        type, exStart, exEnd, detailed=1)
                    for withinGn in withinGns:
                        genes[withinGn] = genes.get(withinGn, 0) + 1
                    for withinExon in withinExons:
                        exons[exonKey] = (withinExons[withinExon],
                                          withinExon)
                    if withinExons:
                        # one line per event, all hits comma-separated;
                        # the detailed values are formatted as 3-tuples
                        fh.write("%s\t%s\t%s\t%s\t%s\n" % (
                            gnId, exStart, exEnd,
                            ",".join("%s:%s:%s" % withinExons[withinExon]
                                     for withinExon in withinExons),
                            ",".join(withinExon
                                     for withinExon in withinExons)))
                else:
                    self.__inform(
                        "mapEvents: Ignoring unknown mode %s\n" % mode)
    finally:
        fh.close()
    return genes, exons
def map2gene(self, sg, type, mode="singleSS"):
    """Map the exons of a SpliceGraph to annotated genes.

    Arguments:
        sg   -- a SpliceGraph.SpliceGraph instance, or the name of an
                existing file from which one is loaded
        type -- annotation type; must be a key of self.annotationFiles
                (its annotation is parsed on first use)
        mode -- exon matching mode:
                "exon"     both exon coordinates match one annotated exon
                "bothSS"   both coordinates are annotated splice sites
                           belonging to the same (gene, exon) pair
                "singleSS" either coordinate is an annotated splice site
                           (the exon start is preferred)
                "overlap"  delegated to self.overlappingGenesExons()
                "within"   delegated to self.withinExons()
                any other value is reported and ignored

    An exon is every pair of a 3'ss/TSS element and a directly downstream
    connected 5'ss/TER element of the graph.

    Returns a (gene-dict, exon-dict) tuple, where
        gene-dict maps gene ids to the number of matching exons
        exon-dict maps (sg.name, exStart + ":E:" + exEnd) to the matched
        (gnId, exId) tuple

    Raises Errors.ArgumentError if sg is neither a SpliceGraph nor an
    existing file, or if type is not a known annotation type.
    --> (dict, dict)
    """
    genes = {}  # format: genes[gnId] = nb_of_exons
    exons = {}  # format: exons[(sg.name, sg.exon)] = (mapped.gene, mapped.exon)
    #             where sg.exon = element1:E:element2

    if not isinstance(sg, SpliceGraph.SpliceGraph) and os.path.exists(sg):
        sg = SpliceGraph.SpliceGraph(filename=sg)
    if not isinstance(sg, SpliceGraph.SpliceGraph):
        self.__inform(
            "ERROR in map2gene: 1st argument must either be filename or instance of SpliceGraph"
        )
        raise Errors.ArgumentError(
            "Mapper.map2gene",
            "1st argument must either be filename or instance of SpliceGraph"
        )

    # get annotation
    if type not in self.annotationFiles:
        # give up
        self.__inform(
            "ERROR in Mapper.map2gene: unknown annotation type '%s'" % type)
        raise Errors.ArgumentError(
            "Mapper.map2gene", "unknown annotation type '%s'" % type)
    elif type not in self.exonInfo:
        # parse it
        self.parseAnnotation(type)

    # for each exon, get associated gene
    for exStart in sg.allElements():
        if not (sg.is3ss(exStart) or sg.isTSS(exStart)):
            continue
        # look for a corresponding exon end (5ss element)
        for exEnd in sg.downstreamConnectedElements(exStart):
            if not (sg.is5ss(exEnd) or sg.isTER(exEnd)):
                continue

            # remember: exStart/exEnd are biological, not always
            # exStart<exEnd --> flip
            (exChromosome, exStartCoord, exStrand) = exStart.split(':')
            exEndCoord = exEnd.split(':')[1]
            if exStrand == '-':
                exStartCoord, exEndCoord = exEndCoord, exStartCoord
            exonKey = (sg.name, exStart + ":E:" + exEnd)

            if mode in ("exon", "bothSS", "singleSS"):
                # direct lookup modes: at most one annotated
                # (gene, exon) pair per splice graph exon
                hit = None
                if mode == "exon":
                    byStart = self.exonInfo[type].get(
                        exChromosome, {}).get(exStrand, {})
                    hit = byStart.get(exStartCoord, {}).get(exEndCoord)
                else:
                    sites = self.ssInfo[type].get(
                        exChromosome, {}).get(exStrand, {})
                    if mode == "bothSS":
                        if exStartCoord in sites and \
                           exEndCoord in sites and \
                           sites[exStartCoord] == sites[exEndCoord]:
                            hit = sites[exStartCoord]
                    elif exStartCoord in sites:
                        hit = sites[exStartCoord]
                    elif exEndCoord in sites:
                        hit = sites[exEndCoord]
                if hit is not None:
                    genes[hit[0]] = genes.get(hit[0], 0) + 1
                    exons[exonKey] = hit
            elif mode in ("overlap", "within"):
                if mode == "overlap":
                    finder = self.overlappingGenesExons
                else:
                    finder = self.withinExons
                (overlapGns, overlapExons) = finder(type, exStart, exEnd)
                for overlapGn in overlapGns:
                    genes[overlapGn] = genes.get(overlapGn, 0) + 1
                for overlapExon in overlapExons:
                    exons[exonKey] = (overlapExons[overlapExon],
                                      overlapExon)
            else:
                self.__inform(
                    "map2gene: Ignoring unknown mode %s\n" % mode)

    return genes, exons