def countBySeed(editFN, microFN, flankAmount, outFN):
    '''Count microRNA seed matches around editing sites, before and after editing.

    For every editing site loaded from editFN, the genomic sequence from
    coordinate - flankAmount to coordinate + flankAmount is fetched (hg19) and
    a second "edited" copy is built by putting a 'G' at the central position.
    Both copies are converted from T to U and then searched for the reverse
    complement of each microRNA seed loaded from microFN.

    Side effects:
      * appends matching microRNA ids to eSite.microTargets (unedited flank)
        and eSite.eMicroTargets (edited flank);
      * increments microRNA.numBefore / microRNA.numAfter per match;
      * prints a line to stdout for every hit of the hard-coded checkID;
      * writes one tab-separated line per editing site to outFN:
        ID, chromosome:coordinate, strand, targets, edited targets
        ('None' when a target list is empty).

    Parameters:
      editFN      -- file of editing sites, read by cgEdit.loadEditingSites
      microFN     -- TargetScan microRNA file, read for species 'hsa'
      flankAmount -- number of bases on each side of the site (int or
                     int-like string); the original note says it should
                     be +/- 6
      outFN       -- path of the output file (overwritten)
    '''
    flankAmount = int(flankAmount)
    eSites = cgEdit.loadEditingSites(editFN)
    micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
    gf = GenomeFetch.GenomeFetch('hg19')

    # Hits for this one microRNA are echoed to stdout for manual inspection.
    checkID = 'hsa-miR-330-5p'

    # The seed's reverse complement does not depend on the editing site, so
    # compute it once per microRNA instead of once per (site, microRNA) pair.
    seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                 for microRNA in micros]

    for eSite in eSites:
        flankingSeq = gf.get_seq_from_to(eSite.chromosome,
                                         eSite.coordinate - flankAmount,
                                         eSite.coordinate + flankAmount,
                                         eSite.strand)
        # The edited site sits at index flankAmount of the fetched window.
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

        # str.replace returns a new string; the results must be re-bound or
        # the T->U conversion is silently lost.
        # NOTE(review): presumably needed because the complemented seed is in
        # the RNA alphabet -- confirm against cgSeqMod.reverseComplementSequence.
        flankingSeq = flankingSeq.replace('T', 'U')
        eFlankingSeq = eFlankingSeq.replace('T', 'U')

        for microRNA, comSeed in seedPairs:
            if comSeed in flankingSeq:
                eSite.microTargets.append(microRNA.id)
                if microRNA.id == checkID:
                    print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate,
                                           eSite.strand, eSite.gene))
                microRNA.numBefore += 1

            if comSeed in eFlankingSeq:
                microRNA.numAfter += 1
                if microRNA.id == checkID:
                    print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate,
                                           eSite.strand, eSite.gene))
                eSite.eMicroTargets.append(microRNA.id)

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            targets = ','.join(eSite.microTargets) if eSite.microTargets else 'None'
            eTargets = ','.join(eSite.eMicroTargets) if eSite.eMicroTargets else 'None'
            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome,
                                                      eSite.coordinate, eSite.strand,
                                                      targets, eTargets))
def updateMicroTargets(editFN, microFN, flankAmount, outFN):
    '''Record which microRNA seeds match each editing site's flank, pre- and post-edit.

    For every editing site loaded from editFN, the genomic sequence from
    coordinate - flankAmount to coordinate + flankAmount is fetched (hg19) and
    an "edited" copy is built by putting a 'G' at the central position. Each
    microRNA seed loaded from microFN is reverse-complemented and searched for
    in both copies.

    Side effects:
      * appends matching microRNA ids to eSite.microTargets (unedited flank)
        and eSite.eMicroTargets (edited flank);
      * prints an '@ flank ...' / '@ eFlank ...' line to stdout per match;
      * writes one tab-separated line per editing site to outFN:
        ID, chromosome:coordinate, strand, targets, edited targets
        ('None' when a target list is empty).

    Parameters:
      editFN      -- file of editing sites, read by cgEdit.loadEditingSites
      microFN     -- TargetScan microRNA file, read for species 'hsa'
      flankAmount -- number of bases on each side of the site (int or
                     int-like string); the original note says it should
                     be +/- 6
      outFN       -- path of the output file (overwritten)
    '''
    flankAmount = int(flankAmount)
    eSites = cgEdit.loadEditingSites(editFN)
    micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
    gf = GenomeFetch.GenomeFetch('hg19')

    # The seed's reverse complement does not depend on the editing site, so
    # compute it once per microRNA instead of once per (site, microRNA) pair.
    seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                 for microRNA in micros]

    for eSite in eSites:
        flankingSeq = gf.get_seq_from_to(eSite.chromosome,
                                         eSite.coordinate - flankAmount,
                                         eSite.coordinate + flankAmount,
                                         eSite.strand)
        # The edited site sits at index flankAmount of the fetched window.
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

        # NOTE(review): the flank is compared exactly as the genome fetcher
        # returned it. If reverseComplementSequence(..., True) yields the RNA
        # alphabet (U), a T->U conversion of the flanks is needed here --
        # confirm before relying on these matches.
        for microRNA, comSeed in seedPairs:
            if comSeed in flankingSeq:
                eSite.microTargets.append(microRNA.id)
                print('@ flank %s %s %s %s' % (eSite.ID, microRNA.id,
                                               flankingSeq, comSeed))
            if comSeed in eFlankingSeq:
                eSite.eMicroTargets.append(microRNA.id)
                print('@ eFlank %s %s %s %s' % (eSite.ID, microRNA.id,
                                                eFlankingSeq, comSeed))

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            targets = ','.join(eSite.microTargets) if eSite.microTargets else 'None'
            eTargets = ','.join(eSite.eMicroTargets) if eSite.eMicroTargets else 'None'
            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome,
                                                      eSite.coordinate, eSite.strand,
                                                      targets, eTargets))
def getMRNA(self, coding=False):
    '''Assemble this transcript's sequence from hg19 and return it with T -> U.

    coding=False (default): the region is the full span given by
    bioLibCG.tccSplit(self.tcc).
    coding=True: the region is bounded by the last entries of self.utr5 and
    self.utr3, falling back to the tcc span ends when a UTR list is empty
    (IndexError).

    Exon coordinates strictly inside the region are collected together with
    the two bounds, sorted, and consumed as consecutive (start, end) pairs;
    each pair is fetched and the pieces are concatenated. For strand '-1'
    the concatenation is reverse-complemented before the T -> U replacement.
    Any other strand value falls through and yields None.
    '''
    fetcher = GenomeFetch.GenomeFetch('hg19')

    if self.strand == '1':
        # Coding region starts right after the 5' UTR and ends right before
        # the 3' UTR.
        try:
            lowBound = self.utr5[-1][1] + 1
        except IndexError:
            lowBound = int(bioLibCG.tccSplit(self.tcc)[2])
        try:
            highBound = self.utr3[-1][0] - 1
        except IndexError:
            highBound = int(bioLibCG.tccSplit(self.tcc)[3])

        if not coding:
            _chrom, _strand, lowBound, highBound = bioLibCG.tccSplit(self.tcc)

        breakpoints = [lowBound, highBound]
        breakpoints.extend(
            coord
            for exon in self.exonList
            for coord in exon
            if lowBound < coord < highBound
        )
        breakpoints.sort()

        # Even-indexed entries open a segment, odd-indexed entries close it.
        # The +1 shift follows the original note that the fetcher is 1-based.
        pieces = [
            fetcher.get_seq_from_to(self.chromosome, left + 1, right + 1)
            for left, right in zip(breakpoints[0::2], breakpoints[1::2])
        ]
        return ''.join(pieces).replace('T', 'U')

    if self.strand == '-1':
        # Differences from the '1' strand: the UTR coordinates are read from
        # the opposite tuple slots, the exon filter uses the reversed
        # comparison, and the result is reverse-complemented.
        try:
            highBound = self.utr5[-1][0] - 1
        except IndexError:
            highBound = int(bioLibCG.tccSplit(self.tcc)[3])
        try:
            lowBound = self.utr3[-1][1] + 1
        except IndexError:
            lowBound = int(bioLibCG.tccSplit(self.tcc)[2])

        if not coding:
            _chrom, _strand, lowBound, highBound = bioLibCG.tccSplit(self.tcc)

        breakpoints = [highBound, lowBound]
        breakpoints.extend(
            coord
            for exon in self.exonList
            for coord in exon
            if highBound > coord > lowBound
        )
        breakpoints.sort()

        pieces = [
            fetcher.get_seq_from_to(self.chromosome, left + 1, right + 1)
            for left, right in zip(breakpoints[0::2], breakpoints[1::2])
        ]
        antisense = cgSeqMod.reverseComplementSequence(''.join(pieces), False)
        return antisense.replace('T', 'U')
def getMRNA(self, coding=False):
    """Build the transcript sequence from hg19 and return it with T replaced by U.

    With ``coding`` false (the default) the region is the whole span reported
    by ``bioLibCG.tccSplit(self.tcc)``. With ``coding`` true the region is
    delimited by the last ``self.utr5`` / ``self.utr3`` entries; when a UTR
    list is empty (IndexError) the matching end of the tcc span is used.

    Exon coordinates lying strictly inside the region are pooled with the two
    region bounds, sorted, and read off as consecutive (start, end) pairs that
    are fetched and concatenated. Strand "-1" results are reverse-complemented
    first. A strand other than "1" or "-1" returns None.
    """
    genome = GenomeFetch.GenomeFetch("hg19")

    forward = self.strand == "1"
    if not forward and self.strand != "-1":
        return None

    # "first" is where the region begins in transcript orientation, so on the
    # "-1" strand it is the numerically larger bound.
    if forward:
        try:
            first = self.utr5[-1][1] + 1  # coding starts right after the UTR
        except IndexError:
            first = int(bioLibCG.tccSplit(self.tcc)[2])
        try:
            last = self.utr3[-1][0] - 1
        except IndexError:
            last = int(bioLibCG.tccSplit(self.tcc)[3])
    else:
        try:
            first = self.utr5[-1][0] - 1
        except IndexError:
            first = int(bioLibCG.tccSplit(self.tcc)[3])
        try:
            last = self.utr3[-1][1] + 1
        except IndexError:
            last = int(bioLibCG.tccSplit(self.tcc)[2])

    if not coding:
        chrom, strand, spanStart, spanEnd = bioLibCG.tccSplit(self.tcc)
        if forward:
            first, last = spanStart, spanEnd
        else:
            first, last = spanEnd, spanStart

    edges = [first, last]
    for exon in self.exonList:
        for edge in exon:
            if forward:
                keep = first < edge < last
            else:
                keep = first > edge > last  # comparison flipped for "-1"
            if keep:
                edges.append(edge)
    edges.sort()

    # Walk the sorted edges two at a time; an unpaired trailing edge is ignored.
    segments = []
    for idx in range(0, len(edges) - 1, 2):
        # +1 on both ends: the original note says the fetcher is 1-based.
        segments.append(
            genome.get_seq_from_to(self.chromosome, edges[idx] + 1, edges[idx + 1] + 1)
        )

    sequence = "".join(segments)
    if not forward:
        sequence = cgSeqMod.reverseComplementSequence(sequence, False)
    return sequence.replace("T", "U")
def checkSeeds(editFN, contextFN, miLocationFN, miSequenceFN, gFN):
    '''Check whether each predicted microRNA site has its seed match nearby.

    For every (transcript, microRNA, location) triple in miLocationFN, the
    location is shifted by len(mRNA) - len(3'UTR sequence) and the reverse
    complement of the microRNA seed (bases [1:8] of its sequence) is searched
    for in the 3'UTR sequence, starting 25 positions before the shifted
    location. A hit counts as "found" when 0 < shifted location - hit < 30,
    otherwise as "not found"; triples with no hit at all are ignored.
    NOTE(review): the shift presumably converts an mRNA-based location into a
    3'UTR-based one, assuming the 3'UTR is the tail of the mRNA -- confirm.

    Results are printed to stdout: the two counts, a blank line, the found
    records, a blank line, then the not-found records. Each record is
    tab-separated: microRNA name, transcript name, hit index, shifted
    location, original location.

    Parameters:
      editFN       -- editing-site file, read by cgEdit.loadEditingSites
      contextFN    -- context file used to annotate the editing sites
      miLocationFN -- tab-separated: transcript name, microRNA name,
                      integer location
      miSequenceFN -- tab-separated: microRNA name (stored with an 'hsa-'
                      prefix added), sequence
      gFN          -- gene file, read by cgGenes3.createGeneSetEditing
    '''
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)  # puts the UTR, EXON in eSite.context
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    tName_t = {}
    for t in geneSet.transcripts:
        tName_t[t.id] = t

    miName_miSequence = {}
    with open(miSequenceFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            miName_miSequence['hsa-' + ls[0]] = ls[1]

    tName_miInfo = {}
    with open(miLocationFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tName_miInfo.setdefault(ls[0], []).append([ls[1], int(ls[2])])

    # Lists keep first-seen order for printing; the sets give O(1) de-duplication.
    foundIt = []
    notFoundIt = []
    seenFound = set()
    seenNotFound = set()

    for tName in tName_miInfo:
        try:
            t = tName_t[tName]
        except KeyError:
            # Transcript is not part of the loaded gene set.
            continue

        checkSeq = get3UTRSeq(t)
        try:
            mRNA = t.getMRNA()
        except Exception:
            # Best effort: skip transcripts whose sequence cannot be built,
            # but let KeyboardInterrupt / SystemExit propagate.
            continue

        offset = len(mRNA) - len(checkSeq)

        for miName, loc in tName_miInfo[tName]:
            try:
                miSequence = miName_miSequence[miName]
            except KeyError:
                # No sequence known for this microRNA.
                continue
            miSeed = miSequence[1:8]

            rcMiSeed = cgSeqMod.reverseComplementSequence(miSeed, True)
            newLoc = loc - offset

            # Clamp the window start at 0: str.find treats a negative start
            # as an offset from the END of the string, which would move the
            # search to the tail of checkSeq whenever newLoc < 25.
            finding = checkSeq.find(rcMiSeed, max(newLoc - 25, 0))
            if finding == -1:
                continue

            newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding, newLoc, loc)
            if 0 < newLoc - finding < 30:
                if newResult not in seenFound:
                    seenFound.add(newResult)
                    foundIt.append(newResult)
            else:
                # Extra diagnostics for one microRNA of interest.
                if miName == 'hsa-miR-21':
                    print('%s %s %s' % (loc, len(checkSeq), len(mRNA)))
                    print(mRNA)
                    print(checkSeq)
                if newResult not in seenNotFound:
                    seenNotFound.add(newResult)
                    notFoundIt.append(newResult)

    print(len(foundIt))
    print(len(notFoundIt))
    print('')
    for i in foundIt:
        print(i)
    print('')
    for i in notFoundIt:
        print(i)