def countBySeed(editFN, microFN, flankAmount, outFN):
    '''Tally microRNA seed matches around each editing site, before and after editing.

    For every editing site the hg19 sequence from ``coord - flankAmount`` to
    ``coord + flankAmount`` is fetched, an edited copy is made with the
    central base replaced by 'G', and both are converted T -> U.  Each
    microRNA whose reverse-complemented seed occurs in the unedited flank is
    appended to ``eSite.microTargets`` and has ``numBefore`` incremented;
    matches in the edited flank go to ``eSite.eMicroTargets`` / ``numAfter``.

    editFN      -- editing-site file, read by cgEdit.loadEditingSites
    microFN     -- microRNA file, read by cgMicroRNA.loadMicroRNAFromTargetScan
    flankAmount -- bases on each side of the site (anything int() accepts);
                   the original note recommends +/- 6
    outFN       -- output path; one tab-separated line per site:
                   ID, chrom:coord, strand, targets, edited targets
                   ('None' when a list is empty)
    '''
    flankAmount = int(flankAmount)
    eSites = cgEdit.loadEditingSites(editFN)
    micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
    gf = GenomeFetch.GenomeFetch('hg19')

    # Hits for this one microRNA are echoed to stdout for inspection.
    checkID = 'hsa-miR-330-5p'

    # The complemented seed depends only on the microRNA, so compute it once
    # instead of once per (site, microRNA) pair.
    seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                 for microRNA in micros]

    for eSite in eSites:
        coord = eSite.coordinate
        flankingSeq = gf.get_seq_from_to(eSite.chromosome, coord - flankAmount,
                                         coord + flankAmount, eSite.strand)
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]

        # str.replace returns a new string; the result has to be kept or the
        # T -> U conversion silently does nothing.
        flankingSeq = flankingSeq.replace('T', 'U')
        eFlankingSeq = eFlankingSeq.replace('T', 'U')

        for microRNA, comSeed in seedPairs:
            if comSeed in flankingSeq:
                eSite.microTargets.append(microRNA.id)
                if microRNA.id == checkID:
                    print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate,
                                           eSite.strand, eSite.gene))
                microRNA.numBefore += 1

            if comSeed in eFlankingSeq:
                microRNA.numAfter += 1
                if microRNA.id == checkID:
                    print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate,
                                           eSite.strand, eSite.gene))
                eSite.eMicroTargets.append(microRNA.id)

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            if eSite.microTargets:
                targets = ','.join(eSite.microTargets)
            else:
                targets = 'None'

            if eSite.eMicroTargets:
                eTargets = ','.join(eSite.eMicroTargets)
            else:
                eTargets = 'None'

            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome,
                                                    eSite.coordinate, eSite.strand,
                                                    targets, eTargets))
def getFolded(fN):
    '''Print each editing site as a '> ID' header line followed by its sequence.

    fN -- editing-site file, read by cgEdit.loadEditingSites

    The sequence is the hg19 stretch from 200 below to 200 above the site
    coordinate, fetched on the site's strand.
    '''
    sites = cgEdit.loadEditingSites(fN)
    fetcher = GenomeFetch.GenomeFetch('hg19')

    for site in sites:
        # Get +/- 200 bp around the site
        centre = site.coordinate
        window = fetcher.get_seq_from_to(site.chromosome, centre - 200,
                                         centre + 200, site.strand)
        print('> %s' % (site.ID,))
        print(window)
def getEditInfo(fN, idList):
    '''Print ID, location, gene and editing ratio for selected editing sites.

    fN     -- editing-site file, read by cgEdit.loadEditingSites
    idList -- path to a tab-separated file whose first column holds integer
              editing-site IDs, one per line; blank lines are skipped

    Raises KeyError if a listed ID is not among the loaded sites.
    '''
    eID_eSite = dict((eSite.ID, eSite) for eSite in cgEdit.loadEditingSites(fN))

    wantedIDs = []
    with open(idList, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A trailing empty line would otherwise crash int('').
                continue
            wantedIDs.append(int(line.split('\t')[0]))

    for eID in wantedIDs:
        eSite = eID_eSite[eID]
        print('%s %s:%s %s %s' % (eSite.ID, eSite.chromosome, eSite.coordinate,
                                  eSite.gene, eSite.eRatio))
def updateMicroTargets(editFN, microFN, flankAmount, outFN):
    '''Record which microRNA seeds match each editing site before and after editing.

    For every editing site the hg19 sequence from ``coord - flankAmount`` to
    ``coord + flankAmount`` is fetched and an edited copy is made with the
    central base replaced by 'G'.  MicroRNAs whose reverse-complemented seed
    occurs in the unedited flank are appended to ``eSite.microTargets``;
    those matching the edited flank to ``eSite.eMicroTargets``.  Each match
    is also echoed to stdout on a line starting with '@'.

    editFN      -- editing-site file, read by cgEdit.loadEditingSites
    microFN     -- microRNA file, read by cgMicroRNA.loadMicroRNAFromTargetScan
    flankAmount -- bases on each side of the site (anything int() accepts);
                   the original note recommends +/- 6
    outFN       -- output path; one tab-separated line per site:
                   ID, chrom:coord, strand, targets, edited targets
                   ('None' when a list is empty)
    '''
    flankAmount = int(flankAmount)
    eSites = cgEdit.loadEditingSites(editFN)
    micros = cgMicroRNA.loadMicroRNAFromTargetScan(microFN, 'hsa')
    gf = GenomeFetch.GenomeFetch('hg19')

    # The complemented seed depends only on the microRNA; compute it once.
    seedPairs = [(microRNA, cgSeqMod.reverseComplementSequence(microRNA.seed, True))
                 for microRNA in micros]

    for eSite in eSites:
        coord = eSite.coordinate
        flankingSeq = gf.get_seq_from_to(eSite.chromosome, coord - flankAmount,
                                         coord + flankAmount, eSite.strand)
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]
        # NOTE(review): updateValidatedMicroTargets converts its flanks T -> U
        # before matching; this function does not -- confirm which alphabet
        # the complemented seeds use.

        for microRNA, comSeed in seedPairs:
            if comSeed in flankingSeq:
                eSite.microTargets.append(microRNA.id)
                print('@ flank %s %s %s %s' % (eSite.ID, microRNA.id,
                                               flankingSeq, comSeed))
            if comSeed in eFlankingSeq:
                eSite.eMicroTargets.append(microRNA.id)
                print('@ eFlank %s %s %s %s' % (eSite.ID, microRNA.id,
                                                eFlankingSeq, comSeed))

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            if eSite.microTargets:
                targets = ','.join(eSite.microTargets)
            else:
                targets = 'None'

            if eSite.eMicroTargets:
                eTargets = ','.join(eSite.eMicroTargets)
            else:
                eTargets = 'None'

            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome,
                                                    eSite.coordinate, eSite.strand,
                                                    targets, eTargets))
def overlapWithDegradome(dFN, eFN):
    '''Print how many overlaps exist between editing sites and widened degradome regions.

    dFN -- tab-separated degradome file; the second column holds a tcc
           string that bioLibCG.tccSplit can parse
    eFN -- editing-site file, read by cgEdit.loadEditingSites

    Each degradome region is widened by 3 on both ends before comparison.
    The first five widened tccs are printed, then the length of the result
    of compareData.compareTwoTcc.
    '''
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    with open(dFN, 'r') as f:  # closed even if a line fails to parse
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))
def overlapWithDegradome(dFN, eFN):
    '''Print how many overlaps exist between editing sites and widened degradome regions.

    NOTE(review): a function with this same name and logic is defined
    earlier in this file; this later definition is the one bound at import
    time.  One of the two can probably be removed.

    dFN -- tab-separated degradome file; the second column holds a tcc
           string that bioLibCG.tccSplit can parse
    eFN -- editing-site file, read by cgEdit.loadEditingSites

    Each degradome region is widened by 3 on both ends before comparison.
    The first five widened tccs are printed, then the length of the result
    of compareData.compareTwoTcc.
    '''
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    with open(dFN, "r") as f:  # closed even if a line fails to parse
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))

    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))
def updateLocationBasedTargets(editFN, contextFN, miLocationFN, gFN):
    '''Print 3'UTR editing sites lying within 22 positions upstream of a listed microRNA location.

    editFN       -- editing-site file, read by cgEdit.loadEditingSites
    contextFN    -- context file passed to cgEdit.updateContextEditingSites
    miLocationFN -- tab-separated file: transcript name, microRNA name,
                    integer location
    gFN          -- gene-set file, read by cgGenes3.createGeneSetEditing

    For each site whose context contains '3UTR', and each of its transcripts
    that has microRNA entries, the site's mRNA-relative position is compared
    with every entry's location ``loc``; a hit is ``loc - 22 <= pos <= loc``.
    Every comparison is printed, and hits are printed a second time with the
    genomic location.
    '''
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)  # puts the UTR, EXON in eSite.context
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    tName_t = dict((t.id, t) for t in geneSet.transcripts)

    tName_miInfo = {}  # transcript name -> [[microRNA name, location], ...]
    with open(miLocationFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tName_miInfo.setdefault(ls[0], []).append([ls[1], int(ls[2])])

    for eSite in eSites:
        if '3UTR' not in eSite.context:
            continue

        for tName in eSite.transcripts:
            if tName not in tName_miInfo:
                continue

            t = tName_t[tName]
            # Position of the site in this transcript's mRNA.  It does not
            # depend on the microRNA entry, so it is computed once here.
            ePosition = t.getRelativePositionMRNA(eSite.coordinate, coding = False)

            for miName, loc in tName_miInfo[tName]:
                print('%s %s %s %s' % (tName, miName, loc, ePosition))
                if loc - 22 <= ePosition <= loc:
                    print('%s %s %s:%s' % (tName, miName, eSite.chromosome,
                                           eSite.coordinate))
def makeTable(fN, eFN):
    '''Print a tab-separated table joining rows of fN with their editing sites.

    fN  -- tab-separated file; column 1 is an integer editing-site ID,
           column 2 a gene name, columns 9 and 10 are copied through as-is
    eFN -- editing-site file, read by cgEdit.loadEditingSites

    Output columns: gene name, site tcc, site eRatio, column 9, column 10.
    Raises KeyError if a row names an ID that was not loaded.
    '''
    eID_eSite = dict((eSite.ID, eSite) for eSite in cgEdit.loadEditingSites(eFN))

    with open(fN, 'r') as f:  # closed even if a row is malformed
        for line in f:
            ls = line.strip().split('\t')
            eSite = eID_eSite[int(ls[0])]
            gName, b, a = ls[1], ls[8], ls[9]
            print('%s\t%s\t%s\t%s\t%s' % (gName, eSite.tcc, eSite.eRatio, b, a))
def makeTargetExpressionHistogram(eFN, targetFN, contextFN, geneFN, eChangeFN):
    '''Plot a histogram of log2 expression change for genes with altered 3'UTR target sites.

    eFN       -- editing-site file, read by cgEdit.loadEditingSites
    targetFN  -- tab-separated target file; a row is selected when column 4
                 is not 'None' and column 5 is 'None'
    contextFN -- tab-separated context file: site ID, transcript ID,
                 transcript type
    geneFN    -- gene-set file, read by cgGenes3.createGeneSetEditing
    eChangeFN -- expression file, read by getERatioDict (gene name -> ratio)

    A selected site contributes only when its 3'UTR transcripts belong to
    exactly one gene; each gene is counted once.  Genes missing from the
    expression dict are reported and skipped.
    '''
    print('loading expression ratios')
    gName_eChange = getERatioDict(eChangeFN)

    print('loading eSites and Transcripts')
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(geneFN)

    print('making joint dicts and loading extra data')
    eID_eSite = dict((eSite.ID, eSite) for eSite in eSites)
    tID_gName = dict((transcript.id, transcript.parent)
                     for transcript in geneSet.transcripts)

    # load context data
    eID_tID = {}    # eID -> [tID, ...]
    tID_tType = {}  # tID -> transcript type
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tID = ls[1]
            tID_tType[tID] = ls[2]
            eID_tID.setdefault(int(ls[0]), []).append(tID)

    print('analyzing')
    # NOTE(review): the message below says "created/destroyed", but the
    # condition only selects rows with targets before and none after.
    altered = []
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if (ls[3] != 'None') and (ls[4] == 'None'):
                altered.append(int(ls[0]))

    print('number of created/destroyed sites: %s' % (len(altered),))

    alteredSites = [eID_eSite[eID] for eID in altered]

    eChanges = []
    gDone = set()  # genes already added to the histogram

    for eSite in alteredSites:
        # Collect the distinct genes whose 3'UTR transcripts contain the site.
        genes = []
        for tID in eID_tID[eSite.ID]:
            if tID == 'NONE':
                continue
            gName = tID_gName[tID]
            if tID_tType[tID] != '3UTR':
                print('Not 3UTR %s' % (tID,))
                continue
            if gName not in genes:
                genes.append(gName)

        if len(genes) > 1:
            print('more than one gene for eSite... %s' % (genes,))
            continue
        if not genes:
            # No 3'UTR transcript for this site.  The loop variable must not
            # be reused here: it may hold a non-3'UTR gene, a gene from the
            # previous site, or nothing at all.
            continue

        gName = genes[0]
        if gName in gDone:
            continue
        gDone.add(gName)

        # Now add expression to the histogram list
        if gName not in gName_eChange:
            print('gene not in expression list %s' % (gName,))
            continue
        eChanges.append(math.log(gName_eChange[gName], 2))

    # Now plot the histogram
    plt.hist(eChanges, 40)
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('# Genes')
    plt.show()
def updateSynonomous(eFN, gFN, resultsFN, outFN):
    '''Append synonymous/nonsynonymous annotation columns to a results file.

    eFN       -- editing-site file, read by cgEdit.loadEditingSites
    gFN       -- gene-set file, read by cgGenes3.createGeneSetEditing
    resultsFN -- tab-separated file: site ID in column 1, transcript ID in
                 column 3, coding flag in column 5 ('C' marks coding)
    outFN     -- output path; every input line is copied with five extra
                 columns: SYN/NON flag, codon before, codon after, amino
                 acid before, amino acid after ('NA' x5 when the
                 transcript was not analysed)

    For each coding (transcript, site) pair the site's position in the
    coding mRNA is looked up at ``coordinate - 1``, the base there is set to
    'G', and the affected codon is translated before and after.

    NOTE(review): this function previously referenced undefined names
    (eJoint, tJoint, tID[...], codingT_eSite) and could not run; they are
    mapped here onto the ID dictionaries the function itself builds.
    '''
    # Load Transcripts and Editing Sites
    print('Loading editing sites')
    eSites = cgEdit.loadEditingSites(eFN)
    print('Loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    # Get coding transcripts: tID -> eID (many:one)
    codingTranscripts = {}
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[4] == 'C':
                codingTranscripts[ls[2]] = int(ls[0])

    eID_eSite = dict((eSite.ID, eSite) for eSite in eSites)
    tID_transcript = dict((transcript.id, transcript)
                          for transcript in geneSet.transcripts)

    print('Creating scroll dict')
    scrollDict = {}  # transcript -> eSite
    for tID in codingTranscripts:
        e = eID_eSite[codingTranscripts[tID]]
        t = tID_transcript.get(tID)
        if t is None:
            # transcript named in the results file but absent from the gene set
            continue
        scrollDict[t] = e

    print('Deducing synonomous')
    codonMap = cgSeqMod.loadCodonMap('hg19')
    finalDict = {}  # tID -> [SYN/NON, codon before, codon after, aa before, aa after]

    # Figure out if they are synonymous
    for t in scrollDict:
        eSite = scrollDict[t]

        ePositionInMRNA = t.getRelativePositionMRNA(eSite.coordinate - 1)
        if ePositionInMRNA == -1:
            print('%s %s' % (t.id, 'should not be designated coding...'))
            continue

        # grab mRNA and build the edited copy
        mRNA = t.getMRNA(coding=True)
        if mRNA[ePositionInMRNA] != 'A':
            print('wrong position %s %s:%s %s %s %s %s' % (
                t.id, eSite.chromosome, eSite.coordinate, eSite.strand,
                mRNA[ePositionInMRNA - 5:ePositionInMRNA - 1],
                mRNA[ePositionInMRNA],
                mRNA[ePositionInMRNA + 1:ePositionInMRNA + 5]))

        # edit the site
        emRNA = mRNA[:ePositionInMRNA] + 'G' + mRNA[ePositionInMRNA + 1:]

        # Sanity-check the unedited protein sequence
        pRNA = cgSeqMod.translateRNA(mRNA, codonMap)
        if pRNA[0] != 'M':
            print('Non-canonical Start AA: %s %s' % (pRNA[0:5], mRNA[:10]))
        if pRNA[-1] != '*':
            print('Non-canonical End AA: %s %s' % (pRNA[-5:], mRNA[-10:]))

        # compare the codons
        mCodonList = cgSeqMod.getCodonListFromRNA(mRNA)
        emCodonList = cgSeqMod.getCodonListFromRNA(emRNA)
        codonNumber = ePositionInMRNA // 3

        print(t.id)
        print(eSite.ID)
        print(mCodonList[:codonNumber])
        print(mRNA[:ePositionInMRNA])

        bCodon = mCodonList[codonNumber]
        aCodon = emCodonList[codonNumber]
        baa = cgSeqMod.translateRNA(bCodon, codonMap)
        aaa = cgSeqMod.translateRNA(aCodon, codonMap)

        synFlag = 'SYN'
        if baa != aaa:
            synFlag = 'NON'
            # A G may only appear where the original base was A (or already G).
            for before, after in zip(bCodon, aCodon):
                if before != 'A' and after == 'G' and before != 'G':
                    print('messed up codon switch %s %s' % (list(bCodon),
                                                            list(aCodon)))
            print('%s %s:%s %s %s %s %s %s' % (
                t.parent, eSite.chromosome, eSite.coordinate, eSite.strand,
                bCodon, aCodon, baa, aaa))

        finalDict[t.id] = [synFlag, bCodon, aCodon, baa, aaa]

    print('writing to file')
    # update line by line
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            newLine = line.strip()
            tID = newLine.split('\t')[2]
            if tID in finalDict:
                newLine += '\t%s\t%s\t%s\t%s\t%s\n' % tuple(finalDict[tID])
            else:
                newLine += '\tNA\tNA\tNA\tNA\tNA\n'
            newLines.append(newLine)

    # update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
def updateContext(editFN, geneSetFN, outFN, refBase = 'A'):
    '''Write the transcript context of every editing site to outFN.

    editFN    -- editing-site file, read by cgEdit.loadEditingSites
    geneSetFN -- gene-set file, read by cgGenes3.createGeneSetEditing
    outFN     -- output path; tab-separated rows of site ID, transcript
                 parent, transcript ID, element type, 'C'/'NC' flag,
                 transcript type
    refBase   -- forwarded to cgEdit.loadEditingSites

    Site coordinates are decremented by one (and the tcc rebuilt) before
    overlapping with transcripts.  A site with no overlapping transcript is
    written as an 'INTER' / 'NC' row with 'NONE' placeholders.
    '''
    print(refBase)

    # Load Transcripts and Editing Sites
    print('loading editing sites')
    sites = cgEdit.loadEditingSites(editFN, refBase)
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    # make the sites 0 based: redo coordinate and tcc
    for site in sites:
        site.coordinate -= 1
        site.tcc = bioLibCG.makeTcc(site.chromosome, site.strand,
                                    site.coordinate, site.coordinate)

    print('creating joint dictionaries')
    siteByTcc = dict((site.tcc, site) for site in sites)
    transcriptsByTcc = {}  # tcc -> [transcript, ...]
    for tr in geneSet.transcripts:
        transcriptsByTcc.setdefault(tr.tcc, []).append(tr)

    print('overlapping joints')
    overlaps = compareData.getIndividualOverlaps(siteByTcc.keys(),
                                                 transcriptsByTcc.keys(), 1)

    print('creating final dictionary')
    siteTranscripts = {}  # edit site -> [transcript, ...]
    for siteTcc in overlaps:
        hits = []
        siteTranscripts[siteByTcc[siteTcc]] = hits
        for trTcc in overlaps[siteTcc]:
            hits.extend(transcriptsByTcc[trTcc])

    print('get context info')
    rowFmt = '%s\t%s\t%s\t%s\t%s\t%s\n'
    with open(outFN, 'w') as fOut:
        for site in siteTranscripts:
            hits = siteTranscripts[site]
            if not hits:
                # label intergenic
                fOut.write(rowFmt % (site.ID, 'NONE', 'NONE', 'INTER', 'NC', 'NONE'))
                continue

            for tr in hits:
                isCoding = '_coding' in tr.tType
                elementTypes = [element[1]
                                for element in tr.getOverlappingElements(site.tcc)]

                # UTR labels win; otherwise take the first element type
                # (has to be one thing... exon or intron).
                if '3UTR' in elementTypes:
                    tType = '3UTR'
                elif '5UTR' in elementTypes:
                    tType = '5UTR'
                else:
                    tType = elementTypes[0]

                # Only an EXON hit on a coding transcript counts as coding;
                # this relies on UTR taking precedence over EXON above.
                codingFlag = 'C' if (tType == 'EXON' and isCoding) else 'NC'

                fOut.write(rowFmt % (site.ID, tr.parent, tr.id, tType,
                                     codingFlag, tr.tType))
def newExpression(eFN, gFN, contextFN, targetFN, rpkmFN):
    """Plot expression-change eCDFs for genes that gain or lose 3'UTR target sites.

    eFN       -- editing-site file, read by cgEdit.loadEditingSites
    gFN       -- gene-set file, read by cgGenes3.createGeneSetEditing
    contextFN -- tab-separated context file: site ID (col 1), gene ID
                 (col 2), element type (col 4)
    targetFN  -- tab-separated target file: site ID (col 1), comma-separated
                 microRNAs before (col 4) and after (col 5) editing, with
                 'None' meaning no targets
    rpkmFN    -- expression file, read by getERatioDict (gene ID -> ratio)

    A gene is 'created' when its 3'UTR sites carry more targets after
    editing than before, 'destroyed' in the opposite case.  The before/after
    lists of those genes are printed, then cumulative histograms of
    log2(ratio) are plotted for both groups.  Genes absent from the gene set
    or from the expression dict are skipped.
    """
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    eID_eSite = dict((eSite.ID, eSite) for eSite in eSites)

    gID_eIDs = {}    # gene ID -> [site ID, ...] without repeats
    eID_tTypes = {}  # site ID -> [element type, ...]
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            eIDs = gID_eIDs.setdefault(ls[1], [])
            if eID not in eIDs:
                eIDs.append(eID)
            eID_tTypes.setdefault(eID, []).append(ls[3])

    # update targeting for eSites
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eSite = eID_eSite[int(ls[0])]
            eSite.before = [] if ls[3] == 'None' else ls[3].split(',')
            eSite.after = [] if ls[4] == 'None' else ls[4].split(',')

    # gene -> its editing sites
    gene_eSites = {}
    for gID in gID_eIDs:
        try:
            gene = geneSet.set[gID]
        except KeyError:
            # Not in the gene set.  Falling through would reuse the gene
            # from the previous iteration and overwrite its site list.
            continue
        gene_eSites[gene] = [eID_eSite[eID] for eID in gID_eIDs[gID]]

    # update before/after editing target sites for GENES
    createdGenes = []
    destroyedGenes = []
    for gene in gene_eSites:
        gene.before = []
        gene.after = []
        for eSite in gene_eSites[gene]:
            if '3UTR' not in eID_tTypes[eSite.ID]:
                continue
            for micro in eSite.before:
                gene.before.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))
            for micro in eSite.after:
                gene.after.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))

        if len(gene.before) > len(gene.after):
            destroyedGenes.append(gene)
        if len(gene.after) > len(gene.before):
            createdGenes.append(gene)

    # print the created/destroyed sites
    for gene in createdGenes:
        print(gene.before)
        print(gene.after)
    for gene in destroyedGenes:
        print(gene.before)
        print(gene.after)

    gName_ratio = getERatioDict(rpkmFN)

    def log2Ratios(genes):
        # log2 expression ratio of every gene that has one
        values = []
        for gene in genes:
            if gene.id not in gName_ratio:
                # not in RPKM file --> not expressed; must be skipped, or the
                # previous gene's ratio would be appended a second time
                continue
            values.append(math.log(gName_ratio[gene.id], 2))
        return values

    eChanges = log2Ratios(createdGenes)
    eChanges2 = log2Ratios(destroyedGenes)

    # Now plot the histogram
    # NOTE(review): 'normed' was replaced by 'density' in newer Matplotlib
    # releases; kept as-is for the version this script targets.
    plt.hist(eChanges, 40, cumulative = True, histtype = 'step', normed = True,
             label = 'Created')
    plt.hist(eChanges2, 40, cumulative = True, histtype = 'step', normed = True,
             label = 'Destroyed')
    plt.legend()
    plt.title('eCDF of Genes with Target Site Changes in 3UTRs')
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('Fraction of Genes')
    plt.show()
def updateValidatedMicroTargets(editFN, microTargetFN, microSequenceFN, outFN, gFN):
    '''Record, per editing site, which validated microRNAs match before and after editing.

    editFN          -- editing-site file, read by cgEdit.loadEditingSites
    microTargetFN   -- validated-target file   } both read by
    microSequenceFN -- microRNA sequence file  } cgMicroRNA.loadMicroRNAFromValidated
    outFN           -- output path; one tab-separated line per site:
                       ID, chrom:coord, strand, targets before, targets
                       after ('None' when a list is empty)
    gFN             -- passed through to checkIfSeedPresent

    Only microRNAs that list the site's gene among their target genes are
    tested.  The flank is the hg19 sequence 6 bases either side of the site,
    converted T -> U; the edited flank has its central base set to 'G'.
    Matching microRNA names are appended to ``eSite.before`` /
    ``eSite.after``.  Diagnostic lines are printed for every tested pair.
    '''
    flankAmount = 6
    eSites = cgEdit.loadEditingSites(editFN)
    # NOTE(review): other callers in this file pass a context file as a
    # second argument to updateContextEditingSites -- confirm this call.
    cgEdit.updateContextEditingSites(eSites)
    miNames_micros = cgMicroRNA.loadMicroRNAFromValidated(microTargetFN, microSequenceFN)
    gf = GenomeFetch.GenomeFetch('hg19')

    # update flanking region
    for eSite in eSites:
        coord = eSite.coordinate
        flankingSeq = gf.get_seq_from_to(eSite.chromosome, coord - flankAmount,
                                         coord + flankAmount, eSite.strand)
        eFlankingSeq = flankingSeq[:flankAmount] + 'G' + flankingSeq[flankAmount + 1:]
        eSite.flank = flankingSeq.replace('T', 'U')
        eSite.eFlank = eFlankingSeq.replace('T', 'U')

    # gene name -> microRNAs that list it as a target (no repeats)
    gName_micros = {}
    for micro in miNames_micros.values():
        for target in micro.targetGenes:
            microsForGene = gName_micros.setdefault(target, [])
            if micro not in microsForGene:
                microsForGene.append(micro)

    gene_m = {}  # gene name -> microRNAs with a usable seed that were tested
    for eSite in eSites:
        sharedMicros = gName_micros.get(eSite.gene)
        if sharedMicros is None:
            continue

        for micro in sharedMicros:
            print('')
            print('%s %s %s %s' % (micro.name, eSite.gene, micro.sequence, micro.seed))
            print(micro.comSeed)
            print('%s %s' % (eSite.flank, eSite.eFlank))
            print('%s:%s %s %s' % (eSite.chromosome, eSite.coordinate,
                                   eSite.flank, eSite.gene))

            if micro.comSeed is None:
                # no complemented seed available for this microRNA
                continue

            tested = gene_m.setdefault(eSite.gene, [])
            if micro not in tested:
                tested.append(micro)

            # flanking
            if micro.comSeed in eSite.flank:
                eSite.before.append(micro.name)
            if micro.comSeed in eSite.eFlank:
                eSite.after.append(micro.name)

    print(len(gene_m))
    count = 0
    for g in gene_m:
        print(g)
        for m in gene_m[g]:
            print('... %s' % (m.name,))
            count += 1
    print(count)

    checkIfSeedPresent(gene_m, gFN)

    # Write contents to file; the with-block guarantees the handle is closed.
    with open(outFN, 'w') as outF:
        for eSite in eSites:
            if eSite.before:
                targets = ','.join(eSite.before)
            else:
                targets = 'None'

            if eSite.after:
                eTargets = ','.join(eSite.after)
            else:
                eTargets = 'None'

            outF.write('%s\t%s:%s\t%s\t%s\t%s\n' % (eSite.ID, eSite.chromosome,
                                                    eSite.coordinate, eSite.strand,
                                                    targets, eTargets))
def betterSynonymous(eFN, gFN, contextFN, outFN, refBase='A', eBase='G'):
    '''Classify coding editing sites as synonymous or nonsynonymous.

    eFN       -- editing-site file, read by cgEdit.loadEditingSites
    gFN       -- gene-set file, read by cgGenes3.createGeneSetEditing
    contextFN -- tab-separated context file: site ID (col 1), transcript ID
                 (col 3), element type (col 4), coding flag (col 5)
    outFN     -- output path; tab-separated rows of site ID, transcript
                 parent, transcript ID, SYN/NON, codon before, codon after,
                 amino acid before, amino acid after
    refBase   -- base expected at the site before editing
    eBase     -- base written at the site to model the edit

    Only (site, transcript) pairs flagged 'C' whose transcript type does not
    contain '_noncoding' are analysed.  The site's position in the coding
    mRNA is looked up at ``coordinate - 1``.
    '''
    print('loading e sites')
    eSites = cgEdit.loadEditingSites(eFN)
    print('loading geneSet')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    contextInfo = {}  # eID -> {tID: [element type, coding flag]}
    eID_tIDs = {}     # eID -> [tID, ...] in file order, no repeats
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            tID = ls[2]
            contextInfo.setdefault(eID, {})[tID] = [ls[3], ls[4]]
            tIDs = eID_tIDs.setdefault(eID, [])
            if tID not in tIDs:
                tIDs.append(tID)

    eID_eSite = dict((eSite.ID, eSite) for eSite in eSites)
    tID_transcript = dict((transcript.id, transcript)
                          for transcript in geneSet.transcripts)

    # site -> transcripts that are present in the gene set
    eSite_transcripts = {}
    for eID in eID_tIDs:
        eSite_transcripts[eID_eSite[eID]] = [
            tID_transcript[tID] for tID in eID_tIDs[eID]
            if tID != 'NONE' and tID_transcript.get(tID) is not None]

    codonMap = cgSeqMod.loadCodonMap('hg19')

    with open(outFN, 'w') as outF:
        for eSite in eSite_transcripts:
            for transcript in eSite_transcripts[eSite]:
                codingType = contextInfo[eSite.ID][transcript.id][1]
                if '_noncoding' in transcript.tType:
                    continue
                if codingType != 'C':
                    continue

                ePositionInMRNA = transcript.getRelativePositionMRNA(
                    eSite.coordinate - 1)
                if ePositionInMRNA == -1:
                    # updateSynonomous treats -1 as "site not in this mRNA".
                    # Indexing with it would silently edit the last base.
                    print('%s %s' % (transcript.id,
                                     'should not be designated coding...'))
                    continue

                mRNA = transcript.getMRNA(coding=True)
                if mRNA[ePositionInMRNA] != refBase:
                    print('Editing site was not an %s...' % (refBase,))

                # edit the site
                emRNA = mRNA[:ePositionInMRNA] + eBase + mRNA[ePositionInMRNA + 1:]

                # Sanity-check the unedited protein sequence
                pRNA = cgSeqMod.translateRNA(mRNA, codonMap)
                if pRNA[0] != 'M':
                    print('Non-canonical Start AA: %s %s' % (pRNA[0:5], mRNA[:10]))
                if pRNA[-1] != '*':
                    print('Non-canonical End AA: %s %s' % (pRNA[-5:], mRNA[-10:]))

                # compare the codons
                mCodonList = cgSeqMod.getCodonListFromRNA(mRNA)
                emCodonList = cgSeqMod.getCodonListFromRNA(emRNA)
                codonNumber = ePositionInMRNA // 3
                bCodon = mCodonList[codonNumber]
                aCodon = emCodonList[codonNumber]
                baa = cgSeqMod.translateRNA(bCodon, codonMap)
                aaa = cgSeqMod.translateRNA(aCodon, codonMap)

                synFlag = 'SYN'
                if baa != aaa:
                    synFlag = 'NON'
                    # The edited base may only appear where the original
                    # base was refBase (or was already eBase).
                    for before, after in zip(bCodon, aCodon):
                        if before != refBase and after == eBase and before != eBase:
                            print('messed up codon switch %s %s' % (list(bCodon),
                                                                    list(aCodon)))
                    # Was 't.parent', an undefined name in this function.
                    print('%s %s:%s %s %s %s %s %s' % (
                        transcript.parent, eSite.chromosome, eSite.coordinate,
                        eSite.strand, bCodon, aCodon, baa, aaa))

                outF.write('\t'.join([
                    str(eSite.ID), transcript.parent, transcript.id, synFlag,
                    bCodon, aCodon, baa, aaa
                ]) + '\n')
def updateContext(editFN, geneSetFN, outFN, refBase='A'):
    '''Annotate each editing site with the transcripts it overlaps and write the result.

    editFN    -- editing-site file, read by cgEdit.loadEditingSites
    geneSetFN -- gene-set file, read by cgGenes3.createGeneSetEditing
    outFN     -- output path; tab-separated rows of site ID, transcript
                 parent, transcript ID, element type, 'C'/'NC' flag,
                 transcript type
    refBase   -- forwarded to cgEdit.loadEditingSites

    Site coordinates are lowered by one and their tcc rebuilt before the
    overlap is computed.  Sites without any overlapping transcript get a
    single 'INTER' / 'NC' row with 'NONE' placeholders.
    '''

    def describe(transcript, siteTcc):
        # Element type and coding flag of one transcript at the site.
        codingTranscript = '_coding' in transcript.tType
        found = [x[1] for x in transcript.getOverlappingElements(siteTcc)]
        # UTR takes precedence; otherwise it has to be one thing (exon or
        # intron), so the first entry is used.
        kind = found[0]
        for utr in ('5UTR', '3UTR'):
            if utr in found:
                kind = utr
        # Coding only for an EXON hit on a coding transcript; this works
        # because UTR takes precedence over EXON above.
        if kind == 'EXON' and codingTranscript:
            return kind, 'C'
        return kind, 'NC'

    print(refBase)

    # Load Transcripts and Editing Sites
    print('loading editing sites')
    eSites = cgEdit.loadEditingSites(editFN, refBase)
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    # make the eSites 0 based
    for e in eSites:
        shifted = e.coordinate - 1
        e.coordinate = shifted
        e.tcc = bioLibCG.makeTcc(e.chromosome, e.strand, shifted, shifted)

    # Create joint dictionaries
    print('creating joint dictionaries')
    tcc2site = {}
    for e in eSites:
        tcc2site[e.tcc] = e

    tcc2transcripts = {}
    for tr in geneSet.transcripts:
        if tr.tcc not in tcc2transcripts:
            tcc2transcripts[tr.tcc] = []
        tcc2transcripts[tr.tcc].append(tr)

    # Overlap tccs
    print('overlapping joints')
    tccOverlaps = compareData.getIndividualOverlaps(tcc2site.keys(),
                                                    tcc2transcripts.keys(), 1)

    print('creating final dictionary')
    site2transcripts = {}
    for eTcc in tccOverlaps:
        collected = []
        for tTcc in tccOverlaps[eTcc]:
            collected += tcc2transcripts[tTcc]
        site2transcripts[tcc2site[eTcc]] = collected

    print('get context info')
    with open(outFN, 'w') as fOut:
        for e in site2transcripts:
            transcripts = site2transcripts[e]

            if len(transcripts) == 0:
                # label intergenic
                fields = (e.ID, 'NONE', 'NONE', 'INTER', 'NC', 'NONE')
                fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % fields)
                continue

            for tr in transcripts:
                tType, codingFlag = describe(tr, e.tcc)
                fields = (e.ID, tr.parent, tr.id, tType, codingFlag, tr.tType)
                fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % fields)
def checkSeeds(editFN, contextFN, miLocationFN, miSequenceFN, gFN):
    '''Check whether each listed microRNA's seed complement occurs near its stated 3'UTR location.

    editFN       -- editing-site file, read by cgEdit.loadEditingSites
    contextFN    -- context file passed to cgEdit.updateContextEditingSites
    miLocationFN -- tab-separated file: transcript name, microRNA name,
                    integer location within the mRNA
    miSequenceFN -- tab-separated file: microRNA name (without the 'hsa-'
                    prefix, which is added here) and sequence
    gFN          -- gene-set file, read by cgGenes3.createGeneSetEditing

    For every (transcript, microRNA, location) entry the location is shifted
    into 3'UTR coordinates by subtracting ``len(mRNA) - len(3'UTR)``, and the
    reverse complement of sequence[1:8] is searched for in the 3'UTR starting
    25 positions before the shifted location.  Entries whose match starts
    fewer than 30 positions before the shifted location are reported as
    found; entries with no match as not found.  Counts are printed first,
    then both lists.
    '''
    # NOTE(review): the editing sites are loaded and annotated here but are
    # not used by the rest of this function.
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)  # puts the UTR, EXON in eSite.context
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    tName_t = dict((t.id, t) for t in geneSet.transcripts)

    miName_miSequence = {}
    with open(miSequenceFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            miName_miSequence['hsa-' + ls[0]] = ls[1]

    tName_miInfo = {}  # transcript name -> [[microRNA name, location], ...]
    with open(miLocationFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tName_miInfo.setdefault(ls[0], []).append([ls[1], int(ls[2])])

    foundIt = []
    notFoundIt = []
    for tName in tName_miInfo:
        if tName not in tName_t:
            continue
        t = tName_t[tName]

        checkSeq = get3UTRSeq(t)
        try:
            mRNA = t.getMRNA()
        except Exception:
            # Best-effort skip, as before, but no longer swallowing
            # KeyboardInterrupt / SystemExit.
            continue

        for miName, loc in tName_miInfo[tName]:
            if miName not in miName_miSequence:
                continue
            miSeed = miName_miSequence[miName][1:8]

            rcMiSeed = cgSeqMod.reverseComplementSequence(miSeed, True)
            newLoc = loc - (len(mRNA) - len(checkSeq))

            # Clamp the search start: a negative start makes str.find count
            # from the END of the string and skip most of the UTR.
            finding = checkSeq.find(rcMiSeed, max(0, newLoc - 25))

            if finding != -1:
                if 0 < newLoc - finding < 30:
                    newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding,
                                                        newLoc, loc)
                    if newResult not in foundIt:
                        foundIt.append(newResult)
            else:
                if miName == 'hsa-miR-21':
                    print('%s %s %s' % (loc, len(checkSeq), len(mRNA)))
                    print(mRNA)
                    print(checkSeq)
                newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding,
                                                    newLoc, loc)
                if newResult not in notFoundIt:
                    notFoundIt.append(newResult)

    print(len(foundIt))
    print(len(notFoundIt))
    print('')
    for i in foundIt:
        print(i)
    print('')
    for i in notFoundIt:
        print(i)