def test(gFN):
    """Print the id, coding mRNA and translation of every coding transcript.

    gFN is handed to cgGenes3.createGeneSetEditing to build the gene set.
    A transcript is processed only when "_coding" occurs in its tType.
    If anything raises while a transcript is being printed, the word
    "fail" is printed and the loop moves on to the next transcript.
    """
    # Not called 'map': that name would shadow the builtin.
    codonMap = cgSeqMod.loadCodonMap("hg19")
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    for transcript in geneSet.transcripts:
        if "_coding" not in transcript.tType:
            continue
        try:
            print(transcript.id)
            # Fetch the coding mRNA once and reuse it for the translation.
            mRNA = transcript.getMRNA(coding=True)
            print(mRNA)
            print(cgSeqMod.translateRNA(mRNA, codonMap))
        except Exception:
            # Best-effort dump: report and keep going. Catching Exception
            # rather than using a bare except keeps Ctrl-C / SystemExit
            # working.
            print("fail")
def test(gFN):
    """Dump every coding transcript of a gene set to stdout.

    The gene set is built by cgGenes3.createGeneSetEditing(gFN). For each
    transcript whose tType contains '_coding' three lines are printed: the
    transcript id, its coding mRNA, and the translation of that mRNA with
    the 'hg19' codon map. When any of those steps raises, 'fail' is printed
    instead and the next transcript is tried.
    """
    # Named codonMap so the builtin map() is not shadowed.
    codonMap = cgSeqMod.loadCodonMap('hg19')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    for transcript in geneSet.transcripts:
        if '_coding' not in transcript.tType:
            continue
        try:
            print(transcript.id)
            # One getMRNA call serves both the sequence dump and translation.
            codingRNA = transcript.getMRNA(coding=True)
            print(codingRNA)
            print(cgSeqMod.translateRNA(codingRNA, codonMap))
        except Exception:
            # Deliberately tolerant, but a bare except would also swallow
            # KeyboardInterrupt and SystemExit.
            print('fail')
def testit(gFN, coordinate=35872409):
    """Show where a genomic coordinate falls in each transcript's coding mRNA.

    gFN is passed to cgGenes3.createGeneSetEditing. For every transcript of
    every gene a blank line is printed first; then, if
    getRelativePositionMRNA(coordinate) is not -1, the transcript id, the
    relative position, the mRNA split around that position, and the
    translated protein are printed.

    coordinate defaults to the value that used to be hard-coded, so existing
    one-argument calls behave as before.
    Transcripts that raise while being processed are skipped silently.
    """
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    codonMap = cgSeqMod.loadCodonMap('hg19')  # avoid shadowing builtin map

    for gene in geneSet.genes:
        for transcript in gene.transcripts:
            try:
                print('')
                mRNA = transcript.getMRNA(coding=True)
                i = transcript.getRelativePositionMRNA(coordinate)
                if i == -1:
                    continue
                print(transcript.id)
                print(i)
                print('%s %s %s' % (mRNA[:i], mRNA[i], mRNA[i + 1:]))
                print(cgSeqMod.translateRNA(mRNA, codonMap))
            except Exception:
                # Skipping a bad transcript is intentional; narrowing from a
                # bare except lets KeyboardInterrupt/SystemExit propagate.
                continue
def updateLocationBasedTargets(editFN, contextFN, miLocationFN, gFN):
    """Print editing sites that fall inside a listed microRNA target window.

    editFN       -- editing-site file for cgEdit.loadEditingSites
    contextFN    -- context file for cgEdit.updateContextEditingSites
    miLocationFN -- tab-separated file; column 0 is a transcript name,
                    column 1 a microRNA name, column 2 an integer location
    gFN          -- gene-set file for cgGenes3.createGeneSetEditing

    Only sites whose context contains '3UTR' are considered. For each of the
    site's transcripts listed in miLocationFN, one line with the transcript,
    microRNA, location and site position is printed for every microRNA
    entry, followed by a second line when loc - 22 <= position <= loc.
    """
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)  # puts the UTR, EXON in eSite.context
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    tName_t = dict((t.id, t) for t in geneSet.transcripts)

    tName_miInfo = {}
    # 'with' guarantees the location file is closed even if a line is bad.
    with open(miLocationFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tName_miInfo.setdefault(ls[0], []).append([ls[1], int(ls[2])])

    for eSite in eSites:
        if '3UTR' not in eSite.context:
            continue
        for tName in eSite.transcripts:
            if tName not in tName_miInfo:
                continue
            t = tName_t[tName]
            # Position of the editing site in this transcript's mRNA; it does
            # not depend on the microRNA, so compute it once per transcript.
            ePosition = t.getRelativePositionMRNA(eSite.coordinate, coding=False)
            for miName, loc in tName_miInfo[tName]:
                print('%s %s %s %s' % (tName, miName, loc, ePosition))
                if loc - 22 <= ePosition <= loc:
                    print('%s %s %s:%s' % (tName, miName, eSite.chromosome,
                                           eSite.coordinate))
def getCoords(gFN):
    """Print the 3'UTR coordinates of each gene's longest-3'UTR transcript.

    gFN is passed to cgGenes3.createGeneSetEditing. For every gene in
    geneSet.set the transcript with the greatest summed 3'UTR length
    (end - start + 1 over its utr3 pairs) is selected; transcripts whose
    utr3 is None are ignored and genes without any usable 3'UTR are skipped.

    One tab-separated line is printed per gene: transcript id, parent,
    chromosome, strand, comma-joined sorted starts, comma-joined sorted ends.
    """
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    for gName in geneSet.set:
        gene = geneSet.set[gName]

        # find the longest utr...
        longestT = None
        longest = 0
        for transcript in gene.transcripts:
            if transcript.utr3 is None:
                continue
            utrLength = sum(pair[1] - pair[0] + 1 for pair in transcript.utr3)
            if utrLength > longest:
                # Record the new maximum; without this every transcript with
                # a non-empty UTR would replace the previous candidate.
                longest = utrLength
                longestT = transcript

        if longestT is None:
            continue

        # Get the coordinate ends/etc
        starts = sorted(pair[0] for pair in longestT.utr3)
        ends = sorted(pair[1] for pair in longestT.utr3)
        startS = ",".join([str(x) for x in starts])
        endS = ",".join([str(x) for x in ends])

        # Report the selected transcript, not whichever one the search loop
        # happened to end on.
        print("%s\t%s\t%s\t%s\t%s\t%s" % (
            longestT.id,
            longestT.parent,
            longestT.chromosome,
            longestT.strand,
            startS,
            endS,
        ))
def checkIfSeedPresent(gName_micros, gFN):
    """Record which microRNA seeds occur in the 3'UTR sequence of each gene.

    gName_micros -- mapping of gene name to an iterable of microRNA objects;
                    each object's comSeed and name attributes are used
    gFN          -- gene-set file for cgGenes3.createGeneSetEditing

    For every transcript of every listed gene that has a 3'UTR, the sequence
    between the two smallest UTR coordinates is fetched from 'hg19', printed,
    converted from T to U, and searched for each microRNA's comSeed. Each hit
    is written to the file 'utrSeeds' in the current directory as
    "<transcript id>\t<microRNA name>\t<transcript parent>".
    """
    gf = GenomeFetch.GenomeFetch('hg19')
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    print('....done loading')

    # 'with' makes sure the results are flushed and the handle is closed.
    with open('utrSeeds', 'w') as outF:
        for gName in gName_micros:
            micros = gName_micros[gName]
            for transcript in geneSet.set[gName].transcripts:
                # Covers both an empty UTR list and utr3 being None.
                if not transcript.utr3:
                    continue

                utrCoords = []
                for utrPair in transcript.utr3:
                    utrCoords.extend(utrPair)
                utrCoords.sort()
                # NOTE(review): only the two smallest coordinates are used,
                # so a multi-segment UTR is checked on its first segment
                # only -- confirm this is intended.
                start, end = utrCoords[0], utrCoords[1]

                checkSeq = gf.get_seq_from_to(transcript.chromosome, start,
                                              end, transcript.strand)
                print('')
                print(transcript.id)
                print(checkSeq)

                checkSeq = checkSeq.replace('T', 'U')
                for micro in micros:
                    if micro.comSeed in checkSeq:
                        outF.write('%s\t%s\t%s\n' % (
                            transcript.id, micro.name, transcript.parent))
def getCoords(gFN):
    """Print, per gene, the 3'UTR segments of the transcript with the longest 3'UTR.

    The gene set comes from cgGenes3.createGeneSetEditing(gFN). A
    transcript's 3'UTR length is the sum of (end - start + 1) over its utr3
    pairs; transcripts with utr3 set to None are skipped, and a gene with no
    transcript of positive UTR length produces no output.

    Output is one tab-separated line per gene: transcript id, parent,
    chromosome, strand, sorted starts (comma-joined), sorted ends
    (comma-joined).
    """
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    for gName in geneSet.set:
        gene = geneSet.set[gName]

        #find the longest utr...
        bestTranscript = None
        bestLength = 0
        for transcript in gene.transcripts:
            if transcript.utr3 is None:
                continue
            length = 0
            for utrPair in transcript.utr3:
                length += utrPair[1] - utrPair[0] + 1
            if length > bestLength:
                # Keep the running maximum up to date so a later, shorter UTR
                # cannot displace the current best.
                bestLength = length
                bestTranscript = transcript

        if bestTranscript is None:
            continue

        #Get the coordinate ends/etc
        starts = sorted(utrPair[0] for utrPair in bestTranscript.utr3)
        ends = sorted(utrPair[1] for utrPair in bestTranscript.utr3)
        startS = ','.join([str(x) for x in starts])
        endS = ','.join([str(x) for x in ends])

        # Describe the chosen transcript rather than the last one iterated.
        print('%s\t%s\t%s\t%s\t%s\t%s' % (
            bestTranscript.id, bestTranscript.parent,
            bestTranscript.chromosome, bestTranscript.strand, startS, endS))
def _log2Ratios(genes, gName_ratio):
    """Return log2(gName_ratio[gene.id]) for each gene that has a ratio."""
    return [math.log(gName_ratio[gene.id], 2)
            for gene in genes if gene.id in gName_ratio]


def newExpression(eFN, gFN, contextFN, targetFN, rpkmFN):
    """Plot expression-ratio eCDFs for genes that gain or lose 3'UTR target sites.

    eFN       -- editing-site file for cgEdit.loadEditingSites
    gFN       -- gene-set file for cgGenes3.createGeneSetEditing
    contextFN -- tab-separated; column 0 editing-site id (int), column 1
                 gene id, column 3 context type
    targetFN  -- tab-separated; column 0 editing-site id (int), columns 3
                 and 4 comma-separated microRNA lists before / after editing,
                 with the literal 'None' meaning an empty list
    rpkmFN    -- file handed to getERatioDict, which yields gene id -> ratio

    Genes with more target sites after editing than before are "created",
    those with fewer are "destroyed"; only sites whose context types include
    '3UTR' are counted. The before/after lists of both groups are printed,
    then a cumulative step histogram of log2(ratio) is shown for each group.
    Gene ids missing from the gene set or from the ratio dictionary are
    skipped.
    """
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    #joints
    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    gID_eIDs = {}
    eID_tTypes = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            gID = ls[1]
            eID = int(ls[0])
            tType = ls[3]
            eIDs = gID_eIDs.setdefault(gID, [])
            if eID not in eIDs:
                eIDs.append(eID)
            eID_tTypes.setdefault(eID, []).append(tType)

    #update targetting for eSites:
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            before = ls[3].split(',')
            if before == ['None']:
                before = []
            after = ls[4].split(',')
            if after == ['None']:
                after = []
            eID_eSite[eID].before = before
            eID_eSite[eID].after = after

    #scrolldict
    gene_eSites = {}
    for gID in gID_eIDs:
        try:
            gene = geneSet.set[gID]
        except KeyError:
            # Unknown gene id: skip it. Falling through would reuse the gene
            # from the previous iteration (or hit an unbound name).
            continue
        gene_eSites[gene] = [eID_eSite[eID] for eID in gID_eIDs[gID]]

    #update before/after editing target sites for GENES
    createdGenes = []
    destroyedGenes = []
    for gene in gene_eSites:
        gene.before = []
        gene.after = []
        for eSite in gene_eSites[gene]:
            if '3UTR' in eID_tTypes[eSite.ID]:
                for micro in eSite.before:
                    gene.before.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))
                for micro in eSite.after:
                    gene.after.append('%s %s:%s' % (micro, eSite.ID, eSite.coordinate))

        if len(gene.before) > len(gene.after):
            destroyedGenes.append(gene)
        if len(gene.after) > len(gene.before):
            createdGenes.append(gene)

    #print the created/destroyed sites
    for gene in createdGenes:
        print(gene.before)
        print(gene.after)
    for gene in destroyedGenes:
        print(gene.before)
        print(gene.after)

    # Genes absent from the ratio dictionary are left out; previously they
    # silently inherited the ratio of the preceding gene.
    gName_ratio = getERatioDict(rpkmFN)
    eChanges = _log2Ratios(createdGenes, gName_ratio)
    eChanges2 = _log2Ratios(destroyedGenes, gName_ratio)

    #Now plot the histogram
    plt.hist(eChanges, 40, cumulative = True, histtype = 'step', normed = True, label = 'Created')
    plt.hist(eChanges2, 40, cumulative = True, histtype = 'step', normed = True, label = 'Destroyed')
    plt.legend()
    plt.title('eCDF of Genes with Target Site Changes in 3UTRs')
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('Fraction of Genes')
    plt.show()
def updateContext(editFN, geneSetFN, outFN, refBase='A'):
    """Write the transcript context of every overlapping editing site to outFN.

    editFN    -- editing-site file for cgEdit.loadEditingSites
    geneSetFN -- gene-set file for cgGenes3.createGeneSetEditing
    outFN     -- output path (overwritten)
    refBase   -- forwarded to cgEdit.loadEditingSites

    Each loaded site is modified in place: its coordinate is decremented by
    one and its tcc rebuilt from the new coordinate. Sites are matched to
    transcripts through compareData.getIndividualOverlaps on their tccs.

    One tab-separated line is written per (site, transcript) pair:
    site ID, transcript parent, transcript id, context type, coding flag,
    transcript tType. '3UTR' beats '5UTR', which beats the first overlapping
    element type. The coding flag is 'C' only for an 'EXON' context on a
    transcript whose tType contains '_coding'. A site with an empty
    transcript list gets a single 'INTER' line with 'NONE' placeholders.
    """
    print(refBase)

    #Load Transcripts and Editing Sites
    print('loading editing sites')
    eSites = cgEdit.loadEditingSites(editFN, refBase)
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    #make the eSites 0 based
    for eSite in eSites:
        #redo coordinate and tcc
        eSite.coordinate = eSite.coordinate - 1
        eSite.tcc = bioLibCG.makeTcc(eSite.chromosome, eSite.strand,
                                     eSite.coordinate, eSite.coordinate)

    #Create Joint dictionaries
    print('creating joint dictionaries')
    eJoint = {}  # tcc : eSite
    for eSite in eSites:
        eJoint[eSite.tcc] = eSite

    tJoint = {}  # tcc : [transcript, ...]
    for transcript in geneSet.transcripts:
        tJoint.setdefault(transcript.tcc, []).append(transcript)

    #Overlap tccs
    print('overlapping joints')
    tccOverlaps = compareData.getIndividualOverlaps(eJoint.keys(), tJoint.keys(), 1)

    print('creating final dictionary')
    eSiteTranscripts = {}  # edit site : [transcript, ...]
    for eTcc in tccOverlaps:
        overlapping = []
        for tTcc in tccOverlaps[eTcc]:
            overlapping.extend(tJoint[tTcc])
        eSiteTranscripts[eJoint[eTcc]] = overlapping

    print('get context info')
    #Go through each site and find out what it overlaps, and if it is in a coding region...
    # 'with' closes the output even when a transcript lookup raises midway.
    with open(outFN, 'w') as fOut:
        for eSite in eSiteTranscripts:
            if len(eSiteTranscripts[eSite]) == 0:
                #label intergenic
                fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    eSite.ID, 'NONE', 'NONE', 'INTER', 'NC', 'NONE'))
                continue

            for transcript in eSiteTranscripts[eSite]:
                codingTranscript = '_coding' in transcript.tType
                tTypes = [x[1] for x in transcript.getOverlappingElements(eSite.tcc)]

                if '3UTR' in tTypes:
                    tType = '3UTR'
                elif '5UTR' in tTypes:
                    tType = '5UTR'
                else:
                    tType = tTypes[0]  # has to be one thing...exon or intron

                #This only works because UTR takes precedence over EXON in TYPE.
                if tType == 'EXON' and codingTranscript:
                    codingFlag = 'C'
                else:
                    codingFlag = 'NC'

                fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    eSite.ID, transcript.parent, transcript.id, tType,
                    codingFlag, transcript.tType))
def doit(fN):
    """Dump every transcript with id 'NM_031422' from the gene set built from fN.

    The gene set is created with cgGenes3.createGeneSetEditing and matching
    transcripts are passed to dumpObj.dumpObj one at a time.
    """
    wantedID = 'NM_031422'
    loadedSet = cgGenes3.createGeneSetEditing(fN)
    for candidate in loadedSet.transcripts:
        if candidate.id != wantedID:
            continue
        dumpObj.dumpObj(candidate)
def _highestContextType(tInfos):
    """Reduce one site's [tType, transcript] pairs to a single context label."""
    contexts = ('EXON', 'INTRON', '3UTR', '5UTR')
    highestType = None
    for tType, transcript in tInfos:
        tCoding = '_coding' in transcript.tType
        gCoding = '_coding' in transcript.gType
        if tType not in contexts:
            continue

        if not tCoding:
            # Non-coding transcript: it can only ever label the site NONT
            # (coding gene) or NONG (non-coding gene), and never overrides a
            # context already taken from a coding transcript.
            if gCoding:
                if highestType not in contexts:
                    highestType = 'NONT'
            elif highestType not in contexts + ('NONT',):
                highestType = 'NONG'
        elif tType == 'EXON':
            # A coding exon wins outright; nothing later can change it.
            return 'EXON'
        elif tType == '3UTR':
            highestType = '3UTR'
        elif tType == '5UTR':
            if highestType != '3UTR':
                highestType = '5UTR'
        else:  # INTRON
            if highestType not in ('3UTR', '5UTR'):
                highestType = 'INTRON'
    return highestType


def makeContextPieBetter(contextFN, gFN, eFN, passedInfo, outFN):
    """Write one line per editing site with its single highest-priority context.

    contextFN  -- tab-separated; column 0 site id (int), column 1 gene name,
                  column 2 transcript id, column 3 context type; rows with
                  context 'INTER' contribute no transcript
    gFN        -- gene-set file for cgGenes3.createGeneSetEditing
    eFN        -- tab-separated; column 0 chromosome, column 1 coordinate,
                  columns 4 and 5 edited / total counts, column 13 site
                  id (int)
    passedInfo -- indexable pair; if its first two items are equal nothing
                  is done. It is also written verbatim to every output line.
    outFN      -- output path (overwritten)

    Each output line is: chromosome, coordinate, passedInfo, gene name,
    final type, edited count, total count. The final type is one of EXON,
    3UTR, 5UTR, INTRON, NONT (non-coding transcript of a coding gene) or
    NONG (non-coding gene). Always returns 0.

    The pie-chart code that used to follow the first return was unreachable
    and has been removed.
    """
    if passedInfo[0] == passedInfo[1]:
        print('%s no such thing' % (passedInfo,))
        return 0

    print('loading geneSet')
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    typeCount = {'EXON': 0, '3UTR': 0, '5UTR': 0, 'INTRON': 0, 'NONG': 0, 'NONT': 0}

    #joint
    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID_transcript[transcript.id] = transcript

    eID_Info = {}
    with open(eFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[13])
            newCoord = '%s:%s' % (ls[0], ls[1])
            eID_Info[eID] = [newCoord, ls[4], ls[5]]

    eID_gName = {}
    eID_tTypes = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            eID_gName[eID] = ls[1]
            tType = ls[3]
            tName = ls[2]
            if tType == 'INTER':
                continue
            eID_tTypes.setdefault(eID, []).append([tType, tID_transcript[tName]])

    eID_finalType = {}
    for eID in eID_tTypes:
        highestType = _highestContextType(eID_tTypes[eID])
        eID_finalType[eID] = highestType
        # The tally also acts as a sanity check: a site that got no label
        # (only unrecognised context types) raises KeyError here, as before.
        typeCount[highestType] += 1

    with open(outFN, 'w') as outF:
        for eID in eID_finalType:
            coordParts = eID_Info[eID][0].split(':')
            outF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                coordParts[0], coordParts[1], passedInfo, eID_gName[eID],
                eID_finalType[eID], eID_Info[eID][1], eID_Info[eID][2]))
    return 0
def makeTargetExpressionHistogram(eFN, targetFN, contextFN, geneFN, eChangeFN):
    """Plot a histogram of log2 expression ratios for genes with altered target sites.

    eFN       -- editing-site file for cgEdit.loadEditingSites
    targetFN  -- tab-separated; column 0 site id (int); a site is selected
                 when column 3 is not 'None' and column 4 is 'None'
    contextFN -- tab-separated; column 0 site id (int), column 1 is read as
                 the transcript id and column 2 as its context type
    geneFN    -- gene-set file for cgGenes3.createGeneSetEditing
    eChangeFN -- file handed to getERatioDict (gene name -> ratio)

    For each selected site the genes of its '3UTR' transcripts are collected.
    Sites with no such gene, or with more than one, are skipped. Each gene
    contributes at most once, and only if it has an expression ratio.
    """
    print('loading expression ratios')
    gName_eChange = getERatioDict(eChangeFN)

    print('loading eSites and Transcripts')
    eSites = cgEdit.loadEditingSites(eFN)
    geneSet = cgGenes3.createGeneSetEditing(geneFN)

    print('making joint dicts and loading extra data')
    #joint
    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    #joint
    tID_gName = {}
    for transcript in geneSet.transcripts:
        tID_gName[transcript.id] = transcript.parent

    #load context data
    # NOTE(review): columns 1 and 2 are used as transcript id / context type
    # here -- confirm this matches the context file actually supplied.
    eID_tID = {}  # eID: [tID, ...]
    tID_tType = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            tID = ls[1]
            tID_tType[tID] = ls[2]
            eID_tID.setdefault(eID, []).append(tID)

    print('analyzing')
    #Get created or destroyed
    altered = []
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if (ls[3] != 'None') and (ls[4] == 'None'):
                altered.append(int(ls[0]))

    print('number of created/destroyed sites: %s' % len(altered))

    alteredSites = [eID_eSite[eID] for eID in altered]

    eChanges = []
    gDone = set()
    #Get gene names for each eSite
    for eSite in alteredSites:
        genes = []
        for tID in eID_tID[eSite.ID]:
            if tID == 'NONE':
                continue
            gName = tID_gName[tID]
            if tID_tType[tID] != '3UTR':
                print('Not 3UTR %s' % tID)
                continue
            if gName not in genes:
                genes.append(gName)

        if len(genes) > 1:
            print('more than one gene for eSite... %s' % (genes,))
            continue
        if not genes:
            # No 3'UTR gene for this site. Without this guard the name left
            # over from an earlier transcript or site would be used.
            continue
        gName = genes[0]

        if gName in gDone:
            continue
        gDone.add(gName)

        #Now add expression to HistoGram List...
        if gName not in gName_eChange:
            print('gene not in expression list %s' % gName)
            continue
        eChanges.append(math.log(gName_eChange[gName], 2))

    #Now plot the histogram
    plt.hist(eChanges, 40)
    plt.xlabel('log2(RPKM KD/ RPKM CONTROL)')
    plt.ylabel('# Genes')
    plt.show()
def updateContext(editFN, geneSetFN, outFN, refBase = 'A'):
    """Annotate editing sites with the transcripts they overlap and write a table.

    editFN    -- editing-site file, loaded with cgEdit.loadEditingSites
    geneSetFN -- gene-set file, loaded with cgGenes3.createGeneSetEditing
    outFN     -- path of the tab-separated table to write (overwritten)
    refBase   -- passed through to cgEdit.loadEditingSites

    Side effect: every loaded site has its coordinate reduced by one and its
    tcc recomputed before the overlap is taken.

    Rows are "<site ID>\t<parent>\t<transcript id>\t<context>\t<C|NC>\t
    <transcript tType>", one per overlapping transcript. The context is
    '3UTR' if present among the overlapping element types, else '5UTR' if
    present, else the first element type. 'C' is assigned only when the
    context is 'EXON' and the transcript tType contains '_coding'. Sites
    whose transcript list is empty get one row labelled 'INTER'.
    """
    print(refBase)

    #Load Transcripts and Editing Sites
    print('loading editing sites')
    eSites = cgEdit.loadEditingSites(editFN, refBase)
    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    #make the eSites 0 based
    for eSite in eSites:
        #redo coordinate and tcc
        eSite.coordinate = eSite.coordinate - 1
        eSite.tcc = bioLibCG.makeTcc(eSite.chromosome, eSite.strand,
                                     eSite.coordinate, eSite.coordinate)

    #Create Joint dictionaries
    print('creating joint dictionaries')
    eJoint = dict((eSite.tcc, eSite) for eSite in eSites)  # tcc : eSite

    tJoint = {}  # tcc : [transcript, ...]
    for transcript in geneSet.transcripts:
        tJoint.setdefault(transcript.tcc, []).append(transcript)

    #Overlap tccs
    print('overlapping joints')
    tccOverlaps = compareData.getIndividualOverlaps(eJoint.keys(), tJoint.keys(), 1)

    print('creating final dictionary')
    #create final dictionary containing {edit sites : [transcript, ..]}
    eSiteTranscripts = {}
    for eTcc in tccOverlaps:
        hits = []
        for tTcc in tccOverlaps[eTcc]:
            hits.extend(tJoint[tTcc])
        eSiteTranscripts[eJoint[eTcc]] = hits

    print('get context info')
    #Go through each site and find out what it overlaps, and if it is in a coding region...
    # The context manager closes the file even if a lookup fails part-way.
    with open(outFN, 'w') as fOut:
        for eSite in eSiteTranscripts:
            transcripts = eSiteTranscripts[eSite]
            if len(transcripts) == 0:
                #label intergenic
                fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    eSite.ID, 'NONE', 'NONE', 'INTER', 'NC', 'NONE'))
                continue

            for transcript in transcripts:
                codingTranscript = '_coding' in transcript.tType
                tTypes = [x[1] for x in transcript.getOverlappingElements(eSite.tcc)]

                if '3UTR' in tTypes:
                    tType = '3UTR'
                elif '5UTR' in tTypes:
                    tType = '5UTR'
                else:
                    tType = tTypes[0]  # has to be one thing...exon or intron

                #This only works because UTR takes precedence over EXON in TYPE.
                codingFlag = 'NC'
                if tType == 'EXON' and codingTranscript:
                    codingFlag = 'C'

                fOut.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    eSite.ID, transcript.parent, transcript.id, tType,
                    codingFlag, transcript.tType))
def updateContext(oDir, geneSetFN):
    """Attach overlapping-transcript context to every origin RNA and commit it.

    oDir      -- directory handed to cgDB.dataController together with
                 cgOriginRNA.OriginRNA
    geneSetFN -- gene-set file for cgGenes3.createGeneSetEditing

    Origin RNAs are matched to transcripts through
    compareData.getIndividualOverlaps on their tccs. For each matched RNA
    four parallel lists are (re)set and filled, one entry per overlapping
    transcript: transcriptIDs, transcriptContexts (the dominant overlapping
    element type chosen by bioLibCG.dominantSpotter), transcriptTypes and
    transcriptCodingTypes ('C' when the context is 'EXON' or 'EXON_INTRON'
    on a transcript whose tType contains '_coding', otherwise 'NC').
    The loaded id -> oRNA mapping is committed at the end.
    """
    print('loading oRNA')
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    print('loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(geneSetFN)

    #Get in terms of tccs
    print('Joining')
    oTcc_oRNA = oneToOne(id_oRNA.values(), 'tcc')
    tTcc_transcripts = oneToMany(geneSet.transcripts, 'tcc')

    #Overlap tccs
    print('overlapping')
    oTcc_tTccs = compareData.getIndividualOverlaps(
        oTcc_oRNA.keys(), tTcc_transcripts.keys(), 1)

    #create final dictionary containing {oRNA : [transcript, ..]}
    oRNA_transcripts = {}
    for oTcc in oTcc_tTccs:
        overlapping = []
        for tTcc in oTcc_tTccs[oTcc]:
            overlapping.extend(tTcc_transcripts[tTcc])
        oRNA_transcripts[oTcc_oRNA[oTcc]] = overlapping

    print('get context info')
    #Go through each site and find out what it overlaps, and if it is in a coding region...
    ds = bioLibCG.dominantSpotter(['EXON_INTRON', '3UTR', '5UTR', 'EXON', 'INTRON'])

    for oRNA in oRNA_transcripts:
        oRNA.transcriptIDs = []
        oRNA.transcriptContexts = []
        oRNA.transcriptTypes = []
        oRNA.transcriptCodingTypes = []

        for transcript in oRNA_transcripts[oRNA]:
            codingTranscript = '_coding' in transcript.tType
            tTypes = [x[1] for x in transcript.getOverlappingElements(oRNA.tcc)]

            #categorize border types
            tType = ds.spotItem(tTypes)

            # Membership test: "tType == 'EXON' or 'EXON_INTRON'" was always
            # true, which flagged UTR and intron hits on coding transcripts
            # as 'C'.
            if tType in ('EXON', 'EXON_INTRON') and codingTranscript:
                codingFlag = 'C'
            else:
                codingFlag = 'NC'

            oRNA.transcriptIDs.append(transcript.id)
            oRNA.transcriptContexts.append(tType)
            oRNA.transcriptTypes.append(transcript.tType)
            oRNA.transcriptCodingTypes.append(codingFlag)

    oDC.commit(id_oRNA)
def checkSeeds(editFN, contextFN, miLocationFN, miSequenceFN, gFN):
    """Check whether each listed microRNA seed really occurs near its reported location.

    editFN, contextFN -- passed to cgEdit.loadEditingSites and
                         cgEdit.updateContextEditingSites
    miLocationFN      -- tab-separated: transcript name, microRNA name,
                         integer location
    miSequenceFN      -- tab-separated: microRNA name (stored with a 'hsa-'
                         prefix) and sequence
    gFN               -- gene-set file for cgGenes3.createGeneSetEditing

    For each (transcript, microRNA, location) the reverse complement of
    sequence[1:8] is searched in the transcript's 3'UTR sequence
    (get3UTRSeq), starting 25 positions before the location shifted by
    len(mRNA) - len(3'UTR). Matches ending up 1..29 positions before the
    shifted location are collected as found; searches with no match at all
    are collected as not found. Counts and both lists are printed.
    Unknown transcripts, transcripts whose mRNA cannot be built, and unknown
    microRNAs are skipped.
    """
    eSites = cgEdit.loadEditingSites(editFN)
    cgEdit.updateContextEditingSites(eSites, contextFN)  # puts the UTR, EXON in eSite.context
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    tName_t = {}
    for t in geneSet.transcripts:
        tName_t[t.id] = t

    miName_miSequence = {}
    with open(miSequenceFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            miName_miSequence['hsa-' + ls[0]] = ls[1]

    tName_miInfo = {}
    with open(miLocationFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tName_miInfo.setdefault(ls[0], []).append([ls[1], int(ls[2])])

    foundIt = []
    notFoundIt = []
    for tName in tName_miInfo:
        try:
            t = tName_t[tName]
        except KeyError:
            continue

        checkSeq = get3UTRSeq(t)
        try:
            mRNA = t.getMRNA()
        except Exception:
            # Transcripts whose mRNA cannot be assembled are skipped.
            continue

        for miName, loc in tName_miInfo[tName]:
            try:
                miSequence = miName_miSequence[miName]
            except KeyError:
                continue
            miSeed = miSequence[1:8]
            rcMiSeed = cgSeqMod.reverseComplementSequence(miSeed, True)

            newLoc = loc - (len(mRNA) - len(checkSeq))
            # Clamp at 0: str.find treats a negative start like a slice
            # index, i.e. relative to the END of the string.
            finding = checkSeq.find(rcMiSeed, max(0, newLoc - 25))

            # NOTE(review): the source lost its indentation; the "not found"
            # branch is taken to pair with "finding != -1" -- confirm.
            if finding != -1:
                if 0 < newLoc - finding < 30:
                    newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding, newLoc, loc)
                    if newResult not in foundIt:
                        foundIt.append(newResult)
            else:
                if miName == 'hsa-miR-21':
                    print('%s %s %s' % (loc, len(checkSeq), len(mRNA)))
                    print(mRNA)
                    print(checkSeq)
                newResult = '%s\t%s\t%s\t%s\t%s' % (miName, tName, finding, newLoc, loc)
                if newResult not in notFoundIt:
                    notFoundIt.append(newResult)

    print(len(foundIt))
    print(len(notFoundIt))
    print('')
    for i in foundIt:
        print(i)
    print('')
    for i in notFoundIt:
        print(i)
def _resolveSiteContext(tInfos):
    """Pick the winning context label from one site's [tType, transcript] pairs."""
    genic = ('EXON', 'INTRON', '3UTR', '5UTR')
    winner = None
    for tType, transcript in tInfos:
        tCoding = '_coding' in transcript.tType
        gCoding = '_coding' in transcript.gType
        if tType not in genic:
            continue

        if tCoding:
            if tType == 'EXON':
                # Coding exon has top priority and ends the search.
                return 'EXON'
            if tType == '3UTR':
                winner = '3UTR'
            elif tType == '5UTR':
                if winner != '3UTR':
                    winner = '5UTR'
            elif winner not in ('3UTR', '5UTR'):  # INTRON
                winner = 'INTRON'
        elif gCoding:
            # Non-coding transcript of a coding gene.
            if winner not in genic:
                winner = 'NONT'
        elif winner not in genic + ('NONT',):
            # Non-coding transcript of a non-coding gene: lowest priority.
            winner = 'NONG'
    return winner


def makeContextPieBetter(contextFN, gFN, eFN, passedInfo, outFN):
    """Classify each editing site by genomic context and write the result table.

    contextFN  -- tab-separated context file: site id (int), gene name,
                  transcript id, context type in columns 0-3. 'INTER' rows
                  record the gene name but add no transcript.
    gFN        -- gene-set file for cgGenes3.createGeneSetEditing
    eFN        -- tab-separated editing file: chromosome and coordinate in
                  columns 0-1, edited / total counts in columns 4-5, site
                  id (int) in column 13
    passedInfo -- indexable pair; when its first two items are equal the
                  function prints a notice and returns immediately. It is
                  echoed into every output row.
    outFN      -- output path (overwritten)

    Output rows: chromosome, coordinate, passedInfo, gene name, final type,
    edited count, total count. Final types are EXON, 3UTR, 5UTR, INTRON,
    NONT or NONG. Returns 0 in every case.

    The chart-drawing statements after the first 'return 0' could never run
    and were dropped.
    """
    if passedInfo[0] == passedInfo[1]:
        print('%s no such thing' % (passedInfo,))
        return 0

    print('loading geneSet')
    geneSet = cgGenes3.createGeneSetEditing(gFN)
    typeCount = {
        'EXON': 0,
        '3UTR': 0,
        '5UTR': 0,
        'INTRON': 0,
        'NONG': 0,
        'NONT': 0
    }

    #joint
    tID_transcript = dict((t.id, t) for t in geneSet.transcripts)

    eID_Info = {}
    with open(eFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[13])
            eID_Info[eID] = ['%s:%s' % (ls[0], ls[1]), ls[4], ls[5]]

    eID_gName = {}
    eID_tTypes = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            eID_gName[eID] = ls[1]
            tType = ls[3]
            tName = ls[2]
            if tType == 'INTER':
                continue
            tInfo = [tType, tID_transcript[tName]]
            eID_tTypes.setdefault(eID, []).append(tInfo)

    eID_finalType = {}
    for eID in eID_tTypes:
        finalType = _resolveSiteContext(eID_tTypes[eID])
        eID_finalType[eID] = finalType
        # Counting doubles as validation: an unlabelled site (no recognised
        # context type) raises KeyError here, exactly as it did before.
        typeCount[finalType] += 1

    with open(outFN, 'w') as outF:
        for eID in eID_finalType:
            info = eID_Info[eID]
            coordParts = info[0].split(':')
            outF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                coordParts[0], coordParts[1], passedInfo, eID_gName[eID],
                eID_finalType[eID], info[1], info[2]))
    return 0
def updateSynonomous(eFN, gFN, resultsFN, outFN):
    """Append synonymous / non-synonymous annotations to a context results file.

    eFN       -- editing-site file for cgEdit.loadEditingSites
    gFN       -- gene-set file for cgGenes3.createGeneSetEditing
    resultsFN -- tab-separated; column 0 site id (int), column 2 transcript
                 id, column 4 coding flag ('C' marks a coding hit)
    outFN     -- output path; may be the same file as resultsFN because the
                 input is read completely before the output is opened

    For every transcript flagged 'C', the base at the editing site in the
    coding mRNA is replaced by 'G' and the codon before and after is
    translated. Each input line is written out with five extra columns:
    SYN/NON, codon before, codon after, amino acid before, amino acid after.
    Lines whose transcript was not analysed get NA in all five columns.

    The previous version referenced lookup tables that were never defined
    (tID, eJoint, tJoint) and could not run; it now uses the id dictionaries
    built in this function.
    """
    #Load Transcripts and Editing Sites
    print('Loading editing sites')
    eSites = cgEdit.loadEditingSites(eFN)
    print('Loading gene set')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    #Get coding Transcripts
    codingTranscripts = {}  # tID : eID ! many:one always!
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[4] == 'C':
                codingTranscripts[ls[2]] = int(ls[0])

    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID_transcript[transcript.id] = transcript

    print('Creating scroll dict')
    scrollDict = {}  # transcript: eSite
    for tID in codingTranscripts:
        e = eID_eSite[codingTranscripts[tID]]
        try:
            scrollDict[tID_transcript[tID]] = e
        except KeyError:
            # Transcript named in the results file but absent from the
            # gene set: leave it un-annotated.
            pass

    print('Deducing synonomous')
    codonMap = cgSeqMod.loadCodonMap('hg19')
    finalDict = {}  # tID: [SYN, AAA, AAB, G, A]

    #Figure out if they are synonomous
    for t in scrollDict:
        eSite = scrollDict[t]

        ePositionInMRNA = t.getRelativePositionMRNA(eSite.coordinate - 1)
        if ePositionInMRNA == -1:
            print('%s should not be designated coding...' % t.id)
            continue

        #grab mRNA and emRNA
        mRNA = t.getMRNA(coding=True)
        if mRNA[ePositionInMRNA] != 'A':
            print('wrong position %s %s:%s %s %s %s %s' % (
                t.id, eSite.chromosome, eSite.coordinate, eSite.strand,
                mRNA[ePositionInMRNA - 5:ePositionInMRNA - 1],
                mRNA[ePositionInMRNA],
                mRNA[ePositionInMRNA + 1:ePositionInMRNA + 5]))

        #edit the site
        editedBases = list(mRNA)
        editedBases[ePositionInMRNA] = 'G'
        emRNA = ''.join(editedBases)

        #Test the protein sequences
        pRNA = cgSeqMod.translateRNA(mRNA, codonMap)
        if pRNA[0] != 'M':
            print('Non-canonical Start AA: %s %s' % (pRNA[0:5], mRNA[:10]))
        if pRNA[-1] != '*':
            print('Non-canonical End AA: %s %s' % (pRNA[-5:], mRNA[-10:]))

        #compare the codons.
        mCodonList = cgSeqMod.getCodonListFromRNA(mRNA)
        emCodonList = cgSeqMod.getCodonListFromRNA(emRNA)
        codonNumber = ePositionInMRNA // 3
        bCodon = mCodonList[codonNumber]
        aCodon = emCodonList[codonNumber]

        print(t.id)
        print(eSite.ID)
        print(mCodonList[:codonNumber])
        print(mRNA[:ePositionInMRNA])

        baa = cgSeqMod.translateRNA(bCodon, codonMap)
        aaa = cgSeqMod.translateRNA(aCodon, codonMap)

        if baa != aaa:
            synFlag = 'NON'
            # Sanity check: only an A may have turned into a G.
            # NOTE(review): nesting reconstructed from unindented source.
            for before, after in zip(bCodon, aCodon):
                if before != 'A' and after == 'G' and before != 'G':
                    print('messed up codon switch %s %s' % (
                        list(bCodon), list(aCodon)))
                    print('%s %s:%s %s %s %s %s %s' % (
                        t.parent, eSite.chromosome, eSite.coordinate,
                        eSite.strand, bCodon, aCodon, baa, aaa))
        else:
            synFlag = 'SYN'

        finalDict[t.id] = [synFlag, bCodon, aCodon, baa, aaa]

    print('writing to file')
    #update line by line
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            newLine = line.strip()
            tID = newLine.split('\t')[2]
            if tID in finalDict:
                newLine += '\t%s\t%s\t%s\t%s\t%s\n' % tuple(finalDict[tID])
            else:
                newLine += '\tNA\tNA\tNA\tNA\tNA\n'
            newLines.append(newLine)

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
def betterSynonymous(eFN, gFN, contextFN, outFN, refBase='A', eBase='G'):
    """Write the codon and amino-acid change caused by each coding editing site.

    eFN       -- editing-site file for cgEdit.loadEditingSites
    gFN       -- gene-set file for cgGenes3.createGeneSetEditing
    contextFN -- tab-separated; column 0 site id (int), column 2 transcript
                 id, column 3 site type, column 4 coding type
    outFN     -- output path (overwritten)
    refBase   -- base expected at the site before editing
    eBase     -- base the site is changed to

    Only (site, transcript) pairs with coding type 'C' on a transcript whose
    tType does not contain '_noncoding' are analysed. Output columns, tab
    separated: site ID, transcript parent, transcript id, SYN/NON, codon
    before, codon after, amino acid before, amino acid after.
    """
    print('loading e sites')
    eSites = cgEdit.loadEditingSites(eFN)
    print('loading geneSet')
    geneSet = cgGenes3.createGeneSetEditing(gFN)

    # Both lookups are filled in a single pass over the context file.
    contextInfo = {}  # eID: tID : [UTR, C]
    eID_tIDs = {}
    with open(contextFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            eID = int(ls[0])
            tID = ls[2]
            contextInfo.setdefault(eID, {})[tID] = [ls[3], ls[4]]
            tIDs = eID_tIDs.setdefault(eID, [])
            if tID not in tIDs:
                tIDs.append(tID)

    eID_eSite = {}
    for eSite in eSites:
        eID_eSite[eSite.ID] = eSite

    tID_transcript = {}
    for transcript in geneSet.transcripts:
        tID_transcript[transcript.id] = transcript

    eSite_transcripts = {}
    for eID in eID_tIDs:
        eSite = eID_eSite[eID]
        tList = []
        for tID in eID_tIDs[eID]:
            if tID == 'NONE':
                continue
            transcript = tID_transcript.get(tID)
            if transcript is None:
                continue
            tList.append(transcript)
        eSite_transcripts[eSite] = tList

    codonMap = cgSeqMod.loadCodonMap('hg19')
    with open(outFN, 'w') as outF:
        for eSite in eSite_transcripts:
            for transcript in eSite_transcripts[eSite]:
                siteType, codingType = contextInfo[eSite.ID][transcript.id]
                if '_noncoding' in transcript.tType:
                    continue
                if codingType != 'C':
                    continue

                ePositionInMRNA = transcript.getRelativePositionMRNA(
                    eSite.coordinate - 1)
                if ePositionInMRNA == -1:
                    # -1 would silently index the LAST base of the mRNA and
                    # produce a bogus codon comparison.
                    print('%s should not be designated coding...' % transcript.id)
                    continue

                mRNA = transcript.getMRNA(coding=True)
                if mRNA[ePositionInMRNA] != refBase:
                    print('Editing site was not an %s...' % refBase)

                #edit the site
                editedBases = list(mRNA)
                editedBases[ePositionInMRNA] = eBase
                emRNA = ''.join(editedBases)

                #Test the protein sequences
                pRNA = cgSeqMod.translateRNA(mRNA, codonMap)
                if pRNA[0] != 'M':
                    print('Non-canonical Start AA: %s %s' % (pRNA[0:5], mRNA[:10]))
                if pRNA[-1] != '*':
                    print('Non-canonical End AA: %s %s' % (pRNA[-5:], mRNA[-10:]))

                #compare the codons.
                codonNumber = ePositionInMRNA // 3
                bCodon = cgSeqMod.getCodonListFromRNA(mRNA)[codonNumber]
                aCodon = cgSeqMod.getCodonListFromRNA(emRNA)[codonNumber]
                baa = cgSeqMod.translateRNA(bCodon, codonMap)
                aaa = cgSeqMod.translateRNA(aCodon, codonMap)

                synFlag = 'SYN'
                if baa != aaa:
                    synFlag = 'NON'

                # Sanity check: only a refBase may have turned into an eBase.
                for before, after in zip(bCodon, aCodon):
                    if before != refBase and after == eBase and before != eBase:
                        print('messed up codon switch %s %s' % (
                            list(bCodon), list(aCodon)))
                        # This line used the undefined name 't' and raised
                        # NameError whenever it was reached.
                        print('%s %s:%s %s %s %s %s %s' % (
                            transcript.parent, eSite.chromosome,
                            eSite.coordinate, eSite.strand, bCodon, aCodon,
                            baa, aaa))

                outF.write('\t'.join([
                    str(eSite.ID), transcript.parent, transcript.id, synFlag,
                    bCodon, aCodon, baa, aaa
                ]) + '\n')