def transcriptSetOverlap(aDir, AS):
    """Flag every OriginRNA record under aDir with whether it overlaps a transcript.

    aDir -- directory handed to cgNexusFlat.dataController
    AS   -- truthy to test cg.convertToAS(record.tcc) instead of record.tcc

    Each loaded record gets a boolean ``transcriptOverlap`` attribute and the
    whole collection is committed back through the data controller.
    """
    AS = bool(AS)

    def degTccOf(record):
        # In AS mode the peak is tested at its converted coordinate, which is
        # the location of the targeted transcript.
        if AS:
            return cg.convertToAS(record.tcc)
        return record.tcc

    transcriptFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(transcriptFN)

    # Load the degradome records whose coordinates are going to be tested.
    oRNA_DC = cgNexusFlat.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()
    degTccs = [degTccOf(record) for record in id_oRNA.values()]

    # Transcripts hit by any coordinate, then the coordinates that hit them.
    hitTranscripts = allExons.transcriptOverlaps(degTccs)
    hitTranscriptTccs = [transcript.tcc for transcript in hitTranscripts]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, hitTranscriptTccs, 1)

    # Record the verdict on every object and persist.
    for record in id_oRNA.values():
        record.transcriptOverlap = degTccOf(record) in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
def transcriptSetOverlap(aDir, AS):
    """Set ``transcriptOverlap`` on each OriginRNA record loaded from aDir.

    aDir -- directory handed to cgDB.dataController
    AS   -- truthy to compare the cg.convertToAS() form of each record's tcc
            rather than the tcc as stored

    The records are loaded, flagged True/False, and committed back through
    the same data controller.
    """
    AS = bool(AS)
    geneSetPath = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetPath)

    # In AS mode the converted coordinate is tested: that is the location of
    # the targeted transcript.
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    def testedTcc(oRNA):
        return cg.convertToAS(oRNA.tcc) if AS else oRNA.tcc

    degTccs = []
    for oRNA in id_oRNA.values():
        degTccs.append(testedTcc(oRNA))

    # Find the overlapping transcripts, then the tested coordinates that
    # overlap those transcripts.
    overlappingExonTccs = [hit.tcc for hit in allExons.transcriptOverlaps(degTccs)]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    for oRNA in id_oRNA.values():
        oRNA.transcriptOverlap = testedTcc(oRNA) in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
def _alignmentTargetTcc(alignment):
    """Return the tcc computed from an alignment's tTcc, tStart and sLength."""
    chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
    offset = alignment.tStart
    sLen = alignment.sLength

    # NOTE(review): 19 is carried over unchanged from the original code;
    # presumably it is the flank that was added when tTcc was built -- confirm.
    if strand == '1':
        start = start - 19 + offset
        end = start + sLen
    else:
        end = end + 19 - offset
        start = end - sLen

    return cg.makeTcc(chrom, strand, start, end)


def transcriptSetOverlapTargets(aDir):
    """Flag every alignment under aDir with whether its target overlaps a transcript.

    aDir -- directory handed to cgDB.dataController

    For each cgAlignment record the target coordinate is derived from
    tTcc/tStart/sLength, converted with cg.convertToAS, and compared against
    the transcripts in the Ensembl gene set.  Each record gets a boolean
    ``transcriptOverlap`` attribute and the collection is committed back.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # Create the list of unique target tccs (first-seen order preserved).
    uniqTccs = []
    for alignment in id_alignment.values():
        tcc = _alignmentTargetTcc(alignment)
        if tcc not in uniqTccs:
            uniqTccs.append(tcc)

    # The AS coordinate is what gets tested: it is the location of the
    # targeted transcript.
    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    # Find all overlapping transcripts, then the coordinates overlapping them.
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    # Update every record from ITS OWN coordinates.  The original loop
    # iterated over `obj` but read the leftover `alignment` variable, so all
    # records received the verdict of the last alignment from the first loop.
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(_alignmentTargetTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand): geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv' allExons = cgGenes.createGeneSetFromFile(geneSetFN) transcriptTccs = [] for gene in allExons.set.values(): for transcript in gene.transcripts: transcriptTccs.append(transcript.tcc) #create hitmap coordSet = set() for tcc in transcriptTccs: chrom, strand, start, end = cg.tccSplit(tcc) if chrom != runningChrom: continue if strand != runningStrand: continue for i in range(start, end + 1): coordSet.add(i) #find overlapping degTccs print 'done creating hitmap' f = open(degFile, 'r') newLines = [] for line in f: ls = line.strip().split('\t') degTcc = cg.convertToAS(ls[1]) chrom, strand, start, end = cg.tccSplit(degTcc) if chrom != runningChrom: continue if strand != runningStrand: continue inTran = '0' for i in xrange(start, end + 1): if i in coordSet: inTran = '1' break #update newLines newLine = cg.appendToLine(line, inTran, 3) newLines.append(newLine) f.close() f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w') f.writelines(newLines) f.close()
def transcriptSetOverlapDegFile(degFile):
    """Build degFile lines annotated with a transcript-overlap flag.

    degFile -- tab-separated file whose second column (index 1) is a tcc

    Each line's tcc is converted with cg.convertToAS (the AS coordinate is
    the location of the targeted transcript) and tested against the
    transcripts of the Ensembl gene set.  Every line is then passed to
    cg.appendToLine(line, flag, 3) with flag '1' for an overlap and '0'
    otherwise.

    Returns the list of updated lines, in file order.

    NOTE(review): the original computed each updated line and then dropped
    it, and never wrote anything back.  The lines are now collected and
    returned; nothing is written to disk here because the intended
    destination is not visible in this code.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    # Read the file once, keeping each raw line next to its own AS tcc.
    entries = []
    with open(degFile, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            entries.append((line, cg.convertToAS(ls[1])))

    degTccs = [degTcc for _line, degTcc in entries]

    # Find all overlapping transcripts, then the coordinates overlapping them.
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    # Flag each line using its OWN tcc.  The original reused the `ls` left
    # over from the first loop, so every line was tested against the last
    # line's coordinate.
    newLines = []
    for line, degTcc in entries:
        inTran = '0'
        if degTcc in overlappingDegTccs:
            inTran = '1'
        newLines.append(cg.appendToLine(line, inTran, 3))

    return newLines
import getHairpins import cgGenes import cgConfig as c import bioLibCG as cg mConf = c.getConfig('Main.conf') geneSetFolder = mConf.conf['geneSetsHuman'] fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted' cHairs = getHairpins.getHairpins(fN) ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv') cDesc = {} #CID:gDesc for CID in cHairs: tcc = cHairs[CID] cDesc[CID] = "NONE" overlappingGenes = ensGenes.geneOverlaps([tcc]) if len(overlappingGenes) > 0: print overlappingGenes[0].type cDesc[CID] = overlappingGenes[0].type f = open(fN, 'r') newLines = [] for line in f: CID = line.strip().split('\t')[7] newLines.append(cg.appendToLine(line, cDesc[CID], 16)) f.close()
import cgGenes import compareData as compare import cgConfig as c cName = 'mm9.conf' mConf = c.getConfig('Main.conf') conf = c.getConfig(cName) organism = conf.conf['organism'] geneSetFolder = mConf.conf['geneSets%s' % organism] genes = cgGenes.createGeneSetFromFile(geneSetFolder + '/allTransciptsType.tsv') peakTccs = compare.tccFileToList('peakData.500.mm9', 0) tOverlaps = genes.transcriptOverlaps(peakTccs) typeDict = {} for transcript in tOverlaps: if transcript.type not in typeDict: typeDict[transcript.type] = 1 else: typeDict[transcript.type] += 1 #count the amounts of each type for each transcript amount = {} for gene in genes.genes: for t in gene.transcripts: if t.type in amount: amount[t.type] += 1 else: amount[t.type] = 1 print 'Total Peaks:', len(peakTccs)
def exonNoisy(cName = None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) cHairs = getHairpins.getHairpins(conf.conf['resultsExons']) #CID: HAIRPIN organism = conf.conf['organism'] geneSetFolder = mConf.conf['geneSets%s' % organism] #make prediction overlap hitmap print 'Making prediction list' predList = [] for CID in cHairs: hPin = cHairs[CID] predList.append(hPin) if compare.checkIfOverlaps(predList): predList = compare.collapseOverlaps(predList) #make genes for Ensemble/make list of tccs for exons. print 'Creating gene set' ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv') print ' loaded # genes:', len(ensGenes.set) #collect levels for each haipin region print '[Checking all levels]' cidLevels = {} for CID in cHairs: print CID hPin = cHairs[CID] #for each hairpin, --> find overlapping transcripts in same gene overlappingGenes = ensGenes.geneOverlaps([hPin]) if len(overlappingGenes) > 0: gIDs = [gene.id for gene in overlappingGenes] allTccs = ensGenes.getTccsFromGIDs(gIDs) if compare.checkIfOverlaps: print ' Overlaps...collapsing' allTccs = compare.collapseOverlaps(allTccs) else: print 'NO GENE OVERLAPS!!!!!', CID, hPin #filter out my predictions. print ' Filtering out predictions' checkList = compare.subtractTwoTccLists(allTccs, predList) #Get Expression level for gene. 
print ' Retrieving Expression levels:', cg.getTccListTotalLength(checkList) levels = [] hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName) for hPin in hPinLevels: levels.extend(hPinLevels[hPin]) cidLevels[CID] = levels #output levels to file print 'Outputting to file' #find longest longest = 0 for CID in cidLevels: length = len(cidLevels[CID]) if length > longest: longest = length sortedKeys = cidLevels.keys() sortedKeys.sort() #print sortedKeys newLines = [] for j in range(0, longest): #how many lines are there newLine = [] for CID in sortedKeys: if len(cidLevels[CID]) > j:# add it newLine.append(str(cidLevels[CID][j])) else: newLine.append('NA') newLines.append('\t'.join(newLine) + '\n') outFileN = conf.conf['exonNoiseData'] outFile = open(outFileN, 'w') outFile.write('\t'.join(sortedKeys) + '\n') outFile.writelines(newLines) outFile.close()