def convertTccFileToGff(tccFileName):
    """Convert a tcc file into a sibling file named ``<tccFileName>.gff``.

    The entries are loaded with ``compare.tccFileToList(tccFileName, 0)``,
    converted, and every resulting line is written to the output verbatim
    (the converter is expected to supply any line terminators).

    tccFileName -- path of the input file; '.gff' is appended to it to form
                   the output path.
    """
    tccList = compare.tccFileToList(tccFileName, 0)

    # NOTE(review): the conversion goes through convertTccListToBed even
    # though the output is named '.gff' -- confirm whether a GFF-specific
    # converter was intended.  Kept unchanged to preserve current output.
    gffList = convertTccListToBed(tccList)

    # The context manager closes the file even if a write fails part-way.
    with open(tccFileName + '.gff', 'w') as gffFile:
        gffFile.writelines(gffList)
def convertTccFileToBed(tccFileName):
    """Convert a tcc file into a sibling file named ``<tccFileName>.bed``.

    The entries are loaded with ``compare.tccFileToList(tccFileName, 0)``,
    passed through ``convertTccListToBed`` and every resulting line is
    written to the output verbatim (the converter is expected to supply any
    line terminators).

    tccFileName -- path of the input file; '.bed' is appended to it to form
                   the output path.
    """
    tccList = compare.tccFileToList(tccFileName, 0)
    bedList = convertTccListToBed(tccList)

    # The context manager closes the file even if a write fails part-way.
    with open(tccFileName + '.bed', 'w') as bedFile:
        bedFile.writelines(bedList)
def filterOut(cName=None):
    """Write the predictions rejected by the known-region filter to a file.

    Predictions are read from conf['resultsRaw'], filtered against
    conf['knownDirectory'], and the entries reported by the filter are
    written one per line to conf['matureOverlaps'].

    cName -- configuration name handed to cgConfig.getConfig (None is passed
             through unchanged).
    """
    conf = cgConfig.getConfig(cName)
    predictionList = compare.tccFileToList(conf.conf['resultsRaw'], 1)

    # The trailing True makes filterOutTccs return the entries that were
    # filtered out rather than the list that survives the filter.
    overlapped = compare.filterOutTccs(predictionList, conf.conf['knownDirectory'], True)

    # Previously the handle was never closed; the context manager guarantees
    # the data is flushed and the descriptor released.
    with open(conf.conf['matureOverlaps'], 'w') as matureOverlaps:
        for tcc in overlapped:
            matureOverlaps.write(tcc + '\n')
def filterOut(cName=None):
    """Write the predictions rejected by the known-region filter to a file.

    Predictions are read from conf["resultsRaw"], filtered against
    conf["knownDirectory"], and the entries reported by the filter are
    written one per line to conf["matureOverlaps"].

    cName -- configuration name handed to cgConfig.getConfig (None is passed
             through unchanged).
    """
    conf = cgConfig.getConfig(cName)
    predictionList = compare.tccFileToList(conf.conf["resultsRaw"], 1)

    # The trailing True makes filterOutTccs return the entries that were
    # filtered out rather than the list that survives the filter.
    overlapped = compare.filterOutTccs(
        predictionList, conf.conf["knownDirectory"], True
    )

    # Previously the handle was never closed; the context manager guarantees
    # the data is flushed and the descriptor released.
    with open(conf.conf["matureOverlaps"], "w") as matureOverlaps:
        for tcc in overlapped:
            matureOverlaps.write(tcc + "\n")
def splitExonsIntrons(cName=None):
    """Split the sorted predictions into '.exons' and '.introns' files.

    A hairpin counts as exonic when compare.compareTwoTcc reports an overlap
    amount of at least ``minOverlap`` (50) against '<organism>Exons.tcc'.
    Each line of conf['resultsSorted'] is then copied to
    conf['resultsSorted'] + '.exons' when its 8th tab-separated field is the
    CID of an exonic hairpin, and to '... .introns' otherwise.

    cName -- configuration name handed to c.getConfig.
    """
    # The main configuration is loaded but never read below; the call is kept
    # in case loading it has side effects -- TODO confirm and remove.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  # CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    # Each returned entry is a (tcc, overlap amount) pair.
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print('%s %s' % (' ', len(exonOverlapped)))

    print('removing partial introns')
    # Keep only overlaps that reach the threshold.  Filtering in one pass
    # avoids the quadratic list.remove() loop and does not depend on the
    # pairs being lists rather than tuples.
    exonOverlapped = [pair for pair in exonOverlapped if not pair[1] < minOverlap]
    print('%s %s %s %s' % (' ', len(exonOverlapped), 'out of', len(cHairs.keys())))

    # CIDs (as strings, to match the text read from the file) whose hairpin
    # is one of the exonic tccs.  Sets make both lookups constant time;
    # assumes the tcc values are hashable strings.
    exonTccs = set(pair[0] for pair in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    # Route every prediction line by the CID found in column 7 (0-based).
    sortedName = conf.conf['resultsSorted']
    with open(sortedName, 'r') as predFile:
        with open(sortedName + '.exons', 'w') as exonFile:
            with open(sortedName + '.introns', 'w') as intronFile:
                for line in predFile:
                    if line.split('\t')[7] in exonCIDs:
                        exonFile.write(line)
                    else:
                        intronFile.write(line)
def splitExonsIntrons(cName=None):
    """Split the sorted predictions into '.exons' and '.introns' files.

    A hairpin counts as exonic when compare.compareTwoTcc reports an overlap
    amount of at least ``minOverlap`` (50) against '<organism>Exons.tcc'.
    Each line of conf['resultsSorted'] is then copied to
    conf['resultsSorted'] + '.exons' when its 8th tab-separated field is the
    CID of an exonic hairpin, and to '... .introns' otherwise.

    cName -- configuration name handed to c.getConfig.
    """
    # The main configuration is loaded but never read below; the call is kept
    # in case loading it has side effects -- TODO confirm and remove.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  # CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    # Each returned entry is a (tcc, overlap amount) pair.
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print('%s %s' % (' ', len(exonOverlapped)))

    print('removing partial introns')
    # Keep only overlaps that reach the threshold.  Filtering in one pass
    # avoids the quadratic list.remove() loop and does not depend on the
    # pairs being lists rather than tuples.
    exonOverlapped = [pair for pair in exonOverlapped if not pair[1] < minOverlap]
    print('%s %s %s %s' % (' ', len(exonOverlapped), 'out of', len(cHairs.keys())))

    # CIDs (as strings, to match the text read from the file) whose hairpin
    # is one of the exonic tccs.  Sets make both lookups constant time;
    # assumes the tcc values are hashable strings.
    exonTccs = set(pair[0] for pair in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    # Route every prediction line by the CID found in column 7 (0-based).
    sortedName = conf.conf['resultsSorted']
    with open(sortedName, 'r') as predFile:
        with open(sortedName + '.exons', 'w') as exonFile:
            with open(sortedName + '.introns', 'w') as intronFile:
                for line in predFile:
                    if line.split('\t')[7] in exonCIDs:
                        exonFile.write(line)
                    else:
                        intronFile.write(line)
import cgPeaks
import compareData as compare
import math
import bioLibCG as cg

# For every known mouse miRNA, build peaks for the same span on both strands
# and print the pairs whose highest peaks lie fewer than 12 positions apart.
knowns = compare.tccFileToList('mouseKnownMirs.tcc', 0)
eLevels = []  # profile value at the highest peak of each known that has one

for known in knowns:
    # The fields come back as text (second argument True), so the strand is
    # compared and rebuilt as a string.
    chrom, strand, start, end = cg.tccSplit(known, True)
    strand = '-1' if strand == '1' else '1'
    oppTcc = cg.makeTcc(chrom, strand, start, end)

    knownStretch = cgPeaks.stretch(known)
    knownStretch.createPeaks(1, 20)
    kPos = knownStretch.getHighestPeak()
    if kPos:
        eLevels.append(knownStretch.profile[kPos])

    oppStretch = cgPeaks.stretch(oppTcc)
    oppStretch.createPeaks(1, 20)
    oPos = oppStretch.getHighestPeak()

    # Both strands need a peak before the positions can be compared.
    if not (oPos and kPos):
        continue

    # determine if they are close enough to be considered mirrored...
    if math.fabs(int(kPos) - int(oPos)) < 12:
        print('%s %s %s %s %s' % (known, oPos, kPos,
                                  oppStretch.profile[oPos],
                                  knownStretch.profile[kPos]))
import profileTargets
import compareData as compare

# Profile every target listed in ago.200.tcc with the agoProfile.conf
# settings.
tccList = compare.tccFileToList('ago.200.tcc', 0)
profileTargets.profileTargets(
    tccList,
    'agoProfile.conf',
    dir='ago',
    min=30
)
# Alternative histogram run, currently disabled:
#profileTargets.profileTargetsHistoAS(tccList, 'agoProfile.conf', name = 'agoNEG')
import compareData as compare
import bioLibCG as cg

# Collapse overlapping exons, reporting the total length before and after,
# then store the collapsed set one tcc per line.
exonList = compare.tccFileToList('allExons.tcc', 0)
print(cg.getTccListTotalLength(exonList))

nonOverlap = compare.collapseOverlaps(exonList)
print(cg.getTccListTotalLength(nonOverlap))

o = open('mouseExons.tcc', 'w')
o.writelines(tcc + '\n' for tcc in nonOverlap)
o.close()
def intronNoisy(cName=None):
    """Dump the expression levels found around each intronic hairpin.

    For every hairpin returned by getHairpins for conf['resultsIntrons']
    (keyed by CID), the ``slide`` (1000) positions on each side of the
    hairpin are scanned after subtracting every span covered by a collapsed
    prediction or a collapsed exon.  The collected levels are written to
    conf['intronNoiseData'] as a tab-separated table: a header row with the
    sorted CIDs, then one column per CID, padded with 'NA' where a CID has
    fewer values than the longest column.

    cName -- configuration name handed to c.getConfig and to
             stepVectorScan.scanVectorsHist.
    """
    # The main configuration is loaded but never read below; the call is kept
    # in case loading it has side effects -- TODO confirm and remove.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons'])  # CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    predList = [cHairs[CID] for CID in cHairs]

    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    # Collect the levels for the flanks of each hairpin region.
    cidLevels = {}
    for CID in cHairs:
        print(CID)

        # Hairpins are 'chrom:strand:start:end'; split once and reuse.
        fields = ss(cHairs[CID], ':')
        chrom = fields[0]
        strand = fields[1]
        start = int(fields[2])
        end = int(fields[3])

        # Upstream and downstream flanks.
        # NOTE(review): start - slide can go below zero near the beginning
        # of a chromosome -- confirm the scanner tolerates that.
        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]
        print(scanRange)

        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)
        print('%s %s' % (' Retrieving Expression levels:',
                         cg.getTccListTotalLength(scanRange)))

        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for region in hPinLevels:
            levels.extend(hPinLevels[region])
        cidLevels[CID] = levels

    # One column per CID; the row count is the length of the longest column.
    sortedKeys = sorted(cidLevels)
    longest = max([len(cidLevels[CID]) for CID in sortedKeys] or [0])

    with open(conf.conf['intronNoiseData'], 'w') as outFile:
        # str() keeps the header from failing when CIDs are not strings.
        outFile.write('\t'.join(str(CID) for CID in sortedKeys) + '\n')
        for j in range(longest):
            row = []
            for CID in sortedKeys:
                column = cidLevels[CID]
                row.append(str(column[j]) if j < len(column) else 'NA')
            outFile.write('\t'.join(row) + '\n')
import profileTargets
import compareData as compare

# Profile every target listed in ago.200.tcc with the agoProfile.conf
# settings.
tccList = compare.tccFileToList('ago.200.tcc', 0)
profileTargets.profileTargets(
    tccList,
    'agoProfile.conf',
    dir='ago',
    min=30
)
# Alternative histogram run, currently disabled:
#profileTargets.profileTargetsHistoAS(tccList, 'agoProfile.conf', name = 'agoNEG')
#given tcc, return best peak combo import bioLibCG as cg import cgConfig as c import wigValue import compareData as compare #init mConf = c.cgConfig('Main.conf') conf = c.cgConfig() pRange = 100 tccList = ['chr3:-1:96042576:96042685', 'chr3:-1:96042576:96042685'] tccList = compare.tccFileToList('mouseKnownMirs.tcc', 0) timer = cg.cgTimer() timer.start() #put peaks in memory print 'loading peak data' peakFilesNames = cg.recurseDir(mConf.conf['wigMouse'], end = '.peaks') peaks = {} # chr:peak:value for pN in peakFilesNames: chrom = pN.strip().split('.')[4] strand = pN.strip().split('.')[2] #init dictionary if chrom not in peaks: peaks[chrom] = {} if strand not in peaks[chrom]:
'''For all things testing'''
import bioLibCG as cg
import compareData as compare

# Load a double-colon-dash coordinate file, convert it to tcc form and echo
# every converted entry.
fileName = ('/u/home8/gxxiao/chrisgre/scripts/FilterKnownMirs/'
            'ensemblHumanData/ensemblData .dblColonDash')
dcdList = compare.tccFileToList(fileName, 0)
tccList = cg.convertDcdToTcc(dcdList)

for x in tccList:
    print(x)
##Clusters are based off of overlapping neighbors, if you have an overlapping neighbor than you are part of that cluster. import bioLibCG as cg import subprocess import compareData as compare import cgConfig #Start Timer timer = cg.cgTimer() timer.start() #Get list of mature tccs conf = cgConfig.returnConfDict() finalMirFileName = '/u/home8/gxxiao/chrisgre/projects/PipeRuns/LanderHuman/out/LanderHuman-s3k8b17.ALL.FINAL.mirs.tsv' finalMirFileName = conf['resultsRaw'] matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc print 'List getting', timer.split() #make connections dict matureConnections = compare.makeConnectionsDict(matureTccs) print 'Make connections:', timer.split() #Now have to define Clusters... clusters = [] addedList = [] #I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P def createClusters(item = None, mode = None): if item in addedList:
def intronNoisy(cName=None):
    """Dump the expression levels found around each intronic hairpin.

    For every hairpin returned by getHairpins for conf['resultsIntrons']
    (keyed by CID), the ``slide`` (1000) positions on each side of the
    hairpin are scanned after subtracting every span covered by a collapsed
    prediction or a collapsed exon.  The collected levels are written to
    conf['intronNoiseData'] as a tab-separated table: a header row with the
    sorted CIDs, then one column per CID, padded with 'NA' where a CID has
    fewer values than the longest column.

    cName -- configuration name handed to c.getConfig and to
             stepVectorScan.scanVectorsHist.
    """
    # The main configuration is loaded but never read below; the call is kept
    # in case loading it has side effects -- TODO confirm and remove.
    c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    cHairs = getHairpins.getHairpins(
        conf.conf['resultsIntrons'])  # CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    predList = [cHairs[CID] for CID in cHairs]

    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    # Collect the levels for the flanks of each hairpin region.
    cidLevels = {}
    for CID in cHairs:
        print(CID)

        # Hairpins are 'chrom:strand:start:end'; split once and reuse.
        fields = ss(cHairs[CID], ':')
        chrom = fields[0]
        strand = fields[1]
        start = int(fields[2])
        end = int(fields[3])

        # Upstream and downstream flanks.
        # NOTE(review): start - slide can go below zero near the beginning
        # of a chromosome -- confirm the scanner tolerates that.
        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]
        print(scanRange)

        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)
        print('%s %s' % (' Retrieving Expression levels:',
                         cg.getTccListTotalLength(scanRange)))

        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for region in hPinLevels:
            levels.extend(hPinLevels[region])
        cidLevels[CID] = levels

    # One column per CID; the row count is the length of the longest column.
    sortedKeys = sorted(cidLevels)
    longest = max([len(cidLevels[CID]) for CID in sortedKeys] or [0])

    with open(conf.conf['intronNoiseData'], 'w') as outFile:
        # str() keeps the header from failing when CIDs are not strings.
        outFile.write('\t'.join(str(CID) for CID in sortedKeys) + '\n')
        for j in range(longest):
            row = []
            for CID in sortedKeys:
                column = cidLevels[CID]
                row.append(str(column[j]) if j < len(column) else 'NA')
            outFile.write('\t'.join(row) + '\n')
def defineClusters(cName=None):
    """Cluster overlapping mature tccs and count hits per sliding frame.

    Clusters are built from overlapping neighbours: an entry belongs to a
    cluster when it is connected, directly or through other entries, to a
    member of it (connections come from compare.makeConnectionsDict).

    Two files are written:
      conf['sortedClusters'] -- one line per cluster, every hit followed by
                                a comma (so each line ends with ',').
      conf['hitsPerFrame']   -- '<frame tcc>\\t<hit count>' for a frame of
                                ``frameLength`` (200) centred on every
                                coordinate from the start of the cluster's
                                first hit up to, but excluding, the end of
                                its last hit.

    Timing for every stage is printed along the way.

    cName -- configuration name handed to cgConfig.getConfig.
    """
    timer = cg.cgTimer()
    timer.start()

    # Get list of mature tccs
    conf = cgConfig.getConfig(cName)  # passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1)
    print('%s %s' % ('List getting', timer.split()))

    # make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('%s %s' % ('Make connections:', timer.split()))

    clusters = _collectClusters(matureTccs, matureConnections)
    print('%s %s' % ('Make Clusters', timer.split()))

    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
    print('%s %s' % ('Sort Clusters:', timer.split()))

    # Output sorted cluster file.  To reload it later, strip the trailing
    # comma from each line before splitting on ','.
    with open(conf.conf['sortedClusters'], 'w') as clusterFile:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    print('%s %s' % ('Store intermediate data:', timer.split()))

    # output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2  # explicit floor division, as in Python 2
    with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
        for cluster in sortedClusters:
            # The first and last hits give the span the frame slides over.
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand,
                                            startCoord - halfFrame,
                                            startCoord + halfFrame)
                # The number of entries returned is recorded as the hit
                # count for this frame.
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                outputFile.write('%s\t%s\n' % (rangeTcc, len(overlappedList)))
                startCoord = startCoord + frameShift

    print('%s %s' % ('Output Hits per Frame:', timer.split()))
    print('%s %s' % ('Overall Time:', timer.report()))


def _collectClusters(items, connections):
    """Group items into clusters of directly or indirectly connected items.

    items       -- iterable of hashable entries; each unvisited entry seeds
                   a new cluster, in iteration order.
    connections -- mapping from an entry to the entries connected to it.

    Returns a list of clusters (lists).  Members appear in the same
    depth-first order the previous recursive implementation produced, but
    the explicit stack cannot exceed the interpreter's recursion limit and
    the ``seen`` set makes the membership test constant time.
    """
    clusters = []
    seen = set()
    for seed in items:
        if seed in seen:
            continue
        cluster = []
        pending = [seed]
        while pending:
            item = pending.pop()
            if item in seen:
                continue
            seen.add(item)
            cluster.append(item)
            # Pushed in reverse so neighbours are popped in listed order.
            pending.extend(reversed(list(connections[item])))
        clusters.append(cluster)
    return clusters
def defineClusters(cName=None):
    """Cluster overlapping mature tccs and count hits per sliding frame.

    Clusters are built from overlapping neighbours: an entry belongs to a
    cluster when it is connected, directly or through other entries, to a
    member of it (connections come from compare.makeConnectionsDict).

    Two files are written:
      conf['sortedClusters'] -- one line per cluster, every hit followed by
                                a comma (so each line ends with ',').
      conf['hitsPerFrame']   -- '<frame tcc>\\t<hit count>' for a frame of
                                ``frameLength`` (200) centred on every
                                coordinate from the start of the cluster's
                                first hit up to, but excluding, the end of
                                its last hit.

    Timing for every stage is printed along the way.

    cName -- configuration name handed to cgConfig.getConfig.
    """
    timer = cg.cgTimer()
    timer.start()

    # Get list of mature tccs
    conf = cgConfig.getConfig(cName)  # passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1)
    print('%s %s' % ('List getting', timer.split()))

    # make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('%s %s' % ('Make connections:', timer.split()))

    clusters = _collectClusters(matureTccs, matureConnections)
    print('%s %s' % ('Make Clusters', timer.split()))

    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
    print('%s %s' % ('Sort Clusters:', timer.split()))

    # Output sorted cluster file.  To reload it later, strip the trailing
    # comma from each line before splitting on ','.
    with open(conf.conf['sortedClusters'], 'w') as clusterFile:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    print('%s %s' % ('Store intermediate data:', timer.split()))

    # output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2  # explicit floor division, as in Python 2
    with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
        for cluster in sortedClusters:
            # The first and last hits give the span the frame slides over.
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand,
                                            startCoord - halfFrame,
                                            startCoord + halfFrame)
                # The number of entries returned is recorded as the hit
                # count for this frame.
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                outputFile.write('%s\t%s\n' % (rangeTcc, len(overlappedList)))
                startCoord = startCoord + frameShift

    print('%s %s' % ('Output Hits per Frame:', timer.split()))
    print('%s %s' % ('Overall Time:', timer.report()))


def _collectClusters(items, connections):
    """Group items into clusters of directly or indirectly connected items.

    items       -- iterable of hashable entries; each unvisited entry seeds
                   a new cluster, in iteration order.
    connections -- mapping from an entry to the entries connected to it.

    Returns a list of clusters (lists).  Members appear in the same
    depth-first order the previous recursive implementation produced, but
    the explicit stack cannot exceed the interpreter's recursion limit and
    the ``seen`` set makes the membership test constant time.
    """
    clusters = []
    seen = set()
    for seed in items:
        if seed in seen:
            continue
        cluster = []
        pending = [seed]
        while pending:
            item = pending.pop()
            if item in seen:
                continue
            seen.add(item)
            cluster.append(item)
            # Pushed in reverse so neighbours are popped in listed order.
            pending.extend(reversed(list(connections[item])))
        clusters.append(cluster)
    return clusters
import compareData as compare

# Print the snos.tcc entries after collapsing the overlapping ones.
tccList = compare.tccFileToList('snos.tcc', 0)
collapsed = compare.collapseOverlaps(tccList)

for tcc in collapsed:
    print(tcc)
import cgGenes
import compareData as compare
import cgConfig as c

# Tally transcript types: once for the transcripts overlapped by the peaks
# and once for every transcript in the gene set.
cName = 'mm9.conf'
mConf = c.getConfig('Main.conf')
conf = c.getConfig(cName)

organism = conf.conf['organism']
geneSetFolder = mConf.conf['geneSets%s' % organism]
genes = cgGenes.createGeneSetFromFile(geneSetFolder + '/allTransciptsType.tsv')

peakTccs = compare.tccFileToList('peakData.500.mm9', 0)
tOverlaps = genes.transcriptOverlaps(peakTccs)

# type -> number of overlapped transcripts of that type
typeDict = {}
for transcript in tOverlaps:
    typeDict[transcript.type] = typeDict.get(transcript.type, 0) + 1

# count the amounts of each type for each transcript
amount = {}
for gene in genes.genes:
    for t in gene.transcripts:
        amount[t.type] = amount.get(t.type, 0) + 1

print('%s %s' % ('Total Peaks:', len(peakTccs)))
import cgPeaks
import compareData as compare
import math
import bioLibCG as cg

# For every known mouse miRNA, build peaks for the same span on both strands
# and print the pairs whose highest peaks lie fewer than 12 positions apart.
knowns = compare.tccFileToList("mouseKnownMirs.tcc", 0)
eLevels = []  # profile value at the highest peak of each known that has one

for known in knowns:
    # The fields come back as text (second argument True), so the strand is
    # compared and rebuilt as a string.
    chrom, strand, start, end = cg.tccSplit(known, True)
    strand = "-1" if strand == "1" else "1"
    oppTcc = cg.makeTcc(chrom, strand, start, end)

    knownStretch = cgPeaks.stretch(known)
    knownStretch.createPeaks(1, 20)
    kPos = knownStretch.getHighestPeak()
    if kPos:
        eLevels.append(knownStretch.profile[kPos])

    oppStretch = cgPeaks.stretch(oppTcc)
    oppStretch.createPeaks(1, 20)
    oPos = oppStretch.getHighestPeak()

    # Both strands need a peak before the positions can be compared.
    if not (oPos and kPos):
        continue

    # determine if they are close enough to be considered mirrored...
    if math.fabs(int(kPos) - int(oPos)) < 12:
        print("%s %s %s %s %s" % (known, oPos, kPos,
                                  oppStretch.profile[oPos],
                                  knownStretch.profile[kPos]))
#get results that are only noncoding
import bioLibCG as cg
import compareData as compare

predName = '/home/chrisgre/projects/NoncodingMouse/results/NCmouse-s3k8b17.bothNCandC.results'
keepList = compare.tccFileToList('keepNoncoding.tcc', 0)
predList = compare.tccFileToList(predName, 1)

# Predictions that compareTwoTcc reports against the noncoding keep-list.
keepers = compare.compareTwoTcc(predList, keepList, 1)
print(len(keepers))

#now go back through pred file and create a new file with only lines that have noncoding in them
with open(predName, 'r') as predFile:
    predLines = predFile.readlines()

# newLines records the lines already written so duplicates are emitted once.
# Lines are written in input order (the old dict iteration order was
# arbitrary) and the output file is closed when the block exits.
# NOTE(review): 'keeper in line' is a substring test, so a keeper can also
# match a longer coordinate that merely starts with it -- confirm intended.
newLines = {}
with open('NCmouse.noncoding.results', 'w') as outFile:
    for line in predLines:
        if line in newLines:
            continue
        for keeper in keepers:
            if keeper in line:
                newLines[line] = 1
                outFile.write(line)
                break