def __init__(self, baseDir, maxMotifLength, alphabet, samplingFreq, enableMiRSampling=True, enableNegSeq=False, fetchGroupMode=0, covPrecThres=0.1): self.maxMotifLength = int(maxMotifLength) self.alphabet = alphabet self.samplingFreq = int(samplingFreq) self.maxPathLength = 20 # max length when search for paths in 2mer graph self.backgroundRatio = { 'A': 0.221, 'C': 0.235, 'G': 0.288, 'T': 0.256, 'U': 0.256 } self.ghostscriptProcessor = 'gs-921-linux-x86_64' self.uniprobe2memeProcessor = './uniprobe2meme' self.tomtomProcessor = './tomtom' self.enableMiRSampling = enableMiRSampling self.enableNegSeq = enableNegSeq self.fetchGroupMode = fetchGroupMode self.covPrecThres = covPrecThres # dir info self.baseDir = baseDir.replace(' ', '\\ ') self.PPMDir = os.path.join(baseDir, 'PPM') self.uniprobeDir = os.path.join(baseDir, 'PPM') self.epsDir = os.path.join(baseDir, 'eps') self.pngDir = os.path.join(baseDir, 'png') folderLis = [self.PPMDir, self.epsDir, self.pngDir, self.uniprobeDir] # input file self.userDataFn = os.path.join(baseDir, 'userInput.fa') self.negSeqFn = os.path.join(baseDir, 'negSeq.fa') # output file self.inputInfoFn = os.path.join(baseDir, 'inputInfo.txt') self.pattern2pwmSimiFn = os.path.join(baseDir, 'pattern2pwmSimi') self.pattern2covSimiFn = os.path.join(baseDir, 'pattern2covSimi') self.pattern2covFn = os.path.join(baseDir, 'pattern2cov') self.pattern2kmerSetFn = os.path.join(baseDir, 'pattern2kmerSet.txt') self.patternInfoFn = os.path.join(baseDir, 'patternInfo.txt') self.clusterInfoFn = os.path.join(baseDir, 'clusterInfo.txt') self.finalRltFn = os.path.join(baseDir, 'finalRlt.txt') self.jobDoneFn = os.path.join(baseDir, 'job Done!!') # global variables self.overallSigKmer = [{}, {}] if self.enableMiRSampling else [{}] self.wholeTree = [[], []] if self.enableMiRSampling else [[]] self.allInTreeKmerSet = [set(), set() ] if self.enableMiRSampling else [set()] self.overallKmer2Cov = {} self.overallRefKmer2CovThreshold = [ {}, {} ] if self.enableMiRSampling else [{}] # self.overallUserKmer2VisDetail = {} self.overallRefKmer2covIdxMatrix = [ {}, {} ] if self.enableMiRSampling else [{}] self.layerCovLis = [] self.K2kmers = {} self.pattern2kmerSet = {} # open file self.patternInfoFile = open(self.patternInfoFn, 'w') self.clusterInfoFile = open(self.clusterInfoFn, 'w') self.finalRltFile = open(self.finalRltFn, 'w') # initialization functions if os.path.exists(self.jobDoneFn): os.remove(self.jobDoneFn) Comm.CreateFolder(folderLis) self.ShowBasicInfo()
def pipeline(self): # TODO change if SINGLE_LENGTH_MODE: motifLengthLis = [TARGET_LENGTH] else: motifLengthLis = range(self.maxMotifLength + 1)[2:] # 2 <= K < maxMotifLength # load user data, build 2-mer graph and fetch path userSeqLis, seqCnt, minSeqLen, maxSeqLen = BioinfoComm.loadSinglelineSeq( self.userDataFn) kmerMatrix = DimerGraph.DivideSeq(userSeqLis) dimerGraph = DimerGraph.BuildGraph(kmerMatrix, alphabet=self.alphabet) K2Paths = DimerGraph.SearchPaths(dimerGraph, self.maxPathLength, enableFiltering=False, covPercThres=self.covPrecThres) self.overallKmer2Cov[1] = {'root': seqCnt} self.layerCovLis.append( ('root', seqCnt)) # root layer covers all sequences # load ref data sampledSeqMatrix = [] RNASampledSeqMatrix = SignificaceEvaluation.SampleRNARef( RNA_REF_FN, seqCnt, self.samplingFreq, minSeqLen, maxSeqLen, self.alphabet) sampledSeqMatrix.append(RNASampledSeqMatrix) if self.enableMiRSampling: miRSampledSeqMatrix = SignificaceEvaluation.SampleMiRRef( MIR_REF_FN, seqCnt, self.samplingFreq, self.alphabet) sampledSeqMatrix.append(miRSampledSeqMatrix) PrintWithTime("finish loading data", isLogging=True) INFO("number of sequences: %s" % seqCnt) INFO("min length: %s" % minSeqLen) INFO("max length: %s" % maxSeqLen) print9(isLogging=True) kmer2visDetail = {} for motifLength in motifLengthLis: PrintWithTime("begin to calculate on layer motifLength = %s" % motifLength, isLogging=True) for typeId, allSigKmer in enumerate(self.overallSigKmer): title = 'RNA' if typeId == 0 else 'miR' lastLayerKmers = allSigKmer if motifLength > 2 else BioinfoComm.GenerateKmerLis( self.alphabet, 2) inTreeKmerSet = self.allInTreeKmerSet[typeId] # not all the k-mers are sampled in order to increase the speed sigKmerSet, userKmer2cov, kmer2visDetail, kmer2refCovThreshold, kmer2Pvalue, _, refKmer2covIdxMatrix, inTreeKmerSet = SignificaceEvaluation.EvalSignificance( motifLength, sampledSeqMatrix[typeId], lastLayerKmers, seqCnt, self.samplingFreq, K2Paths, inTreeKmerSet, self.alphabet, enableParallel=ENABLE_PARALLEL, coreNm=CORE_NUM, covPrecThres=self.covPrecThres) filtSigKmerSet, self.wholeTree[ typeId] = HierarchicalGraph.BuildTree( motifLength, self.wholeTree[typeId], sigKmerSet, lastLayerKmers, userKmer2cov, userSeqLis, covPrecThres=self.covPrecThres) HierarchicalGraph.DisplayLayerInfo(title, sigKmerSet, filtSigKmerSet, self.wholeTree[typeId], kmer2Pvalue) # update overall dict self.overallKmer2Cov.setdefault(motifLength, userKmer2cov) self.overallRefKmer2covIdxMatrix[ typeId] = refKmer2covIdxMatrix # [RNA, miR] self.overallRefKmer2CovThreshold[typeId] = dict( kmer2refCovThreshold, **self.overallRefKmer2CovThreshold[typeId]) # merge dict # TODO: change if SINGLE_LENGTH_MODE: self.overallSigKmer[typeId] = filtSigKmerSet else: self.overallSigKmer[typeId] = sigKmerSet self.allInTreeKmerSet[typeId] = inTreeKmerSet if not self.overallSigKmer[0] and (self.enableMiRSampling and not self.overallSigKmer[1]): continue treeStructureFn = os.path.join(self.baseDir, 'currentTree_K=%s' % motifLength) # TODO: change if SINGLE_LENGTH_MODE: segmentLis = sigKmerSet else: self.K2kmers = HierarchicalGraph.DrawTree( treeStructureFn, self.wholeTree, self.overallKmer2Cov, self.overallRefKmer2CovThreshold, enableMiRSampleing=self.enableMiRSampling) print0(isLogging=True) if motifLength < 2: continue # display all the segments segmentLis = self.K2kmers[motifLength][0] | self.K2kmers[ motifLength][1] PrintWithTime("all the segments", isLogging=True) INFO(segmentLis) print9(isLogging=True) # build segment similarity graph INFO('fetching segment similarity graph') graphFn = os.path.join(self.baseDir, 'segmentSimiGraph-K=%s' % motifLength) simiGraph = Backpack.BuildSimiGraph(graphFn, segmentLis) # format covered sequences into binary number allNode = set(simiGraph.node_neighbors.keys()) kmer2seqIdSet = { kmer: set(map(lambda x: x[0], visLis)) for kmer, visLis in kmer2visDetail.iteritems() } userKmer2seqIdInt = BioinfoComm.formatCovId(kmer2seqIdSet, seqCnt) # build backpack model to find the path based on segments INFO('building backpack problem') initMinIC = (0.61 * 0.25 + 0.3 * 0.5 + 0.125 * 0.25) * motifLength initMaxIC = 0.61 * motifLength # 0 variables in each position allPatterns, userKmer2seqIdInt, pattern2IC, pattern2kmerSet = Backpack.FetchPatternWithICIter( allNode, simiGraph.node_neighbors, initMinIC, initMaxIC, userSeqLis, set(), userKmer2seqIdInt, self.overallKmer2Cov, pattern2IC={}, enableParallel=ENABLE_PARALLEL, coreNm=CORE_NUM, pattern2kmerSet={}) self.pattern2kmerSet = dict(self.pattern2kmerSet, **pattern2kmerSet) PrintWithTime("all the patterns", isLogging=True) INFO(allPatterns) print0(isLogging=True) # fetch pattern's IC, coverage and p-value to do filtering and sorting pattern2cov, _ = outputRlt.FetchPatternSetCov( allPatterns, userSeqLis, userKmer2seqIdInt, seqCnt) PrintWithTime('calculating Pvalue of RNA', isLogging=True) RNAPattern2pvalue, disabledRNAPattern = outputRlt.FetchPatternPvalue( initMinIC, seqCnt, pattern2IC, pattern2cov, sampledSeqMatrix[0], self.overallRefKmer2covIdxMatrix[0], self.samplingFreq, enableParallel=ENABLE_PARALLEL, coreNm=CORE_NUM) self.overallRefKmer2covIdxMatrix[0] = {} print9(isLogging=True) miRPattern2pvalue = {} if self.enableMiRSampling: PrintWithTime('calculating Pvalue of miR', isLogging=True) miRPattern2pvalue, _ = outputRlt.FetchPatternPvalue( initMinIC, seqCnt, pattern2IC, pattern2cov, sampledSeqMatrix[1], self.overallRefKmer2covIdxMatrix[1], self.samplingFreq, disabledInputPattern=disabledRNAPattern, enableParallel=ENABLE_PARALLEL, coreNm=CORE_NUM) print9(isLogging=True) self.overallRefKmer2covIdxMatrix = [ {}, {} ] if self.enableMiRSampling else [{}] # format the pattern result and merge all the dict patternInfoLis = [] for sourcePattern, userCov in pattern2cov.iteritems(): if sourcePattern not in RNAPattern2pvalue or sourcePattern not in RNAPattern2pvalue: continue IC = pattern2IC[sourcePattern] RNAPvalue = 1.0 if RNAPattern2pvalue[ sourcePattern] > 1 else RNAPattern2pvalue[sourcePattern] if RNAPvalue > 0.05: continue miRPvalue = 'N/A' if not self.enableMiRSampling else 1.0 if miRPattern2pvalue[ sourcePattern] > 1 else miRPattern2pvalue[sourcePattern] patternInfoLis.append( (sourcePattern, IC, userCov, RNAPvalue, miRPvalue)) if not patternInfoLis: continue patternInfoLis.sort(key=lambda x: (x[2], x[1]), reverse=True) # filter negative pattern if self.enableNegSeq: patternCovLis = map(lambda x: (x[0], x[2]), patternInfoLis) allKmerSet = BioinfoComm.GenerateKmerLis( self.alphabet, motifLength) signKmerSet = negativeTest.TestPatternInNegSeq( patternCovLis, seqCnt, self.negSeqFn, allKmerSet) patternInfoLis = filter(lambda x: x[0] in signKmerSet, patternInfoLis) # display motif pattern result PrintWithTime("IC, cov and pvalue of all the patterns", isLogging=True) Comm.showList(patternInfoLis, isLogging=True) self.patternInfoFile.write('=====motif length: %s=====\n' % motifLength) titleLine = '\t'.join([ 'pattern', 'IC', 'coverage(in %s)' % len(userSeqLis), 'RNA_Pvalue', 'miR_Pvalue' ]) self.patternInfoFile.write('%s\n' % titleLine) for item in patternInfoLis: curLine = '\t'.join(map(lambda x: str(x), item)) self.patternInfoFile.write('%s\n' % curLine) print0(isLogging=True) # fetch PPM PrintWithTime("begin to fetch PPM", isLogging=True) curPPMDir = os.path.join(self.PPMDir, str(motifLength)) curUniprobeDir = os.path.join(self.uniprobeDir, str(motifLength)) curEpsDir = os.path.join(self.epsDir, str(motifLength)) curPngDir = os.path.join(self.pngDir, str(motifLength)) Comm.CreateFolder([curPPMDir, curEpsDir, curPngDir]) # output PPM and uniprobe for tomtom PPMFn = os.path.join(curPPMDir, 'PPM.txt') uniprobeFn = os.path.join(curUniprobeDir, 'uniprobe.txt') pattern2PPM, uniprobeFnLis = outputRlt.OutputPPM( motifLength, patternInfoLis, PPMFn, uniprobeFn, userSeqLis, self.alphabet, self.backgroundRatio) if len(patternInfoLis) <= 1: continue # output pattern similarity to do clustering # run uniprobe2meme memeFnLis = [] for uniprobeFn in uniprobeFnLis: basename, ext = os.path.splitext(uniprobeFn) memeFn = '%s.meme' % basename memeFnLis.append(memeFn) bgFn = 'data/background_rna' if 'U' in self.alphabet else 'data/background_dna' rnaPara = '-rna' if 'U' in self.alphabet else '' cmd = '%s %s -bg %s %s > %s' % ( self.uniprobe2memeProcessor, rnaPara, bgFn, rawFormat(uniprobeFn), rawFormat(memeFn)) INFO(cmd) os.system(cmd) # merge all the motif info(*.meme) into one file allMotifFn = '%s/allMotif.meme' % os.path.dirname(uniprobeFn) hasHeader = False patternMap = {} mergedPatternLis = [] with open(allMotifFn, 'w') as allMotifFileobj: for memeFn in memeFnLis: sourcePattern = os.path.basename(memeFn).strip( 'uniprobe-').partition('.')[0] with open(memeFn) as memeFileobj: isMotifSection = False for line in memeFileobj: if line[:5] == 'MOTIF': isMotifSection = True encodePattern = line.strip().partition(' ')[2] mergedPatternLis.append(encodePattern) patternMap[encodePattern] = sourcePattern if not hasHeader or isMotifSection: allMotifFileobj.write(line) hasHeader = True """ # run tomtom 1 vs. 1, this section is replaced by running tomtom 1 vs. all, in which pvalue is more accurate patternPairInfo = {} for memeFn1 in memeFnLis: for memeFn2 in memeFnLis: if memeFn1 <= memeFn2: continue pattern1 = os.path.basename(memeFn1).strip('uniprobe-').partition('.')[0] pattern2 = os.path.basename(memeFn2).strip('uniprobe-').partition('.')[0] cmd = '%s -text -no-ssc -oc . -verbosity 1 -min-overlap 2 -mi 1 -dist pearson -evalue -thresh 10.0 %s %s' % (self.tomtomProcessor, rawFormat(memeFn1), rawFormat(memeFn2)) INFO(cmd) rltLine = os.popen(cmd).readlines()[-1] items = rltLine.strip().split('\t') offset = int(items[2]) pvalue = float(items[3]) evalue = float(items[4]) qvalue = float(items[5]) patternPairInfo[(pattern1, pattern2)] = (offset, pvalue, evalue, qvalue) patternPairInfo[(pattern2, pattern1)] = (-1 * offset, pvalue, evalue, qvalue) """ # run tomtom 1 vs. all allMotifTomtomRltFn = '%s/allMotifTomtomRlt.meme' % os.path.dirname( uniprobeFn) with open(allMotifTomtomRltFn, 'w') as allMotifTomtomRltFileobj: patternPairInfo = {} for memeFn in memeFnLis: sourcePattern = os.path.basename(memeFn).strip( 'uniprobe-').partition('.')[0] cmd = '%s -text -no-ssc -oc . -verbosity 1 -min-overlap %s -mi 1 -dist pearson -evalue -thresh 10.0 %s %s' % ( self.tomtomProcessor, motifLength / 2, rawFormat(memeFn), allMotifFn) INFO(cmd) rltLines = os.popen(cmd).readlines() isMotifSection = False for rltLine in rltLines: INFO(rltLine) allMotifTomtomRltFileobj.write('%s' % rltLine) if rltLine[ 0] == '#': # only read motif section which start with '#' isMotifSection = True continue elif not isMotifSection: continue items = rltLine.strip().split('\t') targetPattern = items[1] offset = int(items[2]) pvalue = float(items[3]) evalue = float(items[4]) qvalue = float(items[5]) patternPairInfo[(sourcePattern, targetPattern)] = (offset, pvalue, evalue, qvalue) patternPairInfo[(targetPattern, sourcePattern)] = (-1 * offset, pvalue, evalue, qvalue) # node's weight: pattern and their coverage PrintWithTime("begin to fetch node weight(pattern coverage)", isLogging=True) pattern2covFn = '%s-K=%s.txt' % (self.pattern2covFn, motifLength) pattern2cov = patternClustering.outputPatternCov( patternInfoLis, pattern2covFn) # edge's weight: pattern and their similarity based on tomtom PrintWithTime("begin to fetch edge weight(pattern similarity)", isLogging=True) pattern2simiFn = '%s-K=%s.txt' % (self.pattern2pwmSimiFn, motifLength) pattern2pwmSimi = patternClustering.CalcPatternPwmSimi( patternInfoLis, pattern2PPM, patternPairInfo, pattern2simiFn, self.alphabet, doNormalization=False) # print data to show pattern simi edge, cov and pvalue INFO('** pattern info: simi, coverage and pvalue **') showPatternSimi(pattern2pwmSimi, pattern2cov, RNAPattern2pvalue, miRPattern2pvalue, patternPairInfo) INFO('***' * 20) if not pattern2pwmSimi: continue """ # build pattern similarity graph and do clustering, this section is replace by R script below PrintWithTime("begin to build pattern similarity graph", isLogging=True) clusterLis = patternClustering.patternClustering(pattern2cov, pattern2pwmSimi) """ # run R script to build similarity graph and do clustering PrintWithTime("begin to build pattern similarity graph", isLogging=True) clusterRltFn = '%s/clusterRlt-K=%s.txt' % ( os.path.dirname(pattern2simiFn), motifLength) cmd = "Rscript clustering.R %s %s" % (pattern2simiFn, clusterRltFn) INFO(cmd) os.system(cmd) clusterLis = patternClustering.loadClusterRlt(clusterRltFn) if not clusterLis: continue # show cluster info outputLine = '==== clusters for motif with length: %s ====' % motifLength self.clusterInfoFile.write('%s\n' % outputLine) INFO(outputLine) for cluster in clusterLis: outputLine = '\t'.join(cluster) self.clusterInfoFile.write('%s\n' % outputLine) INFO(outputLine) # for each cluster, find some cores PrintWithTime("begin to fetch cluster core", isLogging=True) coreMotifSet, coreOutputLines = patternClustering.searchForCoreInCluster( clusterLis, userKmer2seqIdInt, seqCnt, pattern2cov, pattern2IC, RNAPattern2pvalue, miRPattern2pvalue, self.fetchGroupMode, covPrecThres=self.covPrecThres) # show cores in each cluster, which are the final motif to report outputLine = '== core pattern in each cluster ==' self.clusterInfoFile.write('%s\n' % outputLine) INFO(outputLine) for coreOutputLine in coreOutputLines: self.clusterInfoFile.write('%s\n' % coreOutputLine) INFO(coreOutputLine) # output final result INFO('== final result ==') outputLine = 'pattern length: %s' % motifLength self.finalRltFile.write('%s\n' % outputLine) INFO(outputLine) coreMotifLis = sorted(coreMotifSet, key=lambda x: (pattern2cov[x], pattern2IC[x]), reverse=True) for motif in coreMotifLis: outputLis = [ motif, str(pattern2cov[motif]), str(pattern2IC[motif]), str(RNAPattern2pvalue[motif]), str(miRPattern2pvalue.get(motif, 'N/A')) ] outputLine = '\t'.join(outputLis) self.finalRltFile.write('%s\n' % outputLine) INFO(outputLine) # output logo PrintWithTime("begin to output logo", isLogging=True) epsFn = os.path.join(curEpsDir, 'logo.eps') pngFn = os.path.join(curPngDir, 'logo.png') outputRlt.OutputLogo(coreMotifLis, motifLength, self.ghostscriptProcessor, PPMFn, epsFn, pngFn) # showing finishing a loop PrintWithTime("finish the loop for K=%s" % motifLength, isLogging=True) # end_for for current K # write pattern2kmerSet dict with open(self.pattern2kmerSetFn, 'w') as pattern2kmerSetFileobj: pickle.dump(self.pattern2kmerSet, pattern2kmerSetFileobj) PrintWithTime("finish the pipeline", isLogging=True) with open(self.jobDoneFn, 'w') as f: pass