def sequenceDTAs(self):
    """Sequence the entry currently selected in the paired-scan listbox.

    The selected entry of ``self._indexedPairData`` is sequenced as a
    light/heavy pair, or as a single light scan when its ``'heavy'`` field is
    ``'N/A'``.  Up to ``self._numseq`` candidate sequences are shown, best
    score first, in the widgets held by ``self._seqScoreData``; slots for
    which no candidate exists are cleared.  The outcome and the elapsed time
    are reported through ``self._seqStatus``.

    Returns:
        None.  All results are delivered through the GUI variables/widgets.
    """

    def clearLabels(container):
        # Iterate over a snapshot so destroy() cannot disturb the iteration
        # if it modifies the `children` mapping.
        for labelInst in list(container.children.values()):
            labelInst.destroy()

    curPairedScanData = self._indexedPairData[int(self._pairedScanListbox.curselection()[0])]
    t1 = time.time()
    isPaired = curPairedScanData['heavy'] != 'N/A'
    lightDTA = self._scanFDict[curPairedScanData['light']]['dta']

    if isPaired:
        pairConfigName = curPairedScanData['pair configuration']
        pairConfig = self._paramsDict['Pair Configurations'][pairConfigName]
        # Work on a deep copy so the shared sequence map keeps its own
        # terminal modification symbols.
        heavySeqMap = copy.deepcopy(self._seqMap)
        heavySeqMap['Mods']['N-Term'] = pairConfig['NModSymbol']
        heavySeqMap['Mods']['C-Term'] = pairConfig['CModSymbol']
        sharedInfo, starts, ends, deltas, termModHash, specs, G = DNS.initializeSpectrumGraph(
            self._pnet, self._paramsDict, lightDTA,
            heavyPath=self._scanFDict[curPairedScanData['heavy']]['dta'],
            ppm=self._ppm, usePaired=True, pairConfigName=pairConfigName,
            verbose=False)
        precMass = sharedInfo['lightPrecMass']
    else:
        sharedInfo, starts, ends, deltas, termModHash, specs, G = DNS.initializeSpectrumGraph(
            self._pnet, self._paramsDict, lightDTA, ppm=self._ppm, verbose=False)
        precMass = sharedInfo['precMass']

    epsilon = self._ppm * precMass * 10 ** -6
    # The same mass (precursor mass minus H+ and H2O) is needed for the path
    # search and for every sequence reconstruction, so compute it once.
    pepMass = precMass - Constants.mods['H+'] - Constants.mods['H2O']
    paths, subG = DNS.getSpectrumGraphPaths(
        G, deltas, specs, starts, ends, pepMass,
        termModHash=termModHash, unknownPenalty=self._ambigpenalty,
        maxEdge=self._maxedge, minEdge=self._minedge,
        subGraphCut=self._subgraphcut, subAlpha=0.3, alpha=self._alpha,
        epsilon=epsilon, aas=self._aas, verbose=False)
    seqTime = time.time() - t1

    if not paths:
        self._seqStatus.set('ERROR: No Sequences Found! Time wasted: %f seconds' % seqTime)
        return

    # Each path is indexed as path[0] = score, path[1] = node list.
    seqs = [DNS.getSequenceFromNodes(subG, path[1], pepMass, termModHash) for path in paths]
    scores = [path[0] for path in paths]
    Ord = np.argsort(-1 * np.array(scores))

    # NOTE(review): this list is shared by all candidates, so edges collected
    # for higher-ranked sequences are also passed along when lower-ranked
    # ones are processed -- confirm this is intended.
    ambigEdges = []
    for j in range(self._numseq):
        try:
            # Ord[j] raises IndexError when fewer than _numseq paths were
            # found; the handler below then blanks this display slot.
            seq = seqs[Ord[j]]
            for i, residue in enumerate(seq):
                if isinstance(residue, tuple):
                    # Tuples mark ambiguous edges; show them as '-' and keep
                    # the edge itself for the downstream helpers.
                    ambigEdges.append(residue)
                    seq[i] = '-'

            curSeq = ''.join(seq)
            curSeq = An.preprocessSequence(curSeq, self._seqMap, ambigEdges=ambigEdges)

            if j == 0 and isPaired:
                # Start from the light sequence so the error message can be
                # formatted even when building the heavy sequence is the
                # step that raises KeyError.
                curHeavySeq = curSeq
                try:
                    curHeavySeq = An.preprocessSequence(
                        curSeq, heavySeqMap, replaceExistingTerminalMods=True,
                        ambigEdges=ambigEdges)
                    # Called for validation only; its result is not needed.
                    An.getAllAAs(curHeavySeq, ambigEdges=ambigEdges)
                except KeyError:
                    self._seqStatus.set('ERROR: Heavy Sequence %s is not a valid sequence! Time wasted: %f seconds' % (curHeavySeq, seqTime))
                else:
                    self._seqStatus.set('Paired Sequencing Successful! Heavy Sequence: %s. Time taken: %f seconds' % (curHeavySeq, seqTime))
            elif j == 0:
                self._seqStatus.set('Unpaired Sequencing Successful! Time taken: %f seconds' % (seqTime))

            clearLabels(self._seqScoreData[j]['seq'])
            self.displayConfColoredSequence(
                subG, self._seqScoreData[j]['seq'], paths[Ord[j]][1], curSeq,
                ambigEdges=ambigEdges)
            self._seqScoreData[j]['score'].set(str(scores[Ord[j]]))
        except IndexError:
            clearLabels(self._seqScoreData[j]['seq'])
            self._seqScoreData[j]['score'].set('')
def combineSpectraCompleteLinkage(masses1, masses2, epsilon=0.04):
    """Pool two collections of masses and collapse each cluster to its mean.

    The pooled masses are sorted and handed, together with ``epsilon``, to
    ``DNS.getClustersCompleteLinkage``.  Every returned cluster with more
    than one member contributes the arithmetic mean of its members; smaller
    clusters contribute their members unchanged.

    Returns:
        A sorted numpy array of the combined masses.
    """
    pooled = np.sort(np.append(masses1, masses2))

    merged = []
    for group in DNS.getClustersCompleteLinkage(pooled, epsilon):
        if len(group) <= 1:
            # Augmented assignment is kept on purpose: it behaves the same
            # for whatever sequence type the clustering helper returns.
            merged += group
            continue
        merged += [sum(group) / len(group)]

    return np.sort(np.array(merged))
def getSequencing(pair, sharedPeaks, paramsDict, outFile, res):
    """Sequence one light/heavy cluster pair and append its rows to ``res``.

    Args:
        pair: two indices into the module-level ``samePeptideClusters``;
            ``pair[0]`` selects the light-scan cluster, ``pair[1]`` the
            heavy-scan cluster.
        sharedPeaks: stored as the result's ``'shared peaks ratio'``.
        paramsDict: not used in this function; kept for the call signature.
        outFile: not used in this function; rows are appended to ``res``.
        res: list-like collector; when ``options.output`` is set and the
            sequence validates, it receives one row (list of strings, in
            ``cols`` order) per light/heavy scan combination.

    Besides its arguments the function reads module-level state such as
    ``options``, ``scanFDict``, ``pairConfig``, ``pairConfigName``, ``PNet``,
    ``print_lock`` and ``cols``, and marks the involved scans as sequenced in
    ``scanFDict`` when the sequence validates.

    Raises:
        SystemExit: always, with status 0, once the work is done.

    NOTE(review): indentation was lost in the reviewed source.  The extent of
    the ``with print_lock:`` block and the function-level placement of the
    final exit were reconstructed from statement order -- confirm both.
    """
    # Function-scope import: sys.exit replaces the `exit` helper that is
    # only injected by the `site` module and is meant for interactive use.
    import sys

    lightScans = samePeptideClusters[pair[0]]
    heavyScans = samePeptideClusters[pair[1]]

    scanData = {}
    lightSpecs = [DataFile.getMassIntPairs(scanFDict[scanF]['dta']) for scanF in lightScans]
    heavySpecs = [DataFile.getMassIntPairs(scanFDict[scanF]['dta']) for scanF in heavyScans]
    # The precursor mass is averaged over the light scans only.
    precMass = np.average(np.array([scanFDict[scanF]['precMass'] for scanF in lightScans]))

    epMean = options.ppmsyserror * precMass * 10**-6
    epSTD = options.ppmstd * precMass * 10**-6

    scanData['shared peaks ratio'] = sharedPeaks

    s1 = time.time()
    _sharedInfo, starts, ends, deltas, G = DNS.prepPairedSpectrumGraph(
        lightSpecs, heavySpecs, precMass, addEnds, ppmSTD=options.ppmstd,
        Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'],
        verbose=options.verbose)
    scanData['M+H'] = precMass

    # Light spectra carry no terminal modification; heavy spectra are shifted
    # by the pair configuration's N- and C-terminal modification masses.
    heavyPrecMass = precMass + pairConfig['NMod'] + pairConfig['CMod']
    specs = [PN.Spectrum(PNet, precMass, Nmod=0.0, Cmod=0.0, epsilon=2 * epSTD,
                         spectrum=massIntPairs)
             for massIntPairs in lightSpecs]
    specs += [PN.Spectrum(PNet, heavyPrecMass, Nmod=pairConfig['NMod'],
                          Cmod=pairConfig['CMod'], epsilon=2 * epSTD,
                          spectrum=massIntPairs)
              for massIntPairs in heavySpecs]
    for spec in specs:
        spec.initializeNoiseModel()

    graphData = DNS.getSpectrumGraphDataThread(
        G, deltas, specs, starts, ends,
        precMass - Constants.mods['H+'] - Constants.mods['H2O'],
        ambigPenaltyFun, ppmPenaltyFun, hashedAAs, termModHash=termModHash,
        maxEdge=options.maxedge, minEdge=options.minedge,
        subGraphCut=options.subgraphcut, subAlpha=0.3, alpha=options.alpha,
        epMean=epMean, epSTD=epSTD, epStep=epStep, verbose=options.verbose)
    scanData.update(graphData[0])
    peps = graphData[1]

    scanData['pair configuration'] = pairConfigName

    # Every line is formatted completely before it is printed, so a failing
    # join can no longer leave a partially written line behind.
    with print_lock:
        print('Now sequencing light scan(s) %s, heavy scan(s) %s with shared peaks ratio %f \n' % (str(lightScans), str(heavyScans), scanData['shared peaks ratio']))

        Ord = np.argsort(-1 * np.array(scanData['over_scores']))
        if scanData['blind'] == 0:
            for i in range(min(Ord.size, 10)):
                idx = Ord[i]
                try:
                    seqText = ''.join(peps[1][idx])
                except TypeError:
                    # Not a sequence of strings: print the raw value.
                    seqText = peps[1][idx]
                print('Score:  %s Seq:  %s' % (peps[0][idx], seqText))
        elif scanData['blind'] == 1:
            for i in range(min(Ord.size, maxNum)):
                idx = Ord[i]
                try:
                    seqText = ''.join(peps[1][idx][0])
                    modNames = peps[2][idx][1]
                except TypeError:
                    seqText = peps[1][idx][0]
                    # NOTE(review): this fallback reads peps[2][1], not
                    # peps[2][idx][1]; kept unchanged -- confirm intent.
                    modNames = peps[2][1]
                print('Score:  %s Seq:  %s Mod Names:  %s' % (peps[0][idx], seqText, modNames))

        elapsed = time.time() - s1
        scanData['sequencing time'] = elapsed
        print('\nTime Taken: %s \n' % elapsed)

    if validateHeavySequence(scanData['seq'], heavySeqMap, scanData['ambiguous edges']):
        for scanF in lightScans + heavyScans:
            scanFDict[scanF]['sequenced'] = True
        if options.output:
            # One output row per light/heavy scan combination.
            for lightScanF in lightScans:
                for heavyScanF in heavyScans:
                    scanData['light scan'] = int(lightScanF)
                    scanData['heavy scan'] = int(heavyScanF)
                    res.append([str(scanData[col]) for col in cols])
    else:
        print('WARNING: Invalid sequence! Unsuccessful sequencing of %s and %s with pair configuration %s' % (str(lightScans), str(heavyScans), pairConfigName))

    sys.exit(0)
# Keys of the per-pair result dictionary, in output-column order.
cols = [
    'light scan',
    'heavy scan',
    'pair configuration',
    'M+H',
    'score',
    'seq',
    'epsilon',
    'ambiguous edges',
    'num ambig edges',
]

if options.output:
    # The handle stays open after this block; only the header row is
    # written here.
    outFile = open(options.output, 'w')
    headerLine = '\t'.join([col.upper() for col in cols]) + '\n'
    outFile.write(headerLine)

PNet = PN.ProbNetwork(options.config, options.model)

dtaPattern = options.dtadir + '/*.dta'
dtaList = glob.glob(dtaPattern)
scanFDict = getScanFDict(dtaList)

aas = Constants.addPepsToAADict(300)
hashedAAs = Constants.hashAAsEpsilonRange(aas, epStep, maxEp)

ambigOpenPenalty = 0
ambigPenaltyFun = DNS.getAmbigEdgePenaltyFunction(
    options.minedge, ambigOpenPenalty, options.ambigpenalty)
ppmPenaltyFun = DNS.getPPMPenaltyFun(
    options.ppmstd, hashedAAs, options.minedge, options.ppmpenalty,
    options.ppmsyserror, epStep)

print('Getting Clusters')
parent = os.path.abspath(os.pardir)
# The configured model location is appended to the parent directory by plain
# string concatenation; the range file shares the model's base name.
clusterModelSuffix = paramsDict['Cluster Configuration']['model']
clusterSVMModel = svmutil.svm_load_model(parent + clusterModelSuffix)
clusterRangeSuffix = os.path.splitext(clusterModelSuffix)[0] + '.range'
clusterSVMRanges = svmutil.load_ranges(parent + clusterRangeSuffix)

precMassClusters = Analytics.findSamePrecMassClusters(dtaList, ppm=options.ppmstd)
clusterCutOff = float(paramsDict['Cluster Configuration']['cutoff'])
samePeptideClusters = Analytics.getSamePeptideClusters(
    precMassClusters, scanFDict, clusterSVMModel, clusterSVMRanges,
    ppmSTD=options.ppmstd, cutOff=clusterCutOff)

# To test without any clustering:
# samePeptideClusters = [[scanF] for scanF in scanFDict]
# Set-up for CID/ETD pair sequencing: output header, probability networks,
# scan index, penalty functions, terminal-modification tables and scan pairs.
if options.output:
    # The handle stays open after this block; only the header row is
    # written here.
    outFile = open(options.output, "w")
    outFile.write("\t".join([col.upper() for col in cols]) + "\n")

t1 = time.time()
print("Configuring LADS for sequencing...")

# One probability network per fragmentation model named in the parameters.
ETDPNet = PN.ProbNetwork(paramsDict["Models"]["etd"]["config"], paramsDict["Models"]["etd"]["model"])
HCDPNet = PN.ProbNetwork(paramsDict["Models"]["hcd"]["config"], paramsDict["Models"]["hcd"]["model"])

dtaList = glob.glob(options.dtadir + "/*.dta")
scanFDict = getScanFDict(dtaList)

aas = Constants.addPepsToAADict(options.minedge)
hashedAAs = Constants.hashAAsEpsilonRange(aas, epStep, maxEp)

ambigOpenPenalty = 0
ambigPenaltyFun = DNS.getAmbigEdgePenaltyFunction(options.minedge, ambigOpenPenalty, options.ambigpenalty)
ppmPenaltyFun = DNS.getPPMPenaltyFun(
    options.ppmstd, hashedAAs, options.minedge, options.ppmpenalty, options.ppmsyserror, epStep
)

# Materialise the dict values as lists before handing them to numpy: a dict
# view would become a 0-d object array instead of an array of the values.
addEnds = DNS.getSpectrumGraphEndpointInitFunction(
    np.array(list(Constants.NTermMods.values())),
    np.array(list(Constants.CTermMods.values())),
    paramsDict["Enzyme"]["specificity"],
)
# Deep copies are passed so the tables in Constants are not shared with the
# hash builder.
termModHash = Constants.createTermModHashAAs(
    N=copy.deepcopy(Constants.NTermMods), C=copy.deepcopy(Constants.CTermMods)
)

print("Getting Pairs...")
pairs = getCIDETDPairs(scanFDict)
(paired, unpaired) = Analytics.getPairedAndUnpairedSpectra(options.dtaDir, dtaList, delta=(options.Nmod + options.Cmod), ppm=options.ppm, cutOff=options.pairCutoff) if options.verbose: t2 = time.time() print 'Finished getting paired spectra. Time taken: ', t2 - t1 print 'Starting Sequencing' aas = Constants.addPepsToAADict(options.minEdge) for pair in paired: (lightSpec, heavySpec) = pair[1:] if options.verbose: print 'Now sequencing %s %s with shared peaks ratio %f' % (lightSpec, heavySpec, pair[0]) s1 = time.time() heavyPath = heavySpec lightPath = lightSpec sharedInfo = DNS.getPairedSpectraInfoForSequencing(lightPath, heavyPath, options.verbose) DNS.sequencePairedSpectra(sharedInfo['NInd'], sharedInfo['CInd'], sharedInfo['lightPairs'], sharedInfo['heavyPairs'], sharedInfo['lightPrecMass'] - Constants.mods['H+'] - Constants.mods['H2O'], PNet, alpha=options.alpha, unknownPenalty=options.ambigEdgePenalty, maxEdge=options.maxEdge, minEdge=options.minEdge, Nmod=options.Nmod, Cmod=options.Cmod, aas=aas, verbose=options.verbose) if options.verbose: print 'Time taken:', time.time() - s1 for spec in unpaired: if options.verbose: print 'Now sequencing unpaired spectrum %s' % spec s1 = time.time() precMass = DataFile.getPrecMassAndCharge(spec)[0] pairs = DataFile.getMassIntPairs(spec) DNS.sequenceSingleSpectrum(pairs, precMass - Constants.mods['H+'] - Constants.mods['H2O'], PNet, alpha=options.alpha, unknownPenalty=options.ambigEdgePenalty, maxEdge=options.maxEdge, minEdge=options.minEdge, aas=aas, verbose=options.verbose) if options.verbose: