def sequenceDTAs(self):
    """Sequence the currently selected (paired) scan and display the results in the GUI.

    Reads the selection from self._pairedScanListbox, builds a spectrum graph via
    DNS.initializeSpectrumGraph (paired light/heavy mode when the entry has a heavy
    scan, unpaired otherwise), enumerates candidate paths with
    DNS.getSpectrumGraphPaths, and writes the top self._numseq sequences and scores
    into the self._seqScoreData widgets. Status/progress text goes to
    self._seqStatus. No return value; all effects are on GUI state.
    """
    curPairedScanData = self._indexedPairData[int(self._pairedScanListbox.curselection()[0])]
    t1 = time.time()
    if curPairedScanData['heavy'] != 'N/A':
        # Paired mode: clone the sequence map and install the N-/C-terminal
        # modification symbols for this pair configuration before graph setup.
        heavySeqMap = copy.deepcopy(self._seqMap)
        heavySeqMap['Mods']['N-Term'] = self._paramsDict['Pair Configurations'][curPairedScanData['pair configuration']]['NModSymbol']
        heavySeqMap['Mods']['C-Term'] = self._paramsDict['Pair Configurations'][curPairedScanData['pair configuration']]['CModSymbol']
        sharedInfo, starts, ends, deltas, termModHash, specs, G = DNS.initializeSpectrumGraph(self._pnet, self._paramsDict, self._scanFDict[curPairedScanData['light']]['dta'], heavyPath=self._scanFDict[curPairedScanData['heavy']]['dta'], ppm=self._ppm, usePaired=True, pairConfigName=curPairedScanData['pair configuration'], verbose=False)
        precMass = sharedInfo['lightPrecMass']
    else:
        # Unpaired mode: only the light scan's dta is used.
        sharedInfo, starts, ends, deltas, termModHash, specs, G = DNS.initializeSpectrumGraph(self._pnet, self._paramsDict, self._scanFDict[curPairedScanData['light']]['dta'], ppm=self._ppm, verbose=False)
        precMass = sharedInfo['precMass']
    # Mass tolerance scaled from ppm to absolute Daltons at this precursor mass.
    epsilon = self._ppm * precMass * 10 ** -6
    paths, subG = DNS.getSpectrumGraphPaths(G, deltas, specs, starts, ends, precMass - Constants.mods['H+'] - Constants.mods['H2O'], termModHash=termModHash, unknownPenalty=self._ambigpenalty, maxEdge=self._maxedge, minEdge=self._minedge, subGraphCut=self._subgraphcut, subAlpha=0.3, alpha=self._alpha, epsilon=epsilon, aas=self._aas, verbose=False)
    seqTime = time.time() - t1
    if paths:
        seqs = []
        for path in paths:
            # path[1] is the node sequence of the path (path[0] is its score).
            seqs.extend([DNS.getSequenceFromNodes(subG, path[1], precMass - Constants.mods['H+'] - Constants.mods['H2O'], termModHash)])
        # Python 2 idiom: zip(...) returns a list here, so [0] picks the score column.
        scores = list(zip(*paths)[0])
        # Indices of candidate paths in descending score order.
        Ord = np.argsort(-1 * np.array(scores))
        # NOTE(review): ambigEdges is shared across all displayed ranks (never
        # reset inside the j loop) — confirm that accumulation is intended.
        ambigEdges = []
        numAmbig = 0
        for j in range(self._numseq):
            try:
                # Replace tuple entries (ambiguous mass gaps) with '-' placeholders,
                # collecting the tuples as ambiguous edges.
                for i in range(len(seqs[Ord[j]])):
                    if type(seqs[Ord[j]][i]) == tuple:
                        ambigEdges.extend([seqs[Ord[j]][i]])
                        numAmbig += 1
                        seqs[Ord[j]][i] = '-'
                curSeq = ''.join(seqs[Ord[j]])
                curSeq = An.preprocessSequence(curSeq, self._seqMap, ambigEdges=ambigEdges)
                if j == 0 and curPairedScanData['heavy'] != 'N/A':
                    # For the top-ranked sequence in paired mode, also validate the
                    # heavy-labeled form of the sequence.
                    try:
                        curHeavySeq = An.preprocessSequence(curSeq, heavySeqMap, replaceExistingTerminalMods=True, ambigEdges=ambigEdges)
                        AAs = An.getAllAAs(curHeavySeq, ambigEdges=ambigEdges)
                        self._seqStatus.set('Paired Sequencing Successful! Heavy Sequence: %s. Time taken: %f seconds' % (curHeavySeq, seqTime))
                    except KeyError:
                        # NOTE(review): if preprocessSequence itself raised, curHeavySeq
                        # is unbound here and this line raises NameError — confirm.
                        self._seqStatus.set('ERROR: Heavy Sequence %s is not a valid sequence! Time wasted: %f seconds' % (curHeavySeq, seqTime))
                elif j == 0:
                    self._seqStatus.set('Unpaired Sequencing Successful! Time taken: %f seconds' % (seqTime))
                # Clear out previously rendered sequence labels before redrawing.
                for labelInst in self._seqScoreData[j]['seq'].children.values():
                    labelInst.destroy()
                self.displayConfColoredSequence(subG, self._seqScoreData[j]['seq'], paths[Ord[j]][1], curSeq, ambigEdges=ambigEdges)
                self._seqScoreData[j]['score'].set(str(scores[Ord[j]]))
            except IndexError:
                # Fewer candidates than display rows: blank out the remaining row.
                for labelInst in self._seqScoreData[j]['seq'].children.values():
                    labelInst.destroy()
                self._seqScoreData[j]['score'].set('')
    else:
        self._seqStatus.set('ERROR: No Sequences Found! Time wasted: %f seconds' % seqTime)
def validateHeavySequence(seq, heavySeqMap, ambigEdges):
    """Return True if *seq* remaps cleanly to a heavy-labeled sequence.

    The placeholder sequence '-' is never valid. Otherwise the sequence is
    remapped with the heavy terminal modifications and parsed into residues;
    a KeyError from either step (unknown residue or modification symbol)
    means the heavy form is invalid.
    """
    if seq == '-':
        return False
    try:
        heavySeq = Analytics.preprocessSequence(seq, heavySeqMap, replaceExistingTerminalMods=True, ambigEdges=ambigEdges)
        Analytics.getAllAAs(heavySeq, ambigEdges=ambigEdges)
    except KeyError:
        return False
    return True
def getSpectrumAndPSMFeatureDict(LADSSeqInfo, seqEntry, scanFDict, pairConfig, PNet):
    """Build the per-(spectrum, PSM) feature vectors for one light/heavy scan cluster.

    seqEntry is a (lightScans, heavyScans) pair keying LADSSeqInfo; scanFDict maps
    scan numbers to per-scan info ('dta' path, 'precMass'); pairConfig supplies the
    'NMod'/'CMod' heavy-label mass offsets; PNet is the probabilistic scoring network.

    Returns a dict mapping (scan, PSM[:2]) -> feature list, where features are
    appended in a fixed order: cluster-level pairing stats, then PSM-level score /
    ranking / ambiguity / PRM-evidence features, then spectrum-level path-score,
    mass-deviation, and peak-annotation features.

    NOTE(review): relies on module globals `options` (for options.ppmstd) and
    `seqMap` — confirm both are defined where this module is used.
    """
    featureList = []
    lightScans = seqEntry[0]
    heavyScans = seqEntry[1]
    # Raw (mass, intensity) arrays for every scan in the cluster.
    lightSpecs = [DataFile.getMassIntPairs(scanFDict[int(lightScanF)]['dta']) for lightScanF in lightScans]
    heavySpecs = [DataFile.getMassIntPairs(scanFDict[int(heavyScanF)]['dta']) for heavyScanF in heavyScans]
    avgLightPrecMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in lightScans]))
    # ppm tolerance converted to an absolute mass standard deviation.
    epSTD = options.ppmstd * 10**-6 * avgLightPrecMass
    # Scored Spectrum objects: light scans with no terminal mods, heavy scans with
    # the pair configuration's N-/C-terminal label masses. Order (light then heavy)
    # must match the i-indexing used against spectrumOrderedScoreStats below.
    specs = []
    for i, massIntPairs in enumerate(lightSpecs):
        specs += [PN.Spectrum(PNet, scanFDict[lightScans[i]]['precMass'], Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)]
    for i, massIntPairs in enumerate(heavySpecs):
        specs += [PN.Spectrum(PNet, scanFDict[heavyScans[i]]['precMass'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)]
    for spec in specs:
        spec.initializeNoiseModel()
    # Cluster-level features shared by every PSM and spectrum in this entry.
    clusterPairingStats = Discriminator.getClusterPairingStats(lightSpecs, heavySpecs, avgLightPrecMass, pairConfig, epSTD=epSTD)
    GLFD.addClusterPairingStatsToFeatureList(clusterPairingStats, featureList)
    # Per-PSM score stats and PRM ladders, keyed by PSM[:2] (rank/score identity).
    scoreStats = {}
    truePMs = {}
    prmLadders = {}
    for PSM in LADSSeqInfo[seqEntry]:
        lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2])
        scoreStats[PSM[:2]] = Discriminator.getScoreStats(specs, lightSeq, ambigEdges=PSM[2])
        prmLadderWithEnds = An.getPRMLadder(lightSeq, ambigEdges=PSM[2], addEnds=True)
        # Last ladder entry is the full peptide mass; interior entries are the PRMs.
        truePMs[PSM[:2]] = prmLadderWithEnds[-1]
        prmLadders[PSM[:2]] = prmLadderWithEnds[1:-1]
    PSMList = scoreStats.keys()
    spectrumOrderedScoreStats, clusterScoreStats = GLFD.compileScoreStats(scoreStats, specs, PSMList)
    spectrumAndPSMSpecificFeatureDict = {}
    PSMIndexDict = dict([(PSM, i) for i, PSM in enumerate(PSMList)])
    for i, PSM in enumerate(LADSSeqInfo[seqEntry]):
        PSMSpecificFeatureList = copy.copy(featureList)
        peptLength = len(prmLadders[PSM[:2]]) + 1
        # Add LADS PScore (and normalized variants) and delta rank, delta score (LADS PScore) to feature list
        PSMSpecificFeatureList += [PSM[0], PSM[0]/peptLength, PSM[0]/len(specs), -i, PSM[0]-LADSSeqInfo[seqEntry][0][0]]
        # Add Total Path Score (and normalized variants) and delta rank, delta score (total path score) and total minimum node score to feature list
        totalPathScore = scoreStats[PSM[:2]]['Total Path Score']
        PSMSpecificFeatureList += [totalPathScore, totalPathScore/peptLength, totalPathScore/len(specs), -clusterScoreStats['PSM Rankings'][PSMIndexDict[PSM[:2]]], totalPathScore-clusterScoreStats['Max Cluster Path Score'], scoreStats[PSM[:2]]['Total Minimum Node Score']]
        # Add minimum path score, maximum path score, (and normalized variants) and minimum score/maximum score for cluster to feature list
        PSMSpecificFeatureList += [scoreStats[PSM[:2]]['Minimum Path Score'], scoreStats[PSM[:2]]['Minimum Path Score']/peptLength, scoreStats[PSM[:2]]['Maximum Path Score'], scoreStats[PSM[:2]]['Maximum Path Score']/peptLength, scoreStats[PSM[:2]]['Minimum Path Score']/scoreStats[PSM[:2]]['Maximum Path Score']]
        # Add difference between minimum and maximum ranking for PSM across cluster to feature list
        # NOTE(review): this comprehension reuses loop variable `i`; harmless in
        # effect (the inner scan loops rebind i before its next use) but fragile.
        rankingsForPSM = [spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]] for i in spectrumOrderedScoreStats]
        PSMSpecificFeatureList += [min(rankingsForPSM) - max(rankingsForPSM)]
        #Add Number forbidden node pairs (and normalized variants) to feature list
        numForbiddenPairs = Discriminator.getNumForbiddenPairs(prmLadders[PSM[:2]], avgLightPrecMass)
        PSMSpecificFeatureList += [numForbiddenPairs, 2.0*numForbiddenPairs/(peptLength-1)]
        # Add number of ambiguous edges to feature list
        PSMSpecificFeatureList += [len(PSM[2])]
        # Add stats for PRM Evidence over cluster (and normalized variants) to feature list
        PSMSpecificFeatureList += [scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['All Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['All Evidence']/float(peptLength-1), scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['Majority Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['Majority Evidence']/float(peptLength-1), scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['None Evidence'], scoreStats[PSM[:2]]['Aggregate PRM Score Statistics']['None Evidence']/float(peptLength-1)]
        # Add stats for paired PRMs and their corresponding ion types to feature list
        pairedPRMStats = Discriminator.getPairedPRMStats(prmLadders[PSM[:2]], clusterPairingStats['Light Merged Spec'], clusterPairingStats['Heavy Merged Spec'], lightSpecs, heavySpecs, clusterPairingStats['Cluster Paired PRM Information'], epSTD=epSTD)
        GLFD.addPairedPRMStatsToFeatureList(pairedPRMStats, PSMSpecificFeatureList, len(prmLadders[PSM[:2]]))
        pairedPRMLadder = pairedPRMStats['Paired PRM Ladder']
        # Spectrum-level features for each light scan (specs index i matches scan order).
        for i, scan in enumerate(lightScans):
            spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList)
            # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list
            pathScore = spectrumOrderedScoreStats[i]['Path Scores'][PSMIndexDict[PSM[:2]]]
            numNegativePRMs = spectrumOrderedScoreStats[i]['Num Negative PRMs'][PSMIndexDict[PSM[:2]]]
            spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/scoreStats[PSM[:2]]['Maximum Path Score'], -spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]], spectrumOrderedScoreStats[i]['Delta Scores'][PSMIndexDict[PSM[:2]]], numNegativePRMs, numNegativePRMs/float(peptLength-1), spectrumOrderedScoreStats[i]['Min Node Scores'][PSMIndexDict[PSM[:2]]]]
            # Add mass deviation from true peptide mass to feature list
            precMass = scanFDict[scan]['precMass']
            spectrumSpecificFeatureList += [abs(truePMs[PSM[:2]] + Constants.mods['H2O'] + Constants.mods['H+'] - precMass)]
            peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[i], prmLadders[PSM[:2]], pairedPRMLadder, PNet)
            GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength)
            GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList)
            spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength]
            spectrumAndPSMSpecificFeatureDict[(scan, PSM[:2])] = spectrumSpecificFeatureList
        # Spectrum-level features for each heavy scan; specs/spectrumOrderedScoreStats
        # index is offset by the number of light scans.
        for j, scan in enumerate(heavyScans):
            i = j + len(lightScans)
            spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList)
            # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list
            pathScore = spectrumOrderedScoreStats[i]['Path Scores'][PSMIndexDict[PSM[:2]]]
            numNegativePRMs = spectrumOrderedScoreStats[i]['Num Negative PRMs'][PSMIndexDict[PSM[:2]]]
            spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/scoreStats[PSM[:2]]['Maximum Path Score'], -spectrumOrderedScoreStats[i]['PSM Rankings'][PSMIndexDict[PSM[:2]]], spectrumOrderedScoreStats[i]['Delta Scores'][PSMIndexDict[PSM[:2]]], numNegativePRMs, numNegativePRMs/float(peptLength-1), spectrumOrderedScoreStats[i]['Min Node Scores'][PSMIndexDict[PSM[:2]]]]
            # Add mass deviation from true peptide mass to feature list
            # (heavy scans include the N- and C-terminal label masses).
            precMass = scanFDict[scan]['precMass']
            spectrumSpecificFeatureList += [abs(truePMs[PSM[:2]] + pairConfig['NMod'] + pairConfig['CMod'] + Constants.mods['H2O'] + Constants.mods['H+'] - precMass)]
            peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[i], prmLadders[PSM[:2]], pairedPRMLadder, PNet)
            GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength)
            GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList)
            spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength]
            spectrumAndPSMSpecificFeatureDict[(scan, PSM[:2])] = spectrumSpecificFeatureList
    return spectrumAndPSMSpecificFeatureDict
outFile.write('\t'.join([col for col in cols]) + '\n') for seqEntry in LADSSeqInfo: lightScans = seqEntry[0] heavyScans = seqEntry[1] scanScoreDict = getScanScoreDictSVM(LADSSeqInfo, seqEntry, scanFDict, svmModel, svmRange, pairConfigurations[pairConfigName], PNet, desired_feats = desired_feats) # scanScoreDict = getScanScoreDictRankBoost(LADSSeqInfo, seqEntry, scanFDict, rankModel, pairConfigurations['lightdimethyl_heavydimethyl'], PNet) # scanScoreDict = getScanScoreDictClusterNormScore(LADSSeqInfo, seqEntry) for i, scan in enumerate(lightScans): scanData = {'ScanF': scan} lightSeq = An.preprocessSequence(scanScoreDict[scan]['Seq'][0], seqMap, ambigEdges=scanScoreDict[scan]['Seq'][1]) scanData['LADS Sequence'] = lightSeq scanData['LADS Ambig Edges'] = scanScoreDict[scan]['Seq'][1] scanData['LADS Raw Score'] = scanScoreDict[scan]['Raw Score'] scanData['LADS Post Score'] = scanScoreDict[scan]['Post Score'] scanData['M+H'] = scanFDict[scan]['precMass'] try: comp = An.comparePeptideResults(lightSeq, SEQUESTMASCOTResults[scan]['Peptide'], ambigEdges1=scanScoreDict[scan]['Seq'][1], ambigEdges2=[], ppm=20) scanData['SEQUEST XCorr'] = SEQUESTMASCOTResults[scan]['SEQUEST XCorr'] scanData['MASCOT Ion Score'] = SEQUESTMASCOTResults[scan]['MASCOT Ion Score'] scanData['SEQUEST MASCOT Sequence'] = SEQUESTMASCOTResults[scan]['Peptide'] scanData['Accuracy'] = comp[0] scanData['Precision'] = comp[1] except KeyError: scanData['SEQUEST XCorr'] = None
specs = [] for i, massIntPairs in enumerate(lightSpecs): specs += [PN.Spectrum(PNet, scanFDict[lightScans[i]]['precMass'], Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)] for i, massIntPairs in enumerate(heavySpecs): specs += [PN.Spectrum(PNet, scanFDict[heavyScans[i]]['precMass'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)] for spec in specs: spec.initializeNoiseModel() clusterPairingStats = Discriminator.getClusterPairingStats(lightSpecs, heavySpecs, avgLightPrecMass, pairConfig, epSTD=epSTD) addClusterPairingStatsToFeatureList(clusterPairingStats, featureList) scoreStats = {} truePMs = {} prmLadders = {} for PSM in LADSSeqInfo[seqEntry]: lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2]) scoreStats[PSM[:2]] = Discriminator.getScoreStats(specs, lightSeq, ambigEdges=PSM[2]) prmLadderWithEnds = An.getPRMLadder(lightSeq, ambigEdges=PSM[2], addEnds=True) truePMs[PSM[:2]] = prmLadderWithEnds[-1] prmLadders[PSM[:2]] = prmLadderWithEnds[1:-1] PSMList = scoreStats.keys() spectrumOrderedScoreStats, clusterScoreStats = compileScoreStats(scoreStats, specs, PSMList) PSMIndexDict = dict([(PSM, i) for i, PSM in enumerate(PSMList)]) for i, PSM in enumerate(LADSSeqInfo[seqEntry]): PSMSpecificFeatureList = copy.copy(featureList) lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2]) heavySeq = An.preprocessSequence(PSM[1], heavySeqMaps['silac_light_heavy'], replaceExistingTerminalMods=True, ambigEdges=PSM[2])