Ejemplo n.º 1
0
 def sequenceDTAs(self):
     """Sequence the scan (pair) selected in the listbox and display the results.

     Builds the spectrum graph for the selected entry (paired mode when the
     entry's 'heavy' scan is not 'N/A'), extracts candidate paths and fills
     the first ``self._numseq`` result rows with the candidate sequences in
     order of descending score. Rows for which the lookup raises IndexError
     (e.g. fewer candidates than rows) are cleared. The outcome and the time
     spent sequencing are reported through ``self._seqStatus``.
     """
     curPairedScanData = self._indexedPairData[int(self._pairedScanListbox.curselection()[0])]
     t1 = time.time()
     isPaired = curPairedScanData['heavy'] != 'N/A'
     if isPaired:
         pairConfigName = curPairedScanData['pair configuration']
         pairConfig = self._paramsDict['Pair Configurations'][pairConfigName]
         # Heavy sequence map: same as the light one but with the terminal
         # mod symbols of the selected pair configuration.
         heavySeqMap = copy.deepcopy(self._seqMap)
         heavySeqMap['Mods']['N-Term'] = pairConfig['NModSymbol']
         heavySeqMap['Mods']['C-Term'] = pairConfig['CModSymbol']
         sharedInfo, starts, ends, deltas, termModHash, specs, G = DNS.initializeSpectrumGraph(
             self._pnet, self._paramsDict, self._scanFDict[curPairedScanData['light']]['dta'],
             heavyPath=self._scanFDict[curPairedScanData['heavy']]['dta'], ppm=self._ppm,
             usePaired=True, pairConfigName=pairConfigName, verbose=False)
         precMass = sharedInfo['lightPrecMass']
     else:
         sharedInfo, starts, ends, deltas, termModHash, specs, G = DNS.initializeSpectrumGraph(
             self._pnet, self._paramsDict, self._scanFDict[curPairedScanData['light']]['dta'],
             ppm=self._ppm, verbose=False)
         precMass = sharedInfo['precMass']

     epsilon = self._ppm * precMass * 10 ** -6
     # Precursor mass with the proton and water masses removed; used both for
     # path finding and for turning node paths back into sequences.
     peptideMass = precMass - Constants.mods['H+'] - Constants.mods['H2O']
     paths, subG = DNS.getSpectrumGraphPaths(
         G, deltas, specs, starts, ends, peptideMass, termModHash=termModHash,
         unknownPenalty=self._ambigpenalty, maxEdge=self._maxedge, minEdge=self._minedge,
         subGraphCut=self._subgraphcut, subAlpha=0.3, alpha=self._alpha, epsilon=epsilon,
         aas=self._aas, verbose=False)
     seqTime = time.time() - t1
     if not paths:
         self._seqStatus.set('ERROR: No Sequences Found! Time wasted: %f seconds' % seqTime)
         return

     # Each path is indexed as (score, nodes, ...).
     seqs = [DNS.getSequenceFromNodes(subG, path[1], peptideMass, termModHash) for path in paths]
     # Explicit comprehension instead of zip(*paths)[0]: zip objects cannot be
     # subscripted on Python 3.
     scores = [path[0] for path in paths]
     Ord = np.argsort(-1 * np.array(scores))

     # NOTE(review): ambigEdges is shared by all candidates, so edges collected
     # for higher-ranked sequences are still in the list when later ones are
     # processed -- confirm this is what An.preprocessSequence expects.
     ambigEdges = []
     for j in range(self._numseq):
         try:
             rankedSeq = seqs[Ord[j]]
             # Tuple entries are ambiguous edges: remember them and put a '-'
             # placeholder into the sequence.
             for i, residue in enumerate(rankedSeq):
                 if isinstance(residue, tuple):
                     ambigEdges.append(residue)
                     rankedSeq[i] = '-'

             curSeq = ''.join(rankedSeq)
             curSeq = An.preprocessSequence(curSeq, self._seqMap, ambigEdges=ambigEdges)
             if j == 0 and isPaired:
                 # Fallback so the error message below can always be formatted,
                 # even when the heavy conversion itself raises the KeyError.
                 curHeavySeq = curSeq
                 try:
                     curHeavySeq = An.preprocessSequence(curSeq, heavySeqMap, replaceExistingTerminalMods=True, ambigEdges=ambigEdges)
                     # Called only for validation; a KeyError marks an invalid sequence.
                     An.getAllAAs(curHeavySeq, ambigEdges=ambigEdges)
                     self._seqStatus.set('Paired Sequencing Successful! Heavy Sequence: %s. Time taken: %f seconds' % (curHeavySeq, seqTime))
                 except KeyError:
                     self._seqStatus.set('ERROR: Heavy Sequence %s is not a valid sequence! Time wasted: %f seconds' % (curHeavySeq, seqTime))
             elif j == 0:
                 self._seqStatus.set('Unpaired Sequencing Successful! Time taken: %f seconds' % (seqTime))

             # Replace whatever was displayed in this row before.
             for labelInst in self._seqScoreData[j]['seq'].children.values():
                 labelInst.destroy()
             self.displayConfColoredSequence(subG, self._seqScoreData[j]['seq'], paths[Ord[j]][1], curSeq, ambigEdges=ambigEdges)
             self._seqScoreData[j]['score'].set(str(scores[Ord[j]]))
         except IndexError:
             # No candidate for this row: clear it.
             for labelInst in self._seqScoreData[j]['seq'].children.values():
                 labelInst.destroy()
             self._seqScoreData[j]['score'].set('')
Ejemplo n.º 2
0
def validateHeavySequence(seq, heavySeqMap, ambigEdges):
    """Return True when `seq` can be converted to a heavy sequence and parsed.

    The placeholder sequence '-' is never valid. Otherwise the sequence is
    run through Analytics.preprocessSequence (replacing existing terminal
    mods) and Analytics.getAllAAs; a KeyError from either call marks the
    sequence as invalid.
    """
    if seq == '-':
        return False

    try:
        converted = Analytics.preprocessSequence(
            seq, heavySeqMap, replaceExistingTerminalMods=True, ambigEdges=ambigEdges)
        # Result is not needed; the call is made only to see whether it raises.
        Analytics.getAllAAs(converted, ambigEdges=ambigEdges)
    except KeyError:
        return False
    return True
def getSpectrumAndPSMFeatureDict(LADSSeqInfo, seqEntry, scanFDict, pairConfig, PNet):
    """Build one feature list per (scan, PSM) combination of a light/heavy cluster.

    Args:
        LADSSeqInfo: mapping from a cluster key to its list of PSMs. Each PSM
            is indexed as (LADS PScore, sequence, ambiguous edges); its first
            two items are used as the PSM key.
        seqEntry: cluster key; seqEntry[0] holds the light scans and
            seqEntry[1] the heavy scans.
        scanFDict: per-scan data providing the 'dta' and 'precMass' entries.
        pairConfig: pair configuration providing 'NMod' and 'CMod'.
        PNet: passed through to PN.Spectrum and the Discriminator/GLFD helpers.

    Returns:
        dict mapping (scan, PSM[:2]) to the feature list of that spectrum/PSM
        combination (cluster features, then PSM features, then spectrum features).

    Note:
        Reads the module-level names `options` (for `ppmstd`) and `seqMap`.
    """
    featureList = []
    lightScans = seqEntry[0]
    heavyScans = seqEntry[1]

    lightSpecs = [DataFile.getMassIntPairs(scanFDict[int(lightScanF)]['dta']) for lightScanF in lightScans]
    heavySpecs = [DataFile.getMassIntPairs(scanFDict[int(heavyScanF)]['dta']) for heavyScanF in heavyScans]
    avgLightPrecMass = np.average(np.array([scanFDict[lightScanF]['precMass'] for lightScanF in lightScans]))

    epSTD = options.ppmstd * 10**-6 * avgLightPrecMass

    # Light spectra first, heavy spectra after them; the per-spectrum loop
    # below relies on this ordering when it indexes `specs`.
    specs = []
    for lightScanF, massIntPairs in zip(lightScans, lightSpecs):
        specs.append(PN.Spectrum(PNet, scanFDict[lightScanF]['precMass'], Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs))
    for heavyScanF, massIntPairs in zip(heavyScans, heavySpecs):
        specs.append(PN.Spectrum(PNet, scanFDict[heavyScanF]['precMass'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs))
    for spec in specs:
        spec.initializeNoiseModel()

    clusterPairingStats = Discriminator.getClusterPairingStats(lightSpecs, heavySpecs, avgLightPrecMass, pairConfig, epSTD=epSTD)
    GLFD.addClusterPairingStatsToFeatureList(clusterPairingStats, featureList)

    clusterPSMs = LADSSeqInfo[seqEntry]

    scoreStats = {}
    truePMs = {}
    prmLadders = {}
    for PSM in clusterPSMs:
        psmKey = PSM[:2]
        lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2])
        scoreStats[psmKey] = Discriminator.getScoreStats(specs, lightSeq, ambigEdges=PSM[2])

        prmLadderWithEnds = An.getPRMLadder(lightSeq, ambigEdges=PSM[2], addEnds=True)
        truePMs[psmKey] = prmLadderWithEnds[-1]
        prmLadders[psmKey] = prmLadderWithEnds[1:-1]

    # Materialise the keys so the PSM list stays indexable on Python 3, where
    # dict.keys() returns a view rather than a list.
    PSMList = list(scoreStats.keys())
    spectrumOrderedScoreStats, clusterScoreStats = GLFD.compileScoreStats(scoreStats, specs, PSMList)

    spectrumAndPSMSpecificFeatureDict = {}

    PSMIndexDict = {psmKey: psmIndex for psmIndex, psmKey in enumerate(PSMList)}

    # (scan, index into specs / spectrumOrderedScoreStats, is heavy scan)
    scanSlots = [(scan, specIndex, False) for specIndex, scan in enumerate(lightScans)]
    scanSlots += [(scan, specIndex + len(lightScans), True) for specIndex, scan in enumerate(heavyScans)]

    for psmRank, PSM in enumerate(clusterPSMs):
        psmKey = PSM[:2]
        psmIndex = PSMIndexDict[psmKey]
        psmStats = scoreStats[psmKey]
        prmLadder = prmLadders[psmKey]

        PSMSpecificFeatureList = copy.copy(featureList)

        peptLength = len(prmLadder) + 1

        # Add LADS PScore (and normalized variants)  and delta rank, delta score (LADS PScore) to feature list
        PSMSpecificFeatureList += [PSM[0], PSM[0]/peptLength, PSM[0]/len(specs), -psmRank, PSM[0]-clusterPSMs[0][0]]
        # Add Total Path Score (and normalized variants) and delta rank, delta score (total path score)  and total minimum node score to feature list
        totalPathScore = psmStats['Total Path Score']
        PSMSpecificFeatureList += [totalPathScore, totalPathScore/peptLength, totalPathScore/len(specs), -clusterScoreStats['PSM Rankings'][psmIndex], totalPathScore-clusterScoreStats['Max Cluster Path Score'], psmStats['Total Minimum Node Score']]

        # Add minimum path score, maximum path score, (and normalized variants) and minimum score/maximum score for cluster to feature list
        minPathScore = psmStats['Minimum Path Score']
        maxPathScore = psmStats['Maximum Path Score']
        PSMSpecificFeatureList += [minPathScore, minPathScore/peptLength, maxPathScore, maxPathScore/peptLength, minPathScore/maxPathScore]

        # Add difference between minimum and maximum ranking for PSM across cluster to feature list
        rankingsForPSM = [spectrumOrderedScoreStats[specKey]['PSM Rankings'][psmIndex] for specKey in spectrumOrderedScoreStats]
        PSMSpecificFeatureList += [min(rankingsForPSM) - max(rankingsForPSM)]

        #Add Number forbidden node pairs (and normalized variants) to feature list
        numForbiddenPairs = Discriminator.getNumForbiddenPairs(prmLadder, avgLightPrecMass)
        PSMSpecificFeatureList += [numForbiddenPairs, 2.0*numForbiddenPairs/(peptLength-1)]

        # Add number of ambiguous edges to feature list
        PSMSpecificFeatureList += [len(PSM[2])]

        # Add stats for PRM Evidence over cluster (and normalized variants) to feature list
        prmEvidence = psmStats['Aggregate PRM Score Statistics']
        PSMSpecificFeatureList += [prmEvidence['All Evidence'], prmEvidence['All Evidence']/float(peptLength-1), prmEvidence['Majority Evidence'], prmEvidence['Majority Evidence']/float(peptLength-1), prmEvidence['None Evidence'], prmEvidence['None Evidence']/float(peptLength-1)]

        # Add stats for paired PRMs and their corresponding ion types to feature list
        pairedPRMStats = Discriminator.getPairedPRMStats(prmLadder, clusterPairingStats['Light Merged Spec'], clusterPairingStats['Heavy Merged Spec'], lightSpecs, heavySpecs, clusterPairingStats['Cluster Paired PRM Information'], epSTD=epSTD)
        GLFD.addPairedPRMStatsToFeatureList(pairedPRMStats, PSMSpecificFeatureList, len(prmLadder))

        pairedPRMLadder = pairedPRMStats['Paired PRM Ladder']

        for scan, specIndex, isHeavy in scanSlots:
            specStats = spectrumOrderedScoreStats[specIndex]

            spectrumSpecificFeatureList = copy.copy(PSMSpecificFeatureList)
            # Add path score (and normalized variants), delta rank, delta score, number of negative PRMs, and minimum node score for spectrum to feature list
            pathScore = specStats['Path Scores'][psmIndex]
            numNegativePRMs = specStats['Num Negative PRMs'][psmIndex]
            spectrumSpecificFeatureList += [pathScore, pathScore/peptLength, pathScore/maxPathScore, -specStats['PSM Rankings'][psmIndex], specStats['Delta Scores'][psmIndex], numNegativePRMs, numNegativePRMs/float(peptLength-1), specStats['Min Node Scores'][psmIndex]]

            # Add mass deviation from true peptide mass to feature list
            # (heavy scans additionally carry the pair configuration's terminal mods)
            precMass = scanFDict[scan]['precMass']
            expectedMass = truePMs[psmKey]
            if isHeavy:
                expectedMass = expectedMass + pairConfig['NMod'] + pairConfig['CMod']
            expectedMass = expectedMass + Constants.mods['H2O'] + Constants.mods['H+']
            spectrumSpecificFeatureList += [abs(expectedMass - precMass)]

            peakAnnotationMassOffsetStats = Discriminator.getPeakAnnotationAndMassOffsetStats(DataFile.getMassIntPairs(scanFDict[scan]['dta']), specs[specIndex], prmLadder, pairedPRMLadder, PNet)
            GLFD.addPeakAnnotationStatsToFeatureList(PNet, peakAnnotationMassOffsetStats, spectrumSpecificFeatureList, peptLength)
            GLFD.addMassOffsetStatsToFeatureList(peakAnnotationMassOffsetStats, spectrumSpecificFeatureList)

            spectrumSpecificFeatureList += [precMass, GLFD.getChargeStateFromDTAFName(scanFDict[scan]['dta']), peptLength]
            spectrumAndPSMSpecificFeatureDict[(scan, psmKey)] = spectrumSpecificFeatureList

    return spectrumAndPSMSpecificFeatureDict
    outFile.write('\t'.join([col for col in cols]) + '\n')

    for seqEntry in LADSSeqInfo:
        lightScans = seqEntry[0]
        heavyScans = seqEntry[1]

        scanScoreDict = getScanScoreDictSVM(LADSSeqInfo, seqEntry, scanFDict, svmModel, svmRange, pairConfigurations[pairConfigName], PNet, desired_feats = desired_feats)
        
#        scanScoreDict = getScanScoreDictRankBoost(LADSSeqInfo, seqEntry, scanFDict, rankModel, pairConfigurations['lightdimethyl_heavydimethyl'], PNet)
#        scanScoreDict = getScanScoreDictClusterNormScore(LADSSeqInfo, seqEntry)

        for i, scan in enumerate(lightScans):

            scanData = {'ScanF': scan}
                        
            lightSeq = An.preprocessSequence(scanScoreDict[scan]['Seq'][0], seqMap, ambigEdges=scanScoreDict[scan]['Seq'][1])
            scanData['LADS Sequence'] = lightSeq
            scanData['LADS Ambig Edges'] = scanScoreDict[scan]['Seq'][1]
            scanData['LADS Raw Score'] = scanScoreDict[scan]['Raw Score']
            scanData['LADS Post Score'] = scanScoreDict[scan]['Post Score']
            scanData['M+H'] = scanFDict[scan]['precMass']

            try:
                comp = An.comparePeptideResults(lightSeq, SEQUESTMASCOTResults[scan]['Peptide'], ambigEdges1=scanScoreDict[scan]['Seq'][1], ambigEdges2=[], ppm=20)            
                scanData['SEQUEST XCorr'] = SEQUESTMASCOTResults[scan]['SEQUEST XCorr']
                scanData['MASCOT Ion Score'] = SEQUESTMASCOTResults[scan]['MASCOT Ion Score']
                scanData['SEQUEST MASCOT Sequence'] = SEQUESTMASCOTResults[scan]['Peptide']
                scanData['Accuracy'] = comp[0]
                scanData['Precision'] = comp[1]
            except KeyError:
                scanData['SEQUEST XCorr'] = None
            specs = []
            for i, massIntPairs in enumerate(lightSpecs):
                specs += [PN.Spectrum(PNet, scanFDict[lightScans[i]]['precMass'], Nmod=0.0, Cmod=0.0, epsilon=2*epSTD, spectrum=massIntPairs)]
            for i, massIntPairs in enumerate(heavySpecs):
                specs += [PN.Spectrum(PNet, scanFDict[heavyScans[i]]['precMass'], Nmod=pairConfig['NMod'], Cmod=pairConfig['CMod'], epsilon=2*epSTD, spectrum=massIntPairs)]
            for spec in specs:
                spec.initializeNoiseModel()

            clusterPairingStats = Discriminator.getClusterPairingStats(lightSpecs, heavySpecs, avgLightPrecMass, pairConfig, epSTD=epSTD)
            addClusterPairingStatsToFeatureList(clusterPairingStats, featureList)
            
            scoreStats = {}
            truePMs = {}
            prmLadders = {}
            for PSM in LADSSeqInfo[seqEntry]:
                lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2])
                scoreStats[PSM[:2]] = Discriminator.getScoreStats(specs, lightSeq, ambigEdges=PSM[2])

                prmLadderWithEnds = An.getPRMLadder(lightSeq, ambigEdges=PSM[2], addEnds=True)
                truePMs[PSM[:2]] = prmLadderWithEnds[-1]
                prmLadders[PSM[:2]] = prmLadderWithEnds[1:-1]
            
            PSMList = scoreStats.keys()
            spectrumOrderedScoreStats, clusterScoreStats = compileScoreStats(scoreStats, specs, PSMList)

            PSMIndexDict = dict([(PSM, i) for i, PSM in enumerate(PSMList)])
            for i, PSM in enumerate(LADSSeqInfo[seqEntry]):
                PSMSpecificFeatureList = copy.copy(featureList)
                lightSeq = An.preprocessSequence(PSM[1], seqMap, ambigEdges=PSM[2])
                heavySeq = An.preprocessSequence(PSM[1], heavySeqMaps['silac_light_heavy'], replaceExistingTerminalMods=True, ambigEdges=PSM[2])