def cal_nnls(LibIntensity, MS2Intensity, penalty):
    """Fit a single library spectrum to an observed spectrum with NNLS.

    The library intensities become the only column of a sparse matrix that
    has one extra trailing row containing ``penalty``; the observed
    intensities get a matching trailing ``0``.  The non-negative least
    squares problem is then solved with ``sparse_nnls.lsqnonneg``.

    Args:
        LibIntensity: sequence of library intensities.  Not modified.
        MS2Intensity: sequence of observed intensities.  Not modified.
            Expected to have the same length as ``LibIntensity`` (the matrix
            is sized from one and the data vector from the other).
        penalty: value stored in the extra row of the library column.

    Returns:
        The first element of the ``'x'`` entry of the solver result, i.e. the
        coefficient of the single library column.
    """
    # Build the augmented vectors from copies: appending to the arguments
    # themselves would leak the penalty/zero into the caller's lists and make
    # them grow on every repeated call.
    AugmentedLibrary = list(LibIntensity) + [penalty]
    AugmentedMS2 = np.array(list(MS2Intensity) + [0])

    NumRows = len(AugmentedLibrary)
    RowIndex = list(range(NumRows))
    ColIndex = [0] * NumRows  # everything lives in column 0

    LibraryVector = sparse.coo_matrix((AugmentedLibrary, (RowIndex, ColIndex)))
    Solution = sparse_nnls.lsqnonneg(LibraryVector, AugmentedMS2,
                                     {'show_progress': False})
    return Solution['x'][0]
def RegressSpectraOntoLibrary(DIASpectraIterator, Library, tol, maxWindowOffset):
    """Regress each DIA spectrum onto the library spectra in its precursor window.

    For every scan the generator
      1. picks the library spectra whose precursor m/z lies in the scan's
         window,
      2. merges DIA peaks whose m/z lie within the relative tolerance ``tol``
         of each other,
      3. keeps the library spectra having more than 5 of their (up to) 10 most
         intense peaks inside a tolerance window of some DIA peak,
      4. solves a non-negative least squares problem
         (``sparse_nnls.lsqnonneg``) whose columns are the normalised library
         spectra restricted to the matched DIA peaks, plus one extra row
         holding each library spectrum's unmatched intensity (regressed
         against 0) as a penalty.

    Args:
        DIASpectraIterator: iterable of scans, each indexable as
            ``(peaks, precursor m/z, retention time, scan index, window width)``.
            ``peaks`` must convert to a 2-D array with m/z in column 0 and
            intensity in column 1; anything that is not 2-D is reported as
            "no match".  The merge step presumably relies on m/z being sorted
            in ascending order -- TODO confirm with the caller.
        Library: object whose ``value`` attribute maps a key (indexable with
            ``[0]`` and ``[1]``) to a dict with the entries ``'PrecursorMZ'``
            and ``'Spectrum'`` (2-D array, m/z in column 0, intensity in
            column 1).
        tol: relative m/z tolerance; a DIA peak at ``m`` matches the interval
            ``[m - tol * m, m + tol * m]``.
        maxWindowOffset: used only when the scan's window width is not
            positive; library spectra with precursor m/z above
            ``precursor m/z - maxWindowOffset / 2`` are then candidates.

    Yields:
        One list per scan.  Each element is
        ``[coefficient, scan index, key[0], key[1], precursor m/z, retention time]``
        for a library spectrum with a nonzero coefficient.  When nothing was
        fitted the list is ``[[0, scan index, 0, 0, 0, 0]]``.
    """
    RefSpectraLibrary = Library.value

    for DIASpectrum in DIASpectraIterator:
        precMZ = float(DIASpectrum[1])
        precRT = float(DIASpectrum[2])  # MS2 scan retention time, in minutes
        index = DIASpectrum[3]
        windowWidth = DIASpectrum[4]
        DIASpectrum = np.array(DIASpectrum[0])

        # Reset per scan so that a scan without a fit reports "no match"
        # instead of reusing values from the previous scan.
        LibraryCoeffs = []
        RefPeptideCandidates = []

        if len(DIASpectrum.shape) == 2:
            # LIBRARY SPECTRA WHOSE PRECURSOR LIES IN THIS SCAN'S WINDOW
            # A single pass collects keys and spectra together so the two
            # lists are aligned by construction.  ``items()`` works on both
            # Python 2 and Python 3 dictionaries.
            MassWindowCandidates = []
            CandidateRefSpectraLibrary = []
            for key, spectrum in RefSpectraLibrary.items():
                libraryMZ = float(spectrum['PrecursorMZ'])
                if windowWidth > 0:
                    inWindow = abs(libraryMZ - precMZ) < windowWidth / 2
                else:
                    inWindow = libraryMZ > precMZ - maxWindowOffset / 2
                if inWindow:
                    MassWindowCandidates.append(key)
                    CandidateRefSpectraLibrary.append(spectrum['Spectrum'])

            # MERGING OF POINTS IN ACQUIRED SPECTRUM WITH NEARBY M/Z COORDINATES
            # Every peak is assigned to the first peak whose upper tolerance
            # bound reaches it; a group keeps that first peak's m/z and the
            # mean intensity of its members.
            acquiredMZ = DIASpectrum[:, 0]
            MergedDIASpecCoordIndices = np.searchsorted(
                acquiredMZ + tol * acquiredMZ, acquiredMZ)
            MergedGroups = np.unique(MergedDIASpecCoordIndices)
            MergedDIASpecCoords = DIASpectrum[MergedGroups, 0]
            MergedDIASpecIntensities = [
                np.mean(DIASpectrum[MergedDIASpecCoordIndices == group, 1])
                for group in MergedGroups
            ]
            DIASpectrum = np.column_stack(
                (MergedDIASpecCoords, MergedDIASpecIntensities))

            # FILTER LIBRARY SPECTRA BY THE CONDITION THAT MORE THAN 5 OF THEIR
            # 10 MOST INTENSE PEAKS BELONG TO THE DIA SPECTRUM
            # The sorted lower/upper tolerance bounds alternate, so an m/z
            # whose insertion position is odd lies inside the window of DIA
            # peak number (position - 1) / 2.
            mergedMZ = DIASpectrum[:, 0]
            CentroidBreaks = np.sort(np.concatenate(
                (mergedMZ - tol * mergedMZ, mergedMZ + tol * mergedMZ)))

            RefPeptideCandidatesLocations = []
            RefPeptideCandidateList = []
            for key, M in zip(MassWindowCandidates, CandidateRefSpectraLibrary):
                locations = np.searchsorted(CentroidBreaks, M[:, 0])
                topTenLocations = locations[np.argsort(-M[:, 1])[:10]]
                if np.count_nonzero(topTenLocations % 2 == 1) > 5:
                    RefPeptideCandidates.append(key)
                    RefPeptideCandidatesLocations.append(locations)
                    RefPeptideCandidateList.append(M)

            NormalizedRefPeptideCandidateList = [
                M[:, 1] / sum(M[:, 1]) for M in RefPeptideCandidateList
            ]

            # Per candidate: which of its peaks fall on a DIA peak.
            PeakInDIA = [
                locations % 2 == 1
                for locations in RefPeptideCandidatesLocations
            ]
            MatchedPeakCounts = [
                int(np.count_nonzero(mask)) for mask in PeakInDIA
            ]

            if sum(MatchedPeakCounts) > 0:
                NumCandidates = len(RefPeptideCandidates)

                # Row = index of the matched DIA peak (0-based), column =
                # candidate number, entry = normalised library intensity.
                # Floor division keeps the indices integral on Python 3 too.
                RefSpectraLibrarySparseRowIndices = np.concatenate([
                    (locations[mask] + 1) // 2 - 1
                    for locations, mask in zip(RefPeptideCandidatesLocations,
                                               PeakInDIA)
                ])
                RefSpectraLibrarySparseColumnIndices = np.repeat(
                    np.arange(NumCandidates), MatchedPeakCounts)
                RefSpectraLibrarySparseMatrixEntries = np.concatenate([
                    intensities[mask]
                    for intensities, mask in zip(
                        NormalizedRefPeptideCandidateList, PeakInDIA)
                ])

                # Project the spectrum to those m/z bins at which at least one
                # column of the coefficient matrix has a nonzero entry, then
                # add a zero to the end of the DIA data vector to penalize
                # peaks of library spectra not present in the DIA spectrum.
                UniqueRowIndices = np.unique(RefSpectraLibrarySparseRowIndices)
                DIASpectrumIntensities = np.append(
                    DIASpectrum[UniqueRowIndices, 1], [0])

                # AUGMENT THE LIBRARY MATRIX WITH TOTAL ION INTENSITIES OF
                # PEAKS OF LIBRARY SPECTRA THAT DON'T CORRESPOND TO PEAKS IN
                # DIA SPECTRUM
                PenaltyRowIndex = UniqueRowIndices[-1] + 1  # one past the max
                SparseMatrixEntriesForPeaksNotInDIA = np.array([
                    intensities[~mask].sum()
                    for intensities, mask in zip(
                        NormalizedRefPeptideCandidateList, PeakInDIA)
                ])

                SparseRowIndices = np.append(
                    RefSpectraLibrarySparseRowIndices,
                    [PenaltyRowIndex] * NumCandidates)
                SparseColumnIndices = np.append(
                    RefSpectraLibrarySparseColumnIndices,
                    np.arange(NumCandidates))
                SparseMatrixEntries = np.append(
                    RefSpectraLibrarySparseMatrixEntries,
                    SparseMatrixEntriesForPeaksNotInDIA)

                # Renumber the row indices according to the projected
                # spectrum, respecting the 0-indexing.
                SparseRowIndices = stats.rankdata(
                    SparseRowIndices, method='dense').astype(int) - 1

                LibrarySparseMatrix = sparse.coo_matrix(
                    (SparseMatrixEntries,
                     (SparseRowIndices, SparseColumnIndices)))
                LibraryCoeffs = sparse_nnls.lsqnonneg(
                    LibrarySparseMatrix, DIASpectrumIntensities,
                    {'show_progress': False})['x']

        # One output row per candidate with a nonzero coefficient; the
        # coefficients are in candidate (column) order.
        Output = [
            [coeff, index, key[0], key[1], precMZ, precRT]
            for key, coeff in zip(RefPeptideCandidates, LibraryCoeffs)
            if coeff != 0
        ]
        if not Output:
            Output = [[0, index, 0, 0, 0, 0]]

        yield Output
def _select_spectra_found_in_dia(Spectra, CentroidBreaks, mzShift=0):
    """Keep the spectra with more than 5 of their top-10 peaks in the DIA scan.

    ``CentroidBreaks`` holds the sorted lower/upper tolerance bounds of the
    DIA peaks, so an m/z whose insertion position is odd lies inside the
    window of DIA peak number ``(position - 1) / 2``.

    Returns ``(selected, locations)``: the positions in ``Spectra`` of the
    spectra that pass, and for each of them the insertion positions of all of
    its (shifted) peak m/z values.
    """
    Selected = []
    Locations = []
    for i, M in enumerate(Spectra):
        peakLocations = np.searchsorted(CentroidBreaks, M[:, 0] + mzShift)
        topTenLocations = peakLocations[np.argsort(-M[:, 1])[:10]]
        if np.count_nonzero(topTenLocations % 2 == 1) > 5:
            Selected.append(i)
            Locations.append(peakLocations)
    return Selected, Locations


def _library_sparse_entries(Locations, Spectra, columnOffset):
    """Build the sparse-matrix pieces for a list of selected spectra.

    Returns ``(rows, columns, entries, unmatched)`` where ``rows`` are the
    0-based indices of the matched DIA peaks, ``columns`` the spectrum number
    plus ``columnOffset``, ``entries`` the intensities normalised to unit sum,
    and ``unmatched`` the per-spectrum sum of normalised intensity of the
    peaks that hit no DIA peak.  All four are empty for an empty input.
    """
    if not Locations:
        return (np.array([], dtype=int), np.array([], dtype=int),
                np.array([], dtype=float), np.array([], dtype=float))

    Normalized = [M[:, 1] / sum(M[:, 1]) for M in Spectra]
    PeakInDIA = [peakLocations % 2 == 1 for peakLocations in Locations]
    MatchedCounts = np.array(
        [np.count_nonzero(mask) for mask in PeakInDIA], dtype=int)

    # Floor division keeps the indices integral on Python 3 as well.
    Rows = np.concatenate([
        (peakLocations[mask] + 1) // 2 - 1
        for peakLocations, mask in zip(Locations, PeakInDIA)
    ])
    Columns = columnOffset + np.repeat(np.arange(len(Locations)),
                                       MatchedCounts)
    Entries = np.concatenate([
        intensities[mask] for intensities, mask in zip(Normalized, PeakInDIA)
    ])
    Unmatched = np.array([
        intensities[~mask].sum()
        for intensities, mask in zip(Normalized, PeakInDIA)
    ])
    return Rows, Columns, Entries, Unmatched


def RegressSpectraOntoLibraryWithDecoys(DIASpectraIterator, Library, tol,
                                        maxWindowOffset):
    """Regress each DIA spectrum onto in-window library spectra plus decoys.

    Targets are the library spectra whose precursor m/z lies in the scan's
    window.  Decoys are library spectra whose precursor lies just outside it;
    their fragment m/z values are shifted by 20 and their key is reported as
    ``("DECOY_" + key[0], key[1])``.  Targets and decoys with more than 5 of
    their (up to) 10 most intense peaks inside a tolerance window of some DIA
    peak become columns of one sparse matrix, which is fitted to the scan
    with ``sparse_nnls.lsqnonneg``.  An extra row holds each column's
    unmatched intensity (regressed against 0) as a penalty.  Unlike
    ``RegressSpectraOntoLibrary`` the DIA peaks are not merged first.

    Args:
        DIASpectraIterator: iterable of scans, each indexable as
            ``(peaks, precursor m/z, retention time, scan index, window width)``.
            ``peaks`` must convert to a 2-D array with m/z in column 0 and
            intensity in column 1; anything that is not 2-D is reported as
            "no match".
        Library: object whose ``value`` attribute maps a key (``key[0]`` a
            string, ``key[1]`` a second field) to a dict with the entries
            ``'PrecursorMZ'`` and ``'Spectrum'`` (2-D array, m/z in column 0,
            intensity in column 1).
        tol: relative m/z tolerance; a DIA peak at ``m`` matches the interval
            ``[m - tol * m, m + tol * m]``.
        maxWindowOffset: used only when the scan's window width is not
            positive.  Targets then have precursor m/z above
            ``precursor m/z - maxWindowOffset / 2`` and decoys lie between
            ``precursor m/z - maxWindowOffset`` and that bound.

    Yields:
        One list per scan.  Each element is
        ``[coefficient, scan index, key[0], key[1], precursor m/z, retention time]``
        for a target or decoy with a nonzero coefficient (targets first).
        When nothing was fitted the list is ``[[0, scan index, 0, 0, 0, 0]]``.
    """
    RefSpectraLibrary = Library.value
    DecoyFragmentShift = 20  # m/z added to every decoy fragment peak

    for DIASpectrum in DIASpectraIterator:
        precMZ = float(DIASpectrum[1])
        precRT = float(DIASpectrum[2])  # MS2 scan retention time, in minutes
        index = DIASpectrum[3]
        windowWidth = DIASpectrum[4]
        DIASpectrum = np.array(DIASpectrum[0])

        # Reset per scan so that a scan without a fit reports "no match"
        # instead of reusing values from the previous scan.
        LibraryCoeffs = []
        FittedIDs = []

        if len(DIASpectrum.shape) == 2:
            # TARGETS INSIDE THE PRECURSOR WINDOW, DECOYS JUST OUTSIDE IT
            # A single pass keeps each key list aligned with its spectra.
            # ``items()`` works on both Python 2 and Python 3 dictionaries.
            MassWindowCandidates = []
            CandidateRefSpectraLibrary = []
            MassWindowDecoyCandidates = []
            CandidateDecoyLibrary = []
            for key, spectrum in RefSpectraLibrary.items():
                libraryMZ = float(spectrum['PrecursorMZ'])
                if windowWidth > 0:
                    distance = abs(libraryMZ - precMZ)
                    isTarget = distance < windowWidth / 2
                    isDecoy = windowWidth / 2 <= distance <= windowWidth
                else:
                    isTarget = libraryMZ > precMZ - maxWindowOffset / 2
                    isDecoy = (precMZ - maxWindowOffset <= libraryMZ
                               <= precMZ - maxWindowOffset / 2)
                if isTarget:
                    MassWindowCandidates.append(key)
                    CandidateRefSpectraLibrary.append(spectrum['Spectrum'])
                if isDecoy:
                    MassWindowDecoyCandidates.append(
                        ("DECOY_" + key[0], key[1]))
                    CandidateDecoyLibrary.append(spectrum['Spectrum'])

            # FILTER LIBRARY SPECTRA BY THE CONDITION THAT MORE THAN 5 OF THEIR
            # 10 MOST INTENSE PEAKS BELONG TO THE DIA SPECTRUM
            acquiredMZ = DIASpectrum[:, 0]
            CentroidBreaks = np.sort(np.concatenate(
                (acquiredMZ - tol * acquiredMZ, acquiredMZ + tol * acquiredMZ)))

            ReferencePeaksInDIA, RefPeptideCandidatesLocations = (
                _select_spectra_found_in_dia(CandidateRefSpectraLibrary,
                                             CentroidBreaks))
            # SHIFT ALL FRAGMENT ION PEAKS OF ALL DECOY SPECTRA BY 20 M/Z TO
            # ENSURE DISSIMILARITY FROM REAL SPECTRA
            DecoyPeaksInDIA, DecoyCandidatesLocations = (
                _select_spectra_found_in_dia(CandidateDecoyLibrary,
                                             CentroidBreaks,
                                             DecoyFragmentShift))

            RefPeptideCandidates = [
                MassWindowCandidates[i] for i in ReferencePeaksInDIA
            ]
            DecoyCandidates = [
                MassWindowDecoyCandidates[i] for i in DecoyPeaksInDIA
            ]

            RefRows, RefColumns, RefEntries, RefUnmatched = (
                _library_sparse_entries(
                    RefPeptideCandidatesLocations,
                    [CandidateRefSpectraLibrary[i]
                     for i in ReferencePeaksInDIA],
                    0))

            # Decoys are only fitted alongside at least one target.
            if len(RefRows) > 0:
                NumRef = len(RefPeptideCandidates)
                NumDecoy = len(DecoyCandidates)

                # Decoy columns follow the target columns.
                DecoyRows, DecoyColumns, DecoyEntries, DecoyUnmatched = (
                    _library_sparse_entries(
                        DecoyCandidatesLocations,
                        [CandidateDecoyLibrary[i] for i in DecoyPeaksInDIA],
                        NumRef))

                # Project the spectrum to those m/z bins at which at least one
                # column of the coefficient matrix has a nonzero entry, then
                # add a zero to the end of the DIA data vector to penalize
                # peaks of library spectra not present in the DIA spectrum.
                UniqueRowIndices = np.unique(
                    np.concatenate((RefRows, DecoyRows))).astype(int)
                DIASpectrumIntensities = np.append(
                    DIASpectrum[UniqueRowIndices, 1], [0])

                # AUGMENT THE LIBRARY MATRIX WITH TOTAL ION INTENSITIES OF
                # PEAKS OF LIBRARY SPECTRA THAT DON'T CORRESPOND TO PEAKS IN
                # DIA SPECTRUM (one shared penalty row for targets and decoys)
                PenaltyRowIndex = UniqueRowIndices[-1] + 1  # one past the max

                SparseRowIndices = np.concatenate((
                    RefRows,
                    np.repeat(PenaltyRowIndex, NumRef),
                    DecoyRows,
                    np.repeat(PenaltyRowIndex, NumDecoy)))
                SparseColumnIndices = np.concatenate((
                    RefColumns,
                    np.arange(NumRef),
                    DecoyColumns,
                    NumRef + np.arange(NumDecoy)))
                SparseMatrixEntries = np.concatenate((
                    RefEntries, RefUnmatched, DecoyEntries, DecoyUnmatched))

                # Renumber the row indices according to the projected
                # spectrum, respecting the 0-indexing.
                SparseRowIndices = stats.rankdata(
                    SparseRowIndices, method='dense').astype(int) - 1

                LibrarySparseMatrix = sparse.coo_matrix(
                    (SparseMatrixEntries,
                     (SparseRowIndices, SparseColumnIndices)))
                LibraryCoeffs = sparse_nnls.lsqnonneg(
                    LibrarySparseMatrix, DIASpectrumIntensities,
                    {'show_progress': False})['x']
                # Same order as the matrix columns: targets, then decoys.
                FittedIDs = RefPeptideCandidates + DecoyCandidates

        # One output row per column with a nonzero coefficient.
        Output = [
            [coeff, index, key[0], key[1], precMZ, precRT]
            for key, coeff in zip(FittedIDs, LibraryCoeffs)
            if coeff != 0
        ]
        if not Output:
            Output = [[0, index, 0, 0, 0, 0]]

        yield Output