def clusteringCAMERA(peaks, adducts, **kwargs):
    """
    Cluster peaks into adduct/fragment groups (CAMERA-like algorithm).

    Pipeline, as implemented below:
      1. RT grouping: for every peak, collect the peaks whose retention time
         differs by less than ``rtError``; a group is dropped when it is a
         subset of another group.
      2. ``massGenPerGroup`` is called on the RT groups (presumably it
         generates candidate M0 masses per group -- confirm against its
         implementation); its i-th result is used as a dict
         ``{mass: [peaks]}`` for the i-th RT group.
      3. Each generated mass is mapped onto the closest peak of its own RT
         group within the mass tolerance; that peak becomes a parent.
      4. Every parent receives a ``fragCluster`` and is appended to the
         ``parentPeak`` list of each member of that cluster.
      5. If ``resolveConflicts`` is set, a member claimed by several parents
         is kept only by the parent with the largest ``fragCluster``.
      6. Cluster members are annotated with the first adduct that explains
         the parent mass, ``checkingSons`` is applied, and every input peak
         that is neither a parent nor a cluster member is appended.

    arguments:
        peaks: MSPeakList; ``peaks[0].sample.ppm`` is read as the mass
            tolerance in ppm.
        adducts: dict; each key is used as ``(offset, divisor)``
            (``mass / key[1] + key[0]``), each value is stored as the
            annotation of a matching cluster member.
        kwargs:
            rtError: maximum retention time difference inside a group
                (default 6)
            mode: 'HighRes' uses a ppm-relative mass tolerance, any other
                value uses an absolute tolerance of 1 (default 'HighRes')
            resolveConflicts: see step 5 (default False)

    returns:
        (MSPeakList of the resulting peaks sorted by mass,
         MSPeakList of all peaks found as cluster members)
    """
    t = time.clock()

    # Nothing to cluster; this also avoids an IndexError on peaks[0] below.
    if len(peaks) == 0:
        return MSPeakList(), MSPeakList()

    # unpack parameters
    error_rt = kwargs.get('rtError', 6)
    ppm = peaks[0].sample.ppm / 1e6
    mode = kwargs.get('mode', 'HighRes')
    resolveConflicts = kwargs.get('resolveConflicts', False)
    highRes = mode == 'HighRes'
    print("%s %s" % ("peaklist length", len(peaks)))
    adducts_to_check = np.array(list(adducts.keys()))

    # ------------------------------------------------------------------
    # 1. retention time grouping
    print("RT Grouping ...")
    rtPeak = []
    for i, peak in enumerate(peaks.ipeaks()):
        group = MSPeakList()
        group.addPeak(peak)
        for j, other in enumerate(peaks.ipeaks()):
            if i != j and abs(peak.rt - other.rt) < error_rt:
                group.append(other)

        # built once per group instead of once per comparison
        groupSet = set(group)
        isIncluded = False
        absorbed = set()  # indexes of existing groups contained in `group`
        for k, rtClust in enumerate(rtPeak):
            clustSet = set(rtClust)
            if groupSet <= clustSet:  # already covered by an existing group
                isIncluded = True
                break
            if clustSet <= groupSet:
                absorbed.add(k)
        rtPeak = [clust for k, clust in enumerate(rtPeak)
                  if k not in absorbed]
        if not isIncluded:
            rtPeak.append(MSPeakList(group))
    print("%s %s" % ('len RTpeak', len(rtPeak)))

    # ------------------------------------------------------------------
    # 2. candidate masses per RT group (Cython code)
    print("Creating possible M0...")
    finalList = massGenPerGroup(rtPeak, adducts_to_check, ppm)

    # ------------------------------------------------------------------
    # 3. map every generated mass on a peak of the same RT group
    print("Mapping of calculated mass on peaklist...")
    goodPeak = []  # one {parent peak: [member peaks]} dict per RT group
    for i, dic in enumerate(finalList):
        matchingMass = defaultdict(list)
        for mass in dic:
            # NOTE: rtPeak[i] is not necessarily sorted
            candidates = rtPeak[i].peaksInMZRange(
                mass, deltam=mass * ppm if highRes else 1.)
            if not candidates:
                continue
            closest = min(candidates, key=lambda x: abs(mass - x.mass()))
            matchingMass[closest] += dic[mass]
        goodPeak.append(matchingMass)

    # ------------------------------------------------------------------
    # 4. merge the per-group results and build the fragClusters
    print("Merging informations...")
    adds = MSPeakList()  # all peaks found as cluster members
    newGoodPeaks = defaultdict(list)
    for peaksInOneRtGroup in goodPeak:
        for parent in peaksInOneRtGroup:
            newGoodPeaks[parent] += peaksInOneRtGroup[parent]

    for parent in newGoodPeaks:
        parent.fragCluster = MSClusterList(list(set(newGoodPeaks[parent])))
        for f in parent.fragCluster:
            f.parentPeak.append(parent)
        adds += parent.fragCluster

    finalPeaks = MSPeakList(list(newGoodPeaks.keys()))

    # ------------------------------------------------------------------
    # 5. a member may have been claimed by several parents
    print("Resolving conflicts if any...")

    def clusterComparison(list_):
        """
        Return the best parent among ``list_``: the one with the largest
        fragCluster.

        Ties are meant to be broken by correlation, but the correlation is
        not computed (the y data of the peaks is None), so ``corr`` stays at
        zero and the first peak of maximal cluster size is returned.
        """
        sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
        longest = len(sortedList[-1].fragCluster)
        sameSizePeaks = MSPeakList()
        for candidate in sortedList:
            if len(candidate.fragCluster) == longest:
                sameSizePeaks.append(candidate)
        if len(sameSizePeaks) == 1:
            return sameSizePeaks[0]
        corr = np.array([0.] * len(sameSizePeaks))
        m = max_f(corr)
        return sameSizePeaks[np.where(corr == m)[0][0]]

    if resolveConflicts:
        for add in set(adds):
            if len(add.parentPeak) <= 1:
                continue
            goodParent = clusterComparison(add.parentPeak)
            for parent in add.parentPeak:
                if parent != goodParent:
                    try:
                        parent.fragCluster.remove(add)
                    except ValueError:
                        print("Error removing %s from fragCluster of %s"
                              % (str(add), str(parent)))
            add.parentPeak = [goodParent]

    # ------------------------------------------------------------------
    # 6. annotate each member with the first adduct explaining its parent
    for peak in finalPeaks.ipeaks():
        for f in peak.fragCluster:
            for annot in adducts:
                expected = f.mass() / annot[1] + annot[0]
                diff = peak.mass() * ppm if highRes else 1
                if expected - diff < peak.mass() < expected + diff:
                    f.annotation[annot] = adducts[annot]
                    break

    finalPeaks = checkingSons(finalPeaks)

    # add back the peaks that are neither a parent nor a cluster member
    print("Merging interesting peaks")
    for peak in peaks.ipeaks():
        if peak not in finalPeaks and peak not in adds:
            finalPeaks.append(peak)

    if not finalPeaks:
        print("no cluster found, please increase the ppm, or rt drift parameters")

    print("%s %s" % ("finished, time elapsed:", time.clock() - t))
    # mass is a method: it has to be called to sort by mass value
    return MSPeakList(sorted(finalPeaks, key=lambda x: x.mass())), adds
def clusteringCAMERA(peaks, adducts, **kwargs):
    """
    Cluster peaks into adduct/fragment groups (CAMERA-like algorithm).

    Pipeline, as implemented below:
      1. RT grouping: for every peak, collect the peaks whose retention time
         differs by less than ``rtError``; a group is dropped when it is a
         subset of another group.
      2. ``massGenPerGroup`` is called on the RT groups (presumably it
         generates candidate M0 masses per group -- confirm against its
         implementation); its i-th result is used as a dict
         ``{mass: [peaks]}`` for the i-th RT group.
      3. Each generated mass is mapped onto the closest peak of its own RT
         group within the mass tolerance; that peak becomes a parent.
      4. Every parent receives a ``fragCluster`` and is appended to the
         ``parentPeak`` list of each member of that cluster.
      5. If ``resolveConflicts`` is set, a member claimed by several parents
         is kept only by the parent with the largest ``fragCluster``.
      6. Cluster members are annotated with the first adduct that explains
         the parent mass, ``checkingSons`` is applied, and every input peak
         that is neither a parent nor a cluster member is appended.

    arguments:
        peaks: MSPeakList; ``peaks[0].sample.ppm`` is read as the mass
            tolerance in ppm.
        adducts: dict; each key is used as ``(offset, divisor)``
            (``mass / key[1] + key[0]``), each value is stored as the
            annotation of a matching cluster member.
        kwargs:
            rtError: maximum retention time difference inside a group
                (default 6)
            mode: 'HighRes' uses a ppm-relative mass tolerance, any other
                value uses an absolute tolerance of 1 (default 'HighRes')
            resolveConflicts: see step 5 (default False)

    returns:
        (MSPeakList of the resulting peaks sorted by mass,
         MSPeakList of all peaks found as cluster members)
    """
    t = time.clock()

    # Nothing to cluster; this also avoids an IndexError on peaks[0] below.
    if len(peaks) == 0:
        return MSPeakList(), MSPeakList()

    # unpack parameters
    error_rt = kwargs.get('rtError', 6)
    ppm = peaks[0].sample.ppm / 1e6
    mode = kwargs.get('mode', 'HighRes')
    resolveConflicts = kwargs.get('resolveConflicts', False)
    highRes = mode == 'HighRes'
    print("%s %s" % ("peaklist length", len(peaks)))
    adducts_to_check = np.array(list(adducts.keys()))

    # ------------------------------------------------------------------
    # 1. retention time grouping
    print("RT Grouping ...")
    rtPeak = []
    for i, peak in enumerate(peaks.ipeaks()):
        group = MSPeakList()
        group.addPeak(peak)
        for j, other in enumerate(peaks.ipeaks()):
            if i != j and abs(peak.rt - other.rt) < error_rt:
                group.append(other)

        # built once per group instead of once per comparison
        groupSet = set(group)
        isIncluded = False
        absorbed = set()  # indexes of existing groups contained in `group`
        for k, rtClust in enumerate(rtPeak):
            clustSet = set(rtClust)
            if groupSet <= clustSet:  # already covered by an existing group
                isIncluded = True
                break
            if clustSet <= groupSet:
                absorbed.add(k)
        rtPeak = [clust for k, clust in enumerate(rtPeak)
                  if k not in absorbed]
        if not isIncluded:
            rtPeak.append(MSPeakList(group))
    print("%s %s" % ('len RTpeak', len(rtPeak)))

    # ------------------------------------------------------------------
    # 2. candidate masses per RT group (Cython code)
    print("Creating possible M0...")
    finalList = massGenPerGroup(rtPeak, adducts_to_check, ppm)

    # ------------------------------------------------------------------
    # 3. map every generated mass on a peak of the same RT group
    print("Mapping of calculated mass on peaklist...")
    goodPeak = []  # one {parent peak: [member peaks]} dict per RT group
    for i, dic in enumerate(finalList):
        matchingMass = defaultdict(list)
        for mass in dic:
            # NOTE: rtPeak[i] is not necessarily sorted
            candidates = rtPeak[i].peaksInMZRange(
                mass, deltam=mass * ppm if highRes else 1.)
            if not candidates:
                continue
            closest = min(candidates, key=lambda x: abs(mass - x.mass()))
            matchingMass[closest] += dic[mass]
        goodPeak.append(matchingMass)

    # ------------------------------------------------------------------
    # 4. merge the per-group results and build the fragClusters
    print("Merging informations...")
    adds = MSPeakList()  # all peaks found as cluster members
    newGoodPeaks = defaultdict(list)
    for peaksInOneRtGroup in goodPeak:
        for parent in peaksInOneRtGroup:
            newGoodPeaks[parent] += peaksInOneRtGroup[parent]

    for parent in newGoodPeaks:
        parent.fragCluster = MSClusterList(list(set(newGoodPeaks[parent])))
        for f in parent.fragCluster:
            f.parentPeak.append(parent)
        adds += parent.fragCluster

    finalPeaks = MSPeakList(list(newGoodPeaks.keys()))

    # ------------------------------------------------------------------
    # 5. a member may have been claimed by several parents
    print("Resolving conflicts if any...")

    def clusterComparison(list_):
        """
        Return the best parent among ``list_``: the one with the largest
        fragCluster.

        Ties are meant to be broken by correlation, but the correlation is
        not computed (the y data of the peaks is None), so ``corr`` stays at
        zero and the first peak of maximal cluster size is returned.
        """
        sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
        longest = len(sortedList[-1].fragCluster)
        sameSizePeaks = MSPeakList()
        for candidate in sortedList:
            if len(candidate.fragCluster) == longest:
                sameSizePeaks.append(candidate)
        if len(sameSizePeaks) == 1:
            return sameSizePeaks[0]
        corr = np.array([0.] * len(sameSizePeaks))
        m = max_f(corr)
        return sameSizePeaks[np.where(corr == m)[0][0]]

    if resolveConflicts:
        for add in set(adds):
            if len(add.parentPeak) <= 1:
                continue
            goodParent = clusterComparison(add.parentPeak)
            for parent in add.parentPeak:
                if parent != goodParent:
                    try:
                        parent.fragCluster.remove(add)
                    except ValueError:
                        print("Error removing %s from fragCluster of %s"
                              % (str(add), str(parent)))
            add.parentPeak = [goodParent]

    # ------------------------------------------------------------------
    # 6. annotate each member with the first adduct explaining its parent
    for peak in finalPeaks.ipeaks():
        for f in peak.fragCluster:
            for annot in adducts:
                expected = f.mass() / annot[1] + annot[0]
                diff = peak.mass() * ppm if highRes else 1
                if expected - diff < peak.mass() < expected + diff:
                    f.annotation[annot] = adducts[annot]
                    break

    finalPeaks = checkingSons(finalPeaks)

    # add back the peaks that are neither a parent nor a cluster member
    print("Merging interesting peaks")
    for peak in peaks.ipeaks():
        if peak not in finalPeaks and peak not in adds:
            finalPeaks.append(peak)

    if not finalPeaks:
        print("no cluster found, please increase the ppm, or rt drift parameters")

    print("%s %s" % ("finished, time elapsed:", time.clock() - t))
    # mass is a method: it has to be called to sort by mass value
    return MSPeakList(sorted(finalPeaks, key=lambda x: x.mass())), adds
def isotopicPeakListFinder(peaks, isomasses, **kwargs):
    """
    Assign an isotopic cluster to each peak of ``peaks``.

    For every peak not already used as an isotope, the mass offsets of
    ``isomasses`` are tried in increasing order: the peaks found around
    ``peak.mass() + offset`` (mass tolerance) and within ``rtError`` of the
    peak retention time are candidates, and one of them is added to the
    isotopic cluster of the peak.

    input:
        peaks: MSPeakList; ``peaks[0].sample.ppm`` is read as the mass
            tolerance in ppm
        isomasses: sequence whose items carry the mass offset at index 0
        kwargs:
            rtError: maximum drift of the retention time (default 6)
            decreaseOrder: when true, the search for a peak stops as soon as
                a candidate has a larger area than the previous member of
                the cluster (or than the peak itself for the first member);
                such a candidate may be an idms (default True)
            mode: 'HighRes' uses a ppm-relative mass tolerance, any other
                value an absolute tolerance of 1 (default 'HighRes')

    output:
        a tuple ``(peaks_with_iso + peaks_without_iso, list_iso)``:
        the peaks that own an isotopic cluster followed by the processed
        peaks that own none, and the set of all peaks assigned to an
        isotopic cluster.
    """
    print("Isotopic cluster calculation...")

    # Nothing to do; this also avoids an IndexError on peaks[0] below.
    if len(peaks) == 0:
        return MSPeakList() + MSPeakList(), set()

    # unpacking parameters (builtin float/len: the np.float / np.int aliases
    # were plain aliases of the builtins and no longer exist in NumPy)
    rtError = float(kwargs.get('rtError', 6))
    ppm = float(peaks[0].sample.ppm / 1e6)
    MAX_GAP_ALLOWED = len(isomasses)
    decreaseOrder = kwargs.get('decreaseOrder', True)
    # The default used to be spelled 'Highres', which never matched the
    # 'HighRes' test below, so the ppm tolerance was silently ignored.
    mode = kwargs.get('mode', 'HighRes')
    highRes = mode == 'HighRes'
    # loop invariant: sort the mass offsets once, not once per peak
    sortedIsomasses = sorted(isomasses, key=lambda x: x[0])

    peaks_with_iso = MSPeakList()
    peaks_without_iso = MSPeakList()  # processed peaks with no cluster
    list_iso = set()  # every peak assigned to an isotopic cluster

    t = time.clock()
    for peak in peaks.ipeaks():
        if peak in list_iso:
            continue  # already an isotope of another peak
        isoCluster = MSClusterList()
        gap = 0  # number of mass offsets without any candidate
        for isomass in sortedIsomasses:
            massToCheck = peak.mass() + isomass[0]
            inMassWindow = peaks.peaksInMZRange(
                massToCheck, deltam=ppm * massToCheck if highRes else 1.)

            # keep the candidates that also match in retention time
            matchingRtPeaks = MSPeakList()
            for pk in inMassWindow.ipeaks():
                if pk != peak and abs(peak.rt - pk.rt) <= rtError:
                    matchingRtPeaks.append(pk)

            if not matchingRtPeaks:
                gap += 1
                if gap >= MAX_GAP_ALLOWED:
                    break
                continue

            # NOTE(review): the candidate is chosen closest to peak.mass(),
            # not to massToCheck (the expected isotope mass) -- confirm that
            # this is intended.
            pic = min(matchingRtPeaks,
                      key=lambda c: abs(c.mass() - peak.mass()))
            if decreaseOrder:
                # areas are expected to decrease along the cluster
                areaToCompare = isoCluster[-1].area if isoCluster else peak.area
                if areaToCompare < pic.area:  # idms found ???
                    break
            if pic not in list_iso:
                isoCluster.append(pic)
                list_iso.add(pic)

        if isoCluster:
            # set parent for all peaks found
            for pics in isoCluster:
                pics.parentPeak.append(peak)
            peak.isoCluster = isoCluster
            peaks_with_iso.addPeak(peak)
        else:
            peaks_without_iso.addPeak(peak)

    print(time.clock() - t)
    print("%s %s" % ("peaks with isotopes: ", len(peaks_with_iso)))
    print("%s %s" % ("list isotopes: ", len(list_iso)))
    print("%s %s" % ("peaks without isotopes: ", len(peaks_without_iso)))
    return peaks_with_iso + peaks_without_iso, list_iso
def isotopicPeakListFinder(peaks, isomasses, **kwargs):
    """
    Assign an isotopic cluster to each peak of ``peaks``.

    For every peak not already used as an isotope, the mass offsets of
    ``isomasses`` are tried in increasing order: the peaks found around
    ``peak.mass() + offset`` (mass tolerance) and within ``rtError`` of the
    peak retention time are candidates, and one of them is added to the
    isotopic cluster of the peak.

    input:
        peaks: MSPeakList; ``peaks[0].sample.ppm`` is read as the mass
            tolerance in ppm
        isomasses: sequence whose items carry the mass offset at index 0
        kwargs:
            rtError: maximum drift of the retention time (default 6)
            decreaseOrder: when true, the search for a peak stops as soon as
                a candidate has a larger area than the previous member of
                the cluster (or than the peak itself for the first member);
                such a candidate may be an idms (default True)
            mode: 'HighRes' uses a ppm-relative mass tolerance, any other
                value an absolute tolerance of 1 (default 'HighRes')

    output:
        a tuple ``(peaks_with_iso + peaks_without_iso, list_iso)``:
        the peaks that own an isotopic cluster followed by the processed
        peaks that own none, and the set of all peaks assigned to an
        isotopic cluster.
    """
    print("Isotopic cluster calculation...")

    # Nothing to do; this also avoids an IndexError on peaks[0] below.
    if len(peaks) == 0:
        return MSPeakList() + MSPeakList(), set()

    # unpacking parameters (builtin float/len: the np.float / np.int aliases
    # were plain aliases of the builtins and no longer exist in NumPy)
    rtError = float(kwargs.get('rtError', 6))
    ppm = float(peaks[0].sample.ppm / 1e6)
    MAX_GAP_ALLOWED = len(isomasses)
    decreaseOrder = kwargs.get('decreaseOrder', True)
    # The default used to be spelled 'Highres', which never matched the
    # 'HighRes' test below, so the ppm tolerance was silently ignored.
    mode = kwargs.get('mode', 'HighRes')
    highRes = mode == 'HighRes'
    # loop invariant: sort the mass offsets once, not once per peak
    sortedIsomasses = sorted(isomasses, key=lambda x: x[0])

    peaks_with_iso = MSPeakList()
    peaks_without_iso = MSPeakList()  # processed peaks with no cluster
    list_iso = set()  # every peak assigned to an isotopic cluster

    t = time.clock()
    for peak in peaks.ipeaks():
        if peak in list_iso:
            continue  # already an isotope of another peak
        isoCluster = MSClusterList()
        gap = 0  # number of mass offsets without any candidate
        for isomass in sortedIsomasses:
            massToCheck = peak.mass() + isomass[0]
            inMassWindow = peaks.peaksInMZRange(
                massToCheck, deltam=ppm * massToCheck if highRes else 1.)

            # keep the candidates that also match in retention time
            matchingRtPeaks = MSPeakList()
            for pk in inMassWindow.ipeaks():
                if pk != peak and abs(peak.rt - pk.rt) <= rtError:
                    matchingRtPeaks.append(pk)

            if not matchingRtPeaks:
                gap += 1
                if gap >= MAX_GAP_ALLOWED:
                    break
                continue

            # NOTE(review): the candidate is chosen closest to peak.mass(),
            # not to massToCheck (the expected isotope mass) -- confirm that
            # this is intended.
            pic = min(matchingRtPeaks,
                      key=lambda c: abs(c.mass() - peak.mass()))
            if decreaseOrder:
                # areas are expected to decrease along the cluster
                areaToCompare = isoCluster[-1].area if isoCluster else peak.area
                if areaToCompare < pic.area:  # idms found ???
                    break
            if pic not in list_iso:
                isoCluster.append(pic)
                list_iso.add(pic)

        if isoCluster:
            # set parent for all peaks found
            for pics in isoCluster:
                pics.parentPeak.append(peak)
            peak.isoCluster = isoCluster
            peaks_with_iso.addPeak(peak)
        else:
            peaks_without_iso.addPeak(peak)

    print(time.clock() - t)
    print("%s %s" % ("peaks with isotopes: ", len(peaks_with_iso)))
    print("%s %s" % ("list isotopes: ", len(list_iso)))
    print("%s %s" % ("peaks without isotopes: ", len(peaks_without_iso)))
    return peaks_with_iso + peaks_without_iso, list_iso