Code example #1
File: MetClustering.py  Project: jerkos/metms
def clusteringBASIC(peaks, adds, **k):
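    # NOTE: this excerpt relies on names imported elsewhere in MetClustering.py
    # (time, numpy as np, MSPeakList, MSClusterList)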
    
    if not peaks:
        return
    t=time.clock()
    errorRt = k.get('rtError', 6)
    #ppm = float(kwargs.get('ppm'))/10**6
    ppm = k.get('ppm')
    if ppm is None:
        try:        
            ppm = peaks[0].sample.ppm/1e6
        except AttributeError:
            print "No value found for ppm setting to 10/1E6"
            ppm = 10./1e6
    #mode = k.get('mode', 'HighRes')
    resolveConflicts=k.get('resolveConflicts', False)
    addsToCheck=np.array(adds.keys())
    
    adductsFound = MSPeakList()    
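    # for each peak, look for peaks shifted by each candidate adduct mass,
    # inside an m/z window of 2*ppm*m and an rt window of errorRt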
    for i, p in enumerate(peaks):
        a = MSClusterList()        
        for v in addsToCheck:
            m = p.mz+v[0]
            match = peaks.peaksInMZRTRange(m, p.rt, errorRt, deltam= 2 * ppm * m)
            if match is None or not match:
                continue
            #take the closest in mass
            goodP = sorted(match, key=lambda x:abs(x.mz - (p.mz + v[0])))[0]
            #if goodP in set(adductsFound):
            #    if resolveConflicts:
            #        pass
            #else:
            if goodP is p:
                continue
            a.append(goodP)
            goodP.parentPeak=p
            adductsFound.append(goodP)
        p.fragCluster=MSPeakList(set(a))  # set() prevents duplicate entries
        
#    def clusterComparison(list_):#receive a list of peak with clusters identified
#        """
#        return the best peak
#        WARNING: p_ydata and p_.y_data are None
#        TODO: 
#        
#        """        
#        sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
#        longest=len(sortedList[-1].fragCluster)
#        sameSizePeaks=MSPeakList()        
#        
#        for p in sortedList:
#            if len(p.fragCluster) == longest:
#                sameSizePeaks.append(p)
#        
#        if len(sameSizePeaks) == 1:
#            return sameSizePeaks[0]
#        corr=np.array([0.] * len(sameSizePeaks))
#        #for i, p in enumerate(sameSizePeaks):
#        #    for p_ in p.fragCluster:
#        #        corr[i] += r_coef(p_.y_data, p.y_data)
#        m=max_f(corr)
#        return sameSizePeaks[np.where(corr == m)[0][0]]
#        
#        if resolveConflicts:
#            for add in set(adductsFound):
#                if len(add.parentPeak) <= 1:
#                    #print "%s belong to several fragCluster"%str(add)
#                    continue
#                #print "%s belong to several fragCluster"%str(add)
#                goodParent=clusterComparison(add.parentPeak)
#                #if goodParent is not None:
#                #    add.parentPeak = [goodParent]            
#                
#                for parent in add.parentPeak:
#                    if parent != goodParent:
#                        try:
#                            parent.fragCluster.remove(add)
#                        except ValueError:
#                            print "Error removing %s from fragCluster of %s"%(str(add), str(parent))
#                add.parentPeak = [goodParent] #the same of constructing a list 'toRemove then remove
#                #print "after removing len add.parentPeak", len(add.parentPeak)
    print "TiemElapsed: %s"%str(time.clock()-t)
    return peaks, adductsFound
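
The matching step above can be illustrated with a minimal standalone sketch. Everything below is hypothetical: plain namedtuples stand in for the project's MSPeakList/MSClusterList classes, and the deltam argument of peaksInMZRTRange is assumed to behave as a symmetric half-window around the target m/z.

from collections import namedtuple

Peak = namedtuple('Peak', 'mz rt')

def basic_adduct_clustering(peaks, adduct_shifts, rt_error=6.0, ppm=10e-6):
    """For each peak, look for peaks shifted by a known adduct mass,
    inside an m/z window of 2*ppm*m and an rt window of rt_error."""
    clusters = {}
    for p in peaks:
        found = []
        for shift in adduct_shifts:
            target = p.mz + shift
            candidates = [q for q in peaks
                          if q is not p
                          and abs(q.mz - target) <= 2 * ppm * target
                          and abs(q.rt - p.rt) <= rt_error]
            if candidates:
                # keep the candidate closest in mass, as clusteringBASIC does
                found.append(min(candidates, key=lambda q: abs(q.mz - target)))
        clusters[p] = set(found)          # a set prevents duplicate entries
    return clusters

# purely illustrative peaks: 300.100 plus its approximate +Na and +K shifts
peaks = [Peak(300.100, 120.0), Peak(322.082, 121.5), Peak(338.056, 122.0)]
print(basic_adduct_clustering(peaks, [21.982, 37.956]))

With exact m/z matches, the two shifted peaks come back as the adduct cluster of the first peak, and the closest-in-mass rule plays the role of the sorted(...)[0] line above.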
Code example #2
File: MetClustering.py  Project: jerkos/metms
def isotopicPeakListFinder(peaks, isomasses, **kwargs):
    """
    assign an isotopic cluster for each peak, and try to find an idms
     we may use a system like the CAMERA algorithm to see...
    input:
        list of peak must an obj.MSPeakList object
        clusterLength = 6  never go to six in LOW_RES
                        size expected of an isotopic cluster
        rtError: maximum drift of the retention time
        decreaseOrder: allow or not allow that the successive peak of the isotopic cluster
                        intensity are going down, can be confusing for finding idms
    output:
        two MSPeakList, the first one corresponding to the peaks with an isotopic cluster
        and the other one all peaks belonging to an isotopic cluster
    """
   
    #unpacking parameters
    print "Isotopic cluster calculation..."
    
    rtError = np.float(kwargs.get('rtError', 6))
    ppm=np.float(peaks[0].sample.ppm/1e6)
    MAX_GAP_ALLOWED = np.int(len(isomasses))
    decreaseOrder = kwargs.get('decreaseOrder', True)  #we use the less restrictive...
    mode = kwargs.get('mode', 'HighRes')  #must match the 'HighRes' test below
    #sort isomasses
    #isomasses = sorted(isomasses, key=lambda x:x[0])
    
    peaks_with_iso = MSPeakList()
    peaks_without_iso = MSPeakList()#peaks for which no isotopic cluster was found
    list_iso = set()#MSPeakList()

    t = time.clock()    
    
    for peak in peaks.ipeaks():#iterating over peaks
        
        if peak in list_iso:
            continue#skip peaks already assigned to an isotopic cluster
        
        isoCluster= MSClusterList()
        gap = 0
        #isos = resolutionAdjustment(isomasses, peak.mass()*ppm) if mode=='HighRes' else isomasses
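        # walk the isotope mass shifts in increasing order; stop after
        # MAX_GAP_ALLOWED consecutive misses or, with decreaseOrder, as soon
        # as the area increases along the cluster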
        for i, isomass in enumerate(sorted(isomasses, key=lambda x:x[0])):
            #pic = _getMatchingPeaks(peaks, peak, isomass[0], ppm, rtError)
            
            mass=isomass[0]
            massToCheck=peak.mass()+mass
                        
            p = peaks.peaksInMZRange(massToCheck, deltam=ppm*massToCheck if mode=='HighRes' else 1.) #deltart
            matchingRtPeaks = MSPeakList()#will contain all matching peak in rt
            for pk in p.ipeaks():
                if pk != peak:
                    if abs(peak.rt - pk.rt) <= rtError:
                        matchingRtPeaks.append(pk)
            
            if matchingRtPeaks:
         
                pic = sorted(matchingRtPeaks, key=lambda pics: abs(pics.mass()-peak.mass()))[0] #take the closest in mass
                if pic is not None:
                    if decreaseOrder:#require decreasing areas along the cluster
                        #if isoCluster:
                        areaToCompare=isoCluster[-1].area if isoCluster else peak.area 
                        if areaToCompare < pic.area:#idms found ???
                           break
                      
                    if pic not in list_iso:#pic not in isoCluster and
                        isoCluster.append(pic)
                        list_iso.add(pic)
            else:
                gap+=1
                if gap >=MAX_GAP_ALLOWED:
                    break
        
        # #set parent for all peaks found
        if isoCluster:
            for pics in isoCluster:
                #pics.parentPeak=peak
                pics.parentPeak.append(peak)
            peak.isoCluster = isoCluster
            peaks_with_iso.addPeak(peak)
        else:
            peaks_without_iso.addPeak(peak)
      
    
#    for p in peaks.ipeaks():
#        if p not in peaks_with_iso and p not in list_iso:
#            peaks_without_iso.addPeak(p)
    
    
    print time.clock()-t
    print "peaks with isotopes: " ,len(peaks_with_iso)
    print "list isotopes: " ,len(list_iso)
    print "peaks without isotopes: " ,len(peaks_without_iso)
    return peaks_with_iso+peaks_without_iso, list_iso
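
The same kind of hypothetical sketch can show the isotope walk: take the mass shifts in increasing order, keep the closest match within the rt window, stop after too many consecutive misses, and optionally stop as soon as the area increases. Plain namedtuples again replace the project's classes, and the deltam window semantics are assumed as above.

from collections import namedtuple

Peak = namedtuple('Peak', 'mz rt area')

def find_iso_cluster(peak, peaks, iso_shifts, ppm=10e-6, rt_error=6.0,
                     max_gap=None, decrease_order=True):
    """Collect, for each isotope mass shift taken in increasing order, the
    closest matching peak; stop after max_gap consecutive misses, or as soon
    as the area increases when decrease_order is set."""
    if max_gap is None:
        max_gap = len(iso_shifts)
    cluster, gap = [], 0
    for shift in sorted(iso_shifts):
        target = peak.mz + shift
        candidates = [q for q in peaks
                      if q is not peak
                      and abs(q.mz - target) <= ppm * target
                      and abs(q.rt - peak.rt) <= rt_error]
        if not candidates:
            gap += 1
            if gap >= max_gap:
                break
            continue
        best = min(candidates, key=lambda q: abs(q.mz - target))
        prev_area = cluster[-1].area if cluster else peak.area
        if decrease_order and prev_area < best.area:
            break    # areas should not increase along the cluster
        cluster.append(best)
    return cluster

# purely illustrative values: a base peak and its first two isotopes
peaks = [Peak(300.000, 100.0, 1e6),
         Peak(301.003, 100.5, 3e5),
         Peak(302.007, 100.8, 8e4)]
print(find_iso_cluster(peaks[0], peaks, [1.003, 2.007]))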
Code example #3
File: MetClustering.py  Project: jerkos/metms
def clusteringCAMERA(peaks, adducts, **kwargs):
    """
    arguments needed:
        error_rt:rt_ drift
        ppm:precision
        useCorrelation: if we calculate correlations
    """
    t = time.clock()
    #unpack parameters
    error_rt = kwargs.get('rtError', 6)
    #ppm = float(kwargs.get('ppm'))/10**6
    ppm = peaks[0].sample.ppm / 1e6
    mode = kwargs.get('mode', 'HighRes')
    resolveConflicts = kwargs.get('resolveConflicts', False)
    peaks_with_iso = peaks
    print "peaklist length", len(peaks)
    adducts_to_check = np.array(adducts.keys())
    #===========================================================================
    #START CAMERA ALGORITHM
    print("RT Grouping ...")
    #RT_peak=peaks_with_iso.rtClustering(error_rt)
    #3,find for each peak peaks which matches with retention time
    rtPeak = []
    for i, peak in enumerate(peaks_with_iso.ipeaks()):

        l = MSPeakList()
        l.addPeak(peak)
        for j, peak_ in enumerate(peaks_with_iso.ipeaks()):
            if i != j:
                if abs(peak.rt - peak_.rt) < error_rt:
                    l.append(peak_)
        isIncluded = False
        index = []
        for k, rtClust in enumerate(rtPeak):
            if set(l) <= set(rtClust):  #l is already included in an existing rt cluster
                isIncluded = True
                break

            if set(rtClust) <= (set(l)):
                index.append(k)
                #break
        #del rtPeak[index]
        rtPeak = [rtPeak[i] for i in xrange(len(rtPeak)) if i not in index]
        if not isIncluded:
            rtPeak.append(MSPeakList(l))
            #isIncluded=True
            #else:
            #    if rtClust.__eq__(l):
            #        rtPeak[k]=l
            #        break
            #isIncluded=True
        #if not isIncluded:
        #l.sort(key=lambda x:x.mass())


#    with open('test1.txt', 'w') as f:
#        for r in rtPeak:
#            s=""
#            for i, p in enumerate(r):
#                s+=str(p)+';' if i<len(r)-1 else str(p)+'\n'
#            f.write(s)

#EXPERIMENTAL CODE
#    cl=[]
#    for cluster in rtPeak:
#        list_=[];datapoints={}
#        for i, p in enumerate(cluster):
#            correspondingPeaks=set()
#            correspondingPeaks.add(p)
#            for j in xrange(i+1, len(cluster)):
#                #put caching on that to avoid recalculation each time of the datapoints
#                try:
#                    r=r_coef(list(datapoints[p]), list(datapoints[cluster[j]]))
#                except KeyError:
#                    y, y_= None, None
#                    try:
#                        y=datapoints[p]
#                    except KeyError:
#                        x, y= massExtractionBisectAlgo(p.sample,p.mass(), ppm)
#                        datapoints[p]=y
#
#                    try:
#                        y_=datapoints[cluster[j]]
#                    except KeyError:
#                        x, y_= massExtractionBisectAlgo(cluster[j].sample, cluster[j].mass(), ppm)
#                        datapoints[cluster[j]]=y_
#                    r=r_coef(y, y_)
#                if r >= threshold:
#                    correspondingPeaks.add(cluster[j])
#            list_.append(correspondingPeaks)
#
#        for i, p in enumerate(list_):
#            for j in xrange(i+1, len(list_)):
#                if list_[j].issubset(p):
#                    continue
#                else:
#                    cl.append(MSPeakList(list(p)))
#merging step again
#    print "cluster length, same without replicates",len(cl), len(set(map(set, [x for x in cl])))
#    with open('test2.txt', 'w') as f:
#        for r in cl:
#            s=""
#            for i, p in enumerate(r):
#                s+=str(p)+';' if i<len(r)-1 else str(p)+'\n'
#            f.write(s)
#
#END EXPERIMENTAL CODE
    print 'len RTpeak', len(rtPeak)
    print("Creating possible M0...")
    #Cython code
    finalList = massGenPerGroup(rtPeak, adducts_to_check, ppm)
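    # finalList (inferred from the usage below): one dict per rt group,
    # mapping each candidate M0 mass to the adduct hypotheses that generated it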
    print("Mapping of calculated mass on peaklist...")
    #4,see if one matches with peak in the raw peaklist
    goodPeak = []  #list will contain good peak per rtCluster
    for i, dic in enumerate(finalList):
        matchingMass = defaultdict(list)
        for mass in dic.iterkeys():
            p = rtPeak[i].peaksInMZRange(
                mass, deltam=mass * ppm if mode == 'HighRes' else
                1.)  #rtPeak[i] not necessarily sorted warning
            if not p:
                continue
            peak = sorted(p, key=lambda x: abs(mass - x.mass()))[0]
            #if peak not in matchingMass.keys():#may avoid this to see if one peak appears several times !then do 'set'
            #    matchingMass[peak]=[]
            matchingMass[peak] += dic[mass]
        goodPeak.append(matchingMass)

    #start new stuffs here
    print("Merging informations...")
    #conflicts=False
    adds = MSPeakList()  #object for storing adducts found
    newGoodPeaks = defaultdict(list)  #{}
    for peaksInOneRtGroup in goodPeak:
        for peak in peaksInOneRtGroup.iterkeys():
            newGoodPeaks[peak] += peaksInOneRtGroup[peak]
    for p in newGoodPeaks.iterkeys():
        p.fragCluster = MSClusterList(list(set(newGoodPeaks[p])))
        for f in p.fragCluster:
            f.parentPeak.append(p)
        adds += p.fragCluster
    finalPeaks = MSPeakList(newGoodPeaks.keys())

    print("Resolving conflicts if any...")

    #remove peaks that appear several times, i.e. in different clusters
    def clusterComparison(list_):  #receives a list of peaks with identified clusters
        """
        return the best peak
        WARNING: p_ydata and p_.y_data are None
        TODO: 
        
        """
        sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
        longest = len(sortedList[-1].fragCluster)
        sameSizePeaks = MSPeakList()

        for p in sortedList:
            if len(p.fragCluster) == longest:
                sameSizePeaks.append(p)

        if len(sameSizePeaks) == 1:
            return sameSizePeaks[0]
        corr = np.array([0.] * len(sameSizePeaks))
        #for i, p in enumerate(sameSizePeaks):
        #    for p_ in p.fragCluster:
        #        corr[i] += r_coef(p_.y_data, p.y_data)
        m = max_f(corr)
        return sameSizePeaks[np.where(corr == m)[0][0]]

    if resolveConflicts:
        for add in set(adds):
            if len(add.parentPeak) <= 1:
                #print "%s belong to several fragCluster"%str(add)
                continue
            #print "%s belong to several fragCluster"%str(add)
            goodParent = clusterComparison(add.parentPeak)
            #if goodParent is not None:
            #    add.parentPeak = [goodParent]

            for parent in add.parentPeak:
                if parent != goodParent:
                    try:
                        parent.fragCluster.remove(add)
                    except ValueError:
                        print "Error removing %s from fragCluster of %s" % (
                            str(add), str(parent))
            add.parentPeak = [goodParent]  #equivalent to building a 'toRemove' list and then removing
            #print "after removing len add.parentPeak", len(add.parentPeak)

    #make the annotation
    for peak in finalPeaks.ipeaks():
        for f in peak.fragCluster:
            #results = makeAnnotations(adducts_to_check, adducts, f.mass(), ppm)
            for annot in adducts.iterkeys():
                p = f.mass() / annot[1] + annot[0]
                diff = peak.mass() * ppm if mode == 'HighRes' else 1
                if peak.mass() > p - diff and peak.mass() < p + diff:
                    f.annotation[annot] = adducts[annot]
                    break
    finalPeaks = checkingSons(finalPeaks)
    #5,second filter, correlation on the isotopic cluster between samples
    #    if useCorrelation:
    #        print "Calculating correlation between samples..."
    #        interSamplesCorr(spl, **kwargs)
    #        print  "Calculating correlation intra sample..."
    #        intraSampleCorr(spl)
    #    #6 merging
    print "Merging interesting peaks"
    for peak in peaks_with_iso.ipeaks():  #warning: merging must take out the peaks that were used to construct this peak
        if peak not in finalPeaks and peak not in adds:  #matching_peaks:
            finalPeaks.append(peak)  #matching_peaks to
    if not finalPeaks:
        print("no cluster found, please increase the ppm or rt drift parameters")
    print "finished, time elapsed:", time.clock() - t
    return MSPeakList(sorted(finalPeaks, key=lambda x: x.mass())), adds  #checkingSons(finalPeaks), adds
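
The RT-grouping step at the start of clusteringCAMERA (one neighbourhood per peak, keeping only the groups that are not contained in another group) can also be sketched without the project's classes; the names and values below are purely illustrative.

def rt_grouping(peaks, rt_error=6.0):
    """Build, for every peak, the set of peaks within rt_error of it, then
    keep only the groups that are not included in another kept group."""
    groups = []
    for mz, rt in peaks:
        group = frozenset(q for q in peaks if abs(rt - q[1]) < rt_error)
        if any(group <= g for g in groups):
            continue                                    # already covered
        groups = [g for g in groups if not g <= group]  # drop strict subsets
        groups.append(group)
    return groups

# peaks as (mz, rt) pairs, purely illustrative values
peaks = [(300.1, 100.0), (322.1, 101.0), (450.2, 250.0), (451.2, 252.0)]
for g in rt_grouping(peaks):
    print(sorted(g))

Keeping only maximal groups is what the subset tests and the index-based filtering in the grouping loop above achieve.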