# Example #1
def clusteringCAMERA(peaks, adducts, **kwargs):
    # Group co-eluting peaks and attach adduct annotations to them.
    #
    # Outline of what the visible code does:
    #   1. builds retention-time groups (`rtPeak`) from `peaks`;
    #   2. passes them to massGenPerGroup() with the adduct keys and ppm;
    #   3. looks each generated mass up in its group via peaksInMZRange()
    #      and keeps the closest peak in mass;
    #   4. stores the de-duplicated matches on each peak as `fragCluster`;
    #   5. optionally prunes adducts that are claimed by several parents;
    #   6. fills `annotation` on cluster members from `adducts`;
    #   7. calls checkingSons() and appends every input peak that is neither
    #      in the result nor in `adds`.
    #
    # Keyword options read below: 'rtError' (default 6), 'mode' (default
    # 'HighRes') and 'resolveConflicts' (default False). ppm is not taken
    # from kwargs; it is read from peaks[0].sample.ppm.
    #
    # Returns a 2-tuple whose second element is `adds`.
    #
    # Python 2 code: print statements, xrange, dict.iterkeys().
    #
    # NOTE(review): several statements of this body appear to have been lost
    # (block headers without a body, an `except` without its `try`, an
    # unbalanced `return`). Each spot is flagged below; restore them from the
    # upstream source before running.
    # NOTE(review): the next three lines read like docstring text whose
    # triple-quote delimiters are missing -- confirm.
    arguments needed:
        error_rt:rt_ drift
        useCorrelation: if we calculate correlations
    # NOTE(review): time.clock() was removed in Python 3.8; only usable on
    # the Python 2 interpreter this file targets.
    t = time.clock()
    #unpack parameters
    error_rt = kwargs.get('rtError', 6)
    #ppm = float(kwargs.get('ppm'))/10**6
    # NOTE(review): indexes peaks[0] unconditionally -- presumably fails on
    # an empty peak list; verify against callers.
    ppm = peaks[0].sample.ppm / 1e6
    mode = kwargs.get('mode', 'HighRes')
    resolveConflicts = kwargs.get('resolveConflicts', False)
    # Plain alias of the input, not a copy.
    peaks_with_iso = peaks
    print "peaklist length", len(peaks)
    adducts_to_check = np.array(adducts.keys())
    print("RT Grouping ...")
    #3,find for each peak peaks which matches with retention time
    rtPeak = []
    # Every peak is compared with every other peak (quadratic in len(peaks)).
    for i, peak in enumerate(peaks_with_iso.ipeaks()):

        l = MSPeakList()
        for j, peak_ in enumerate(peaks_with_iso.ipeaks()):
            if i != j:
                # NOTE(review): no body follows this test; presumably peak_
                # was added to `l` here (and `peak` itself somewhere above)
                # -- confirm.
                if abs(peak.rt - peak_.rt) < error_rt:
        isIncluded = False
        index = []
        for k, rtClust in enumerate(rtPeak):
            # NOTE(review): the right-hand operand is an empty pair of
            # parentheses; going by the trailing comment and the mirrored
            # test below it was presumably set(rtClust) -- confirm.
            if set(l) <= (
            ):  #inclusion test of l already in rt ? seen as 'equivalent to'
                isIncluded = True

            # NOTE(review): no body; since `index` is consumed right after
            # the loop, presumably index.append(k) -- confirm.
            if set(rtClust) <= (set(l)):
        #del rtPeak[index]
        # Drop the groups whose positions were collected in `index`. In
        # Python 2 the comprehension variable `i` overwrites the loop's `i`;
        # `i` is not read again in this iteration.
        rtPeak = [rtPeak[i] for i in xrange(len(rtPeak)) if i not in index]
        # NOTE(review): no body; presumably rtPeak.append(l) -- confirm.
        if not isIncluded:
            #    if rtClust.__eq__(l):
            #        rtPeak[k]=l
            #        break
        #if not isIncluded:
        #l.sort(key=lambda x:x.mass())

#    with open('test1.txt', 'w') as f:
#        for r in rtPeak:
#            s=""
#            for i, p in enumerate(r):
#                s+=str(p)+';' if i<len(r)-1 else str(p)+'\n'
#            f.write(s)

#    cl=[]
#    for cluster in rtPeak:
#        list_=[];datapoints={}
#        for i, p in enumerate(cluster):
#            correspondingPeaks=set()
#            correspondingPeaks.add(p)
#            for j in xrange(i+1, len(cluster)):
#                #put caching on that to avoid recalculation each time of the datapoints
#                try:
#                    r=r_coef(list(datapoints[p]), list(datapoints[cluster[j]]))
#                except KeyError:
#                    y, y_= None, None
#                    try:
#                        y=datapoints[p]
#                    except KeyError:
#                        x, y= massExtractionBisectAlgo(p.sample,p.mass(), ppm)
#                        datapoints[p]=y
#                    try:
#                        y_=datapoints[cluster[j]]
#                    except KeyError:
#                        x, y_= massExtractionBisectAlgo(cluster[j].sample, cluster[j].mass(), ppm)
#                        datapoints[cluster[j]]=y_
#                    r=r_coef(y, y_)
#                if r >= threshold:
#                    correspondingPeaks.add(cluster[j])
#            list_.append(correspondingPeaks)
#        for i, p in enumerate(list_):
#            for j in xrange(i+1, len(list_)):
#                if list_[j].issubset(p):
#                    continue
#                else:
#                    cl.append(MSPeakList(list(p)))
#merging step again
#    print "cluster length, same without replicates",len(cl), len(set(map(set, [x for x in cl])))
#    with open('test2.txt', 'w') as f:
#        for r in cl:
#            s=""
#            for i, p in enumerate(r):
#                s+=str(p)+';' if i<len(r)-1 else str(p)+'\n'
#            f.write(s)
    print 'len RTpeak', len(rtPeak)
    print("Creating possible M0...")
    #Cython code
    # The result is consumed below as one dict per RT group, in the same
    # order as rtPeak, keyed by mass.
    finalList = massGenPerGroup(rtPeak, adducts_to_check, ppm)
    print("Mapping of calculated mass on peaklist...")
    #4,see if one matches with peak in the raw peaklist
    goodPeak = []  #list will contain good peak per rtCluster
    for i, dic in enumerate(finalList):
        matchingMass = defaultdict(list)
        for mass in dic.iterkeys():
            # Tolerance is mass * ppm in 'HighRes' mode, otherwise a fixed 1.
            p = rtPeak[i].peaksInMZRange(
                mass, deltam=mass * ppm if mode == 'HighRes' else
                1.)  #rtPeak[i] not necessarily sorted warning
            # NOTE(review): no body; presumably `continue`, otherwise the
            # [0] below would hit an empty result -- confirm.
            if not p:
            # Keep the candidate closest in mass to the generated value.
            peak = sorted(p, key=lambda x: abs(mass - x.mass()))[0]
            #if peak not in matchingMass.keys():#may avoid this to see if one peak appears several times !then do 'set'
            #    matchingMass[peak]=[]
            matchingMass[peak] += dic[mass]
        # NOTE(review): matchingMass is never stored, so goodPeak stays empty
        # in the visible code although it is iterated below; presumably
        # goodPeak.append(matchingMass) belongs here -- confirm.

    #start new stuffs here
    print("Merging informations...")
    adds = MSPeakList()  #object sor storing adducts found
    newGoodPeaks = defaultdict(list)  #{}
    # Merge the per-group mappings into a single peak -> matches mapping.
    for peaksInOneRtGroup in goodPeak:
        for peak in peaksInOneRtGroup.iterkeys():
            newGoodPeaks[peak] += peaksInOneRtGroup[peak]
    for p in newGoodPeaks.iterkeys():
        # set() removes duplicated matches before building the cluster.
        p.fragCluster = MSClusterList(list(set(newGoodPeaks[p])))
        # NOTE(review): no body; `parentPeak` is read in the conflict
        # resolution below, so presumably it was populated here -- confirm.
        for f in p.fragCluster:
        adds += p.fragCluster
    finalPeaks = MSPeakList(newGoodPeaks.keys())

    print("Resolving conflicts if any...")

    #removing peak that appears many times that is to say in different clusters
    def clusterComparison(
            list_):  #receive a list of peak with clusters identified
        # NOTE(review): the next two lines read like docstring text whose
        # triple-quote delimiters are missing -- confirm.
        return the best peak
        WARNING: p_ydata and p_.y_data are None
        # Ascending sort by fragCluster size, so the last entry is the
        # largest cluster.
        sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
        longest = len(sortedList[-1].fragCluster)
        sameSizePeaks = MSPeakList()

        for p in sortedList:
            # NOTE(review): no body; the commented-out copy of this helper
            # inside clusteringBASIC has sameSizePeaks.append(p) here.
            if len(p.fragCluster) == longest:

        if len(sameSizePeaks) == 1:
            return sameSizePeaks[0]
        # The loop that would fill `corr` is commented out, so it stays all
        # zeros; assuming max_f returns the maximum, the first tied peak is
        # returned -- TODO confirm max_f.
        corr = np.array([0.] * len(sameSizePeaks))
        #for i, p in enumerate(sameSizePeaks):
        #    for p_ in p.fragCluster:
        #        corr[i] += r_coef(p_.y_data, p.y_data)
        m = max_f(corr)
        return sameSizePeaks[np.where(corr == m)[0][0]]

    if resolveConflicts:
        for add in set(adds):
            # NOTE(review): no body; the commented-out copy in
            # clusteringBASIC has `continue` here.
            if len(add.parentPeak) <= 1:
                #print "%s belong to several fragCluster"%str(add)
            #print "%s belong to several fragCluster"%str(add)
            goodParent = clusterComparison(add.parentPeak)
            #if goodParent is not None:
            #    add.parentPeak = [goodParent]

            for parent in add.parentPeak:
                if parent != goodParent:
                    # NOTE(review): `except` without a `try`; the
                    # commented-out copy in clusteringBASIC has
                    # `try: parent.fragCluster.remove(add)` before it.
                    except ValueError:
                        print "Error removing %s from fragCluster of %s" % (
                            str(add), str(parent))
            # NOTE(review): assigns an empty list as written; the
            # commented-out copy in clusteringBASIC assigns [goodParent].
            add.parentPeak = [
            ]  #the same of constructing a list 'toRemove then remove
            #print "after removing len add.parentPeak", len(add.parentPeak)

    #make the annotation
    for peak in finalPeaks.ipeaks():
        for f in peak.fragCluster:
            #results = makeAnnotations(adducts_to_check, adducts, f.mass(), ppm)
            # Each key of `adducts` is indexed as annot[0] and annot[1]; the
            # parent mass expected from member f is f.mass()/annot[1] + annot[0].
            for annot in adducts.iterkeys():
                p = f.mass() / annot[1] + annot[0]
                diff = peak.mass() * ppm if mode == 'HighRes' else 1
                # Annotate when the parent mass lies strictly inside the
                # tolerance window around the expected value.
                if peak.mass() > p - diff and peak.mass() < p + diff:
                    f.annotation[annot] = adducts[annot]
    finalPeaks = checkingSons(finalPeaks)
    #5,second filter, correlation on the isotopic cluster between samples
    #    if useCorrelation:
    #        print "Calculating correlation between samples..."
    #        interSamplesCorr(spl, **kwargs)
    #        print  "Calculating correlation intra sample..."
    #        intraSampleCorr(spl)
    #    #6 merging
    print "Merging interesting peaks"
    for peak in peaks_with_iso.ipeaks(
    ):  #wring merging must take out those which allow to construct this peak
        if peak not in finalPeaks and peak not in adds:  #matching_peaks:
            finalPeaks.append(peak)  #matching_peaks to
    # NOTE(review): the only statement in this branch is a bare string
    # literal, which has no effect; presumably a print/raise was lost.
    if not finalPeaks:
            "no cluster found, please increase the ppm, or rt drift parameters"
    print("finished, time elapsed:", time.clock() - t)
    # NOTE(review): parentheses are unbalanced and the first argument is
    # missing; presumably MSPeakList(sorted(finalPeaks, key=...)). The key
    # also uses `x.mass` without calling it, unlike `x.mass()` elsewhere in
    # this function -- confirm.
    return MSPeakList(
               key=lambda x: x.mass)), adds  #checkingSons(finalPeaks), adds
# Example #2
def clusteringBASIC(peaks, adds, **k):
    # For every peak, look up the peak found at (peak m/z + adduct shift) for
    # each key of `adds`, and record the results on the peak as `fragCluster`.
    #
    # Keyword options read below: 'rtError' (default 6), 'ppm' (when absent,
    # peaks[0].sample.ppm / 1e6, with 10/1e6 as the AttributeError fallback)
    # and 'resolveConflicts' (default False; only used by the commented-out
    # code further down).
    #
    # Returns the tuple (peaks, adductsFound); `peaks` is the input object,
    # whose items get `fragCluster` assigned in place.
    #
    # Python 2 code: print statements.
    #
    # NOTE(review): several statements of this body appear to have been lost
    # (block headers without a body, an `except` without its `try`). Each
    # spot is flagged below; restore them from the upstream source.

    # NOTE(review): no body; presumably an early return for empty input --
    # the returned value cannot be determined from here.
    if not peaks:
    # NOTE(review): time.clock() was removed in Python 3.8; only usable on
    # the Python 2 interpreter this file targets.
    t = time.clock()
    # NOTE(review): errorRt is never used in the visible code; presumably it
    # was passed to peaksInMZRTRange() below -- confirm.
    errorRt = k.get('rtError', 6)
    #ppm = float(kwargs.get('ppm'))/10**6
    ppm = k.get('ppm')
    # NOTE(review): the `try:` line matching the `except` below is missing.
    if ppm is None:
            ppm = peaks[0].sample.ppm / 1e6
        except AttributeError:
            print "No value found for ppm setting to 10/1E6"
            ppm = 10. / 1e6
    #mode = k.get('mode', 'HighRes')
    resolveConflicts = k.get('resolveConflicts', False)
    addsToCheck = np.array(adds.keys())

    # NOTE(review): nothing is appended to adductsFound in the visible code,
    # so it is returned empty as written -- confirm.
    adductsFound = MSPeakList()
    for i, p in enumerate(peaks):
        # NOTE(review): nothing is appended to `a` in the visible code, so
        # fragCluster ends up empty as written -- confirm.
        a = MSClusterList()
        for v in addsToCheck:
            # v[0] is added to the peak m/z to get the mass to search for.
            m = p.mz + v[0]
            # Mass tolerance is 2 * ppm * m. NOTE(review): given the method
            # name, retention-time arguments may be missing between the two
            # lines of this call -- confirm.
            match = peaks.peaksInMZRTRange(m,
                                           deltam=2 * ppm * m)
            # NOTE(review): no body; presumably `continue` -- confirm.
            if match is None or not match:
            #take the closest in mass
            goodP = sorted(match, key=lambda x: abs(x.mz - (p.mz + v[0])))[0]
            #if goodP in set(adductsFound):
            #    if resolveConflicts:
            #        pass
            # NOTE(review): no body; presumably `continue`, so a peak is not
            # recorded as its own adduct -- confirm.
            if goodP is p:
            # Here parentPeak is a single peak, whereas the commented-out
            # block below treats it as a list (len(), iteration).
            goodP.parentPeak = p
        p.fragCluster = MSPeakList(set(a))  #prevent from duplicates

#    def clusterComparison(list_):#receive a list of peak with clusters identified
#        """
#        return the best peak
#        WARNING: p_ydata and p_.y_data are None
#        TODO:
#        """
#        sortedList = sorted(list_, key=lambda x: len(x.fragCluster))
#        longest=len(sortedList[-1].fragCluster)
#        sameSizePeaks=MSPeakList()
#        for p in sortedList:
#            if len(p.fragCluster) == longest:
#                sameSizePeaks.append(p)
#        if len(sameSizePeaks) == 1:
#            return sameSizePeaks[0]
#        corr=np.array([0.] * len(sameSizePeaks))
#        #for i, p in enumerate(sameSizePeaks):
#        #    for p_ in p.fragCluster:
#        #        corr[i] += r_coef(p_.y_data, p.y_data)
#        m=max_f(corr)
#        return sameSizePeaks[np.where(corr == m)[0][0]]
#        if resolveConflicts:
#            for add in set(adductsFound):
#                if len(add.parentPeak) <= 1:
#                    #print "%s belong to several fragCluster"%str(add)
#                    continue
#                #print "%s belong to several fragCluster"%str(add)
#                goodParent=clusterComparison(add.parentPeak)
#                #if goodParent is not None:
#                #    add.parentPeak = [goodParent]
#                for parent in add.parentPeak:
#                    if parent != goodParent:
#                        try:
#                            parent.fragCluster.remove(add)
#                        except ValueError:
#                            print "Error removing %s from fragCluster of %s"%(str(add), str(parent))
#                add.parentPeak = [goodParent] #the same of constructing a list 'toRemove then remove
#                #print "after removing len add.parentPeak", len(add.parentPeak)
    print "TiemElapsed: %s" % str(time.clock() - t)
    return peaks, adductsFound
# Example #3
def isotopicPeakListFinder(peaks, isomasses, **kwargs):
    # For each peak, search for peaks located at (peak mass + isotope shift)
    # within the retention-time tolerance and collect them as the peak's
    # `isoCluster`.
    #
    # `isomasses` items are indexed with [0] to get the mass shift, and are
    # visited in ascending order of that shift.
    #
    # Keyword options read below: 'rtError' (default 6), 'decreaseOrder'
    # (default True) and 'mode' (default 'Highres'). ppm is read from
    # peaks[0].sample.ppm.
    #
    # Returns (peaks_with_iso + peaks_without_iso, list_iso); list_iso is
    # created as a plain set.
    #
    # Python 2 code: print statements.
    #
    # NOTE(review): several statements of this body appear to have been lost
    # (block headers without a body). Each spot is flagged below; restore
    # them from the upstream source.
    # NOTE(review): the next ten lines read like docstring text whose
    # triple-quote delimiters (and section headings) are missing -- confirm.
    assign an isotopic cluster for each peak, and try to find an idms
     we may use a system like the CAMERA algorithm to see...
        list of peak must an obj.MSPeakList object
        clusterLength = 6  never go to six in LOW_RES
                        size expected of an isotopic cluster
        rtError: maximum drift of the retention time
        decreaseOrder: allow or not allow that the successive peak of the isotopic cluster
                        intensity are going down, can be confusing for finding idms
        two MSPeakList, the first one corresponding to the peaks with an isotopic cluster
        and the other one all peaks belonging to an isotopic cluster

    #unpacking parameters
    print "Isotopic cluster calculation..."

    # NOTE(review): the np.float / np.int aliases were deprecated in NumPy
    # 1.20 and later removed; they behave like the builtins float / int.
    rtError = np.float(kwargs.get('rtError', 6))
    # NOTE(review): indexes peaks[0] unconditionally -- presumably fails on
    # an empty peak list; verify against callers.
    ppm = np.float(peaks[0].sample.ppm / 1e6)
    MAX_GAP_ALLOWED = np.int(len(isomasses))
    decreaseOrder = kwargs.get('decreaseOrder',
                               True)  #we use the less restrictive...
    # NOTE(review): the default is 'Highres' but the comparison below tests
    # for 'HighRes'; with the default, the fixed tolerance 1. is always
    # used. Looks like a case typo -- confirm.
    mode = kwargs.get('mode', 'Highres')
    #sort isomasses
    #isomasses = sorted(isomasses, key=lambda x:x[0])

    # NOTE(review): nothing is added to peaks_with_iso, peaks_without_iso or
    # list_iso in the visible code, so all three are returned empty as
    # written -- confirm.
    peaks_with_iso = MSPeakList()
    peaks_without_iso = MSPeakList(
    )  #peaks without isotopic cluster but which does not have a isotopic cluster
    list_iso = set()  #MSPeakList()

    # NOTE(review): time.clock() was removed in Python 3.8; only usable on
    # the Python 2 interpreter this file targets.
    t = time.clock()

    for peak in peaks.ipeaks():  #iterating over peaks

        if peak in list_iso:
            continue  #avoid to calculate for every peaks

        isoCluster = MSClusterList()
        gap = 0
        #isos = resolutionAdjustment(isomasses, peak.mass()*ppm) if mode=='HighRes' else isomasses
        # The sort is repeated for every peak; `i` is not used.
        for i, isomass in enumerate(sorted(isomasses, key=lambda x: x[0])):
            #pic = _getMatchingPeaks(peaks, peak, isomass[0], ppm, rtError)

            mass = isomass[0]
            massToCheck = peak.mass() + mass

            # NOTE(review): massToCheck is not passed to the lookup;
            # presumably it was the first positional argument of this call
            # -- confirm.
            p = peaks.peaksInMZRange(
                deltam=ppm *
                massToCheck if mode == 'HighRes' else 1.)  #deltart
            matchingRtPeaks = MSPeakList(
            )  #will contain all matching peak in rt
            for pk in p.ipeaks():
                if pk != peak:
                    # NOTE(review): no body; presumably pk was added to
                    # matchingRtPeaks here -- confirm.
                    if abs(peak.rt - pk.rt) <= rtError:

            if matchingRtPeaks:

                # NOTE(review): candidates are ranked by distance to
                # peak.mass(), not to massToCheck -- confirm this is
                # intended.
                pic = sorted(matchingRtPeaks,
                             key=lambda pics: abs(pics.mass() - peak.mass()))[
                                 0]  #take the closest in mass
                if pic is not None:
                    if decreaseOrder:  #we want peak area inferior a peak
                        #if isoCluster:
                        # Compare with the last cluster member, or with the
                        # peak itself while the cluster is still empty.
                        areaToCompare = isoCluster[
                            -1].area if isoCluster else peak.area
                        # NOTE(review): no body; the action taken when the
                        # candidate area is larger cannot be determined
                        # from here.
                        if areaToCompare < pic.area:  #idms found ???

                    # NOTE(review): no body; presumably pic was added to
                    # isoCluster and list_iso here -- confirm.
                    if pic not in list_iso:  #pic not in isoCluster and
                # NOTE(review): as laid out, the gap counter increases when
                # a match WAS found; presumably these two lines belonged to
                # a lost `else:` branch -- confirm.
                gap += 1
                # NOTE(review): no body; presumably `break` -- confirm.
                if gap >= MAX_GAP_ALLOWED:

        # #set parent for all peaks found
        if isoCluster:
            # NOTE(review): no body; going by the comment above, presumably
            # a parent attribute was set on each member -- confirm.
            for pics in isoCluster:
            peak.isoCluster = isoCluster

#    for p in peaks.ipeaks():
#        if p not in peaks_with_iso and p not in list_iso:
#            peaks_without_iso.addPeak(p)

    print time.clock() - t
    print "peaks with isotopes: ", len(peaks_with_iso)
    print "list isotopes: ", len(list_iso)
    print "peaks without isotopes: ", len(peaks_without_iso)
    return peaks_with_iso + peaks_without_iso, list_iso