Code Example #1
File: ImportLC.py  Project: CASC4DE/EUFT_Spike
def Import_and_Process_LC(folder,
                          outfile="LC-MS.msh5",
                          compress=False,
                          comp_level=3.0,
                          downsample=True,
                          dparameters=None):
    """
    Entry point to import sets of LC-MS spectra
    processing is done on the fly
    It creates and returns a HDF5 file containing the data-set
    
    compression is active if (compress=True).
    comp_level is the ratio (in x sigma) under which values are set to 0.0
    downsample is applied if (downsample=True).
    These two parameters are efficient but it takes time.

    dparameters if present, is a dictionnary copied into the final file as json 
    """
    from spike.File import Solarix, Apex
    #    from spike.File.Solarix import locate_acquisition, read_param
    from spike.NPKData import TimeAxis, copyaxes
    from spike.File import HDF5File as hf
    from spike.util import progressbar as pg
    from spike.util import widgets
    from spike.FTICR import FTICRData
    for _importer in (Solarix, Apex):
        try:
            parfilename = _importer.locate_acquisition(folder)
            params = _importer.read_param(parfilename)
            sizeF2 = int(params["TD"])
            importer = _importer
            break
        except Exception:
            continue  # not this importer's format - try the next one
    else:  # for/else: runs only if no importer matched (the original mis-indented this under try, so the error never fired)
        raise Exception("could not import data-set - unrecognized format")
    # get chromatogram
    minu, tic, maxpk = import_scan(os.path.join(folder, "scan.xml"))
    # Import parameters : size in F1 and F2
    sizeF1 = len(minu)
    sizeF2 = int(params["TD"])
    if os.path.isfile(os.path.join(folder, "ser")):
        fname = os.path.join(folder, "ser")
    else:
        raise Exception(
            "You are dealing with 1D data, you should use Import_1D")
    #size, specwidth,  offset, left_point, highmass, calibA, calibB, calibC, lowfreq, highfreq
    data = FTICRData(dim=2)  # create dummy LCMS
    data.axis1 = TimeAxis(size=sizeF1,
                          tabval=np.array(minu),
                          importunit="min",
                          currentunit='min')
    data.axis2.size = 1 * sizeF2  # The processing below might change the size, so we anticipate here !
    data.axis2.specwidth = float(params["SW_h"])
    found = False  # search for excitation bandwidth
    try:
        data.axis2.lowfreq, data.axis2.highfreq = read_ExciteSweep(
            locate_ExciteSweep(folder))
        found = True
    except:
        pass
    if not found:
        try:
            data.axis2.highfreq = float(params["EXC_Freq_High"])
        except:
            data.axis2.highfreq = data.axis2.calibA / float(
                params["EXC_low"])  # on Apex version
        try:
            data.axis2.lowfreq = float(params["EXC_Freq_Low"])
        except:
            data.axis2.lowfreq = data.axis2.calibA / float(
                params["EXC_hi"])  # on Apex version

    data.axis2.highmass = float(params["MW_high"])
    data.axis2.left_point = 0
    data.axis2.offset = 0.0
    data.axis2.calibA = float(params["ML1"])
    data.axis2.calibB = float(params["ML2"])
    data.axis2.calibC = float(params["ML3"])
    if not math.isclose(data.axis2.calibC, 0.0):
        print('Using 3-parameter calibration - warning: calibB is -ML2')
        data.axis2.calibB *= -1

    data.params = params  # add the parameters to the data-set
    HF = hf.HDF5File(outfile, "w")
    if compress:
        HF.set_compression(True)
    HF.create_from_template(data, group='resol1')
    HF.store_internal_object(params,
                             h5name='params')  # store params in the file
    # then store files xx.methods and scan.xml
    HF.store_internal_file(parfilename)
    HF.store_internal_file(os.path.join(folder, "scan.xml"))
    try:
        HF.store_internal_file(locate_ExciteSweep(folder))
    except:
        print('ExciteSweep file not stored')
    data.hdf5file = HF  # I need a link back to the file in order to close it

    # Start processing - first computes sizes and sub-datasets
    print(data)
    datalist = []      # remembers all downsampled datasets
    maxvalues = [0.0]  # remembers max values in all datasets - main and downsampled
    if downsample:
        allsizes = comp_sizes(data.size1, data.size2)
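        # comp_sizes (defined elsewhere in ImportLC.py, not shown in this
        # excerpt) returns the list of (si1, si2) shapes of the successively
        # downsampled copies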
        for i, (si1, si2) in enumerate(allsizes):
            datai = FTICRData(dim=2)
            copyaxes(data, datai)
            datai.axis1.size = si1
            datai.axis2.size = si2
            HF.create_from_template(datai, group='resol%d' % (i + 2))
            datalist.append(datai)
            maxvalues.append(0.0)

    # Then go through input file
    # Apex/Solarix ser files store int32, but array type-code sizes are platform dependent
    if sys.maxsize == 2**31 - 1:  # 32-bit python: 'l' (C long) is 4 bytes
        flag = 'l'
    else:  # 64-bit python: 'l' is usually 8 bytes, so use 'i' (C int, 4 bytes)
        flag = 'i'
    spectre = FTICRData(shape=(sizeF2, ))  # to handle FT
    projection = FTICRData(buffer=np.zeros(sizeF2))  # to accumulate projection
    projection.axis1 = data.axis2.copy()
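    # projection accumulates the point-wise maximum over all processed spectra
    # (updated with np.maximum in the main loop) and is stored as 'projectionF2'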
    Impwidgets = [
        'Importing: ',
        widgets.Percentage(), ' ',
        widgets.Bar(marker='-', left='[', right=']'),
        widgets.ETA()
    ]
    pbar = pg.ProgressBar(widgets=Impwidgets, maxval=sizeF1,
                          fd=sys.stdout).start()

    with open(fname, "rb") as f:
        ipacket = 0
        szpacket = 10
        packet = np.zeros(
            (szpacket,
             sizeF2))  # store by packet to increase compression speed
        for i1 in range(sizeF1):
            tbuf = f.read(4 * sizeF2)
            if len(tbuf) != 4 * sizeF2:
                break
            abuf = np.array(array.array(flag, tbuf), dtype=float)
            # processing
            spectre.set_buffer(abuf)
            spectre.adapt_size()
            spectre.hamming().zf(2).rfft().modulus()  # apodise, zero-fill x2, FT, modulus - yields sizeF2 points
            mu, sigma = spectre.robust_stats(iterations=5)
            spectre.buffer -= mu
            if compress:
                spectre.zeroing(sigma * comp_level).eroding()
            packet[ipacket, :] = spectre.buffer[:]  # store into packet
            np.maximum(projection.buffer,
                       spectre.buffer,
                       out=projection.buffer)  # projection
            if (ipacket + 1) % szpacket == 0:  # and dump every szpacket
                maxvalues[0] = max(maxvalues[0],
                                   abs(packet.max()))  # compute max
                data.buffer[i1 - (szpacket - 1):i1 +
                            1, :] = packet[:, :]  # and copy
                packet[:, :] = 0.0
                ipacket = 0
            else:
                ipacket += 1
            # now downsample
            for idt, datai in enumerate(datalist):
                if i1 % (sizeF1 // datai.size1) == 0:  # modulo the size ratio
                    ii1 = (i1 * datai.size1) // sizeF1
                    spectre.set_buffer(abuf)
                    spectre.adapt_size()
                    spectre.chsize(
                        datai.size2).hamming().zf(2).rfft().modulus()
                    mu, sigma = spectre.robust_stats(iterations=5)
                    spectre.buffer -= mu
                    if compress:
                        spectre.zeroing(sigma * comp_level).eroding()
                    maxvalues[idt + 1] = max(
                        maxvalues[idt + 1],
                        spectre.absmax)  # compute max (0 is full spectrum)
                    datai.buffer[ii1, :] = spectre.buffer[:]

            pbar.update(i1)
        # flush the remaining (incomplete) packet - note the +1 so the last row is included
        if ipacket > 0:
            maxvalues[0] = max(maxvalues[0], abs(packet[:ipacket, :].max()))
            data.buffer[i1 - ipacket + 1:i1 + 1, :] = packet[:ipacket, :]
    # store maxvalues in the file
    HF.store_internal_object(maxvalues, h5name='maxvalues')
    if dparameters is not None:
        HF.store_internal_object(dparameters, h5name='import_parameters')

    # then write projection as 'projectionF2'
    proj = FTICRData(dim=1)
    proj.axis1 = data.axis2.copy()
    HF.create_from_template(proj, group='projectionF2')
    proj.buffer[:] = projection.buffer[:]
    pbar.finish()
    HF.flush()
    return data
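For context, a minimal usage sketch (not part of the original file). The folder name is hypothetical; per the code above, the directory must contain a ser file, a scan.xml chromatogram, and a parameter file recognized by the Solarix or Apex importer. The final close() is an assumption: the in-code comment ("I need a link back to the file in order to close it") suggests the HDF5File wrapper exposes one.

data = Import_and_Process_LC("my_lcms_run.d",   # hypothetical Bruker acquisition folder
                             outfile="my_lcms_run.msh5",
                             compress=True,     # zero values below comp_level * sigma
                             comp_level=3.0,
                             downsample=True,   # also store reduced-resolution copies
                             dparameters={"operator": "me"})  # stored as json
print(data)            # the full-resolution FTICRData (group 'resol1' in the .msh5 file)
data.hdf5file.close()  # assumed close() - the function keeps this link for that purpose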
Code Example #2
def Import_and_Process_LC(folder,
                          nProc=1,
                          outfile="LC-MS.msh5",
                          compress=False,
                          comp_level=3.0,
                          downsample=True,
                          dparameters=None):
    """
    Entry point to import sets of LC-MS spectra
    processing is done on the fly
    It creates and returns a HDF5 file containing the data-set
    
    compression is active if (compress=True).
    comp_level is the ratio (in x sigma) under which values are set to 0.0
    downsample is applied if (downsample=True).
    These two parameters are efficient but it takes time.

    dparameters if present, is a dictionnary copied into the final file as json 
    """
    import multiprocessing as mp
    from spike.File import Solarix, Apex
    #    from spike.File.Solarix import locate_acquisition, read_param
    from spike.NPKData import TimeAxis, copyaxes
    from spike.File import HDF5File as hf
    from spike.util import progressbar as pg
    from spike.util import widgets
    from spike.FTICR import FTICRData

    if nProc > 1:
        print("** running on %d processors" % nProc)
        Pool = mp.Pool(nProc)

    for _importer in (Solarix, Apex):
        try:
            parfilename = _importer.locate_acquisition(folder)
            params = _importer.read_param(parfilename)
            sizeF2 = int(params["TD"])
            importer = _importer
            break
        except Exception:
            continue  # not this importer's format - try the next one
    else:  # for/else: runs only if no importer matched
        raise Exception("could not import data-set - unrecognized format")
    # get chromatogram
    minu, tic, maxpk = import_scan(os.path.join(folder, "scan.xml"))
    # Import parameters : size in F1 and F2
    sizeF1 = len(minu)
    sizeF2 = int(params["TD"])
    if os.path.isfile(os.path.join(folder, "ser")):
        fname = os.path.join(folder, "ser")
    else:
        raise Exception(
            "You are dealing with 1D data, you should use Import_1D")
    #size, specwidth,  offset, left_point, highmass, calibA, calibB, calibC, lowfreq, highfreq
    data = FTICRData(dim=2)  # create dummy LCMS
    data.axis1 = TimeAxis(size=sizeF1,
                          tabval=np.array(minu),
                          importunit="min",
                          currentunit='min')
    data.axis2.size = 1 * sizeF2  # The processing below might change the size, so we anticipate here !
    data.axis2.specwidth = float(params["SW_h"])
    found = False  # search for excitation bandwidth
    try:
        data.axis2.lowfreq, data.axis2.highfreq = read_ExciteSweep(
            locate_ExciteSweep(folder))
        found = True
    except:
        pass
    if not found:
        try:
            data.axis2.highfreq = float(params["EXC_Freq_High"])
        except:
            data.axis2.highfreq = data.axis2.calibA / float(
                params["EXC_low"])  # on Apex version
        try:
            data.axis2.lowfreq = float(params["EXC_Freq_Low"])
        except:
            data.axis2.lowfreq = data.axis2.calibA / float(
                params["EXC_hi"])  # on Apex version

    data.axis2.highmass = float(params["MW_high"])
    data.axis2.left_point = 0
    data.axis2.offset = 0.0
    data.axis2.calibA = float(params["ML1"])
    data.axis2.calibB = float(params["ML2"])
    data.axis2.calibC = float(params["ML3"])
    if not math.isclose(data.axis2.calibC, 0.0):
        print('Using 3-parameter calibration - warning: calibB is -ML2')
        data.axis2.calibB *= -1

    data.params = params  # add the parameters to the data-set
    HF = hf.HDF5File(outfile, "w")
    if compress:
        HF.set_compression(True)
    HF.create_from_template(data, group='resol1')
    HF.store_internal_object(params,
                             h5name='params')  # store params in the file
    # then store files xx.methods and scan.xml
    HF.store_internal_file(parfilename)
    HF.store_internal_file(os.path.join(folder, "scan.xml"))
    try:
        HF.store_internal_file(locate_ExciteSweep(folder))
    except:
        print('ExciteSweep file not found')
    data.hdf5file = HF  # I need a link back to the file in order to close it

    # Start processing - first computes sizes and sub-datasets
    print(data)
    datalist = []      # remembers all downsampled datasets
    maxvalues = [0.0]  # remembers max values in all datasets - main and downsampled
    allsizes = []      # downsampled shapes - must exist even if downsample is False (it is passed to iterargF2 below)
    if downsample:
        allsizes = comp_sizes(data.size1, data.size2)
        for i, (si1, si2) in enumerate(allsizes):
            datai = FTICRData(dim=2)
            copyaxes(data, datai)
            datai.axis1.size = si1
            datai.axis2.size = si2
            HF.create_from_template(datai, group='resol%d' % (i + 2))
            datalist.append(datai)
            maxvalues.append(0.0)

    # Then go through input file
    projection = FTICRData(buffer=np.zeros(sizeF2))  # to accumulate projection
    projection.axis1 = data.axis2.copy()
    Impwidgets = [
        'Importing: ',
        widgets.Percentage(), ' ',
        widgets.Bar(marker='-', left='[', right=']'),
        widgets.ETA()
    ]
    pbar = pg.ProgressBar(widgets=Impwidgets, maxval=sizeF1,
                          fd=sys.stdout).start()

    with open(fname, "rb") as f:
        ipacket = 0
        szpacket = 11
        packet = np.zeros(
            (szpacket,
             sizeF2))  # store by packet to increase compression speed

        xarg = iterargF2(f, sizeF1, sizeF2, compress, comp_level,
                         allsizes)  # construct iterator for main loop
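        # iterargF2 and processF2row are module-level helpers defined elsewhere
        # in ImportLC.py and not shown in this excerpt; a hedged reconstruction
        # is sketched after this listing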

        if nProc > 1:
            res = Pool.imap(processF2row,
                            xarg)  # multiproc processing using Pool
        else:
            res = map(processF2row, xarg)  # plain single proc processing
        for i1, spectres in enumerate(res):  # and get results
            spectre = spectres.pop(0)
            packet[ipacket, :] = spectre.buffer[:]  # store into packet
            np.maximum(projection.buffer,
                       spectre.buffer,
                       out=projection.buffer)  # projection
            if (ipacket + 1) % szpacket == 0:  # and dump every szpacket
                maxvalues[0] = max(maxvalues[0],
                                   abs(packet.max()))  # compute max
                data.buffer[i1 - (szpacket - 1):i1 +
                            1, :] = packet[:, :]  # and copy
                packet[:, :] = 0.0
                ipacket = 0
            else:
                ipacket += 1
            # now downsample
            for idt, spectre in enumerate(spectres):
                datai = datalist[idt]
                if i1 % (sizeF1 // datai.size1) == 0:  # modulo the size ratio
                    ii1 = (i1 * datai.size1) // sizeF1
                    maxvalues[idt + 1] = max(
                        maxvalues[idt + 1],
                        spectre.absmax)  # compute max (0 is full spectrum)
                    datai.buffer[ii1, :] = spectre.buffer[:]

            pbar.update(i1 + 1)
            last = i1
        # flush the remaining (incomplete) packet - note the +1 so the last row is included
        if ipacket > 0:
            maxvalues[0] = max(maxvalues[0], abs(packet[:ipacket, :].max()))
            data.buffer[last - ipacket + 1:last + 1, :] = packet[:ipacket, :]
    pbar.finish()

    # then write projection as 'projectionF2'
    print('writing projections')
    proj = FTICRData(dim=1)
    proj.axis1 = data.axis2.copy()
    HF.create_from_template(proj, group='projectionF2')
    proj.buffer[:] = projection.buffer[:]

    # store maxvalues in the file
    print('writing max abs value')
    HF.store_internal_object(maxvalues, h5name='maxvalues')

    print('writing parameters')
    if dparameters is not None:
        HF.store_internal_object(dparameters, h5name='import_parameters')

    # and close
    HF.flush()
    if nProc > 1:
        Pool.close()  # shut down the multiprocessing workers
        Pool.join()
    return data
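Example #2 assumes two module-level helpers, iterargF2 and processF2row, that are not shown in this excerpt. The sketch below reconstructs them from the inline loop of example #1; the argument-tuple layout and the exact signatures are assumptions, not the project's actual code. processF2row must stay at module top level so multiprocessing can pickle it.

import array, sys
import numpy as np
from spike.FTICR import FTICRData

def iterargF2(f, sizeF1, sizeF2, compress, comp_level, allsizes):
    "read raw rows in the parent process and yield one picklable work item per row"
    flag = 'l' if sys.maxsize == 2**31 - 1 else 'i'  # int32 type code, as in example #1
    for _i1 in range(sizeF1):
        tbuf = f.read(4 * sizeF2)
        if len(tbuf) != 4 * sizeF2:
            break  # truncated ser file - stop early
        abuf = np.array(array.array(flag, tbuf), dtype=float)
        yield (abuf, compress, comp_level, allsizes)

def processF2row(args):
    "FT-process one raw transient; returns [full spectrum] + one spectrum per downsampled size"
    abuf, compress, comp_level, allsizes = args
    sizeF2 = len(abuf)
    results = []
    for si2 in [sizeF2] + [s2 for (_s1, s2) in allsizes]:
        spectre = FTICRData(shape=(sizeF2,))
        spectre.set_buffer(abuf)
        spectre.adapt_size()
        if si2 != sizeF2:
            spectre.chsize(si2)  # truncate before FT for the downsampled copies
        spectre.hamming().zf(2).rfft().modulus()  # same chain as example #1
        mu, sigma = spectre.robust_stats(iterations=5)
        spectre.buffer -= mu
        if compress:
            spectre.zeroing(sigma * comp_level).eroding()
        results.append(spectre)
    return results

Note that this simplified version computes every downsampled spectrum for every row, while the caller only keeps one row out of every sizeF1 // datai.size1; the real helpers may skip that redundant work.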