def get_average_profile(dbfilepath, dataid=''):
    """Computes the mean intensity profile across all datasets in the h5 database file."""
    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[])
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, 'sizesp')
    sp_mean = np.zeros((sizesp[0], sizesp[1]))
    crt = mh5.load_dataset(dbfilepath, 'crt')
    #crt = crt / 60

    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, datasetid + dataid)
            sp_mean = sp_mean + sp
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ': Failed to read in')
            printlog(inst)

    # keep only the datasets that were successfully read and average over them
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    sp_mean = sp_mean / len(dataidx)

    return sp_mean, crt, datasets
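# Hedged usage sketch (not part of the original module): how get_average_profile
# might be called. The database file name and the '/sp' sub-path passed as dataid
# are hypothetical placeholders.
def _example_get_average_profile():
    result = get_average_profile('example_experiment.h5', dataid='/sp')
    if result is not None:
        sp_mean, crt, datasets = result
        printlog('Averaged %s datasets over %s retention-time points'
                 % (len(datasets), len(crt)))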
def do_mzalignment(dbfilepath, method='binning',
                   params={'binshift': 0.3, 'binsize': 1, 'units': 'Da', 'h5writepath': '/proc'},
                   istrain=1):
    """
    Performs intra-sample correction of molecular m/z drifts between scans of individual samples.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        method: the method of choice for intra-sample m/z drift correction ('binning' by default)
        params: dictionary of parameter arguments for the correction method
                (e.g. ``{'binshift': 0.3, 'binsize': 1, 'units': 'Da'}`` for binning)
        istrain: 1 to fit and save the alignment object, 0 to re-use a previously saved one

    The processed intensity matrices are saved back into the h5 database file.
    """
    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5='/raw')
    if not dataset_names:
        return

    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])

    if istrain == 1:
        with h5py.File(dbfilepath, 'r') as h5file:
            cmzrange = mh5.load_dataset(h5file, '/raw/cmzrange')
            crtrange = mh5.load_dataset(h5file, '/raw/crtrange')

        # delete unnecessary variables and save into hdf5 database file
        if method == 'binning':
            mzAlignObj = Binmz(method, params, cmzrange, crtrange)
            mzAlignObj.save_procobj(dbfilepath, params['h5writepath'])
    elif istrain == 0:
        mzAlignObj = Binmz()
        mzAlignObj.load_procobj(dbfilepath, params['h5writepath'])

    mzAlignObj.bin_h5(dbfilepath, dataset_names)
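# Hedged usage sketch (assumption, not from the original source): running the
# m/z binning step in training mode with its documented default parameters.
# The database file name is a hypothetical placeholder.
def _example_do_mzalignment():
    do_mzalignment('example_experiment.h5',
                   method='binning',
                   params={'binshift': 0.3, 'binsize': 1,
                           'units': 'Da', 'h5writepath': '/proc'},
                   istrain=1)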
def get_data(dbfilepath, h5readpath):
    """
    Extracts data from the h5 file and outputs a dictionary of 'x', 'y', 'id', and 'color'
    entries for each sample.
    """
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))

    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ': Failed to read in')
            printlog(inst)
            traceback.print_exc()

    # keep only the datasets that were successfully read
    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]

    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt / 60)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)

    return sp
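# Hedged usage sketch: get_data returns per-sample TIC traces keyed by
# 'x'/'y'/'id'/'color', which can be iterated for plotting or reporting.
# The database file name and the '/spproc2D' group are hypothetical placeholders.
def _example_get_data():
    sp = get_data('example_experiment.h5', '/spproc2D')
    if sp is not None:
        for rt_minutes, tic, sample_id in zip(sp['x'], sp['y'], sp['id']):
            printlog('%s: max TIC %s at %s min'
                     % (sample_id, tic.max(), rt_minutes[int(tic.argmax())]))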
def get_dataset_names(dbfile):
    """
    Extracts dataset names from the hdf5 database file.

    Args:
        dbfile: the path to the database file
    """
    isdbfile = H5BaseMSIWorkflow.checkdbfile(dbfile)
    if isdbfile == 1:
        datasets = mh5.get_dataset_names(dbfile, dataset_names=[])
        if not datasets:
            print("%s database file doesn't contain any MSI datasets" % str(dbfile))
    else:
        datasets = []
    return datasets
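# Hedged usage sketch: listing the datasets stored in a database file.
# The file name is a hypothetical placeholder.
def _example_get_dataset_names():
    for name in get_dataset_names('example_experiment.h5'):
        print(name)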
def get_data(dbfilepath, h5readpath='sp2D'):
    """Extracts per-sample TIC traces from the h5 file as a dictionary of 'x', 'y', 'id', and 'color' entries."""
    # h5 paths always use forward slashes, so prepend one if it is missing
    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            print(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        print(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    tics = np.zeros((sizesp[0], sizesp[2]))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))

    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception:
            print(os.path.basename(datasetid) + ': Failed to read in')

    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]

    nrows, ncols = tics.shape
    sp = {'x': [], 'y': [], 'id': [], 'color': []}
    for i in range(ncols):
        sp['x'].append(crt)
        sp['y'].append(tics[:, i])
        sp['id'].append(datasets[i])
    sp['color'] = colorgenerator(ncols)

    return sp
def do_profile_alignment(dbfilepath, method='rspa',
                         params={'recursion': 1, 'minsegwidth': 100, 'maxpeakshift': 10,
                                 'reference': 'mean', 'h5readpath': '/proc', 'h5writepath': '/proc'},
                         istrain=1):
    """
    Performs advanced adjustment for chromatographic peak position variations at full profile
    resolution using a recursive segment-wise peak alignment strategy.

    Args:
        dbfilepath: the database file path
        method: the choice of peak alignment method. Default value: 'rspa',
                i.e. recursive segment-wise peak alignment.
        params: the dictionary of peak alignment parameters
        istrain: 1 to fit and save the alignment object, 0 to re-use a previously saved one
    """
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])

    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)

    if str(params['minsegwidth']).lower() == 'auto':
        params['minsegwidth'] = peak_width * 10.0
        printlog('Parameter "minsegwidth" is set to %s' % params['minsegwidth'])
    else:
        try:
            params['minsegwidth'] = float(params['minsegwidth'])
        except:
            raise LoggingValueError('Error! %s value for parameter "minsegwidth" cannot be converted to float!'
                                    % params['minsegwidth'])

    if str(params['maxpeakshift']).lower() == 'auto':
        params['maxpeakshift'] = peak_width * 5
        printlog('Parameter "maxpeakshift" is set to %s' % params['maxpeakshift'])
    else:
        try:
            params['maxpeakshift'] = float(params['maxpeakshift'])
        except:
            raise LoggingValueError('Error! %s value for parameter "maxpeakshift" cannot be converted to float!'
                                    % params['maxpeakshift'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if method == 'rspa':
            rtAlObj = RSPA(method, params, rtrange)
    elif istrain == 0:
        rtAlObj = RSPA()
        rtAlObj.load_procobj(dbfilepath, params['h5readpath'])

    rtAlObj.aling_h5(dbfilepath, dataset_names, params['h5readpath'], params['h5writepath'])

    if istrain == 1:
        # save into hdf5 database file
        rtAlObj.export()
        rtAlObj.save_procobj(dbfilepath, params['h5writepath'])
        rtAlObj.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])

    return
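# Hedged usage sketch (assumption): retention-time alignment in training mode with
# 'auto' segment width and peak shift, both derived from the stored peak width
# estimate. The database file name and the h5 group names are hypothetical placeholders.
def _example_do_profile_alignment():
    do_profile_alignment('example_experiment.h5',
                         method='rspa',
                         params={'recursion': 1, 'minsegwidth': 'auto',
                                 'maxpeakshift': 'auto', 'reference': 'mean',
                                 'h5readpath': '/proc', 'h5writepath': '/proc'},
                         istrain=1)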
def do_noisefilter(dbfilepath, smoothmethod='sqfilter', smoothparams={'window': 5, 'degree': 3},
                   baselinemethod='tophat', baselineparams={'frame': 90},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'}, istrain=1):
    """
    Performs adjustment for high-frequency noise and lower-frequency baseline distortions
    caused by a variety of instrumental and experimental factors.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        smoothmethod: the type of noise filtering method. Default value: 'sqfilter',
                      i.e. the Savitzky-Golay filter.
        smoothparams: the dictionary of parameter arguments for the noise filtering method
        baselinemethod: the type of baseline correction method. Default value: 'tophat',
                        i.e. the top-hat morphological filter.
        baselineparams: the dictionary of parameter arguments for the baseline correction method
        istrain: 1 to fit and save the filter objects, 0 to re-use previously saved ones
    """
    params['h5writepath'] = h5Base.correct_h5path(params['h5writepath'])
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])

    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    if not dataset_names:
        return

    peak_width = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'peak_width')
    printlog('Loaded min estimated peak width: %s seconds' % peak_width)
    mh5.save_dataset(dbfilepath, params['h5writepath'] + 'peak_width', data=peak_width)

    if str(smoothparams['window']).lower() == 'auto':
        smoothparams['window'] = peak_width * 0.5
        printlog('Parameter "window" is set to %s' % smoothparams['window'])
    else:
        try:
            smoothparams['window'] = float(smoothparams['window'])
        except:
            raise LoggingValueError('Error! %s value for parameter "window" cannot be converted to float!'
                                    % smoothparams['window'])

    if str(baselineparams['frame']).lower() == 'auto':
        baselineparams['frame'] = peak_width * 15
        printlog('Parameter "frame" is set to %s' % baselineparams['frame'])
    else:
        try:
            baselineparams['frame'] = float(baselineparams['frame'])
        except:
            raise LoggingValueError('Error! %s value for parameter "frame" cannot be converted to float!'
                                    % baselineparams['frame'])

    if istrain == 1:
        rtrange = mh5.load_dataset(dbfilepath, params['h5readpath'] + 'rtrange')
        if smoothmethod != 'none':
            SmoothObject = SmoothFilter(smoothmethod, smoothparams, rtrange)
        else:
            SmoothObject = []

        if baselinemethod != 'none':
            BaselineObject = BaselineFilter(baselinemethod, baselineparams, rtrange)
        else:
            BaselineObject = []

    elif istrain == 0:
        SmoothObject = SmoothFilter()
        SmoothObject.load_procobj(dbfilepath, params['h5readpath'])
        if SmoothObject.method == '':
            SmoothObject = []
        BaselineObject = BaselineFilter()
        BaselineObject.load_procobj(dbfilepath, params['h5readpath'])
        if BaselineObject.method == '':
            BaselineObject = []

    filternoise_h5(dbfilepath, dataset_names, SmoothObject, BaselineObject, params)

    if istrain == 1:
        # save into hdf5 database file
        if SmoothObject:
            SmoothObject.export(rtrange)
            SmoothObject.save_procobj(dbfilepath, params['h5writepath'])
        if BaselineObject:
            BaselineObject.export(rtrange)
            BaselineObject.save_procobj(dbfilepath, params['h5writepath'])
        SmoothObject.save_proc_meta(dbfilepath, params['h5writepath'], params['h5readpath'])

    return
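# Hedged usage sketch (assumption): smoothing plus baseline correction with 'auto'
# settings, which derive the filter window and the top-hat frame from the stored
# peak width estimate. The database file name is a hypothetical placeholder.
def _example_do_noisefilter():
    do_noisefilter('example_experiment.h5',
                   smoothmethod='sqfilter',
                   smoothparams={'window': 'auto', 'degree': 3},
                   baselinemethod='tophat',
                   baselineparams={'frame': 'auto'},
                   params={'h5readpath': '/sp2D', 'h5writepath': '/spproc2D'},
                   istrain=1)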
def export_HTML_list_of_samples_to_file(dbfilepath, h5readpath, output_prefix, plot_width=900,
                                        top_plot_height=300, bottom_plot_height=400,
                                        use_global_maximum=True):
    printlog('Exporting HTML view from [%s]%s to %s...' % (dbfilepath, h5readpath, output_prefix))

    with h5py.File(dbfilepath, 'r') as h5file:

        if not os.path.exists(output_prefix):
            os.makedirs(output_prefix)

        h5readpath = h5Base.correct_h5path(h5readpath)
        dataset_names = mh5.get_dataset_names(h5file, dataset_names=[], pathinh5=h5readpath[:-1])
        if not dataset_names:
            printlog('No datasets found! Nothing to do!')
            return

        dataset_count = len(dataset_names)

        all_rts_set = h5readpath + 'crt'
        if not (all_rts_set in h5file):
            printlog('Error! crt not found in [%s]%s ! Skipping...' % (dbfilepath, h5readpath))
            return
        crt = h5file[all_rts_set]

        all_mzs_set = h5readpath + 'cmz'
        if not (all_mzs_set in h5file):
            printlog('Error! cmz not found in [%s]%s ! Skipping...' % (dbfilepath, h5readpath))
            return
        cmz = h5file[all_mzs_set]

        if not os.path.exists('%s/samples' % output_prefix):
            os.makedirs('%s/samples' % output_prefix)

        with open('%s/index.html' % output_prefix, 'w') as fout:
            fout.write('\n'.join([
                '<!DOCTYPE html>',
                '<html>',
                '<frameset cols="15%,85%">',
                '<frame src="sample_list.html">',
                '<frame src="samples/sample_0.html" name="MSSpectrum">',
                '</frameset>',
                '</html>',
            ]))

        with open('%s/sample_list.html' % output_prefix, 'w') as fout:
            fout.write('\n'.join([
                '<!DOCTYPE html>',
                '<html>',
                '<head>',
                '<title>Samples</title>',
                '</head>',
                '<body>',
                ' <table border=1>',
                ' <tr><th>Samples</th></tr>'
            ]))

            global_max = 0.0
            #global_max_sum = 0.0

            if use_global_maximum:
                printlog('Evaluating global maximum...')
                for i in range(dataset_count):
                    printlog('Sample: %s of %s' % (i + 1, dataset_count))
                    sp_set = h5readpath + dataset_names[i].lstrip('/') + '/sp'
                    if not (sp_set in h5file):
                        printlog('Error! sp not found in %s ! Skipping...'
                                 % (h5readpath + dataset_names[i].lstrip('/')))
                    else:
                        sp = h5file[sp_set]
                        m = np.max(sp)
                        #mm = np.max(np.sum(sp, axis=1))
                        global_max = max(m, global_max)
                        #global_max_sum = max(mm, global_max_sum)

            for i in range(dataset_count):
                printlog('Sample: %s of %s' % (i + 1, dataset_count))
                generate_sample_plot(h5file, h5readpath + dataset_names[i].lstrip('/'), i,
                                     dataset_names[i].lstrip('/'),
                                     output_prefix + '/samples/sample_%s.html' % i,
                                     crt, cmz, plot_width, top_plot_height, bottom_plot_height,
                                     use_global_maximum, global_max)
                fout.write('\n'.join([
                    '<tr><td>%s. ' % (i + 1),
                    '<a href="samples/sample_%s.html" target="MSSpectrum">%s</a>'
                    % (i, dataset_names[i].lstrip('/')),
                    '</td></tr>',
                ]))

            fout.write('\n'.join([
                ' </table>',
                '</body>',
                '</html>',
            ]))

    printlog('Done')
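# Hedged usage sketch: exporting a browsable HTML overview (index, sample list and
# per-sample plots) for all samples under a given h5 group. The file name, group
# and output directory are hypothetical placeholders.
def _example_export_html():
    export_HTML_list_of_samples_to_file('example_experiment.h5', '/spproc2D',
                                        'html_view', plot_width=900,
                                        top_plot_height=300, bottom_plot_height=400,
                                        use_global_maximum=True)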
def do_export(dbfilepath,
              params={'h5readpath': '/spproc2D_peakdetect',
                      'h5fullprofile': '/spproc2D',
                      'output_prefix': '%HDF5_file_name%',
                      'export_ms_peak_list': 'yes',
                      'export_integral_table': 'yes',
                      #'export_full_peak_list': 'no'
                      }):
    """
    Exports processed data to CSV/TXT formats for analysis.

    Args:
        dbfilepath: a user-specified path to the h5 database file
        params: parameters to be used for export:
            h5readpath - path inside HDF5 to the processed dataset
            output_prefix - prefix to be used for the output files
            export_ms_peak_list - whether or not to export MS peak lists for NIST
            export_integral_table - whether or not to export the chromatographic peak integrals
    """
    params['h5readpath'] = h5Base.correct_h5path(params['h5readpath'])
    params['h5fullprofile'] = h5Base.correct_h5path(params['h5fullprofile'])

    dbfilepath = os.path.abspath(dbfilepath)

    dataset_names = mh5.get_dataset_names(dbfilepath, dataset_names=[],
                                          pathinh5=params['h5readpath'][:-1])
    #print(dataset_names)
    if not dataset_names:
        printlog('No datasets found in the h5readpath provided: %s !' % params['h5readpath'])
        return

    output_prefix = params['output_prefix']
    if '%HDF5_file_name%' in output_prefix:
        fname = os.path.splitext(os.path.basename(dbfilepath))[0]
        output_prefix = output_prefix.replace('%HDF5_file_name%', fname)

    export_path = params['exportpath']
    if export_path != '':
        export_path = os.path.abspath(export_path)
    else:
        export_path = os.path.split(dbfilepath)[0]

    output_prefix = os.path.join(export_path, output_prefix)

    fpath = os.path.split(output_prefix)[0]
    if not os.path.exists(fpath):
        os.makedirs(fpath)

    if not ('no' in params['export_ms_peak_list']):
        export_ms_peak_list_to_file(dbfilepath, params['h5readpath'], output_prefix,
                                    dataset_names, params['rts'], params['rt_tolerance'])

    if not ('no' in params['export_integral_table']):
        export_integral_table_to_file(dbfilepath, params['h5readpath'], output_prefix,
                                      dataset_names, params['samples'], params['rts'],
                                      params['rt_tolerance'])

    if not ('no' in params['export_metadata_table']):
        export_metadata_table_to_file(dbfilepath, params['h5readpath'], output_prefix,
                                      dataset_names, params['samples'])

    '''if not ('no' in params['export_all_peaks']):
        export_all_peaks_to_file(dbfilepath, params['h5readpath'], output_prefix, dataset_names)
    '''

    return
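# Hedged usage sketch (assumption): do_export also reads 'exportpath', 'rts',
# 'rt_tolerance', 'samples' and 'export_metadata_table' from params even though
# they are absent from its defaults, so a caller presumably supplies them; the
# values below are illustrative placeholders only.
def _example_do_export():
    do_export('example_experiment.h5',
              params={'h5readpath': '/spproc2D_peakdetect',
                      'h5fullprofile': '/spproc2D',
                      'output_prefix': '%HDF5_file_name%',
                      'export_ms_peak_list': 'yes',
                      'export_integral_table': 'yes',
                      'export_metadata_table': 'yes',
                      'exportpath': '',
                      'samples': '',
                      'rts': '',
                      'rt_tolerance': 2.5})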
def get_data(dbfilepath, h5readpath):
    """
    Extracts data from the h5 file and outputs a dictionary of 'x', 'y', 'id', and 'color'
    entries for each sample.
    """
    data = {'x': [], 'y': [], 'id': [], 'color': []}

    if h5readpath[0] != '/':
        h5readpath = '/'.join(['', h5readpath])

    if os.path.isfile(dbfilepath):
        datasets = mh5.get_dataset_names(dbfilepath, dataset_names=[], pathinh5=h5readpath)
        if not datasets:
            printlog(dbfilepath + " database file doesn't contain any MSI datasets")
            return
    else:
        printlog(str(dbfilepath) + ' database file is not found')
        return

    sizesp = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'sizesp']))
    crt = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'crt']))

    try:
        ref2D = mh5.load_dataset(dbfilepath, '/'.join([h5readpath, 'ref2D']))
        print(ref2D.shape)
    except Exception:
        ref2D = None

    try:
        histc = mh5.load_dataset(dbfilepath,
                                 '/'.join([h5readpath + '_peakdetect', 'clustering_histogram']))
        print(np.min(histc))
        print(np.max(histc))
    except Exception:
        histc = None

    try:
        histc_threshold = mh5.load_dataset(dbfilepath,
                                           '/'.join([h5readpath + '_peakdetect',
                                                     'clustering_histogram_threshold']))
        print(np.min(histc_threshold))
        print(np.max(histc_threshold))
    except Exception:
        histc_threshold = None

    tics = np.zeros((sizesp[0], sizesp[2]))
    j = -1
    dataidx = np.array([])
    for datasetid in datasets:
        j = j + 1
        try:
            sp = mh5.load_dataset(dbfilepath, ''.join([h5readpath, datasetid, '/sp']))
            tics[:, j] = np.sum(sp, axis=1)
            dataidx = np.append(dataidx, j)
        except Exception as inst:
            printlog(os.path.basename(datasetid) + ': Failed to read in')
            printlog(inst)
            traceback.print_exc()

    dataidx = dataidx.astype(int)
    datasets = list(map(datasets.__getitem__, dataidx))
    tics = tics[:, dataidx]

    if not (histc is None):
        #histc[histc <= threshold] = 0
        '''
        if 'histc' in data:
            p.line(x='hx', y='histc', line_width=1, line_color='firebrick', source=source)
        if 'histc_threshold' in data:
            p.line(x='hx', y='histc_threshold', line_width=2, line_color='navy', source=source)
        if 'ref2D' in data:
            p.line(x='refx', y='ref2D', line_width=1, line_color='red', source=source)
        if 'picked_peaks' in data:
            p.circle(x='peak_x', y='picked_peaks', color='peak_color', size=3, source=source)
        '''
        # rescale the clustering histogram so that it is comparable to the TIC traces
        med_int = np.median(tics, axis=1).flatten()[0:-1] + 1
        med_int = np.sqrt(med_int)
        histc = med_int * histc
        mv = np.max(tics.flatten())
        mv = mv / np.max(histc)
        threshold = median_threshold(histc)
        threshold *= mv
        histc *= mv

        # estimate a smoothed, retention-time dependent threshold over windows of 100 points
        dd = int(len(histc) / 100) - 1
        dx = np.zeros((dd,), dtype=np.float64)
        dy = np.zeros((dd,), dtype=np.float64)
        for i in range(dd):
            dx[i] = np.mean(crt[i * 100:i * 100 + 100])
            dy[i] = median_threshold(histc[i * 100:i * 100 + 100])
        dy = smooth1D(dx, dy, 10)
        dx[0] = np.min(crt)
        dx[-1] = np.max(crt)
        fit = interp1d(dx, dy, kind='cubic')
        fitted_threshold = fit(crt)

    nrows, ncols = tics.shape
    for i in range(ncols):
        data['x'].append(crt / 60)
        data['y'].append(tics[:, i])
        data['id'].append(datasets[i])
    data['color'] = colorgenerator(ncols)

    return data