Example #1
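This and the following examples come from one module and assume a shared preamble. A minimal sketch of that preamble, with is_string and printlog stubbed in as assumptions (the real module supplies its own implementations):

import os
import re
import sys
import traceback

import h5py


def is_string(obj):
    # assumed helper: True for text strings
    return isinstance(obj, str)


def printlog(*args):
    # assumed helper: the module's logging wrapper; print stands in here
    print(*args)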
def get_dataset_names(dbfilepath, dbroot='', dataset_names=None, pathinh5=None):
    """
    Recursively extracts dataset names from an HDF5 database.
    """
    if dataset_names is None:
        dataset_names = []
    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        item = dbfilepath
        isdbfile = 0
    else:
        return dataset_names

    for key, val in dict(item).items():
        try:
            subitem = dict(val)
            # a sample dataset is a group holding 'mz'/'sp' members, or one
            # carrying an 'is_sample_dataset' attribute set to True
            if ('mz' in subitem) or ('sp' in subitem) or (
                    'sp_unfiltered_peaks' in subitem) or (
                        ('is_sample_dataset' in val.attrs) and
                        (val.attrs['is_sample_dataset'] == True)):
                success = 1
            else:
                success = 0
        except Exception:
            # not a group-like item; treat as a non-match
            success = 0
        if success == 1:
            if is_string(pathinh5):
                success = 0
                h5str = val.name.split('/')[0:2]
                for i in h5str:
                    if '/' + i == pathinh5:
                        datasetname = re.sub(pathinh5, '', val.name)
                        dataset_names.append(datasetname)
                        success = 1
                        break
            else:
                dataset_names.append(val.name)
        if success == 0:
            if isinstance(val, h5py.Group):
                dbroot = dbroot + val.name
                dataset_names = get_dataset_names(val,
                                                  dbroot,
                                                  dataset_names,
                                                  pathinh5=pathinh5)

    if isdbfile == 1:
        h5file.close()

    return sorted(dataset_names)
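A quick usage sketch (the file name and group layout are assumptions):

# collect every sample dataset name in the file
all_names = get_dataset_names('msdata.h5')
# only datasets under /raw, with that prefix stripped from each name
raw_names = get_dataset_names('msdata.h5', pathinh5='/raw')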
Example #2
def load_dataset(dbfilepath, pathinh5):
    """Loads a single dataset from the HDF5 database at the given path."""

    pathinh5 = re.sub('//', '/', pathinh5)

    dataset = []
    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file_group = h5py.File(dbfilepath, 'r')  # read-only is sufficient here
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return dataset

    try:
        isdata = pathinh5 in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return dataset

    if isdata:
        dataset = h5file_group[pathinh5][()]

    if isdbfile == 1:
        h5file_group.close()

    return dataset
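A usage sketch with hypothetical names; the return value is the raw array stored at that path, or an empty list if the path does not exist:

mz = load_dataset('msdata.h5', '/raw/sample_1/mz')
sp = load_dataset('msdata.h5', '/raw/sample_1/sp')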
Example #3
def save_dataset(dbfilepath,
                 pathinh5,
                 data,
                 chunksize='',
                 compression_opts=''):
    """Saves a dataset into the HDF5 database, overwriting it in place when
    the shape and dtype match, or re-creating it otherwise."""

    pathinh5 = re.sub('//', '/', pathinh5)

    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return

    try:
        isdata = pathinh5 in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return

    if isdata:
        fdata = h5file_group[pathinh5]

        if (fdata.shape == data.shape) and (fdata.dtype == data.dtype):
            # shapes match: overwrite in place, closing the file if we opened it
            fdata[...] = data
            if isdbfile == 1:
                h5file_group.close()
            return
        else:
            printlog('Deleting original')
            del h5file_group[pathinh5]

    if (not chunksize) and (not compression_opts):
        h5file_group.create_dataset(pathinh5, data=data)
    elif (chunksize) and (compression_opts):
        h5file_group.create_dataset(pathinh5,
                                    data=data,
                                    chunks=chunksize,
                                    compression="gzip",
                                    compression_opts=compression_opts)
    elif (chunksize):
        h5file_group.create_dataset(pathinh5, data=data, chunks=chunksize)
    elif (compression_opts):
        h5file_group.create_dataset(pathinh5,
                                    data=data,
                                    chunks=True,
                                    compression="gzip",
                                    compression_opts=compression_opts)

    if isdbfile == 1:
        h5file_group.close()
    return
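A usage sketch (file and path names are assumptions). Note that the string branch checks os.path.exists, so the target file must already exist; passing chunksize and/or compression_opts selects the matching create_dataset branch above:

import numpy as np

data = np.random.rand(1000, 10)
save_dataset('msdata.h5', '/sample_1/sp', data)        # plain dataset
save_dataset('msdata.h5', '/sample_1/sp_compressed', data,
             chunksize=(100, 10), compression_opts=4)  # chunked + gzip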
Example #4
def load_preproc_obj(dbfilepath, procid, pathinh5=''):
    """
    
    **Loads the pre-processing parameters of a module from the hdf5 database.**
    
    Args: 
        
        dbfilepath: the name and path to the hdf5-database file
        
        procid: the module identifier
        
        pathinh5: the path in the hdf5 file for object storage
        
    """

    h5objpath = pathinh5 + procid
    h5objpath = re.sub('//', '/', h5objpath)

    ProcObj = {}
    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file_group = h5py.File(dbfilepath, 'r')  # read-only is sufficient here
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return ProcObj

    try:
        isobj = h5objpath in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return ProcObj

    if not isobj:
        if isdbfile == 1:
            h5file_group.close()
        return ProcObj
    # check whether this object is part of the preprocessing workflow
    h5obj = h5file_group[h5objpath]
    for i_name in h5obj.keys():
        if isinstance(h5obj[i_name], h5py.Group):
            h5subobj = h5obj[i_name]
            subProcObj = {}
            for j_name in h5subobj.keys():
                subProcObj[j_name] = load_dataset(h5subobj, j_name)
            ProcObj[i_name] = subProcObj
        else:
            ProcObj[i_name] = load_dataset(h5obj, i_name)

    if isdbfile == 1:
        h5file_group.close()

    return ProcObj
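A usage sketch; 'peak_alignment' is a hypothetical module identifier. The result is a dict (possibly with nested dicts) mirroring the attributes of the saved object:

params = load_preproc_obj('msdata.h5', 'peak_alignment', pathinh5='/proc/')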
Example #5
def get_traindata_names(dbfilepath, dbroot='', dataset_names=None, istrain=1):
    """
    Recursively extracts training dataset names from an HDF5 database.
    """
    if dataset_names is None:
        dataset_names = []
    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        item = dbfilepath
        isdbfile = 0
    else:
        return dataset_names

    for key, val in dict(item).items():
        try:
            subitem = dict(val)
            if ('istrain' in subitem) and ('Sp' in subitem):
                if load_dataset(item, val.name + '/istrain') == istrain:
                    success = 1
                else:
                    success = 0
            else:
                success = 0
        except Exception as inst:
            printlog(inst)
            traceback.print_exc()
            success = 0
        if success == 1:
            dataset_names.append(val.name)
        elif isinstance(val, h5py.Group):
            dbroot = dbroot + val.name
            dataset_names = get_traindata_names(val, dbroot, dataset_names,
                                                istrain)
    if isdbfile == 1:
        h5file.close()

    return dataset_names
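A usage sketch (file name assumed); istrain=1 selects datasets whose istrain flag marks them as training data, istrain=0 the held-out ones:

train_names = get_traindata_names('msdata.h5', istrain=1)
test_names = get_traindata_names('msdata.h5', istrain=0)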
Example #6
def h5pathfinder(datapath):
    """
    Finds a suitable path in the database file for storage of workflow metadata.
    """
    if is_string(datapath):
        splitpath = datapath.split('/')
    else:
        h5inpath = ''
        return h5inpath

    nsplits = len(splitpath)
    h5inpath = ''
    if nsplits == 2:
        if splitpath[0] != '':
            h5inpath = splitpath[0] + '/'
    elif nsplits > 2:
        for i in range(nsplits - 1):
            h5inpath = h5inpath + splitpath[i] + '/'

    return h5inpath
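A worked example of the splitting logic: everything up to the last '/'-separated component is kept and re-joined with trailing slashes:

h5pathfinder('group/dataset')       # -> 'group/'
h5pathfinder('/proc/sub/dataset')   # -> '/proc/sub/'
h5pathfinder(None)                  # not a string -> ''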
Example #7
def print_structure_h5db(dbfilepath, dbroot='', offset='    '):
    """Prints the HDF5 database structure"""
    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file = h5py.File(dbfilepath, 'r')
        item = h5file
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        item = dbfilepath
        isdbfile = 0
    else:
        return

    if isinstance(item, h5py.File):
        printlog(item.file, '(File)', item.name)

    elif isinstance(item, h5py.Dataset):
        printlog('(Dataset)', item.name, '    shape =', item.shape)

    elif isinstance(item, h5py.Group):
        printlog('(Group)', item.name)

    else:
        printlog('Warning: The item type is unknown', item.name)
        sys.exit("execution is terminated")

    if isinstance(item, (h5py.File, h5py.Group)):
        for key, val in dict(item).items():
            subitem = val
            printlog(offset, key)
            dbroot = dbroot + 'i'
            print_structure_h5db(subitem, dbroot=dbroot, offset='    ')

    if isdbfile == 1:
        h5file.close()
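A usage sketch (file name assumed); prints the file entry, then recurses through groups printing one indented name per member:

print_structure_h5db('msdata.h5')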
Example #8
def save_preproc_obj(dbfilepath, ProcObj, pathinh5=''):
    """
    **Saves the pre-processing parameters of a module into the hdf5 database.**
    
    Args: 
        
        dbfilepath: the name and path to the hdf5-database file
        
        ProcObj: the pre-processing workflow object
        
        pathinh5: the path in the hdf5 file for object storage
        
    """

    h5objpath = pathinh5 + ProcObj.description
    h5objpath = re.sub('//', '/', h5objpath)

    if is_string(dbfilepath) and (os.path.exists(dbfilepath)):
        h5file_group = h5py.File(dbfilepath, 'a')
        isdbfile = 1
    elif isinstance(dbfilepath, (h5py.File, h5py.Group)):
        h5file_group = dbfilepath
        isdbfile = 0
    else:
        return

    try:
        objvars = vars(ProcObj)
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return

    try:
        isgroup = h5objpath in h5file_group
    except Exception as inst:
        printlog(inst)
        traceback.print_exc()
        return

    if not isgroup:
        h5file_group.create_group(h5objpath)
    else:
        printlog('%s object has already been saved into the database file' %
                 h5objpath)
        if isdbfile == 1:
            h5file_group.close()
        return

    h5obj = h5file_group[h5objpath]
    for i_name in objvars.keys():
        subobj = objvars[i_name]
        if isinstance(subobj, dict):
            h5obj.create_group(i_name)
            h5subobj = h5obj[i_name]
            for j_name in subobj.keys():
                save_dataset(h5subobj, j_name, subobj[j_name])
        else:
            save_dataset(h5obj, i_name, objvars[i_name])

    printlog('\n%s from pre-processing workflow have been saved to --> %s' %
             (h5objpath, str(dbfilepath)))

    if isdbfile == 1:
        h5file_group.close()
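A usage sketch with a hypothetical workflow object: any dict-valued attribute becomes a subgroup, every other attribute becomes a dataset, and the object's description attribute names the group in the file:

class PeakAlignment:
    # hypothetical pre-processing object
    def __init__(self):
        self.description = 'peak_alignment'
        self.window = 5
        self.params = {'method': 'ransac', 'tolerance': 0.01}

save_preproc_obj('msdata.h5', PeakAlignment(), pathinh5='/proc/')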