def append_to_hdf5_dataset(mels, stfts, filename):
    inputs = mels, stfts
    short_names = 'mels', 'stfts'
    filepath = "data/%s/%s" % (filename, "data")
    # open in append mode; the earrays must already exist (see create_hdf5_file)
    tables_file = tables.open_file(filepath, mode='a')
    for short_name, inp in zip(short_names, inputs):
        tables_file.get_node("/%s" % short_name).append(inp)
    tables_file.close()
def create_hdf5_file(max_freq_length, filename):
    short_names = 'mels', 'stfts'
    filepath = "data/%s/%s" % (filename, "data")
    atoms = [tables.Float16Atom(), tables.Float16Atom()]
    sizes = [(0, max_freq_length, 80 * audio.r),
             (0, max_freq_length, 1025 * audio.r)]
    tables_file = tables.open_file(filepath, mode='w')
    for short_name, atom, size in zip(short_names, atoms, sizes):
        print("Creating earray at /root/%s" % short_name)
        tables_file.create_earray(tables_file.root, short_name, atom, size)
    print("Tables file created: %s" % tables_file)
    tables_file.close()
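# These two helpers pair up: create_hdf5_file must run first so the earrays
# exist before append_to_hdf5_dataset extends them. A minimal round-trip
# sketch; the audio stub and all shapes below are illustrative assumptions.
import numpy as np
import tables

class audio:  # hypothetical stand-in for the project's audio config
    r = 5     # outputs-per-step reduction factor

max_freq_length = 200
# the parent directory data/demo/ must already exist
create_hdf5_file(max_freq_length, "demo")

# append one batch along the extendable first axis (the 0 in the shapes)
mels = np.zeros((1, max_freq_length, 80 * audio.r), dtype=np.float16)
stfts = np.zeros((1, max_freq_length, 1025 * audio.r), dtype=np.float16)
append_to_hdf5_dataset(mels, stfts, "demo")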
def genotype_minimac2hdf5(data_path, id, save_path, study_name):
    df = pd.read_csv(data_path, header=None, index_col=None, sep='\t',
                     dtype=np.float16)
    data = df.to_numpy()  # df.as_matrix() was removed in pandas 1.0
    data = data.T
    print(data.shape)
    print('Saving chunk...{}'.format(
        os.path.join(save_path, 'genotype', str(id) + '_' + study_name + '.h5')))
    h5_gen_file = tables.open_file(
        os.path.join(save_path, 'genotype', str(id) + '_' + study_name + '.h5'),
        'w', title=study_name)
    atom = tables.Float16Atom()
    genotype = h5_gen_file.create_carray(h5_gen_file.root, 'genotype', atom,
                                         data.shape, title='Genotype',
                                         filters=tables.Filters(
                                             complevel=9, complib='zlib'))
    genotype[:] = data
    h5_gen_file.close()
    os.remove(data_path)
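# A CArray written this way has a fixed shape and zlib compression; it can be
# read back with PyTables directly. A minimal sketch; the chunk path below is
# illustrative.
import tables

with tables.open_file('save/genotype/0_mystudy.h5', 'r') as f:
    dosages = f.root.genotype[:]   # float16 ndarray, transposed on write
print(dosages.shape, dosages.dtype)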
def generate_hdf5_file(self, config):
    # Open HDF5 file
    if isfile(self.out_name) and not config['overwrite_data']:
        return self.out_name
    h5file = tables.open_file(self.out_name, mode="w", title=self.out_name)
    img_dtype = tables.Float16Atom()
    print("Generating HDF5 file")
    new_size = (config['resize']['height'], config['resize']['width'])
    img_shape = (0, ) + new_size + (1, )
    for prefix, df in {
            'train': self.train,
            'val': self.validation,
            'test': self.test
    }.items():
        group = h5file.create_group("/", prefix)
        img_storage = h5file.create_earray(group, 'images', img_dtype,
                                           shape=img_shape)
        label_storage = h5file.create_earray(group, 'labels', img_dtype,
                                             shape=img_shape)
        filename_storage = h5file.create_earray(group, 'filenames',
                                                tables.StringAtom(256),
                                                shape=(0, ))
        for img_path, meta_data in df.iterrows():
            print(img_path)
            filename_storage.append([img_path])
            # Load image and labels
            img_arr = dicom.read_file(img_path, force=True).pixel_array
            label_arr = np.array(Image.open(img_path + self.label_ext))
            img_arr_resized = resize(img_arr, new_size)
            label_arr_resized = resize(label_arr, new_size)
            img_arr_pp = normalize(img_arr_resized).astype(np.float16)
            label_arr_pp = (label_arr_resized / 255.).astype(np.float16)
            img_arr_pp = np.expand_dims(img_arr_pp, 2)
            label_arr_pp = np.expand_dims(label_arr_pp, 2)
            img_storage.append(
                img_arr_pp[None]
            )  # this syntax prepends a singleton dimension to the image
            label_storage.append(label_arr_pp[None])
    h5file.close()
    return self.out_name
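# Reading the resulting file back, split by group. A minimal sketch, assuming
# a file produced by the method above (the filename is illustrative):
import tables

with tables.open_file('dataset.h5', 'r') as h5:
    train_imgs = h5.root.train.images[:]     # (N, H, W, 1) float16
    train_labels = h5.root.train.labels[:]
    # StringAtom entries come back as bytes
    filenames = [fn.decode() for fn in h5.root.train.filenames[:]]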
def create_data_file(out_file, channels, samples, image_shape):
    hdf5_file = tables.open_file(out_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')
    data_shape = tuple([0, channels] + list(image_shape))
    label_shape = tuple([0, 1] + list(image_shape))
    data_storage = hdf5_file.create_earray(hdf5_file.root, 'data',
                                           tables.Float16Atom(),
                                           shape=data_shape,
                                           filters=filters,
                                           expectedrows=samples)
    label_storage = hdf5_file.create_earray(hdf5_file.root, 'truth',
                                            tables.UInt8Atom(),
                                            shape=label_shape,
                                            filters=filters,
                                            expectedrows=samples)
    affine_storage = hdf5_file.create_earray(hdf5_file.root, 'affine',
                                             tables.Float16Atom(),
                                             shape=(0, 4, 4),
                                             filters=filters,
                                             expectedrows=samples)
    return hdf5_file, data_storage, label_storage, affine_storage
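# The expectedrows hint lets PyTables pick sensible chunk sizes up front.
# A minimal usage sketch with made-up shapes:
import numpy as np

h5, data, truth, affine = create_data_file('demo.h5', channels=4,
                                           samples=10,
                                           image_shape=(64, 64, 64))
# append one sample along the extendable first axis (hence the [None])
data.append(np.zeros((4, 64, 64, 64), dtype=np.float16)[None])
truth.append(np.zeros((1, 64, 64, 64), dtype=np.uint8)[None])
affine.append(np.eye(4, dtype=np.float16)[None])
h5.close()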
def _write_genotype(self, trityper_data):
    number_of_chunks = (trityper_data.number_of_variants // self.chunk_size) + 1
    for chunk_index in range(number_of_chunks):
        start = chunk_index * self.chunk_size
        end = min((chunk_index + 1) * self.chunk_size,
                  trityper_data.number_of_variants)
        dosage_matrix = np.empty(
            (end - start, len(trityper_data.individuals_data)))
        print("Loading {}-{} variants to write to chunk {} out of {} total chunks".format(
            start, end, chunk_index, number_of_chunks))
        # Get the dosages for every variant within the chunk
        for chunked_variant_index, variant_index in enumerate(
                range(start, end)):
            dosage_matrix[chunked_variant_index, :] = trityper_data.get_dosages(variant_index)
        # Drop every variant that did not have two alleles.
        bad_variant_indices_chunk = list()
        for bad_variant_index in self.bad_variant_indices:
            if start <= bad_variant_index < end:
                bad_variant_indices_chunk.append(bad_variant_index - start)
        print(bad_variant_indices_chunk)
        dosage_matrix = np.delete(dosage_matrix, bad_variant_indices_chunk, 0)
        h5_gen_file = tables.open_file(
            os.path.join(self.genotype_directory_path,
                         str(chunk_index) + '_' + str(self.study_name) + '.h5'),
            'w', title=self.study_name)
        atom = tables.Float16Atom()
        genotype = h5_gen_file.create_carray(h5_gen_file.root, 'genotype',
                                             atom, dosage_matrix.shape,
                                             title='Genotype',
                                             filters=tables.Filters(
                                                 complevel=9, complib='zlib'))
        genotype[:] = dosage_matrix
        h5_gen_file.close()
    print("Discarded {} variants that did not have two alleles".format(
        len(self.bad_variant_indices)), file=sys.stderr)
def save_hdf5_chunk(self, data, out, name):
    print('Saving chunk...{}'.format(
        os.path.join(out, 'genotype', str(self.hdf5_iter) + '_' + name + '.h5')))
    h5_gen_file = tables.open_file(
        os.path.join(out, 'genotype', str(self.hdf5_iter) + '_' + name + '.h5'),
        'w', title=name)
    atom = tables.Float16Atom()  # TODO (low) check data format
    genotype = h5_gen_file.create_carray(h5_gen_file.root, 'genotype', atom,
                                         data.shape, title='Genotype',
                                         filters=self.pytable_filter)
    genotype[:] = data
    h5_gen_file.close()
    genotype = None
    data = None
    gc.collect()
    self.hdf5_iter += 1
def makeSoltab(self, solset=None, soltype=None, soltab=None, axesNames=[],
               axesVals=[], chunkShape=None, vals=None, weights=None,
               parmdbType=None):
    """
    Create a solution-table into a specified solution-set

    Keyword arguments:
    solset -- a solution-set name (String) or a Group instance
    soltype -- solution-type (e.g. amplitude, phase)
    soltab -- the solution-table name (String), if not specified it is
              generated from the solution-type
    axesNames -- list with the axes names
    axesVals -- list with the axes values
    chunkShape -- list with the chunk shape
    vals -- array of values
    weights -- 0->FLAGGED, 1->MAX_WEIGHT
    parmdbType -- original parmdb solution type
    """
    if soltype is None:
        raise Exception(
            "Solution-type not specified while adding a solution-table.")

    # checks on the solset
    if solset is None:
        raise Exception(
            "Solution-set not specified while adding a solution-table.")
    if type(solset) is str:
        solset = self.getSolset(solset)
    solsetName = solset._v_name

    if solsetName not in self.getSolsets().keys():
        raise Exception("Solution-set " + solsetName + " doesn't exist.")

    # checks on the soltab
    soltabName = soltab
    if type(soltabName) is str and not re.match(r'^[A-Za-z0-9_-]+$', soltabName):
        logging.warning(
            'Solution-table ' + soltabName +
            ' contains unsupported characters. Use [A-Za-z0-9_-]. Switching to default.')
        soltabName = None

    if soltabName in self.getSoltabs(solset).keys():
        logging.warning('Solution-table ' + soltabName +
                        ' already present. Switching to default.')
        soltabName = None

    if soltabName is None:
        soltabName = self._fisrtAvailSoltabName(solset, soltype)

    logging.info('Creating a new solution-table: ' + soltabName + '.')
    soltab = self.H.create_group("/" + solsetName, soltabName, title=soltype)
    soltab._v_attrs['parmdb_type'] = parmdbType

    # create axes
    assert len(axesNames) == len(axesVals)
    dim = []
    for i, axisName in enumerate(axesNames):
        axis = self.H.create_array('/' + solsetName + '/' + soltabName,
                                   axisName, obj=axesVals[i])
        axis.attrs['h5parm_version'] = _version.__h5parmVersion__
        dim.append(len(axesVals[i]))

    # check if the axes were in the proper order
    assert dim == list(vals.shape)
    assert dim == list(weights.shape)

    # create the val/weight arrays; plain arrays have no compression but are
    # much faster than carrays
    val = self.H.create_array('/' + solsetName + '/' + soltabName, 'val',
                              obj=vals.astype(np.float64),
                              atom=tables.Float64Atom())
    weight = self.H.create_array('/' + solsetName + '/' + soltabName,
                                 'weight', obj=weights.astype(np.float16),
                                 atom=tables.Float16Atom())
    val.attrs['VERSION_H5PARM'] = _version.__h5parmVersion__
    val.attrs['AXES'] = ','.join([axisName for axisName in axesNames])
    weight.attrs['VERSION_H5PARM'] = _version.__h5parmVersion__
    weight.attrs['AXES'] = ','.join([axisName for axisName in axesNames])

    return soltab
def makeSoltab(self, soltype=None, soltabName=None, axesNames=[], axesVals=[],
               chunkShape=None, vals=None, weights=None, parmdbType=''):
    """
    Create a Soltab into this solset.

    Parameters
    ----------
    soltype : str
        Solution-type (e.g. amplitude, phase)
    soltabName : str, optional
        The solution-table name, if not specified it is generated from the
        solution-type
    axesNames : list
        List with the axes names
    axesVals : list
        List with the axes values (each is a separate list)
    chunkShape : list, optional
        List with the chunk shape
    vals : numpy array
        Array with shape given by the axesVals lengths
    weights : numpy array
        Same shape as the vals array, 0->FLAGGED, 1->MAX_WEIGHT
    parmdbType : str
        Original parmdb solution type

    Returns
    -------
    soltab obj
        Newly created soltab object
    """
    if soltype is None:
        raise Exception("Solution-type not specified while adding a solution-table.")

    # checks on the soltab
    if type(soltabName) is str and not re.match(r'^[A-Za-z0-9_-]+$', soltabName):
        logging.warning('Solution-table ' + soltabName + ' contains unsupported characters. Use [A-Za-z0-9_-]. Switching to default.')
        soltabName = None

    if soltabName in self.getSoltabNames():
        logging.warning('Solution-table ' + soltabName + ' already present. Switching to default.')
        soltabName = None

    if soltabName is None:
        soltabName = self._fisrtAvailSoltabName(soltype)
    logging.info('Creating a new solution-table: ' + soltabName + '.')

    # check input
    assert len(axesNames) == len(axesVals)
    dim = []
    for i, axisName in enumerate(axesNames):
        dim.append(len(axesVals[i]))
    assert dim == list(vals.shape)
    assert dim == list(weights.shape)

    # if input is OK, create table
    soltab = self.obj._v_file.create_group("/" + self.name, soltabName,
                                           title=soltype)
    soltab._v_attrs['parmdb_type'] = parmdbType

    for i, axisName in enumerate(axesNames):
        axis = self.obj._v_file.create_array('/' + self.name + '/' + soltabName,
                                             axisName, obj=axesVals[i])

    # create the val/weight arrays; plain arrays have no compression but are
    # much faster than carrays
    val = self.obj._v_file.create_array('/' + self.name + '/' + soltabName,
                                        'val', obj=vals.astype(np.float64),
                                        atom=tables.Float64Atom())
    weight = self.obj._v_file.create_array('/' + self.name + '/' + soltabName,
                                           'weight', obj=weights.astype(np.float16),
                                           atom=tables.Float16Atom())

    val.attrs['AXES'] = ','.join([axisName for axisName in axesNames])
    weight.attrs['AXES'] = ','.join([axisName for axisName in axesNames])

    return Soltab(soltab)
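# Both versions of makeSoltab trade compression for speed by choosing
# create_array over create_carray for val/weight. A standalone sketch of that
# trade-off in plain PyTables (file and node names are illustrative):
import numpy as np
import tables

weights = np.ones((100, 50), dtype=np.float16)
with tables.open_file('tradeoff_demo.h5', 'w') as h5:
    # plain Array: no chunking or compression, fastest to read and write
    h5.create_array('/', 'weight_fast', obj=weights)
    # CArray: chunked and zlib-compressed, smaller on disk but slower
    h5.create_carray('/', 'weight_small', obj=weights,
                     filters=tables.Filters(complevel=9, complib='zlib'))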
chunk.to_hdf(os.path.join(args.g, 'individuals', args.study_name + '.h5'),
             key='individuals', format='table', min_itemsize=25,
             complib='zlib', complevel=9)

for g_file in os.listdir(os.path.join(args.g, 'genotype')):
    print(g_file)
    # read the chunk and close the handle before rewriting the same file
    with h5py.File(os.path.join(args.g, 'genotype', g_file), 'r') as f:
        data = f['genotype'][...]
    data = data[:, keep_index]
    h5_gen_file = tables.open_file(os.path.join(args.g, 'genotype', g_file),
                                   'w', title=args.study_name)
    atom = tables.Float16Atom()
    genotype = h5_gen_file.create_carray(h5_gen_file.root, 'genotype', atom,
                                         data.shape, title='Genotype',
                                         filters=tables.Filters(
                                             complevel=9, complib='zlib'))
    genotype[:] = data
    h5_gen_file.close()
def open_h5(f, hid_size):
    f = tables.open_file(f, mode='w')
    atom = tables.Float16Atom()
    array = f.create_earray(f.root, 'data', atom, (0, hid_size))
    return f, array
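# A minimal usage sketch for this helper; the file name, hidden size, and
# batch shape are illustrative.
import numpy as np

f, arr = open_h5('activations.h5', hid_size=512)
arr.append(np.zeros((32, 512), dtype=np.float16))  # one batch of 32 rows
f.close()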
def soltab_swap_freq_time(soltab):
    """Swap the frequency and time axes to make the frequency the fastest
    varying axis

    Parameters
    ----------
    soltab : Soltab
        Soltab object which will be changed
    """
    vals = soltab.getValues(retAxesVals=False)
    weights = soltab.getValues(weight=True, retAxesVals=False)
    if vals.shape != weights.shape:
        raise RuntimeError("Shape of weights differs from shape of values")
    axesnames = soltab.getAxesNames()
    axesnums = list(range(len(axesnames)))
    if 'freq' not in axesnames or 'time' not in axesnames:
        print("Nothing to be done, no freq + time axes in " + soltab.name)
        return
    freqindex = axesnames.index('freq')
    timeindex = axesnames.index('time')
    if freqindex > timeindex:
        print("Nothing to be done, freq already varies fastest in " + soltab.name)
        return

    # Swap the time and frequency axis in the axes names and numbers
    axesnums[freqindex], axesnums[timeindex] = axesnums[timeindex], axesnums[freqindex]
    axesnames[freqindex], axesnames[timeindex] = axesnames[timeindex], axesnames[freqindex]

    # Swap the axes order in the metadata
    soltab.obj.val._f_setattr("AXES", ",".join(axesnames))

    # Transpose the values
    vals = vals.transpose(axesnums)
    weights = weights.transpose(axesnums)

    # Need to remove the array from the file because changing shape is not
    # supported by pytables. Store the attributes in a dict first.
    attrs = soltab.obj.val._v_attrs
    attrsdict = {}
    for attrname in attrs._f_list():
        attrsdict[attrname] = attrs[attrname]
    soltab.obj.val._f_remove()
    soltab.obj.weight._f_remove()

    # Create new val here
    soltab.obj._v_file.create_array(soltab.obj._v_pathname, 'val', obj=vals,
                                    atom=tables.Float64Atom())
    soltab.obj._v_file.create_array(soltab.obj._v_pathname, 'weight',
                                    obj=weights, atom=tables.Float16Atom())

    # Restore the original attributes
    for attrname in attrsdict:
        soltab.obj.val._f_setattr(attrname, attrsdict[attrname])

    soltab.addHistory("Swap frequency and time axes to make frequency vary fastest")
    soltab.obj._f_flush()
    print("Successfully swapped frequency and time axes in " + soltab.name)
    return E, t, w, dt, w0, frogtrace.reshape(-1)


if __name__ == '__main__':
    E, t, _, _, _, frogtrace_flat = retrieve_data(plot_frog_bool=False,
                                                  print_size=True)

    # data for input
    E_real = np.real(E)
    E_imag = np.imag(E)

    # create training file
    hdf5_file = tables.open_file('frogtrainingdata.hdf5', mode='w')
    frog_image_f = hdf5_file.create_earray(hdf5_file.root, 'frog',
                                           tables.Float16Atom(),
                                           shape=(0, len(frogtrace_flat)))
    E_real_f = hdf5_file.create_earray(hdf5_file.root, 'E_real',
                                       tables.Float16Atom(),
                                       shape=(0, len(E_real)))
    E_imag_f = hdf5_file.create_earray(hdf5_file.root, 'E_imag',
                                       tables.Float16Atom(),
                                       shape=(0, len(E_imag)))
    hdf5_file.close()

    # create test file (the original snippet is cut off here; the remaining
    # arguments mirror the training file above)
    hdf5_file = tables.open_file('frogtestdata.hdf5', mode='w')
    frog_image_f = hdf5_file.create_earray(hdf5_file.root, 'frog',
                                           tables.Float16Atom(),
                                           shape=(0, len(frogtrace_flat)))
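# The script above only creates the empty extendable arrays; rows would be
# appended later by reopening the file in append mode. A minimal sketch of
# that step, with made-up data:
import numpy as np
import tables

with tables.open_file('frogtrainingdata.hdf5', mode='a') as f:
    # one sample per row along the extendable first axis
    f.root.frog.append(np.zeros((1, f.root.frog.shape[1]), dtype=np.float16))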
def getIntensityProfile(masked_image_file, skeletons_file, intensities_file,
                        width_resampling=15, length_resampling=131,
                        min_num_skel=100, smooth_win=11, pol_degree=3,
                        width_percentage=0.5, save_maps=False):

    min_num_skel = min_num_skel_defaults(skeletons_file,
                                         min_num_skel=min_num_skel)

    assert smooth_win > pol_degree
    assert min_num_skel > 0
    assert 0 < width_percentage < 1

    # we want to use a symmetrical distance centered in the skeleton
    if length_resampling % 2 == 0:
        length_resampling += 1
    if width_resampling % 2 == 0:
        width_resampling += 1

    # get the limits to be averaged from the intensity map
    if save_maps:
        width_win_ind = getWidthWinLimits(width_resampling, width_percentage)
    else:
        width_win_ind = (0, width_resampling)

    # filters for the tables structures
    table_filters = tables.Filters(complevel=5, complib='zlib', shuffle=True,
                                   fletcher32=True)

    # Get a reduced version of the trajectories_data table with only the
    # valid skeletons. The rows of this new table are going to be saved into
    # skeletons_file
    trajectories_data_valid = setIntMapIndexes(skeletons_file, min_num_skel)

    # let's save this new table into the intensities file
    with tables.File(intensities_file, 'w') as fid:
        fid.create_table('/', 'trajectories_data_valid',
                         obj=trajectories_data_valid.to_records(index=False),
                         filters=table_filters)

    tot_rows = len(trajectories_data_valid)
    if tot_rows == 0:
        with tables.File(intensities_file, "r+") as int_file_id:
            # nothing to do here, let's save empty data and get out
            worm_int_avg_tab = int_file_id.create_array(
                "/", "straighten_worm_intensity_median", obj=np.zeros(0))
            worm_int_avg_tab._v_attrs['has_finished'] = 1
        return

    with tables.File(masked_image_file, 'r') as mask_fid, \
            tables.File(skeletons_file, 'r') as ske_file_id, \
            tables.File(intensities_file, "r+") as int_file_id:

        # pointer to the compressed videos
        mask_dataset = mask_fid.get_node("/mask")

        # pointer to skeletons
        skel_tab = ske_file_id.get_node('/skeleton')
        skel_width_tab = ske_file_id.get_node('/width_midbody')

        # we are using Float16 to save space, assuming the intensities are
        # within the uint8 range
        worm_int_avg_tab = int_file_id.create_carray(
            "/", "straighten_worm_intensity_median",
            tables.Float16Atom(dflt=np.nan),
            (tot_rows, length_resampling),
            chunkshape=(1, length_resampling),
            filters=table_filters)

        worm_int_avg_tab._v_attrs['has_finished'] = 0
        worm_int_avg_tab.attrs['width_win_ind'] = width_win_ind

        if save_maps:
            worm_int_tab = int_file_id.create_carray(
                "/", "straighten_worm_intensity",
                tables.Float16Atom(dflt=np.nan),
                (tot_rows, length_resampling, width_resampling),
                chunkshape=(1, length_resampling, width_resampling),
                filters=table_filters)

        grouped_frames = trajectories_data_valid.groupby('frame_number')
        # variables used to report progress
        base_name = skeletons_file.rpartition('.')[0].rpartition(
            os.sep)[-1].rpartition('_')[0]
        progressTime = TimeCounter('Obtaining intensity maps.',
                                   len(grouped_frames))

        for frame, frame_data in grouped_frames:
            img = mask_dataset[frame, :, :]
            for ii, row_data in frame_data.iterrows():
                skeleton_id = int(row_data['skeleton_id'])
                worm_index = int(row_data['worm_index_joined'])
                int_map_id = int(row_data['int_map_id'])

                # read ROI and skeleton, and put them in the same coordinate
                # map
                worm_img, roi_corner = getWormROI(img, row_data['coord_x'],
                                                  row_data['coord_y'],
                                                  row_data['roi_size'])
                skeleton = skel_tab[skeleton_id, :, :] - roi_corner

                half_width = skel_width_tab[skeleton_id] / 2
                assert not np.isnan(skeleton[0, 0])

                skel_smooth = smoothSkeletons(
                    skeleton,
                    length_resampling=length_resampling,
                    smooth_win=smooth_win,
                    pol_degree=pol_degree)
                straighten_worm, grid_x, grid_y = getStraightenWormInt(
                    worm_img,
                    skel_smooth,
                    half_width=half_width,
                    width_resampling=width_resampling)

                # if you use the mean it is better not to use float16
                int_avg = np.median(
                    straighten_worm[width_win_ind[0]:width_win_ind[1], :],
                    axis=0)

                worm_int_avg_tab[int_map_id] = int_avg

                # only save the full map if it is specified by the user
                if save_maps:
                    worm_int_tab[int_map_id] = straighten_worm.T

            if frame % 500 == 0:
                progress_str = progressTime.get_str(frame)
                print_flush(base_name + ' ' + progress_str)

        worm_int_avg_tab._v_attrs['has_finished'] = 1