def update(self, gene, weight, ref_allele, allele, dosage_row):
    if self.D is None:
        self.gene_list = self.get_gene_list()
        self.gene_index = {gene: k for (k, gene) in enumerate(self.gene_list)}
        self.n_genes = len(self.gene_list)
        self.n_samples = len(dosage_row)
        self.D_file = h5py_cache.File(self.output_binary_file, 'w',
                                      chunk_cache_mem_size=self.cache_size)
        n_genes_chunk = np.min((self.n_genes, 10))
        self.D = self.D_file.create_dataset("pred_expr",
                                            shape=(self.n_genes, self.n_samples),
                                            chunks=(n_genes_chunk, self.n_samples),
                                            dtype=np.dtype('float32'),
                                            scaleoffset=4,
                                            compression='gzip')
    if gene in self.gene_index:
        # assumes dosage coding 0 to 2
        # assumes non-ambiguous SNPs to resolve strand issues:
        if ref_allele == allele or self.complements[ref_allele] == allele:
            self.D[self.gene_index[gene], :] += dosage_row * weight
        else:
            self.D[self.gene_index[gene], :] += (2 - dosage_row) * weight
        # Update all cases for that gene
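# A minimal worked example of the strand-resolution arithmetic commented
# above; the alleles, weight, and dosage values here are made up purely for
# illustration.
import numpy as np

complements = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
dosage_row = np.array([0.0, 1.0, 2.0])  # dosages are coded 0 to 2
weight, ref_allele, allele = 0.5, 'A', 'C'

if ref_allele == allele or complements[ref_allele] == allele:
    contribution = dosage_row * weight        # same strand orientation
else:
    contribution = (2 - dosage_row) * weight  # flipped: count the other allele
# contribution == array([1. , 0.5, 0. ])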
def __init__(self, data_file_pth, chunk_cache_mem_size_bytes=20 * 1024 ** 3,
             desired_chunk_size_bytes=0.1 * 1024 ** 2, compression='lzf'):
    """
    :param data_file_pth: Path to the hdf5 file.
    :param chunk_cache_mem_size_bytes: HDF5 chunk cache size; the larger the
        better. Default is 20 GiB.
    :param desired_chunk_size_bytes: Size of individual chunks. The h5py docs
        recommend keeping this between 10 KiB and 1 MiB. Default is 0.1 MiB.
        Pass -1 to fall back to h5py's automatic chunk sizing.
    :param compression: Compression filter for created datasets. Default is 'lzf'.
    """
    import h5py
    import h5py_cache
    self.h5py = h5py
    self.desired_chunk_size_bytes = desired_chunk_size_bytes
    if not os.path.exists(data_file_pth):
        self.f = h5py.File(data_file_pth, 'w', libver='latest')
    else:
        self.f = h5py_cache.File(
            data_file_pth, 'r',
            chunk_cache_mem_size=chunk_cache_mem_size_bytes,
            libver='latest', w0=0.1,
            n_cache_chunks=int(chunk_cache_mem_size_bytes / desired_chunk_size_bytes))
    self.i = 0
    self.is_swmr_hdf_version = h5py.version.hdf5_version_tuple >= (1, 9, 178)
    self.compression = compression
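# A standalone sketch of the cache-sizing convention the constructor above
# uses: choose a desired chunk size, then derive n_cache_chunks so the chunk
# cache holds chunk_cache_mem_size bytes' worth of chunks. 'data.h5' is a
# placeholder for an existing file; the keyword arguments mirror the call above.
import h5py_cache

chunk_cache_mem_size = 20 * 1024 ** 3      # 20 GiB, the default above
desired_chunk_size = int(0.1 * 1024 ** 2)  # 0.1 MiB per chunk

f = h5py_cache.File('data.h5', 'r',
                    chunk_cache_mem_size=chunk_cache_mem_size,
                    libver='latest', w0=0.1,
                    n_cache_chunks=chunk_cache_mem_size // desired_chunk_size)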
def __init__(self, datapath, dataset, *nargs, train_percentage=100.,
             val_split=False, **kwargs):
    if datapath not in self.refs:
        f = h5py_cache.File(datapath, chunk_cache_mem_size=1024 ** 3 // 10)
        self.refs[datapath] = f
    length = len(self.refs[datapath][dataset])
    # train_percentage is a percentage (default 100.), so scale by 1/100
    split = int(np.ceil(train_percentage / 100. * length))
    if val_split:
        start = split
        end = length
    else:
        start = 0
        end = split
    super(CachingHDF5Matrix, self).__init__(datapath, dataset, *nargs,
                                            start=start, end=end, **kwargs)
    # bugfix: ndim is not changed to reflect the normalizer's output
    if self.normalizer is not None:
        self._base_ndim = self.normalizer(self.data[0:1]).ndim
    else:
        self._base_ndim = self.data.ndim
def __init__(self, filename):
    '''Init opening the file and finding all data groups.
    Currently only searches the /Data/Images group.

    Parameters
    ----------
    filename : str
        The file path to load.
    '''
    # necessary declarations in case something goes wrong
    self.file_hdl = None
    self.metaDataJSON = None
    self.list_data = None

    # check for string
    if not isinstance(filename, str):
        raise TypeError('Filename is supposed to be a string!')

    # try opening the file
    try:
        # self.file_hdl = h5py.File(filename, 'r')
        self.file_hdl = h5py_cache.File(filename, 'r',
                                        chunk_cache_mem_size=10 * 1024 ** 2)
    except OSError:
        print('Error opening file for readonly: "{}"'.format(filename))
        raise
    self._find_groups()
def extract_all_files(self):
    nd2_files = self.extract_all_meta()
    meta_handle = pandas_hdf5_handler(self.headpath + "/metadata.hdf5")
    meta_df = meta_handle.read_df("data", read_metadata=True)
    channels = meta_df.metadata["channels"]
    y_dim = meta_df.metadata["height"]
    x_dim = meta_df.metadata["width"]
    ttl_indices = len(meta_df)

    chunk_shape = (1, y_dim, x_dim)
    chunk_bytes = 2 * int(np.prod(chunk_shape))  # uint16: 2 bytes per pixel
    chunk_cache_mem_size = 2 * chunk_bytes

    with h5py_cache.File(self.hdf5path + "/extracted.hdf5", "w",
                         chunk_cache_mem_size=chunk_cache_mem_size) as h5pyfile:
        for c, channel in enumerate(channels):
            hdf5_dataset = h5pyfile.create_dataset(str(channel),
                                                   (ttl_indices, y_dim, x_dim),
                                                   chunks=chunk_shape,
                                                   dtype='uint16')
            for file_idx in meta_df["file_idx"].unique():
                nd2path = self.headpath + "/" + nd2_files[file_idx]
                with ND2Reader(nd2path) as nd2file:
                    file_df = meta_df[meta_df["file_idx"] == file_idx]
                    for idx, item in file_df.iterrows():
                        t = item["timepoint"]
                        v = item["fov"]
                        nd2_image = nd2file.get_frame_2D(c=c, t=t, v=v)
                        hdf5_dataset[idx, :, :] = nd2_image
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists; delete or move it if you want it generated again")
        return

    Utilities.ensure_requisite_folders(args.output)

    logging.info("Reading input")
    data = pandas.read_table(args.input)

    logging.info("Opening output")
    f = h5py_cache.File(args.output, 'w',
                        chunk_cache_mem_size=int(50 * (1024 ** 2)))
    n_genes = data.shape[1] - 2
    n_samples = data.shape[0]
    n_genes_chunk = np.min((n_genes, 10))

    logging.info("Processing expression")
    p = f.create_dataset("pred_expr", shape=(n_genes, n_samples),
                         chunks=(n_genes_chunk, n_samples),
                         dtype=np.dtype('float32'),
                         scaleoffset=4, compression='gzip')
    g = f.create_dataset("genes", (n_genes,), dtype="S30")
    for i, gene in enumerate(data.columns.values[2:]):
        p[i, :] = data[gene].to_numpy()
        g[i] = np.string_(gene)

    logging.info("saving samples")
    s = f.create_dataset("samples", (n_samples,), dtype="S25")
    for i in range(0, n_samples):  # range, not Python 2's xrange
        s[i] = np.string_(data["IID"][i])

    f.close()
    logging.info("Done")
def _structure_file(file_path):
    logging.info("Acquiring HDF5 expression cache")
    HDF5_CACHE = int(60 * (1024 ** 2))
    # Note: File takes no dtype argument; datasets carry their own dtype
    file = h5py_cache.File(file_path, 'r',
                           chunk_cache_mem_size=HDF5_CACHE, w0=1.0)
    genes = [g for g in file['genes']]
    h5 = file['pred_expr']
    return genes, h5
def writehdf5(fovnum, num_entries, timepoint_list, file_idx, num_fovs):
    with ND2Reader(self.nd2filename) as nd2file:
        y_dim = self.metadata['height']
        x_dim = self.metadata['width']
        with h5py_cache.File(self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5",
                             "w",
                             chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            for i, channel in enumerate(self.metadata["channels"]):
                hdf5_dataset = h5pyfile.create_dataset(str(channel),
                                                       (num_entries, y_dim, x_dim),
                                                       chunks=self.chunk_shape,
                                                       dtype='uint16')
                # If Elements crashed, nd2reader does not index into the file
                # correctly; this is a hard fix
                if self.metadata["failed_file"]:
                    for j in range(len(timepoint_list)):
                        frame = timepoint_list[j]
                        nd2_image = nd2file.get_frame_2D(
                            c=i, t=0, v=fovnum + frame * num_fovs)
                        hdf5_dataset[j, :, :] = nd2_image
                else:
                    for j in range(len(timepoint_list)):
                        frame = timepoint_list[j]
                        nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                        hdf5_dataset[j, :, :] = nd2_image
    return "Done."
def train_new_top_model(all_features_hdf5, all_labels_hdf5):
    # The bottleneck (or other extracted image) features are stored in h5
    # files; use HDF5MatrixCacheIterator to consume them. By default it
    # neither transforms nor shuffles, so we can use the old ImageDataGenerator.
    datagen = ImageDataGenerator(validation_split=0.2)

    f1_trainvalidation = h5.File(all_features_hdf5, 'r')
    shape = f1_trainvalidation['data'].shape
    f1_trainvalidation.close()

    # 15 GiB is the upper limit for cache memory.
    total_mem_usage, dividing_factor = calculateDividingFactor(shape)
    # print("Dividing factor: {}".format(dividing_factor))
    print("Cached memory usage: {}".format(
        total_mem_usage / (1024 ** 3) / dividing_factor))
    chunk_shape = (1, max(shape[1] // 2, 1), max(shape[2] // 2, 1),
                   max(shape[3] // 2, 1))
    f1_trainvalidation = h5c.File(all_features_hdf5, 'r',
                                  chunk_cache_mem_size=total_mem_usage // dividing_factor)
    f1_label = h5.File(all_labels_hdf5, 'r')
    train_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                        f1_label['data'], subset='training',
                                        batch_size=batch_size, shuffle=False)
    validation_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                             f1_label['data'], subset='validation',
                                             batch_size=batch_size, shuffle=False)

    model = Sequential()
    model.add(Flatten(input_shape=shape[1:]))
    model.add(Dense(1024, activation='relu'))
    # model.add(Dense(256, input_shape=(train_data_shape[1:],), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(85, activation='sigmoid'))
    # model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=[f1])

    model.fit_generator(
        generator=train_generator,
        steps_per_epoch=int(ceil(train_generator.samples / batch_size)),
        validation_data=validation_generator,
        # use a value which can fit batch_size * image_size * max_queue_size
        # in your CPU memory
        max_queue_size=10,
        # multiple workers have no performance benefit without multithreading
        workers=4,
        use_multiprocessing=False,  # HDF5Matrix cannot support multiple threads
        validation_steps=int(ceil(validation_generator.samples / batch_size)),
        shuffle=True,
        epochs=epochs)
    model.save_weights(top_model_weights_path)
def __init__(self, data_dir, subset='train', download=True):
    super().__init__()
    if download:
        Cifar10Dataset.download(data_dir)
    h5_file = os.path.join(data_dir, 'cifar-10.h5')
    self.h5_file = h5_file
    self.subset = subset
    with h5py_cache.File(h5_file, 'r', chunk_cache_mem_size=1024 ** 3) as f:
        self.length = f[subset]['images'].shape[0]
def write_hdf5(self, filepath, dataset_id, vid_data):
    """
    Writes data to an hdf5 file.

    :param filepath: filepath of the hdf5 file
    :param dataset_id: id of the dataset in the hdf5 file
    :param vid_data: data to write
    :return: None
    """
    hdf5_lock = self.add_hdf5(filepath)
    with hdf5_lock:
        h5py_file = h5c.File(filepath, 'a', libver='latest',
                             chunk_cache_mem_size=(1024 ** 2) * 16)
        h5py_file.create_dataset(dataset_id, data=vid_data, dtype='uint8')
        h5py_file.close()
def read_hdf5(self, filepath, dataset_id, start_frame, end_frame):
    """
    Reads data from an hdf5 file.

    :param filepath: filepath of the hdf5 file
    :param dataset_id: id of the dataset in the hdf5 file
    :param start_frame: frame to start from
    :param end_frame: frame to end before (exclusive)
    :return: data
    """
    hdf5_lock = self.add_hdf5(filepath)
    with hdf5_lock:
        h5py_file = h5c.File(filepath, 'r', libver='latest',
                             chunk_cache_mem_size=(1024 ** 2) * 16)
        # slicing copies the data into a numpy array, so the file
        # can be closed before returning
        data = h5py_file[dataset_id][start_frame:end_frame]
        h5py_file.close()
        return data
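# A self-contained round trip mirroring write_hdf5/read_hdf5 above, without
# the per-file locking. The file and dataset names are illustrative.
import numpy as np
import h5py_cache as h5c

frames = np.random.randint(0, 256, size=(10, 64, 64, 3), dtype=np.uint8)
with h5c.File('clips.h5', 'a', libver='latest',
              chunk_cache_mem_size=(1024 ** 2) * 16) as f:
    f.create_dataset('clip_0', data=frames, dtype='uint8')
with h5c.File('clips.h5', 'r', libver='latest',
              chunk_cache_mem_size=(1024 ** 2) * 16) as f:
    first_five = f['clip_0'][0:5]  # slicing reads into a numpy array
assert first_five.shape == (5, 64, 64, 3)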
def writehdf5(fovnum, num_entries, timepoint_list, file_idx, num_fovs):
    with ND2Reader(self.nd2filename) as nd2file:
        for key, item in self.nd2reader_override.items():
            nd2file.metadata[key] = item
        y_dim = self.metadata['height']
        x_dim = self.metadata['width']
        with h5py_cache.File(self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5",
                             "w",
                             chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            for i, channel in enumerate(self.metadata["channels"]):
                hdf5_dataset = h5pyfile.create_dataset(str(channel),
                                                       (num_entries, y_dim, x_dim),
                                                       chunks=self.chunk_shape,
                                                       dtype='uint16')
                for j in range(len(timepoint_list)):
                    frame = timepoint_list[j]
                    nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                    hdf5_dataset[j, :, :] = nd2_image
    return "Done."
def __getitem__(self, index):
    subset = self.subset
    with h5py_cache.File(self.h5_file, 'r',
                         chunk_cache_mem_size=1024 ** 3) as f:
        image_bytes = torch.from_numpy(f[subset]['images'][index])
        label = torch.from_numpy(f[subset]['labels'][index])
    # rescale uint8 pixel values from [0, 255] to [-1, 1]
    image = (image_bytes.float() * 2) / 255 - 1
    sample = {
        'input': image,
        'label': label,
    }
    return sample
def init_hdf5(self, file_name, dataset_name, array, t_len, t_dim_out,
              dtype='uint16', singleton_chunk_dims=[]):
    """Initializes an empty hdf5 file and dataset to write to, given an array
    with the target shape in all axes but the time axis. The size of the time
    axis is then specified by t_len.

    Args:
        file_name (str): Name of the hdf5 file, assumed to be in the temp
            folder initialized by this class.
        dataset_name (str): The name of the hdf5 dataset to initialize.
        array (array): Array of the same size as the dataset, except in the
            time dimension.
        t_len (int): Total size of the dataset time dimension.
        t_dim_out (int): Axis of the dataset time dimension.
        dtype (str, optional): The array datatype to initialize the hdf5
            dataset with. 16-bit unsigned integer by default.
        singleton_chunk_dims (list, optional): Axes whose chunk size is
            forced to 1.
    """
    out_shape = list(array.shape)
    out_shape[t_dim_out] = t_len
    out_shape = tuple(out_shape)

    # cap each chunk axis at img_chunk_size, with size 1 along time
    chunk_shape = np.array(out_shape)
    min_arr = np.ones(chunk_shape.shape, dtype=int) * self.img_chunk_size
    chunk_shape = np.minimum(chunk_shape, min_arr)
    chunk_shape[t_dim_out] = 1
    for dim in singleton_chunk_dims:
        chunk_shape[dim] = 1
    chunk_shape = tuple(chunk_shape)

    with h5py_cache.File(self.temp_path + file_name + ".hdf5", "a",
                         chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
        h5pyfile.create_dataset(dataset_name, out_shape,
                                chunks=chunk_shape, dtype=dtype)
def writehdf5(fidx_channels_paths):
    y_dim = self.metadata['height']
    x_dim = self.metadata['width']
    num_channels = len(self.metadata["channels"])
    file_idx, channels, filepaths = fidx_channels_paths
    datasets = {}
    with h5py_cache.File(self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5",
                         "w",
                         chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
        for i, channel in enumerate(self.metadata["channels"]):
            # integer division: each channel gets an equal share of the files
            hdf5_dataset = h5pyfile.create_dataset(str(channel),
                                                   (len(filepaths) // num_channels,
                                                    y_dim, x_dim),
                                                   chunks=self.chunk_shape,
                                                   dtype='uint16')
            datasets[channel] = hdf5_dataset
        for i in range(len(filepaths)):
            curr_channel = channels[i]
            curr_file = filepaths[i]
            datasets[curr_channel][i // num_channels, :, :] = imread(curr_file)
    return "Done."
def extract_fov(self, fovnum):
    nd2file = ND2Reader(self.nd2filename)
    num_fovs = len(nd2file.metadata["fields_of_view"])
    with h5py_cache.File(self.hdf5path + "/fov_" + str(fovnum) + ".hdf5", "w",
                         chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
        for i, channel in enumerate(nd2file.metadata["channels"]):
            y_dim = nd2file.metadata['height']
            x_dim = nd2file.metadata['width']
            t_dim = len(nd2file.metadata['frames'])
            hdf5_dataset = h5pyfile.create_dataset("channel_" + str(channel),
                                                   (x_dim, y_dim, t_dim),
                                                   chunks=self.chunk_shape,
                                                   dtype='uint16')
            for frame in range(len(nd2file.metadata['frames'])):
                nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                hdf5_dataset[:, :, int(frame)] = nd2_image
    nd2file.close()
def write_hdf5(self, file_name, array, ti, t_len, t_dim_out, dataset_name):
    """Writes an array to a particular dataset in an hdf5 file. Positions in
    time are left variable to enable chunking the dataset in time.

    Args:
        file_name (str): Name of the hdf5 file, assumed to be in the temp
            folder initialized by this class.
        array (array): Array to be written.
        ti (int): Initial time position to write array values to.
        t_len (int): Total size of the target time dimension.
        t_dim_out (int): Axis of the target time dimension.
        dataset_name (str): The name of the hdf5 dataset to write to.
    """
    with h5py_cache.File(self.temp_path + file_name + ".hdf5", "r+",
                         chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
        indices = list(range(ti, min(ti + self.t_chunk, t_len)))
        self.reassign_idx(h5pyfile[dataset_name], array, indices, t_dim_out)
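# A standalone sketch of the time-chunked pattern that init_hdf5 and
# write_hdf5 above implement: create a dataset with chunk size 1 along the
# time axis, then fill it one time block at a time. File name and sizes are
# illustrative, and the zero blocks stand in for computed frames.
import numpy as np
import h5py_cache

t_len, y_dim, x_dim = 100, 64, 64
t_chunk = 10  # analogous to self.t_chunk above

with h5py_cache.File('stack.hdf5', 'a',
                     chunk_cache_mem_size=64 * 1024 ** 2) as h5pyfile:
    dset = h5pyfile.create_dataset('images', (t_len, y_dim, x_dim),
                                   chunks=(1, y_dim, x_dim), dtype='uint16')
    for ti in range(0, t_len, t_chunk):
        block = np.zeros((min(t_chunk, t_len - ti), y_dim, x_dim),
                         dtype='uint16')
        dset[ti:ti + block.shape[0]] = block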
def writehdf5(fovnum, num_entries, timepoint_list, file_idx):
    with ND2Reader(self.nd2filename) as nd2file:
        y_dim = self.metadata['height']
        x_dim = self.metadata['width']
        with h5py_cache.File(self.hdf5path + "/hdf5_" + str(file_idx) + ".hdf5",
                             "w",
                             chunk_cache_mem_size=self.chunk_cache_mem_size) as h5pyfile:
            for i, channel in enumerate(self.metadata["channels"]):
                hdf5_dataset = h5pyfile.create_dataset(str(channel),
                                                       (num_entries, y_dim, x_dim),
                                                       chunks=self.chunk_shape,
                                                       dtype='uint16')
                for j in range(len(timepoint_list)):
                    frame = timepoint_list[j]
                    # frame = dataframe[j:j+1]["timepoints"].values[0]
                    # frame = self.metadf['frames'][timepoint]  # not sure if this is necessary...
                    nd2_image = nd2file.get_frame_2D(c=i, t=frame, v=fovnum)
                    hdf5_dataset[j, :, :] = nd2_image
    return "Done."
def save_image_features(img_width, img_height, all_features_hdf5,
                        all_labels_hdf5, train):
    datagen = ImageDataGenerator()
    # Build a fake model
    model = Sequential()
    generator = datagen.flow_from_directory(full_data_dir,
                                            target_size=(img_height, img_width),
                                            batch_size=batch_size,
                                            interpolation='lanczos',
                                            class_mode=None,
                                            table_pd=train)
    shape = (generator.samples, img_width, img_height, 3)
    chunk_shape = (1, 100, 100, 3)
    total_mem_usage, dividing_factor = calculateDividingFactor(shape)
    f1_all = h5c.File(all_features_hdf5, 'w',
                      chunk_cache_mem_size=total_mem_usage // dividing_factor)
    f1_label = h5.File(all_labels_hdf5, 'w')
    d1_all = f1_all.create_dataset('data', shape, dtype='float32',
                                   chunks=chunk_shape, compression="lzf")
    d1_label = f1_label.create_dataset(
        'data', (generator.samples, len(generator.table_pd.columns)),
        dtype='float32')
    model.write_generator(generator,
                          steps=int(ceil(generator.samples / batch_size)),
                          max_queue_size=10, workers=4,
                          use_multiprocessing=False, verbose=0,
                          d_set=d1_all, label_set=d1_label)
    f1_all.close()
    f1_label.close()
def main():
    ''' Set parameters '''
    parser = argparse.ArgumentParser(description='VAD on the AMI corpus')
    parser.add_argument('--SRate', type=int, default=16000, metavar='SR',
                        help="Sample rate to be used")
    parser.add_argument('--SChannels', type=int, default=1, metavar='SC',
                        help="Number of channels")
    parser.add_argument('--SWidth', type=int, default=2, metavar='SW',
                        help="Sample width")
    parser.add_argument('--FSize', type=int, default=30, metavar='FS',
                        help="Frame size in ms")
    parser.add_argument('--Path', type=str,
                        default='/home/lucas/PycharmProjects/Papers_with_code/data/AMI',
                        metavar='DP', help="Path to data folder")
    parser.add_argument('--MFCC_WinS', type=int, default=4, metavar='MWS',
                        help="MFCC window size (in frames)")
    parser.add_argument('--Generate_Datasets', type=bool, default=True,
                        metavar='GD',
                        help="Generate hdf5 datasets? (default: True)")
    parser.add_argument('--Batch_Size', type=int, default=2048, metavar='BS',
                        help="Batch size (default: 2048)")
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    args = parser.parse_args()

    # Calculate frame size in data points
    FRAME_SIZE = int(args.SRate * (args.FSize / 1000.0))

    # Change path
    PATH = args.Path
    os.chdir(PATH)

    AMI_speech = FileManager('AMI_speech_test', 'amicorpus')
    # AMI_speech.prepare_files(normalize=False, SAMPLE_RATE=args.SRate,
    #                          SAMPLE_WIDTH=args.SWidth,
    #                          SAMPLE_CHANNELS=args.SChannels)
    # AMI_speech.collect_frames(FRAME_SIZE=FRAME_SIZE)
    # AMI_speech.label_frames(PATH, FRAME_LENGTH=30)
    speech_data = AMI_speech.data
    data = h5py_cache.File('data_AMI_test.hdf5', 'a',
                           chunk_cache_mem_size=1024 ** 3)

    CUDA = True
    SLICE_MIN_MS = 2500
    SLICE_MAX_MS = 5000

    # Frame size to use for the labelling.
    # FRAME_SIZE_MS = 30

    # Convert slice ms to frame counts.
    SLICE_MIN = int(SLICE_MIN_MS / 30)
    SLICE_MAX = int(SLICE_MAX_MS / 30)

    if 'labels' not in data:
        print('Preparing Data...')
        pos = 0
        l = len(AMI_speech.data['frames'])
        slices = []

        # Split frames into slices for feature extraction. Slices are used to
        # extract features in batches so as not to overload system memory.
        while pos + SLICE_MIN < l:
            slice_indexing = (pos, pos + SLICE_MIN)
            slices.append(slice_indexing)
            pos = slice_indexing[1]

        # Add remainder to last slice.
        slices[-1] = (slices[-1][0], l)

        # Get total frame count
        total = l + args.MFCC_WinS

        # Create datasets
        data.create_dataset('frames', (total, FRAME_SIZE), dtype=np.dtype(np.int16))
        data.create_dataset('mfcc', (total, 12), dtype=np.dtype(np.float32))
        data.create_dataset('delta', (total, 12), dtype=np.dtype(np.float32))

        # Create dataset for labels
        dt = np.dtype(np.int8)
        data.create_dataset('labels', (total,), dtype=dt)

        pos = 0
        for s in slices:
            frames = AMI_speech.data['frames'][s[0]:s[1]]
            labels = AMI_speech.data['ground_truth'][s[0]:s[1]]
            if pos == 0:
                align_frames = np.zeros((args.MFCC_WinS - 1, FRAME_SIZE))
            else:
                align_frames = data['frames'][pos - args.MFCC_WinS + 1:pos]
            frames, mfcc, delta = extract_features(
                align_frames=align_frames, speech_frames=frames,
                SAMPLE_WIDTH=args.SWidth, SAMPLE_RATE=args.SRate,
                SAMPLE_CHANNELS=args.SChannels,
                mfcc_window_size=args.MFCC_WinS, FRAME_SIZE_MS=args.FSize)
            data['frames'][pos:pos + len(labels)] = frames
            data['mfcc'][pos:pos + len(labels)] = mfcc
            data['delta'][pos:pos + len(labels)] = delta
            data['labels'][pos:pos + len(labels)] = labels
            pos += len(labels)
            print('Preparing ({0:.2f} %)'.format((pos * 100) / total),
                  end='\r', flush=True)
        data.flush()
        print('\nDone')

    # Test generator features.
    # generator = DataGenerator(data, size_limit=10000)
    # generator.setup_generation(frame_count=3, step_size=5, batch_size=4)
    # generator.use_train_data()
    # X, y = generator.get_batch(50)
    # print(f'Load a few frames into memory:\n{X[0]}\n\nCorresponding label: {y[0]}')
    # print(len(AMI_speech.data['frames']))
    # Vis.plot_sample(frames=AMI_speech.data['frames'][1:],
    #                 labels=AMI_speech.data['ground_truth'][1:])
    # Vis.plot_sample_webrtc(frames=AMI_speech.data['frames'][1:10000], sensitivity=0)
    # generator.plot_data(0, 400000)
    # print(data['frames'][0:2])
    # net = Net(large=False)
    # count_params(net)
    # print(net)

    # Test generator
    generator = DataGenerator(data)
    generator.setup_generation(frame_count=30, step_size=6, batch_size=2048)
    generator.use_train_data()
    print(generator.batch_count, 'training batches were found')

    # Compact instantiation of an untrained network on the CPU
    temp, CUDA = CUDA, False
    net, CUDA = Net(large=False, CUDA=CUDA), temp
    del temp
    test_net(net=net, num_batches=1, generator=generator)

    # Test simple LSTM network
    set_seed(1001, CUDA=CUDA)
    # net = BiRNN(num_in=30, num_hidden=30, batch_size=2048, large=True,
    #             lstm=False, fcl=True, bidir=True, OBJ_CUDA=True)
    net = Net(large=True, lstm=False)
    train_net(net, data=data)
    # get_xml(PATH)

    # netvad(net, data, noise_level='-3', init_pos=50, length=700,
    #        only_plot_net=False, timeit=True, FRAMES=30, STEP_SIZE=6,
    #        BATCH_SIZE=2048, FEATURES=24, OBJ_CUDA=True)
    netvad(net=net, data=data, init_pos=10000, length=20000,
           only_plot_net=True, timeit=True, FRAMES=30, STEP_SIZE=6,
           BATCH_SIZE=2048, FEATURES=24, OBJ_CUDA=True)
def __init__(self, file_path, start=0, end=None):
    super(H5Dataset, self).__init__()
    with h5c.File(file_path, 'r',
                  chunk_cache_mem_size=CHUNK_CACHE_MEM_SIZE) as f:
        self.images = torch.from_numpy(f['images'][start:end])
        self.labels = torch.from_numpy(f['labels'][start:end]).to(torch.int32)
    print("Loaded images of shape {}, type {}, and labels of shape {}, type {}."
          .format(self.images.shape, self.images.dtype,
                  self.labels.shape, self.labels.dtype))
dataset_path = '/mnt/hdd/PROX/snapshot_realcams_v3'
outfilename = 'realcams.hdf5'
h5file_path = os.path.join('/home/yzhang/Videos/PROXE', outfilename)

batch_gen = BatchGeneratorWithSceneMeshMatfile(
    dataset_path=dataset_path,
    scene_verts_path='/home/yzhang/Videos/PROXE/scenes_downsampled',
    scene_sdf_path='/home/yzhang/Videos/PROXE/scenes_sdf',
    device=torch.device('cuda'))

### create the dataset used in the hdf5 file
with h5c.File(h5file_path, mode='w',
              chunk_cache_mem_size=1024 ** 2 * 128) as hdf5_file:
    while batch_gen.has_next_batch():
        train_data = batch_gen.next_batch(1)
        if train_data is None:
            continue
        train_data_np = [x.detach().cpu().numpy() for x in train_data[:-1]]
        break
    [depth_batch, seg_batch, body_batch, cam_ext_batch, cam_int_batch,
     max_d_batch,
import numpy as np
import os
import sys

import h5py_cache as h5c

CHUNK_CACHE = 1024 ** 2 * 4000

if len(sys.argv) < 4:
    print('Error: not enough input arguments!')
    print('Usage: python3 shuffle_h5_mem_2pass.py IN_H5 OUT_H5 STEPS')
    exit(-1)

fin = sys.argv[1]
fout = sys.argv[2]
steps = int(sys.argv[3])

with h5c.File(fin, 'r', chunk_cache_mem_size=CHUNK_CACHE) as f:
    images_shape = f['images'].shape
    labels_shape = f['labels'].shape
    n = labels_shape[0]
    max_in_memory = int(np.floor(n / steps))
    blocks = list(np.arange(0, n, max_in_memory)) + [labels_shape[0]]
    blocks = blocks[:-1]
    blocks[-1] = n
    m = len(blocks) - 1
    print('n:', n, 'm:', m, 'blocks:', blocks)
    chunk_shape = (1000,) + images_shape[1:]
    db_idx = np.random.permutation(np.repeat(np.arange(m), max_in_memory))
    print('db_idx:', len(db_idx))
    offsets = [0] * m

    # 1st pass: assign instances to randomly chosen bins.
def main(args):
    if tf.test.is_gpu_available():
        print(bc.OKGREEN + bc.BOLD + '#' * 9 + ' USING GPU ' + '#' * 9 + bc.ENDC)
    else:
        print(bc.FAIL + bc.BOLD + '#' * 9 + ' NOT USING GPU ' + '#' * 9 + bc.ENDC)

    # Get agent name
    tokens = args.p.split('/')
    if args.p[-1] == '/':
        assert (tokens.pop() == '')
    dir_agent = '-'.join(tokens[-1].split('_')[:-3]) + '.save'
    print(dir_agent)

    # Run this first to avoid failing after huge overhead
    model_ok, initial_epoch = model_exists(args.m, dir_agent)
    PATH_DIR_SAVE = os.path.join(args.m, dir_agent)
    PATH_DIR_CKPT = os.path.join(PATH_DIR_SAVE, 'ckpts')
    n_epoch = args.epochs

    hypers = {
        'lr': 0.00015,
        'batch_size': 128,
        'hl_activations': [ReLU, ReLU, ReLU, ReLU, ReLU, ReLU],
        'hl_sizes': [1024, 1024, 512, 512, 512, 256],
        'decay': 0.,
        'bNorm': True,
        'dropout': True,
        'regularizer': None
    }

    # Check the input data format.
    if args.p.split('.')[-1] in ['hdf5', 'HDF5']:
        f = h5py_cache.File(args.p, 'r', chunk_cache_mem_size=1 * 1024 ** 3,
                            swmr=True)
        gen_tr = Gen4h5(f['X_tr'], f['Y_tr'], hypers['batch_size'], False)
        gen_va = Gen4h5(f['X_va'], f['Y_va'], hypers['batch_size'], False)
    else:
        X, Y, mask = CV(args.p)
        gen_tr = DataGenerator(X[mask], Y[mask], hypers['batch_size'])
        gen_va = DataGenerator(X[~mask], Y[~mask], 1000)

    os.makedirs(PATH_DIR_CKPT, exist_ok=True)

    # Callbacks: save best & latest models.
    callbacks = [
        ModelCheckpoint(os.path.join(PATH_DIR_SAVE, 'best.h5'),
                        monitor='val_loss', verbose=1, save_best_only=True,
                        save_weights_only=True, mode='auto', period=1),
        ModelCheckpoint(os.path.join(PATH_DIR_CKPT,
                                     '{epoch:02d}-{val_accuracy:.2f}.h5'),
                        monitor='val_loss', verbose=1, save_best_only=False,
                        save_weights_only=True, mode='auto', period=1),
        CSVLogger(os.path.join(PATH_DIR_SAVE, 'training.log'), append=True)
    ]

    m = Mlp(io_sizes=(glb.SIZE_OBS_VEC, glb.SIZE_ACT_VEC),
            out_activation=Softmax, loss='categorical_crossentropy',
            metrics=['accuracy'], **hypers, verbose=1)

    if model_ok:
        # Continue from the previously saved model
        msg = "Saved model found. Resuming training."
        print(bc.OKGREEN + bc.BOLD + msg + bc.ENDC)
        h5s = os.listdir(PATH_DIR_CKPT)
        h5s.sort()
        saved_h5 = os.path.join(PATH_DIR_CKPT, h5s[-1])
        m.construct_model(saved_h5, weights_only=True)
    else:
        # Create a new model
        msg = "{} doesn't exist or is empty. Creating new model."
        print(bc.WARNING + bc.BOLD + msg.format(PATH_DIR_SAVE) + bc.ENDC)
        os.makedirs(PATH_DIR_CKPT, exist_ok=True)
        m.construct_model()

    m.train_model(gen_tr, gen_va, n_epoch=n_epoch, callbacks=callbacks,
                  verbose=False, workers=args.w, use_mp=True,
                  max_q_size=args.q, initial_epoch=initial_epoch)
def transpose(dset_in, file_name_out, dset_name_out,
              chunk_cache_mem_size=1024 ** 3, w0=1.0, dtype=None,
              R2=None, C2=None, close_file_when_done=False, access_axis=1,
              show_progress=False):
    """Transpose a large HDF5 matrix for fast access along the transposed row.

    Assuming the input dataset is chunked along its column (second) index,
    this function transposes it to an output dataset that is chunked along
    its column index. Done naively, this operation can be extraordinarily
    slow, because we need to access rows in the input in order to write
    columns in the output. But row access is slow because the entire column
    chunk needs to be read. A similar problem occurs for writing. This
    function splits the difference, using the chunk cache to maximum
    advantage. Roughly speaking, if we have sufficient memory to store M
    elements, we can set the output chunk size to sqrt(M) and process sqrt(M)
    chunks at a time.

    The input dataset may actually be one-dimensional, but interpreted as a
    matrix of shape R1xC1 = C2xR2. Or the input may simply be a matrix of
    that shape, in which case R2 and C2 will be inferred.

    Parameters
    ----------
    dset_in : h5py.Dataset
        Open dataset in an h5py File object used as input.
    file_name_out : str
    dset_name_out : str
        Names of the file, and of the dataset in that file, used for output.
    chunk_cache_mem_size : int
    w0 : float
        Parameters passed to `h5py_File_with_cache`. See that function's
        documentation.
    dtype : dtype for h5py dataset
        Default is None, in which case the dtype of dset_in is used.
    R2, C2 : int
        Number of rows and columns in the returned dataset. These default to
        None, in which case the shape is inferred from the shape of the
        input. Note that if the input is 1-D, this could turn out badly,
        because each row would have just one element, which would be slow.
    close_file_when_done : bool
        If True, close the output file and return None. If False, return the
        file handle and dataset. Default is False.
    access_axis : int
        Axis along which the output array will be accessed. Default is 1,
        meaning the second (column) index. Anything else is interpreted as 0.
    show_progress : bool
        Periodically show progress through the main loop. Default is False.
""" import numpy as np import h5py import h5py_cache if not dtype: dtype = dset_in.dtype # Figure out the basic output sizes if R2 is None: if len(dset_in.shape) > 1: R2 = dset_in.shape[1] else: R2 = 1 if C2 is None: C2 = dset_in.shape[0] bytes_per_object = np.dtype(dtype).itemsize num_chunk_elements = chunk_cache_mem_size // bytes_per_object sqrt_n_c_e = int(np.sqrt(num_chunk_elements)) assert R2 * C2 == dset_in.size, ( "Requested size {0}*{1}={2}".format(R2, C2, R2 * C2) + " is incompatible with input size {0}".format(dset_in.size)) # If the transposition can be done in memory, just do it if dset_in.size <= num_chunk_elements: # print("Doing transpose in memory") file_out = h5py.File(file_name_out, 'a') if dset_name_out in file_out: del file_out[dset_name_out] dset_out = file_out.create_dataset(dset_name_out, shape=(R2, C2), dtype=dtype) dset_out[:] = (dset_in[:].reshape(C2, R2).T).astype(dtype) else: # Set up output file and dset if access_axis == 1: n_cache_chunks = min(sqrt_n_c_e, R2) chunk_size = min(num_chunk_elements // n_cache_chunks, C2) chunks = (1, chunk_size) else: n_cache_chunks = min(sqrt_n_c_e, C2) chunk_size = min(num_chunk_elements // n_cache_chunks, R2) chunks = (chunk_size, 1) file_out = h5py_cache.File(file_name_out, 'a', chunk_cache_mem_size, w0, n_cache_chunks) if dset_name_out in file_out: del file_out[dset_name_out] dset_out = file_out.create_dataset(dset_name_out, shape=(R2, C2), dtype=dtype, chunks=chunks) # Depending on whether input is 1-D or 2-D, we do this differently if len(dset_in.shape) == 1: def submatrix_dset_in(r2a, r2b, c2a, c2b): temp = np.empty((c2b - c2a, r2b - r2a), dtype=dtype) C1 = R2 c1a, c1b = r2a, r2b for r1 in range(c2a, c2b): try: temp[r1 - c2a] = (dset_in[r1 * C1 + c1a:r1 * C1 + c1b]).astype(dtype) except ValueError: print(r2a, r2b, "\t", c2a, c2b, "\n", r1, c1a, c1b, "\t", C1) print(r1 - c2a, r1 * C1 + c1a, r1 * C1 + c1b, dset_in.shape, temp.shape) raise return temp else: def submatrix_dset_in(r2a, r2b, c2a, c2b): return (dset_in[c2a:c2b, r2a:r2b]).astype(dtype) # Do the transposition i = 1 for c2a in range(0, C2, chunk_size): for r2a in range(0, R2, n_cache_chunks): if show_progress: print("\t\t\t{0} of {1}".format( i, int( np.ceil(C2 / chunk_size) * np.ceil(R2 / n_cache_chunks)))) c2b = min(C2, c2a + chunk_size) r2b = min(R2, r2a + n_cache_chunks) dset_out[r2a:r2b, c2a:c2b] = submatrix_dset_in(r2a, r2b, c2a, c2b).T i += 1 if close_file_when_done: file_out.close() return else: return file_out, dset_out
parser.add_argument("savefile", type=str, help="File to save results") parser.add_argument("-s", "--size", type=int, help="Batch size") parser.add_argument("-f", "--freq", type=float, help="Central gaussian frequency") parser.add_argument("-w", "--width", type=float, help="Width of gaussian frequency") args = parser.parse_args() cfreq = args.freq freq_width = args.width data = h5py_cache.File(args.file, "r", chunk_cache_mem_size=500 * 1024**2) print("Opening {} as data file...".format(sys.argv[1])) keys = list(data.keys()) data = [data[key] for key in keys] # read all datasets in the hf5 file print("Components: {}".format(keys)) print("Data {}".format(data[0].shape)) rfile = h5py_cache.File(args.reffile, "r", chunk_cache_mem_size=500 * 1024**2) reference = [rfile[key] for key in keys] print("Reference {}".format(reference[0].shape)) l = data[0].shape[2] # length of temporal dimension interp = 4 # optional zero padding of data to increase resilution, which is equivalent to interpolation flen = (l * interp) // 2 # length of data in frequency # domain is 2 times smaller due to FFT symmetry print("Temporal length of data {}, length of freq domain {}".format(l, flen))
datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                             validation_split=0.2, apply_gen_transform=True)
f1_trainvalidation = h5.File(all_features_hdf5, 'r')
shape = f1_trainvalidation['data'].shape
f1_trainvalidation.close()

# 15 GiB is the upper limit for cache memory.
total_mem_usage, dividing_factor = calculateDividingFactor(shape)
# print("Dividing factor: {}".format(dividing_factor))
print("Cached memory usage: {}".format(
    total_mem_usage / (1024 ** 3) / dividing_factor))
chunk_shape = (1, 100, 100, 3)
f1_trainvalidation = h5c.File(all_features_hdf5, 'r',
                              chunk_cache_mem_size=total_mem_usage // dividing_factor)
f1_label = h5.File(all_labels_hdf5, 'r')
train_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                    f1_label['data'], subset='training',
                                    batch_size=batch_size, shuffle=False)
validation_generator = datagen.flow_hdf5(f1_trainvalidation['data'],
                                         f1_label['data'], subset='validation',
                                         batch_size=batch_size, shuffle=False)

# fine-tune the model
model.fit_generator(