def _write_sample_results(self, results, full_mask, roi_indices):
    """Write the sample results to per-map ``<map_name>.samples.npy`` files.

    An existing sample file is reused (opened in ``r+`` mode) only when it is
    two dimensional, has at least as many rows as there are voxels in the
    full mask and has the same number of samples per voxel. In every other
    case a new, zero-filled file of shape ``(total_nmr_voxels, nmr_samples)``
    is created, which discards the old content.

    Args:
        results (dict): the samples to write, map name -> 2d ndarray with one
            row per computed voxel
        full_mask (ndarray): the complete mask for the entire brain
        roi_indices (ndarray): the roi indices of the voxels we computed
    """
    total_nmr_voxels = np.count_nonzero(full_mask)

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(self._output_dir, exist_ok=True)

    for map_name, samples in results.items():
        samples_path = os.path.join(self._output_dir, map_name + '.samples.npy')
        nmr_samples = samples.shape[1]

        mode = 'w+'
        if os.path.isfile(samples_path):
            current_results = open_memmap(samples_path, mode='r')
            # In 'r+' mode the shape argument below is ignored, so a file that
            # is too small must be recreated instead of reused.
            if (current_results.ndim == 2
                    and current_results.shape[0] >= total_nmr_voxels
                    and current_results.shape[1] == nmr_samples):
                mode = 'r+'
            del current_results  # closes the memmap

        saved = open_memmap(samples_path, mode=mode, dtype=samples.dtype,
                            shape=(total_nmr_voxels, nmr_samples))
        saved[roi_indices, :] = samples
        saved.flush()
def _write_volumes(self, roi_indices, results, tmp_dir):
    """Write the result arrays to the temporary storage.

    Every entry of ``results`` is scattered into ``<tmp_dir>/<name>.npy``, a
    memory mapped array shaped like the first three mask dimensions plus one
    trailing dimension. Existing files are opened in place so that successive
    calls accumulate results. A boolean map named after ``_used_mask_name``
    records which voxels have been written.

    Args:
        roi_indices (ndarray): the indices of the voxels we computed
        results (dict): the dictionary with the results to save
        tmp_dir (str): the directory to save the intermediate results to
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(tmp_dir, exist_ok=True)

    volume_indices = self._volume_indices[roi_indices, :]
    x_ind, y_ind, z_ind = (volume_indices[:, axis] for axis in range(3))
    volume_shape = self._problem_data.mask.shape[0:3]

    for param_name, result_array in results.items():
        storage_path = os.path.join(tmp_dir, param_name + '.npy')

        map_4d_dim_len = 1
        if len(result_array.shape) > 1:
            map_4d_dim_len = result_array.shape[1]
        else:
            # one value per voxel: give it an explicit trailing dimension
            result_array = np.reshape(result_array, (-1, 1))

        mode = 'r+' if os.path.isfile(storage_path) else 'w+'
        tmp_matrix = open_memmap(storage_path, mode=mode, dtype=result_array.dtype,
                                 shape=volume_shape + (map_4d_dim_len,))
        tmp_matrix[x_ind, y_ind, z_ind] = result_array
        tmp_matrix.flush()

    mask_path = os.path.join(tmp_dir, '{}.npy'.format(self._used_mask_name))
    mode = 'r+' if os.path.isfile(mask_path) else 'w+'
    # np.bool_ instead of np.bool: the np.bool alias does not exist in
    # NumPy 1.24 - 1.26 and raises AttributeError there.
    tmp_mask = open_memmap(mask_path, mode=mode, dtype=np.bool_,
                           shape=self._problem_data.mask.shape)
    tmp_mask[x_ind, y_ind, z_ind] = True
    tmp_mask.flush()
def test_version_2_0_memmap(tmpdir):
    """Exercise ``open_memmap`` with a dtype whose header needs format 2.0.

    Requesting version (1, 0) has to fail, version (2, 0) has to round-trip,
    and leaving the version unspecified has to warn and still round-trip.
    """
    # requires more than 2 byte for header
    field_spec = [(str(i) * 100, float) for i in range(500)]
    expected = np.ones(1000, dtype=field_spec)
    explicit_path = os.path.join(tmpdir, 'version2_01.npy')
    guessed_path = os.path.join(tmpdir, 'version2_02.npy')
    create_kwargs = dict(mode='w+', dtype=expected.dtype, shape=expected.shape)

    # 1.0 requested but data cannot be saved this way
    assert_raises(ValueError, format.open_memmap, explicit_path,
                  version=(1, 0), **create_kwargs)

    # explicit 2.0: write, flush, read back
    mapped = format.open_memmap(explicit_path, version=(2, 0), **create_kwargs)
    mapped[...] = expected
    mapped.flush()
    mapped = format.open_memmap(explicit_path, mode='r')
    assert_array_equal(mapped, expected)

    # no version given: the writer has to pick one and tell the user
    with warnings.catch_warnings(record=True) as caught:
        warnings.filterwarnings('always', '', UserWarning)
        mapped = format.open_memmap(guessed_path, version=None, **create_kwargs)
        assert_(caught[0].category is UserWarning)
        mapped[...] = expected
        mapped.flush()

    mapped = format.open_memmap(guessed_path, mode='r')
    assert_array_equal(mapped, expected)
def evaluate_model(model, global_params, output_dir):
    """Evaluate ``model`` on the test set and save expected vs. predicted values.

    The test inputs and outputs are memory mapped from the paths stored under
    ``'testing-x'`` and ``'testing-y'``. Both the expected outputs and the
    predictions are mapped back with the mean/std values read from the
    ``'transform-y'`` file. Pearson r and RMSD are printed for the two output
    columns, and a four column array (expected scale, expected shift,
    predicted scale, predicted shift) is saved as
    ``<output_dir>/test-output.npy``.

    Args:
        model: object with a ``predict(inputs, batch_size)`` method
        global_params (dict): needs the keys ``'testing-x'``, ``'testing-y'``,
            ``'transform-y'`` and ``'batchsize_test'``
        output_dir (str): existing directory that receives ``test-output.npy``
    """
    testing_input = open_memmap(global_params['testing-x'])
    testing_output = open_memmap(global_params['testing-y'])

    # NOTE(review): eval() executes whatever code the transform file holds.
    # Only use it with trusted files; switch to ast.literal_eval() or JSON if
    # the file is guaranteed to be a plain literal.
    with open(global_params['transform-y']) as xfrm_file:
        xfrm_params = eval(xfrm_file.read())

    def _denormalize(mtx):
        # column 0 is the scale term, column 1 the shift term
        return np.array([
            (mtx[:, 0] * xfrm_params['scale_std']) + xfrm_params['scale_mean'],
            (mtx[:, 1] * xfrm_params['shift_std']) + xfrm_params['shift_mean'],
        ]).T

    predmtx = model.predict(testing_input, global_params['batchsize_test'])
    expected_mtx = _denormalize(testing_output)
    predmtx = _denormalize(predmtx)
    dt = np.hstack([expected_mtx, predmtx])

    print("\t\tscale\tshift")
    print("Pearson r\t{:.5f}\t{:.5f}".format(
        pearsonr(dt[:, 0], dt[:, 2])[0],
        pearsonr(dt[:, 1], dt[:, 3])[0]))
    print("RMSD\t\t{:.5f}\t{:.5f}".format(
        ((dt[:, 0] - dt[:, 2])**2).mean()**0.5,
        ((dt[:, 1] - dt[:, 3])**2).mean()**0.5))

    np.save(os.path.join(output_dir, 'test-output.npy'), dt)
def _store_sample(self, optimization_results, roi_indices, sample_ind):
    """Store the optimization results as a next sample.

    On the first call the per-key storage is opened as
    ``<output_dir>/<key>.samples.npy``. An existing file is reused only when
    it has the expected number of dimensions, at least ``_total_nmr_voxels``
    rows and matching remaining dimensions; otherwise a new zero-filled file
    is created. Later calls only write into the already opened memmaps.

    Args:
        optimization_results (dict): key -> ndarray with one entry (or row)
            per computed voxel
        roi_indices (ndarray): the rows (voxels) the values belong to
        sample_ind (int): the column (sample number) to write
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(self._output_dir, exist_ok=True)

    if self._sample_storage is None:
        self._sample_storage = {}
        for key, value in optimization_results.items():
            samples_path = os.path.join(self._output_dir, key + '.samples.npy')
            # value.shape[1:] is empty for one dimensional values
            shape = (self._total_nmr_voxels, self._nmr_samples) + tuple(value.shape[1:])

            mode = 'w+'  # opening the memmap with w+ creates a new one
            if os.path.isfile(samples_path):
                current_results = open_memmap(samples_path, mode='r')
                # 'r+' ignores the requested shape, so only reuse a file that
                # can really hold every voxel and sample.
                if (current_results.ndim == len(shape)
                        and current_results.shape[0] >= shape[0]
                        and current_results.shape[1:] == shape[1:]):
                    mode = 'r+'
                del current_results  # closes the memmap

            self._sample_storage[key] = open_memmap(
                samples_path, mode=mode, dtype=value.dtype, shape=shape)

    for key, value in optimization_results.items():
        self._sample_storage[key][roi_indices, sample_ind] = value
def test_alloc(self):
    """Check input validation and the reported shape/dtype of VirtualData."""
    with tempfile.TemporaryDirectory() as tdir:
        base = os.path.join(tdir, "vdat")

        # np.save appends the ".npy" suffix itself
        np.save(base, self.data)
        flat_map = open_memmap(base + ".npy")

        # illegal type
        with pytest.raises(SPYTypeError):
            VirtualData({})

        # 2darray expected
        np.save(base + "3", np.ones((2, 3, 4)))
        cube_map = open_memmap(base + "3.npy")
        with pytest.raises(SPYValueError):
            VirtualData([cube_map])

        # rows/cols don't match up
        with pytest.raises(SPYValueError):
            VirtualData([flat_map, flat_map.T])

        # check consistency of VirtualData object
        n_rows, n_cols = flat_map.shape[0], flat_map.shape[1]
        for n_chunks in range(2, 6):
            vdata = VirtualData([flat_map] * n_chunks)
            assert vdata.dtype == flat_map.dtype
            assert vdata.M == n_rows
            assert vdata.N == n_chunks * n_cols

        # Delete all open references to file objects b4 closing tmp dir
        del flat_map, vdata, cube_map
def test_memmap_roundtrip():
    """Yield one byte comparison per mmap-able sample array.

    Each array is written once with ``format.write_array`` and once through a
    writable memmap; the two files must be identical. The normally written
    file is then reopened read-only through ``open_memmap``.
    """
    # XXX: test crashes nose on windows. Fix this
    if sys.platform in ("win32", "cygwin"):
        return

    for arr in basic_arrays + record_arrays:
        if arr.dtype.hasobject:
            # Skip these since they can't be mmap'ed.
            continue

        # Write it out normally and through mmap.
        normal_path = os.path.join(tempdir, "normal.npy")
        mmap_path = os.path.join(tempdir, "memmap.npy")
        with open(normal_path, "wb") as fp:
            format.write_array(fp, arr)

        is_fortran = arr.flags.f_contiguous and not arr.flags.c_contiguous
        mapped = format.open_memmap(mmap_path, mode="w+", dtype=arr.dtype,
                                    shape=arr.shape, fortran_order=is_fortran)
        mapped[...] = arr
        del mapped

        # Check that both of these files' contents are the same.
        with open(normal_path, "rb") as fp:
            normal_bytes = fp.read()
        with open(mmap_path, "rb") as fp:
            memmap_bytes = fp.read()
        yield assert_equal, normal_bytes, memmap_bytes

        # Check that reading the file using memmap works.
        mapped = format.open_memmap(normal_path, mode="r")
        # yield assert_array_equal, ma, arr
        del mapped
def test_memmap_roundtrip(tmpdir):
    """Compare files written normally with files written through a memmap.

    Every sample array without object fields gets its own pair of files in
    ``tmpdir``; both files must hold exactly the same bytes, and the normally
    written file must be readable through ``open_memmap``.
    """
    samples = basic_arrays + record_arrays
    for index, arr in enumerate(samples):
        if arr.dtype.hasobject:
            # Skip these since they can't be mmap'ed.
            continue

        # Write it out normally and through mmap.
        normal_path = os.path.join(tmpdir, f'normal{index}.npy')
        mmap_path = os.path.join(tmpdir, f'memmap{index}.npy')
        with open(normal_path, 'wb') as fp:
            format.write_array(fp, arr)

        flags = arr.flags
        is_fortran = flags.f_contiguous and not flags.c_contiguous
        mapped = format.open_memmap(mmap_path, mode='w+', dtype=arr.dtype,
                                    shape=arr.shape, fortran_order=is_fortran)
        mapped[...] = arr
        mapped.flush()

        # Check that both of these files' contents are the same.
        file_bytes = []
        for path in (normal_path, mmap_path):
            with open(path, 'rb') as fp:
                file_bytes.append(fp.read())
        assert_equal_(file_bytes[0], file_bytes[1])

        # Check that reading the file using memmap works.
        mapped = format.open_memmap(normal_path, mode='r')
        mapped.flush()
def gen_bone_data(arg):
    """Generate bone data from joint data for NTU skeleton dataset.

    Loads the joint array from ``arg.data_path`` (or from a hard-coded default
    file when that is empty), creates a float32 memmap with two channels and
    stores, for every joint pair in ``paris['xview']``, the difference
    ``joint1 - joint2`` at the position of the first joint.

    NOTE(review): when ``arg.data_path`` is set, the output memmap is created
    at that very same path in ``w+`` mode, i.e. the joint file that was just
    loaded is overwritten. Confirm that this is intended.
    NOTE(review): the output has a fixed channel count of 2; the assignments
    below only broadcast when the input has at most 2 channels. Confirm.
    """
    if arg.data_path:
        joint_path = arg.data_path
        bone_path = arg.data_path
    else:
        joint_path = r'C:\Users\chuaz\Unofficial-DGNN-PyTorch\data\test_data_joint.npy'
        bone_path = r'C:\Users\chuaz\Unofficial-DGNN-PyTorch\data\test_data_bone.npy'

    joints = np.load(joint_path)
    n_samples, n_channels, n_frames, n_joints, n_bodies = joints.shape

    bones = open_memmap(bone_path, dtype='float32', mode='w+',
                        shape=(n_samples, 2, n_frames, n_joints, n_bodies))

    # Copy the joints data to bone placeholder tensor
    bones[:, :n_channels, :, :, :] = joints

    for joint_a, joint_b in tqdm(paris['xview']):
        # Reduce class index for NTU datasets
        a = joint_a - 1
        b = joint_b - 1
        # Assign bones to be joint1 - joint2, the pairs are pre-determined and hardcoded
        # There also happens to be 25 bones
        bones[:, :, :, a, :] = joints[:, :, :, a, :] - joints[:, :, :, b, :]
def __init__(
    self,
    transient_path,
    parameter_path,
    noise_multiplier,
    noise_path=None,
):
    """Memory map the transient, parameter and (optional) noise arrays.

    Args:
        transient_path: path of the ``.npy`` file with the transients
        parameter_path: path of the ``.npy`` file with the parameters
        noise_multiplier: stored unchanged on the instance
        noise_path: optional path of a ``.npy`` file with noise; when it is
            ``None`` ``noise_mmap`` is ``None`` and ``noise_len`` is not set
    """
    super(CustomTransientDataset, self).__init__()

    # all arrays are opened read-only
    self.transient_mmap = open_memmap(transient_path, mode="r")
    self.parameter_mmap = open_memmap(parameter_path, mode="r")
    self.noise_multiplier = noise_multiplier

    has_noise = noise_path is not None
    self.noise_mmap = open_memmap(noise_path, mode="r") if has_noise else None
    if has_noise:
        self.noise_len = len(self.noise_mmap)

    transient_shape = self.transient_mmap.shape
    self.shape = transient_shape[-2:]
    self.length = len(self.transient_mmap)

    print(f"Transient shapes: {self.transient_mmap.shape}")
    if has_noise:
        print(f"Noise shapes: {self.noise_mmap.shape}")
def test_memmap_roundtrip():
    """Yield a byte-equality check for every sample array that can be mmap'ed.

    The same array is stored with ``format.write_array`` and through a
    writable memmap; both files have to contain the same bytes.
    """
    # Fixme: test crashes nose on windows.
    on_windows = sys.platform == 'win32' or sys.platform == 'cygwin'
    if on_windows:
        return

    def _read_bytes(path):
        with open(path, 'rb') as handle:
            return handle.read()

    nfn = os.path.join(tempdir, 'normal.npy')
    mfn = os.path.join(tempdir, 'memmap.npy')

    for arr in basic_arrays + record_arrays:
        if arr.dtype.hasobject:
            # Skip these since they can't be mmap'ed.
            continue

        # Write it out normally and through mmap.
        with open(nfn, 'wb') as handle:
            format.write_array(handle, arr)

        fortran_order = (
            arr.flags.f_contiguous and not arr.flags.c_contiguous)
        writer = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
                                    shape=arr.shape, fortran_order=fortran_order)
        writer[...] = arr
        del writer

        # Check that both of these files' contents are the same.
        normal_bytes = _read_bytes(nfn)
        memmap_bytes = _read_bytes(mfn)
        yield assert_equal_, normal_bytes, memmap_bytes

        # Check that reading the file using memmap works.
        reader = format.open_memmap(nfn, mode='r')
        del reader
def test_version_2_0_memmap():
    """Check ``open_memmap`` for a dtype whose header does not fit format 1.0.

    The same temporary file is written with an explicit version (2, 0) and
    then again without a version, which has to raise a ``UserWarning``.
    """
    # requires more than 2 byte for header
    field_spec = [(str(i) * 100, float) for i in range(500)]
    expected = np.ones(1000, dtype=field_spec)
    target = tempfile.mktemp('', 'mmap', dir=tempdir)
    create_kwargs = dict(mode='w+', dtype=expected.dtype, shape=expected.shape)

    # 1.0 requested but data cannot be saved this way
    assert_raises(ValueError, format.open_memmap, target,
                  version=(1, 0), **create_kwargs)

    mapped = format.open_memmap(target, version=(2, 0), **create_kwargs)
    mapped[...] = expected
    del mapped

    with warnings.catch_warnings(record=True) as caught:
        warnings.filterwarnings('always', '', UserWarning)
        mapped = format.open_memmap(target, version=None, **create_kwargs)
        assert_(caught[0].category is UserWarning)
        mapped[...] = expected
        del mapped

    mapped = format.open_memmap(target, mode='r')
    assert_array_equal(mapped, expected)
def save_data(part, out_path, sample_label, sample_name, sample_data,
              valid_frame_num):
    """Write labels, skeleton data and frame counts of one dataset part.

    Creates ``<part>_label.pkl``, ``<part>_data.npy`` and
    ``<part>_num_frame.npy`` inside ``out_path``. ``valid_frame_num`` is
    accepted but not used by this function.
    """
    n_labels = len(sample_label)
    n_names = len(sample_name)

    with open('{}/{}_label.pkl'.format(out_path, part), 'wb') as label_file:
        pickle.dump((sample_name, list(sample_label)), label_file)

    # model will be pre-trained on NTU RGB-D which was trained with 2 possible
    # skeletons, in cad we only have 1 body at a time, but want to have the
    # same data dimensions
    data_map = open_memmap(
        '{}/{}_data.npy'.format(out_path, part),
        dtype='float32',
        mode='w+',
        shape=(n_labels, 3, _window_size, _num_joint, 2))

    # num of frames of every sample stored here, every sample has equal length now
    frame_map = open_memmap(
        '{}/{}_num_frame.npy'.format(out_path, part),
        dtype='int',
        mode='w+',
        shape=(n_labels, ))

    for idx in range(n_names):
        progress_text = '({:>5}/{:<5}) Processing {:<5} data: '.format(
            idx + 1, n_names, part)
        print_toolbar(idx * 1.0 / n_labels, progress_text)
        # only the first body slot is filled
        data_map[idx, :, :, :, 0] = sample_data[idx, :, :, :]
        frame_map[idx] = _cut_frames  # num_frame
    end_toolbar()
def main(global_params, output_dir):
    """Train a model, convert it for CPU use and evaluate both variants.

    ``output_dir`` is wiped and recreated when it already holds at least one
    entry whose name does not start with a dot, and created when it does not
    exist yet.
    """
    train_x = open_memmap(global_params['training-x'])
    train_y = open_memmap(global_params['training-y'])

    model = create_training_model(global_params, train_x.shape[1:],
                                  train_y.shape[1])

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    elif any((not entry.startswith('.')) for entry in os.listdir(output_dir)):
        print('Clearing {}...'.format(output_dir))
        shutil.rmtree(output_dir)
        os.mkdir(output_dir)

    train_model(model, global_params, train_x, train_y, output_dir)

    print('Adopting the weights to a new model for CPU')
    cpu_model = convert_model_to_noncudnn(model, global_params, train_x,
                                          train_y, output_dir)

    # release the training memmaps before evaluation
    del train_x, train_y

    for title, candidate in (('Evaluation of the CUDA model', model),
                             ('Evaluation of the CPU model', cpu_model)):
        print(title)
        evaluate_model(candidate, global_params, output_dir)
def generate_data(data_path, out_path, ignore_sample_path=None, benchmark='cv',
                  dataset='test'):
    """Convert the skeleton files of one benchmark split to ``.npy``/``.pkl``.

    File names are parsed for the action class (after ``A``), subject id
    (after ``P``) and camera id (after ``C``). For ``dataset == 'train'`` 5%
    of the selected samples are split off and written as ``val_label.pkl`` /
    ``val_data.npy`` first.

    Raises:
        ValueError: for an unknown ``benchmark`` or ``dataset`` (raised while
            the first file is inspected)
    """
    def _id_after(filename, marker):
        # the three digits that follow the first occurrence of the marker
        pos = filename.find(marker)
        return int(filename[pos + 1:pos + 4])

    def _fill(target, names, n_labels, tag):
        # read every skeleton file and copy it into the memmap
        for idx, s in enumerate(names):
            print_output(idx * 1.0 / n_labels,
                         '({:>5}/{:<5}) Processing {:>5}-{:<5} data: '
                         .format(idx + 1, len(names), benchmark, tag))
            data = read_xyz(os.path.join(data_path, s), max_body=max_body,
                            num_joint=num_joint)
            target[idx, :, 0:data.shape[1], :, :] = data

    ignore_samples = []
    if ignore_sample_path is not None:
        with open(ignore_sample_path, 'r') as f:
            ignore_samples = [line.strip() + '.skeleton' for line in f.readlines()]

    sample_name = []
    sample_label = []
    for filename in os.listdir(data_path):
        if filename in ignore_samples:
            continue

        action_class = _id_after(filename, 'A')
        subject_id = _id_after(filename, 'P')
        camera_id = _id_after(filename, 'C')

        if benchmark == 'cv':
            training = (camera_id in training_cameras)
        elif benchmark == 'cs':
            training = (subject_id in training_subjects)
        else:
            raise ValueError()

        if dataset == 'test':
            training = not training
        elif dataset != 'train':
            raise ValueError()

        if training:
            sample_name.append(filename)
            sample_label.append(action_class - 1)

    if dataset == 'train':
        sample_name, val_name, sample_label, val_label = train_test_split(
            sample_name, sample_label, test_size=0.05, random_state=10000)

        with open('{}/val_label.pkl'.format(out_path), 'wb') as f:
            pickle.dump((val_name, list(val_label)), f)

        f_data = open_memmap('{}/val_data.npy'.format(out_path),
                             dtype='float32', mode='w+',
                             shape=(len(val_label), 3, max_frame, num_joint, max_body))
        _fill(f_data, val_name, len(val_label), 'val')

    with open('{}/{}_label.pkl'.format(out_path, dataset), 'wb') as f:
        pickle.dump((sample_name, list(sample_label)), f)

    f_data = open_memmap('{}/{}_data.npy'.format(out_path, dataset),
                         dtype='float32', mode='w+',
                         shape=(len(sample_label), 3, max_frame, num_joint, max_body))
    _fill(f_data, sample_name, len(sample_label), dataset)
    sys.stdout.write('\n')
def csv_to_npy(input_folder, input_filename, output_folder, astro_cols, photo_cols,
               bestindex_col, header=False):
    '''
    Convert a .csv file representation of a photometric catalogue into the
    appropriate .npy binary files used in the cross-matching process.

    Three files are written to ``output_folder``: ``con_cat_astro.npy``,
    ``con_cat_photo.npy`` and ``magref.npy``.

    Parameters
    ----------
    input_folder : string
        Folder on disk where the catalogue .csv file is stored.
    input_filename : string
        Name of the .csv file, without the extension, to convert to binary files.
    output_folder : string
        Folder on disk of where to save the .npy versions of the catalogue.
    astro_cols : list or numpy.array of integers
        List of zero-indexed columns in the input catalogue representing the
        three required astrometric parameters, two orthogonal sky axis
        coordinates and a single, circular astrometric precision.
    photo_cols : list or numpy.array of integers
        List of zero-indexed columns in the input catalogue representing the
        magnitudes of each photometric source to be used in the cross-matching.
    bestindex_col : integer
        Zero-indexed column of the flag indicating which of the available
        photometric brightnesses (represented by ``photo_cols``) is the
        preferred choice -- usually the most precise and highest quality
        detection.
    header : boolean, optional
        Flag indicating whether the .csv file has a first line with the names
        of the columns in it, or whether the first line of the file is the
        first line of the dataset.
    '''
    astro_cols, photo_cols = np.array(astro_cols), np.array(photo_cols)
    csv_path = '{}/{}.csv'.format(input_folder, input_filename)

    # First pass: count the data rows so the output arrays can be pre-sized.
    with open(csv_path) as fp:
        n_rows = 0 if not header else -1
        for _ in fp:
            n_rows += 1

    astro = open_memmap('{}/con_cat_astro.npy'.format(output_folder), mode='w+', dtype=float,
                        shape=(n_rows, 3))
    photo = open_memmap('{}/con_cat_photo.npy'.format(output_folder), mode='w+', dtype=float,
                        shape=(n_rows, len(photo_cols)))
    best_index = open_memmap('{}/magref.npy'.format(output_folder), mode='w+', dtype=int,
                             shape=(n_rows,))

    # pandas ignores the element order of ``usecols`` and always returns the
    # selected columns in file order. The positions inside each chunk
    # therefore have to be looked up in the *sorted* list of used columns.
    used_cols = np.unique(np.concatenate((astro_cols, photo_cols, [bestindex_col])))
    new_astro_cols = np.array([np.where(used_cols == a)[0][0] for a in astro_cols])
    new_photo_cols = np.array([np.where(used_cols == a)[0][0] for a in photo_cols])
    new_bestindex_col = np.where(used_cols == bestindex_col)[0][0]

    # Second pass: stream the file in chunks to keep memory use bounded.
    n = 0
    for chunk in pd.read_csv(csv_path, chunksize=100000, usecols=used_cols,
                             header=None if not header else 0):
        values = chunk.values
        stop = n + chunk.shape[0]
        astro[n:stop] = values[:, new_astro_cols]
        photo[n:stop] = values[:, new_photo_cols]
        best_index[n:stop] = values[:, new_bestindex_col]
        n = stop

    for output in (astro, photo, best_index):
        output.flush()

    return
def main():
    """Cache the poses and the metadata of every recording below ``datadir``.

    All recordings are concatenated along the first axis of two memmaps
    (``poses_2d.npy`` and ``poses_3d.npy``) inside ``cachedir``; the per
    recording metadata, including the start offset in those arrays, is saved
    as ``rec_info.npy``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("datadir", help="Path to data")
    parser.add_argument("cachedir", help="Metadata storage path")
    args = parser.parse_args()

    print("NOTE: The annotations files should be converted to UTF8 beforehand.")
    print("Reading sequence durations")

    def _open_recording(path):
        # parse the archive name and build the Recording it describes
        signer, label, sess, date = Recording.parse_archive_name(path)
        return (signer, label, sess, date), Recording(args.datadir, signer, label, sess, date)

    # First pass: collect every file and the length of its recording.
    filepaths = []
    durations = []
    for dirpath, _, filenames in os.walk(args.datadir):
        for f in filenames:
            print("\r" + f, end='', flush=True)
            filepath = os.path.join(dirpath, f)
            filepaths.append(filepath)
            _, recording = _open_recording(filepath)
            durations.append(recording.duration)
    durations = np.array(durations, dtype=np.int64)

    # [start, stop) range of every recording inside the concatenated arrays
    file_offsets = np.cumsum(durations) - durations
    subsequences = np.stack([file_offsets, file_offsets + durations], axis=1)

    metadata_dtype = [('signer', 'u1'), ('sess', 'u1'), ('date', 'U8'),
                      ('label', 'i4'), ('duration', 'u4'), ('skel_data_off', 'u8')]
    n_recordings = len(durations)
    info = np.empty((n_recordings,), dtype=metadata_dtype)

    dump_file = os.path.join(args.cachedir, 'poses_2d.npy')
    storage = open_memmap(dump_file, 'w+', dtype=np.int16,
                          shape=(durations.sum(), 20, 2))
    poses2d = split_seq(storage, subsequences)

    dump_file = os.path.join(args.cachedir, 'poses_3d.npy')
    storage = open_memmap(dump_file, 'w+', dtype=np.float32,
                          shape=(durations.sum(), 20, 3))
    poses3d = split_seq(storage, subsequences)

    # Second pass: fill in the metadata and copy the poses.
    for i in range(n_recordings):
        print("\r{} / {}".format(i, n_recordings), end='', flush=True)
        (signer, label, sess, date), r = _open_recording(filepaths[i])
        row = info[i]
        row['signer'] = signer
        row['sess'] = sess
        row['date'] = date
        row['label'] = label
        row['duration'] = r.duration
        row['skel_data_off'] = subsequences[i, 0]
        poses3d[i][...], poses2d[i][...] = r.poses()

    np.save(os.path.join(args.cachedir, 'rec_info.npy'), info)
def split(self, split=0.20, seed=None):
    """Split images, classes, one-hots and the data frame into train and test.

    Args:
        split (float): fraction of the data used for the test set. For
            ``split > 0`` a stratified ``train_test_split`` is used; otherwise
            all data goes to the (shuffled) training set and the test set is
            empty.
        seed: random state for the split. It is now also honoured when
            ``split <= 0``; with ``seed=None`` the global NumPy random state
            is used for the permutation, as before.

    The results are stored on the instance as ``train_*`` / ``test_*``
    attributes, and the index order as ``random_idx``. With ``use_mmap`` the
    images are copied into ``train.npy`` / ``test.npy`` memmaps inside
    ``mmap_directory``.
    """
    dtype = self.images.dtype
    n_images = len(self.images)

    if split > 0.0:
        # Split with stratify
        train_idx, test_idx = train_test_split(range(n_images),
                                               test_size=split,
                                               random_state=seed,
                                               shuffle=True,
                                               stratify=self.cls)
        self.random_idx = train_idx + test_idx
    else:
        if seed is None:
            train_idx = np.random.permutation(range(n_images))
        else:
            # the seed used to be ignored here, making runs irreproducible
            rng = seed if isinstance(seed, np.random.RandomState) else np.random.RandomState(seed)
            train_idx = rng.permutation(range(n_images))
        test_idx = []
        self.random_idx = train_idx

    print("@ Split mapping...")
    img_size = self.images.shape[1:]

    if self.use_mmap:
        # Memmap splitting
        print("@ Split mapping - deleting old memmap files")
        train_filename = os.path.join(self.mmap_directory, "train.npy")
        test_filename = os.path.join(self.mmap_directory, "test.npy")
        self.delete_memmap_files(del_split=True, del_source=False)

        print("@ Split mapping - creating new memmap files")
        self.train_images = open_memmap(train_filename, dtype=dtype, mode='w+',
                                        shape=(len(train_idx), ) + img_size)
        self.test_images = open_memmap(test_filename, dtype=dtype, mode='w+',
                                       shape=(len(test_idx), ) + img_size)

        # copy image by image so the source is never fully loaded into memory
        print("@ Split mapping - copying train images")
        for i, source_idx in enumerate(train_idx):
            self.train_images[i] = self.images[source_idx]

        print("@ Split mapping - copying test images")
        for i, source_idx in enumerate(test_idx):
            self.test_images[i] = self.images[source_idx]
    else:
        # Normal splitting
        self.train_images = self.images[train_idx]
        self.test_images = self.images[test_idx]

    # Remainder
    self.train_cls = self.cls[train_idx]
    self.test_cls = self.cls[test_idx]
    self.train_onehots = self.onehots[train_idx]
    self.test_onehots = self.onehots[test_idx]
    self.train_df = self.data_df.iloc[train_idx, :]
    self.test_df = self.data_df.iloc[test_idx, :]
    print("@ Split mapping - done")
def gendata(dataset_path, out_path, benchmark, part='eval'):
    """Write position, motion and label memmaps for one benchmark part.

    The samples are read batch-wise through a ``DataLoader`` around an
    ``NTUMotionProcessor`` and copied into ``<part>_position.npy``,
    ``<part>_motion.npy`` and ``<part>_label.npy`` inside ``out_path``.
    """
    benchmark_dir = os.path.join(dataset_path, benchmark)
    dataset = NTUMotionProcessor(
        '{}/{}_data.npy'.format(benchmark_dir, part),
        '{}/{}_label.pkl'.format(benchmark_dir, part),
        data_type='relative',
        t_length=max_frame,
        y_rotation=True,
        sampling='resize',
        displacement=1,
        mmap=True)

    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=1,
        drop_last=False)

    pose_shape = (dataset.N, 3, max_frame, num_joint, max_body)
    f_position = open_memmap('{}/{}_position.npy'.format(out_path, part),
                             dtype='float32', mode='w+', shape=pose_shape)
    f_motion = open_memmap('{}/{}_motion.npy'.format(out_path, part),
                           dtype='float32', mode='w+', shape=pose_shape)
    f_label = open_memmap('{}/{}_label.npy'.format(out_path, part),
                          dtype='int64', mode='w+', shape=(dataset.N, 1))

    n_batches = len(data_loader)
    index = 0  # first free row in the output arrays
    for i, (data, motion, label) in enumerate(data_loader):
        print_toolbar(i * 1.0 / n_batches,
                      '({:>5}/{:<5}) Processing {:>5}-{:<5} data: '.format(
                          i + 1, n_batches, benchmark, part))
        length = label.shape[0]
        stop = index + length

        # report when the write position differs from i full batches
        if i * batch_size != index:
            print(i, index)

        f_position[index:stop, :, :, :, :] = data.numpy()
        f_motion[index:stop, :, :, :, :] = motion.numpy()
        f_label[index:stop, :] = label.numpy().reshape(-1, 1)
        index = stop
    end_toolbar()
def setUp(self):
    """Create an in-memory array and a memmap ('temp.npy') with equal content."""
    dims = (2, 8, 5)
    self.data = sp.arange(80).reshape(dims)

    # written to the current working directory
    self.memmap_data = npfor.open_memmap('temp.npy', mode='w+', shape=dims)
    self.memmap_data[...] = sp.arange(80).reshape(dims)
def gendata(
        data_path,
        label_path,
        data_out_path,
        label_out_path,
        num_person_in=5,  # observe the first 5 persons
        num_person_out=2,  # then choose 2 persons with the highest score
        max_frame=300):
    """Convert a kinetics skeleton set into one data memmap and a label pickle.

    Every sample delivered by ``Feeder_kinetics`` is copied into a float32
    array of shape ``(n_samples, 3, max_frame, 18, num_person_out)`` stored at
    ``data_out_path``; sample names and labels are pickled to
    ``label_out_path``.
    """
    feeder = Feeder_kinetics(
        data_path=data_path,
        label_path=label_path,
        num_person_in=num_person_in,
        num_person_out=num_person_out,
        window_size=max_frame)

    sample_name = feeder.sample_name
    total = len(sample_name)
    sample_label = []

    fp = open_memmap(
        data_out_path,
        dtype='float32',
        mode='w+',
        shape=(total, 3, max_frame, 18, num_person_out))

    for idx in range(total):
        data, label = feeder[idx]
        print_toolbar(idx * 1.0 / total,
                      '({:>5}/{:<5}) Processing data: '.format(idx + 1, total))
        # frames beyond the length of this sample stay zero
        fp[idx, :, 0:data.shape[1], :, :] = data
        sample_label.append(label)

    with open(label_out_path, 'wb') as f:
        pickle.dump((sample_name, list(sample_label)), f)
def gendata(data_path, out_path, ignored_sample_path=None, benchmark='xview',
            part='eval'):
    """Convert the skeleton files of one benchmark part to ``.npy``/``.pkl``.

    File names are parsed for the action class (after ``A``), subject id
    (after ``P``) and camera id (after ``C``). The selected samples are
    written to ``<part>_label.pkl`` and ``<part>_data.npy`` in ``out_path``.

    Raises:
        ValueError: for a ``benchmark`` other than 'xview'/'xsub' or a
            ``part`` other than 'train'/'val' (raised while the first file is
            inspected)
    """
    def _id_after(filename, marker):
        # the three digits that follow the first occurrence of the marker
        pos = filename.find(marker)
        return int(filename[pos + 1:pos + 4])

    ignored_samples = []
    if ignored_sample_path is not None:
        with open(ignored_sample_path, 'r') as f:
            ignored_samples = [
                line.strip() + '.skeleton' for line in f.readlines()
            ]

    sample_name = []
    sample_label = []
    for filename in os.listdir(data_path):
        if filename in ignored_samples:
            continue

        action_class = _id_after(filename, 'A')
        subject_id = _id_after(filename, 'P')
        camera_id = _id_after(filename, 'C')

        if benchmark == 'xview':
            istraining = (camera_id in training_cameras)
        elif benchmark == 'xsub':
            istraining = (subject_id in training_subjects)
        else:
            raise ValueError()

        if part == 'train':
            issample = istraining
        elif part == 'val':
            issample = not istraining
        else:
            raise ValueError()

        if issample:
            sample_name.append(filename)
            sample_label.append(action_class - 1)

    with open('{}/{}_label.pkl'.format(out_path, part), 'wb') as f:
        pickle.dump((sample_name, list(sample_label)), f)
    # np.save('{}/{}_label.npy'.format(out_path, part), sample_label)

    n_labels = len(sample_label)
    fp = open_memmap(
        '{}/{}_data.npy'.format(out_path, part),
        dtype='float32',
        mode='w+',
        shape=(n_labels, 3, max_frame, num_joint, max_body))

    for i, s in enumerate(sample_name):
        print_toolbar(i * 1.0 / n_labels,
                      '({:>5}/{:<5}) Processing {:>5}-{:<5} data: '.format(
                          i + 1, len(sample_name), benchmark, part))
        data = read_xyz(
            os.path.join(data_path, s), max_body=max_body, num_joint=num_joint)
        # frames beyond the length of this sample stay zero
        fp[i, :, 0:data.shape[1], :, :] = data
    end_toolbar()
def dump_electrode_data_circus(self, filename, chunks=1e9):
    """Dump all electrode channels into one float32 ``.npy`` memmap.

    The output has one row per sample and one column per channel. The columns
    follow the channel names sorted by first character and then by the
    integer value of the rest of the name. Every value is stored as
    ``(value - offset) * scale``.

    Args:
        filename: path of the ``.npy`` file to create
        chunks: approximate number of bytes converted per step
    """
    self.load_mcs_data()

    itemsize = np.dtype(np.float32).itemsize
    data = self.electrodes_data

    n_rows = len(next(iter(data.values())))  # num samples per channel
    n_items = int(chunks // itemsize)  # num chunked samples per chan
    total_bytes = sum(len(value) for value in data.values()) * itemsize
    pbar = tqdm(
        total=total_bytes, file=sys.stdout, unit_scale=1,
        unit='bytes')

    mmap_array = open_memmap(
        filename, mode='w+', dtype=np.float32, shape=(n_rows, len(data)))

    names = sorted(data.keys(), key=lambda x: (x[0], int(x[1:])))
    for column, name in enumerate(names):
        value = data[name]
        offset, scale = self.get_electrode_offset_scale(name)

        n_samples = len(value)
        start = 0
        while start < n_samples:
            stop = min(start + n_items, n_samples)
            items = np.array(value[start:stop])
            mmap_array[start:start + len(items), column] = \
                (items - offset) * scale
            pbar.update(len(items) * itemsize)
            start += n_items
    pbar.close()
    print('Channel order in "{}" is: {}'.format(filename, names))
def gen_neighbor_data():
    """Generate neighbor data from joint data for NTU skeleton dataset.

    For every dataset / subset combination the joint file is loaded and a
    float32 ``<subset>_neighbor.npy`` memmap with 3 channels is created. For
    each joint pair the difference ``joint1 - joint2`` is accumulated at the
    position of the first joint, so a joint that starts several pairs
    receives the sum of those differences.
    """
    path_template = '/home/hhe/hhe_first_file/data_set/NTU-RGB-D-CV/{}/{}_data.npy'
    out_template = '/home/hhe/hhe_first_file/data_set/NTU-RGB-D-CV/{}/{}_neighbor.npy'

    for dataset in datasets:
        # kinetics pairs are used as they are, all others are one-based
        index_shift = 0 if dataset == 'kinetics' else 1

        for subset in sets:
            print(dataset, subset)
            joints = np.load(path_template.format(dataset, subset))
            n_samples, n_channels, n_frames, n_joints, n_bodies = joints.shape

            neighbors = open_memmap(
                out_template.format(dataset, subset),
                dtype='float32',
                mode='w+',
                shape=(n_samples, 3, n_frames, n_joints, n_bodies))

            # Start from zeros so the sums below accumulate from nothing
            neighbors[:, :n_channels, :, :, :] = np.zeros_like(joints)

            for first, second in tqdm(paris[dataset]):
                # Reduce class index for NTU datasets
                a = first - index_shift
                b = second - index_shift
                # Accumulate joint1 - joint2, the pairs are pre-determined and hardcoded
                neighbors[:, :, :, a, :] += joints[:, :, :, a, :] - joints[:, :, :, b, :]
def _create_schema(self, *, remote_operation: bool = False):
    """Create a new backing ``.npy`` memmap for the arrayset schema.

    The file holds ``COLLECTION_SIZE`` entries of ``schema_shape`` /
    ``schema_dtype``. It becomes the current write target (``w_uid``, with
    ``hIdx`` reset to 0) and is symlinked into a staging directory.

    Parameters
    ----------
    remote_operation : optional, kwarg only, bool
        if this schema is being created from a remote fetch operation, then do
        not place the file symlink in the staging directory. Instead symlink it
        to a special remote staging directory. (default is False, which places
        the symlink in the stage data directory.)
    """
    uid = random_string()
    npy_name = f'{uid}.npy'
    file_path = pjoin(self.DATADIR, npy_name)

    self.wFp[uid] = open_memmap(
        file_path,
        mode='w+',
        dtype=self.schema_dtype,
        shape=(COLLECTION_SIZE, *self.schema_shape))
    self.w_uid = uid
    self.hIdx = 0

    link_dir = self.REMOTEDIR if remote_operation else self.STAGEDIR
    symlink_rel(file_path, pjoin(link_dir, npy_name))
def eval(model, data_loader, output_device=0, dstype='train'):
    """Run the model over ``data_loader['test']`` and dump its outputs.

    The outputs are stored in ``./data/<dataset>/features/<part>_data.npy``
    (one ``(1, 256, 1)`` slot per batch). For every batch three lines are
    appended to ``<part>_label.txt``: sample name and label, the first ten
    input values and the first ten output values.

    NOTE(review): this function shadows the builtin ``eval`` inside its
    module, and it always iterates ``data_loader['test']``, whatever
    ``dstype`` is. ``dstype`` only selects the output file names. Confirm
    that this is intended.

    Args:
        model: callable applied to every input batch
        data_loader (dict): must provide a loader under the key ``'test'``
        output_device (int): CUDA device the tensors are moved to
        dstype (str): ``'train'`` selects the ``train_*`` output names, any
            other value the ``test_*`` names
    """
    part = 'train' if dstype == "train" else 'test'
    out_path = './data/{}/features'.format(dataset)
    loader = data_loader['test']

    fp = open_memmap('{}/{}_data.npy'.format(out_path, part),
                     dtype='float32',
                     mode='w+',
                     shape=(len(loader), 1, 256, 1))

    def _first_values(array):
        # space separated text of the first ten flattened values
        return " ".join(map(str, array.flatten()[:10]))

    # the with-block closes the label file even when a batch fails
    with open('{}/{}_label.txt'.format(out_path, part), '+w') as label_fp:
        for i, (data, label, sample_name) in enumerate(loader):
            data = Variable(data.float().cuda(output_device),
                            requires_grad=False,
                            volatile=True)
            label = Variable(label.long().cuda(output_device),
                             requires_grad=False,
                             volatile=True)

            label_int = int(label.data.cpu().numpy())
            label_fp.write(sample_name[0] + ", " + str(label_int) + '\n')

            ddata = data.data.cpu().numpy()
            label_fp.write(_first_values(ddata) + '\n')

            output = model(data)
            np_output = output.data.cpu().numpy()
            label_fp.write(_first_values(np_output) + '\n')

            fp[i, :, :, :] = np_output

    fp.flush()
def __init__(self, split, model_group_name):
    """Load the codebook, the question data of ``split`` and image features.

    Args:
        split (str): name of the HDF5 group to read from ``data.h5``; its
            upper-case form selects the ``SPLITS`` entry of the config
        model_group_name: forwarded to ``reload_obj``
    """
    # context managers close both files; the original leaked the handles
    with open('{}/data.json'.format(cfg.DATA_DIR)) as codebook_file:
        self.codebook = json.load(codebook_file)

    with h5py.File('{}/data.h5'.format(cfg.DATA_DIR), 'r') as h5_file:
        data = h5_file['/{}'.format(split)]
        # dataset[()] reads the whole dataset into memory; the old
        # ``.value`` alias no longer exists in h5py 3.
        self.img_pos = data['img_pos'][()]
        self.que = data['que'][()]
        self.que_id = data['que_id'][()]
        if 'ans' in data:
            self.ans = data['ans'][()]
            if cfg.SOFT_LOSS:
                self.ans = self.ans.astype(np.float32)

    # load image features
    self.splits = cfg[split.upper()].SPLITS
    self.img_feas = []
    for data_split in self.splits:
        if data_split == 'vg':
            continue
        fea_fname = get_feature_path(data_split, 'feature')
        if cfg.LOAD_ALL_DATA:
            img_fea = np.load(fea_fname)
        else:
            # keep the features on disk and page them in on demand
            img_fea = open_memmap(fea_fname, dtype='float32')
        self.img_feas.append(img_fea)
    self.img_cnts = list(map(len, self.img_feas))

    self.model_group_name = None
    self.reload_obj(model_group_name)
def _write_volume(self, data, volume_indices, filename):
    """Write the result of one map to the specified file.

    This is meant to save map data to a temporary .npy file. The file is
    shaped like the first three dimensions of the mask followed by the
    per-voxel dimensions of the data (a single trailing dimension of length
    one for data with less than two dimensions).

    Args:
        data (ndarray): the voxel data to store
        volume_indices (ndarray): the volume indices of the computed data points
        filename (str): the file to write the results to. This by default will
            append to the file if it exists.
    """
    if len(data.shape) < 2:
        # one value per voxel: make the trailing dimension explicit
        data = np.reshape(data, (-1, 1))
        voxel_dims = (1, )
    else:
        voxel_dims = tuple(data.shape[1:])

    open_mode = 'r+' if os.path.isfile(filename) else 'w+'
    volume = open_memmap(filename, mode=open_mode, dtype=data.dtype,
                         shape=self._mask.shape[0:3] + voxel_dims)

    x_ind = volume_indices[:, 0]
    y_ind = volume_indices[:, 1]
    z_ind = volume_indices[:, 2]
    volume[x_ind, y_ind, z_ind] = data
def _init_mem(self, ex, name, shape, header_name=None):
    """Create the output memory maps for an extraction.

    Args:
        ex: extractor object; provides ``get_header``, ``dtype`` and
            ``image_size_bytes``
        name: path of the data file to create
        shape: shape of the data array; the first two entries are the number
            of frames and the number of amplifier channels
        header_name: optional path of a ``.npy`` file for per-frame header
            data

    Returns:
        tuple: ``(data, header, header_data)``; ``header_data`` is ``None``
        when no ``header_name`` is given
    """
    # header and byte offset as returned by the extractor
    header, offset = ex.get_header(name, shape, self.headers[0])

    # create shared memory for output frames / info
    n_frames, n_ch, *_ = shape
    plural = 's' if n_ch > 1 else ''
    self.logger.info(
        'Extracting %i frames from %i amplifier channel%s to '
        '%r', n_frames, n_ch, plural, str(name))

    # check free hd space
    n_images = n_frames * n_ch
    req_bytes_head = self.head_dtype.itemsize * n_frames * n_ch
    req_bytes_data = (ex.image_size_bytes * n_frames * n_ch) + offset
    self.check_free_space(req_bytes_data, req_bytes_head)

    # create memory map for extraction (4D)
    # FIXME: w+ will always overwrite, r+ fails on create
    data = np.memmap(name, dtype=ex.dtype, mode='w+', offset=offset,
                     shape=shape)

    # header info data
    if not header_name:
        return data, header, None

    # read the extracted keys to structured memory map
    header_data = open_memmap(str(header_name), mode='w+',
                              dtype=self.head_dtype, shape=(n_frames, n_ch))
    return data, header, header_data
def dump_electrode_data_circus(self, filename, chunks=1e9):
    """Write every electrode channel as one column of a float32 memmap.

    Channels are ordered by the first character of their name and then by the
    integer formed by the remaining characters. The samples are converted in
    pieces of about ``chunks`` bytes and stored as ``(value - offset) *
    scale``; the resulting channel order is printed at the end.

    Args:
        filename: path of the ``.npy`` file to create
        chunks: approximate number of bytes converted per step
    """
    self.load_mcs_data()

    bytes_per_item = np.dtype(np.float32).itemsize
    channels = self.electrodes_data

    first_channel = next(iter(channels.values()))
    n_rows = len(first_channel)  # num samples per channel
    step = int(chunks // bytes_per_item)  # num chunked samples per chan
    n_values = sum(len(samples) for samples in channels.values())

    progress = tqdm(total=n_values * bytes_per_item, file=sys.stdout,
                    unit_scale=1, unit='bytes')

    target = open_memmap(filename, mode='w+', dtype=np.float32,
                         shape=(n_rows, len(channels)))

    def _sort_key(channel_name):
        return channel_name[0], int(channel_name[1:])

    names = sorted(channels.keys(), key=_sort_key)
    for col, name in enumerate(names):
        samples = channels[name]
        offset, scale = self.get_electrode_offset_scale(name)

        length = len(samples)
        piece = 0
        while piece * step < length:
            begin = piece * step
            end = min(begin + step, length)
            block_values = np.array(samples[begin:end])
            target[begin:begin + len(block_values), col] = \
                (block_values - offset) * scale
            progress.update(len(block_values) * bytes_per_item)
            piece += 1

    progress.close()
    print('Channel order in "{}" is: {}'.format(filename, names))
def export(self, file_name, table_name="aequilibrae_table"):
    """
    Exports the dataset to another format. Supports AED, CSV and SQLite

    The output format is chosen from the extension of *file_name*
    (case-insensitive). Any other extension results in no output.

    Args:
        *file_name* (:obj:`str`): File name with PATH and extension (aed, csv, or sqlite3, sqlite or db)

        *table_name* (:obj:`str`): It only applies if you are saving to an SQLite table. Otherwise ignored.
        An existing table with that name is dropped first.

    ::

        dataset = AequilibraeData()
        dataset.load("D:/datasets/vectors.aed")
        dataset.export("D:/datasets/vectors.csv")
    """
    file_type = os.path.splitext(file_name)[1].lower()
    headers = ["index"]
    headers.extend(self.fields)

    if file_type == ".aed":
        dtype = [("index", self.aeq_index_type)]
        dtype.extend([(self.fields[i], self.data_types[i]) for i in range(self.num_fields)])
        data = open_memmap(file_name, mode="w+", dtype=dtype, shape=(self.entries,))
        for field in data.dtype.names:
            data[field] = self.data[field]
        data.flush()
        del data

    elif file_type == ".csv":
        # One format specifier per column; the leading one is the index.
        fmt = "%d"
        for dt in self.data_types:
            if np.issubdtype(dt, np.floating):
                fmt += ",%f"
            elif np.issubdtype(dt, np.integer):
                fmt += ",%d"
            else:
                # Without a specifier the format no longer matches the number
                # of columns and np.savetxt refuses to write the file.
                fmt += ",%s"

        data = np.array(self.data, copy=True)
        for nm in self.data.dtype.names:
            np.nan_to_num(data[nm], copy=False)
        np.savetxt(file_name, data[np.newaxis, :][0], delimiter=",", fmt=fmt, header=",".join(headers), comments="")

    elif file_type in [".sqlite", ".sqlite3", ".db"]:
        # Connecting to the database file
        conn = sqlite3.connect(file_name)
        try:
            c = conn.cursor()
            # NOTE: table and column names are interpolated into the SQL text
            # (identifiers cannot be bound as parameters); only pass trusted
            # names.
            # Creating the table, but before deletes if the table exists
            c.execute("""DROP TABLE IF EXISTS """ + table_name)
            fi = "".join(", " + f + " REAL" for f in headers[1:])
            qm = ", ".join("?" * len(headers))

            c.execute("""CREATE TABLE """ + table_name + """ (link_id INTEGER PRIMARY KEY""" + fi + ")")
            c.execute("BEGIN TRANSACTION")
            # tolist() turns every record into a tuple of native Python
            # scalars; sqlite3 cannot bind numpy scalar types such as uint64.
            c.executemany("INSERT INTO " + table_name + " VALUES (" + qm + ")", self.data.tolist())
            c.execute("END TRANSACTION")
            conn.commit()
        finally:
            # Release the database file even if one of the statements failed.
            conn.close()
def gendata(
        data_path,
        label_path,
        data_out_path,
        label_out_path,
        num_person_in=1,
        num_person_out=1,
        max_frame=300):
    """Convert a ``Feeder_kinetics`` dataset into a ``.npy`` array plus a label pickle.

    Every sample is written into a float32 memmap of shape
    ``(num_samples, 3, max_frame, 18, num_person_out)``; only the first
    ``data.shape[1]`` frames of each sample are filled. The sample names and
    their labels are pickled together as a ``(names, labels)`` tuple.

    Args:
        data_path: forwarded to ``Feeder_kinetics`` as ``data_path``.
        label_path: forwarded to ``Feeder_kinetics`` as ``label_path``.
        data_out_path: path of the ``.npy`` file to create.
        label_out_path: path of the pickle file to create.
        num_person_in: forwarded to ``Feeder_kinetics`` as ``num_person_in``.
        num_person_out: forwarded to ``Feeder_kinetics``; also the size of
            the last axis of the output array.
        max_frame: forwarded to ``Feeder_kinetics`` as ``window_size``; also
            the size of the frame axis of the output array.
    """
    feeder = Feeder_kinetics(data_path=data_path,
                             label_path=label_path,
                             num_person_in=num_person_in,
                             num_person_out=num_person_out,
                             window_size=max_frame)

    sample_name = feeder.sample_name
    labels = []

    out_shape = (len(sample_name), 3, max_frame, 18, num_person_out)
    fp = open_memmap(data_out_path, dtype='float32', mode='w+', shape=out_shape)

    for i, _ in enumerate(sample_name):
        data, label = feeder[i]
        progress = i * 1.0 / len(sample_name)
        caption = '({:>5}/{:<5}) Processing data: '.format(i + 1, len(sample_name))
        print_toolbar(progress, caption)
        # Samples may be shorter than max_frame; the remaining frames keep
        # the memmap's initial content.
        fp[i, :, 0:data.shape[1], :, :] = data
        labels.append(label)

    with open(label_out_path, 'wb') as f:
        pickle.dump((sample_name, list(labels)), f)
def create_and_save_depth(inference_fn, video_data, depth_estimation_model_path, dnn_depth_map_path, logger,
                          batch_size):
    """Run depth inference batch by batch and store the result in a ``.npy`` memmap.

    Args:
        inference_fn: callable invoked as ``inference_fn(video_data,
            depth_estimation_model_path, logger, batch_size=batch_size)``;
            must return an iterable of depth-map batches whose first axis is
            the batch axis.
        video_data: object providing ``num_frames`` and ``shape``.
        depth_estimation_model_path: passed through to ``inference_fn``.
        dnn_depth_map_path: path of the ``.npy`` file to create.
        logger: object with a ``log(message)`` method.
        batch_size: nominal number of frames per batch.

    Returns:
        The float32 memmap of shape ``(num_frames, 1, *video_data.shape)``.

    Raises:
        Exception: whatever went wrong is re-raised unchanged after the
            partially written output file (if any) has been deleted.
    """
    depth_maps = None

    try:
        depth_maps = open_memmap(
            filename=dnn_depth_map_path,
            dtype=np.float32,
            mode='w+',
            shape=(video_data.num_frames, 1, *video_data.shape)
        )

        depth_map_generator = inference_fn(video_data, depth_estimation_model_path, logger, batch_size=batch_size)

        for batch_i, depth_map in enumerate(depth_map_generator):
            batch_start_idx = batch_size * batch_i
            # Sometimes the last batch is a different size to the rest, so we need to use the actual batch size rather
            # than the specified one.
            current_batch_size = depth_map.shape[0]
            batch_end_idx = batch_start_idx + current_batch_size

            depth_maps[batch_start_idx:batch_end_idx] = depth_map

        depth_maps.flush()
        logger.log("Saved DNN depth maps to {}.".format(dnn_depth_map_path))

        return depth_maps
    except Exception:
        logger.log(
            "\nError occurred during creation of depth maps - deleting {}.".
            format(dnn_depth_map_path))

        # Drop our reference to the mapping before removing the file.
        depth_maps = None
        try:
            os.remove(dnn_depth_map_path)
        except FileNotFoundError:
            # The failure happened before the file was created; there is
            # nothing to clean up and the original error must stay visible.
            pass

        raise
def _get_numpy_binary_array(self, name): """Return the an memmap object as represented by the .npy file""" filename = self._array_files.get(name) if filename is not None: return open_memmap(filename) else: return None
def read_data(self, hashVal: NUMPY_10_DataHashSpec) -> np.ndarray:
    """Read data from disk written in the numpy_00 fmtBackend

    Parameters
    ----------
    hashVal : NUMPY_10_DataHashSpec
        record specification stored in the db

    Returns
    -------
    np.ndarray
        tensor data stored at the provided hashVal specification.

    Raises
    ------
    RuntimeError
        If the recorded checksum does not match the received checksum.
    KeyError
        If no handle is registered for ``hashVal.uid`` and the expected
        ``.npy`` path is not a symlink.

    Notes
    -----

    TO AVOID DATA LOSS / CORRUPTION:

    * On a read operation, we copy memmap subarray tensor data to a new
      `np.ndarray` instance so as to prevent writes on a raw memmap result
      slice (a `np.memmap` instance) from propagating to data on disk.

    * This is an issue for reads from a write-enabled checkout where data
      was just written, since the np flag "WRITEABLE" and "OWNDATA" will be
      true, and writes to the returned array would overwrite that data
      slice on disk.

    * For read-only checkouts, modifications to the resultant array would
      perform a "copy on write"-like operation which would be propagated to
      all future reads of the subarray from that process, but which would
      not be persisted to disk.
    """
    # Index expression: the sample position in the collection followed by a
    # 0:x slice for every entry of the recorded shape.
    srcSlc = (self.slcExpr[hashVal.collection_idx], *(self.slcExpr[0:x] for x in hashVal.shape))
    try:
        res = self.Fp[hashVal.uid][srcSlc]
    except TypeError:
        # The stored entry could not be subscripted; call it and cache the
        # result. Presumably the entry is a deferred opener for the file -
        # TODO confirm against the code that populates self.Fp.
        self.Fp[hashVal.uid] = self.Fp[hashVal.uid]()
        res = self.Fp[hashVal.uid][srcSlc]
    except KeyError:
        # No handle registered yet: look for the file in the staging or the
        # store directory depending on the checkout mode.
        process_dir = self.STAGEDIR if self.mode == 'a' else self.STOREDIR
        file_pth = pjoin(process_dir, f'{hashVal.uid}.npy')
        if os.path.islink(file_pth):
            # NOTE(review): the handle is stored in self.rFp but read back
            # through self.Fp; this presumably relies on Fp being a combined
            # view that includes rFp - confirm.
            self.rFp[hashVal.uid] = open_memmap(file_pth, 'r')
            res = self.Fp[hashVal.uid][srcSlc]
        else:
            raise

    # Copy into a fresh C-ordered array so callers never hold the memmap.
    out = np.array(res, dtype=res.dtype, order='C')
    cksum = adler32(out)
    if cksum != int(hashVal.checksum):
        raise RuntimeError(
            f'DATA CORRUPTION ERROR: Checksum {cksum} != recorded for {hashVal}'
        )
    return out
def crop(self, item, focus, mode='loose', fixed=None, return_data=True):
    """Crop the precomputed features of ``item`` straight from disk.

    Faster version of precomputed(item).crop(...): the feature file is
    memory-mapped read-only, wrapped in a ``SlidingWindowFeature`` together
    with ``self.sliding_window_`` and cropped; ``focus``, ``mode``, ``fixed``
    and ``return_data`` are forwarded unchanged.
    """
    features_on_disk = open_memmap(self.get_path(item), mode='r')
    window_feature = SlidingWindowFeature(features_on_disk, self.sliding_window_)
    cropped = window_feature.crop(focus, mode=mode, fixed=fixed,
                                  return_data=return_data)
    # Drop the local reference to the memmap before handing back the result.
    del features_on_disk
    return cropped
def open_memmap(filename, mode='r+', dtype=None, shape=None,
                fortran_order=False, version=(1, 0), metafile=None):
    """Open a file and memory map it to an InfoMemmap object.

    This is similar to the numpy.lib.format.open_memmap() function but also
    deals with the meta data dictionary, which is read and written from a
    meta data file.

    The only extra argument over the numpy version is the meta data file name
    `metafile`.

    Parameters
    ----------
    metafile: str
        File name for which the `info` attribute of the returned InfoMemmap
        will be read from and written to. Default is None, where it is
        assumed to be `filename` + ".meta".

    Returns
    -------
    marray: InfoMemmap
        The `info` is initialized as an empty dictionary if `mode` is 'w+' or
        if the file corresponding to `metafile` does not exist. The
        `metafile` attribute of marray is set to the `metafile` parameter
        unless `mode` is 'r' or 'c' in which case it is set to None.

    Raises
    ------
    ValueError
        If `version` is anything other than (1, 0).
    """
    # Restrict to version (1,0) because we've only written write_header for
    # this version.
    if version != (1, 0):
        raise ValueError("Only version (1,0) is safe from this function.")

    # Memory map the data part.
    marray = npfor.open_memmap(filename, mode, dtype, shape, fortran_order,
                               version)

    # Get the file name for the meta data.
    if metafile is None:
        metafile = filename + '.meta'

    # Read the meta data if need be. Modes are compared by value: identity
    # comparison of strings only works by accident of interning.
    if ('r' in mode or mode == 'c') and os.path.isfile(metafile):
        with open(metafile, 'r') as info_fid:
            infostring = info_fid.readline()
        info = safe_eval(infostring)
    else:
        info = {}

    # In read mode don't pass a metafile to protect the meta data.
    if mode in ('r', 'c'):
        metafile = None

    marray = info_header.InfoMemmap(marray, info, metafile)
    return marray
def create_empty(self, file_path=None, entries=1, field_names=None, data_types=None, memory_mode=False):
    """
    Creates an empty dataset and associates it with this object

    :param file_path: Optional. Full path for the output data file. Ignored for in-memory datasets
    :param entries: Number of records in the dataset. Default is 1
    :param field_names: List of field names for this dataset. If no list is provided, the field 'data' will be created
    :param data_types: List of data types for the dataset. Types need to be NumPy data types (e.g. np.int16,
                       np.float64). If no list of types are provided, type will be *np.float64*
    :param memory_mode: If true, dataset will be kept in memory. If false, the dataset will be a memory-mapped numpy
                        array
    :return: # nothing. Associates a dataset with the AequilibraEData object
    """
    # Nothing is created when there is neither a target file nor memory mode.
    if file_path is None and not memory_mode:
        return

    if field_names is None:
        field_names = ['data']
    if data_types is None:
        data_types = [np.float64] * len(field_names)

    self.file_path = file_path
    self.entries = entries
    self.fields = field_names
    self.data_types = data_types
    self.aeq_index_type = np.uint64

    self.memory_mode = MEMORY if memory_mode else DISK
    # NOTE(review): because of the early return above this fallback to a
    # random file name can never trigger - confirm the intended behaviour.
    if not memory_mode and self.file_path is None:
        self.file_path = self.random_name()

    # Consistency checks
    if not isinstance(self.fields, list):
        raise ValueError('Titles for fields, "field_names", needs to be a list')
    if not isinstance(self.data_types, list):
        raise ValueError('Data types, "data_types", needs to be a list')
    # A per-item type check on data_types is deliberately absent: the original
    # author disabled it because it did not work with the QGIS importer.

    # NOTE(review): this tests against the attributes of the builtin `object`
    # type, not of this class - confirm that is the intended reserved list.
    reserved_names = object.__dict__
    for field in self.fields:
        if field in reserved_names:
            raise Exception(field + ' is a reserved name. You cannot use it as a field name')

    self.num_fields = len(self.fields)

    # Record layout: the index column followed by one column per field.
    dtype = [('index', self.aeq_index_type)]
    dtype += [(self.fields[i], self.data_types[i]) for i in range(self.num_fields)]

    # the file
    if self.memory_mode:
        self.data = np.recarray((self.entries,), dtype=dtype)
    else:
        self.data = open_memmap(self.file_path, mode='w+', dtype=dtype, shape=(self.entries,))
def setUp(self):
    """Build the fixtures: a 5x4 array as mat, as vect, as memmap-backed vect and as plain copy."""
    grid = sp.arange(20)
    grid.shape = (5, 4)
    axes = ('ra', 'dec')

    self.mat_arr = algebra.make_mat(grid.copy(), axis_names=axes)
    self.vect_arr = algebra.make_vect(grid.copy(), axis_names=axes)

    # Same values again, this time backed by a file in the working directory.
    on_disk = npfor.open_memmap('temp.npy', mode='w+', shape=(5, 4))
    on_disk[:] = grid
    self.vect_mem = algebra.make_vect(on_disk)

    self.arr = grid.copy()
def test_from_memmap(self):
    """An info_memmap built from a numpy memmap keeps shape, info, values and type."""
    # Works if constructed from array.
    backing = npfor.open_memmap('temp.npy', mode='w+', shape=(4, 3, 3))
    backing[:] = 5.0

    Mat = algebra.info_memmap(backing, {'a': 'b'})
    Mat.flush()

    self.assertEqual(Mat.shape, (4, 3, 3))
    self.assertEqual(Mat.info['a'], 'b')
    self.assertTrue(sp.allclose(Mat, 5.0))
    self.assertTrue(isinstance(Mat, sp.memmap))

    # Drop the wrapper before deleting the file it was created from.
    del Mat
    os.remove('temp.npy')
def load(self, file_path):
    """
    Loads an existing dataset file and reads its layout

    :param file_path: Full file path to the AequilibraEDataset to be loaded
    :return: Loads the dataset into the AequilibraEData instance
    """
    # Opening the file first makes a missing or unreadable path fail here;
    # the resolved real path is what gets stored and memory-mapped.
    with open(file_path) as handle:
        self.file_path = os.path.realpath(handle.name)

    # Map in memory and load data names plus dimensions
    self.data = open_memmap(self.file_path, mode='r+')
    field_names = [name for name in self.data.dtype.fields if name != 'index']

    self.entries = self.data.shape[0]
    self.fields = field_names
    self.num_fields = len(field_names)
    self.data_types = [self.data[name].dtype.type for name in field_names]
def test_assert_info(self):
    """Test the assert_info function."""
    # info_memmaps should pass.
    backing = npfor.open_memmap('temp.npy', mode='w+', shape=(4, 3, 3))
    backing[:] = 5.0
    Mat = algebra.info_memmap(backing)
    algebra.assert_info(Mat)
    del Mat
    os.remove('temp.npy')

    # info_arrays should pass.
    plain = sp.empty((5, 6, 6))
    plain[:] = 4.0
    Mat = algebra.info_array(plain)
    algebra.assert_info(Mat)

    # arrays should fail.
    self.assertRaises(TypeError, algebra.assert_info, plain)
def load_data_matrix(self):
    """Memory-map a previously saved data matrix, if one exists.

    Looks for ``self.memmap_name`` inside ``self.bin_dir``. When found, the
    file is opened read-only, stored in ``self.raw_data_list`` and
    ``self.loaded_warm_start`` is set to True. Progress is reported with
    ``print``; the single-argument call form is used so the method is valid
    on both Python 2 and Python 3 and prints the same text on either.

    Returns:
        bool: True if the memmap was loaded, False if the file is missing.
    """
    memmap_path = os.path.join(self.bin_dir, self.memmap_name)

    if not os.path.exists(memmap_path):
        print('no file of name '+self.memmap_name+' to load.')
        print('aborting memmap load')
        return False

    print('loading in '+self.memmap_name)
    # The dtype and shape actually used are read from the .npy header.
    self.raw_data_list = npf.open_memmap(memmap_path, mode='r', dtype='float32')
    print('shape of loaded memmap:')
    print(self.raw_data_list.shape)
    self.loaded_warm_start = True
    return True
# Interactive analysis fragment: memory-maps three launch result files and
# draws SVD-based trajectory plots and a histogram for one condition.
# NOTE(review): `n9`, `l9` and the plotting/linear-algebra names (figure,
# subplot, plot, hist, xlim, svd, dot, r_, newaxis, concatenate) are not
# defined in this fragment; presumably they come from an interactive pylab
# session and earlier cells - confirm before running this as a script.
from numpy.lib.format import open_memmap

n0 = open_memmap('launch-000000.npy')
n1 = open_memmap('launch-000001.npy')
n2 = open_memmap('launch-000002.npy')

# Keep index 200 onward along the third (length-401) axis, i.e. 201 entries,
# and merge the 32- and 201-long axes into a single axis.
n0_cond_ss = n0.reshape((-1, 32, 401, 192))[:, :, 200:].reshape((-1, 32*201, 192))
n1_cond_ss = n1.reshape((-1, 32, 401, 192))[:, :, 200:].reshape((-1, 32*201, 192))

# triple f here
figure(figsize=(15, 12))
ws = l9['dataset'].weights
ds = l9['dataset'].distances
idx = 32*10 + 21
cond = n9.reshape((-1, 32, 401, 96, 2))[idx, :, :]
ts = r_[0 : cond.shape[1]*2.5 : 1j*cond.shape[1]]
# Subtract, for each of the 192 (96 x 2) trailing entries, its mean taken
# over the two leading axes of `cond`.
cond -= cond.reshape((-1, 192)).mean(axis=0).reshape((1, 1, 96, 2))
# One SVD per entry along the first axis of `cond` (first trailing component
# only), plus one SVD of all those entries stacked together.
trial_svds = [svd(trial[:, :, 0], full_matrices=0) for trial in cond]
cond_svd = svd(cond[:, :, :, 0].reshape((-1, 96)), full_matrices=0)
for i, svdi, trial in zip(range(32), trial_svds, cond):
    # Projection onto the first three right singular vectors of this
    # entry's own SVD, scaled by its singular values.
    subplot(335)
    x, y, z = svdi[1][:3][:, newaxis]*dot(svdi[2][:3], trial[:, :, 0].T)
    plot(x+z/3, y+z/3, 'k-', alpha=0.2)
    # Same, but projected onto the right singular vectors of the stacked SVD.
    subplot(336)
    x, y, z = svdi[1][:3][:, newaxis]*dot(cond_svd[2][:3], trial[:, :, 0].T)
    plot(x+z/3, y+z/3, 'k-', alpha=0.3)
# Histogram of absolute dot products between the leading three right
# singular vectors of every pair of distinct entries of `trial_svds`.
subplot(6,3,13)
hist(concatenate([abs(dot(svd1[2][:3], svd2[2][:3].T)).flat
                  for i, svd1 in enumerate(trial_svds)
                  for j, svd2 in enumerate(trial_svds) if not j==i]), 50)
xlim([0, 1.0])
subplot(3, 3, 4)
def create_data_matrix(self,save_memmap=True,nuke=True):
    """Load every registered nifti file and optionally stack them in a memmap.

    The method makes two passes over ``self.reg_subjects`` x
    ``self.reg_nifti_name``. The first pass (only when ``save_memmap``)
    counts the existing files to size a float32 ``.npy`` memmap; the second
    pass loads each file, stores it in the memmap and fills in experiment
    metadata that has not been set yet.

    Args:
        save_memmap (bool): when True, allocate ``self.raw_data_list`` as a
            memmap at ``bin_dir/memmap_name`` and copy each loaded nifti
            into it.
        nuke (bool): when True, delete an existing memmap file first.

    NOTE(review): this is Python 2 code (print statements). The count is
    accumulated into ``self.total_nifti_files`` without being reset here, and
    ``brainshape`` is indexed up to position 3, so the loader is presumably
    returning 4d shapes - confirm.
    """
    raw_path = os.path.join(self.bin_dir,self.memmap_name)

    if nuke and os.path.exists(raw_path):
        os.remove(raw_path)

    if save_memmap:
        # We need to determine how many nifti files there are in total to
        # determine the shape of the memmap:
        brainshape = []
        for subject in self.reg_subjects:
            sub_path = os.path.join(self.top_dir,subject)
            for nifti_name in self.reg_nifti_name:
                nifti_path = os.path.join(sub_path,nifti_name)
                if os.path.exists(nifti_path):
                    self.total_nifti_files += 1
                    # Only the first existing file is loaded here, to learn
                    # the per-file shape.
                    if not brainshape:
                        [tempdata,tempaffine,brainshape] = self.__load_nifti(nifti_path)

        # Allocate the .npy memmap according to its size:
        memmap_shape = (self.total_nifti_files,brainshape[0],brainshape[1],brainshape[2],
                        brainshape[3])

        print 'Determined memmap shape:'
        print memmap_shape
        print 'Allocating the memmap...'

        self.raw_data_list = npf.open_memmap(raw_path,mode='w+',dtype='float32',
                                             shape=memmap_shape)

        print 'Succesfully allocated memmap... memmap shape:'
        pprint(self.raw_data_list.shape)

    # Second pass: load the data itself. nifti_iter is the row of the memmap
    # that receives the next file.
    nifti_iter = 0
    for subject in self.reg_subjects:
        sub_path = os.path.join(self.top_dir,subject)
        print sub_path
        print subject
        print os.getcwd()

        for nifti_name in self.reg_nifti_name:
            nifti_path = os.path.join(sub_path,nifti_name)
            pprint(nifti_name)
            if os.path.exists(nifti_path):
                [idata,affine,ishape] = self.__load_nifti(nifti_path)
                pprint(ishape)

                if save_memmap:
                    print 'Appending idata to memmap at: %s' % str(nifti_iter)
                    self.raw_data_list[nifti_iter] = np.array(idata)
                    self.subject_trial_indices[nifti_iter] = []
                    nifti_iter += 1

                # The attributes below are only filled while they still hold
                # their "unset" value (False or an empty list).
                # NOTE(review): len(idata[3]) is the length of the fourth
                # entry along the first axis, not the size of the fourth
                # axis - confirm this is the intended TR count.
                if self.reg_experiment_trs == False:
                    self.reg_experiment_trs = len(idata[3])

                if self.reg_total_trials == False:
                    if self.reg_trial_trs:
                        self.reg_total_trials = self.reg_experiment_trs/self.reg_trial_trs

                if self.raw_affine == []:
                    self.raw_affine = affine

                if self.raw_data_shape == []:
                    self.raw_data_shape = ishape

                pprint(ishape)
def main():
    """Benchmark sequential writes and random reads for HDF5 vs memmap files.

    A tensor of roughly ``--num-gb`` gigabytes made of square images is
    written batch by batch to an HDF5 file and to a ``.npy`` memmap (each
    can be disabled), then read back in random order and checked against the
    expected values. The wall-clock time of every phase is printed.
    """

    def parse_args():
        """Parse and return the command-line arguments."""
        parser = argparse.ArgumentParser(
            description=("Compares sequential write and random read times for "
                         "HDF5 vs memmap datasets."))

        parser.add_argument("--output-dir",
                            default="/tmp/",
                            help=("The directory to output to. Handy for "
                                  "comparing HDD vs SDD performance."))

        parser.add_argument("--no-memmap",
                            action='store_true',
                            default=False,
                            help="Don't test memmaps.")

        parser.add_argument("--no-h5",
                            action='store_true',
                            default=False,
                            help="Don't test HDF5.")

        # Without type=int a value given on the command line would be a str
        # and break the index arithmetic below.
        parser.add_argument("--batch-size",
                            type=int,
                            default=128,
                            help="Number of images per batch")

        parser.add_argument("--dtype",
                            type=numpy.dtype,
                            default='uint8',
                            help="Data dtype.")

        parser.add_argument("--image-dim",
                            type=int,
                            default=108,  # from big NORB dataset
                            help=("Size of one side of the random square "
                                  "images."))

        parser.add_argument("--num-gb",
                            type=float,
                            default=1.0,
                            help="File size, in GB")

        return parser.parse_args()

    args = parse_args()

    # modeled after big NORB's test set images
    example_shape = (args.image_dim, args.image_dim)
    example_size = numpy.prod(example_shape) * args.dtype.itemsize
    # Array shapes must be integers, so the floor is converted explicitly.
    num_examples = int(numpy.floor(args.num_gb * (1024 ** 3) / example_size))
    shape = (num_examples, args.image_dim, args.image_dim)
    dtype_max = numpy.iinfo(args.dtype).max

    num_batches = int(numpy.ceil(shape[0] / float(args.batch_size)))

    # No leading slash on the file name: os.path.join would otherwise drop
    # the output directory altogether.
    path_prefix = os.path.join(args.output_dir,
                               'benchmark_random_access_to_hdf5_and_memmap')
    h5_path = path_prefix + '.h5'
    mm_path = path_prefix + '.npy'

    def get_expected_values(start_row, end_row=None):
        """Return the values expected in the given rows.

        Called either with an array of row indices, or with a start/end pair
        describing a contiguous range. Every element of a row holds the row
        index modulo ``dtype_max``.
        """
        if end_row is None:
            assert_is_instance(start_row, numpy.ndarray)
            values = start_row
        else:
            assert_integer(start_row)
            assert_integer(end_row)
            values = numpy.arange(start_row, end_row)

        values = values % dtype_max
        values = values.reshape((values.shape[0], ) +
                                ((1, ) * (len(shape) - 1)))
        return numpy.tile(values, shape[1:])

    def fill_tensor(tensor):
        '''
        Fill each row with its row index (modulo dtype_max), batch by batch.
        '''
        row_index = 0
        while row_index < shape[0]:
            print("writing {} of {} rows".format(row_index, shape[0]),
                  end='\r')

            next_row_index = min(shape[0], row_index + args.batch_size)
            values = get_expected_values(row_index, next_row_index)
            tensor[row_index:next_row_index, ...] = values
            row_index = next_row_index

    # Size in bytes: number of elements times the size of one element.
    memory_size = human_readable_memory_size(numpy.prod(shape) *
                                             args.dtype.itemsize)

    if not args.no_h5:
        start_time = default_timer()
        with h5py.File(h5_path, mode='w') as h5_file:
            print("Allocating %s HDF5 tensor to %s." % (memory_size, h5_path))
            h5_tensor = h5_file.create_dataset('tensor', shape, args.dtype)
            print("Filling HDF5 tensor.")
            fill_tensor(h5_tensor)

        duration = default_timer() - start_time
        print("HDF5 sequential write time: " +
              human_readable_duration(duration))
        print("{:.2g} secs per {}-sized batch".format(duration / num_batches,
                                                      args.batch_size))

    if not args.no_memmap:
        print("Allocating %s memmap tensor to %s." % (memory_size, mm_path))
        start_time = default_timer()
        fill_tensor(open_memmap(mm_path, 'w+', args.dtype, shape))
        duration = default_timer() - start_time
        print('Memmap sequential write time: %s' %
              human_readable_duration(duration))
        print("{:.2g} secs per {}-sized batch".format(duration / num_batches,
                                                      args.batch_size))

    # Fixed seed so both backends are read in the same random order.
    rng = numpy.random.RandomState(1413)
    shuffled_indices = rng.choice(shape[0], size=shape[0], replace=False)

    def random_reads(tensor):
        """Read all rows in shuffled batches and verify their contents."""
        row_index = 0
        is_hdf5 = isinstance(tensor, h5py.Dataset)

        while row_index < shape[0]:
            print("read {} of {} rows".format(row_index, shape[0]), end='\r')

            next_row_index = min(shape[0], row_index + args.batch_size)
            indices = shuffled_indices[row_index:next_row_index]
            # h5py fancy indexing requires increasing indices.
            if is_hdf5:
                indices = numpy.sort(indices)

            expected_values = get_expected_values(indices)
            assert_true((tensor[indices, ...] == expected_values).all())
            row_index = next_row_index

    if not args.no_h5:
        print("Randomly reading from " + h5_path)
        start_time = default_timer()
        with h5py.File(h5_path, mode='r') as h5_file:
            h5_tensor = h5_file['tensor']
            random_reads(h5_tensor)

        duration = default_timer() - start_time
        print('HDF5 random read time: ' + human_readable_duration(duration))
        print("{:.2g} secs per {}-sized batch".format(duration / num_batches,
                                                      args.batch_size))

    if not args.no_memmap:
        print("Randomly reading from " + mm_path)
        start_time = default_timer()
        random_reads(open_memmap(mm_path, 'r', args.dtype, shape))
        duration = default_timer() - start_time
        print('Memmap random read time: ' + human_readable_duration(duration))
        print("{:.2g} secs per {}-sized batch".format(duration / num_batches,
                                                      args.batch_size))
def build_dataset(path='/srv/data/apnea'):
    """Build the windowed example set from the labeled night recordings.

    For every entry in ``labeled_nights`` the WAV file and the matching
    ``.mat`` annotation file under ``path`` are read, the annotation times
    are converted to offsets from the start of the WAV file, and
    ``compute_windows`` produces the examples ``X`` and labels ``y``, which
    are saved per night as ``<night>-X.npy`` / ``<night>-y.npy``. Afterwards
    all per-night ``X`` arrays are copied into one float32 memmap
    ``path/X.npy`` and all labels are concatenated into ``path/y.npy``.

    Args:
        path (str): directory holding the ``.wav`` / ``.mat`` inputs; the
            outputs are written to the same directory.

    Returns:
        tuple: ``(X, y)`` - the combined memmap and the concatenated labels.

    Raises:
        AssertionError: if a WAV file has the wrong sample rate, is not mono
            or not 16-bit, if the computed start time is not before the
            nominal one, if window shapes differ between nights, or if the
            number of gathered examples does not match the total.

    NOTE(review): this is Python 2 code (``print >>`` statements, list-valued
    ``map``). ``sample_rate``, ``classes`` and ``compute_windows`` are not
    defined in this function and presumably live at module level - confirm.
    """
    # Full list of recordings. Currently unused: only labeled_nights is
    # processed (see the TODO below).
    nights = ['302-adjust', '302-nopap', '303-adjust', '303-nopap',
              '304-adjust', '304-nopap', '305-adjust', '305-nopap',
              '306-adjust', '306-nopap', '307-adjust', '307-nopap',
              '309-adjust', '309-nopap', '310-adjust', '310-nopap',
              '311-adjust', '311-nopap', '312-adjust', '312-nopap',
              '313-adjust', '313-nopap', '314-adjust', '314-nopap',
              '315-adjust', '316-adjust', '316-nopap', '317-adjust',
              '317-nopap']
    labeled_nights = ['302-adjust', '302-nopap', '303-adjust', '303-nopap',
                      '304-nopap', '309-adjust', '310-adjust', '310-nopap',
                      '311-adjust', '312-adjust', '312-nopap', '316-nopap',
                      '317-adjust', '317-nopap']

    # These are the "nominal" start and end times for each WAV file.
    # In fact, however, each WAV file is padded out with unlabeled data,
    # at the beginning (!), to an exact multiple of two minutes in length.
    nominal_times = {
        '302-adjust': ('2011-07-07 22:42:50', '2011-07-08 06:04:04'),
        '302-nopap': ('2011-07-11 22:46:36', '2011-07-12 06:51:09'),
        '303-adjust': ('2011-07-06 00:16:58', '2011-07-06 06:38:52'),
        '303-nopap': ('2011-07-27 22:20:45', '2011-07-28 06:26:15'),
        '304-adjust': ('2011-07-19 21:41:07', '2011-07-20 06:20:42'),
        '304-nopap': ('2011-07-26 22:49:09', '2011-07-27 06:00:30'),
        '305-adjust': ('2011-08-03 23:22:44', '2011-08-04 06:39:41'),
        '305-nopap': ('2011-08-04 23:48:46', '2011-08-05 07:09:23'),
        '306-adjust': ('2011-08-18 22:45:03', '2011-08-19 06:34:35'),
        '306-nopap': ('2011-08-19 22:13:58', '2011-08-20 06:47:19'),
        '307-adjust': ('2011-08-23 22:32:23', '2011-08-24 05:48:02'),
        '307-nopap': ('2011-08-30 22:16:11', '2011-08-31 06:04:09'),
        '309-adjust': ('2011-11-13 22:22:38', '2011-11-14 05:57:54'),
        '309-nopap': ('2011-11-14 22:30:07', '2011-11-15 05:14:17'),
        '310-adjust': ('2011-11-22 23:59:34', '2011-11-23 06:44:19'),
        '310-nopap': ('2011-11-29 00:47:19', '2011-11-29 06:51:23'),
        '311-adjust': ('2011-11-09 22:59:20', '2011-11-10 06:17:11'),
        '311-nopap': ('2011-11-17 22:37:49', '2011-11-18 06:30:28'),
        '312-adjust': ('2011-12-09 23:16:22', '2011-12-10 06:20:20'),
        '312-nopap': ('2011-12-11 22:28:14', '2011-12-12 05:23:21'),
        '313-adjust': ('2011-12-05 22:37:40', '2011-12-06 06:19:57'),
        '313-nopap': ('2011-12-06 21:58:02', '2011-12-07 05:53:00'),
        '314-adjust': ('2012-02-12 23:18:14', '2012-02-13 05:21:12'),
        '314-nopap': ('2012-02-19 22:47:06', '2012-02-20 05:50:21'),
        '315-adjust': ('2012-04-20 23:13:48', '2012-04-21 06:50:22'),
        '316-adjust': ('2012-03-21 23:44:59', '2012-03-22 08:36:15'),
        '316-nopap': ('2012-03-22 22:58:45', '2012-03-23 07:43:39'),
        '317-adjust': ('2012-04-16 00:23:34', '2012-04-16 08:29:43'),
        '317-nopap': ('2012-04-30 00:06:28', '2012-04-30 07:56:16')
    }

    def parse_time(s):
        # Timestamps use the "YYYY-MM-DD HH:MM:SS" layout of nominal_times.
        return datetime.strptime(s,'%Y-%m-%d %H:%M:%S')

    # Not read again in this function; the loop below re-parses the end time
    # of each night itself.
    end_times = {x: parse_time(nominal_times[x][1]) for x in nominal_times}

    # TODO: use unlabeled nights, too
    window_shape = None
    total_examples = 0
    X_names = []
    y_names = []
    for night in labeled_nights:
        basename = path+'/'+night
        wav_name = basename+'.wav'
        X_name = basename+'-X.npy'
        y_name = basename+'-y.npy'
        X_names.append(X_name)
        y_names.append(y_name)
        print >> sys.stderr, "Reading samples from %s" % wav_name
        rate, samples = wavfile.read(wav_name)
        assert rate == sample_rate, "File has wrong sample rate: %s (is %d, should be %d)" % (wav_name,rate,sample_rate)
        assert samples.ndim == 1, "Expected mono audio only: %s" % wav_name
        assert samples.dtype == numpy.dtype('int16'), "Expected 16-bit samples: %s" % wav_name
        mat = loadmat(basename+'.mat')
        # The padding sits at the beginning of the file, so the real start
        # time is derived by walking back from the nominal end time.
        actual_length = timedelta(seconds=len(samples)/float(sample_rate))
        nominal_start_time = parse_time(nominal_times[night][0])
        end_time = parse_time(nominal_times[night][1])
        actual_start_time = end_time - actual_length
        assert actual_start_time < nominal_start_time

        # elements here are offsets in seconds from the beginning of the wav file.
        def to_seconds(t):
            return (parse_time(t) - actual_start_time).total_seconds()
        times = {signal: map(to_seconds, numpy.hstack(mat[signal].flatten()))
                 if len(mat[signal]) > 0 else []
                 for signal in classes}
        X, y = compute_windows(samples, times)
        del samples
        # Every night must yield windows of the same shape so they can be
        # stacked in one array later.
        assert window_shape is None or window_shape == X.shape[1:]
        window_shape = X.shape[1:]
        numpy.save(X_name, X)
        numpy.save(y_name, y)
        total_examples += X.shape[0]
        del X, y
    # end for night

    print >> sys.stderr, "Gathering all examples..."
    X = open_memmap(path+'/X.npy', mode='w+', dtype='float32', shape=(total_examples,)+window_shape)
    ys = []
    # i is the row of the combined array where the next night starts.
    i = 0
    for X_name,y_name in zip(X_names,y_names):
        print >> sys.stderr, X_name
        x1 = numpy.load(X_name, mmap_mode='r')
        ys.append(numpy.load(y_name))
        X[i:i+x1.shape[0]] = x1
        i += x1.shape[0]
    print >> sys.stderr, 'OK'
    y = numpy.concatenate(ys)
    numpy.save(path+'/y.npy', y)
    assert i == total_examples
    return (X,y)
def shape(self, item):
    """Return the shape of the array stored for ``item``.

    Faster version of precomputed(item).data.shape: the file is only
    memory-mapped read-only to look up its shape.
    """
    # The temporary memmap is released as soon as its shape has been read.
    return open_memmap(self.get_path(item), mode='r').shape
# Reduce one launch result file to its per-condition SVDs, unless a result
# file for this index already exists.
# NOTE(review): `idx`, `msgr` and `reducer` are not defined in this fragment
# and are presumably provided by the surrounding script - confirm.
ncores = 8
savename = 'launch-%06d.npy' % (idx,)
outname = 'reduced-%06d.pickle' % (idx,)
msg = msgr()

try:
    os.stat(outname)
except OSError:
    # Only a missing (or inaccessible) result file triggers the reduction; a
    # bare except would also swallow KeyboardInterrupt and real bugs.
    msg('no result found, proceeding to do reduction')
    msg('loading dataset %s' % savename)
    import cPickle as cp
    from numpy import *
    from numpy.linalg import svd
    from numpy.lib.format import open_memmap
    from multiprocessing import Pool
    npy = open_memmap(savename)
    # Merge the second axis with groups of 32 entries of the first axis.
    npy_ = npy.reshape((-1, 32*npy.shape[1], 192))
    pool = Pool(ncores)
    try:
        svds = pool.map(reducer, range(npy_.shape[0]))
    finally:
        # Always shut the worker processes down, even if a reduction fails.
        pool.close()
        pool.join()
    msg('writing data')
    with open(outname, 'w') as fd:
        cp.dump(svds, fd)
else:
    msg('found result file!')