def get_indexes_da(source_folder, target_folder, nTrain, transferImages):
    get_logger().info("Loading indexes")
    allowed = ['backpack.hdf5', 'headphones.hdf5', 'monitor.hdf5', 'bike.hdf5',
               'keyboard.hdf5', 'mouse.hdf5', 'projector.hdf5',
               'calculator.hdf5', 'laptop.hdf5', 'mug.hdf5']
    train = []
    test = []
    MAX_TEST = 100
    for categ in allowed:
        s_filename = join(source_folder, categ)
        t_filename = join(target_folder, categ)
        get_logger().info("Loading " + s_filename)
        shfile = HDF5File(s_filename, 'r')
        thfile = HDF5File(t_filename, 'r')
        siid = shfile["image_index"][:]
        tiid = thfile["image_index"][:]
        np.random.shuffle(siid)
        np.random.shuffle(tiid)
        trainIdx = siid[0:nTrain]
        trainTIdx = tiid[0:transferImages]
        testIdx = tiid[transferImages:transferImages + MAX_TEST]
        # data is actually loaded only when needed
        train.append((ClassIndexes(s_filename, trainIdx),
                      ClassIndexes(t_filename, trainTIdx)))
        test.append([ClassIndexes(t_filename, testIdx)])
        shfile.close()
        thfile.close()
    return Data(train, test)
def load_model(self, ubm_file):
    """Loads the projector (UBM) from a file."""
    hdf5file = HDF5File(ubm_file, "r")
    logger.debug("Loading model from file '%s'", ubm_file)
    # Read the UBM
    self.ubm = GMMMachine.from_hdf5(hdf5file)
    self.ubm.variance_thresholds = self.variance_threshold
def from_hdf5(cls, hdf5):
    """Creates a new GMMStats object from an `HDF5File` object."""
    if isinstance(hdf5, str):
        hdf5 = HDF5File(hdf5, "r")
    try:
        version_major, version_minor = hdf5.attrs["file_version"].split(".")
        logger.debug(
            f"Reading a GMMStats HDF5 file of version {version_major}.{version_minor}"
        )
    except (KeyError, RuntimeError):
        version_major, version_minor = 0, 0
    if int(version_major) >= 1:
        if hdf5.attrs["writer_class"] != str(cls):
            logger.warning(f"{hdf5.attrs['writer_class']} is not {cls}.")
        self = cls(
            n_gaussians=hdf5["n_gaussians"][()],
            n_features=hdf5["n_features"][()],
        )
        self.log_likelihood = hdf5["log_likelihood"][()]
        self.t = hdf5["T"][()]
        self.n = hdf5["n"][...]
        self.sum_px = hdf5["sumPx"][...]
        self.sum_pxx = hdf5["sumPxx"][...]
    else:  # Legacy file version
        logger.info("Loading a legacy HDF5 stats file.")
        self = cls(
            n_gaussians=int(hdf5["n_gaussians"][()]),
            n_features=int(hdf5["n_inputs"][()]),
        )
        # Note: the legacy files store the key with this misspelling.
        self.log_likelihood = hdf5["log_liklihood"][()]
        self.t = int(hdf5["T"][()])
        self.n = np.reshape(hdf5["n"], (self.n_gaussians,))
        self.sum_px = np.reshape(hdf5["sumPx"], (self.shape))
        self.sum_pxx = np.reshape(hdf5["sumPxx"], (self.shape))
    return self
def get_support(filename, size):
    hfile = HDF5File(filename, 'r')
    ds = hfile['support']
    patches = ds[:min(size, ds.shape[0]), :]
    hfile.close()
    return patches.astype('float')
def test_gmm_ML_1():
    """Trains a GMMMachine with ML_GMMTrainer"""
    ar = load_array(
        resource_filename("bob.learn.em", "data/faithful.torch3_f64.hdf5"))
    gmm_ref = GMMMachine.from_hdf5(
        HDF5File(resource_filename("bob.learn.em", "data/gmm_ML.hdf5"), "r"))

    for transform in (to_numpy, to_dask_array):
        ar = transform(ar)

        # test rng handling
        gmm = loadGMM()
        gmm.convergence_threshold = 0.001
        gmm.update_means = True
        gmm.update_variances = True
        gmm.update_weights = True
        gmm.random_state = np.random.RandomState(seed=12345)
        gmm = gmm.fit(ar)

        gmm = loadGMM()
        gmm.convergence_threshold = 0.001
        gmm.update_means = True
        gmm.update_variances = True
        gmm.update_weights = True
        # Generate reference
        # gmm.save(HDF5File(resource_filename("bob.learn.em", "data/gmm_ML.hdf5"), "w"))
        gmm = gmm.fit(ar)

        assert_gmm_equal(gmm, gmm_ref)
def test_GMMMachine_stats():
    """Tests a GMMMachine (statistics)"""
    arrayset = load_array(
        resource_filename("bob.learn.em", "data/faithful.torch3_f64.hdf5"))
    gmm = GMMMachine(n_gaussians=2)
    gmm.weights = np.array([0.5, 0.5], "float64")
    gmm.means = np.array([[3, 70], [4, 72]], "float64")
    gmm.variances = np.array([[1, 10], [2, 5]], "float64")
    gmm.variance_thresholds = np.array([[0, 0], [0, 0]], "float64")

    stats = gmm_module.e_step(
        arrayset,
        gmm,
    )

    stats_ref = GMMStats(n_gaussians=2, n_features=2)
    stats_ref.load(
        HDF5File(resource_filename("bob.learn.em", "data/stats.hdf5"), "r"))

    np.testing.assert_equal(stats.t, stats_ref.t)
    np.testing.assert_almost_equal(stats.n, stats_ref.n, decimal=10)
    # np.testing.assert_equal(stats.sum_px, stats_ref.sum_px)
    # Note AA: precision error above
    np.testing.assert_almost_equal(stats.sum_px, stats_ref.sum_px, decimal=10)
    np.testing.assert_almost_equal(stats.sum_pxx, stats_ref.sum_pxx, decimal=10)
def write_out_file(infile, outfile, tree=None):
    f = uproot.open(infile)["fancy_tree;1"]
    #f = root_open(infile)
    #T = f[tree]
    names = f.keys()
    cells = list(filter(lambda x: x.startswith(b'cell'), names))
    assert len(cells) == sum(map(np.prod, LAYER_SPECS)) + OVERFLOW_BINS

    for df in uproot.pandas.iterate(infile, "fancy_tree;1", branches=cells):
        X = df
    for df in uproot.pandas.iterate(infile, "fancy_tree;1", branches=b'TotalEnergy'):
        E = df
    X = X.values
    E = E.values.ravel()
    #X = pd.DataFrame(tree2array(T, branches=cells)).values
    #E = pd.DataFrame(tree2array(T, branches=['TotalEnergy'])).values.ravel()
    print(X.shape)

    with HDF5File(outfile, 'w') as h5:
        for layer, (sh, (l, u)) in enumerate(zip(LAYER_SPECS, LAYER_DIV)):
            h5['layer_{}'.format(layer)] = X[:, l:u].reshape((-1, ) + sh)
            print(u)
        h5['overflow'] = X[:, -OVERFLOW_BINS:]
        h5['energy'] = E.reshape(-1, 1)
def select_random_support(train_dir, support_dir, num_train_images,
                          support_size, position_influence):
    log = get_logger()
    train_files = [
        f for f in glob(join(train_dir, '*'))
        if splitext(f.lower())[1] == '.hdf5'
    ]
    try:
        os.makedirs(support_dir)
    except:
        pass
    for target_file in train_files:
        log.info('Extracting random support from "%s"...',
                 basename(target_file))
        #(patches, _) = get_standardized_patches(target_file, num_train_images, position_influence)
        (patches, _) = get_patches(target_file, num_train_images,
                                   position_influence)
        rand_ix = random.sample(range(patches.shape[0]),
                                min(patches.shape[0], support_size))
        patches = patches[np.array(rand_ix), :]
        fh = HDF5File(join(support_dir, basename(target_file)), 'w')
        ds = fh.create_dataset('support', patches.shape, dtype='float')
        ds[:] = patches
        ds.attrs['cursor'] = patches.shape[0]
        fh.close()
def save_model(self, ubm_file):
    """Saves the projector (UBM) to file."""
    # Saves the UBM to file
    logger.debug("Saving model to file '%s'", ubm_file)
    hdf5 = (ubm_file if isinstance(ubm_file, HDF5File)
            else HDF5File(ubm_file, "w"))
    self.ubm.save(hdf5)
def from_hdf5(cls, hdf5, ubm=None):
    """Creates a new GMMMachine object from an `HDF5File` object."""
    if isinstance(hdf5, str):
        hdf5 = HDF5File(hdf5, "r")
    try:
        version_major, version_minor = hdf5.attrs["file_version"].split(".")
        logger.debug(
            f"Reading a GMMMachine HDF5 file of version {version_major}.{version_minor}"
        )
    except (KeyError, RuntimeError):
        version_major, version_minor = 0, 0
    if int(version_major) >= 1:
        if hdf5.attrs["writer_class"] != str(cls):
            logger.warning(f"{hdf5.attrs['writer_class']} is not {cls}.")
        if hdf5["trainer"] == "map" and ubm is None:
            raise ValueError("The UBM is needed when loading a MAP machine.")
        self = cls(
            n_gaussians=hdf5["n_gaussians"][()],
            trainer=hdf5["trainer"][()],
            ubm=ubm,
            convergence_threshold=1e-5,
            max_fitting_steps=hdf5["max_fitting_steps"][()],
            weights=hdf5["weights"][...],
            k_means_trainer=None,
            update_means=hdf5["update_means"][()],
            update_variances=hdf5["update_variances"][()],
            update_weights=hdf5["update_weights"][()],
        )
        gaussians_group = hdf5["gaussians"]
        self.means = gaussians_group["means"][...]
        self.variances = gaussians_group["variances"][...]
        self.variance_thresholds = gaussians_group["variance_thresholds"][...]
    else:  # Legacy file version
        logger.info("Loading a legacy HDF5 machine file.")
        n_gaussians = hdf5["m_n_gaussians"][()]
        g_means = []
        g_variances = []
        g_variance_thresholds = []
        for i in range(n_gaussians):
            gaussian_group = hdf5[f"m_gaussians{i}"]
            g_means.append(gaussian_group["m_mean"][...])
            g_variances.append(gaussian_group["m_variance"][...])
            g_variance_thresholds.append(
                gaussian_group["m_variance_thresholds"][...]
            )
        weights = np.reshape(hdf5["m_weights"], (n_gaussians,))
        self = cls(n_gaussians=n_gaussians, ubm=ubm, weights=weights)
        self.means = np.array(g_means).reshape(n_gaussians, -1)
        self.variances = np.array(g_variances).reshape(n_gaussians, -1)
        self.variance_thresholds = np.array(g_variance_thresholds).reshape(
            n_gaussians, -1
        )
    return self
def save(self, filepath):
    with HDF5File(filepath, "w") as hdf5:
        hdf5.create_dataset("size", data=self.size)
        hdf5.create_dataset("depth", data=self.depth)
        hdf5.create_dataset("levels", data=self.levels)
        hdf5.create_dataset("first_channels", data=self.first_channels)
        hdf5.create_dataset("last_channels", data=self.last_channels)
        hdf5.create_dataset("categories", data=self.categories)
        HDF5Serializer(hdf5.create_group("weights")).save(self)
def test_gmm_MAP_1():
    # Train a GMMMachine with MAP_GMMTrainer
    ar = load_array(
        resource_filename("bob.learn.em", "data/faithful.torch3_f64.hdf5"))

    # test with rng
    gmmprior = GMMMachine.from_hdf5(
        HDF5File(resource_filename("bob.learn.em", "data/gmm_ML.hdf5"), "r"))
    gmm = GMMMachine.from_hdf5(
        HDF5File(resource_filename("bob.learn.em", "data/gmm_ML.hdf5"), "r"),
        ubm=gmmprior,
    )
    gmm.update_means = True
    gmm.update_variances = False
    gmm.update_weights = False
    rng = np.random.RandomState(seed=12345)
    gmm.random_state = rng
    gmm = gmm.fit(ar)

    gmmprior = GMMMachine.from_hdf5(
        HDF5File(resource_filename("bob.learn.em", "data/gmm_ML.hdf5"), "r"))
    gmm = GMMMachine.from_hdf5(
        HDF5File(resource_filename("bob.learn.em", "data/gmm_ML.hdf5"), "r"),
        ubm=gmmprior,
    )
    gmm.update_means = True
    gmm.update_variances = False
    gmm.update_weights = False

    # Generate reference
    # gmm.save(HDF5File(resource_filename("bob.learn.em", "data/gmm_MAP.hdf5"), "w"))

    gmm_ref = GMMMachine.from_hdf5(
        HDF5File(resource_filename("bob.learn.em", "data/gmm_MAP.hdf5"), "r"))

    for transform in (to_numpy, to_dask_array):
        ar = transform(ar)
        gmm = gmm.fit(ar)
        np.testing.assert_almost_equal(gmm.means, gmm_ref.means, decimal=3)
        np.testing.assert_almost_equal(gmm.variances, gmm_ref.variances,
                                       decimal=3)
        np.testing.assert_almost_equal(gmm.weights, gmm_ref.weights, decimal=3)
def load_states(self, filepath):
    with HDF5File(filepath, "r") as hdf5:
        self.averaged_path_length = float(hdf5["averaged_path_length"][()])
        self.augumentation_probability = float(
            hdf5["augumentation_probability"][()])
        HDF5Deserializer(hdf5["generator"]).load(self.generator)
        HDF5Deserializer(hdf5["averaged_generator"]).load(
            self.averaged_generator)
        HDF5Deserializer(hdf5["discriminator"]).load(self.discriminator)
        for key, optimizer in dict(self.optimizers).items():
            HDF5Deserializer(hdf5["optimizers"][key]).load(optimizer)
def load(filepath):
    with HDF5File(filepath, "r") as hdf5:
        size = int(hdf5["size"][()])
        depth = int(hdf5["depth"][()])
        levels = int(hdf5["levels"][()])
        first_channels = int(hdf5["first_channels"][()])
        last_channels = int(hdf5["last_channels"][()])
        categories = int(hdf5["categories"][()])
        generator = Generator(size, depth, levels, first_channels,
                              last_channels, categories)
        HDF5Deserializer(hdf5["weights"]).load(generator)
        return generator
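# Hedged usage sketch (not part of the original sources): this assumes the
# Generator `save` method shown earlier and the module-level `load` above are
# available in the same project; the file name and `generator` instance are
# hypothetical. It only illustrates the intended round trip: hyper-parameters
# plus a "weights" group are written, then a fresh Generator is rebuilt.
def _example_generator_round_trip(generator, path="generator_example.hdf5"):
    generator.save(path)   # writes size/depth/levels/... and serialized weights
    restored = load(path)  # reconstructs the Generator and loads its weights
    return restored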
def save(self, hdf5):
    """Saves the current statistics in an `HDF5File` object."""
    if isinstance(hdf5, str):
        hdf5 = HDF5File(hdf5, "w")
    hdf5.attrs["file_version"] = "1.0"
    hdf5.attrs["writer_class"] = str(self.__class__)
    hdf5["n_gaussians"] = self.n_gaussians
    hdf5["n_features"] = self.n_features
    hdf5["log_likelihood"] = float(self.log_likelihood)
    hdf5["T"] = int(self.t)
    hdf5["n"] = np.array(self.n)
    hdf5["sumPx"] = np.array(self.sum_px)
    hdf5["sumPxx"] = np.array(self.sum_pxx)
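# Hedged usage sketch (not from the original test suite): round-tripping the
# GMMStats accumulators through the `save` method above and the `from_hdf5`
# reader shown earlier. The file name is hypothetical and `GMMStats` is
# assumed to be the class both methods belong to.
def _example_gmm_stats_round_trip(stats, path="gmm_stats_example.hdf5"):
    stats.save(path)                     # writes n/sumPx/sumPxx plus version attributes
    restored = GMMStats.from_hdf5(path)  # dispatches on "file_version" to the new-format reader
    return restored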
def doSnapshots():
    """
    Read the snapshots and plot the corresponding variables.
    """
    # get all the filenames
    filenames = glob("simple_orbits_*.hdf5")
    N = len(filenames)
    filenames.sort()

    # generate the output arrays
    E = np.zeros((N, makeIC.num_part))
    t = np.zeros(N)
    p = np.zeros((N, 3))
    v = np.zeros((N, 3))

    for i, f in enumerate(filenames):
        # get the data from the file
        f = HDF5File(f, "r")
        ids = f["PartType1/ParticleIDs"][:]
        sort = np.argsort(ids)
        ids = ids[sort]
        pos = f["PartType1/Coordinates"][sort, :]
        pos -= center
        vel = f["PartType1/Velocities"][sort, :]
        t[i] = f["Header"].attrs["Time"]

        r = np.sum(pos**2, axis=1)**0.5
        v2 = np.sum(vel**2, axis=1)
        E[i, :] = 0.5 * v2 - G * M / r

        # Get the pos / vel of the required particle
        ind = ids == id_focus
        p[i, :] = pos[ind, :]
        v[i, :] = vel[ind, :]

    # Compute the solution
    y0 = np.zeros(4)
    y0[:2] = p[0, :2]
    y0[2:] = v[0, :2]

    # compute the plotting variables
    plt.figure(fig_1.number)
    plotRelative(t, E, ".", label="Snapshot")

    plt.figure(fig_2.number)
    plt.plot(p[:, 0], p[:, 1], "-", label="Snapshot", lw=1.)

    plt.figure(fig_3.number)
    plt.plot(v[:, 0], v[:, 1], "-", label="Snapshot", lw=1.)
def create_hdf5_dataset(output_filename, patches, positions):
    log = get_logger()
    log.debug('Saving extracted descriptors to %s', output_filename)
    hfile = HDF5File(output_filename, 'w', compression='gzip', fillvalue=0.0)
    hpatches = hfile.create_dataset('patches', patches.shape, dtype="float32",
                                    chunks=True)
    hpositions = hfile.create_dataset('positions', positions.shape,
                                      dtype="uint16", chunks=True)
    hpatches[:] = patches
    hpositions[:] = positions
    hfile.close()
def save_states(self, filepath):
    with HDF5File(filepath, "w") as hdf5:
        hdf5.create_dataset("averaged_path_length",
                            data=self.averaged_path_length)
        hdf5.create_dataset("augumentation_probability",
                            data=self.augumentation_probability)
        HDF5Serializer(hdf5.create_group("generator")).save(self.generator)
        HDF5Serializer(hdf5.create_group("averaged_generator")).save(
            self.averaged_generator)
        HDF5Serializer(hdf5.create_group("discriminator")).save(
            self.discriminator)
        optimizer_group = hdf5.create_group("optimizers")
        for key, optimizer in dict(self.optimizers).items():
            HDF5Serializer(
                optimizer_group.create_group(key)).save(optimizer)
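# Hedged usage sketch (not from the original training loop): the
# `save_states`/`load_states` pair above checkpoints the path-length and
# augmentation scalars, both generators, the discriminator, and every
# optimizer under an "optimizers" group. The `trainer` instance and the file
# name are hypothetical stand-ins for whatever object defines those methods.
def _example_checkpoint_round_trip(trainer, path="checkpoint_example.hdf5"):
    trainer.save_states(path)
    trainer.load_states(path)  # restores scalars, weights, and optimizer state in place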
def get_num_patches(filename, num_images):
    hfile = HDF5File(filename, 'r')
    total_num_patches = hfile[PATCH_TYPE].attrs['cursor']
    dim = hfile[PATCH_TYPE].shape[1]
    if num_images == 0:
        num_images = hfile['image_index'].shape[0]
    # Getting patches only from desired number of images
    image_index = hfile['image_index'][:min(num_images,
                                            hfile['image_index'].shape[0])]
    num_patches = min(image_index[-1, 1], total_num_patches)
    hfile.close()
    return (num_patches, dim)
def __init__(self, raw_files, comm=None, blocksize=2**16*20*2*4):
    """
    Initialize a lofar observation, tracking/joining the two polarizations.
    We also parse the corresponding HDF5 files to initialize:
    nchan, samplerate, fwidth
    """
    # read the HDF5 file and get useful data
    h0 = HDF5File(raw_files[0].replace('.raw', '.h5'), 'r')
    saps = sorted([i for i in h0.keys() if 'SUB_ARRAY_POINTING' in i])
    s0 = h0[saps[0]]
    time0 = Time(s0.attrs['EXPTIME_START_UTC'].replace('Z', ''), scale='utc')
    beams = sorted([i for i in s0.keys() if 'BEAM' in i])
    b0 = s0[beams[0]]
    frequencies = (b0['COORDINATES']['COORDINATE_1']
                   .attrs['AXIS_VALUES_WORLD'] * u.Hz).to(u.MHz)
    fbottom = frequencies[0]
    stokes = sorted([i for i in b0.keys()
                     if 'STOKES' in i and 'i2f' not in i])
    st0 = b0[stokes[0]]
    dtype = _lofar_dtypes[st0.attrs['DATATYPE']]
    nchan = len(frequencies)  # = st0.attrs['NOF_SUBBANDS']
    # can also get from np.diff(frequencies.diff).mean()
    fwidth = (b0.attrs['SUBBAND_WIDTH'] *
              u.__dict__[b0.attrs['CHANNEL_WIDTH_UNIT']]).to(u.MHz)
    samplerate = (b0.attrs['SAMPLING_RATE'] *
                  u.__dict__[b0.attrs['SAMPLING_RATE_UNIT']]).to(u.MHz)
    h0.close()
    self.time0 = time0
    self.samplerate = samplerate
    self.fwidth = fwidth
    self.frequencies = frequencies
    self.fedge = fbottom
    self.fedge_at_top = False
    self.dtsample = (1. / self.fwidth).to(u.s)
    super(LOFARdata, self).__init__(raw_files, blocksize, dtype, nchan,
                                    comm=comm)
    # update some of the hdu data
    self['PRIMARY'].header['DATE-OBS'] = self.time0.isot
    self[0].header.update('TBIN', (1. / samplerate).to('s').value)
def write_out_file(infile, outfile, tree=None):
    f = root_open(infile)
    T = f[tree]
    cells = filter(lambda x: x.startswith('cell'), T.branchnames)
    assert len(cells) == sum(map(np.prod, LAYER_SPECS)) + OVERFLOW_BINS

    X = pd.DataFrame(tree2array(T, branches=cells)).values
    E = pd.DataFrame(tree2array(T, branches=['TotalEnergy'])).values.ravel()

    with HDF5File(outfile, 'w') as h5:
        for layer, (sh, (l, u)) in enumerate(zip(LAYER_SPECS, LAYER_DIV)):
            h5['layer_{}'.format(layer)] = X[:, l:u].reshape((-1, ) + sh)
        h5['overflow'] = X[:, -OVERFLOW_BINS:]
        h5['energy'] = E.reshape(-1, 1)
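# Hedged reader sketch (not from the original repository): both
# `write_out_file` variants above store one "layer_N" dataset per calorimeter
# layer plus "overflow" and "energy" datasets. Assuming HDF5File is h5py.File
# and LAYER_SPECS is the same module-level constant used when writing, the
# arrays could be read back like this; the function name is hypothetical.
def _example_read_calo_file(path, n_layers=len(LAYER_SPECS)):
    with HDF5File(path, 'r') as h5:
        layers = [h5['layer_{}'.format(i)][:] for i in range(n_layers)]
        overflow = h5['overflow'][:]
        energy = h5['energy'][:]
    return layers, overflow, energy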
def save(self, hdf5):
    """Saves the current machine parameters in an `HDF5File` object."""
    if isinstance(hdf5, str):
        hdf5 = HDF5File(hdf5, "w")
    hdf5.attrs["file_version"] = "1.0"
    hdf5.attrs["writer_class"] = str(self.__class__)
    hdf5["n_gaussians"] = self.n_gaussians
    hdf5["trainer"] = self.trainer
    hdf5["convergence_threshold"] = self.convergence_threshold
    hdf5["max_fitting_steps"] = self.max_fitting_steps
    hdf5["weights"] = self.weights
    hdf5["update_means"] = self.update_means
    hdf5["update_variances"] = self.update_variances
    hdf5["update_weights"] = self.update_weights
    gaussians_group = hdf5.create_group("gaussians")
    gaussians_group["means"] = self.means
    gaussians_group["variances"] = self.variances
    gaussians_group["variance_thresholds"] = self.variance_thresholds
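# Hedged usage sketch (not from the original sources): saving a trained
# GMMMachine with the `save` method above and reloading it with the
# `from_hdf5` reader shown earlier. The file name is hypothetical; for a
# MAP-trained machine the prior UBM must be passed back in, since `from_hdf5`
# raises ValueError otherwise.
def _example_gmm_machine_round_trip(machine, path="gmm_machine_example.hdf5",
                                    ubm=None):
    machine.save(path)
    return GMMMachine.from_hdf5(path, ubm=ubm)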
def imread_mat(filename, name=None):
    """
    Read an 'image' from a MATLAB .MAT file. The file can be any version.
    Files that are v7.3 require the h5py module. If no name is given, the
    first variable is taken.
    """
    try:
        # Try the general reader first (doesn't work for v7.3+ files)
        # SciPy has this built in
        # Supports loading just the given variable name
        # Otherwise we have to load all variables and skip special keys
        # starting with "__" to find the variable to load
        # Loaded matrices are already arrays
        from scipy.io import loadmat
        if name is None:
            try:
                # Try to get the first variable name without loading the
                # entire file (only supported in SciPy 0.12+)
                from scipy.io import whosmat
                keys = whosmat(filename)
                if len(keys) == 0:
                    raise KeyError()
                name = keys[0][0]
            except:
                pass
        x = loadmat(filename, variable_names=name)
        if name is None:
            name = '__'  # we need to find the first variable
            for name in x.iterkeys():
                if name[:2] != '__':
                    break
            if name[:2] == '__':
                raise KeyError()  # no variables
        return x[name]  # can raise KeyError
    except NotImplementedError:
        # Try a v7.3 file, which is an HDF5 file
        # We have to use h5py for this (or PyTables...)
        # Always loads the entire metadata (not just the specific variable)
        # but none of the data
        # Data needs to be actually loaded (.value) and transposed (.T)
        from h5py import File as HDF5File  # TODO: if import error try using PyTables
        # IOError if it doesn't exist or is the wrong format
        with HDF5File(filename, 'r') as x:
            if name is None:
                try:
                    name = x.iterkeys().next()
                except StopIteration:
                    raise KeyError()  # no variables
            return x[name].value.T  # can raise KeyError
def __init__(self, output_name, output_dir, num_files, patches, feature_type,
             patch_dim=128, patch_type='uint8', pos_type='uint16'):
    self.log = get_logger()
    output_subdir = output_dir
    try:
        makedirs(output_subdir)
    except:
        pass
    output_filename = join(output_subdir, basename(output_name))
    self.log.debug('Saving extracted descriptors to %s', output_filename)
    self.mode = 'creating'
    dt = special_dtype(vlen=bytes)
    patches += 10  # for safety
    self.hfile = HDF5File(output_filename, 'w', compression='gzip',
                          fillvalue=0.0)
    self.patches = self.hfile.create_dataset(
        'patches', (num_files * patches, patch_dim), dtype=patch_type,
        chunks=True)
    self.positions = self.hfile.create_dataset(
        'positions', (num_files * patches, 2), dtype=pos_type, chunks=True)
    # Start, End positions of an image
    self.image_index = self.hfile.create_dataset(
        'image_index', (num_files, 2), dtype='uint64')
    self.keys = self.hfile.create_dataset('keys', (num_files, ), dtype=dt)
    self.key_set = set()
    self.patches.attrs['cursor'] = 0
    self.patches.attrs['feature_type'] = feature_type
    self.output_filename = output_filename
def get_patches(filename, num_images, position_influence=0):
    hfile = HDF5File(filename, 'r')
    total_num_patches = hfile[PATCH_TYPE].attrs['cursor']
    if num_images == 0:
        num_images = hfile['image_index'].shape[0]
    # Getting patches only from desired number of images
    image_index = hfile['image_index'][:min(num_images,
                                            hfile['image_index'].shape[0])]
    num_patches = min(image_index[-1, 1], total_num_patches)
    patches = hfile[PATCH_TYPE][:num_patches, :]
    # patches = patches.astype(float)
    # norms = (patches**2).sum(axis=1)**0.5
    # patches /= norms.max()
    #patches = patches.astype(float)
    #patches /= patches.max()
    feature_type = hfile[PATCH_TYPE].attrs.get('feature_type', None)
    # if feature_type == 'DECAF':
    #     norms = (patches**2).sum(axis=1)**0.5
    #     patches /= norms.max()
    if position_influence > 0:
        pos = hfile['positions'][:num_patches, :]
        pos = pos.astype(float)
        max_x = pos[:, 0].max()
        max_y = pos[:, 1].max()
        if max_x > 0:
            pos[:, 0] /= max_x
        if max_y > 0:
            pos[:, 1] /= max_y
        patches = np.hstack([patches, position_influence * pos])
    hfile.close()
    return (patches, image_index)
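# Hedged usage sketch (not from the original sources): `get_patches` returns
# the patch matrix (optionally with normalized positions appended and scaled
# by position_influence) together with the per-image [start, end) index
# ranges. The file name, image count, and influence value are hypothetical.
def _example_load_class_patches(path="backpack.hdf5", num_images=20):
    patches, image_index = get_patches(path, num_images,
                                       position_influence=0.5)
    first_start, first_end = image_index[0]  # patch range of the first image
    return patches[first_start:first_end, :]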
def load(self):
    get_logger().info("Loading patches for " + self.file_name)
    hfile = HDF5File(self.file_name, 'r')
    patches = hfile[self.patch_name]
    feature_dim = patches.shape[1]
    indexes = self.indexes
    num_patches = (indexes[:, 1] - indexes[:, 0]).sum()
    self.patches = np.empty([num_patches, feature_dim])
    self.new_index = np.empty([indexes.shape[0], 2])
    patch_start = n_image = 0
    for iid in indexes:
        n_patches = iid[1] - iid[0]
        self.patches[patch_start:patch_start + n_patches, :] = \
            patches[iid[0]:iid[1], :]
        self.new_index[n_image] = [patch_start, patch_start + n_patches]
        patch_start += n_patches
        n_image += 1
    hfile.close()
    get_logger().info("Loaded " + str(num_patches) + " patches")
def getIndexes(patch_folder, nTrain, nTest, position_influence):
    files = sorted(glob(join(patch_folder, '*.hdf5')), key=basename)
    train = []
    test = []
    for (classNumber, filename) in enumerate(files):
        #support_filename = join(".", basename(filename))
        hfile = HDF5File(filename, 'r')
        iid = hfile["image_index"][:]
        nImages = iid.shape[0]
        assert nImages >= (nTrain + nTest), "Not enough images!"
        np.random.shuffle(iid)
        trainIdx = iid[0:nTrain]
        testIdx = iid[nTrain:nTrain + nTest]
        trainData = ClassPatches(filename, trainIdx, PATCH_TYPE)
        testData = ClassPatches(filename, testIdx, PATCH_TYPE)
        test.append(testData)
        # train data is actually loaded only when needed
        train.append(trainData)
        hfile.close()
    Data = namedtuple("Data", "Train Test")
    return Data(train, test)
def make_split(output_name, output_subdir, _patches, _positions,
               _image_indexes):
    log = get_logger()
    #import pdb; pdb.set_trace()
    try:
        makedirs(output_subdir)
    except:
        pass
    output_filename = join(output_subdir, basename(output_name))
    log.debug('Saving extracted descriptors to %s', output_filename)
    hfile = HDF5File(output_filename, 'w', compression='gzip', fillvalue=0.0)
    patches = hfile.create_dataset('patches', _patches.shape, dtype="float32",
                                   chunks=True)
    positions = hfile.create_dataset('positions', _positions.shape,
                                     dtype="uint16", chunks=True)
    # Start, End positions of an image
    image_index = hfile.create_dataset('image_index', _image_indexes.shape,
                                       dtype='uint64')
    patches[:] = _patches
    positions[:] = _positions
    image_index[:] = _image_indexes
    patches.attrs['cursor'] = 0
    patches.attrs['feature_type'] = "CAFFE"
    hfile.close()
def load_split_whole_image_only(input_folder, nTrain, nTest):
    logger = get_logger()
    files = sorted(glob(join(input_folder, '*.hdf5')), key=basename)
    nClasses = len(files)
    logger.info("Loading " + str(nClasses) + " classes")
    # nClasses*nSamples x nFeatures
    train_patches = np.empty([nClasses * nTrain, patchOptions.size])
    test_patches = np.empty([nClasses * nTest, patchOptions.size])
    train_labels = np.empty([nClasses * nTrain])
    test_labels = np.empty([nClasses * nTest])
    start = time.clock()
    train_patch_count = test_patch_count = 0
    for (classNumber, filename) in enumerate(files):
        hfile = HDF5File(filename, 'r')
        iid = hfile["image_index"][:]
        nImages = iid.shape[0]
        assert nImages >= (nTrain + nTest), "Not enough images!"
        np.random.shuffle(iid)
        trainIdx = iid[0:nTrain]
        testIdx = iid[nTrain:nTrain + nTest]
        patches = hfile[patchOptions.patch_name]
        for iid in trainIdx:
            train_patches[train_patch_count] = patches[iid[0]]
            train_patch_count += 1
        train_labels[classNumber * nTrain:(classNumber + 1) * nTrain] = \
            classNumber * np.ones(nTrain)
        for iid in testIdx:
            test_patches[test_patch_count] = patches[iid[0]]
            test_patch_count += 1
        test_labels[classNumber * nTest:(classNumber + 1) * nTest] = \
            classNumber * np.ones(nTest)
        logger.info("Patch count: " + str(train_patch_count) +
                    " training and " + str(test_patch_count) +
                    " test patches for class " + filename)
        hfile.close()
    end = time.clock()
    logger.info("It took " + str((end - start)) + " seconds")
    LoadedData = namedtuple(
        "LoadedData", "train_patches train_labels test_patches test_labels")
    return LoadedData(train_patches, train_labels, test_patches, test_labels)
def load_patches(class_data):
    hfile = HDF5File(class_data.filename, 'r')
    patches = hfile[patchOptions.patch_name][:]
    positions = hfile[patchOptions.position_name][:]
    feature_dim = patchOptions.patch_dim
    indexes = class_data.index
    num_patches = (indexes[:, 1] - indexes[:, 0]).sum()
    loaded_patches = np.empty([num_patches, feature_dim])
    loaded_positions = np.empty([num_patches, 2])
    loaded_idx = np.empty([class_data.index.shape[0], 2])
    tags = np.zeros([num_patches, 1])
    patch_start = n_image = 0
    #import pdb; pdb.set_trace()
    for n, iid in enumerate(indexes):
        n_patches = iid[1] - iid[0]
        loaded_patches[patch_start:patch_start + n_patches, :] = \
            patches[iid[0]:iid[1], :]
        loaded_positions[patch_start:patch_start + n_patches, :] = \
            positions[iid[0]:iid[1], :]
        loaded_idx[n, :] = np.array([patch_start, patch_start + n_patches])
        tags[patch_start] = 1
        patch_start += n_patches
        n_image += 1
    hfile.close()
    return ClassData(loaded_patches, loaded_positions, tags, num_patches,
                     loaded_idx)