def write_to_disk(data_dict: TimeSeriesDict, seg_start: int, f: File):
    """Write a TimeSeriesDict to a gwpy-compatible .hdf5 file.

    Supports appending to an existing file.
    """
    # NOTE: ``writing_opts`` and ``get_last_time`` are module-level names
    # defined elsewhere in this module.
    for name in data_dict:
        # deal with each TimeSeries in the TimeSeriesDict.
        data = data_dict[name]
        try:
            # create a gwpy-compatible h5py dataset.
            data.write(f, **writing_opts)
        except RuntimeError:
            # this RuntimeError is raised when the dataset already exists.
            # use the h5py File object to get a direct handle on the existing dataset.
            dataset = f[name]

            # compute the time offset between the existing data and the new data.
            secs = seg_start - get_last_time(dataset)
            # cast to int so resize() receives an integral number of points.
            padding = int(secs / dataset.attrs['dx'])
            # print(f'write: padding from {get_last_time(dataset)} to {seg_start} ({secs}s, {padding}pts)')

            if data.value.shape[0] < -padding:
                # this would resize the dataset to be smaller than it already is.
                raise RuntimeError('insertion is not supported.')
            else:
                # append data to the end of the dataset.
                dataset.resize(dataset.shape[0] + padding + data.value.shape[0],
                               axis=0)
                dataset[-data.value.shape[0]:] = data.value
                f.flush()  # sync to disk
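# Minimal sketch of the resize-and-append pattern used in the except branch
# above, in plain h5py (no gwpy). The file name and dataset name are
# illustrative, not from the source.
import numpy as np
import h5py

with h5py.File('append_demo.h5', 'w') as f:
    # a dataset must be created with an unlimited first axis to be resizable.
    ds = f.create_dataset('series', data=np.arange(10.0), maxshape=(None,))
    new = np.arange(10.0, 15.0)
    ds.resize(ds.shape[0] + new.shape[0], axis=0)  # grow the first axis
    ds[-new.shape[0]:] = new                       # write into the new tail
    f.flush()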
def write_to_hdf5(path):
    """ Take the FITS file at ``path`` and save it into the HDF5 archive """
    # NOTE: ``hdf5_path``, ``cols`` and ``header_to_dict`` are module-level
    # names defined elsewhere.
    kid = path.split('kplr')[1].split('-')[0]
    lc_data = fits.getdata(path)
    # 'a' also creates the file when it does not exist, so both branches
    # behave the same; the explicit check is kept for clarity.
    if os.path.exists(hdf5_path):
        h = File(hdf5_path, 'a')
    else:
        h = File(hdf5_path, 'w')
    attrs = header_to_dict(path)
    if kid in h:
        group = h[kid]
    else:
        group = h.create_group(kid)
    dataset_name = f"q{attrs['QUARTER']}"
    if dataset_name not in group:
        dset = group.create_dataset(
            dataset_name,
            data=np.vstack([lc_data[col] for col in cols]))
        # attach the FITS header as dataset attributes; keeping this inside
        # the branch avoids a NameError when the dataset already exists.
        for k, v in attrs.items():
            dset.attrs[k] = v
    h.flush()
    h.close()
def main():
    a = DetPulseCoord()
    fileid = h5f.create(b"test.h5")

    x = ones((100, 3), dtype=int32)
    y = ones((100, 7), dtype=float32)
    z = ones((100, 2), dtype=float32)
    c = [(x[i], y[i], z[i]) for i in range(100)]
    data = {a.names[0]: x, a.names[1]: y}
    dspaceid = h5s.create_simple((1, ), (h5s.UNLIMITED, ))
    # dset = h5d.create(fileid, a.name, a.type, dspaceid)
    # dset.write()

    file = File("test.h5", "a")  # older h5py opened in append mode by default
    numpytype = dtype([("coord", int32, (3, )),
                       ("pulse", float32, (7, )),
                       ("EZ", float32, (2, ))])
    data = array(c, dtype=numpytype)

    # fixed-length string datatypes for the PyTables-style attributes below.
    tid = h5t.C_S1.copy()
    tid.set_size(6)
    H5T6 = Datatype(tid)
    tid.set_size(4)
    H5T_C_S1_4 = Datatype(tid)

    file.create_dataset("DetPulseCoord", data=data)
    file.attrs.create("CLASS", "TABLE", dtype=H5T6)
    file.attrs.create("FIELD_0_NAME", a.names[0])
    file.attrs.create("FIELD_1_NAME", a.names[1])
    file.attrs.create("TITLE", "Detpulse coord pair data")
    file.attrs.create("VERSION", "3.0", dtype=H5T_C_S1_4)
    file.attrs.create("abstime", 1.45e9, dtype=float64, shape=(1, ))
    file.attrs.create("nevents", 122421, dtype=float64, shape=(1, ))
    file.attrs.create("runtime", 125000, dtype=float64, shape=(1, ))
    file.flush()
def train_one_case_generic_save_data(train_result: dict, key_this: str,
                                     f_out: h5py.File, y_test: np.ndarray,
                                     eval_fn):
    assert {'y_test_hat', 'corr'} <= train_result.keys() <= {
        'y_test_hat', 'corr', 'attrs', 'model'
    }
    # save predictions
    y_test_hat = train_result['y_test_hat']
    assert np.all(np.isfinite(y_test_hat))
    assert y_test_hat.ndim == 2 and y_test_hat.shape[1] == 1
    assert y_test.shape == y_test_hat.shape

    grp_this = f_out.create_group(key_this)
    grp_this.create_dataset('y_test_hat', data=y_test_hat)

    assert np.isscalar(train_result['corr']) and np.isfinite(
        train_result['corr'])
    # the stored performance number must be reproducible from the predictions.
    assert eval_fn(y_test_hat, y_test) == train_result['corr']
    grp_this.create_dataset('corr', data=train_result['corr'])
    print('performance', train_result['corr'])

    if 'attrs' in train_result:
        # save attrs
        for k, v in train_result['attrs'].items():
            grp_this.attrs[k] = v

    if 'model' in train_result:
        grp_this_model = grp_this.create_group('model')
        if isinstance(train_result['model'], dict):
            for k_model, v_model in train_result['model'].items():
                grp_this_model.create_dataset(k_model, data=v_model)
        else:
            # for Gabor.
            assert callable(train_result['model'])
            train_result['model'](grp_this_model)
    f_out.flush()
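# Hedged usage sketch for the saver above. ``pearson`` and the result dict
# below are illustrative stand-ins, not part of the original code; the only
# contract the function enforces is that eval_fn(y_test_hat, y_test)
# reproduces 'corr' exactly, so we compute 'corr' with the same function.
import numpy as np
import h5py

def pearson(y_hat, y):
    # both arrays are (n, 1); flatten before correlating.
    return np.corrcoef(y_hat.ravel(), y.ravel())[0, 1]

y_test = np.random.rand(50, 1)
y_hat = y_test + 0.1 * np.random.rand(50, 1)
result = {'y_test_hat': y_hat, 'corr': pearson(y_hat, y_test)}

with h5py.File('results_demo.h5', 'w') as f_out:
    train_one_case_generic_save_data(result, 'case_0', f_out, y_test, pearson)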
def analyzeExperiment(self, experimentResults):
    # write to data file
    super(Origin, self).toHDF5(experimentResults[self.settings])
    # and to settings file
    f = None  # guard: File() itself may raise, leaving f unset
    try:
        f = File('settings.hdf5', 'a')
        super(Origin, self).toHDF5(f['settings/experiment'])
        f.flush()  # write changes
    except Exception as e:
        logger.exception('Uncaught Exception in origin.postExperiment.')
    finally:
        if f is not None:
            f.close()  # close the file
    return 0
def handle_one_case_inner(neural_dataset_key, subset, has_val,
                          train_percentage, seed, f_in_idx: h5py.File,
                          features_all: np.ndarray, f_out: h5py.File,
                          transformer: CNNPreprocessor):
    dataset_main_name = split_dataset_name_gen(neural_dataset_key, subset,
                                               has_val, train_percentage,
                                               seed)
    print(f'handle {dataset_main_name}')
    sets_to_handle = ('train', 'val', 'test') if has_val else ('train',
                                                               'test')
    for set_to_handle_this in sets_to_handle:
        data_to_save = dataset_main_name + f'/{set_to_handle_this}/X'
        if data_to_save not in f_out:
            set_original_idx = f_in_idx[dataset_main_name][
                set_to_handle_this].attrs['index']
            # indices must be unique and sorted.
            assert np.array_equal(np.unique(set_original_idx),
                                  set_original_idx)
            set_original = features_all[set_original_idx]
            set_transformed = transformer.transform(set_original)
            # then save
            f_out.create_dataset(data_to_save, data=set_transformed)
            f_out.flush()
            print(f'{set_to_handle_this} done')
        else:
            print(f'{set_to_handle_this} done before')
def handle_one_case_inner(neural_dataset_key, subset, has_val,
                          train_percentage, seed, f_in: h5py.File,
                          f_out: h5py.File,
                          transformer: GLMDataPreprocesser):
    dataset_main_name = split_dataset_name_gen(neural_dataset_key, subset,
                                               has_val, train_percentage,
                                               seed)
    print(f'handle {dataset_main_name}')
    sets_to_handle = ('train', 'val', 'test') if has_val else ('train',
                                                               'test')
    for set_to_handle_this in sets_to_handle:
        data_to_save = dataset_main_name + f'/{set_to_handle_this}/X'
        if data_to_save not in f_out:
            set_original = f_in[dataset_main_name][
                f'{set_to_handle_this}/X'][...]
            set_transformed = transformer.transform(set_original)
            # then save
            f_out.create_dataset(data_to_save, data=set_transformed)
            f_out.flush()
            print(f'{set_to_handle_this} done')
        else:
            print(f'{set_to_handle_this} done before')
def init_file(f: h5py.File, groupname: str = 'data') -> None:
    # drop any stale group of the same name, then (re)create it; both the
    # original branches did the same creation, so they are merged here.
    if groupname in f:
        del f[groupname]
        f.flush()
    grp = f.create_group(groupname)
    add_cur_time_attr(grp)
    f.flush()
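# Minimal usage sketch for init_file; the file name is illustrative, and
# add_cur_time_attr is assumed to be the helper defined alongside init_file.
import h5py

with h5py.File('run_demo.h5', 'a') as f:
    init_file(f, groupname='data')  # idempotent: re-running replaces 'data'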
class Series(object):
    """
    Time series of fMRI images; stores all voxel data in an HDF5 file.
    """

    def __new__(cls, hdf5_path, *args, **kwargs):
        """
        If the hdf5 file exists it will be loaded; otherwise a new hdf5
        file is created, in which case these arguments are required:

        :image_dir: path to the directory that stores the hdr image files.
            NOTE: the images' name order must match the time sequence order.
        :time_interval: time interval between two images, unit: 1 second
        """
        if not exists(hdf5_path):
            image_dir = kwargs['image_dir']
            time_interval = kwargs['time_interval']
            cls.create_from_hdr(image_dir, hdf5_path, time_interval)
        return super(Series, cls).__new__(cls)

    def __init__(self, hdf5_path, cachedir=CACHE, *args, **kwargs):
        """
        Load Series from an hdf5 file.

        :hdf5_path: path to the related hdf5 file.
        :cachedir: path to the cache directory, default: current dir.
        """
        self.h5dict = File(hdf5_path, 'r+')
        for k, v in self.h5dict.attrs.items():
            setattr(self, k, v)
        self.cachedir = cachedir

    def save_attr(self):
        """
        Save self's attributes:

        * break_points
        * simu_intervals

        to self.h5dict.attrs
        """
        for attr in ('break_points', 'simu_intervals'):
            if hasattr(self, attr):
                self.h5dict.attrs[attr] = getattr(self, attr)
        self.h5dict.flush()

    def _memoize(self, func, verbose=0):
        """ helper method for memory caching. """
        if not hasattr(self, '_mymem'):
            self._mymem = joblib.Memory(cachedir=self.cachedir)
        memoized_func = self._mymem.cache(func, verbose=verbose)
        memoized_func.__doc__ = func.__doc__
        return memoized_func

    def _get_series(self, x, y, z):
        """ return the time series (numpy array) at the position (z, y, x) """
        if hasattr(self, 'start') and hasattr(self, 'end'):
            s, e = self.start, self.end
            times = self.h5dict['arr4d'][s:e, y, x, z]
        else:
            times = self.h5dict['arr4d'][:, y, x, z]
        return times

    def get_series(self, *args, **kwargs):
        """ cached method, caches the method on first run """
        self.get_series = self._memoize(self._get_series)
        return self.get_series(*args, **kwargs)

    def _get_arr3d(self, t):
        """ return the 3d (y, x, z) array at the time point t. """
        arr3d = self.h5dict['arr4d'][t, :, :, :]  # (t, y, x, z)
        return arr3d

    def get_arr3d(self, *args, **kwargs):
        """ cached method, caches the method on first run """
        self.get_arr3d = self._memoize(self._get_arr3d)
        return self.get_arr3d(*args, **kwargs)

    def _get_arr2d(self, t, k, axis='xy'):
        """
        return a 2d array at the time point t.

        :t: (int) time point of the 2d array
        :k: (int) index along the remaining dimension, e.g. if
            axis == 'xy', k is the index along the 'z' axis
        :axis: (str) the axes of the 2d array: 'xy' (default), 'yz', 'xz'
        """
        arr3d = self.get_arr3d(t)  # (t, y, x, z)
        assert axis in ('xy', 'yz', 'xz')
        if axis == 'xy':
            # k -> z
            arr2d = arr3d[:, :, k]
        elif axis == 'yz':
            # k -> x
            arr2d = arr3d[:, k, :]
        else:  # 'xz'
            # k -> y
            arr2d = arr3d[k, :, :]
        return arr2d

    def get_arr2d(self, *args, **kwargs):
        """ cached method, caches the method on first run """
        self.get_arr2d = self._memoize(self._get_arr2d)
        return self.get_arr2d(*args, **kwargs)

    def set_break_points(self, time_interval):
        """
        set break points (the image index numbers when the event occurs)

        :time_interval: (tuple) time interval when the event occurs,
            like: (100, 110)
        """
        start, end = time_interval
        assert start >= 0 and end <= self.n_images
        assert start < end
        self.break_points = (start, end)

    def set_range(self, start, end):
        """
        Set start and end positions, to take a subset of the series.
        NOTE: after this method runs, the behavior of `get_series` changes:
        `get_series` will return time_series[start:end]

        :start: (int) start position of the time series
        :end: (int) end position of the time series
        """
        msg = "series range set to (%d, %d)," % (start, end) +\
              " `get_series`'s behavior will change"
        log.warning(msg)
        self._mymem.clear()
        log.info("memory cache cleared")
        self.start = start
        self.end = end

    def set_simu_intervals(self, intervals):
        """
        Set the time intervals for simulation.

        :intervals: (list) a list of intervals, like:
            [(0, 10), (30, 50), (100, 110)]
        """
        # check intervals
        for start, end in intervals:
            assert start >= 0
            assert end <= self.n_images - 1
            assert start < end
        self.simu_intervals = intervals

    def call_simu(self, algorithm, name, *args, **kwargs):
        """
        Call simulation region, store the result in the dict:
        self.simu_results

        :algorithm: the name of the simulation calling method
        :name: (str) the name of this result
        """
        log.info("call simulation region using {} algorithm".format(algorithm))
        calling = importlib.import_module('simucaller.call_simu')
        if not hasattr(self, 'simu_results'):
            self.simu_results = {}
        alg = getattr(calling, algorithm)
        result = alg(self, *args, **kwargs)
        self.simu_results.setdefault(algorithm, {})
        self.simu_results[algorithm][name] = result

    def list_simu_result(self):
        """ list all simulation region call results. """
        res_list = [
            "%s/%s" % (alg_name, name)
            for alg_name, alg_group in
            self.h5dict['simulation_region_call'].items()
            for name, _ in alg_group.items()
        ]
        return res_list

    def save_simu_result(self, algorithm, name):
        """
        Save a simulation region call result to the related hdf5 file.

        :algorithm: (str) name of the algorithm
        :name: (str) the name of the result dataset

        save path: self.h5dict -> simulation_region_call/<algorithm>/<name>
        """
        path = "simulation_region_call/{}/{}".format(algorithm, name)
        result = self.simu_results[algorithm][name]
        log.info("saving simulation call result to path: {}".format(path))
        self.h5dict.create_dataset(path, shape=result.shape)
        log.debug(result.shape)
        log.debug(type(result))
        self.h5dict[path][...] = result
        self.h5dict.flush()

    def get_simu_result(self, algorithm, name):
        """
        Load a simulation region call result from the hdf5 file.

        :algorithm: the result's calling method.
        :name: (str) the result dataset's name.
        """
        result = self.h5dict['simulation_region_call'][algorithm][name][...]
        return result

    @classmethod
    def create_from_hdr(cls, image_dir, hdf5_path, time_interval):
        """
        Create an hdf5 file from hdr images.
        NOTE: the images' name order must match the time sequence order.

        :time_interval: time interval between two images, unit: 1 second
        """
        img_files = [i for i in listdir(image_dir) if i.endswith('.hdr')]
        img_files.sort(key=lambda i: i.split('.')[0])
        img_files = [join(image_dir, i) for i in img_files]
        load_img = lambda f: nib.load(f).get_data()
        log.info("loading hdr images ...")
        imgs = [load_img(i) for i in img_files]
        n_images = len(imgs)
        log.info("{} hdr images loaded.".format(n_images))
        # check image shapes; all images must have the same shape
        shape = y, x, z = imgs[0].shape
        log.info("image shape: {}".format(shape))
        for i, img in enumerate(imgs):
            assert img.shape == shape, \
                "Image {} expected in shape {} but got shape {}".format(
                    img_files[i], img.shape, shape
                )
        arr4d = np.array(imgs)  # shape: (t, y, x, z)
        log.debug(arr4d[arr4d != 0])
        # store data
        h5dict = File(hdf5_path, 'w')
        log.info("hdf5 file created at {}".format(hdf5_path))
        h5dict.create_dataset('arr4d', shape=arr4d.shape)
        h5dict['arr4d'][...] = arr4d
        log.info("time series dataset shape {}".format(arr4d.shape))
        # store meta data
        h5dict.attrs['n_images'] = n_images
        h5dict.attrs['shape'] = arr4d.shape
        h5dict.attrs['time_interval'] = float(time_interval)
        log.info("time interval: {}s".format(time_interval))
        h5dict.close()  # close hdf5 file
        log.info("Series hdf5 file creation finished")

    def __del__(self):
        self.h5dict.close()
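# Hedged usage sketch for Series, based only on its own docstrings.
# 'scans/' and 'exp.h5' are illustrative paths, not from the source.
series = Series('exp.h5', image_dir='scans/', time_interval=2.0)
ts = series.get_series(x=10, y=20, z=5)   # time series at one voxel
vol = series.get_arr3d(t=0)               # full volume at time point 0
series.set_range(0, 100)                  # restrict later get_series calls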
def test_flush(self):
    """ Flush via .flush method """
    fid = File(self.mktemp(), 'w')
    fid.flush()
    fid.close()
def write_data_to_file(datadict: DataDict, f: h5py.File,
                       groupname: str = 'data',
                       append_mode: AppendMode = AppendMode.new,
                       swmr_mode: bool = True) -> None:
    if groupname not in f:
        raise RuntimeError('Group does not exist, initialize file first.')
    grp = f[groupname]

    # if we want to use swmr, we need to make sure that we're not
    # creating any more objects (see hdf5 docs).
    allexist = True
    for k, v in datadict.data_items():
        if k not in grp:
            allexist = False

    # add top-level meta data.
    for k, v in datadict.meta_items(clean_keys=False):
        set_attr(grp, k, v)
    f.flush()

    if allexist and swmr_mode and not f.swmr_mode:
        f.swmr_mode = True

    for k, v in datadict.data_items():
        data = v['values']
        shp = data.shape
        nrows = shp[0]

        # create new dataset, add axes and unit metadata
        if k not in grp:
            maxshp = tuple([None] + list(shp[1:]))
            ds = grp.create_dataset(k, maxshape=maxshp, data=data)

            # add meta data
            add_cur_time_attr(ds)
            if v.get('axes', []) != []:
                set_attr(ds, 'axes', v['axes'])
            if v.get('unit', "") != "":
                set_attr(ds, 'unit', v['unit'])
            for kk, vv in datadict.meta_items(k, clean_keys=False):
                set_attr(ds, kk, vv)
            ds.flush()

        # if the dataset already exists, append data according to the
        # chosen append mode.
        else:
            ds = grp[k]
            dslen = ds.shape[0]

            if append_mode == AppendMode.new:
                newshp = tuple([nrows] + list(shp[1:]))
                ds.resize(newshp)
                ds[dslen:] = data[dslen:]
            elif append_mode == AppendMode.all:
                newshp = tuple([dslen + nrows] + list(shp[1:]))
                ds.resize(newshp)
                ds[dslen:] = data[:]
            ds.flush()
    f.flush()
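# Hedged companion sketch: reading the same file while a writer has
# swmr_mode enabled, as in write_data_to_file above. The path and dataset
# name are illustrative. SWMR needs libver='latest' on both sides;
# ds.refresh() picks up rows appended since the last read.
import h5py

with h5py.File('data_demo.h5', 'r', libver='latest', swmr=True) as f:
    ds = f['data/x']
    ds.refresh()             # see the writer's latest appended rows
    print(ds.shape, ds[-1])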
def make_nuc(ncc_file_path, n3d_file_path, out_file_name):
    if not out_file_name.lower().endswith('.nuc'):
        out_file_name = out_file_name + '.nuc'

    contact_dict = import_contacts(ncc_file_path)
    contact_name = os.path.splitext(os.path.basename(ncc_file_path))[0]
    pos_dict, coords_dict = import_coords(n3d_file_path)

    root = File(out_file_name, mode='w')

    hierarchy = (('contacts', ('original', 'working')),
                 ('display', ()),
                 ('chromosomes', ()),
                 ('dataTracks', ('derived', 'external', 'innate')),
                 ('sample', ('protocol', 'organism', 'tissue')),
                 ('structures', ('0',)),  # one-element tuple; ('0') is just a string
                 ('images', ()))

    for parent, children in hierarchy:
        group = root.create_group(parent)
        for child in children:
            group.create_group(child)

    for child in ('particles', 'restraints', 'transforms', 'coords'):
        root['structures']['0'].create_group(child)

    now = int(time.time())
    random.seed(now)
    root.attrs['id'] = np.array([random.random(), now, now], np.float32)
    root['sample'].attrs['name'] = np.string_('Unknown')

    contact_group = root['contacts']['working'].create_group(contact_name)

    for chromoPair in contact_dict:
        chrA, chrB = chromoPair
        if chrA not in contact_group:
            contact_group.create_group(chrA)
        contact_group[chrA].create_dataset(chrB, dtype=np.uint32,
                                           data=contact_dict[chromoPair].T)

    coords_group = root['structures']['0']['coords']
    particle_group = root['structures']['0']['particles']

    for chromo in coords_dict:
        coords_group.create_dataset(chromo, dtype=np.float64,
                                    data=coords_dict[chromo])
        pos = np.array(pos_dict[chromo], np.uint32)
        group = particle_group.create_group(chromo)
        group.create_dataset('positions', dtype=np.uint32, data=pos)
        chromo_group = root['chromosomes'].create_group(chromo)
        chromo_group.attrs['limits'] = np.array([pos.min(), pos.max()])

    root.flush()
class labelManager(object):
    def __init__(self, fileName, startBlockNum=0):
        self._f = File(fileName, 'r+')
        self._blockNumber = startBlockNum
        self._maxLabelNum = 9999

    def addBlockLabel(self, data, start, stop=None, invert=False):
        if not stop:
            stop = [length + offset
                    for length, offset in zip(data.shape, start)]
        if self._blockNumber <= self._maxLabelNum:
            dataset = self._f[
                'PixelClassification/LabelSets/labels000'].create_dataset(
                    'block%04d' % self._blockNumber,
                    data=(data.astype(np.uint8)))
            dataset.attrs.create('blockSlice',
                                 pointsToPosition(start, stop, invert))
            self._blockNumber += 1
        else:
            print('Warning: maximum label block number exceeded. '
                  'Unable to add further labels.')

    def addMultipleSingleLabels(self, positions, labelValue):
        for point in positions.T:
            # note: the original called self.addLabels, which does not
            # exist; addSingleLabel matches the intended signature.
            self.addSingleLabel(labelValue,
                                pointsToPosition(point, point + 1))

    def addSingleLabel(self, labelValue, position):
        dataset = self._f[
            'PixelClassification/LabelSets/labels000'].create_dataset(
                'block%04d' % self._blockNumber,
                data=[[[[np.uint8(labelValue)]]]])
        dataset.attrs.create('blockSlice', position)
        self._blockNumber += 1

    def clear(self):
        dataset = self._f['PixelClassification/LabelSets/labels000']
        for key in dataset.keys():
            del dataset[key]
        self._blockNumber = 0

    def getSubBlocks(self):
        """ returns subblocks containing the labels together with their
        corresponding offsets """
        dataset = self._f['PixelClassification/LabelSets/labels000']
        labelBlocks = []
        for key in dataset:
            offset = strToPos(dataset[key].attrs.get('blockSlice'))
            values = dataset[key][()]  # .value is deprecated in h5py
            labelBlocks.append([offset, values])
            print(key)
        return labelBlocks

    def getInSingleBlock(self, shape=None):
        """ returns a block containing all the labels. The return is
        guaranteed to start at (0, 0, 0) in global coordinates; it may
        however not cover the whole volume, since there is no good way of
        determining the shape of the raw data from ilastik """
        # get the labels as they are saved in the project
        labeledBlocks = self.getSubBlocks()
        offsets = np.array([labeledBlock[0]
                            for labeledBlock in labeledBlocks])
        shapes = np.array([labeledBlock[1].shape[:3]
                           for labeledBlock in labeledBlocks])
        data = [labelsBlock[1][:, :, :, 0] for labelsBlock in labeledBlocks]
        if shape is None:
            # find out the dimension of the block; there should be a better
            # way of doing that.
            shape = np.max(offsets + shapes[:, :3], axis=0)
        # write all labels into one big array
        labelBlockTotal = np.zeros(shape, dtype=np.uint8)
        for offset, shape, dataBlock in zip(offsets, shapes, data):
            # index with a tuple of slices; indexing with a list is deprecated
            index = (slice(offset[0], offset[0] + shape[0]),
                     slice(offset[1], offset[1] + shape[1]),
                     slice(offset[2], offset[2] + shape[2]))
            labelBlockTotal[index] += dataBlock
        return labelBlockTotal

    def flush(self):
        self._f.flush()

    def changeRawDataPath(self, newPath):
        """ deletes the saved path and replaces it with 'newPath' """
        dataset = self._f['Input Data/infos/lane0000/Raw Data/']
        dataset.pop('filePath')
        dataset.create_dataset('filePath', data=newPath)
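# Hedged usage sketch for labelManager; 'project.ilp' is an illustrative
# ilastik project path, and pointsToPosition/strToPos are assumed helpers
# from the same module.
import numpy as np

lm = labelManager('project.ilp')
block = np.ones((10, 10, 10, 1), dtype=np.uint8)
lm.addBlockLabel(block, start=[0, 0, 0])   # write one label block
labels = lm.getInSingleBlock()             # merge all blocks into one array
lm.flush()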
# NOTE: the opening lines of this snippet were truncated; hinput is the
# already-open input file and houput the output file, reconstructed here
# with placeholder paths.
hinput = File('input.h5', 'r')
houput = File('output.h5', 'w')

for key in hinput.keys():
    hinput.copy('/' + key, houput['/'], name=key)
    # crop each dataset to the slice of interest, preserving attributes.
    if houput[key].ndim == 2:
        houput[key + '_c'] = houput[key][0:76, 181:183]
    elif houput[key].ndim == 3:
        houput[key + '_c'] = houput[key][0:76, 181:183, :]
    elif houput[key].ndim == 1:
        houput[key + '_c'] = houput[key][181:183]
    houput[key + '_c'].attrs.update(houput[key].attrs)
    # replace the full dataset with the cropped copy under the same name.
    del houput[key]
    houput[key] = houput[key + '_c']
    del houput[key + '_c']
    print(houput[key])

houput.attrs.update(hinput.attrs)
houput.flush()
print(hinput['/'])
print(houput['/'])
print([attr for attr in hinput.attrs])
print([attr for attr in houput.attrs])
hinput.close()
houput.close()
class MovieDB(object):
    def __init__(self, name):
        super(MovieDB, self).__init__()
        self.name = name
        path = '/media/qwertyflagstop/data/{}.h5'.format(self.name)
        self.file = File(path, mode='a')

    def download_songs_from_list_in_file(self, file_name, num_workers=10):
        """
        :param file_name: the name of the file with IMDB ids
        :return: nothing, it downloads stuff
        """
        global ids
        global worker_count
        global index
        global tt_index
        with open(file_name, 'r') as fp:
            ids = set(json.load(fp))
        tt_index = 75000
        for k in self.file.keys():  # remove any we already got
            tt_index = max(tt_index, int(k[2:]))
            if k in ids:
                ids.remove(k)
        ids = list(ids)
        worker_count = num_workers
        index = 0
        queue = Queue()
        lock = Lock()
        for j in np.arange(0, num_workers):
            t = Thread(target=fetch_poster, args=[queue, lock, j])
            t.daemon = True
            t.start()
        while worker_count > 0:
            try:
                print('got {} movies'.format(len(self.file.keys())))
                s = queue.get(timeout=4)
                poster_bytes = np.array(s['poster'])
                plot = np.string_(s['plot'])
                self.file.create_dataset('{}/poster'.format(s['id']),
                                         data=poster_bytes)
                self.file.create_dataset('{}/plot'.format(s['id']),
                                         data=plot)
                self.file.flush()
            except Exception:
                # queue.get timed out (or a write failed); keep polling
                # until all workers have finished.
                continue
        print('DONE!')
        self.file.flush()
        self.file.close()

    def view_random__images(self):
        ids = list(self.file.keys())  # list() so the keys can be indexed
        lengths = []
        chars = set()
        import string
        master_txt = open('plots.txt', 'w')
        for i in np.arange(0, len(ids)):
            movie_facts = np.array(self.file['{}/{}'.format(
                ids[i], 'plot')]).tostring()
            movie_facts = json.loads(movie_facts)
            l = movie_facts['Plot']
            p = ''.join([x for x in l if x in string.printable])
            p = p.replace('\n', ' ')
            master_txt.write(p)
            master_txt.write('\n')
            # image_bytes = np.array(self.file['{}/{}'.format(ids[j], 'poster')])
            # v = Image.open(BytesIO(image_bytes.tostring()))
            # v.save('{}_.jpg'.format(i))
        master_txt.close()
def close_file(file: h5py.File):
    file.flush()
    file.close()
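# Design note: a context manager makes the flush-and-close pairing above
# automatic, since h5py.File flushes pending buffers on close. A minimal
# alternative sketch with an illustrative path:
import h5py

with h5py.File('demo.h5', 'a') as f:
    f.attrs['touched'] = True
# the file is flushed and closed here, even if the body raises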
def convert_cifar10(directory, output_directory,
                    output_filename='cifar10.hdf5'):
    """Converts the CIFAR-10 dataset to HDF5.

    Converts the CIFAR-10 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as
    'cifar10.hdf5'.

    It assumes the existence of the following file:

    * `cifar-10-python.tar.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'cifar10.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = File(output_path, mode='w')
    input_file = os.path.join(directory, DISTRIBUTION_FILE)
    tar_file = tarfile.open(input_file, 'r:gz')
    train_batches = []
    for batch in range(1, 6):
        file = tar_file.extractfile(
            'cifar-10-batches-py/data_batch_%d' % batch)
        try:
            if six.PY3:
                array = cPickle.load(file, encoding='latin1')
            else:
                array = cPickle.load(file)
            train_batches.append(array)
        finally:
            file.close()

    train_features = numpy.concatenate(
        [batch['data'].reshape(batch['data'].shape[0], 3, 32, 32)
         for batch in train_batches])
    train_labels = numpy.concatenate(
        [numpy.array(batch['labels'], dtype=numpy.uint8)
         for batch in train_batches])
    train_labels = numpy.expand_dims(train_labels, 1)
    print(train_features.shape)
    print(train_labels.shape)

    # augment the training set with horizontally flipped copies,
    # interleaving each image with its mirror; labels are repeated to match.
    flipped_train_features = train_features[:, :, :, ::-1]
    train_features = numpy.array(
        [val for pair in zip(train_features, flipped_train_features)
         for val in pair])
    train_labels = numpy.repeat(train_labels, 2, axis=0)
    print(train_features.shape)
    print(train_labels.shape)

    file = tar_file.extractfile('cifar-10-batches-py/test_batch')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0], 3, 32, 32)
    test_labels = numpy.array(test['labels'], dtype=numpy.uint8)
    test_labels = numpy.expand_dims(test_labels, 1)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)