def test_complibs_default_settings(setup_path):
    # GH15943
    df = tm.makeDataFrame()

    # Set complevel and check if complib is automatically set to
    # default value
    with ensure_clean_path(setup_path) as tmpfile:
        df.to_hdf(tmpfile, "df", complevel=9)
        result = read_hdf(tmpfile, "df")
        tm.assert_frame_equal(result, df)

        with tables.open_file(tmpfile, mode="r") as h5file:
            for node in h5file.walk_nodes(where="/df", classname="Leaf"):
                assert node.filters.complevel == 9
                assert node.filters.complib == "zlib"

    # Set complib and check to see if compression is disabled
    with ensure_clean_path(setup_path) as tmpfile:
        df.to_hdf(tmpfile, "df", complib="zlib")
        result = read_hdf(tmpfile, "df")
        tm.assert_frame_equal(result, df)

        with tables.open_file(tmpfile, mode="r") as h5file:
            for node in h5file.walk_nodes(where="/df", classname="Leaf"):
                assert node.filters.complevel == 0
                assert node.filters.complib is None

    # Check if not setting complib or complevel results in no compression
    with ensure_clean_path(setup_path) as tmpfile:
        df.to_hdf(tmpfile, "df")
        result = read_hdf(tmpfile, "df")
        tm.assert_frame_equal(result, df)

        with tables.open_file(tmpfile, mode="r") as h5file:
            for node in h5file.walk_nodes(where="/df", classname="Leaf"):
                assert node.filters.complevel == 0
                assert node.filters.complib is None

    # Check if file-defaults can be overridden on a per table basis
    with ensure_clean_path(setup_path) as tmpfile:
        store = HDFStore(tmpfile)
        store.append("dfc", df, complevel=9, complib="blosc")
        store.append("df", df)
        store.close()

        with tables.open_file(tmpfile, mode="r") as h5file:
            for node in h5file.walk_nodes(where="/df", classname="Leaf"):
                assert node.filters.complevel == 0
                assert node.filters.complib is None
            for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
                assert node.filters.complevel == 9
                assert node.filters.complib == "blosc"
def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'):
    """Save the data in the standardized HDF5 file format

    This function wraps the :meth:`to_hdf` function of Pandas and supplies
    fixed values for some of the options so that the data is written in a
    reproducible format. The data is always stored under the key 'raw_data'.

    Parameters
    ----------
    path_or_buf : str or pandas.HDFStore
        File path or HDFStore object.
    complevel : {0-9}, optional
        Specifies a compression level for data.
        A value of 0 disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    """
    # Cast the data as a DataFrame, as Pandas's HDF I/O logic does not support subclasses
    # Note that we use the "table" format to allow for partial reads / querying
    data = pd.DataFrame(self)
    data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib,
                append=False, format='table', index=False)

    # Create logic for adding metadata
    def add_metadata(f: HDFStore):
        """Put the metadata in a standard location at the root of the HDF file"""
        f.root._v_attrs.metadata = self.metadata.json()

    # Apply the metadata addition function
    path_or_buf = stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        with HDFStore(path_or_buf, mode='a', complevel=complevel,
                      complib=complib) as store:
            add_metadata(store)
            store.flush()
    else:
        add_metadata(path_or_buf)
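# A minimal usage sketch for to_batdata_hdf above; `cell` stands for an
# existing BatteryDataFrame-style instance (hypothetical name), and the file
# name and compression settings are illustrative assumptions.
cell.to_batdata_hdf('cell01.h5', complevel=9, complib='blosc')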
def test_load_multi_alpha(self):
    from multicov.align_io import from_hdf
    from multicov.alignment import Alignment
    from multicov.alphabet import protein_alphabet, dna_alphabet
    from pandas import HDFStore

    store = HDFStore(os.path.join('test_data', 'test_aln.h5'), 'r')
    align = from_hdf(store, 'align2')
    expected = Alignment(['IVGGYTCQ', '-VGGTEAQ', 'IGG-KDT-'], protein_alphabet)
    expected2 = Alignment(['AGCT', '-G-G', 'TA-T'], dna_alphabet)
    expected.add(expected2)
    store.close()
    self.assertEqual(align, expected)
def generate(self, data):
    file = "data/pepper_pose.h5"
    if os.path.exists(file):
        # Load cached poses instead of regenerating them
        with HDFStore(file) as store:
            self.pose_data = store['data']
        return

    print("Generating pepper pose...")
    motion_data = data["motion_data"].values
    self.emotion_intended = data["intended_emotion"].values
    self.emotion_perceived = data["accurate_category"].values

    for seq in motion_data:
        # Convert once per sequence rather than once per joint
        seq_array = numpy.array(seq)
        for i in range(PEPPER_JOINTS.NUMBER):
            to_pose = seq_array[:, MAP_PEPPER_BVH[i]] * motion.TO_RAD
            self.poses[MAP_PEPPER_STR[i]].append(to_pose)

    self.pose_data = pandas.DataFrame(self.poses)
    with HDFStore(file) as store:
        store['data'] = self.pose_data
def load_HDF5_to_xarray(filename):
    """Load HDF file into an xarray DataArray using pandas HDFStore

    Requires pytables.
    """
    from pandas import HDFStore
    from xarray import DataArray

    with HDFStore(filename) as f:
        data = f['data']
        if 'metadata' in f.get_storer('data').attrs:
            metadata = f.get_storer('data').attrs.metadata
        else:
            metadata = None
    return DataArray(data, attrs=metadata)
def save_pandas_table(table_dict, ID, save_path):
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    import warnings
    from pandas import HDFStore
    from pandas.io.pytables import PerformanceWarning
    warnings.filterwarnings('ignore', category=PerformanceWarning)

    with HDFStore(os.path.join(save_path, ID + '.h5')) as store:
        for name, table in table_dict.items():
            store[name] = table
def open_file_if_present(endpoint, values):
    if 'file_name' in values:
        g.file_name = file_name = values.pop('file_name', None)
        if file_name not in open_files:
            update_files()
            if file_name not in h5_files:
                abort(500)
            open_files[file_name] = HDFStore(h5_files[file_name], 'r')
        g.h = open_files[file_name]
        # noinspection PyProtectedMember
        g.h5h = g.h._handle
def test_read_hdf(tmpdir, test_df):
    # Write it
    out_path = os.path.join(tmpdir, 'test.h5')
    test_df.to_batdata_hdf(out_path)

    # Read it
    data = BatteryDataFrame.from_batdata_hdf(out_path)
    assert data.metadata.name == 'Test data'

    # Test reading from an already-open file
    store = HDFStore(out_path, 'r')
    data = BatteryDataFrame.from_batdata_hdf(store)
    assert data.metadata.name == 'Test data'
    store.close()
def test_save_simulation():
    size_generation = 1
    cohort = create_neutral_profiles_cohort(population=size_generation)
    simulation = Simulation()
    simulation.country = 'france'
    simulation.discount_rate = 0
    simulation.growth_rate = 0
    simulation.cohorts = cohort
    simulation.create_present_values('tax')
    simulation.save_simulation(filename='test_save')

    store = HDFStore(os.path.join(SRC_PATH, 'countries', 'france', 'sources',
                                  'Output_folder', 'test_save.h5'))
    assert store['aggregate_pv'] is not None
    store.close()
def loadSpace(self):
    file = "data/mpi_data.h5"
    if os.path.exists(file):
        # Return cached feature space if it was already computed
        with HDFStore(file) as store:
            return store['data']

    self.loadData()

    # remove redundant columns
    delete_cols = [
        "motion_id", "accurate_polarity", "duration", "peaks", "speed",
        "acting_subtask", "actor", "gender", "age", "handedness",
        "native_tongue", "text", "span", "acting_task"
    ]
    for col in delete_cols:
        del self.csv_data[col]

    # save data
    with HDFStore(file) as store:
        store['data'] = self.csv_data
    return self.csv_data
def __init__(self, cache_dir=".cache"):
    os.makedirs(cache_dir, exist_ok=True)  # mkdir -p ...
    self.fn = join(cache_dir, "lmk.hd5")
    with HDFStore(self.fn) as cache:
        if TABLE_RANGE in cache:
            self.range = cache.get(TABLE_RANGE)
        else:
            self.range = DataFrame(columns=["start", "end"],
                                   dtype="datetime64[ns]")
        if TABLE_NAME in cache:
            self.name = cache.get(TABLE_NAME)
        else:
            self.name = Series([])
def save_xarray_to_HDF5(dataArray, filename, complib=None):
    """Save the xarray DataArray to HDF file using pandas HDFStore

    attrs will be saved as metadata via pickle.
    Requires pytables.

    complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None
    """
    from pandas import HDFStore

    with HDFStore(filename, mode='w', complib=complib) as f:
        f.put('data', dataArray.to_pandas())
        if len(dataArray.attrs) > 0:
            f.get_storer('data').attrs.metadata = dataArray.attrs
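# A minimal round-trip sketch for the two xarray helpers above
# (save_xarray_to_HDF5 / load_HDF5_to_xarray). The array contents and the
# 'roundtrip.h5' filename are illustrative assumptions, not part of the
# original code.
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(6.0).reshape(2, 3),
                  dims=('x', 'y'),
                  attrs={'units': 'm/s'})
save_xarray_to_HDF5(da, 'roundtrip.h5')
restored = load_HDF5_to_xarray('roundtrip.h5')
assert restored.attrs['units'] == 'm/s'  # metadata survives the round trip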
def test_read_missing_key_opened_store(setup_path):
    # GH 28699
    with ensure_clean_path(setup_path) as path:
        df = DataFrame({"a": range(2), "b": range(2)})
        df.to_hdf(path, "k1")

        with HDFStore(path, "r") as store:
            with pytest.raises(KeyError, match="'No object named k2 in the file'"):
                read_hdf(store, "k2")

            # Test that the file is still open after a KeyError and that we can
            # still read from it.
            read_hdf(store, "k1")
def get(self, symbol, start, end):
    table = self._table_name(symbol)

    if symbol in self.range.index:
        # Timestamp -> date
        r = self.range.loc[symbol]
        _start, _end = r["start"].date(), r["end"].date()

        # cache hit - part of existing data.
        if _start <= start <= end <= _end:
            with HDFStore(self.fn) as cache:
                h = cache.get(table)
                h = h.loc[start:end]
                return h
def getRecentlyUploaded():
    listOfFiles = sorted([
        join(server.config['UPLOAD_FOLDER'], f)
        for f in listdir(server.config['UPLOAD_FOLDER']) if f.endswith('.h5')
    ], key=getctime, reverse=True)[:20]

    finalListOfFiles = {}
    for file in listOfFiles:
        with HDFStore(file) as currentFile:
            id = basename(currentFile.filename)[0:-3]
            firstKey = next(iter(currentFile.keys()), None)
            # Guard against empty stores, where there is no key to strip
            finalListOfFiles[id] = firstKey.lstrip("/") if firstKey else None
    return finalListOfFiles
def run_radial1d(radial1d_model, history_fname=None):
    if history_fname:
        if os.path.exists(history_fname):
            logger.warning('History file %s exists - it will be overwritten',
                           history_fname)
            os.remove(history_fname)
        history_buffer = HDFStore(history_fname)
        radial1d_model.atom_data.lines.to_hdf(history_buffer, 'atom_data/lines')
        radial1d_model.atom_data.levels.to_hdf(history_buffer, 'atom_data/levels')

    start_time = time.time()
    initialize_j_blues = True
    initialize_nlte = True
    update_radiation_field = False
    while radial1d_model.iterations_remaining > 1:
        logger.info('Remaining run %d', radial1d_model.iterations_remaining)
        radial1d_model.simulate(update_radiation_field=update_radiation_field,
                                enable_virtual=False,
                                initialize_nlte=initialize_nlte,
                                initialize_j_blues=initialize_j_blues)
        initialize_j_blues = False
        initialize_nlte = False
        update_radiation_field = True

        if history_fname:
            radial1d_model.to_hdf5(history_buffer,
                                   path='model%03d' % radial1d_model.iterations_executed,
                                   close_h5=False)

    # Finished second-to-last loop; running one more time
    logger.info('Doing last run')
    if radial1d_model.tardis_config.montecarlo.last_no_of_packets is not None:
        radial1d_model.current_no_of_packets = \
            radial1d_model.tardis_config.montecarlo.last_no_of_packets

    radial1d_model.simulate(enable_virtual=True,
                            update_radiation_field=update_radiation_field,
                            initialize_nlte=initialize_nlte,
                            initialize_j_blues=initialize_j_blues)

    if history_fname:
        radial1d_model.to_hdf5(history_buffer,
                               path='model%03d' % radial1d_model.iterations_executed)

    logger.info("Finished in %d iterations and took %.2f s",
                radial1d_model.iterations_executed, time.time() - start_time)
def load_pandas_table_dict(name, save_path):
    import warnings
    from pandas import HDFStore
    from pandas.io.pytables import PerformanceWarning
    warnings.filterwarnings('ignore', category=PerformanceWarning)

    return_dict = dict()
    with HDFStore(os.path.join(save_path, name + '.h5')) as store:
        # Store keys are absolute paths like '/name'; strip the leading slash
        for k in store.keys():
            return_dict[k[1:]] = store.get(k)
    return return_dict
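# A minimal round-trip sketch for save_pandas_table / load_pandas_table_dict
# above. The table names, directory, and ID are illustrative assumptions.
import pandas as pd

tables = {'scores': pd.DataFrame({'a': [1, 2]}),
          'labels': pd.DataFrame({'b': [3, 4]})}
save_pandas_table(tables, 'run01', './table_cache')
restored = load_pandas_table_dict('run01', './table_cache')
assert sorted(restored) == ['labels', 'scores']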
def load_data(self, uid: str, prefix: str):
    """
    Load data from a specified group (prefix) - gps or gravity - from the
    project's HDF5 store.

    :param str uid: Datafile Unique Identifier
    :param str prefix: Data type prefix [gps or gravity]
    :return: DataFrame for the given group, or None if the key is missing
    """
    with HDFStore(str(self.hdf_path)) as store:
        try:
            data = store.get('{}/{}'.format(prefix, uid))
        except KeyError:
            return None
        else:
            return data
def build_actualisation_group_amounts_h5():
    h5_name = "../actualisation_groups.h5"
    store = HDFStore(h5_name)
    xls = ExcelFile('actualisation_groups.xls')

    df_a = xls.parse('amounts', na_values=['NA'])
    df_a = df_a.set_index(['case'], drop=True)
    df_b = xls.parse('benef', na_values=['NA'])
    df_c = xls.parse('corresp', na_values=['NA'])

    store['amounts'] = df_a
    store['benef'] = df_b
    store['corresp'] = df_c
    print(df_a.to_string())
    print(df_a.columns)
    store.close()
def get_from_store(filename, key, default=None):
    from pandas import HDFStore
    try:
        store = HDFStore(filename, mode='r')
    except OSError:
        # File is missing or unreadable; fall back to the default
        return default
    else:
        out = store.get(key) if key in store else default
        store.close()
        return out
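# A minimal usage sketch for get_from_store above; the file name and key are
# illustrative assumptions. Both a missing file and a missing key fall back
# to the supplied default instead of raising.
df = get_from_store('results.h5', 'summary', default=None)
if df is None:
    print('no cached summary available')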
def load_amounts_from_file(self, filename=None, year=None):
    '''
    Loads totals from files
    '''
    from pandas import concat

    if year is None:
        year = self.year
    if filename is None:
        filename = os.path.join(DATA_DIR, "amounts.h5")

    try:
        store = HDFStore(filename)
        df_a = store['amounts']
        df_b = store['benef']
        store.close()

        self.totals_df = DataFrame(data={
            "amount": df_a[year] / 10 ** 6,
            "benef": df_b[year] / 1000,
        })
        row = DataFrame({'amount': nan, 'benef': nan}, index=['logt'])
        # use concat (DataFrame.append was removed in pandas 2.0)
        self.totals_df = concat([self.totals_df, row])

        # Add some additional totals
        for col in ['amount', 'benef']:
            # Deals with logt
            logt = 0
            for var in ['apl', 'alf', 'als']:
                logt += self.totals_df.at[var, col]
            self.totals_df.at['logt', col] = logt

            # Deals with rsa, rmi
            rsa = 0
            for var in ['rmi', 'rsa']:
                rsa += self.totals_df.at[var, col]
            self.totals_df.at['rsa', col] = rsa

            # Deals with irpp, csg, crds
            for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                if col == 'amount':
                    val = -self.totals_df.at[var, col]
                    self.totals_df.at[var, col] = val
    except Exception:
        import warnings
        warnings.warn("No administrative data available for year %s in file %s"
                      % (str(year), filename))
        self.totals_df = None
    return
def find_unique_snps(x_mwas_files_path, y_mwas_files_path, output_dir,
                     pval_col='Global_Bonferroni', alpha=0.05):
    # find unique snps and count comparisons (for p_value calculations)
    x_mwas_files = glob.glob(x_mwas_files_path)
    y_mwas_files = glob.glob(y_mwas_files_path)
    x_species = list(set(
        ['SGB' + file.split('SGB')[-1].split('.')[0] for file in x_mwas_files]))
    y_species = list(set(
        ['SGB' + file.split('SGB')[-1].split('.')[0] for file in y_mwas_files]))

    # pre run
    snps = set()
    pvals_df = pd.DataFrame(index=y_species, columns=x_species)

    # run
    for x_mwas_file in x_mwas_files:
        x_species = 'SGB' + x_mwas_file.split('SGB')[-1].split('.')[0]
        with HDFStore(x_mwas_file, 'r') as store:
            x_mwas_df = store[x_species]
        y_species, y_species_count = np.unique(
            x_mwas_df.index.get_level_values('Y'), return_counts=True)
        pvals_df.loc[y_species.tolist(), x_species] = y_species_count.tolist()
        snps = snps.union(set(
            x_mwas_df[x_mwas_df[pval_col] < alpha].index.droplevel('Y').values))

    # post run
    snps = pd.DataFrame(snps)
    snps.columns = indices  # 'indices' is assumed defined elsewhere in the source module
    snps = snps.set_index(indices)
    pvals_df.to_csv(os.path.join(output_dir, 'pvals_count.csv'))
    snps = order_columns(snps)
    snps.to_hdf(os.path.join(output_dir, 'snps_unique.h5'), key='snps')

    return snps
def restart(hdfname, newstart):
    '''
    Updates STATE values in HSP2 HDF file to start at later newstart date
    from computed values. User can extend timeseries by predictive or
    historic data to continue simulation. In this case, the user must set
    a new stop date!

    Parameters
    ----------
    hdfname : str
        HSP2 HDF5 file.
    newstart : str (in datetime format for Timestamp)
        DateTime for restarting the simulation.

    Returns
    -------
    None.
    '''
    with HDFStore(hdfname) as store:
        df = store['CONTROL/OP_SEQUENCE']
        delt = df.loc[0, 'INDELT_minutes']

        df = store['CONTROL/GLOBAL']
        start = Timestamp(df.loc['Start', 'Info'])
        stop = Timestamp(df.loc['Stop', 'Info'])
        dates = date_range(start, stop, freq=Minute(delt))

        # determine new start date for restart; previous date if not exact match
        startindx = dates.get_loc(newstart, method='pad')
        startdate = dates[startindx]
        df.loc['Start', 'Info'] = str(startdate)
        df.to_hdf(hdfname, 'CONTROL/GLOBAL', format='table', data_columns=True)

        for path in [p[1:] for p in store.keys() if p.startswith('/RESULTS')]:
            _, x, activity = path.split('/')
            operation, segment = x.split('_')
            if (operation, activity) not in states:
                continue

            df = store[path][states[operation, activity]]
            df = df.iloc[startindx, :].to_frame()
            df.columns = [segment]

            storepath = f'{operation}/{activity}/STATES'
            dff = store[storepath]
            dff.update(df.T)
            dff.to_hdf(store, storepath, format='table', data_columns=True)
    return
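# A minimal usage sketch for restart() above; the file name and date are
# illustrative assumptions. The simulation state is rewound to the saved
# timestep at or immediately before the requested date.
restart('test10.h5', '1976-06-01')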
def __init__(self):
    store = HDFStore('../dataset/labels.h5')

    if (os.path.isfile(classifier_filename) and os.path.isfile(gmm_filename)):
        with open("classifier.p", "rb") as fp:
            classifier = pickle.load(fp)
        print("Loaded Classifier!")
        gmm = load_gmm()
    else:
        try:
            labels_train = store['labels_train_trimmed']
            fv = np.load("fisher_vector_train.npy")
        except (FileNotFoundError, KeyError):
            labels_to_train = store['labels_train']
            skipped_indices, fv, gmm = process_images(labels_to_train,
                                                      delta=delta,
                                                      is_training=True)
            labels_train = load_labels(skipped_indices, labels_to_train, True, delta)
            store['labels_train_trimmed'] = labels_train
            np.save("fisher_vector_train.npy", fv)
            np.save("skipped_indices.npy", skipped_indices)
        classifier = train(fv, labels_train.score)
        # classifier = train(fv, labels_train.good)

    try:
        labels_test = store['labels_test_trimmed']
        fv_test = np.load("fisher_vector_test.npy")
    except (FileNotFoundError, KeyError):
        labels_to_test = store['labels_test']
        skipped_indices_test, fv_test = process_images(labels_to_test,
                                                       is_training=False,
                                                       input_gmm=gmm)
        labels_test = load_labels(skipped_indices_test, labels_to_test, False)
        store['labels_test_trimmed'] = labels_test
        np.save("fisher_vector_test.npy", fv_test)
        np.save("skipped_indices_test.npy", skipped_indices_test)

    accuracy_score(labels_test.good,
                   [0 if label < 5 else 1 for label in classifier.predict(fv_test)])
    f1_score(labels_test.good,
             [0 if label < 0 else 1 for label in classifier.predict(fv_test)])
    roc_auc_score(labels_test.good,
                  [0 if label < 0 else 1 for label in classifier.predict(fv_test)])
    # accuracy_score(labels_test.good, classifier.predict(fv_test))
    roc_auc_score(labels_test.good, classifierSVC.predict(fv_test))
    f1_score(labels_test.good, classifierSVC.predict(fv_test))
def save(self, filename='store.hdf', path='results'):
    """
    Saves `self.results` into the HDF file `filename` under the tree `path`.
    """
    if self.results.empty:
        return
    print('Saving to {} ({})'.format(filename, path))

    if path == 'config':
        logging.error('Cannot use "config" as path, using "config2" instead.')
        path = "config2"

    store = HDFStore(filename)
    store[path] = self.results
    store.get_storer(path).attrs.config = self.config
    store.get_storer(path).attrs.seed = self.seed
    store.get_storer(path).attrs.parallel = self.parallel
    store.close()
def test_read_hdf_open_store(setup_path):
    # GH10330
    # No check for non-string path_or-buf, and no test of open store
    df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
    df.index.name = "letters"
    df = df.set_index(keys="E", append=True)

    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", mode="w")
        direct = read_hdf(path, "df")
        store = HDFStore(path, mode="r")
        indirect = read_hdf(store, "df")
        tm.assert_frame_equal(direct, indirect)
        assert store.is_open

        store.close()
def make_df_file():
    path = '../data/'
    remove_list = ['CAMERA', 'END', 'DISSOLVE', 'CUT']
    files = [f for f in listdir(path) if isfile(join(path, f))]

    for f in files:
        with open(path + f, 'r', encoding='utf-8') as fh:
            script = list(fh)
        act_list = get_main_actors(script)
        speaker, sentence = get_flow(script, act_list)

        # Evaluate sentiment
        value = calculateSent(sentence)
        value = moving_average(value)

        hdf = HDFStore('../processed_data/' + f[0:-4] + '.h5')
        hdf.put('d1', pd.DataFrame({'speaker': speaker, 'value': value}),
                format='table', data_columns=True)
        hdf.close()
        print("Store ", f)
def init_parameters(self):
    '''
    Initialize the parameters of the simulation
    '''
    try:
        population_file = CONF.get('paths', 'population_file')
        store_pop = HDFStore(population_file, 'r')
        self.population_choices = store_pop.keys()
        store_pop.close()

        profiles_file = CONF.get('paths', 'profiles_file')
        store_prof = HDFStore(profiles_file, 'r')
        profiles = store_prof['profiles']
        store_prof.close()

        self.set_population_prolong()
        self.set_taxes_proj()
    except Exception as e:
        self.population_loaded = False
        QMessageBox.warning(
            self, u"Unable to read population data",
            u"GA failed to read the population data. The following error was "
            u"raised:\n%s\n\nYou can configure the path to the population data "
            u"file under File>Settings>Paths>Population data file" % e)
        return False
def clone(hdfname, operation, fromID, toID):
    '''
    Add new segment ID to all HSP2 HDF5 tables for operation with values from fromID

    NOTE: Does not add new segment to CONTROL/OP_SEQUENCE.
    User must update timeseries in CONTROL/EXT_SOURCES and values in
    CONTROL/LINKS & CONTROL/MASS_LINKS

    Parameters
    ----------
    hdfname : str
        Name of HSP2 HDF5 file
    operation : str
        One of PERLND, IMPLND or RCHRES
    fromID : str
        Segment name to copy values from
    toID : str
        New segment name

    Returns
    -------
    None.
    '''
    with HDFStore(hdfname) as store:
        paths = [key for key in store.keys() if key.startswith(f'/{operation}')]
        for path in paths:
            df = store[path]
            if fromID in df.index:
                df.loc[toID, :] = df.loc[fromID, :]
                df.to_hdf(store, path, format='table', data_columns=True)

        path = 'CONTROL/EXT_SOURCES'
        df = store[path]
        indx = df[(df.TVOL == operation) & (df.TVOLNO == fromID)].index
        newdf = df.loc[indx, :].copy()  # copy so the assignment below cannot mutate a view of df
        newdf['TVOLNO'] = toID
        dff = concat([df, newdf], ignore_index=True)
        dff.to_hdf(store, path, format='table', data_columns=True)

        path = 'CONTROL/LINKS'
        df = store[path]
        indx = df[(df.SVOL == operation) & (df.SVOLNO == fromID)].index
        newdf = df.loc[indx, :].copy()
        newdf['SVOLNO'] = toID
        dff = concat([df, newdf], ignore_index=True)
        dff.to_hdf(store, path, format='table', data_columns=True)
    return
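# A minimal usage sketch for clone() above; the file name and segment IDs are
# illustrative assumptions. This copies PERLND segment P001's rows to a new
# segment P002 across all PERLND tables in the HDF5 file.
clone('watershed.h5', 'PERLND', 'P001', 'P002')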
def test_write_hdf(tmpdir, test_df):
    """Test whether the contents of the HDF5 file are reasonably understandable"""
    # Write the HDF file
    out_path = os.path.join(tmpdir, 'test.h5')
    test_df.to_batdata_hdf(out_path)

    # Investigate the contents
    with h5py.File(out_path) as f:
        assert 'metadata' in f.attrs
        assert json.loads(f.attrs['metadata'])['name'] == 'Test data'
        assert 'raw_data' in f

    # Test writing to an already-open HDFStore
    store = HDFStore(out_path, 'r+')
    test_df.to_batdata_hdf(store)
    store.close()