def test():
    directory = os.path.dirname(__file__)
    fname = os.path.join(directory, H5_FILENAME)
    store = HDFStore(fname)
    print store
    print store.keys()
def from_saved(cls, store: pandas.HDFStore) -> "PCA":
    """
    Create the PCA from its saved parameters.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore)

    Returns:
        PCA

    """
    config = store.get_storer('pca').attrs.config
    pca = cls(config['num_components'], config['stepsize'])
    pca.W = be.float_tensor(store.get('pca/W').values)
    pca.var = be.float_tensor(store.get('pca/var').values[:, 0])
    # check that the mean is present
    if 'pca/mean' in store.keys():
        pca.mean = be.float_tensor(store.get('pca/mean').values[:, 0])
    # if the saved PCA was fit from SVD, there is no variance calculator defined
    if 'pca/var_calc' in store.keys():
        pca.var_calc = math_utils.MeanVarianceArrayCalculator.from_dataframe(
            store.get('pca/var_calc'))
    else:
        pca.var_calc = math_utils.MeanVarianceArrayCalculator()
    return pca
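# A minimal usage sketch for from_saved() above, assuming it is exposed as a
# classmethod on a PCA class; 'pca_model.h5' is a hypothetical file written by
# the matching save routine, which is not shown here.
store = pandas.HDFStore('pca_model.h5', mode='r')
pca = PCA.from_saved(store)
store.close()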
def load_df(path, default=None):
    """Load the DataFrame stored under the '/logs' key of the HDF5 store at `path`."""
    try:
        store = HDFStore(path)
        print store.keys()
        df = store.get('logs')
        store.close()
        return df
    except Exception:
        return default
def test_store(self):
    final_store = HDFStore(self.store_path)
    print '----'
    print final_store.keys()
    print '-' * 80
    logs = final_store['/logs']
    print type(logs)
    print len(logs)
    print logs.columns
    final_store.close()
def storeHdf5(data, tag, path):
    hdf = HDFStore(path, 'a')
    # HDFStore.keys() returns keys with a leading '/', so test membership on the
    # store itself (which accepts the key with or without the slash).
    if tag in hdf:
        hdf.append(tag, data)
    else:
        hdf.put(tag, data)
    hdf.close()
def save_temp(dataframe, name=None, year=None,
              config_files_directory=default_config_files_directory):
    """
    Save a temporary table

    Parameters
    ----------
    dataframe : pandas DataFrame
        the dataframe to save
    name : string, default None
        name of the table
    year : integer, default None
        year of the data
    """
    if year is None:
        raise Exception("year is needed")
    if name is None:
        raise Exception("name is needed")
    hdf_file_path = get_tmp_file_path(config_files_directory=config_files_directory)
    store = HDFStore(hdf_file_path)
    log.info("{}".format(store))
    store_path = "{}/{}".format(year, name)
    # HDFStore keys carry a leading '/', so test membership on the store itself.
    if store_path in store:
        del store[store_path]
    dataframe.to_hdf(hdf_file_path, store_path)
    store.close()
    return True
class PandasHDFHandler(FileHandler): r""" Handler for HDF5 files using Pandas. """ def _open_for_read(self): self.handle = HDFStore(self.fname, mode='r') def _open_for_write(self): self.handle = HDFStore(self.fname) def list_items(self): keys = [key.strip('/') for key in self.handle.keys()] items = [(key, _get_type_from_attrs(self.handle.get_storer(key).attrs)) for key in keys if '/' not in key] # ---- for backward compatibility (LArray < 0.33) ---- # axes items += [(key.split('/')[-1], 'Axis_Backward_Comp') for key in keys if '__axes__' in key] # groups items += [(key.split('/')[-1], 'Group_Backward_Comp') for key in keys if '__groups__' in key] return items def _read_item(self, key, typename, *args, **kwargs): if typename in _supported_typenames: hdf_key = '/' + key # ---- for backward compatibility (LArray < 0.33) ---- elif typename == 'Axis_Backward_Comp': hdf_key = '__axes__/' + key elif typename == 'Group_Backward_Comp': hdf_key = '__groups__/' + key else: raise TypeError() return read_hdf(self.handle, hdf_key, *args, **kwargs) def _dump_item(self, key, value, *args, **kwargs): hdf_key = '/' + key if isinstance(value, (Array, Axis)): value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Group): hdf_axis_key = '/' + value.axis.name value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs) elif isinstance(value, _supported_scalars_types): s = pd.Series(data=value) self.handle.put(hdf_key, s) self.handle.get_storer(hdf_key).attrs.type = type(value).__name__ else: raise TypeError() def _read_metadata(self): metadata = Metadata.from_hdf(self.handle) if metadata is None: metadata = Metadata() return metadata def _dump_metadata(self, metadata): metadata.to_hdf(self.handle) def close(self): self.handle.close()
def convert_fiducial(filename, output_type="csv"):
    '''
    Converts the fiducial comparison HDF5 files into a CSV file.

    Parameters
    ----------
    filename : str
        HDF5 file.
    output_type : str, optional
        Type of file to output.
    '''

    store = HDFStore(filename)
    data_columns = dict()
    for key in store.keys():
        data = store[key].sort(axis=1)
        mean_data = data.mean(axis=1)
        data_columns[key[1:]] = mean_data
    store.close()

    df = DataFrame(data_columns)

    output_name = "".join(filename.split(".")[:-1]) + "." + output_type

    df.to_csv(output_name)
class PandasHDFHandler(FileHandler): """ Handler for HDF5 files using Pandas. """ def _open_for_read(self): self.handle = HDFStore(self.fname, mode='r') def _open_for_write(self): self.handle = HDFStore(self.fname) def list_items(self): keys = [key.strip('/') for key in self.handle.keys()] # axes items = [(key.split('/')[-1], 'Axis') for key in keys if '__axes__' in key] # groups items += [(key.split('/')[-1], 'Group') for key in keys if '__groups__' in key] # arrays items += [(key, 'Array') for key in keys if '/' not in key] return items def _read_item(self, key, type, *args, **kwargs): if type == 'Array': hdf_key = '/' + key elif type == 'Axis': hdf_key = '__axes__/' + key kwargs['name'] = key elif type == 'Group': hdf_key = '__groups__/' + key kwargs['name'] = key else: raise TypeError() return key, read_hdf(self.handle, hdf_key, *args, **kwargs) def _dump_item(self, key, value, *args, **kwargs): if isinstance(value, LArray): hdf_key = '/' + key value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Axis): hdf_key = '__axes__/' + key value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Group): hdf_key = '__groups__/' + key hdf_axis_key = '__axes__/' + value.axis.name value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs) else: raise TypeError() def _read_metadata(self): metadata = Metadata.from_hdf(self.handle) if metadata is None: metadata = Metadata() return metadata def _dump_metadata(self, metadata): metadata.to_hdf(self.handle) def close(self): self.handle.close()
def load(self, dataFile):
    """load data from HDF"""
    if os.path.exists(dataFile):
        store = HDFStore(dataFile)
        symbols = [str(s).strip('/') for s in list(store.keys())]
        data = dict(list(zip(symbols, [store[symbol] for symbol in symbols])))
        self.wp = Panel(data)
        store.close()
    else:
        raise IOError('Data file does not exist')
def load(self, dataFile):
    """load data from HDF"""
    if os.path.exists(dataFile):
        store = HDFStore(dataFile)
        symbols = [str(s).strip('/') for s in store.keys()]
        data = dict(zip(symbols, [store[symbol] for symbol in symbols]))
        self.wp = WidePanel(data)
        store.close()
    else:
        raise IOError('Data file does not exist')
def load(self, dataFile):
    """load data from HDF"""
    if os.path.exists(dataFile):
        store = HDFStore(dataFile)
        symbols = store.keys()
        data = dict(zip(symbols, [store[symbol] for symbol in symbols]))
        self.wp = WidePanel(data)
        store.close()
    else:
        raise IOError('Data file does not exist')
def convert_fiducial(filename, output_type="csv", decimal_places=8, append_comp=True, num_fids=5, return_name=True, mode='mean', **kwargs): ''' Converts the fiducial comparison HDF5 files into a CSV file. Parameters ---------- filename : str HDF5 file. output_type : str, optional Type of file to output. decimal_places : int, optional Specify the number of decimal places to keep. append_comp : bool, optional Append on columns with fiducial numbers copy num_fids : int, optional Number of fiducials compared. ''' store = HDFStore(filename) data_columns = dict() for key in store.keys(): data = store[key].sort(axis=1) mean_data = timestep_choose(data, mode=mode, **kwargs) data_columns[key[1:]] = trunc_float(mean_data, decimal_places) comp_fids = store[key].index store.close() df = DataFrame(data_columns) if append_comp: fids = [] for fid, num in zip(np.arange(0, num_fids - 1), np.arange(num_fids - 1, 0, -1)): for _ in range(num): fids.append(fid) df["Fiducial 1"] = Series(np.asarray(fids).T, index=df.index) df["Fiducial 2"] = Series(comp_fids.T, index=df.index) for comp in all_comparisons: if comp in filename: break else: raise StandardError("Could not find a face comparison match for " + filename) output_name = "fiducials" + comp[:-1] + "." + output_type df.to_csv(output_name) if return_name: return output_name
class Serialization(object):

    def __init__(self, filename, mode='r', compress=True):
        self._filename = filename
        self._compress = compress
        self._mode = mode

    def __enter__(self):
        if self._compress:
            self._store = HDFStore(self._filename, complib='blosc:lz4',
                                   complevel=9, mode=self._mode)
        else:  # pragma: no cover
            self._store = HDFStore(self._filename, mode=self._mode)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._store.close()

    @property
    def keys(self):
        return self._store.keys()

    def store_pandas_object(self, path, obj, **metadata):
        self._store.put(path, obj, format='fixed')
        self._store.get_storer(path).attrs.metadata = metadata

    def retrieve_pandas_object(self, path):
        # Get the metadata
        metadata = self._store.get_storer(path).attrs.metadata
        # Get the object
        obj = self._store.get(path)
        return obj, metadata
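# A minimal usage sketch for the context-manager class above; the file name,
# key and DataFrame are illustrative placeholders, not taken from the original
# code base.
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
with Serialization('results.h5', mode='w') as s:
    s.store_pandas_object('runs/first', df, created_by='example')
with Serialization('results.h5', mode='r') as s:
    obj, metadata = s.retrieve_pandas_object('runs/first')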
def load(hdf_file_name, dataset_dir,
         hdf5_data_name="/geolife_trajectories_labelled", process_labels=True):
    """Parse geolife data grouped by user.

    Store the dataframe in an HDF store to speed up subsequent retrievals.
    """
    store = HDFStore(hdf_file_name)
    if hdf5_data_name in store.keys():
        data = store[hdf5_data_name]
    else:
        dirs_with_labels = GeoLifeData.find_dirs_with_labels(dataset_dir)
        data = GeoLifeData.get_dataframe_grouped_by_user(
            dirs_with_labels, process_labels)
        store[hdf5_data_name] = data
    store.close()
    return data
def aggregate(hdf_store_loc, file_pattern, headerfile=None, remove_part_files=False):
    df = None
    store = HDFStore(hdf_store_loc)
    store_keys = [w.replace('/', '') for w in store.keys()]
    print(f'Aggregating part files in {hdf_store_loc} for {file_pattern} into single file')
    for key in store_keys:
        if re.match(file_pattern.replace('*', '.+'), key):
            print(f'********************* Key : {key} matches pattern : {file_pattern.replace("*", ".+")}')
            # thisdf = pd.read_hdf(store_loc, key)
            thisdf = store.select(key)
            if df is None:
                df = thisdf
            else:
                # for gz files that do not have headers, assign headers.
                try:
                    df = df.append(thisdf, ignore_index=True, sort=True)
                except Exception as e:
                    print(f'Error while joining data {e}')
            if remove_part_files:
                store.remove(key)
    try:
        # df.to_hdf(store_loc, key=file_pattern.replace('*', ''))
        store.put(key=file_pattern.replace('*', ''), value=df)
    except Exception as e:
        print(f'Exception while combining file for {file_pattern} exception {e}')
    store.close()
def init_parameters(self):
    '''
    Initialize the parameters of the simulation
    '''
    try:
        population_file = CONF.get('paths', 'population_file')
        store_pop = HDFStore(population_file, 'r')
        self.population_choices = store_pop.keys()
        store_pop.close()

        profiles_file = CONF.get('paths', 'profiles_file')
        store_prof = HDFStore(profiles_file, 'r')
        profiles = store_prof['profiles']

        self.set_population_prolong()
        self.set_taxes_proj()

    except Exception, e:
        self.population_loaded = False
        QMessageBox.warning(
            self, u"Impossible de lire les données de population",
            u"GA n'a pas réussi à lire les données de population. L'erreur suivante a été renvoyée:\n%s\n\nVous pouvez configurer le chemin vers le fichier de données Fichier>Paramètres>Chemins>Fichier données population" % e)
        return False
def save_temp(dataframe, name=None, year=None):
    """
    Save a temporary table

    Parameters
    ----------
    dataframe : pandas DataFrame
        the dataframe to save
    name : string, default None
        name of the table
    year : integer, default None
        year of the data
    """
    if year is None:
        raise Exception("year is needed")
    if name is None:
        raise Exception("name is needed")
    store = HDFStore(os.path.join(ERF_HDF5_DATA_DIR, 'temp.h5'))
    # HDFStore keys carry a leading '/', so test membership on the store itself.
    if str(year) + "/" + name in store:
        del store[str(year) + "/" + name]
    store[str(year) + "/" + name] = dataframe
    store.close()
    return True
class PandasHDFHandler(FileHandler):
    """
    Handler for HDF5 files using Pandas.
    """
    def _open_for_read(self):
        self.handle = HDFStore(self.fname, mode='r')

    def _open_for_write(self):
        self.handle = HDFStore(self.fname)

    def list(self):
        return [key.strip('/') for key in self.handle.keys()]

    def _to_hdf_key(self, key):
        return '/' + key

    def _read_array(self, key, *args, **kwargs):
        return read_hdf(self.handle, self._to_hdf_key(key), *args, **kwargs)

    def _dump(self, key, value, *args, **kwargs):
        value.to_hdf(self.handle, self._to_hdf_key(key), *args, **kwargs)

    def close(self):
        self.handle.close()
class Serialization(object):

    def __init__(self, filename):
        self._filename = filename

    def __enter__(self):
        self._store = HDFStore(self._filename, complib='blosc', complevel=9)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._store.close()

    @property
    def keys(self):
        return self._store.keys()

    def store_pandas_object(self, name, object, **metadata):
        self._store.put(name, object)
        self._store.get_storer(name).attrs.metadata = metadata

    def retrieve_pandas_object(self, name):
        # Get the metadata
        metadata = self._store.get_storer(name).attrs.metadata
        # Get the object
        obj = self._store[name]
        return obj, metadata
def convert_format(path, face1, face2=None, design=None, mode='mean', output_type="csv", parameters=None, decimal_places=8, append_comp=True, keep_index=True, **kwargs): ''' Takes all HDF5 files in given path comparing face1 to face2 and combines them into a single file. Parameters ---------- path : str Path where files are located. face1 : int Face of the cube. face2: int, optional Face of the cube compared to. Disabled for observational comparison. design : str or pandas.DataFrame, optional If str, assumes a 'csv' file. Disabled for observational comparison. output_type : str, optional Type of file to output. parameters : list, optional Contains column names of design that are the parameters varied in the set. If None, all columns are appended to the output file. decimal_places : int, optional Specify the number of decimal places to keep. append_comp : bool, optional Append on columns with fiducial numbers copy ''' if face2 is not None: files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and "_" + str(face1) + "_" + str(face2) + "_" in f and "fid_comp" not in f] else: # Observational comparisons explicitly have 'face' in filename files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and "face_" + str(face1) in f and "fid_comp" not in f] files.sort() print("Files used: %s" % (files)) if len(files) == 0: raise StandardError("No files found for " + str(face1) + " and " + str(face2)) if design is not None: if isinstance(design, str): design = read_csv(design) if isinstance(parameters, list): design_df = {} for param in parameters: design_df[param] = Series(design[param]) design_df = DataFrame(design_df) else: design_df = design for i, f in enumerate(files): store = HDFStore(f) data_columns = {} # Get data from HDF5 for key in store.keys(): data = store[key].sort(axis=0).sort(axis=1) index = data.index mean_data = timestep_choose(data, mode=mode, **kwargs) data_columns[key[1:]] = trunc_float(mean_data, decimal_places) store.close() # Add on design matrix if design is not None: for key in design_df: # can get nans if the file was made in excel design_df = design_df.dropna() design_df.index = index data_columns[key] = design_df[key] if keep_index: data_columns = DataFrame(data_columns, index=index) else: data_columns = DataFrame(data_columns) if append_comp: data_columns["Fiducial"] = \ Series(np.asarray([i] * len(index)).T, index=index) data_columns["Designs"] = Series(index.T, index=index) if i == 0: # Create dataframe df = data_columns else: # Add on to dataframe df = concat([df, data_columns]) if face2 is not None: filename = "distances_" + str(face1) + "_" + str(face2) else: filename = "complete_distances_face_" + str(face1) if "Name" in df.keys(): del df["Name"] if output_type == "csv": df.to_csv(os.path.join(path, filename + ".csv"))
def concat_convert_HDF5(path, face=None, combine_axis=0, mode='mean', average_axis=None, interweave=True, statistics=None, extension="h5", return_df=False, output_type="csv", **kwargs): ''' A more general function for combining sets of results. The output format defaults to a csv file and should be compatible with the plotting routines included in this module. Parameters ---------- path : str Path to folder with the HDF5 files. face : int, optional If using a specific face to compare to, specify it here. This will look for files in the provided path that contain, for example, "face_0". combine_axis : int, optional The axis along which the data should be concatenated together. Defaults to the first axis (ie. 0). average_axis : int, optional If specified, the data is averaged along this axis. interweave : bool, optional Instead of appending directly together, this order the indices by grouping like labels. statistics : list, optional Which statistics to be extracted from the HDF5 files. If the statistic is not contained in all of the files, an error will be raised. By default, all statistics contained in all of the files will be returned. extension : str, optional The extension used for the HDF5 files. Defaults to ".h5". Several extensions are permitted and this is in place to allow whichever has been used. ''' # Grab the files in the path if face is None: hdf5_files = glob.glob(os.path.join(path, "*", extension)) else: if not isinstance(face, int): raise TypeError("face must be an integer.") hdf5_files = \ glob.glob(os.path.join(path, "*face_" + str(face) + "*" + extension)) if len(hdf5_files) == 0: raise Warning( "Did not find any HDF5 files in the path %s" % (path)) if statistics is None: for i, hdf5 in enumerate(hdf5_files): store = HDFStore(hdf5) individ_stats = store.keys() store.close() if i == 0: statistics = individ_stats else: statistics = list(set(statistics) & set(individ_stats)) if len(statistics) == 0: raise Warning( "There are no statistics that are contained in every file.") statistics = [stat[1:] for stat in statistics] for j, stat in enumerate(statistics): # Loop through the files and extract the statistic's table dfs = [] for hdf5 in hdf5_files: store = HDFStore(hdf5) dfs.append(DataFrame(store[stat])) store.close() if average_axis is not None: for i in range(len(dfs)): dfs[i] = DataFrame(dfs[i].mean(average_axis)) # dfs[i] = \ # DataFrame(timestep_choose(dfs[i], # avg_axis=average_axis, **kwargs)) for i in range(len(dfs)): num = dfs[i].shape[0] dfs[i]['Names'] = dfs[i].index dfs[i]['Order'] = Series([i] * num, index=dfs[i].index) dfs[i].index = Index(range(num)) stats_df = concat(dfs, axis=combine_axis) if interweave: stats_df = stats_df.sort_index() num = len(hdf5_files) num_splits = stats_df.shape[0] / num split_dfs = [] for i in range(num_splits): split_df = stats_df[i * num:(i + 1) * num].copy() split_df = split_df.sort(columns=['Order']) split_dfs.append(split_df) stats_df = concat(split_dfs, axis=0) if j == 0: master_df = stats_df.copy() del master_df[0] master_df[stat] = DataFrame(stats_df[0], index=master_df.index) if return_df: return master_df else: if face is not None: master_df.to_csv(os.path.join( path, "distances_" + str(face) + ".csv")) else: master_df.to_csv(os.path.join(path, "combined_distances.csv"))
import os
import sys
from glob import glob

from pandas import HDFStore

folder_path = sys.argv[1]

faces = ["_0_0_", "_0_2_", "_2_0_", "_2_2_"]

for face in faces:
    old_comp = glob(os.path.join(folder_path, "*_comparisons_*" + face + "*.h5"))
    new_comp = glob(os.path.join(folder_path, "*8_fiducialfid_comp" + face + "*.h5"))

    print(old_comp)
    print(new_comp)

    assert len(old_comp) == 1
    assert len(new_comp) == 1

    old_result = HDFStore(old_comp[0])
    new_result = HDFStore(new_comp[0])

    for key in old_result.keys():
        if key in new_result.keys():
            continue
        new_result[key] = old_result[key].copy()

    print("New file keys: " + str(new_result.keys()))

    old_result.close()
    new_result.close()
def storeHdf5(data, tag, path):
    hdf = HDFStore(path, 'a')
    # HDFStore keys carry a leading '/', so test membership on the store itself.
    if tag in hdf:
        hdf.append(tag, data)
    else:
        hdf.put(tag, data)
    hdf.close()


def getKrakenData(interval=1440, since=0):
    directory = krakenutl.SRCDIR
    if not os.path.exists(directory):
        os.makedirs(directory)
    for p in krakenutl.PAIRS:
        logger.debug('download data for: ' + p + ' interval: ' + str(interval) +
                     ' since:' + str(krakenutl.localTimeFromEpoch(since)))
        pdata = getOhlc(p, interval, since)
        storeHdf5(pdata, krakenutl.getTagFromPair(p, interval), krakenutl.getH5source())


if __name__ == '__main__':
    getKrakenData(krakenutl.DAY, STARTDATE)
    getKrakenData(krakenutl.WEEK, STARTDATE)
    getKrakenData(krakenutl.H3, STARTDATE)
    getKrakenData(krakenutl.H1, STARTDATE)
    getKrakenData(krakenutl.M30, STARTDATE)
    getKrakenData(krakenutl.M15, STARTDATE)
    getKrakenData(krakenutl.M5, STARTDATE)

    # df = getOhlc("XXBTZUSD", 5, 1441148619)
    # print(df)

    hdf = HDFStore(krakenutl.getH5source())
    for k in hdf.keys():
        print(k, len(hdf[k]))
    hdf.close()
import os
import sys
from glob import glob

from pandas import HDFStore

folder_path = sys.argv[1]

# All HDF5 files in the path
all_files = glob(os.path.join(folder_path, "*.h5"))

# Remove the PDF only ones (relevant results are in the PDF_KS and
# PDF_Hellinger keywords)
remove_keys = ["PDF", "PDF_AD"]

# Rename keys
rename_keys = {"VCS_Density": "VCS_Small_Scale",
               "VCS_Velocity": "VCS_Large_Scale"}

for f in all_files:
    store = HDFStore(f)

    # Removals
    for key in remove_keys:
        if "/" + key in store.keys():
            del store[key]

    # Rename
    for old_key in rename_keys:
        if "/" + old_key in store.keys():
            store[rename_keys[old_key]] = store[old_key].copy()
            del store[old_key]

    print("Final keys: " + str(store.keys()))

    store.close()
monitor = Monitor(len(datelist))

from pandas import HDFStore
store = HDFStore('store.h5', complevel=9)

fmap = wrap_monitor(wrap_write(partial(fetch_safe, rse=args.rse),
                               store, overwrite=args.overwrite),
                    monitor)
p.map(fmap, datelist)
monitor.close()

logging.info("closing file")
store.close()

logging.info("trying to open output")
store = HDFStore('store.h5')
data = []
for k in store.keys():
    try:
        d = store.get(k)
        d['timestamp'] = pd.to_datetime(k.split("_")[1], format='%d%m%Y')
        data.append(d)
    except Exception as e:
        print "Problem reading", k
        print e
store.close()

data = pd.concat(data)
data = data.set_index(['timestamp', 'owner'])
data_to_plot = data['size'].unstack().fillna(0)

dataplot = data_to_plot.iplot(kind='area', fill=True, asFigure=True)
for d in dataplot['data']:
    d['hoverinfo'] = 'text+x+name'
class LogSaver: """ self.directory : Directory structure for temp and saved files self.log_list : List of server.log files to process self.extra : True if log messages and thread ids are to be saved too self.history_path : History of server.log conversions saved here self.progress_store_path : HDF5 file that holds one DataFrame for each server.log file self.store_path : Final DataFrame of all server.log entries saved here self.history : History of server.log conversions """ FINAL = 'logs' PROGRESS = 'progress' HISTORY = 'history' @staticmethod def normalize(name): return re.sub(r'[^a-zA-Z0-9]', '_', name) @staticmethod def make_name(base_name, extra): if extra: return base_name + '.extra' else: return base_name #@staticmethod #def temp_name(log_list, extra): # hsh = hash(log_list) # sgn = 'n' if hsh < 0 else 'p' # temp = 'temp_%s%08X' % (sgn, abs(hsh)) # return LogSaver.make_name(temp, extra) def __init__(self, store_path, log_list, extra): self.directory = ObjectDirectory(store_path) self.log_list = tuple(sorted(log_list)) self.extra = extra self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True) self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True) self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), is_df=True) self.history = ObjectDirectory.load_object(self.history_path, {}) self.saved = False def __repr__(self): return '\n'.join('%s: %s' % (k,v) for k,v in self.__dict__.items()) def __str__(self): return '\n'.join([repr(self), '%d log files' % len(self.log_list)]) def save_all_logs(self, force=False): if os.path.exists(self.store_path): final_store = HDFStore(self.store_path) print 'Keys: %s' % final_store final_store.close() return if not force: assert not os.path.exists(self.history_path), ''' %s exists but %s does not. There appears to be a conversion in progress. -f forces conversion to complete. 
''' % (self.history_path, self.store_path) self.directory.make_dir_if_necessary(self.progress_store_path) self.progress_store = HDFStore(self.progress_store_path) for path in self.log_list: self.save_log(path) self.check() print '--------' print 'All tables in %s' % self.progress_store_path print self.progress_store.keys() print '--------' def get_log(path): try: return self.progress_store.get(LogSaver.normalize(path)) except Exception as e: print print path raise e df_list = [get_log(path) for path in self.log_list] self.progress_store.close() print 'Closed %s' % self.progress_store_path df_all = pd.concat(df_list) print 'Final list has %d entries' % len(df_all) final_store = HDFStore(self.store_path) final_store.put('logs', df_all) print 'Keys: %s' % final_store final_store.close() print 'Closed %s' % self.store_path # Save the history in a corresponding file self.directory.save('history', self.history) print 'Saved history' self.saved = True def test_store(self): final_store = HDFStore(self.store_path) print '----' print final_store.keys() print '-' * 80 logs = final_store['/logs'] print type(logs) print len(logs) print logs.columns final_store.close() def cleanup(self): os.remove(self.progress_store_path) os.remove(self.history_path) def delete(self): os.remove(self.store_path) def save_log(self, path): """Return a pandas DataFrame for all the valid log entry lines in log_file The index of the DataFrame are the uniqufied timestamps of the log entries """ if path in self.history: return print 'Processing %s' % path, start = time.time() header, df = load_log(path, extra=self.extra) if df is None: print 'Could not process %s' % path return self.progress_store.put(LogSaver.normalize(path), df) load_time = time.time() - start self.history[path] = { 'start': df.index[0], 'end': df.index[-1], 'load_time': int(load_time), 'num': len(df), 'header': header } ObjectDirectory.save_object(self.history_path, self.history) del df print { k:v for k,v in self.history[path].items() if k != 'header' }, print '%d of %d' % (len(self.history), len(self.log_list)) def check(self): history = ObjectDirectory.load_object(self.history_path, {}) sorted_keys = history.keys() sorted_keys.sort(key=lambda k: history[k]['start']) print '-' * 80 print 'Time range by log file' for i, path in enumerate(sorted_keys): hist = history[path] print '%2d: %s --- %s : %s' % (i, hist['start'], hist['end'], path) path0 = sorted_keys[0] for path1 in sorted_keys[1:]: hist0,hist1 = history[path0],history[path1] assert hist0['end'] < hist1['start'], ''' ----------- %s %s start: %s end : %s ----------- %s %s hist1['start'] start: %s end : %s ''' % ( path0, hist0, hist0['start'], hist0['end'], path1, hist1, hist1['start'], hist1['end'])
class WeatherStore(object): """ WeatherStore serves as a datasource for weather data """ def __init__(self, filename): """ Parameters ---------- filename : filename pointing to an existing HDFStore with valid data in it. """ self._store = HDFStore(filename) def dframe(self, city): """ Get weather data for specified city Parameters ---------- city : string City for which to fetch data Returns ------- result : pandas DataFrame """ val = self._store[city] if isinstance(val, Panel): key = val.items[0] val = val[key] return val def field_numpy(self, city, field): """ Get weather field for specified city Parameters ---------- city : string City for which data is being requested field : string Weather field being requested Returns ------- result : numpy ndarray Value of requested weather field for city """ df = self.dframe(city) y = np.empty((df.shape[0], ), dtype=np.float64) y[:] = df[field] return y def time_indices(self, df): """ Get time indices out of Pandas DataFrame Parameters ---------- df : Pandas DataFrame Returns ------- result : numpy ndarray Time index for given DataFrame """ X = np.empty((df.shape[0], 3), dtype=np.float64) X[:, 0] = df.index.year X[:, 1] = df.index.month X[:, 2] = df.index.day return X def learning_data(self, city, field): """ Get input parameters and output values so that it can be shipped to a learning method. Returns ------- X : numpy array of shape (n,2). Columns are month and day y : numpy array of shape (n,). value of field being requested """ df = self.dframe(city) X = self.time_indices(df)[:, 1:] y = self.field_numpy(city, field) return X, y def dataseries(self, city, field): """ Get dataseries containing field data for city Parameters ---------- city : string City for which data is being requested field : string Weather field being requested Returns ------- result : DataSeries get the specified fieldute for city as a DataSeries """ df = self.dframe(city) indices = self.time_indices(df) data = self.field_numpy(city, field) return DataSeries(city, data, indices) def cities(self): """ Get cities contained in this WeatherStore Returns ------- result : List of strings Names of cities for which this Store has some weather data. """ return self._store.keys()
def append_store_mod(module, path_store, n_days_refresh=None, b_ptrk=False):
    """ append all new rows in module.field to store. Resize store as appropriate. """
    store = HDFStore(path_store)
    for field in module.__dict__.keys():
        if (type(getattr(module, field)) is DataFrame or
                type(getattr(module, field)) is Panel) and "/{}".format(field) in store.keys():
            if "tdate" in field:
                getattr(module, field).to_hdf(path_store, field, mode='a', format='fixed')
            else:
                solbasic.logger.info("Working on {}...".format(field))
                df = store[field].copy()
                df_new = getattr(module, field).copy()
                if n_days_refresh is None:
                    l_index = sorted(list(set(df_new.index) - set(df.index)))
                else:
                    l_index = sorted(list(df_new.index[-n_days_refresh:]))
                l_columns = sorted(list(set(df_new.columns) - set(df.columns)))
                l_columns_rev = sorted(list(set(df.columns) - set(df_new.columns)))
                if l_columns:
                    solbasic.logger.info("Adding {} instruments: {}".format(len(l_columns), l_columns))
                    for col in l_columns:
                        df[col] = np.nan
                if l_columns_rev:
                    for col in l_columns_rev:
                        df_new[col] = df[col]
                if l_index:
                    solbasic.logger.info("Refreshing {} dates: {}".format(len(l_index), l_index))
                    for ind in l_index:
                        df.ix[ind] = df_new.ix[ind]
                df.to_hdf(path_store, field, mode='a', format='fixed')
    store.close()
    if b_ptrk:
        ptrk_store(path_store)
def test_hdf5(h5_name):
    store = HDFStore(h5_name)
    for key in store.keys():
        print key
    store.close()
"lying", "sitting", "standing", "walking", "running", "cycling" ]] = totals_row[[ "lying", "sitting", "standing", "walking", "running", "cycling" ]].divide(12, axis="index") return totals_row host = 'http://localhost' port = 10200 hf = HDFStore('/Volumes/LaCie/dataset/timestamped_predictions.hdf') limit = 27007 subjectlist = pd.read_csv( api.subject_names(host, port, limit=limit, successful_only=True)) subjectlist = subjectlist['name'].tolist() frames = [] for subject in subjectlist: print(subject) data = pd.read_csv(api.timestamped_predictions(host, port, subject), names=["timestamp", "label", "probability"]) hf.put('s' + subject.__str__(), data, format='table', data_columns=True) #summary_df = create_summary(subject,data) #frames.append(summary_df) #appended_data = pd.concat(frames, axis=0) #appended_data.to_csv("../output/AAAAAA-summary-all-classes.csv", index=False) print(hf.keys()) hf.close()
def preprocess(directory, n_entries): hdf_path = directory.get_path("logs.h5", temp=False) print "hdf_path: %s" % hdf_path store = HDFStore(hdf_path) print "Keys: %s" % store.keys() print store store.close() df = pd.read_hdf(hdf_path, "logs") # df = directory.load('logs.h5') print "df: %s" % df if n_entries >= 0: df = df[:n_entries] secs = (df.index.max() - df.index.min()).total_seconds() hours = secs / 3600 levels = df.level.unique() print "%.1f hours of logs" % hours print "%d log entries/hour" % int(len(df) / hours) print "%.1f thousand log entries/hour" % (int(len(df) / hours) / 1000.0) print df.shape, df.columns for level in levels: print "%-5s : %5d" % (level, len(df[df.level == level])) print "df : %s" % str(df.shape) if False: def get_peak(counts): """Retun the peak value in Series counts""" if len(counts) == 0: return None return counts.indmax() # return counts.index[counts.argmax()] start_time, end_time = df.index.min(), df.index.max() print "orginal: start_time, end_time = %s, %s" % (start_time, end_time) # Start time and end time trunctated to whole minutes start_time = truncate_to_minutes(start_time + timedelta(minutes=2)) end_time = truncate_to_minutes(end_time - timedelta(minutes=2)) print "cleaned: start_time, end_time = %s, %s" % (start_time, end_time) details = get_details(df) directory.save("details", details) # The counts for each 1 minute bin minute_counts = get_minute_counts(df, start_time, end_time) print "minute_counts: %s\n%s" % (type(minute_counts), minute_counts.describe()) print "total entries: %s" % minute_counts.sum() level_counts = {level: get_minute_counts(df[df.level == level], start_time, end_time) for level in levels} # level_peaks = {level: get_peak(level_counts[level]) for level in levels} # print 'level_peaks: %s' % level_peaks if False: unique_files = df.file.unique() print "%d source files" % len(unique_files) for i, fl in enumerate(sorted(unique_files)[:5]): print "%3d: %s" % (i, fl) directory.save("unique_files", unique_files) # # Get all the unique log messages # level_file_line = df.groupby(["level", "file", "line"]) lfl_size = level_file_line.size() lfl_sorted = lfl_size.order(ascending=False) print "lfl_sorted: %s" % str(lfl_sorted.shape) # directory.save('level_file_line', tuple(level_file_line)) directory.save("lfl_sorted", lfl_sorted) # file:line uniquely identifies each level,file,line # Construct mappings in both directions lfl_to_string = OrderedDict(((lvl, fl, ln), "%s:%d" % (fl, ln)) for lvl, fl, ln in lfl_sorted.index) string_to_lfl = OrderedDict(("%s:%d" % (fl, ln), (lvl, fl, ln)) for lvl, fl, ln in lfl_sorted.index) print "string_to_lfl: %s" % len(string_to_lfl) # [((level,file,line),count)] sorted by count in descending order entry_types_list = zip(lfl_sorted.index, lfl_sorted) # {(level,file,line) : count} entry_types = OrderedDict(entry_types_list) directory.save("entry_types", entry_types) print "entry_types: %s" % len(entry_types) # # Build the correlation table # threshold = min(100, len(df) // 1000) lfl_freq_dict = { s: get_minute_counts(df[(df.file == fl) & (df.line == ln)], start_time, end_time) for s, (lvl, fl, ln) in string_to_lfl.items() if len(df[(df.file == fl) & (df.line == ln)]) >= threshold } print "++++" lfl_freq = DataFrame(lfl_freq_dict, columns=string_to_lfl.keys()) directory.save("lfl_freq", lfl_freq) lfl_freq_corr = lfl_freq.corr() directory.save("lfl_freq_corr", lfl_freq_corr) print "lfl_freq_corr: %s" % str(lfl_freq_corr.shape)
def populate_from_survey_data(self, fname, year = None): ''' Populates a DataTable from survey data ''' list_entities = self.list_entities if isinstance(fname, str) or isinstance(fname, unicode): if fname[-4:] == '.csv': # TODO: implement it for _num_table==3 (or remove) if self.num_table == 1 : with open(fname) as survey_data_file: self.table = read_csv(survey_data_file) else : raise Exception('For now, use three csv table is not allowed' 'although there is no major difficulty. Please,' 'feel free to code it') elif fname[-3:] == '.h5': store = HDFStore(fname) if self.num_table == 1 : available_years = sorted([int(x[-4:]) for x in store.keys()]) elif self.num_table == 3 : available_years = (sorted([int(x[-8:-4]) for x in store.keys()])) # note+ we have a repetition here in available_years but it doesn't matter if year is None: if self.datesim is not None: year_ds = self.datesim.year else: raise Exception('self.datesim or year should be defined') else: year_ds = year yr = year_ds + 0 # to avoid pointers problem while yr not in available_years and yr > available_years[0]: yr = yr - 1 base_name = 'survey_' + str(yr) if year_ds != yr: print 'Survey data for year %s not found. Using year %s' % (str(year_ds), str(yr)) else: print 'Survey data for year %s found' % str(year_ds) if yr in available_years: self.survey_year = yr if self.num_table == 1 : self.table = _survey_subset(store[str(base_name)], self.subset) elif self.num_table == 3 : for entity in self.list_entities: self.table3[entity] = _survey_subset(store[str(base_name) + '/' + entity], self.subset) store.close() else: if self.num_table == 1: if not isinstance(fname, DataFrame): raise Exception("When num_table=1, the object given as survey data must be a pandas DataFrame") else: self.table = _survey_subset(fname, self.subset) elif self.num_table == 3: try: for entity in list_entities: assert isinstance(fname[entity], DataFrame) self.table3[entity] = _survey_subset(fname[entity], self.subset) except: log.error("When num_table=3, the object given as survey data" " must be a dictionary of pandas DataFrame with each entity in keys") raise missing_col = [] var_entity = {} if self.num_table == 1 : self._nrows = self.table.shape[0] # Intialize to default value the missing variables for col in self.column_by_name.itervalues(): if col.name not in self.table: missing_col.append(col.name) self.table[col.name] = col._default try: if self.table[col.name].isnull().any(): self.table[col.name].fillna(col._default, inplace = True) self.table[col.name] = self.table[col.name].astype(col._dtype) except: log.error("Impossible de lire la variable suivante issue des données d'enquête :\n%s\n" % col.name) raise # Keeping only valid input variables drop_variables = list(set(self.table.columns) - set(self.column_by_name.keys())) self.table.drop(drop_variables, inplace = True, axis = 1) elif self.num_table == 3 : self._nrows = self.table3['ind'].shape[0] for ent in list_entities: var_entity[ent] = [x for x in self.column_by_name.itervalues() if x.entity == ent] for col in var_entity[ent]: if not col.name in self.table3[ent]: missing_col.append(col.name) self.table3[ent][col.name] = col._default if self.table3[ent][col.name].isnull().any(): self.table3[ent][col.name].fillna(col._default, inplace = True) self.table3[ent][col.name] = self.table3[ent][col.name].astype(col._dtype) if ent == 'foy': self.table3[ent] = self.table3[ent].to_sparse(fill_value = 0) if missing_col: message = "%i input variables missing\n" % len(missing_col) messagef = "" messageb = "" 
missing_col.sort() for var in missing_col: if var[0] == 'f': messagef += ' - ' + var + '\n' elif var[0] == 'b': messageb += ' - ' + var + '\n' else: message += ' - ' + var + '\n' if self.print_missing: print Warning(message + messagef + messageb) for var in model.ENTITIES_INDEX: if ('id' + var) in missing_col: raise Exception('Survey data needs variable %s' % ('id' + var)) if ('qui' + var) in missing_col: raise Exception('Survey data needs variable %s' % ('qui' + var)) self.gen_index(model.ENTITIES_INDEX)
def get_group_names(self):
    s = HDFStore(self.path)
    names = s.keys()
    s.close()
    return names
def get_population_choices(self, filename):
    store_pop = HDFStore(filename, 'r')
    choices = store_pop.keys()
    store_pop.close()
    return choices
__author__ = 'Gleb'

import warnings
from datetime import timedelta
from datetime import datetime

import pandas as pd
import numpy as np
from pandas import HDFStore

warnings.filterwarnings("ignore")

prices = ['F:\\DataBase\\BestBidAsk.h5']
deals = ['F:\\DataBase\\DealsFrom27Apr.h5', 'F:\\DataBase\\Deals.h5']

prices_store = HDFStore(prices[0])
instruments = prices_store.keys()
deals_store1 = HDFStore(deals[0])
deals_store2 = HDFStore(deals[1])
sizes = Data_merger('Deals', ' Nonaggr.csv')


def get_client_order_book(aggr_id, index):
    bid_path = '/' + str(aggr_id) + '/BidQuotes'
    ask_path = '/' + str(aggr_id) + '/AskQuotes'
    bool = pd.to_datetime(index) > datetime(2015, 4, 25)
    if bool:
        try:
            bid_quotes = deals_store1.select(bid_path).drop_duplicates(
                subset='QuoteId', take_last=False)
            bid_aval = 1
        except KeyError:
            bid_quotes = pd.DataFrame()
class WikiStore(object):
    """
    WikiStore is a HDFStore storage for a Quandl WIKI dataset.

    The Quandl WIKI dataset can be retrieved from:
    https://www.quandl.com/data/WIKI-Wiki-EOD-Stock-Prices.
    """
    def __init__(self, base_dir, date_index=True):
        self.base_dir = base_dir
        assert os.path.exists(self.base_dir)
        self.date_index = date_index

        self._init()

    def keys(self):
        return self.tickers

    @lru_cache(maxsize=100)
    def __getitem__(self, item):
        df = self.store[item]
        if self.date_index:
            df.set_index('date', inplace=True)
        return df

    @staticmethod
    def store_snapshot(base_dir, snapshot_file):
        w_df = pd.read_csv(snapshot_file, parse_dates=[1])
        w_df.columns = [c.replace('-', '_') for c in w_df.columns]
        w_df.set_index('ticker', inplace=True)
        w_df.sort_index(inplace=True)

        snapshot_file = datetime.today().strftime('%Y%m%d')
        with HDFStore(os.path.join(base_dir, '{}.h5'.format(snapshot_file)),
                      'w', complevel=6, complib='blosc') as store:
            tickers = set(w_df.index)
            for ticker in tickers:
                df = w_df.loc[ticker, :]
                df.reset_index(inplace=True)
                df = df.drop('ticker', 1)
                store[ticker] = df

    def _init(self):
        self.store = HDFStore(latest_filename('{}/*.h5'.format(self.base_dir)))
        self.tickers = [t[1:] for t in self.store.keys()]

    def close(self):
        self.store.close()

    def tickers_column(self, tickers, col='adj_close', fun_filter=None):
        if not tickers:
            return None

        def fetch_column(ticker):
            ticker_dat = self[ticker]
            df = ticker_dat[[col]]
            df.columns = [ticker]
            if fun_filter:
                df = fun_filter(df)
            return df

        buf = [fetch_column(ticker) for ticker in tickers]
        if len(tickers) == 1:
            return buf[0]
        return buf[0].join(buf[1:])
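# Hypothetical usage of the WikiStore wrapper above; 'wiki_data' is an assumed
# directory that already contains an HDF5 snapshot written by
# WikiStore.store_snapshot(), not a path from the original project.
store = WikiStore('wiki_data')
print(store.keys()[:5])                                 # first few tickers
aapl = store['AAPL']                                    # one ticker, indexed by date
closes = store.tickers_column(['AAPL', 'MSFT'], col='adj_close')
store.close()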
def test_multiple_open_close(setup_path): # gh-4409: open & close multiple times with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.to_hdf(path, "df", mode="w", format="table") # single store = HDFStore(path) assert "CLOSED" not in store.info() assert store.is_open store.close() assert "CLOSED" in store.info() assert not store.is_open with ensure_clean_path(setup_path) as path: if pytables._table_file_open_policy_is_strict: # multiples store1 = HDFStore(path) msg = ( r"The file [\S]* is already opened\. Please close it before " r"reopening in write mode\." ) with pytest.raises(ValueError, match=msg): HDFStore(path) store1.close() else: # multiples store1 = HDFStore(path) store2 = HDFStore(path) assert "CLOSED" not in store1.info() assert "CLOSED" not in store2.info() assert store1.is_open assert store2.is_open store1.close() assert "CLOSED" in store1.info() assert not store1.is_open assert "CLOSED" not in store2.info() assert store2.is_open store2.close() assert "CLOSED" in store1.info() assert "CLOSED" in store2.info() assert not store1.is_open assert not store2.is_open # nested close store = HDFStore(path, mode="w") store.append("df", df) store2 = HDFStore(path) store2.append("df2", df) store2.close() assert "CLOSED" in store2.info() assert not store2.is_open store.close() assert "CLOSED" in store.info() assert not store.is_open # double closing store = HDFStore(path, mode="w") store.append("df", df) store2 = HDFStore(path) store.close() assert "CLOSED" in store.info() assert not store.is_open store2.close() assert "CLOSED" in store2.info() assert not store2.is_open # ops on a closed store with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.to_hdf(path, "df", mode="w", format="table") store = HDFStore(path) store.close() msg = r"[\S]* file is not open!" with pytest.raises(ClosedFileError, match=msg): store.keys() with pytest.raises(ClosedFileError, match=msg): "df" in store with pytest.raises(ClosedFileError, match=msg): len(store) with pytest.raises(ClosedFileError, match=msg): store["df"] with pytest.raises(ClosedFileError, match=msg): store.select("df") with pytest.raises(ClosedFileError, match=msg): store.get("df") with pytest.raises(ClosedFileError, match=msg): store.append("df2", df) with pytest.raises(ClosedFileError, match=msg): store.put("df3", df) with pytest.raises(ClosedFileError, match=msg): store.get_storer("df2") with pytest.raises(ClosedFileError, match=msg): store.remove("df2") with pytest.raises(ClosedFileError, match=msg): store.select("df") msg = "'HDFStore' object has no attribute 'df'" with pytest.raises(AttributeError, match=msg): store.df
import os
import pdb

import numpy as np
from pandas import HDFStore  # DataFrame

from openfisca_core import model

filename = os.path.join(model.DATA_DIR, 'survey.h5')
filename3 = os.path.join(model.DATA_DIR, 'survey3.h5')

store = HDFStore(filename)
output = HDFStore(filename3)
# do a remove on output so it can be overwritten

available_years = sorted([int(x[-4:]) for x in store.keys()])
available_years = [2006]


def from_one_to_three(table, entity):
    return [name for name, column in model.column_by_name.iteritems()
            if name in table.columns and column.entity == entity]


# we could take the opportunity to build the index here? It would run a bit
# faster, but above all in a more "essential" way
for year in available_years:
def load_filter(cls, filter_name=None, wavelength_unit=None,
                interpolation_kind='linear'):
    """
    Parameters
    ----------
    filter_name: str or None
    wavelength_unit: str or astropy.units.Unit
        for some filtersets (e.g. gemini) this can be autodetected
    interpolation_kind: str
        see scipy.interpolate.interp1d
    """
    if filter_name is None:
        filter_store = HDFStore(filter_data_fname, mode='r')
        available_filters = filter_store.keys()
        filter_store.close()
        print("Available Filters\n"
              "-----------------\n\n" + '\n'.join(available_filters))
    else:
        filter_store = HDFStore(filter_data_fname, mode='r')
        try:
            filter = filter_store[filter_name]
        except KeyError:
            filter_store.close()
            raise ValueError(
                'Requested filter ({0}) does not exist'.format(filter_name))
        finally:
            filter_store.close()

        if 'gemini' in filter_name:
            wavelength_unit = 'nm'
        elif 'bessell' in filter_name:
            wavelength_unit = 'angstrom'
        elif 'hst' in filter_name:
            wavelength_unit = 'angstrom'
        elif 'decam' in filter_name:
            wavelength_unit = 'angstrom'
        elif 'sdss' in filter_name:
            wavelength_unit = 'angstrom'

        if wavelength_unit is None:
            raise ValueError('No "wavelength_unit" given and none '
                             'autodetected')

        wavelength = filter.wavelength.values * u.Unit(wavelength_unit)

        return cls(wavelength, filter.transmission_lambda.values,
                   interpolation_kind=interpolation_kind,
                   filter_name=filter_name)
# In[46]:

def hdf5_to_csv(filename):
    """
    Converts hdf5 files to csv

    Parameters
    ----------
    filename: string or list of strings
        Name of the hdf5 file being converted

    Returns
    -------
    Writes each hdf5 key to an individual csv file (returns nothing)
    """
    store = HDFStore(filename)
    for key in store.keys():
        store[key].to_csv(key[1:] + '.csv')
    store.close()


# In[39]:

# hdf5 files can also be dumped into an ASCII file with the following line in
# the command prompt:
#     h5dump -o dset.asci -y -w 400 sanfran_public.h5
def convert_format(path, design, face1, face2, output_type="csv", parameters=None): ''' Takes all HDF5 files in given path comparing face1 to face2 and combines them into a single file. Parameters ---------- path : str Path where files are located. design : str or pandas.DataFrame If str, assumes a 'csv' file. face1 : int Face of the cube. face2: int Face of the cube compared to. output_type : str, optional Type of file to output. parameters : list, optional Contains column names of design that are the parameters varied in the set. If None, all columns are appended to the output file. ''' files = [path + f for f in os.listdir(path) if os.path.isfile(path + f) and str(face1) + "_" + str(face2) in f and f[:9] != "fiducial_"] print "Files used: %s" % (files) if isinstance(design, str): design = read_csv(design) if isinstance(parameters, list): design_df = {} for param in parameters: design_df[param] = Series(design[param]) design_df = DataFrame(design_df) else: design_df = design for i, f in enumerate(files): store = HDFStore(f) data_columns = {} # Get data from HDF5 for key in store.keys(): data = store[key].sort(axis=0).sort(axis=1) index = data.index mean_data = data.mean(axis=1) data_columns[key[1:]] = mean_data store.close() # Add on design matrix for key in design_df: # can get nans if the file was made in excel design_df = design_df.dropna() design_df.index = index data_columns[key] = design_df[key] if i == 0: # Create dataframe df = DataFrame(data_columns) else: # Add on to dataframe data_columns = DataFrame(data_columns) df = concat([df, data_columns]) filename = "distances_"+str(face1)+"_"+str(face2) if output_type == "csv": df.to_csv(path+filename+".csv")
def get_keys(self):
    s = HDFStore(self.path)
    keys = s.keys()
    s.close()
    return keys