def AddSeqComp(mypath):
    """ Loads TestLogsAll.h5 from the specified path, then calls
    MeasurementGroupTools.AddSeqComp to recalculate sequence components using FFT

    Input: Directory of the measurement campaign, e.g.: "aLabView2"
    Output: Results1.h5, Results1.pdf in the data subdirs.
    """
    from pandas import HDFStore, ExcelWriter
    import MeasurementGroupTools as mgt

    h5logs = HDFStore(mypath + "\\" + 'TestLogsAll.h5')
    TestLog = h5logs['TestLogsAll']

    dirs = TestLog[u'DirName'].unique()
    for dname in dirs:
        mysubdirpath = mypath + "\\" + dname
        print "Processing: " + dname
        mgt.AddSeqComp(mysubdirpath, TestLog, dname)

    h5logs.put('TestLogsAll', TestLog)
    h5logs.close()

    writer = ExcelWriter(mypath + "\\" + 'TestLogsAll.xlsx')
    TestLog.to_excel(writer, 'TestLogsAll')  # the second argument defines sheet name
    writer.save()
    return
def init_h5_database(database_name, meta_data, overwrite=False):
    """Initialize a h5 file for storing EEMs using a pandas DataFrame containing EEM meta data

    Args:
        database_name (str): filename and relative path for h5 database
        meta_data (pandas DataFrame): DataFrame containing eem meta data from
            `pyeem.load_eem_meta_data` function or created manually - see
            pyeem.load_eem_meta_data for required columns.
            NOTE: do not use spaces or decimals in column names as this causes
            a warning when saving to H5 file format

    Returns:
        no return - data is saved as h5 and may be loaded using `pyeem.load_eem_data`
    """
    import os  # assumed available at module level in the original source
    from pandas import HDFStore

    # check if h5 file exists and overwrite or warn
    if os.path.isfile(database_name):
        if overwrite is True:
            print('overwriting ' + database_name)
            os.remove(database_name)
        else:
            raise ValueError("h5 file " + database_name +
                             " exists. Choose new database name or set overwrite=True")

    # create a h5 file to store EEM meta data
    hdf = HDFStore(database_name)
    hdf.put('meta', meta_data, format='table', data_columns=True)
    hdf.close()
    return
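# A minimal usage sketch for init_h5_database, not taken from the original
# source: the file name and meta data columns below are illustrative only
# (see pyeem.load_eem_meta_data for the columns the library actually expects).
import pandas as pd

example_meta = pd.DataFrame({
    'sample_name': ['blank', 'sample1'],
    'file_name': ['blank.csv', 'sample1.csv'],
})
init_h5_database('eem_database.h5', example_meta, overwrite=True)

# the stored table can be read back directly with pandas
meta_check = pd.read_hdf('eem_database.h5', 'meta')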
class engine(Engine): """Engine instance for writing data to a HDF5 file.""" name = "HDF5" abbreviation = "hdf5" insert_limit = 1000 required_opts = [ ("file", "Enter the filename of your HDF5 file", "hdf5.h5"), ("table_name", "Format of table name", "{db}_{table}"), ("data_dir", "Install directory", DATA_DIR), ] def create_db(self): """Override create_db since an SQLite dataset needs to be created first followed by the creation of an empty HDFStore file. """ file_path = os.path.join(self.opts["data_dir"], self.opts["file"]) self.file = HDFStore(file_path) def create_table(self): """Don't create table for HDF5 HDF5 doesn't create tables. Each database is a file which has been created. This overloads`create_table` to do nothing in this case. """ return None def insert_data_from_file(self, filename): """Fill the table by fetching the dataframe from the SQLite engine and putting it into the HDFStore file. """ table_name = self.table_name() df = self.fetch_table(table_name) self.file.put(table_name, df, data_columns=True) def fetch_table(self, table_name): """Return a table from sqlite dataset as pandas dataframe.""" connection = self.get_sqlite_connection() sql_query = "SELECT * FROM {};".format(table_name) return pd.read_sql_query(sql_query, connection) def get_sqlite_connection(self): # self.get_input() file = self.opts["file"] file = (file.split("."))[0] + ".db" db_file = self.opts["data_dir"] full_path = os.path.join(db_file, file) return dbapi.connect(os.path.normpath(full_path)) def get_connection(self): """Gets the db connection.""" self.get_input() return DummyConnection() def disconnect(self): """Close the file after being written""" self.file.close() file = self.opts["file"] file = (file.split("."))[0] + ".db" os.remove(file)
def create_store(sub):
    hdf = HDFStore('all.h5')
    columns = [
        'SUB', 'SEED', 'SEED ROI', 'TARGET ROI', 'HEMISPHERE',
        'DISTANCE', 'STRENGTH', 'CAT1', 'CAT2', 'CAT3'
    ]
    d = DataFrame(columns=columns)
    for i in range(1, 181):
        LSfname = '../' + sub + '/out/L' + str(i) + '/matrix_seeds_to_all_targets'
        LDfname = '../' + sub + '/out/L' + str(i) + '/matrix_seeds_to_all_targets_lengths'
        RSfname = '../' + sub + '/out/R' + str(i) + '/matrix_seeds_to_all_targets'
        RDfname = '../' + sub + '/out/R' + str(i) + '/matrix_seeds_to_all_targets_lengths'

        ls = readS2R(LSfname)
        rs = readS2R(RSfname)
        ld = readS2R_L(LDfname)
        rd = readS2R_L(RDfname)

        numSeeds, numROIs = ls.shape
        for j in tqdm(range(numSeeds), total=numSeeds):
            for q in range(numROIs):
                # give the Series the frame's column labels so append lines up columns
                tmp = Series([sub, j + 1, i + 1, q + 1, 'L',
                              ld[j, q], ls[j, q], '', '', ''], index=columns)
                d = d.append(tmp, ignore_index=True)
        # numSeeds, numROIs = rs.shape
        # for j in range(numSeeds):
        #     for q in range(numROIs):
        #         tmp = Series([sub, j+1, i+1, q+1, 'R', rd[j,q], rs[j,q], '', '', ''], index=columns)
        #         d = d.append(tmp, ignore_index=True)
        if i == 1:
            break
    hdf.put(sub, d)
def compute_and_save_hist_as_pd(values     : np.ndarray,
                                out_file   : pd.HDFStore,
                                hist_name  : str,
                                n_bins     : int,
                                range_hist : Tuple[float, float],
                                norm       : bool = False) -> None:
    """
    Computes 1d-histogram and saves it in a file.
    The name of the table inside the file must be provided.

    Parameters
    ----------
    values : np.ndarray
        Array with values to be plotted.
    out_file: pd.HDFStore
        File where histogram will be saved.
    hist_name: string
        Name of the pd.DataFrame to contain the histogram.
    n_bins: int
        Number of bins to make the histogram.
    range_hist: length-2 tuple
        Range of the histogram.
    norm: bool
        If True, histogram will be normalized.
    """
    n, b = np.histogram(values, bins=n_bins, range=range_hist, density=norm)
    table = pd.DataFrame({'entries': n, 'magnitude': shift_to_bin_centers(b)})
    out_file.put(hist_name, table, format='table', data_columns=True)
    return
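# Hedged usage sketch for compute_and_save_hist_as_pd. shift_to_bin_centers is
# assumed to be the module helper returning bin centres; a stand-in with the
# usual definition is included only to keep the example self-contained.
import numpy as np
import pandas as pd

def shift_to_bin_centers(edges: np.ndarray) -> np.ndarray:
    # midpoint of each pair of consecutive bin edges
    return (edges[:-1] + edges[1:]) / 2

values = np.random.normal(loc=0.0, scale=1.0, size=10_000)
with pd.HDFStore('histograms.h5', mode='w') as out_file:
    compute_and_save_hist_as_pd(values, out_file, 'hist_energy',
                                n_bins=50, range_hist=(-5.0, 5.0), norm=True)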
class PandasHDFHandler(FileHandler): r""" Handler for HDF5 files using Pandas. """ def _open_for_read(self): self.handle = HDFStore(self.fname, mode='r') def _open_for_write(self): self.handle = HDFStore(self.fname) def list_items(self): keys = [key.strip('/') for key in self.handle.keys()] items = [(key, _get_type_from_attrs(self.handle.get_storer(key).attrs)) for key in keys if '/' not in key] # ---- for backward compatibility (LArray < 0.33) ---- # axes items += [(key.split('/')[-1], 'Axis_Backward_Comp') for key in keys if '__axes__' in key] # groups items += [(key.split('/')[-1], 'Group_Backward_Comp') for key in keys if '__groups__' in key] return items def _read_item(self, key, typename, *args, **kwargs): if typename in _supported_typenames: hdf_key = '/' + key # ---- for backward compatibility (LArray < 0.33) ---- elif typename == 'Axis_Backward_Comp': hdf_key = '__axes__/' + key elif typename == 'Group_Backward_Comp': hdf_key = '__groups__/' + key else: raise TypeError() return read_hdf(self.handle, hdf_key, *args, **kwargs) def _dump_item(self, key, value, *args, **kwargs): hdf_key = '/' + key if isinstance(value, (Array, Axis)): value.to_hdf(self.handle, hdf_key, *args, **kwargs) elif isinstance(value, Group): hdf_axis_key = '/' + value.axis.name value.to_hdf(self.handle, hdf_key, hdf_axis_key, *args, **kwargs) elif isinstance(value, _supported_scalars_types): s = pd.Series(data=value) self.handle.put(hdf_key, s) self.handle.get_storer(hdf_key).attrs.type = type(value).__name__ else: raise TypeError() def _read_metadata(self): metadata = Metadata.from_hdf(self.handle) if metadata is None: metadata = Metadata() return metadata def _dump_metadata(self, metadata): metadata.to_hdf(self.handle) def close(self): self.handle.close()
def storeHdf5(data, tag, path):
    hdf = HDFStore(path, 'a')
    # HDFStore.keys() returns paths with a leading '/', so test membership on
    # the store itself; write in table format so later calls can append.
    if tag in hdf:
        hdf.append(tag, data)
    else:
        hdf.put(tag, data, format='table')
    hdf.close()
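# Hedged usage sketch for storeHdf5: the first call with a given tag creates
# the node, later calls append to it, and the result reads back with pandas.
# File, tag and column names here are illustrative only.
import pandas as pd

batch1 = pd.DataFrame({'x': [1, 2], 'y': [3.0, 4.0]})
batch2 = pd.DataFrame({'x': [5, 6], 'y': [7.0, 8.0]})

storeHdf5(batch1, 'measurements', 'archive.h5')  # creates /measurements
storeHdf5(batch2, 'measurements', 'archive.h5')  # appends two more rows

all_rows = pd.read_hdf('archive.h5', 'measurements')  # 4 rows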
def build_from_openfisca( directory = None): df_age_final = None for yr in range(2006,2010): simulation = SurveySimulation() simulation.set_config(year = yr) simulation.set_param() simulation.set_survey() df_age = get_age_structure(simulation) df_age[yr] = df_age['wprm'] del df_age['wprm'] if df_age_final is None: df_age_final = df_age else: df_age_final = df_age_final.merge(df_age) if directory is None: directory = os.path.dirname(__file__) fname = os.path.join(directory, H5_FILENAME) store = HDFStore(fname) print df_age_final.dtypes store.put("openfisca", df_age_final) store.close()
def save(self, store: pandas.HDFStore) -> None: """ Save a model to an open HDFStore. Notes: Performs an IO operation. Args: store (pandas.HDFStore) Returns: None """ # save the config as an attribute config = self.get_config() store.put('model', pandas.DataFrame()) store.get_storer('model').attrs.config = config # save the parameters for i in range(self.num_weights): key = os.path.join('weights', 'weights' + str(i)) self.weights[i].save_params(store, key) for i in range(self.num_layers): key = os.path.join('layers', 'layers' + str(i)) self.layers[i].save_params(store, key)
def convert_to_3_tables(year=2006, survey_file=None, output_file=None): if survey_file is None: raise Exception( 'You need a .h5 file with the survey to extract the variables from' ) if output_file is None: output_file = survey_file raise Warning( 'the survey file will be used to store the created tables') store = HDFStore(survey_file) output = HDFStore(output_file) print output simulation = SurveySimulation() simulation.set_config(year=year) table1 = store['survey_' + str(year)] for entity in ['ind', 'foy', 'men', 'fam']: key = 'survey_' + str(year) + '/' + str(entity) vars_matching_entity = vars_matching_entity_from_table( table1, simulation, entity) print entity, vars_matching_entity_from_table print 'table1 enum' if entity == 'ind': print 'INDIVIDUALS' print table1['noindiv'] table_entity = table1.loc[:, vars_matching_entity] # we take care have all ident and selecting qui==0 else: # print ' entity :', entity # print table1['noindiv'].head() position = 'qui' + entity # print table1[position] table_entity = table1.ix[table1[position] == 0, [ 'noi', 'idmen', 'idfoy', 'idfam', 'quifoy', 'quimen', 'quifam' ] + vars_matching_entity] # print table_entity.noi.head() table_entity = table_entity.rename_axis(table_entity['id' + entity], axis=1) # print ' APRES' # print table_entity.noi.head() print key output.put(key, table_entity) del table1 import gc gc.collect() store.close() output.close()
def write_file(format):
    outfile = '../inst/exampledata/pytables_' + format + '.h5'
    if os.path.isfile(outfile):
        os.remove(outfile)
    hdf = HDFStore(outfile)
    hdf.put('mydata', df, format=format, data_columns=True, encoding="utf-8")
    hdf.close()
class HdfStore(DataStore): complevel = 9 complib = "blosc:zstd" def __init__(self, path: str, table: str, compute: Optional[Callable] = None) -> None: self.table = table if compute: self.store = PandasHDFStore(path, complevel=self.complevel, complib=self.complib) dataframe = compute() dataframe.sort_values(by="where", axis=0, inplace=True) self._mangle_where(dataframe) self.store.put( self.table, dataframe, append=False, format="table", expectedrows=len(dataframe), data_columns=[ "where_", "where_type", "who", "who_type", "when", "when_type" ], ) else: self.store = PandasHDFStore(path, complevel=self.complevel, complib=self.complib, mode="r") def query(self, query: str) -> DataFrame: query = self._mangle_where_in_query(query) df = self.store.select(self.table, where=query) self._unmangle_where(df) return df def _mangle_where(self, df: DataFrame) -> None: # See: https://github.com/PyTables/PyTables/issues/638 df.rename(columns={"where": "where_"}, inplace=True) def _unmangle_where(self, df: DataFrame) -> None: # See: https://github.com/PyTables/PyTables/issues/638 df.rename(columns={"where_": "where"}, inplace=True) def _mangle_where_in_query( self, query: Union[str, List[str]]) -> Union[str, List[str]]: # See: https://github.com/PyTables/PyTables/issues/638 if isinstance(query, str): return re.sub("where([^_])", "where_\\1", query) else: return [ self._mangle_where_in_query(subquery) for subquery in query ]
def save(self, store: pandas.HDFStore) -> None: config = self.get_config() store.put('model', pandas.DataFrame()) store.get_storer('model').attrs.config = config for i in range(self.num_layers): key = os.path.join('layers', 'layers_'+str(i)) self.layers[i].save_params(store, key) for i in range(self.num_connections): key = os.path.join('connections', 'weights_'+str(i)) self.connections[i].weights.save_params(store, key)
def to_frame_hdf(self, store_path, store_key, df_cb=None, max_msg=None, usecols=None, chunk_cnt=CHUNK_CNT, show_prog=True): store = HDFStore(store_path, 'w') df = self._to_frame(usecols, chunk_cnt, show_prog) df['msg'] = df['msg'].apply(lambda m: m.encode('utf8')) if df_cb is not None: df_cb(df) min_itemsize = {'kind': 20, 'msg': 255} if max_msg is not None: min_itemsize['msg'] = max_msg store.put(store_key, df, format='table', min_itemsize=min_itemsize) store.flush() store.close()
def save_all_logs(self, force=False): if os.path.exists(self.store_path): final_store = HDFStore(self.store_path) print 'Keys: %s' % final_store final_store.close() return if not force: assert not os.path.exists(self.history_path), ''' %s exists but %s does not. There appears to be a conversion in progress. -f forces conversion to complete. ''' % (self.history_path, self.store_path) self.directory.make_dir_if_necessary(self.progress_store_path) self.progress_store = HDFStore(self.progress_store_path) for path in self.log_list: self.save_log(path) self.check() print '--------' print 'All tables in %s' % self.progress_store_path print self.progress_store.keys() print '--------' def get_log(path): try: return self.progress_store.get(LogSaver.normalize(path)) except Exception as e: print print path raise e df_list = [get_log(path) for path in self.log_list] self.progress_store.close() print 'Closed %s' % self.progress_store_path df_all = pd.concat(df_list) print 'Final list has %d entries' % len(df_all) final_store = HDFStore(self.store_path) final_store.put('logs', df_all) print 'Keys: %s' % final_store final_store.close() print 'Closed %s' % self.store_path # Save the history in a corresponding file self.directory.save('history', self.history) print 'Saved history' self.saved = True
def save_xarray_to_HDF5(dataArray, filename, complib=None): """Save the xarray DataArray to HDF file using pandas HDFStore attrs will be saved as metadata via pickle requries pytables complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None""" from pandas import HDFStore f = HDFStore(filename, mode='w', complib=complib) f.put('data', dataArray.to_pandas()) if len(dataArray.attrs) > 0: f.get_storer('data').attrs.metadata = dataArray.attrs f.close()
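def load_xarray_from_HDF5(filename):
    """Hedged counterpart to save_xarray_to_HDF5, not part of the original
    source: rebuild a DataArray from the 'data' node and reattach any attrs
    that were pickled into the storer's metadata."""
    import xarray as xr
    from pandas import HDFStore
    with HDFStore(filename, mode='r') as f:
        dataArray = xr.DataArray(f['data'])
        storer = f.get_storer('data')
        if hasattr(storer.attrs, 'metadata'):
            dataArray.attrs = storer.attrs.metadata
    return dataArray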
def make_df_file(): path = '../data/' remove_list = ['CAMERA', 'END', 'DISSOLVE', 'CUT'] files = [f for f in listdir(path) if isfile(join(path, f))] for f in files: script = list(open(path+f, 'r', encoding='utf-8')) act_list = get_main_actors(script) speaker, sentence = get_flow(script, act_list) # Evaluate Sentimet value = calculateSent(sentence) value = moving_average(value) hdf = HDFStore('../processed_data/'+f[0:-4]+'.h5') hdf.put('d1', pd.DataFrame({'speaker':speaker, 'value':value}), format='table', data_columns=True) hdf.close() print("Store ", f)
def save_data(data=pd.DataFrame, hdfs=True, dir=""):
    import datetime
    file_name = "raw_data_" + datetime.datetime.now().strftime("%y%m%d")
    data.to_csv(dir + "/" + file_name + ".csv", encoding="utf-8")
    print("Data saved as csv already")
    if hdfs:
        hdf = HDFStore(file_name + ".h5")
        # store the DataFrame that was passed in (the original referenced an
        # undefined name `result` here)
        hdf.put(file_name, data, format="table", data_columns=True, encoding="utf-8")
        hdf.close()
        print("Data saved as HDF already")
    else:
        pass
def pf2pandas(wd, files, vars=None, npwd=None, rmvars=None, \
        debug=False):
    """ Read in GEOS-Chem planeflight output and convert to HDF format
    - Converts date and time columns to datetime format indexes
    - the resultant HDF is in 2D list form
    ( aka further processing required to 3D /2D output )

    Note:
    - This function is limited by the csv read speed. For large csv output expect
    significant processing times, or set to automatically run post run.
    - Original files are not removed, so this function will double space usage for
    output unless the original files are deleted.
    """
    # Ensure working directory string has a trailing forward slash
    if wd[-1] != '/':
        wd += '/'

    # pfdate =( re.findall('\d+', file ) )[-1]
    if not isinstance(vars, list):
        vars, sites = get_pf_headers(files[0], debug=debug)
    if not isinstance(npwd, str):
        npwd = get_dir('npwd')
    # note: the last split component is unused by the two-field format string
    hdf = HDFStore(npwd + 'pf_{}_{}.h5'.format(wd.split('/')[-3], \
        wd.split('/')[-2], wd.split('/')[-1]))
    if debug:
        print hdf

    for file in files:
        print file  # , pfdate

        # convert planeflight.log to DataFrame
        df = pf_csv2pandas(file, vars)

        if file == files[0]:
            hdf.put('d1', df, format='table', data_columns=True)
        else:
            hdf.append('d1', df, format='table', data_columns=True)

        if debug:
            print hdf['d1'].shape, hdf['d1'].index
        del df
    hdf.close()
def save_hdf_r_readable(data_frame, config_files_directory = default_config_files_directory, file_name = None, file_path = None): if file_path is None: parser = SafeConfigParser() config_ini = os.path.join(config_files_directory, 'config.ini') parser.read(config_ini) tmp_directory = parser.get('data', 'tmp_directory') if file_name is not None: if not file_name.endswith('.h5'): file_name = "{}.h5".format(file_name) file_path = os.path.join(tmp_directory, file_name) else: file_path = os.path.join(tmp_directory, 'temp.h5') store = HDFStore(file_path, "w", complib = str("zlib"), complevel = 5) store.put("dataframe", data_frame, data_columns = data_frame.columns) store.close()
class Serialization(object): def __init__(self, filename, mode='r', compress=True): self._filename = filename self._compress = compress self._mode = mode def __enter__(self): if self._compress: self._store = HDFStore(self._filename, complib='blosc:lz4', complevel=9, mode=self._mode) else: # pragma: no cover self._store = HDFStore(self._filename, mode=self._mode) return self def __exit__(self, exc_type, exc_val, exc_tb): self._store.close() @property def keys(self): return self._store.keys() def store_pandas_object(self, path, obj, **metadata): self._store.put(path, obj, format='fixed') self._store.get_storer(path).attrs.metadata = metadata def retrieve_pandas_object(self, path): # Get the metadata metadata = self._store.get_storer(path).attrs.metadata # Get the object obj = self._store.get(path) return obj, metadata
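# Hedged usage sketch for the Serialization context manager above; the file
# name, node path and metadata keys are illustrative.
import pandas as pd

frame = pd.DataFrame({'a': [1, 2, 3]})

with Serialization('results.h5', mode='w') as s:
    s.store_pandas_object('fits/run1', frame, source='unit_test', version=1)

with Serialization('results.h5', mode='r') as s:
    obj, metadata = s.retrieve_pandas_object('fits/run1')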
def build_from_insee( directory = None, verbose=False): if directory is None: directory = os.path.dirname(__file__) fname = os.path.join(directory, H5_FILENAME) store = HDFStore(fname) xls = ExcelFile(os.path.join(model.DATA_SOURCES_DIR, "sd2010_t6_fm.xls")) df_age_final = None for year in range(2006,2010): sheet_name = str(year) df = xls.parse(sheet_name, header=0, index_col=0, skiprows=8, parse_cols=[1,2], na_values=['NA']) df.index.name = u"âge" df.rename(columns = {"Unnamed: 1" : year}, inplace = True) # Dealing with te 90 et plus and 105 et plus df = df.reset_index() df = df.dropna(axis=0) df.set_value(106,u"âge", 105) df = df.set_index(u"âge") df.drop(df.index[90], axis=0, inplace=True) df.index.name = u"âge" df = df.reset_index() if verbose: print "year : " + str(year) print df.to_string() if df_age_final is None: df_age_final = df else: df_age_final = df_age_final.merge(df) if verbose: print df_age_final.to_string() print df_age_final.dtypes from numpy import dtype df_age_final[u"âge"] = df_age_final[u"âge"].astype(dtype("int64")) store.put("insee", df_age_final)
def main(): knowledge_dir = '/home/tor/xprmnt/knowledge-construction' # read relloc_knowledge = kr.read(knowledge_dir+'/relative-location-knowledge/relloc-pickle') # write relloc_knowledge_hdf5 = HDFStore(knowledge_dir+'/relative-location-knowledge/relloc-hdf5/relloc.h5') for key, local in relloc_knowledge.iteritems(): for key2 in local: df = DataFrame(local[key2]) df_id = key+'/'+key2 print 'writing:', df_id relloc_knowledge_hdf5.put(df_id, df) print 'writing: obj_class' obj_class = Series(relloc_knowledge.keys()) relloc_knowledge_hdf5.put('obj_class', obj_class)
def save_hdf_r_readable(data_frame, config_files_directory=default_config_files_directory, file_name=None, file_path=None): if file_path is None: parser = ConfigParser() config_ini = os.path.join(config_files_directory, 'config.ini') parser.read(config_ini) tmp_directory = parser.get('data', 'tmp_directory') if file_name is not None: if not file_name.endswith('.h5'): file_name = "{}.h5".format(file_name) file_path = os.path.join(tmp_directory, file_name) else: file_path = os.path.join(tmp_directory, 'temp.h5') store = HDFStore(file_path, "w", complib=str("zlib"), complevel=5) store.put("dataframe", data_frame, data_columns=data_frame.columns) store.close()
def aggregate(hdf_store_loc, file_pattern, headerfile=None, remove_part_files=False):
    df = None
    store = HDFStore(hdf_store_loc)
    store_keys = [w.replace('/', '') for w in store.keys()]
    print(f'Aggregating part files in {hdf_store_loc} for {file_pattern} into single file')
    for key in store_keys:
        if re.match(file_pattern.replace('*', '.+'), key):
            print(f'********************* Key : {key} matches pattern : {file_pattern.replace("*",".+")}')
            # thisdf = pd.read_hdf(store_loc, key)
            thisdf = store.select(key)
            if df is None:
                df = thisdf
            else:
                # for gz files that do not have headers, assign headers
                try:
                    df = df.append(thisdf, ignore_index=True, sort=True)
                except Exception as e:
                    print(f'Error while joining data {e}')
            if remove_part_files:
                store.remove(key)
    try:
        # df.to_hdf(store_loc, key=file_pattern.replace('*',''))
        store.put(key=file_pattern.replace('*', ''), value=df)
    except Exception as e:
        print(f'Exception while combining file for {file_pattern} exception {e}')
    store.close()
def write_to_hdf5_on_disk(self, file_name, writing_dataframe, episode=None): if episode == 0: # create HDFStore container print(file_name) hdf_container = HDFStore(file_name + '.h5') # print('--------------- Container type: {}'.format(type(hdf_container))) hdf_container.put(str(episode), writing_dataframe, format='table', data_columns=True) hdf_container.close() else: with HDFStore(file_name + '.h5', mode='a') as store: store.append(str(episode), writing_dataframe, append=True, format='table', data_columns=True)
class ProbStore:
    def __init__(self, path=PREDICT_PROBS_PATH):
        self._hdf = HDFStore(path)
        self._path = path
        self._length = 0

    def saveProbs(self, data):
        """
        data -- a DataFrame for one image, index -- index of an image in test
        """
        assert isinstance(data, DataFrame)
        # self._hdf['d' + str(self._length)] = data
        self._hdf.put('d' + str(self._length), data, format='table')
        self._length += 1

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        return self._hdf['d' + str(index)]
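# Hedged usage sketch for ProbStore; the path and the shape of the per-image
# probability frame are assumptions, not taken from the source.
from pandas import DataFrame

probs = ProbStore('predict_probs.h5')
per_image = DataFrame({'class_a': [0.1, 0.7], 'class_b': [0.9, 0.3]})
probs.saveProbs(per_image)
first = probs[0]      # DataFrame stored under key 'd0'
n_saved = len(probs)  # 1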
def PopulateMasterTestLogTable(mypath): """ Calls MergeTestLogs to iterate over all subdirs of mypath and read in TestLogSnn Excel files and build a dataframe with all recorded test conditions. Then, calls ProcessResults to extract scalar properties of islanding test results and save them into placeholders in the master table. Finally, saves the master table as h5 and Excel files Input: Directory with the test result directories, e.g.: "aLabView2" Output: TestLogsAll.h5, TestLogsAll.xlsx Note: ProcessResults generate Results.h5 and Results.pdf files in the data subdirs. """ from os import listdir from os.path import isdir, join from pandas import HDFStore, ExcelWriter import MeasurementGroupTools as mgt TestLog = MergeTestLogs(mypath) mydirs = [d for d in listdir(mypath) if isdir(join(mypath,d)) ] # print mydirs + mydirs[1:2] for dname in mydirs: mysubdirpath = mypath + "\\" + dname print "Processing: " + dname mgt.ProcessResults(mysubdirpath, TestLog) h5store = HDFStore(mypath + "\\" + 'TestLogsAll.h5') h5store.put('TestLogsAll',TestLog) h5store.close() writer = ExcelWriter(mypath + "\\" + 'TestLogsAll.xlsx') TestLog.to_excel(writer,'TestLogsAll') # the second argument defines sheet name writer.save() return
def save(self, store: pandas.HDFStore, num_components_save: int = None) -> None: """ Save the PCA transform in an HDFStore. Allows to save only the first num_components_save. Notes: Performs an IO operation. Args: store (pandas.HDFStore) num_components_save (int): the number of principal components to save. If None, all are saved. Returns: None """ n = num_components_save if num_components_save is not None \ else self.num_components assert n <= self.num_components # the config config = {'num_components': n, 'stepsize': self.stepsize} store.put('pca', pandas.DataFrame()) store.get_storer('pca').attrs.config = config # the parameters store.put('pca/W', pandas.DataFrame(be.to_numpy_array(self.W[:, :n]))) store.put('pca/var', pandas.DataFrame(be.to_numpy_array(self.var[:n]))) # check if the mean exists before saving if self.mean is not None: store.put('pca/mean', pandas.DataFrame(be.to_numpy_array(self.mean))) var_calc_df = self.var_calc.to_dataframe() # if fit from SVD, there is no calculator used if var_calc_df is not None: store.put('pca/var_calc', var_calc_df.iloc[:n])
def csv2hdf5(csv_name, h5_name, dfname, option='frame'): """ Convert a csv file to a dataframe in a hdf5 Parameters: csv_name: string csv file name h5_name : string hdf5 file name dfname : string dataframe name option : string, 'frame' or 'table', default to 'frame' stoing type in the pytable """ table = read_csv(csv_name) store = HDFStore(h5_name) if option == 'frame': store.put(dfname, table) elif option == 'table': # for frame_table à la pytables object_cols = table.dtypes[table.dtypes == 'object'] print object_cols.index try: store.append(dfname, table) except: print table.get_dtype_counts() object_cols = table.dtypes[table.dtypes == 'object'] for col in object_cols.index: print 'removing object column :', col del table[col] store.append(dfname, table) print store store.close()
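# Hedged usage sketch for csv2hdf5: write one CSV as a fixed-format node and
# another as an appendable table node, then read one back. File and dataframe
# names are illustrative.
from pandas import read_hdf

csv2hdf5('survey_2006.csv', 'survey.h5', 'survey_2006')
csv2hdf5('survey_2007.csv', 'survey.h5', 'survey_2007', option='table')

df_2006 = read_hdf('survey.h5', 'survey_2006')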
def pull(start, end): hdf = HDFStore('/home/slin/alldata/leaps/liq.h5', mode='a') dt = start while dt <= end: print dt try: fit_func = partial(fit_full, dt=dt) cores = 48 vpool = Pool(cores) ids = pd.read_hdf('/home/slin/alldata/leaps/secids.h5', 'secids') # ids = ids.iloc[:10] grouped = ids.groupby(['ticker'], as_index=False, sort=False) keys = grouped.groups.keys() split = [list(arr) for arr in np.array_split(keys, cores)] groups = [map(lambda x: grouped.get_group(x), s) for s in split] results = vpool.map(fit_list, groups) frame = pd.concat(results).reset_index(drop=True) print frame try: hdf.put('{0}'.format(dt.strftime('%Y%m%d')), frame, format='table', data_columns=True) except Exception as err: print err except Exception as err: print err finally: dt = (dt + us_bd).date()
class Serialization(object): def __init__(self, filename): self._filename = filename def __enter__(self): self._store = HDFStore(self._filename, complib='blosc', complevel=9) return self def __exit__(self, exc_type, exc_val, exc_tb): self._store.close() @property def keys(self): return self._store.keys() def store_pandas_object(self, name, object, **metadata): self._store.put(name, object) self._store.get_storer(name).attrs.metadata = metadata def retrieve_pandas_object(self, name): # Get the metadata metadata = self._store.get_storer(name).attrs.metadata # Get the object obj = self._store[name] return obj, metadata
def save_content(self, name, filename): """ Saves content from the simulation in an HDF store. We save output_table, input_table, and the default output_table dataframes, along with the other attributes using pickle. TODO : we don't save attributes P, P_default for simulation neither _param, _default_param for datatables. WARNING : Be careful when committing, you may have created a .pk data file. Parameters ---------- name : the base name of the content inside the store. filename : the name of the .h5 file where the table is stored. Created if not existant. """ sys.setrecursionlimit(32000) # Store the tables if self.verbose: print 'Saving content for simulation under name %s' % name ERF_HDF5_DATA_DIR = os.path.join(model.DATA_DIR, 'erf') store = HDFStore(os.path.join(os.path.dirname(ERF_HDF5_DATA_DIR), filename + '.h5')) if self.verbose: print 'Putting output_table in...' store.put(name + '_output_table', self.output_table.table) if self.verbose: print 'Putting input_table in...' store.put(name + '_input_table', self.input_table.table) if self.verbose: print 'Putting output_table_default in...' store.put(name + '_output_table_default', self.output_table_default.table) store.close() # Store all attributes from simulation with open(filename + '.pk', 'wb') as output: if self.verbose: print 'Storing attributes for simulation (including sub-attributes)' pickle.dump(self, output)
for k in diff1: pd.set_printoptions(max_columns=30) listind = table['ind'][table['ind'][ident]==k] print listind for indiv in np.unique(listind['id']): print table['ind'].ix[table['ind']['id']==indiv,['id','period','sexe','idmen','quimen','idfoy','quifoy','conj','mere','pere']] pdb.set_trace() for year in years: goal.remove('survey_'+str(year)) for ent in ('ind','men','foy','fam'): tab = table[ent].ix[table[ent]['period']==year] key = 'survey_'+str(year) + '/'+ent goal.put(key, tab) # if year == 2010: # pdb.set_trace() # tab = table[ent].ix[table[ent]['period']==year] # tab[:5] # len(tab['idfam']) # len(np.unique(tab['idfam'])) # list_qui = tab['idfam'] # double = list_qui.value_counts()[list_qui.value_counts()>1] # tabind = table['ind'].ix[table['ind']['period']==year] store.close() goal.close() # on fais maintenant tourner le modèle OF
class HDFStoreDataFrame(BaseIO): def setup(self): N = 25000 index = tm.makeStringIndex(N) self.df = DataFrame({'float1': np.random.randn(N), 'float2': np.random.randn(N)}, index=index) self.df_mixed = DataFrame({'float1': np.random.randn(N), 'float2': np.random.randn(N), 'string1': ['foo'] * N, 'bool1': [True] * N, 'int1': np.random.randint(0, N, size=N)}, index=index) self.df_wide = DataFrame(np.random.randn(N, 100)) self.start_wide = self.df_wide.index[10000] self.stop_wide = self.df_wide.index[15000] self.df2 = DataFrame({'float1': np.random.randn(N), 'float2': np.random.randn(N)}, index=date_range('1/1/2000', periods=N)) self.start = self.df2.index[10000] self.stop = self.df2.index[15000] self.df_wide2 = DataFrame(np.random.randn(N, 100), index=date_range('1/1/2000', periods=N)) self.df_dc = DataFrame(np.random.randn(N, 10), columns=['C%03d' % i for i in range(10)]) self.fname = '__test__.h5' self.store = HDFStore(self.fname) self.store.put('fixed', self.df) self.store.put('fixed_mixed', self.df_mixed) self.store.append('table', self.df2) self.store.append('table_mixed', self.df_mixed) self.store.append('table_wide', self.df_wide) self.store.append('table_wide2', self.df_wide2) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store(self): self.store.get('fixed') def time_read_store_mixed(self): self.store.get('fixed_mixed') def time_write_store(self): self.store.put('fixed_write', self.df) def time_write_store_mixed(self): self.store.put('fixed_mixed_write', self.df_mixed) def time_read_store_table_mixed(self): self.store.select('table_mixed') def time_write_store_table_mixed(self): self.store.append('table_mixed_write', self.df_mixed) def time_read_store_table(self): self.store.select('table') def time_write_store_table(self): self.store.append('table_write', self.df) def time_read_store_table_wide(self): self.store.select('table_wide') def time_write_store_table_wide(self): self.store.append('table_wide_write', self.df_wide) def time_write_store_table_dc(self): self.store.append('table_dc_write', self.df_dc, data_columns=True) def time_query_store_table_wide(self): self.store.select('table_wide', where="index > self.start_wide and " "index < self.stop_wide") def time_query_store_table(self): self.store.select('table', where="index > self.start and " "index < self.stop") def time_store_repr(self): repr(self.store) def time_store_str(self): str(self.store) def time_store_info(self): self.store.info()
import get_trn_test as gtt import save_dist_m import traintest as tt import plot_results df = gdb.get_current_data() df.to_csv("per_day_from_pandas.csv") # df = pd.read_csv("per_day_from_pandas.csv") preprocess.normalize_by_event_count(df) # hdf5 doesn't like unicode df["country"] = df["country"].apply(lambda x: x.encode("ascii", "ignore")) countrydict = preprocess.get_country_lookup(df) hdf = HDFStore("project_data.h5") hdf.put("per_day_preprocessed", df, format="table", data_columns=True) hdf.get_storer("per_day_preprocessed").attrs.country_lookup = countrydict ##END PREPROCESSING train_years = 5 test_years = 1 hdf = HDFStore("project_data.h5") df = hdf["per_day_preprocessed"] basename_out = "last_6_years" train_start = 20091030 trainxy, testxy, countrylist = gtt.get_train_test(df, train_start, train_years, test_years) train_x = trainxy[0] train_y = trainxy[1] test_x = testxy[0] test_y = testxy[1] np.savez(
8020084, 8001993, 8004222, 8027386, 9039721, 9047848, 9016763]) ] print len(table_in_one) for entity in ['ind','foy','men','fam']: key = 'survey_'+str(year) + '/'+str(entity) vars_entity = from_one_to_three(table_in_one,entity) print entity, vars_entity if entity == 'ind': table_entity = table_in_one[vars_entity] # we take care have all ident and selecting qui==0 else: enum = 'qui'+entity table_entity = table_in_one.ix[table_in_one[enum] ==0 ,['noi','idmen','idfoy','idfam'] + vars_entity] table_entity= table_entity.rename_axis(table_entity['id'+entity],axis=1) print key output.put(key, table_entity) del table_in_one gc.collect() store.close() output.close() # test pour voir si les "lignes" sont nulles #enum = 'qui'+entity #table_in_one[enum] == 0 # #voir = np.array(table_in_one.ix[table_in_one[enum] == 0,vars_entity]) #voir[:,:-1] != 0 #len(np.where( voir[:,:-1] != 0 )[0]) #np.unique(np.where( voir != 0 )[1])
def main(simulation, period=None, output=".h5"): temps = time.clock() output_tab = path_til + "/output/to_run_leg.h5" name_convertion = {'person':'ind','declar':'foy','menage':'men', 'fam':'fam'} # on travaille d'abord sur l'ensemble des tables puis on selectionne chaque annee # on étudie d'abord la table individu pour pouvoir séléctionner les identifiants # step 1 table = {} entities = simulation.entities for entity in entities: nom = entity.name if nom == 'person': ent = name_convertion[nom] # convert from PyTables to Pandas table[ent] = pd.DataFrame(entity.array.columns) # rename variables to make them OF ones table['ind'] = table['ind'].rename(columns={ 'men': 'idmen', 'foy': 'idfoy', 'id': 'noi', 'statmarit': 'civilstate'}) # get years years = np.unique(table['ind']['period'].values/100) ent = 'ind' # création de variable # useless since agem is in simu # table[ent]['agem'] = 12 * table[ent]['age'] table[ent]['ageq'] = table[ent]['age']/5 - 4 table[ent]['ageq'] = table[ent]['ageq']*(table[ent]['ageq'] > 0) table[ent]['ageq'] = 12 + (table[ent]['ageq']-12)*(table[ent]['ageq'] < 12) #TODO: modifier pour les jeunes veufs # create fam entity try: table[ent][['idfam','quifam']] = table[ent].loc[:,['idmen','quimen']] except: pdb.set_trace() # save information on qui == 0 foy0 = table[ent].ix[table[ent]['quifoy']==0,['noi','idfoy','idmen','idfam','period']] men0 = table[ent].ix[table[ent]['quimen']==0,['noi','idfoy','idmen','idfam','period']] # # Travail sur les qui quand on ne controle pas dans la simulation que tout le monde n'est pas qui==2 ## inutile car fait maintenant dans la simulation mais peut-être mieux à refaire ici un jour ## parce que ça prend du temps dans la simulation # time_qui = time.clock() # for ent in ('men','foy'): # 'fam' un jour... 
# print "Deal with qui for ", ent # qui= 'qui'+ent # ident = 'id'+ent # trav = table['ind'].ix[table['ind'][qui]==2, [ident,qui,'period']] # for name, groupfor nom in ('menage','declar','fam'):for nom in ('menage','declar','fam'): in trav.groupby([ident,'period']): # to_add = range(len(group)) # group[qui] = group[qui]+to_add # table['ind'].ix[group[qui].index, qui] = group[qui] # print "les qui pour ", ent," sont réglés" # time_qui = time.clock() - time_qui # print "le temps passé à s'occuper des qui a été",time_qui for entity in entities: nom = entity.name if nom in name_convertion: if nom != 'person': pd.DataFrame(entity.array.columns) ent = name_convertion[nom] # convert from PyTables to Pandas table[ent] = pd.DataFrame(entity.array.columns) ident = 'id'+ent table[ent] = table[ent].rename(columns={'id': ident}) table[ent] = merge(table[ent], eval(ent +'0'), how='left', left_on=[ident,'period'], right_on=[ident,'period']) # traduction de variable en OF pour ces entités if ent=='men': # nbinde est limité à 6 personnes et donc valeur = 5 en python table[ent]['nbinde'] = (table[ent]['nb_persons']-1) * (table[ent]['nb_persons']-1 <=5) +5*(table[ent]['nb_persons']-1 >5) table['fam'] = men0 if period is not None: years=[period] print years # a comnmenter quand on est sur du nodele pour gagner un peu de temps # test = {} # for year in years: # for nom in ('menage','declar'): # ent = name_convertion[nom] ## print ent, base, ident # test[ent] = pd.DataFrame(entity.array.columns).rename(columns={'id': ident}) # test[ent] = test[ent].ix[test[ent]['period']==year,:] # # test0 = eval(ent +'0')[eval(ent +'0')['period']==year] # # tab = table[ent].ix[table[ent]['period']==year,['noi','id'+ent,'idfam']] # ind = table['ind'].ix[table['ind']['period']==year,['qui'+ent]] # try: # list_ind = ind[ind==0] # except: # pdb.set_trace() # lidmen = test[ent][ident] # lidmenU = np.unique(lidmen) # diff1 = set(test0[ident]).symmetric_difference(lidmenU) # print year, ent, diff1 # for k in diff1: # # pd.set_printoptions(max_columns=30) # listind = table['ind'][table['ind'][ident]==k] # print listind # for indiv in np.unique(listind['noi']): # print table['ind'].ix[table['ind']['noi']==indiv,['noi','period','sexe','idmen','quimen','idfoy','quifoy','conj','mere','pere']] # pdb.set_trace() #available_years = sorted([int(x[-4:]) for x in store.keys()]) for year in years: if output=='.h5': try: os.remove(output_tab) except: print("Attention, la table intermediaire n'a pas ete supprimee") goal = HDFStore(output_tab) goal.remove('survey_'+str(year)) for ent in ('ind','men','foy','fam'): tab = table[ent].ix[table[ent]['period']/100==year] key = 'survey_'+str(year) + '/'+ent goal.put(key, tab) goal.close() else: for ent in ('ind','men','foy','fam'): table[ent] = table[ent].ix[table[ent]['period']/100==year] return table
class HDFStoreDataFrame(BaseIO): goal_time = 0.2 def setup(self): N = 25000 index = tm.makeStringIndex(N) self.df = DataFrame( { 'float1': np.random.randn(N), 'float2': np.random.randn(N) }, index=index) self.df_mixed = DataFrame( { 'float1': np.random.randn(N), 'float2': np.random.randn(N), 'string1': ['foo'] * N, 'bool1': [True] * N, 'int1': np.random.randint(0, N, size=N) }, index=index) self.df_wide = DataFrame(np.random.randn(N, 100)) self.start_wide = self.df_wide.index[10000] self.stop_wide = self.df_wide.index[15000] self.df2 = DataFrame( { 'float1': np.random.randn(N), 'float2': np.random.randn(N) }, index=date_range('1/1/2000', periods=N)) self.start = self.df2.index[10000] self.stop = self.df2.index[15000] self.df_wide2 = DataFrame(np.random.randn(N, 100), index=date_range('1/1/2000', periods=N)) self.df_dc = DataFrame(np.random.randn(N, 10), columns=['C%03d' % i for i in range(10)]) self.fname = '__test__.h5' self.store = HDFStore(self.fname) self.store.put('fixed', self.df) self.store.put('fixed_mixed', self.df_mixed) self.store.append('table', self.df2) self.store.append('table_mixed', self.df_mixed) self.store.append('table_wide', self.df_wide) self.store.append('table_wide2', self.df_wide2) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store(self): self.store.get('fixed') def time_read_store_mixed(self): self.store.get('fixed_mixed') def time_write_store(self): self.store.put('fixed_write', self.df) def time_write_store_mixed(self): self.store.put('fixed_mixed_write', self.df_mixed) def time_read_store_table_mixed(self): self.store.select('table_mixed') def time_write_store_table_mixed(self): self.store.append('table_mixed_write', self.df_mixed) def time_read_store_table(self): self.store.select('table') def time_write_store_table(self): self.store.append('table_write', self.df) def time_read_store_table_wide(self): self.store.select('table_wide') def time_write_store_table_wide(self): self.store.append('table_wide_write', self.df_wide) def time_write_store_table_dc(self): self.store.append('table_dc_write', self.df_dc, data_columns=True) def time_query_store_table_wide(self): self.store.select('table_wide', where="index > self.start_wide and " "index < self.stop_wide") def time_query_store_table(self): self.store.select('table', where="index > self.start and " "index < self.stop") def time_store_repr(self): repr(self.store) def time_store_str(self): str(self.store) def time_store_info(self): self.store.info()
def put_to_hdf(df):
    # the original concatenated the path with a bare backslash and a ".5"
    # extension; os.path.join and ".h5" are assumed to be the intent
    hdf = HDFStore(os.path.join(dirname, "_InstrumentData.h5"))
    hdf.put('InstrumentData', df, format='table', data_columns=True)
    hdf.close()  # closes the file
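# Hedged counterpart to put_to_hdf, not from the original source: reopen the
# store read-only and pull the table back. select() works because the data
# was written with format='table'.
import os
from pandas import HDFStore

def read_from_hdf():
    with HDFStore(os.path.join(dirname, "_InstrumentData.h5"), mode='r') as hdf:
        return hdf.select('InstrumentData')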
def extract_relevant_data( case_list = [], exceptions = [], y_delta_locs = [], x_2h_locs = [] , plot = False): """ This will extract the wall normal data at the spanwise location TE at a certain y density """ from os import listdir from os.path import join,split from pandas import DataFrame, HDFStore, read_pickle from boundary_layer_routines import return_bl_parameters from raw_data_processing_routines import decript_case_name from progressbar import ProgressBar,Percentage from progressbar import Bar,ETA,SimpleProgress from numpy import array, round, linspace from data_cleaning_routines import show_surface_from_df x_2h_locs = round( array( x_2h_locs ), 2 ) y_delta_locs = round( array( y_delta_locs ), 2 ) # Get the available HDF5 files ############################################# hdf5_root = '/media/carlos/6E34D2CD34D29783/' +\ '2015-02_SerrationPIV/TR_Data_Location_Calibrated_Article3' if not len(case_list): hdf5_files = [f for f in listdir( hdf5_root ) \ if f.endswith('.hdf5') \ and not f in exceptions ] else: hdf5_files = [f for f in listdir( hdf5_root ) \ if f.endswith('.hdf5') \ and f in case_list ] # ########################################################################## for hf in [join( hdf5_root, f ) for f in hdf5_files]: f = split( hf )[1].replace('_AirfoilNormal','')\ .replace('_Aligned.hdf5','') print " Extracting data from {0}".format(f) print " at the normalized streamwise locations:" print " {0}".format( x_2h_locs ) hdf_t = HDFStore( hf, 'r' ) # Get the available coordinates ######################################## hf_coords = hdf_t.select('data', where = [ 't = 0' ], columns = [ 'x', 'y' ] ) # ###################################################################### # Turn the non-dim requested locations into physical coords ############ requested_locations = [] requested_normalized_locations = [] #for x,x_norm in zip(x_2h_locs * tooth_length, x_2h_locs): # for y_d in y_delta_locs: # bl_params = return_bl_parameters( f , [x] ) # d_99 = bl_params.delta_99.values[0] # #if "STE" in f: # # d_99 = 9.4 # y = y_d * d_99 # requested_locations.append( (x,y) ) # requested_normalized_locations.append( ( x_norm, y_d ) ) # Get the normalization locations depending on the case ################ if 'z00' in f and not 'STE' in f: x_bl_loc = 40 elif 'z05' in f: x_bl_loc = 20 elif 'z10' in f or 'STE' in f: x_bl_loc = 0 bl_params = return_bl_parameters( f , [x_bl_loc] ) d_99 = bl_params.delta_99.values[0] for x,x_norm in zip(x_2h_locs * tooth_length, x_2h_locs): for y_d in y_delta_locs: y = y_d * d_99 requested_locations.append( (x,y) ) requested_normalized_locations.append( ( x_norm, y_d ) ) print " Normalizing to a BL thickness of {0:.2f} mm".\ format(d_99) # ###################################################################### available_xy_locs = hf_coords[ ( hf_coords.x > min( x_2h_locs ) * 40. ) & \ ( hf_coords.x < max( x_2h_locs ) * 40. 
) & \ ( hf_coords.y > min( y_delta_locs ) * d_99 ) & \ ( hf_coords.y < max( y_delta_locs ) * d_99 ) ][ ['x','y'] ] available_xy_locs = [tuple(x) for x in available_xy_locs.values] if plot: trailing_edge,phi,alpha,U,z = decript_case_name( f ) if trailing_edge == 'serrated': device = 'Sr20R21' elif trailing_edge == 'straight': device = 'STE' elif trailing_edge == 'slitted': device = 'Slit20R21' case_name = "{0}_phi{1}_alpha{2}_U{3}_loc{4}_tr.dat".format( device, phi, alpha, U, z ) df_av = read_pickle( 'averaged_data/' + case_name + '.p' ) show_surface_from_df( df_av , points = available_xy_locs , plot_name = 'ReservedData/' + f + '.png' ) query = '' cnt_all = 0 cnt = 0 time_series_hdf = HDFStore( 'ReservedData/' + f + '.hdf5' , 'w' ) vertical_split_blocks = 10 progress = ProgressBar( widgets=[ Bar(),' ', Percentage(),' ', ETA(), ' (query bunch ', SimpleProgress(),')'], maxval = vertical_split_blocks ).start() # Don't try to get it all at once; split the vertical in 4 pieces y_ranges = linspace( min( y_delta_locs ), max( y_delta_locs ), vertical_split_blocks ) * d_99 xmin = min(x_2h_locs) * 40. xmax = max(x_2h_locs) * 40. for ymin, ymax in zip( y_ranges[:-1], y_ranges[1:] ): query = " x>={0} & x<{1} & y>={2} & y<{3} ".\ format( xmin, xmax, ymin, ymax ) df_t = hdf_t.select( key = 'data', where = [ query ], ) df_t['near_x_2h'] = round( df_t.x / 40., 4 ) df_t['near_y_delta'] = round( df_t.y / d_99, 4 ) if not cnt: time_series_hdf.put( 'data', df_t , data_columns = [ 'near_x_2h', 'near_y_delta', 't' ], format = 't') else: time_series_hdf.append( 'data', df_t , data_columns = [ 'near_x_2h', 'near_y_delta', 't' ], format = 't') cnt_all += 1 cnt += 1 progress.update(cnt_all) df_t = DataFrame() progress.finish() hdf_t.close() time_series_hdf.close()
def read_raw_tecplot_case_and_write_pandas_hdf5( case_folder, root = 0, output_file = 0, serration_angle = 0, angle_correction = 0, height_correction = 0, streamwise_correction = 0, overwrite = False, time_step_limit = 0, airfoil_normal = False, ): from os.path import isfile,join,splitext from os import listdir from progressbar import ProgressBar,Percentage,Bar,ETA,SimpleProgress from pandas import HDFStore # File related things ###################################################### if not output_file: output_file = case_folder+".hdf5" if airfoil_normal: output_file = output_file+"_AirfoilNormal" if not output_file.endswith('.hdf5'): output_file = output_file.replace(".hdf5","")+".hdf5" if isfile(output_file) and not overwrite: print " Exiting; file exists:\n{0}".format(output_file) return 0 else: print " Writing\n{0}".format(output_file) # ########################################################################## hdf = HDFStore(output_file) time_step_files = sorted([f for f in listdir(join(root,case_folder)) \ if splitext(f)[1] == '.dat']) if time_step_limit: time_step_files = time_step_files[:time_step_limit] progress = ProgressBar( widgets=[ Bar(),' ', Percentage(),' ', ETA(), ' (file ', SimpleProgress(),')'], maxval=len(time_step_files) ).start() cnt = 0 for f,t in zip(time_step_files,range(len(time_step_files))): df_t = read_tecplot_file_and_correct_for_location_rotation( tecplot_file = join(root,case_folder,f), serration_angle = serration_angle, angle_correction = angle_correction, height_correction = height_correction, streamwise_correction = streamwise_correction, time_step = t, airfoil_normal = airfoil_normal, ) df_t = get_vorticity(df_t) if cnt == 0: df = df_t.copy() else: df = df.append( df_t, drop_index = True) #df = df.drop_duplicates() try: x_cnt = df.x.value_counts().max() except AttributeError: print df raise if not x_cnt.max() == x_cnt.min(): print " There's something wrong, counted {0} instances of x"\ .format(x_cnt.max()) return 0 if t == 30: hdf.put(case_folder, df.convert_objects(), format='table', data_columns=True ) elif cnt == 30 and not t == cnt: hdf.append(case_folder, df.convert_objects(), format='table', data_columns=True ) cnt = 0 cnt += 1 progress.update(t) progress.finish() hdf.close() return 1
def table_for_of(simulation, period=None, check_validity=False, save_tables=False): temps = time.clock() output_tab = os.path.join(path_til[0], "output", "to_run_leg.h5" ) # on travaille d'abord sur l'ensemble des tables puis on selectionne chaque annee # on étudie d'abord la table individu pour pouvoir séléctionner les identifiants # step 1 table = {} entities = simulation.entities entities_name = map( lambda e: e.name, simulation.entities) def _get_entity(name): position = entities_name.index(name) return simulation.entities[position] ind = _get_entity('person') table['ind'] = DataFrame(ind.array.columns) table['ind'] = table['ind'].rename(columns={'men': 'idmen', 'foy': 'idfoy', 'id': 'noi', 'statmarit': 'civilstate'}) # création de variable table['ind']['ageq'] = table['ind']['age']/5 - 4 table['ind']['ageq'] = table['ind']['ageq']*(table['ind']['ageq'] > 0) table['ind']['ageq'] = 12 + (table['ind']['ageq']-12)*(table['ind']['ageq'] < 12) #TODO: modifier pour les jeunes veufs # create fam entity try: table['ind'][['idfam','quifam']] = table['ind'].loc[:,['idmen','quimen']] except: pdb.set_trace() # # Travail sur les qui quand on ne controle pas dans la simulation que tout le monde n'est pas qui==2 ## inutile car fait maintenant dans la simulation mais peut-être mieux à refaire ici un jour ## parce que ça prend du temps dans la simulation # time_qui = time.clock() # for ent in ('men','foy'): # 'fam' un jour... # print "Deal with qui for ", ent # qui= 'qui'+ent # ident = 'id'+ent # trav = table['ind'].ix[table['ind'][qui]==2, [ident,qui,'period']] # for name, groupfor nom in ('menage','declar','fam'):for nom in ('menage','declar','fam'): in trav.groupby([ident,'period']): # to_add = range(len(group)) # group[qui] = group[qui]+to_add # table['ind'].ix[group[qui].index, qui] = group[qui] # print "les qui pour ", ent," sont réglés" # time_qui = time.clock() - time_qui # print "le temps passé à s'occuper des qui a été",time_qui ind = table['ind'] for ent in ['men','foy']: entity = _get_entity(of_name_to_til[ent]) table[ent] = DataFrame(entity.array.columns) id = 'id' + ent qui = 'qui' + ent table[ent] = table[ent].rename(columns={'id': id}) # travail sur les qui nb_qui = ind.loc[ind[qui]>1, ['noi',id,qui]].groupby(id, sort=True).size() if len(nb_qui)>0: new_qui = concatenated_ranges(nb_qui) + 2 table['ind'] = table['ind'].sort(id) #note the sort col_qui = table['ind'][qui] col_qui[col_qui>1] = new_qui table['ind'][qui] = col_qui # informations on qui == 0 qui0 = table['ind'].loc[table['ind']['qui' + ent]==0,['noi','idfoy','idmen','idfam','period']] table[ent] = merge(table[ent], qui0, how='left', left_on=[id,'period'], right_on=[id,'period']) if ent=='men': # nbinde est limité à 6 personnes et donc valeur = 5 en python table[ent]['nbinde'] = (table[ent]['nb_persons']-1) * (table[ent]['nb_persons']-1 <=5) +5*(table[ent]['nb_persons']-1 >5) table['fam'] = qui0 # remove non-ordinary household cond = (table['ind']['idmen'] >= 10) & (table['ind']['idfoy'] >= 10) table['ind'] = table['ind'][cond] table['men'] = table['men'][table['men']['idmen']>=10] table['foy'] = table['foy'][table['foy']['idfoy']>=10] table['fam'] = table['fam'][table['fam']['idfam']>=10] # get years years = np.unique(table['ind']['period'].values/100) if period is not None: years=[period] print years if check_validity: for year in years: ind = table['ind'] for ent in ['men','foy']: #fam id = 'id' + ent qui = 'qui' + ent tab = table[ent] try: assert ind.groupby([id,qui]).size().max() == 1 except: print ent pb = 
ind.groupby([id,qui]).size() > 1 print(ind.groupby([id,qui]).size()[pb]) pdb.set_trace() print(ind[ind[id]==43][['noi',id,qui]]) qui0 = ind[ind[qui]==0] try: assert qui0[id].isin(tab[id]).all() except: cond = tab[id].isin(qui0[id]) print(tab[~cond]) pdb.set_trace() try: assert tab[id].isin(qui0[id]).all() except: cond = tab[id].isin(qui0[id]) print(tab[~cond]) pdb.set_trace() for year in years: if save_tables: try: os.remove(output_tab) except: print("Attention, la table intermediaire n'a pas ete supprimee") goal = HDFStore(output_tab) goal.remove('survey_'+str(year)) for ent in ('ind','men','foy','fam'): tab = table[ent].loc[table[ent]['period']/100==year] key = 'survey_'+str(year) + '/'+ent goal.put(key, tab) goal.close() else: for ent in ('ind','men','foy','fam'): table[ent] = table[ent].loc[table[ent]['period']/100==year] return table
from pandas import HDFStore, merge # DataFrame import numpy as np import pdb import time import pandas as pd output = "C:/openfisca/output/liam/" get_years = HDFStore("C:/openfisca/src/countries/france/data/surveyLiam.h5") input_h5 = HDFStore(output + "LiamLeg.h5") output_h5 = HDFStore(output + "LiamLeg2.h5") years = [x[-4:] for x in dir(get_years.root) if x[0] != "_"] nb_year = len(years) get_years.close() ent = "ind" list_tab = ["survey_" + x + "/" + ent for x in years] output_h5["ind"] = pd.DataFrame() for ent in ("ind", "men", "foy", "fam"): output_h5[ent] = pd.DataFrame() for year in years: name_tab = "survey_" + year + "/" + ent tab = input_h5[name_tab] tab["period"] = pd.Series(np.ones(len(tab)) * int(year)) output_h5.put(ent, output_h5[ent].append(tab)) # par entité, lire les tables pour chaque année, ajouter périod, et ajouter tout ça
def save_df(path, df):
    """Save DataFrame df in the HDF5 store at path, in the 'logs' table

    FIXME: Use a more neutral name for the table"""
    store = HDFStore(path)
    store.put('logs', df)
    store.close()
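def load_df(path):
    """Hedged counterpart to save_df (not in the original source): read the
    DataFrame back from the 'logs' table of the HDF5 store at path."""
    from pandas import HDFStore
    store = HDFStore(path)
    df = store['logs']
    store.close()
    return df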
def main(args):
    """ main function """
    if args.num_store and args.num_store > _NUM_STORE_TOTAL:
        print("[FAIL] Data should only have %d stores!" % (_NUM_STORE_TOTAL))
        sys.exit(1)
    if args.nrows and args.nrows < _NUM_STORE_TOTAL:
        print("[FAIL] Should read more rows than the number of stores!")
        sys.exit(1)

    train_data = None
    test_data = None
    store_data = None
    additional_info = pd.DataFrame()

    print("Loading data files")
    if args.type == _ARG_TRAIN:
        train_data = load_data_file(args.file[0], _DTYPE_TRAIN, nrows=args.nrows)
    elif args.type == _ARG_TEST:
        test_data = load_data_file(args.file[0], _DTYPE_TEST, nrows=args.nrows)
    elif args.type == _ARG_STORE:
        store_data = load_data_file(args.file[0], _DTYPE_STORE, nrows=args.nrows,
                                    parse_date=False)
    elif args.type == "all":
        if len(args.file) < 3:
            print("all option requires three input files")
            sys.exit(1)
        train_data = load_data_file(args.file[0], _DTYPE_TRAIN, nrows=args.nrows)
        test_data = load_data_file(args.file[1], _DTYPE_TEST, nrows=args.nrows)
        store_data = load_data_file(args.file[2], _DTYPE_STORE, nrows=args.nrows,
                                    parse_date=False)
    else:
        print("Invalid database type")
        sys.exit(1)

    # Reverse dates
    if train_data is not None:
        train_data = train_data.iloc[::-1]
    if test_data is not None:
        test_data = test_data.iloc[::-1]

    if args.num_store:
        print("Selecting %d random stores" % (args.num_store))
        store_list = []
        random.seed()
        # randint is inclusive on both bounds, so draw ids from 1.._NUM_STORE_TOTAL
        for _ in range(args.num_store):
            store_list.append(random.randint(1, _NUM_STORE_TOTAL))
        if train_data is not None:
            train_data = train_data.loc[train_data['Store'].isin(store_list)]
        if test_data is not None:
            test_data = test_data.loc[test_data['Store'].isin(store_list)]
        if store_data is not None:
            store_data = store_data.loc[store_data['Store'].isin(store_list)]

    print("Perform additional processing...")
    train_data, test_data, store_data, additional_info \
        = process_data(train_data, test_data, store_data, additional_info)

    print("Writing to HDF5 type file %s" % (args.file_out.name))
    hdf = HDFStore(args.file_out.name, mode='w')
    if train_data is not None:
        hdf.put(_ARG_TRAIN + "_data", train_data, format='table', data_columns=True)
    if test_data is not None:
        hdf.put(_ARG_TEST + "_data", test_data, format='table', data_columns=True)
    if store_data is not None:
        hdf.put(_ARG_STORE + "_data", store_data, format='table', data_columns=True)
    #if not additional_info.empty:
    hdf.put("common_info", additional_info, format='table', data_columns=True)
    hdf.close()
    return
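# Because the tables above are written with format='table' and data_columns=True,
# they can be filtered on columns at read time.  A sketch of reading one back;
# the file name is an example and the key assumes _ARG_TRAIN == "train".
import pandas as pd

train = pd.read_hdf('rossmann.h5', 'train_data', where='Store == 1')
print(len(train))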
meanCompetitionDistance = np.mean(data_store.CompetitionDistance)
data_store['CompetitionDistance'] = data_store['CompetitionDistance'].fillna(meanCompetitionDistance)
data_store['CompetitionDistance'] = data_store['CompetitionDistance'].astype(np.float32)

# binary missing values
# the usual encoding would be -1 for negatives, 0 for missing and 1 for positives,
# but the Open column has only a very small number of missing values, so they are set to 1
data_test.loc[data_test.Open.isnull(), 'Open'] = 1
data_test['Open'] = data_test['Open'].astype(np.int8)

print('Normalize data set ...')
min_max_scaler = preprocessing.MinMaxScaler()
# MinMaxScaler expects 2-D input, so pass a one-column frame and flatten the result
data_store['CompetitionDistance'] = min_max_scaler.fit_transform(data_store[['CompetitionDistance']]).ravel()

print('Create ultimate data')
# concatenate the datasets, including the store information and the mean features
data_ut_store = pd.merge(data_mean, data_store, on='Store')
data_ut_train = pd.merge(data_train, data_ut_store, on='Store')
data_ut_test = pd.merge(data_test, data_ut_store, on='Store')
assert(len(data_ut_train) == len(data_train))
assert(len(data_ut_test) == len(data_test))

print('Storing data ...')
hdf = HDFStore(data_dir + 'data.h5')
hdf.put('data_train', data_ut_train, format='table', data_columns=True)
hdf.put('data_test', data_ut_test, format='table', data_columns=True)
hdf.put('data_store', data_ut_store, format='table', data_columns=True)
hdf.close()
print('Done ...')
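# The same min-max normalisation can also be done directly in pandas, which
# avoids shaping issues with scikit-learn (newer versions require 2-D input);
# this sketch reuses the data_store frame from the snippet above.
cd = data_store['CompetitionDistance']
data_store['CompetitionDistance'] = (cd - cd.min()) / (cd.max() - cd.min())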
def read_raw_tecplot_folder_and_write_pandas_hdf5(
    case_folder,
    root        = 0,
    output_file = 0,
    output_root = 0,
    overwrite   = False,
):
    from os.path import isfile, join, splitext
    from os import listdir
    from progressbar import ProgressBar, Percentage, Bar
    from progressbar import ETA, SimpleProgress
    from pandas import DataFrame, HDFStore

    # File related things ######################################################
    if not output_file:
        output_file = case_folder + "_Aligned.hdf5"
    if not output_root:
        output_root = '/media/carlos/6E34D2CD34D29783/' +\
            '2015-02_SerrationPIV/TR_Data_Location_Calibrated_Article3'
    if not output_file.endswith('_Aligned.hdf5'):
        output_file = output_file.replace("_Aligned.hdf5", "") + "_Aligned.hdf5"
    if 'STE' in case_folder or 'z10' in case_folder:
        output_file = output_file.replace('.hdf5', '_AirfoilNormal.hdf5')

    if isfile(join(output_root, output_file)) and not overwrite:
        print " Exiting; file exists:\n {0}".format(output_file)
        return 0
    else:
        print " Writing\n {0}".format(output_file)
    # ##########################################################################

    time_step_files = sorted(
        [join(root, case_folder, f) for f in listdir(join(root, case_folder)) \
         if splitext(f)[1] == '.dat']
    )

    progress = ProgressBar(
        widgets=[Bar(), ' ', Percentage(), ' ', ETA(), ' (file ', SimpleProgress(), ')'],
        maxval=len(time_step_files)
    ).start()

    cnt = 0
    hdf_store = HDFStore(join(output_root, output_file))
    for f, t in zip(time_step_files, range(len(time_step_files))):
        df_t = read_tecplot_file(
            tecplot_folder         = join(root, case_folder),
            tecplot_time_step_file = f,
            time_step              = t,
        )
        if cnt == 0:
            df = df_t.copy()
        else:
            df = df.append(df_t, ignore_index=True)
        if cnt == 50:
            df = correct_df_translation_rotation(df)\
                [['x', 'y', 't', 'u', 'v', 'w']]
            df = df.sort_values(by=['x', 'y', 't'])
            #df.set_index(['x','y'], inplace=True)
            if t == 0:
                hdf_store.put('data', df, data_columns=['x', 'y', 't'], format='t')
            else:
                hdf_store.append('data', df, data_columns=['x', 'y', 't'], format='t')
            cnt = 0
            df = DataFrame()
        cnt += 1
        progress.update(t)

    # flush any time steps still buffered after the last full chunk of 50,
    # so that trailing files are not silently dropped
    if cnt > 0 and len(df):
        df = correct_df_translation_rotation(df)\
            [['x', 'y', 't', 'u', 'v', 'w']]
        df = df.sort_values(by=['x', 'y', 't'])
        hdf_store.append('data', df, data_columns=['x', 'y', 't'], format='t')

    progress.finish()
    hdf_store.close()
    return 1
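# Because 'data' is stored as an appendable table with data_columns ['x', 'y', 't'],
# slices can be read back without loading the whole file; the path below is an example.
from pandas import HDFStore

store = HDFStore('/path/to/case_Aligned.hdf5')
first_steps = store.select('data', where='t < 10')  # only the first time steps
store.close()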
class LogSaver: """ self.directory : Directory structure for temp and saved files self.log_list : List of server.log files to process self.extra : True if log messages and thread ids are to be saved too self.history_path : History of server.log conversions saved here self.progress_store_path : HDF5 file that holds one DataFrame for each server.log file self.store_path : Final DataFrame of all server.log entries saved here self.history : History of server.log conversions """ FINAL = 'logs' PROGRESS = 'progress' HISTORY = 'history' @staticmethod def normalize(name): return re.sub(r'[^a-zA-Z0-9]', '_', name) @staticmethod def make_name(base_name, extra): if extra: return base_name + '.extra' else: return base_name #@staticmethod #def temp_name(log_list, extra): # hsh = hash(log_list) # sgn = 'n' if hsh < 0 else 'p' # temp = 'temp_%s%08X' % (sgn, abs(hsh)) # return LogSaver.make_name(temp, extra) def __init__(self, store_path, log_list, extra): self.directory = ObjectDirectory(store_path) self.log_list = tuple(sorted(log_list)) self.extra = extra self.history_path = self.directory.get_path(LogSaver.HISTORY, temp=True) self.progress_store_path = self.directory.get_path(LogSaver.PROGRESS, temp=True, is_df=True) self.store_path = self.directory.get_path(LogSaver.make_name(LogSaver.FINAL, extra), is_df=True) self.history = ObjectDirectory.load_object(self.history_path, {}) self.saved = False def __repr__(self): return '\n'.join('%s: %s' % (k,v) for k,v in self.__dict__.items()) def __str__(self): return '\n'.join([repr(self), '%d log files' % len(self.log_list)]) def save_all_logs(self, force=False): if os.path.exists(self.store_path): final_store = HDFStore(self.store_path) print 'Keys: %s' % final_store final_store.close() return if not force: assert not os.path.exists(self.history_path), ''' %s exists but %s does not. There appears to be a conversion in progress. -f forces conversion to complete. 
            ''' % (self.history_path, self.store_path)

        self.directory.make_dir_if_necessary(self.progress_store_path)
        self.progress_store = HDFStore(self.progress_store_path)
        for path in self.log_list:
            self.save_log(path)
        self.check()

        print '--------'
        print 'All tables in %s' % self.progress_store_path
        print self.progress_store.keys()
        print '--------'

        def get_log(path):
            try:
                return self.progress_store.get(LogSaver.normalize(path))
            except Exception as e:
                print
                print path
                raise e

        df_list = [get_log(path) for path in self.log_list]
        self.progress_store.close()
        print 'Closed %s' % self.progress_store_path

        df_all = pd.concat(df_list)
        print 'Final list has %d entries' % len(df_all)

        final_store = HDFStore(self.store_path)
        final_store.put('logs', df_all)
        print 'Keys: %s' % final_store
        final_store.close()
        print 'Closed %s' % self.store_path

        # Save the history in a corresponding file
        self.directory.save('history', self.history)
        print 'Saved history'
        self.saved = True

    def test_store(self):
        final_store = HDFStore(self.store_path)
        print '----'
        print final_store.keys()
        print '-' * 80
        logs = final_store['/logs']
        print type(logs)
        print len(logs)
        print logs.columns
        final_store.close()

    def cleanup(self):
        os.remove(self.progress_store_path)
        os.remove(self.history_path)

    def delete(self):
        os.remove(self.store_path)

    def save_log(self, path):
        """Load all the valid log entry lines in the log file at `path` into a
           pandas DataFrame and put it in the progress store.
           The index of the DataFrame is the uniquified timestamps of the log entries
        """
        if path in self.history:
            return
        print 'Processing %s' % path,
        start = time.time()
        header, df = load_log(path, extra=self.extra)
        if df is None:
            print 'Could not process %s' % path
            return
        self.progress_store.put(LogSaver.normalize(path), df)
        load_time = time.time() - start
        self.history[path] = {
            'start': df.index[0],
            'end': df.index[-1],
            'load_time': int(load_time),
            'num': len(df),
            'header': header
        }
        ObjectDirectory.save_object(self.history_path, self.history)
        del df
        print {k: v for k, v in self.history[path].items() if k != 'header'},
        print '%d of %d' % (len(self.history), len(self.log_list))

    def check(self):
        history = ObjectDirectory.load_object(self.history_path, {})
        sorted_keys = history.keys()
        sorted_keys.sort(key=lambda k: history[k]['start'])
        print '-' * 80
        print 'Time range by log file'
        for i, path in enumerate(sorted_keys):
            hist = history[path]
            print '%2d: %s --- %s : %s' % (i, hist['start'], hist['end'], path)

        path0 = sorted_keys[0]
        for path1 in sorted_keys[1:]:
            hist0, hist1 = history[path0], history[path1]
            assert hist0['end'] < hist1['start'], '''
            -----------
            %s
            %s
            start: %s
            end  : %s
            -----------
            %s
            %s
            start: %s
            end  : %s
            ''' % (
                path0, hist0, hist0['start'], hist0['end'],
                path1, hist1, hist1['start'], hist1['end'])
            # advance the window so consecutive files are compared pairwise
            path0 = path1
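# A minimal sketch of the combine step LogSaver performs: per-file frames are
# concatenated and the result should be monotonically increasing in time,
# mirroring the ordering that check() enforces.  The data and file name are made up.
import pandas as pd

df_a = pd.DataFrame({'msg': ['a1', 'a2']},
                    index=pd.to_datetime(['2020-01-01 10:00', '2020-01-01 11:00']))
df_b = pd.DataFrame({'msg': ['b1']},
                    index=pd.to_datetime(['2020-01-02 09:00']))
df_all = pd.concat([df_a, df_b])
assert df_all.index.is_monotonic_increasing

store = pd.HDFStore('logs_demo.h5')
store.put('logs', df_all)
store.close()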