def open(self, name):
    """Open a previously imported HDF5 dataset and attach it as ``self.store``.

    Args:
        name: file name of the dataset inside ``self.importpath``.

    Raises:
        FileNotFoundError: if the import directory or the requested file
            does not exist.
    """
    # The import directory itself must exist before any file lookup.
    if not path.exists(self.importpath):
        raise FileNotFoundError(path.join(self.dataroot, 'imported'))
    filename = path.join(self.importpath, name)
    # BUG FIX: the original raised FileExistsError for a *missing* file;
    # the correct exception for an absent file is FileNotFoundError
    # (matching the directory check above).
    if not path.exists(filename):
        raise FileNotFoundError(filename)
    self.store = HDFStore(filename)
def test_walk_groups(self):
    """walk_groups() must yield every group path with the correct
    sub-groups and the correct leaf frames."""
    with tm.ensure_clean('walk_groups.hdf') as filename:
        # FIX: use the store as a context manager so the file handle is
        # released even when an assertion below fails (the original
        # leaked the open handle on failure).
        with HDFStore(filename, 'w') as store:
            dfs = {
                'df1': pd.DataFrame([1, 2, 3]),
                'df2': pd.DataFrame([4, 5, 6]),
                'df3': pd.DataFrame([6, 7, 8]),
                'df4': pd.DataFrame([9, 10, 11]),
            }
            store.put('/first_group/df1', dfs['df1'])
            store.put('/first_group/df2', dfs['df2'])
            store.put('/second_group/df3', dfs['df3'])
            store.put('/second_group/third_group/df4', dfs['df4'])
            # Expected (sub-groups, frames) for every walkable path.
            expect = {
                '/': ({'first_group', 'second_group'}, set()),
                '/first_group': (set(), {'df1', 'df2'}),
                '/second_group': ({'third_group'}, {'df3'}),
                '/second_group/third_group': (set(), {'df4'}),
            }
            for path, groups, frames in store.walk_groups():
                self.assertIn(path, expect)
                expect_groups, expect_frames = expect[path]
                self.assertEqual(expect_groups, set(groups))
                self.assertEqual(expect_frames, set(frames))
                for frame in frames:
                    frame_path = '/'.join([path, frame])
                    # FIX: assertTrue replaces the long-deprecated assert_.
                    self.assertTrue(store.get(frame_path).equals(dfs[frame]))
def hdf():
    """Fetch daily history for ticker 000875 and persist it to the HDF5 store."""
    frame = ts.get_hist_data('000875')
    # df.to_hdf('c:/day/store.h5','table')
    h5 = HDFStore('c:/day/store.h5')
    h5['000875'] = frame
    h5.close()
def quantitative_analysis(df_name, df_seq_col, df_quant_col, func=lambda x: x):
    # Copy a quantitative column from the dataset *df_name* into the master
    # 'DataBases_Summary' table, matching rows by cleaned GlyGly sequence
    # and transforming each value with *func*. Unmatched sequences and NaN
    # values are recorded as '.'. (Python 2 module: print statements,
    # list-returning map.)
    print "Quantitative analysis of ", df_name
    store = HDFStore('_data_/ProteinDataStore.h5')
    summary = store['DataBases_Summary']
    df = store[df_name]
    df = df[[df_seq_col, df_quant_col]]
    # New summary column name: quant column with spaces replaced by '_'.
    renamed_col = '_'.join(df_quant_col.split(' '))
    print "Filling column ", renamed_col
    # Initialise the column with '.' placeholders (meaning "no value").
    summary[renamed_col] = ['.'] * len(summary)
    print "Current summary shape: ", summary.shape
    # Strip everything except capital letters to get the bare sequence.
    seq_list = map(lambda x: re.sub(r'[^A-Z]', '', x), df[df_seq_col].values)
    for i in zip(seq_list, df[df_quant_col].values):
        # Locate the summary row whose cleaned sequence matches; only the
        # first match is used when duplicates exist.
        query = np.where(summary['GlyGly Probabilities'] == i[0])[0]
        if len(query) != 0:
            index = query[0]
        else:
            print "Omitted data: ", i
            continue
        if not np.isnan(i[1]):
            try:
                tmp = func(i[1])
                summary.loc[index, renamed_col] = tmp
            except Exception as e:
                # Transformation failure is logged and the row skipped.
                print i
                print e.message
        else:
            summary.loc[index, renamed_col] = '.'
    # Persist the enriched summary back under the same key.
    store['DataBases_Summary'] = summary
    store.close()
def to_hdf5(self, fname, complevel=9, complib='bzip2'):
    """Persist every attribute named in ``self._store_attributes`` to *fname*.

    Args:
        fname: target HDF5 file path; silently overwritten if present
            (a warning is logged first).
        complevel: compression level passed to HDFStore (default 9).
        complib: compression library passed to HDFStore (default 'bzip2').
    """
    if os.path.exists(fname):
        logger.warning('Overwrite %s with current history', fname)
    history_store = HDFStore(fname, mode='w', complevel=complevel, complib=complib)
    # FIX: close the store even if serialising one attribute raises;
    # the original leaked the open file handle on failure.
    try:
        for attribute in self._store_attributes:
            history_store[attribute] = getattr(self, attribute)
    finally:
        history_store.close()
def update_exchanges():
    """ Updates data for exchanges such as NYSE """
    ####### LOAD DATE RANGES AND SYMBOLS
    start_date = Config(CFG).get('Exchange Data Start Date', 'default_start_date')
    end_date = datetime.datetime.now().strftime('%Y-%m-%d')
    # NOTE(review): this wraps the config value in a one-element list, so the
    # loop below runs once with whatever .get returns -- confirm intended.
    symbols = [Config(CFG).get('Symbol List', 'list')]
    ####### BACKUP and UPDATE DB
    filename = Config(CFG).get("DB Locations", 'exchange_data')
    backup = Config(CFG).get("DB Locations", 'exchange_data_backup')
    file_update_backup(filename, backup)
    ####### START HDF5 INSTANCE
    operator = HDFStore(filename)
    # FIX: close the store even when a download or merge fails; the
    # original leaked the handle on any exception in the loop.
    try:
        for symbol in symbols:
            ####### PULL YAHOO FINANCE DATA
            data = get_daily_history(symbol, start_date, end_date)
            ####### PULL ADVANCES/DECLINES DATA
            data = data.merge(update_unicorn(symbol), left_index=True,
                              right_index=True, how='outer')
            ####### SAVE DATA TO HDF5
            operator[symbol] = data
    finally:
        operator.close()
def save(self, filename, force=False):
    """Save the trials and samples arrays from the current importer
    instance to a dataset inside an lzo-compressed hdf5 file for later
    use.

    Args:
        filename (string): target file name; resolved under self.importpath.

    Optional Args:
        force (boolean): whether or not to overwrite an existing file
            (default: False).

    Raises:
        FileExistsError: if the target file exists and force is False.
    """
    # Create the import directory on first use; ignore "already exists".
    try:
        mkdir(self.importpath)
    except FileExistsError:
        pass
    filename = path.join(self.importpath, filename)
    if path.exists(filename):
        if force:
            unlink(filename)
        else:
            raise FileExistsError('Import file "' + filename + '" already exists.')
    # Sort the dataset before persisting so the on-disk order is canonical.
    self.__sort()
    # NOTE(review): the original docstring said "lzf", but the store is
    # created with complib='lzo'; documented as lzo above.
    self.store = HDFStore(filename, complib='lzo')
    self.store['samples'] = self.ds.samples
    self.store['targets'] = self.ds.targets
    self.store.close()
def drop_with_low_probability(storename, df_name, loc_probability_colname, threshold=0.95):
    # Drop rows whose localisation probability is below *threshold* and
    # write the filtered frame back to the store under the same key.
    # A None column name skips filtering but still rewrites the frame.
    # (Python 2 module: print statement.)
    print 'Filtering by low probability in', df_name
    store = HDFStore(storename)
    df = store[df_name]
    if loc_probability_colname is not None:
        df = df[df[loc_probability_colname] >= threshold]
    store[df_name] = df
    store.close()
def reindex_summary():
    # Replace the summary table's index with a fresh 0..N-1 range index
    # and persist it back to the HDF store. (Python 2 module.)
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    range_index = [x for x in np.arange(len(data_summary))]
    print "Reindexing..."
    data_summary = data_summary.set_index([range_index])
    store['DataBases_Summary'] = data_summary
    store.close()
def test_legacy_read(self):
    """Smoke test: every known key in legacy.h5 can be read back."""
    store = HDFStore(os.path.join(curpath(), 'legacy.h5'), 'r')
    for key in ['a', 'b', 'c', 'd']:
        store[key]
    store.close()
def _check_roundtrip(self, obj, comparator):
    """Store *obj*, read it back and compare with *comparator*; always clean up."""
    scratch = HDFStore(self.scratchpath, 'w')
    try:
        scratch['obj'] = obj
        comparator(scratch['obj'], obj)
    finally:
        scratch.close()
        os.remove(self.scratchpath)
def colorful_dump_summary_to_excel(output_filename, range_label='L1:U36229'):
    # Export 'DataBases_Summary' to <output_filename>.xlsx and colour-band
    # the cells in *range_label*:
    #   < -2 dark green | -2..-1 light green | -1..1 yellow | 1..2 orange | > 2 red
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    writer = ExcelWriter(output_filename + '.xlsx', engine='xlsxwriter')
    data_summary.to_excel(writer, 'DataBases_Summary', index=True)
    workbook = writer.book
    worksheet = writer.sheets['DataBases_Summary']
    # using pallete http://www.colourlovers.com/palette/3687876/
    blue = workbook.add_format({'bg_color': '#69D2E7', 'font_color': '#000000'})
    coral = workbook.add_format({'bg_color': '#A7DBD8', 'font_color': '#000000'})
    yellow = workbook.add_format({'bg_color': '#EAE319', 'font_color': '#000000'})
    orange = workbook.add_format({'bg_color': '#FA6900', 'font_color': '#000000'})
    red = workbook.add_format({'bg_color': '#E2434B', 'font_color': '#000000'})
    # empty = workbook.add_format({'bg_color': '#FFFFFF', 'font_color': '#000000'})
    # worksheet.conditional_format(range_label, {'type': 'text',
    #                                            'criteria': 'begins with',
    #                                            'value': '.',
    #                                            'format': empty})
    worksheet.conditional_format(range_label, {'type': 'cell', 'criteria': '<',
                                               'value': -2, 'format': blue})
    worksheet.conditional_format(range_label, {'type': 'cell', 'criteria': 'between',
                                               'minimum': -2, 'maximum': -1, 'format': coral})
    worksheet.conditional_format(range_label, {'type': 'cell', 'criteria': 'between',
                                               'minimum': -1, 'maximum': 1, 'format': yellow})
    worksheet.conditional_format(range_label, {'type': 'cell', 'criteria': 'between',
                                               'minimum': 1, 'maximum': 2, 'format': orange})
    worksheet.conditional_format(range_label, {'type': 'cell', 'criteria': '>',
                                               'value': 2, 'format': red})
    writer.save()
    store.close()
def parse_one_and_save(input_file, output_store_name):
    # Parse the 'All sites' sheet of one Excel file (skipping the header
    # row) and store the frame under the file's base name.
    # (Python 2 module: print statement.)
    sheet_name = 'All sites'
    skip_rows = [0]
    store = HDFStore(output_store_name)
    df = pd.ExcelFile(input_file).parse(sheetname=sheet_name, skiprows=skip_rows)
    # Key = second path component without extension -- assumes paths like
    # 'dir/name.xlsx'; TODO confirm against callers.
    name = (input_file.split('/')[1]).split('.')[0]
    print "Parsing ", name
    store[name] = df
    store.close()
def _check_roundtrip_table(self, obj, comparator):
    """Round-trip *obj* through a table-format store and compare the sorted result."""
    scratch = HDFStore(self.scratchpath, 'w')
    try:
        scratch.put('obj', obj, table=True)
        comparator(scratch['obj'], _test_sort(obj))
    finally:
        scratch.close()
        os.remove(self.scratchpath)
def load(self, format='csv'):
    """Load ``self.frame`` from the saved file.

    Args:
        format: 'csv' (default) or 'hdf'; any other value is a no-op,
            matching the original behaviour.
    """
    savefile = self.__savefile()
    if format == "csv":
        # BUG FIX: from_csv returns a new DataFrame; the original called it
        # and discarded the result, so nothing was ever loaded.
        self.frame = self.frame.from_csv(savefile + ".csv")
    elif format == "hdf":
        store = HDFStore(savefile + ".hdf")
        try:
            self.frame = store['data']
        finally:
            store.close()
def test_walk_groups(self):
    """walk_groups() must report each group path with its sub-groups and frames."""
    with tm.ensure_clean('walk_groups.hdf') as filename:
        # FIX: the original never closed the store, leaking the handle
        # whenever an assertion failed; the context manager guarantees it.
        with HDFStore(filename, 'w') as store:
            dfs = {
                'df1': pd.DataFrame([1, 2, 3]),
                'df2': pd.DataFrame([4, 5, 6]),
                'df3': pd.DataFrame([6, 7, 8]),
                'df4': pd.DataFrame([9, 10, 11]),
            }
            store.put('/first_group/df1', dfs['df1'])
            store.put('/first_group/df2', dfs['df2'])
            store.put('/second_group/df3', dfs['df3'])
            store.put('/second_group/third_group/df4', dfs['df4'])
            # Expected (sub-groups, frames) per path.
            expect = {
                '/': ({'first_group', 'second_group'}, set()),
                '/first_group': (set(), {'df1', 'df2'}),
                '/second_group': ({'third_group'}, {'df3'}),
                '/second_group/third_group': (set(), {'df4'}),
            }
            for path, groups, frames in store.walk_groups():
                self.assertIn(path, expect)
                expect_groups, expect_frames = expect[path]
                self.assertEqual(expect_groups, set(groups))
                self.assertEqual(expect_frames, set(frames))
                for frame in frames:
                    frame_path = '/'.join([path, frame])
                    # FIX: assertTrue replaces the deprecated assert_.
                    self.assertTrue(store.get(frame_path).equals(dfs[frame]))
def parse_list_and_save(list_of_files, output_store_name):
    # Parse the 'All sites' sheet of every Excel file in *list_of_files*
    # (skipping the header row) and store each frame under the file's base
    # name. (Python 2 module: print statement.)
    sheet_name = 'All sites'
    skip_rows = [0]
    store = HDFStore(output_store_name)
    for _file_ in list_of_files:
        df = pd.ExcelFile(_file_).parse(sheetname=sheet_name, skiprows=skip_rows)
        # Key = third path component without extension -- assumes paths
        # like 'a/b/name.xlsx'; TODO confirm against callers.
        name = (_file_.split('/')[2]).split('.')[0]
        print "Parsing ", name
        store[name] = df
    store.close()
def test_store_index_name(self):
    """The index name must survive a store/load round trip."""
    frame = tm.makeDataFrame()
    frame.index.name = 'foo'
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = frame
        assert (store['frame'].index.name == 'foo')
    finally:
        store.close()
        os.remove(self.scratchpath)
def load_exchange_data(symbol):
    """ Returns data for a specific exchange """
    db_path = Config(CFG).get("DB Locations", 'exchange_data')
    handle = HDFStore(db_path)
    frame = handle[symbol]
    handle.close()
    return frame
def test_store_index_name(self):
    """Round-trip a frame and check the index name is preserved."""
    original = tm.makeDataFrame()
    original.index.name = 'foo'
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = original
        loaded = store['frame']
        assert loaded.index.name == 'foo'
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_legacy_table_write(self):
    """Appending to legacy table files must raise."""
    # legacy table types
    frame = tm.makeDataFrame()
    panel = tm.makePanel()
    store = HDFStore(os.path.join(curpath(), 'legacy_table.h5'), 'a')
    self.assertRaises(Exception, store.append, 'df1', frame)
    self.assertRaises(Exception, store.append, 'wp1', panel)
    store.close()
def test_fixed_offset_tz(self):
    """A fixed-offset timezone index must survive the store round trip."""
    idx = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')
    original = DataFrame(np.random.randn(len(idx), 4), index=idx)
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = original
        loaded = store['frame']
        self.assert_(loaded.index.equals(idx))
        self.assertEquals(idx.tz, loaded.index.tz)
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_store_series_name(self):
    """A Series' name must survive the store round trip."""
    series = tm.makeDataFrame()['A']
    try:
        store = HDFStore(self.scratchpath)
        store['series'] = series
        assert (store['series'].name == 'A')
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_timezones(self):
    """A tz-aware index (US/Eastern) must survive the store round trip."""
    idx = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
    original = DataFrame(np.random.randn(len(idx), 4), index=idx)
    try:
        store = HDFStore(self.scratchpath)
        store['frame'] = original
        loaded = store['frame']
        self.assert_(loaded.index.equals(idx))
        self.assertEquals(idx.tz, loaded.index.tz)
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_context(setup_path):
    """HDFStore works as a context manager: exceptions propagate, writes stick."""
    # An exception raised inside the with-block must escape to the caller.
    with tm.ensure_clean(setup_path) as path:
        try:
            with HDFStore(path) as store:
                raise ValueError("blah")
        except ValueError:
            pass
    # Normal use: write a frame and read it back inside the context.
    with tm.ensure_clean(setup_path) as path:
        with HDFStore(path) as store:
            store["a"] = tm.makeDataFrame()
            assert len(store) == 1
            assert type(store["a"]) == DataFrame
def test_store_series_name(self):
    """Storing and re-loading a Series must preserve its name."""
    frame = tm.makeDataFrame()
    column = frame['A']
    try:
        store = HDFStore(self.scratchpath)
        store['series'] = column
        loaded = store['series']
        assert loaded.name == 'A'
    finally:
        store.close()
        os.remove(self.scratchpath)
def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
    """Round-trip *obj* (optionally compressed) and compare via *comparator*."""
    options = {'complib': _default_compressor} if compression else {}
    store = HDFStore(self.scratchpath, 'w', **options)
    try:
        store['obj'] = obj
        comparator(store['obj'], obj, **kwargs)
    finally:
        store.close()
        os.remove(self.scratchpath)
def hdfWrite(self, path, excode, symbol, indata, kind1, kind2, kind3):
    # Append or create one dataset in the HDF5 store at *path*.
    #   kind1: 'Rawdata', 'Stitch' or 'Indicator'
    #   kind2: series id ('00'/'01') or indicator name
    #   kind3: bar frequency ('1d','60m','30m','15m','5m','1m') or params
    # Write raw data:     kind1='Rawdata',   kind2=None, kind3='1d'
    # Write stitch rule:  kind1='Stitch',    kind2='00', kind3=None
    # Write stitch data:  kind1='Stitch',    kind2='00', kind3='1d'
    # Write indicator:    kind1='Indicator', kind2=name, kind3=params
    store = HDFStore(path, mode='a')
    if kind1 == EXT_Rawdata:
        key = '/'.join([kind1, excode, symbol, kind3])
    elif kind1 == EXT_Stitch:
        # Rule node when no frequency is given, otherwise the period node.
        key = '/'.join([kind1, excode, symbol, EXT_Rule, kind2
                        ]) if kind3 == None else '/'.join([
                            kind1, excode, symbol, EXT_Period, kind3, kind2
                        ])
    elif kind1 == EXT_Indicator:
        key = '/'.join([kind1, excode, symbol, kind2])
    else:
        print("kind not supported")
        return
    if kind1 == EXT_Indicator:
        # Indicator parameters are kept as an HDF5 attribute via h5py,
        # alongside the pandas-managed dataset.
        f = h5py.File(path, 'a')
        try:
            store[key]
        except KeyError:  # create when the key does not exist yet
            store[key] = indata
            f[key].attrs['Params'] = kind3
        else:
            if f[key].attrs['Params'] == kind3:  # params match: merge new rows
                adddata = indata[~indata.index.isin(store[key].index)]
                store.append(key, adddata)
            else:  # params differ: overwrite dataset and attribute
                store[key] = indata
                f[key].attrs['Params'] = kind3
        f.close()
        store.close()
    else:
        try:
            store[key]
        except KeyError:
            store[key] = indata
        else:
            # Append only rows whose index is not already stored.
            adddata = indata[~indata.index.isin(store[key].index)]
            if kind2 in [EXT_Series_00, EXT_Series_01]:
                # Re-base the adjustment factor so the new chunk is
                # continuous with the last stored factor.
                adddata[EXT_Out_AdjFactor] = adddata[
                    EXT_Out_AdjFactor] * store[key][EXT_Out_AdjFactor].iloc[
                        -1] / adddata[EXT_Out_AdjFactor].iloc[0]
            store.append(key, adddata)
        store.close()
def hload(self, fname):
    """Load this container from an HDF5 file written by ``hsave``.

    Clears the current contents, restores every stored entry under its
    key (leading '/' stripped), and restores ``self._missing`` from the
    special '_MISSING' node.
    """
    from pandas.io.pytables import HDFStore
    store = HDFStore(fname, mode='r')
    self.clear()
    read = []
    for k in store.keys():
        # BUG FIX: HDFStore.keys() returns paths with a leading '/'
        # (which is why the name is re.sub'd below), so the original
        # pattern '^_MISSING' could never match. Accept an optional
        # leading slash so the sentinel node is recognised either way.
        if re.match('^/?_MISSING', k):
            v = store.get(k).to_dict().values()
            self._missing = v
            continue
        name = re.sub('^/', '', k)
        self[name] = store[k]
        read.append(name)
    store.close()
def hsave(self, fname):
    """Write every public entry of this container to *fname* as an HDF5 store."""
    from pandas.io.pytables import HDFStore
    LOGGER.debug('Saving HDF in %s\n%s', fname, self.report())
    store = HDFStore(fname, mode='w')
    for key, value in self.items():
        # Entries whose name starts with a double underscore are private.
        if re.match('^__', key):
            continue
        # Plain arrays are wrapped as Series so HDFStore can persist them.
        if isinstance(value, np.ndarray):
            value = Series(value)
        LOGGER.debug('Saving HDF for %s', key)
        store.put(key, value)
    if self._missing:
        store['_MISSING'] = Series(self._missing)
    store.close()
def _check_roundtrip_table(self, obj, comparator, compression=False):
    """Round-trip *obj* through a table store (optionally compressed) and compare."""
    options = {'complib': _default_compressor} if compression else {}
    store = HDFStore(self.scratchpath, 'w', **options)
    try:
        store.put('obj', obj, table=True)
        comparator(store['obj'], _test_sort(obj))
    finally:
        store.close()
        os.remove(self.scratchpath)
def load_historical_data(start=datetime(2010, 1, 1), end=datetime.today(), filename='stock_data.h5'):
    # Download Yahoo Finance history for every ticker in companylist.csv
    # and store each frame under its ticker. (Python 2 module.)
    # NOTE(review): end=datetime.today() is evaluated once at import time,
    # not per call -- confirm that is intended.
    store = HDFStore(filename)
    with open('companylist.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in spamreader:
            print row[0]
            try:
                stock_info = web.DataReader(row[0], "yahoo", start, end)
                store[row[0]] = stock_info
            except:
                # NOTE(review): bare except -- deliberate best-effort
                # download; failures are only printed, never raised.
                print "Error on", row[0]
    store.close()
def test_legacy_table_read(self):
    """Legacy table keys must be selectable."""
    # legacy table types
    store = HDFStore(os.path.join(curpath(), 'legacy_table.h5'), 'r')
    for key in ['df1', 'df2', 'wp1']:
        store.select(key)
    store.close()
def test_wide_table_dups(self):
    """Appending the same panel twice must still reconstruct the original."""
    panel = tm.makePanel()
    try:
        store = HDFStore(self.scratchpath)
        store._quiet = True  # silence duplicate warnings during the test
        store.put('panel', panel, table=True)
        store.put('panel', panel, table=True, append=True)
        tm.assert_panel_equal(store['panel'], panel)
    finally:
        store.close()
        os.remove(self.scratchpath)
def pandas_roundtrip(filename, dma1, dma2):
    """Write *dma1*/*dma2* to an HDFStore at *filename* and read them back.

    Returns the round-tripped (dma1, dma2) pair. The original discarded
    the re-read values and never closed the store; the handle is now
    released even on failure, and the values are returned so callers can
    actually inspect the round trip.
    """
    from pandas.io.pytables import HDFStore
    store = HDFStore(filename)
    try:
        store['dma1'] = dma1
        store['dma2'] = dma2
        dma1 = store['dma1']
        dma2 = store['dma2']
    finally:
        store.close()
    return dma1, dma2
def read_archive(hdf_path, items=('train_x', 'valid_x', 'test_x', 'train_y', 'valid_y', 'test_y')):
    '''
    convenience function used for retrieving data within a hdf archive

    Args:
        hdf_path (str): fullpath of file which data is stored in
        items opt(list): items to be retrieved
            default: ('train_x', 'valid_x', 'test_x', 'train_y', 'valid_y', 'test_y')

    Returns:
        list of the stored objects, in the order of *items*.
    '''
    hdf = HDFStore(hdf_path)
    try:
        # BUG FIX: the original returned map(...) and then closed the store;
        # under Python 3 map() is lazy, so items were read from an already
        # closed store. Materialise the list before closing. (Also: the
        # default is now an immutable tuple instead of a shared list.)
        output = [hdf[x] for x in items]
    finally:
        hdf.close()
    return output
def dump_summary_to_excel(output_filename):
    """Export the 'DataBases_Summary' table to <output_filename>.xlsx."""
    # Save to XLSX
    store = HDFStore('_data_/ProteinDataStore.h5')
    try:
        data_summary = store['DataBases_Summary']
    finally:
        # FIX: the original never closed the store (compare
        # colorful_dump_summary_to_excel, which does).
        store.close()
    writer = ExcelWriter(output_filename + '.xlsx', engine='xlsxwriter')
    data_summary.to_excel(writer, 'DataBases_Summary', index=True)
    writer.save()
def maybe_load():
    # Return {'df_lines': ..., 'df_match_results': ...}, preferring the
    # local HDF cache 'BRefDB.h5' and falling back to SQL Server for any
    # frame not yet cached. Freshly queried frames are NOT written back
    # here -- see save_to_store. Each scheme entry is
    # [index_column, select_statement].
    data_scheme = {'df_lines': ['line_ID', '''select line_ID, house_ref, match_ref, TS_ref, line_value, line_increment, snapshot_time, is_it_starting, RTV_ref, time_increment from Lines where RTV_ref in (1, 2, 3)'''],
                   'df_match_results': ['MR_ID', '''select MR_ID, RTV_ref, match_ref, actual_value, text_result from Match_results where RTV_ref in (1, 2, 3)'''],
                   }
    loaded_data = {}
    from pandas.io.pytables import HDFStore
    with HDFStore('BRefDB.h5') as store:
        for df_name in data_scheme:
            if df_name in store:
                loaded_data[df_name] = store[df_name]
            else:
                import pymssql
                # NOTE(review): server name and credentials are hard-coded;
                # consider moving them to configuration.
                with pymssql.connect('.\\SQLEXPRESS', 'BB_miner', 'BB_3817_miner', "BRefDB") as conn:
                    loaded_data[df_name] = pd.read_sql(data_scheme[df_name][1], conn, index_col=data_scheme[df_name][0])
    return loaded_data
def save_to_store(loaded_data):
    # Persist every frame in *loaded_data* to the local HDF cache,
    # keyed by its name. (Python 2 module: print statement.)
    from pandas.io.pytables import HDFStore
    with HDFStore('BRefDB.h5') as store:
        for df_name in loaded_data:
            store[df_name] = loaded_data[df_name]
            print df_name + ' saved'
def test_hdfstore_iteritems_deprecated(setup_path):
    """HDFStore.iteritems() must emit a FutureWarning when consumed."""
    with ensure_clean_path(setup_path) as path:
        frame = DataFrame({"a": [1]})
        with HDFStore(path, mode="w") as store:
            store.put("table", frame)
            with tm.assert_produces_warning(FutureWarning):
                next(store.iteritems())
def make_summary(newcols):
    """Build the master 'DataBases_Summary' table from the base dataset
    and persist it to the HDF store. (Python 2 module.)

    :param newcols: column names in the main summary table
    :return: none
    """
    print "Making summary..."
    # open store and read base dataframe
    store = HDFStore('_data_/ProteinDataStore.h5')
    df1 = store['Mol_Cell_Proteomics_2011_Epub_2011_September1Supp2']
    # clean sequences: keep position, the cleared sequence, and the
    # capital-letters-only form used as the join key elsewhere
    LEN = len(df1)
    positions = [0] * LEN
    real_glygly = [0] * LEN
    clean_glygly = [0] * LEN
    for i in np.arange(LEN):
        positions[i] = df1['Position'].values[i]
        real_glygly[i] = clear_sequence(df1['GlyGly (K) Probabilities'].values[i])
        clean_glygly[i] = re.sub(r'[^A-Z]', '', real_glygly[i])
    # align with SwissProt Human and Rodents using blastp
    blastpID_HUMAN, blastpID_RODENTS = fetch_indentity_from_local_batch(clean_glygly)
    del df1
    # Sanity check: all parallel lists must have equal length.
    print "Length test", len(positions) == len(real_glygly) == len(clean_glygly) == len(blastpID_HUMAN) == len(
        blastpID_RODENTS)
    # convert to pandas series
    clean_glygly = pd.Series(clean_glygly)
    blastpID_HUMAN = pd.Series(blastpID_HUMAN)
    blastpID_RODENTS = pd.Series(blastpID_RODENTS)
    # Create empty dataframe
    data_summary = pd.DataFrame(columns=newcols)
    # Combine everything required in dataframe
    data_summary['Position'] = positions
    data_summary['GlyGly (K) Probabilities'] = real_glygly
    data_summary['GlyGly Probabilities'] = clean_glygly
    data_summary['SP_ID_BLASTP_HUMAN'] = blastpID_HUMAN
    data_summary['SP_ID_BLASTP_RODENTS'] = blastpID_RODENTS
    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
def hdfRead(self, path, excode, symbol, kind1, kind2, kind3, startdate=EXT_Start, enddate=EXT_End, is_stitch=True):
    # Read one dataset from the HDF5 store at *path*, sliced to
    # [startdate, enddate] on the first (datetime) index level.
    #   kind1: 'Rawdata', 'Stitch' or 'Indicator'
    #   kind2: series id ('00'/'01') or indicator name
    #   kind3: bar frequency ('1d','60m','30m','15m','5m','1m') or None
    # Read raw data:     kind1='Rawdata',   kind2=None, kind3='1d'
    # Read stitch rule:  kind1='Stitch',    kind2='00', kind3=None
    # Read stitch data:  kind1='Stitch',    kind2='00', kind3='1d'
    # Read indicator:    kind1='Indicator', kind2=name, kind3=None
    #   (indicators additionally return their stored 'Params' attribute)
    store = HDFStore(path, mode='r')
    if kind1 == EXT_Rawdata:
        key = '/'.join([kind1, excode, symbol, kind3])
    elif kind1 == EXT_Stitch:
        # Rule node when no frequency is given, otherwise the period node.
        key = '/'.join([kind1, excode, symbol, EXT_Rule, kind2
                        ]) if kind3 == None else '/'.join([
                            kind1, excode, symbol, EXT_Period, kind3, kind2
                        ])
    elif kind1 == EXT_Indicator:
        key = '/'.join([kind1, excode, symbol, kind2])
    else:
        print("kind not supported")
        return
    # NOTE(review): DataFrame.ix was removed in pandas 1.0 -- this code
    # requires an older pandas version.
    data = store[key].ix[(
        (store[key].index.get_level_values(0) >= pd.to_datetime(startdate))
        & (store[key].index.get_level_values(0) <= pd.to_datetime(enddate))
    ), :]
    if kind1 == EXT_Stitch and is_stitch == True and kind3 != None:
        # Apply the adjustment factor to OHLC columns for stitched series.
        data[EXT_Bar_Open] = data[EXT_AdjFactor] * data[EXT_Bar_Open]
        data[EXT_Bar_High] = data[EXT_AdjFactor] * data[EXT_Bar_High]
        data[EXT_Bar_Low] = data[EXT_AdjFactor] * data[EXT_Bar_Low]
        data[EXT_Bar_Close] = data[EXT_AdjFactor] * data[EXT_Bar_Close]
    store.close()
    if kind1 == EXT_Indicator:
        # Indicator parameters live in an HDF5 attribute, read via h5py.
        f = h5py.File(path, 'r')
        params = f[key].attrs['Params']
        f.close()
        return data, params
    return data
def pandas_roundtrip(filename, dma1, dma2):
    """Round-trip *dma1*/*dma2* through an HDFStore at *filename*.

    Returns the re-read (dma1, dma2) pair. The original discarded the
    re-read values (they were only rebound locally) and never closed the
    store; the handle is now released even on failure and the values are
    returned to the caller.
    """
    from pandas.io.pytables import HDFStore
    store = HDFStore(filename)
    try:
        store['dma1'] = dma1
        store['dma2'] = dma2
        dma1 = store['dma1']
        dma2 = store['dma2']
    finally:
        store.close()
    return dma1, dma2
def _exists(self) -> bool:
    """Return True if this dataset's key is present in the HDF5 file."""
    load_path = Path(self._get_load_path())
    if not load_path.is_file():
        return False
    # Store keys always carry a leading slash; normalise ours to match.
    normalized_key = self._key if self._key.startswith("/") else "/" + self._key
    with HDFStore(load_path, mode="r") as hdfstore:
        return normalized_key in hdfstore.keys()
def analyze_existence(storename_to_append, gly_gly_seq_colname):
    # Add a 0/1 column to the summary table marking whether each GlyGly
    # sequence also occurs in the dataset stored under
    # *storename_to_append*. (Python 2 module: print statements and a
    # list-returning map.)
    print "Analyzing occurence in ", storename_to_append
    store = HDFStore('_data_/ProteinDataStore.h5')
    data_summary = store['DataBases_Summary']
    tmp_store_sequences = store[storename_to_append][gly_gly_seq_colname].values
    tmp_store_sequences = map(clear_sequence, tmp_store_sequences)
    # Make binary vector which represents existence
    # of the sequence in storename_to_append dataset
    existense_index = data_summary['GlyGly (K) Probabilities'].isin(tmp_store_sequences).values
    existense_index = np.asarray(existense_index, dtype=int)
    # Create new column in summary table, named after the source dataset.
    data_summary[storename_to_append] = existense_index
    # Print the number of matches for a quick sanity check.
    print np.sum(data_summary[storename_to_append])
    # Save to HDF store
    store['DataBases_Summary'] = data_summary
    store.close()
def test_legacy_table_read(self):
    """Legacy table formats remain readable (with known incompatibilities)."""
    # legacy table types
    store = HDFStore(os.path.join(curpath(), 'legacy_table.h5'), 'r')
    for key in ['df1', 'df2', 'wp1']:
        store.select(key)
    # force the frame
    store.select('df2', typ='legacy_frame')
    # old version (this still throws an exception though)
    import warnings
    warnings.filterwarnings('ignore', category=IncompatibilityWarning)
    self.assertRaises(Exception, store.select, 'wp1', Term('minor_axis', '=', 'B'))
    warnings.filterwarnings('always', category=IncompatibilityWarning)
    store.close()