def load_quotes_from_hdf_store(store, place_code, pcode, from_time=None, to_time=None):
    """Load quotes for one instrument from an HDF store.

    Parameters
    ----------
    store : pd.HDFStore (or compatible) holding one table per instrument.
    place_code : str -- exchange/venue code; first component of the table key.
    pcode : str -- instrument code; second component of the table key.
    from_time, to_time : optional inclusive time bounds on the table index;
        anything ``pd.Timestamp`` accepts.

    Returns
    -------
    Whatever ``store.select`` returns for the key (a DataFrame for table stores).
    """
    key = place_code + '/' + pcode
    # pd.Term(field, op, value) was removed from pandas; plain expression
    # strings are the supported way to build a where clause.
    cond = []
    if from_time is not None:
        cond.append("index >= '%s'" % pd.Timestamp(from_time))
    if to_time is not None:
        cond.append("index <= '%s'" % pd.Timestamp(to_time))
    # Pass None instead of an empty list so unbounded loads keep working.
    return store.select(key, cond or None)
def _merge_data(self):
    """Pull this security's rows from the CRSP daily stock file for both the
    estimation period and the event window, then join the daily factor data
    onto each.

    Reads: ``self._id`` (PERMNO), ``self.est_period`` and ``self.evt_window``
    (both must expose ``.tolist()``).
    Writes: ``self.est_data``, ``self.evt_data``, ``self._has_data``.
    Uses the module-level ``crsp`` HDF store and ``DAILY_FACTORS`` frame.

    NOTE(review): pd.Term(field, op, value) was removed in modern pandas;
    this code requires an old pandas (string where-clauses are the current
    form).
    """
    # A list value in the old Term API meant membership: DATE in the list.
    self.est_data = crsp.select('/crsp/dsf', where=[
        pd.Term('PERMNO', '=', self._id),
        pd.Term('DATE', '=', self.est_period.tolist())
    ])
    self.evt_data = crsp.select('/crsp/dsf', where=[
        pd.Term('PERMNO', '=', self._id),
        pd.Term('DATE', '=', self.evt_window.tolist())
    ])
    # Drop the first index level (presumably PERMNO -- single security here,
    # TODO confirm /crsp/dsf index layout) so the join aligns on date.
    self.est_data = self.est_data.reset_index(level=0).join(DAILY_FACTORS)
    self.evt_data = self.evt_data.reset_index(level=0).join(DAILY_FACTORS)
    self._has_data = True
def getCentroid(self, targetdict, valcol, topN=20):
    """Compute a centroid for the stored rows matching ``targetdict``.

    targetdict -- dict of column -> value used as equality filters on the
        stored table (a list value means membership, per the old pd.Term API).
    valcol -- name of the value column aggregated in the pivots.
    topN -- number of longest column-vectors kept for the centroid.

    Returns the column-wise sum of the re-pivoted selection (a Series), or
    None when the first selection yields nothing to rank.

    NOTE(review): Python 2 code (``iteritems``, ``dict.keys() + list``);
    also relies on the old pd.Term API removed from modern pandas.
    """
    # get the rows.
    query = [pd.Term(key, "=", value) for key, value in targetdict.iteritems()]
    selectdf = self.store.select(self.tablename, where=query)
    # pivot the table: every column that is neither a filter key nor the
    # value column becomes part of the pivot's column index.
    colindex = [x for x in selectdf.columns if x not in targetdict.keys() + [valcol]]
    pivotdf = selectdf.pivot_table(valcol, targetdict.keys(), colindex).fillna(0)
    ## unit-vectorize the pivot table. The *columns* are treated as the vectors.
    #for col in pivotdf.columns:
    #    pivotdf[col] = nla.norm(pivotdf[col])
    # The columns are the vectors. Return the topN columnheads by length.
    pivotdf = pivotdf.apply(lambda x: nla.norm(x))
    # FIXED: sort is deprecated
    # pivotdf.sort(ascending=False)
    pivotdf.sort_values(inplace=True, ascending=False)
    topcols = list(pivotdf[:topN].index)
    if not topcols:
        return None
    # Escape double quotes so the labels are safe inside a query expression.
    topcols = [x.replace('"', "\\\"") for x in topcols]
    #print >>sys.stderr, pivotdf
    # Now use the topN to get relevant rows for the colindex.
    query = [pd.Term(colindex[n], "=", topcols) for n in range(0, len(colindex))]
    #print >>sys.stderr, repr(query)
    # Now create the centroid based on the colindex.
    selectdf = self.store.select(self.tablename, where=query)
    pivotdf = selectdf.pivot_table(valcol, colindex, sorted(targetdict.keys())).fillna(0)
    return pivotdf.sum()
def getRow(self, rowname, rowval, valcol):
    """Fetch the stored rows where column *rowname* equals *rowval*, pivot
    the remaining columns against it, and return the column-wise sums.

    Returns a Series keyed by the sorted non-key, non-value columns.
    """
    # Expression string replaces pd.Term(field, op, value), which was
    # removed from pandas.  %r quotes strings correctly for the parser.
    query = '%s == %r' % (rowname, rowval)
    selectdf = self.store.select(self.tablename, where=query)
    colindex = [x for x in selectdf.columns if x not in [rowname, valcol]]
    pivotdf = selectdf.pivot_table(valcol, rowname, sorted(colindex)).fillna(0)
    return pivotdf.sum()  # need this to get a Series out.
def get_store_data(mode, chid, term):
    """Load channel data from the per-channel HDF5 sample store.

    Parameters
    ----------
    mode : str -- subdirectory under the rdr20_month_samples root.
    chid : str -- channel id; also the ``.h5`` file name.  Ids containing
        'm' (e.g. m85 variants) are read whole, without filtering.
    term : str -- where-clause expression applied in the filtered case.

    Returns
    -------
    pd.DataFrame with the selected rows.
    """
    store = pd.HDFStore('/raid1/maye/rdr20_month_samples/' + mode + '/' + chid + '.h5')
    try:
        if 'm' not in chid:  # if no m85 in chid, filter rows and columns
            # pd.Term was removed from pandas; a plain expression string is
            # the supported form of a where clause.
            mine = store.select('df', term,
                                columns=['clat', 'clon', 'cloctime', 'tb'])
        else:
            mine = store.select('df')
    finally:
        # Close the store even if select raises (the original leaked it).
        store.close()
    return mine
def data(self, channels=None):
    """Efficiently get data chunks from disk by supplying a column list
    (Note: data must be in table format).

    Keyword Arguments:
    channels=None -- The channels to pull from the data.  When given, a
        column-restricted select is attempted first so only those columns
        are read from disk; fixed-format stores fall back to loading
        everything into memory and clipping.
    """
    if channels is not None:
        try:
            # Table-format stores can restrict columns at read time; the
            # ``columns`` keyword replaces the removed
            # pd.Term('columns', '=', channels) idiom.
            d = self.__signals.select('data', columns=channels)
        except Exception:
            # Fixed/pytables format: load all data into memory and clip.
            # (Was a bare ``except:``, which also swallowed SystemExit and
            # KeyboardInterrupt.)
            d = self.__signals['data'][channels]
    else:
        d = self.__signals['data']
    gc.collect()  # free any full-frame intermediate built above
    return d
def get_var_ib_null_or_nan(variable, condition):  # condition == null or condition == NaN
    """See which categories of the variables etat, qualite and statut the
    individuals with ib equal to 0 fall into, in order to understand what an
    ib of 0 means and to detect possible anomalies.

    Plots (seaborn/matplotlib side effect) the yearly head-count per
    category of ``variable`` for the individuals selected by ``condition``.
    """
    # Individuals -- and the years -- whose ib matches the condition.
    idents_annee = get_df_ib_condition(condition)
    idents = idents_annee['ident'].tolist()
    annees = idents_annee['annee'].tolist()
    # NOTE(review): pd.Term was removed in modern pandas; a string clause is
    # the current form.  start/stop hard-limit the read to rows
    # 800000-999999 -- presumably a sampling shortcut; confirm intent.
    df = pd.read_hdf(
        hdf5_file_path,
        '{}'.format(variable),
        where = [pd.Term("ident", "=", idents)],
        start = 800000,
        stop = 999999,
        )
    # Keep only the years observed for the selected individuals.
    df = df[df['annee'].isin(annees)]
    # Head-count per (year, category) pair.
    df_per_year = df.groupby(['annee', variable]).size().reset_index()
    df_per_year.columns = ['annee', '{}_categorie'.format(variable), '{}_compte'.format(variable)]
    sns.pointplot(x="annee", y="{}_compte".format(variable), hue="{}_categorie".format(variable), data=df_per_year)
    plt.title('Effectifs annuels par categorie de la variable {} pour ib {}'.format(variable, condition))
def read_method(method):
    """Benchmark four ways of fetching the (tradingday, windcode) rows from
    the BRPrice HDF5 file.

    ``tradingday`` and ``windcode`` are module-level globals; each branch
    prints the elapsed time of its lookup and leaves the result in ``res``.
    """
    if method == 1:
        # Load everything, then index with a MultiIndex slice.
        a = pd.read_hdf(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        time_list = [time.time()]
        # cost 0.30s
        res = a.loc[pd.IndexSlice[tradingday, windcode], :]
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    # use idx
    elif method == 2:
        # Filter at read time.  NOTE(review): the single-string pd.Term form
        # was removed from pandas -- modern code passes the string directly
        # as ``where``.  The names tradingday/windcode in the expression are
        # resolved against the calling scope by pandas' query engine.
        a = pd.HDFStore(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        time_list = [time.time()]
        # cost 0.05s
        res = a.select(
            key='Data',
            where=pd.Term("(TradingDay = tradingday)&(WindCode = windcode)"))
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 3:
        # NOTE(review): DataFrame.to_panel (and Panel) were removed in
        # pandas 0.25 -- this branch needs an old pandas.
        a = pd.read_hdf(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        pn = a.to_panel()
        time_list = [time.time()]
        # cost 0.001s
        res = pn.loc[:, tradingday, windcode]
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 4:
        # this method is very slow!!
        a = pd.read_hdf(r'F:\Python_3\MyPython_3\0_Data\Pub\BRPrice.h5')
        dict_frm = a.to_dict(orient='index')
        res = dict_frm[tradingday]
def test_term(self):
    # pd.Term is a deprecated public name: merely constructing one must emit
    # a FutureWarning.  check_stacklevel=False because the warning is raised
    # from inside pandas, not at this call site.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        pd.Term('index>=date')
def get_equal_time_term(self, t):
    """Return a where-clause selecting panel rows whose major_axis equals *t*.

    *t* may be anything ``pd.Timestamp`` accepts.  The clause is returned as
    an expression string -- the supported form since pd.Term(field, op,
    value) was removed from pandas; HDFStore.select accepts it anywhere a
    Term used to be accepted.
    """
    return "major_axis == '%s'" % pd.Timestamp(t)
def get_interval(self, t0, t1):
    """Select rows of table ``self.name`` whose major_axis lies in the
    inclusive interval [t0, t1].

    A falsy *t1* means the single instant *t0*.  Both bounds may be anything
    ``pd.Timestamp`` accepts.  Returns whatever ``self.select`` returns.
    """
    t1 = t1 or t0
    # Expression strings replace the removed pd.Term(field, op, value) API.
    return self.select(
        self.name,
        ["major_axis >= '%s'" % pd.Timestamp(t0),
         "major_axis <= '%s'" % pd.Timestamp(t1)])
def join_col(self, df, add_cols, join_cols=None, join_key=None, join_store=None, join_filter=None, drop_joining_duplicates=True): """ This function is meant to return the input df with add_cols added. These columns are fetched in join_store[join_key] and are aligned to df using join_cols. Note: At the time of this writing, only a restricted case is handled, namely: join_cols has only one element that must be in the index of the store """ join_store = join_store or self.join_store join_key = join_key or self.join_key if isinstance(add_cols, basestring): if add_cols in self.add_from.keys(): if 'join_store' in self.add_from[add_cols].keys(): join_store = join_store or self.add_from[add_cols][ 'join_store'] if 'join_key' in self.add_from[add_cols].keys(): join_key = join_key or self.add_from[add_cols]['join_key'] if 'join_cols' in self.add_from[add_cols].keys(): join_cols = join_cols or self.add_from[add_cols][ 'join_cols'] join_cols = util_ulist.ascertain_list(join_cols) add_cols = util_ulist.ascertain_list(add_cols) # get the df values to join (and see if they're in cols or index) if coll_op.contains(list(df.columns), join_cols): df_join_cols_in_columns = True df_join_col_values = np.unique(df[join_cols]) else: df_join_cols_in_columns = False df_join_col_values = np.unique(list(df.index)) # get necessary information from store store_key_info = self.store_info[join_store] join_key = ascertain_prefix_slash(join_key) store_key_info = store_key_info[join_key] if len(join_cols) == 1 and join_cols[0] == 'index': print "uploading only specific indices for join_df" join_df = self.store[join_store].select( key=join_key, where=[pd.Term('index', df_join_col_values)], columns=add_cols) elif join_cols in store_key_info['column_names']: print "uploading only specific columns for join_df" join_df = self.store[join_store].select( key=join_key, where=[pd.Term(join_cols[0], df_join_col_values)], columns=join_cols + add_cols) join_df.set_index(join_cols[0]) else: print "uploading the 
whole potential join_df" join_df = self.store[join_store].select(key=join_key, columns=join_cols + add_cols) #print join_cols #print add_cols #print join_df.head(10) # drop duplicates if drop_joining_duplicates == True: join_df = join_df.drop_duplicates() if coll_op.contains(list(join_df.columns), join_cols): join_df_cols_in_cols = True else: join_df_cols_in_cols = False #print df_join_cols_in_columns #print join_df_cols_in_cols # join if df_join_cols_in_columns: if join_df_cols_in_cols: return pd.merge(df, join_df, on=join_cols) else: return pd.merge(df, join_df, right_on=join_cols, left_index=True) else: if join_df_cols_in_cols: return pd.merge(df, join_df, right_index=True, left_on=join_cols) else: return pd.merge(df, join_df, right_index=True, left_index=True)
def get_table(self, selection, columns=None, key=None):
    """Select the rows of table *key* where ``self.selection_col`` equals
    *selection*, returning only *columns*.

    *key* and *columns* default to ``self.key`` and ``self.columns``.

    BUG FIX: the original computed the key/columns fallbacks and then
    ignored them, always passing self.key and self.columns to select; the
    explicit arguments now take effect.
    """
    key = key or self.key
    columns = columns or self.columns
    # Expression string replaces the removed pd.Term(field, value) form.
    return self.select(key=key,
                       where='%s == %r' % (self.selection_col, selection),
                       columns=columns)
def or_select_single_var(self, key=None, where=None):
    """Select from table *key* (default ``self.key``) the rows where column
    ``where[0]`` equals value ``where[1]``.

    *where* is a (column, value) pair.  The clause is built as an expression
    string, the supported form since pd.Term was removed from pandas.
    """
    key = key or self.key
    return self.select(key=key, where='%s == %r' % (where[0], where[1]))
def read_method(method):
    """Benchmark several ways of fetching the (tradingday, windcode) slice
    from the BRPrice HDF5 files.

    ``tradingday`` and ``windcode`` are module-level globals; each branch
    prints the elapsed time of its lookup.
    """
    if method == 1:
        # Load everything, then index with a MultiIndex slice.
        a = pd.read_hdf(r'D:\BRPrice_0302.h5')
        time_list = [time.time()]
        # cost 0.30s
        res = a.loc[pd.IndexSlice[tradingday, windcode], :]
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    # use idx
    elif method == 2:
        # Filter at read time.  NOTE(review): the single-string pd.Term form
        # was removed from pandas -- modern code passes the string directly
        # as ``where``; the names in the expression are resolved against the
        # calling scope by pandas' query engine.
        a = pd.HDFStore(r'D:\BRPrice_0302.h5')
        time_list = [time.time()]
        # cost 0.10s
        res = a.select(
            key='Data',
            where=pd.Term("(TradingDay = tradingday)&(WindCode = windcode)"))
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 3:
        # NOTE(review): DataFrame.to_panel (and Panel) were removed in
        # pandas 0.25 -- this branch needs an old pandas.
        a = pd.read_hdf('D:\\New_BRPrice_0302.h5')
        pn = a.to_panel()
        time_list = [time.time()]
        # cost 0.42s
        # print(pn)
        # if tradingday in pn.major_axis:
        #     print(tradingday)
        # if windcode in pn.minor_axis:
        #     print(windcode)
        # else:
        #     print('not exists')
        res = pn.loc[:, tradingday, windcode]
        # print(res)
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    elif method == 4:
        # this method is very slow!!
        a = pd.read_hdf(r'D:\BRPrice_0302.h5')
        dict_frm = a.to_dict(orient='index')
        res = dict_frm[tradingday]
    # Using a str date is slower than a Timestamp  (translated from Chinese)
    elif method == 5:
        a = pd.read_hdf('D:\\New_BRPrice_0302.h5')
        pn = a.to_panel()
        time_list = [time.time()]
        res = pn.loc[:, '2005-01-05', windcode]
        # print(res)
        time_list.append(time.time())
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
    # .ix is very slow  (translated from Chinese)
    elif method == 6:
        time_list = [time.time()]
        dailyQuote = pd.read_hdf(r'D:\BRPrice.h5')
        # # print(store)
        # dailyQuote = store.select('Data'
        # #     [
        # #         Term('InnerCode', '=', 3),
        # #         Term('TradingDay', '>=', startDate),
        # #         Term('TradingDay', '<=', endDate),
        # #         Term('columns', '=', 'Mom')
        # #     ]
        # );
        time_list.append(time.time())
        # dailyQuote = dailyQuote[dailyQuote.index.get_level_values(0) == tradingday]
        time_list.append(time.time())
        # dailyQuote.sort_index(inplace= True)
        # Repeated cross-section lookups to time .xs on the first level.
        for i in range(10000):
            dailyQuote2 = dailyQuote.xs(tradingday)
            # print(i)
            # dailyQuote2 = dailyQuote.loc[tradingday]
        time_list.append(time.time())
        # print(dailyQuote)
        print(
            np.array(time_list[1:len(time_list)]) -
            np.array(time_list[0:len(time_list) - 1]))
        print('-----')
        planBuyList = [
            '000001.SZ', '000002.SZ', '000004.SZ', '000007.SZ', '000010.SZ'
        ]
        for innerCode in planBuyList:
            time_list = [time.time()]
            # entity = dailyQuote.ix[(tradingday,innerCode)]
            # NOTE(review): .ix was removed from pandas -- old pandas only.
            entity = dailyQuote2.ix[innerCode]
            # print(entity)
            time_list.append(time.time())
            print(
                np.array(time_list[1:len(time_list)]) -
                np.array(time_list[0:len(time_list) - 1]))
def getLengths(self, lengths):
    """Fetch the stored rows whose ``self.vecnamecol`` value matches
    *lengths* (a scalar, or a list meaning membership), indexed by that
    column.

    The where clause is built as an expression string -- ``col = [...]``
    means membership for list values -- replacing the removed pd.Term API.
    """
    lengthquery = '%s = %r' % (self.vecnamecol, lengths)
    return self.store.select(self.tablename, where=lengthquery).set_index(self.vecnamecol)