def munge_tables(self, hashval, start_date, end_date):
    """Collect cached rows whose ``date`` lies within the requested window.

    ``cache_spec`` is queried twice: once for entries with
    ``start_date <= start_date`` and once for entries with
    ``end_date <= end_date``.  The row span recorded on each distinct entry
    (``_start_row`` / ``_end_row``) is read from ``self.localpath``; the
    pieces are concatenated, sorted by ``date`` and trimmed to
    ``start_date <= date <= end_date``.

    Args:
        hashval: Kept for interface compatibility; not used by this method.
        start_date: Lower bound of the window (inclusive).
        end_date: Upper bound of the window (inclusive).

    Returns:
        The concatenated, date-sorted frame restricted to the window.
    """
    store = self.store

    starts_before = store_select(
        store, 'cache_spec', where=[('start_date', '<=', start_date)]
    ).reset_index()
    ends_before = store_select(
        store, 'cache_spec', where=[('end_date', '<=', end_date)]
    ).reset_index()

    # An entry can match both queries; keep one row per distinct ``_end_row``
    # so the same span is not read twice.
    spans = starts_before.append(ends_before)
    spans.drop_duplicates('_end_row', inplace=True)
    spans.reset_index(inplace=True)

    pieces = [
        store_select(store, self.localpath, start=start_row, stop=end_row)
        for start_row, end_row in spans[['_start_row', '_end_row']].values
    ]

    combined = pd.concat(pieces)
    # NOTE(review): ``DataFrame.sort`` is the older pandas spelling; newer
    # pandas releases need ``sort_values`` -- confirm the supported version.
    combined.sort(['date'], inplace=True)

    in_window = (combined['date'] >= start_date) & (combined['date'] <= end_date)
    return combined[in_window]
def select(self, query_filter, where=None):
    """Return the cached rows for ``query_filter``, caching them first if needed.

    When ``self.cache_info`` reports nothing for the filter, ``self.cache_data``
    is invoked and the lookup is repeated.  The resulting row bounds are then
    passed as ``start``/``stop`` to ``store_select`` on ``self.localpath``,
    together with the optional ``where`` clause.
    """
    bounds = self.cache_info(query_filter)
    if bounds is None:
        # Nothing recorded yet: populate the cache, then look the span up again.
        self.cache_data(query_filter)
        bounds = self.cache_info(query_filter)

    first_row, last_row = bounds
    return store_select(
        self.store,
        self.localpath,
        where=where,
        start=first_row,
        stop=last_row,
    )
def cache_info(self, query_params):
    """Look up the row span recorded in ``cache_spec`` for ``query_params``.

    The parameters are passed through ``self.parameter_dict`` and the items of
    the result are used as the ``where`` clause.

    Returns:
        ``None`` when the lookup raises ``KeyError``, returns ``None`` or
        returns no rows; otherwise the pair
        ``(result['_start_row'], result['_end_row'])``.
    """
    criteria = self.parameter_dict(query_params).items()
    try:
        spec = store_select(self.store, 'cache_spec', where=criteria)
    except KeyError:
        return None

    if spec is None or spec.shape[0] == 0:
        return None
    return spec['_start_row'], spec['_end_row']
def cache_info(self, query_filter):
    """Look up the row span recorded in ``cache_spec`` for ``query_filter``.

    The filter is reduced to a hash via ``self.gethashval`` and ``cache_spec``
    is queried for entries with that ``hashval``.

    Returns:
        ``None`` when the lookup raises ``KeyError``, returns ``None`` or
        returns no rows; otherwise the pair
        ``(result['_start_row'], result['_end_row'])``.
    """
    digest = self.gethashval(query_filter)
    try:
        # The where clause is spelled as a list of tuples (rewritten for the
        # pandas 0.13 style).
        spec = store_select(self.store, 'cache_spec', where=[('hashval', digest)])
    except KeyError:
        return None

    if spec is None or spec.shape[0] == 0:
        return None
    return spec['_start_row'], spec['_end_row']
def cache_info(self, query_params):
    """Look up the row span recorded in ``cache_spec`` for ``query_params``.

    The parameters are passed through ``self.parameter_dict``, hashed with
    ``gethashval``, and ``cache_spec`` is queried for entries with that
    ``hashval``.

    Returns:
        ``None`` when the lookup raises ``KeyError``, returns ``None`` or
        returns no rows; otherwise the pair
        ``(result['_start_row'], result['_end_row'])``.
    """
    digest = gethashval(self.parameter_dict(query_params))
    try:
        spec = store_select(self.store, 'cache_spec', where=[('hashval', digest)])
    except KeyError:
        return None

    if spec is None or spec.shape[0] == 0:
        return None
    return spec['_start_row'], spec['_end_row']
def select(self, **kwargs):
    """Return the cached rows for the given query parameters.

    Every field named in ``self.cache_discrete_fields`` is coerced to a
    sequence: a value that is not a list, tuple or ``np.ndarray`` (including a
    missing one, i.e. ``None``) is wrapped in a one-element list.  The optional
    ``where`` keyword is removed from the parameters before the cache lookup
    and forwarded to ``store_select``; a falsy ``where`` is sent as ``None``.

    If ``self.cache_info`` reports nothing for the parameters,
    ``self.cache_data`` is called and the lookup is repeated.
    """
    sequence_types = (list, tuple, np.ndarray)
    for name in self.cache_discrete_fields:
        current = kwargs.get(name)
        if not isinstance(current, sequence_types):
            kwargs[name] = [current]

    # Falsy ``where`` values (missing, empty, ...) collapse to None.
    where = kwargs.pop('where', None) or None

    bounds = self.cache_info(kwargs)
    if bounds is None:
        self.cache_data(kwargs)
        bounds = self.cache_info(kwargs)

    first_row, last_row = bounds
    return store_select(
        self.store,
        self.localpath,
        where=where,
        start=first_row,
        stop=last_row,
    )
def _single_select(self, **kwargs):
    """Return the cached rows for one set of query parameters.

    The optional ``where`` keyword is removed from the parameters before the
    cache lookup and forwarded to ``store_select``; a falsy ``where`` is sent
    as ``None``.  If ``self.cache_info`` reports nothing for the parameters,
    ``self.cache_data`` is called and the lookup is repeated.
    """
    # Falsy ``where`` values (missing, empty, ...) collapse to None.
    where = kwargs.pop('where', None) or None

    bounds = self.cache_info(kwargs)
    if bounds is None:
        self.cache_data(kwargs)
        bounds = self.cache_info(kwargs)

    # Each bound is indexed with key 0 to obtain a single row number
    # (original note: "convert these series to ints").
    first_rows, last_rows = bounds
    return store_select(
        self.store,
        self.localpath,
        where=where,
        start=first_rows[0],
        stop=last_rows[0],
    )
def query_min_itemsize(self):
    """Return the stored ``min_itemsize`` entry converted with ``to_dict()``.

    Returns:
        ``None`` when the lookup raises ``KeyError``; otherwise the result of
        calling ``to_dict()`` on what ``store_select`` returned.
    """
    try:
        table = store_select(self.store, 'min_itemsize')
    except KeyError:
        return None
    else:
        return table.to_dict()