def run_step(
    self, run_number: int, step_size: int, howlong: float
) -> ReturnRunStep:
    """Multiply the next created rows of slots "x1" and "x2" column by column.

    Args:
        run_number: Passed to the slots' ``update`` after a reset.
        step_size: Upper bound on the number of created rows consumed.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step`` for the state computed from
        slot "x1" and the number of rows consumed.
    """
    left = self.get_input_slot("x1")
    right = self.get_input_slot("x2")
    if any(slot.updated.any() or slot.deleted.any() for slot in (left, right)):
        # An update or deletion on either side restarts the computation.
        left.reset()
        right.reset()
        if self.result is not None:
            self.table.resize(0)
        left.update(run_number)
        right.update(run_number)
    # Consume the same number of rows from both inputs.
    step_size = min(left.created.length(), right.created.length(), step_size)
    left_ids = left.created.next(step_size)
    right_ids = right.created.next(step_size)
    left_rows = left.data().loc[fix_loc(left_ids)]
    right_rows = right.data().loc[fix_loc(right_ids)]
    assert left_rows.columns == right_rows.columns
    product = {
        name: np.multiply(left_rows[name].value, right_rows[name].value)
        for name in left_rows.columns
    }
    if self.result is not None:
        self.table.append(product)
    else:
        self.result = Table(name="simple_hadamard", data=product, create=True)
    return self._return_run_step(self.next_state(left), steps_run=step_size)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Copy created rows from slot "table", then randomly delete/update rows.

    Args:
        run_number: Not used by this implementation.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``; ``steps_run`` counts the rows
        copied plus (on some paths) the rows deleted and updated.
    """
    # The trailing `and False` means this branch is never taken.
    if self.params.fixed_step_size and False:
        step_size = self.params.fixed_step_size
    slot = self.get_input_slot("table")
    assert slot is not None
    if not slot.created.any():
        return self._return_run_step(self.state_blocked, steps_run=0)
    new_ids = slot.created.next(length=step_size)
    steps = indices_len(new_ids)
    self._steps += steps
    source = slot.data()
    if self.result is None:
        self.result = Table(
            self.generate_table_name("stirrer"),
            dshape=source.dshape,
        )
    # Index object as obtained before the new rows are appended.
    existing = self.table.index
    self.table.append(source.loc[fix_loc(new_ids), :])

    to_delete = []
    if self._delete_rows and self.test_delete_threshold(existing):
        if isinstance(self._delete_rows, int):
            to_delete = random.sample(
                tuple(existing), min(self._delete_rows, len(existing))
            )
        elif self._delete_rows == "half":
            to_delete = random.sample(tuple(existing), len(existing) // 2)
        elif self._delete_rows == "all":
            to_delete = existing
        else:
            to_delete = self._delete_rows
    if to_delete:
        if self.params.del_twice:
            # Delete in two halves; this path does not add to `steps`.
            half = len(to_delete) // 2
            del self.table.loc[to_delete[:half]]
            del self.table.loc[to_delete[half:]]
        else:
            steps += len(to_delete)
            del self.table.loc[to_delete]

    if self._update_rows and len(existing):
        existing -= bitmap(to_delete)
        if isinstance(self._update_rows, int):
            to_update = random.sample(
                tuple(existing), min(self._update_rows, len(existing))
            )
        else:
            to_update = self._update_rows
        values = np.random.rand(len(to_update))
        if to_update:
            steps += len(to_update)
            self.table.loc[fix_loc(to_update), [self._update_column]] = [values]
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Copy created rows from slot 'table', then randomly delete/update rows.

    Args:
        run_number: Passed to the input slot's ``update``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``; ``steps_run`` is the number
        of created rows copied.
    """
    # The trailing `and False` means this branch is never taken.
    if self.params.fixed_step_size and False:
        step_size = self.params.fixed_step_size
    input_slot = self.get_input_slot('table')
    input_slot.update(run_number)
    if not input_slot.created.any():
        return self._return_run_step(self.state_blocked, steps_run=0)
    created = input_slot.created.next(step_size)
    steps = indices_len(created)
    with input_slot.lock:
        input_table = input_slot.data()
    if self._table is None:
        self._table = Table(
            self.generate_table_name('dummy'),
            dshape=input_table.dshape,
        )
    # Ids present before the new rows are appended (negative ids dropped).
    raw_ids = self._table.index.values
    before_ = bitmap(raw_ids[raw_ids >= 0])
    self._table.append(input_table.loc[fix_loc(created), :])
    delete = []
    if self._delete_rows and self.test_delete_threshold(before_):
        if isinstance(self._delete_rows, int):
            delete = random.sample(
                tuple(before_), min(self._delete_rows, len(before_))
            )
        elif self._delete_rows == 'half':
            delete = random.sample(tuple(before_), len(before_) // 2)
        elif self._delete_rows == 'all':
            delete = before_
        else:
            delete = self._delete_rows
        if self.params.del_twice:
            # Delete in two successive halves.
            mid = len(delete) // 2
            del self._table.loc[delete[:mid]]
            del self._table.loc[delete[mid:]]
        else:
            del self._table.loc[delete]
    if self._update_rows and len(before_):
        # Rows just deleted can no longer be updated.
        before_ -= bitmap(delete)
        if isinstance(self._update_rows, int):
            updated = random.sample(
                tuple(before_), min(self._update_rows, len(before_))
            )
        else:
            updated = self._update_rows
        v = np.random.rand(len(updated))
        if updated:
            self._table.loc[fix_loc(updated), [self._update_column]] = [v]
    return self._return_run_step(self.next_state(input_slot), steps_run=steps)
def run_step(
    self, run_number: int, step_size: int, howlong: float
) -> ReturnRunStep:
    """Build the histogram index once, then feed it slot changes.

    Args:
        run_number: Passed to the input slot's ``update`` when the index is
            first built.
        step_size: Maximum number of created/updated ids read per call.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    slot = self.get_input_slot("table")
    assert slot is not None
    table = slot.data()
    self._input_table = table
    if table is None or len(table) < self.params.init_threshold:
        # Too few rows: not worth building an index yet.
        return self._return_run_step(self.state_blocked, steps_run=0)

    if self._impl is None:
        # First build: index the whole table and discard buffered changes.
        slot.reset()
        slot.update(run_number)
        slot.clear_buffers()
        low, high = self.compute_bounds(table)
        self._impl = _HistogramIndexImpl(
            self.column, table, low, high, self.params.bins
        )
        self.selection = bitmap(table.index)
        self.result = TableSelectedView(table, self.selection)
        return self._return_run_step(self.state_blocked, len(self.selection))

    # Maybe not always, or should the implementation decide?
    self._impl.reshape()

    steps = 0
    removed: Optional[bitmap] = None
    if slot.deleted.any():
        removed = fix_loc(slot.deleted.next(as_slice=False))
        # Deletions are accounted as a single step (constant time).
        steps = 1
        self.selection -= removed

    added: Optional[bitmap] = None
    if slot.created.any():
        added = fix_loc(slot.created.next(length=step_size, as_slice=False))
        steps += indices_len(added)
        self.selection |= added

    changed: Optional[bitmap] = None
    if slot.updated.any():
        changed = fix_loc(slot.updated.next(length=step_size, as_slice=False))
        steps += indices_len(changed)

    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    # Fetched again as in the original; the value is not used afterwards.
    table = slot.data()
    self._impl.update_histogram(added, changed, removed)
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Feed the next created rows of slot 'df' to the mini-batch k-means model.

    Args:
        run_number: Passed to the slot's ``update`` and written into the
            ``UPDATE_COLUMN`` of the produced frames.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    if dfslot.has_deleted() or dfslot.has_updated():
        logger.debug('has deleted or updated, reseting')
        self.reset()
        dfslot.update(run_number)
    # Was a bare print() to stdout; routed through the module logger instead.
    logger.debug('dfslot has buffered %d elements', dfslot.created_length())
    if dfslot.created_length() < self.mbk.n_clusters:
        # Should add more than k items per loop
        return self._return_run_step(self.state_blocked, steps_run=0)
    indices = dfslot.next_created(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    X = self.filter_columns(input_df, fix_loc(indices)).values
    batch_size = self.mbk.batch_size or 100
    for batch in gen_batches(steps, batch_size):
        self.mbk.partial_fit(X[batch])
        if self._buffer is not None:
            df = pd.DataFrame({'labels': self.mbk.labels_})
            df[self.UPDATE_COLUMN] = run_number
            self._buffer.append(df)

    with self.lock:
        self._df = pd.DataFrame(self.mbk.cluster_centers_, columns=self.columns)
        self._df[self.UPDATE_COLUMN] = run_number
        if self._buffer is not None:
            logger.debug('Setting the labels')
            self._labels = self._buffer.df()
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Update the t-digest with new rows and record the percentiles.

    Args:
        run_number: Passed to the slot's ``update``; also used as the row
            label and last value of the recorded row.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    slot = self.get_input_slot('df')
    slot.update(run_number)
    if slot.has_updated() or slot.has_deleted():
        # Start over with a fresh digest.
        slot.reset()
        slot.update(run_number)
        self.tdigest = TDigest()
    created = slot.next_created(step_size)
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=steps)
    input_df = slot.data()
    with slot.lock:
        chunk = self.filter_columns(input_df, fix_loc(created))
        self.tdigest.batch_update(chunk)
    history = self._df
    row = [self.tdigest.percentile(p*100) for p in self._percentiles]
    row.append(run_number)
    with self.lock:
        history.loc[run_number] = row
        if len(history) > self.params.history:
            # Keep only the most recent rows.
            recent = history.index[-self.params.history:]
            self._df = history.loc[recent]
    return self._return_run_step(slot.next_state(),
                                 steps_run=steps, reads=steps, updates=len(self._df))
def run_step(self,run_number,step_size,howlong):
    """Fold the column-wise max of the next created rows into ``self._df``.

    Args:
        run_number: Passed to the slot's ``update``; used as the row label
            and as the ``UPDATE_COLUMN`` value of the new row.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    slot = self.get_input_slot('df')
    slot.update(run_number)
    if slot.has_updated() or slot.has_deleted():
        slot.reset()
        self._df = None
        slot.update(run_number)
    created = slot.next_created(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = slot.data()
    chunk_max = self.filter_columns(input_df, fix_loc(created)).max()
    if not chunk_max.index.equals(self._columns):
        # some columns are not numerical
        self._columns = chunk_max.index
    chunk_max[self.UPDATE_COLUMN] = run_number

    if self._df is not None:
        merged = pd.concat([last_row(self._df), chunk_max], axis=1)
        chunk_max = merged.max(axis=1)
        # The max was also taken over UPDATE_COLUMN, so set it again.
        chunk_max[self.UPDATE_COLUMN] = run_number
        self._df.loc[run_number] = chunk_max
    else:
        self._df = pd.DataFrame([chunk_max], index=[run_number])

    if len(self._df) > self.params.history:
        recent = self._df.index[-self.params.history:]
        self._df = self._df.loc[recent]
    return self._return_run_step(slot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Append the running column-wise max of slot 'table' to ``self._table``.

    Args:
        run_number: Passed to the slot's ``update``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    slot = self.get_input_slot('table')
    slot.update(run_number)
    if slot.updated.any() or slot.deleted.any():
        slot.reset()
        if self._table is not None:
            self._table.resize(0)
        slot.update(run_number)
    created = slot.created.next(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    source = slot.data()
    chunk_max = self.filter_columns(source, fix_loc(created)).max(keepdims=True)
    if self._table is None:
        self._table = Table(self.generate_table_name('max'),
                            data=chunk_max,
                            create=True)
    else:
        if len(self._table) != 0:
            # Not freshly reset: combine with the last recorded row.
            previous = self._table.last()
            for name in previous:
                column_max = chunk_max[name]
                column_max[0] = np.maximum(column_max, previous[name])
        self._table.append(chunk_max)
    # TODO: manage the history (self.params.history) in a more efficient way
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Record the running min/max of ``self._column`` for this run.

    Args:
        run_number: Passed to the slot's ``update`` after a reset and used
            as the index of the row written to ``self.table``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    prev_min = prev_max = np.nan
    dfslot = self.get_input_slot("table")
    assert dfslot is not None
    # Bound before branching: it used to be assigned only in the `else`
    # branch, so a reset followed by new rows raised UnboundLocalError below.
    df = self.table
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        dfslot.update(run_number)
    else:
        prev = df.last_id
        if prev > 0:
            prev_min = df.at[prev, self._min_column]
            prev_max = df.at[prev, self._max_column]
    indices = dfslot.created.next(length=step_size)  # returns a slice
    input_df = dfslot.data()
    steps = indices_len(indices)
    if steps > 0:
        x = input_df.to_array(locs=fix_loc(indices), columns=[self._column])
        new_min = np.nanmin(x)  # type: ignore
        new_max = np.nanmax(x)  # type: ignore
        # nanmin/nanmax ignore the NaN placeholders used when no previous
        # value was read.
        row = {
            self._min_column: np.nanmin([prev_min, new_min]),  # type: ignore
            self._max_column: np.nanmax([prev_max, new_max]),  # type: ignore
        }
        if run_number in df.index:
            df.loc[run_number] = row
        else:
            df.add(row, index=run_number)
    return self._return_run_step(self.next_state(dfslot), steps)
def run_step(self, run_number, step_size, howlong):
    """Track, per column, the running minimum and the row id holding it.

    ``self._min`` stores the minimum values and ``self._table`` the matching
    row ids; one row is appended to each per call, indexed by ``run_number``.

    Args:
        run_number: Passed to the slot's ``update`` and used as row index.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        # Clear both tables: leaving `_min` set sent the next call into the
        # incremental branch, where `self._table.last()` failed on None.
        self._table = None
        self._min = None
        dfslot.update(run_number)
    indices = dfslot.created.next(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_table = dfslot.data()
    op = self.filter_columns(input_table, fix_loc(indices)).idxmin()

    if self._min is None:
        # First chunk: look up the value found at each idxmin position.
        min_ = OrderedDict(
            (col, input_table.at[ix, col]) for col, ix in op.items()
        )
        self._min = Table(self.generate_table_name('_min'),
                          dshape=input_table.dshape,
                          create=True)
        self._min.append(min_, indices=[run_number])
        self._table = Table(self.generate_table_name('_table'),
                            dshape=input_table.dshape,
                            create=True)
        self._table.append(op, indices=[run_number])
    else:
        prev_min = self._min.last()
        prev_idx = self._table.last()
        min_ = OrderedDict(prev_min.items())
        for col, ix in op.items():
            val = input_table.at[ix, col]
            if not np.isnan(val) and (np.isnan(min_[col]) or val < min_[col]):
                # New minimum: `op[col]` already holds its row id.
                min_[col] = val
            else:
                # Previous minimum still stands, so keep its row id. The old
                # code had these two cases inverted.
                op[col] = prev_idx[col]
        with self.lock:
            self._table.append(op, indices=[run_number])
            self._min.append(min_, indices=[run_number])
            if len(self._table) > self.params.history:
                # Rebuild both tables from their most recent rows.
                data = self._table.loc[
                    self._table.index[-self.params.history:]]
                self._table = Table(self.generate_table_name('_table'),
                                    data=data,
                                    create=True)
                data = self._min.loc[
                    self._min.index[-self.params.history:]]
                self._min = Table(self.generate_table_name('_min'),
                                  data=data,
                                  create=True)
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Extend the pairwise-distance matrix with the next created rows.

    Reads created indices from slot "df", computes distances among the new
    rows (``Sj``) and, when a matrix already exists, between the previously
    processed rows and the new ones (``Sij``), then writes them into the
    resized buffer ``self._buf``.

    Args:
        run_number: Passed to the slot's ``update``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step`` with the number of rows read.
    """
    dfslot = self.get_input_slot("df")
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        logger.info("Reseting history because of changes in the input df")
        dfslot.update(run_number, df)
        # TODO: be smarter with changed values
    m = step_size
    indices = dfslot.next_created(m)
    m = indices_len(indices)
    # i: rows already in the matrix, j: rows being added.
    i = None
    j = None
    Si = self._buf.matrix()
    arrayslot = self.get_input_slot("array")
    if arrayslot is not None and arrayslot.data() is not None:
        # An "array" input, when present with data, replaces the DataFrame
        # columns as the source of row vectors.
        array = arrayslot.data()
        logger.debug("Using array instead of DataFrame columns")
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]
    if j is None:
        # No array input: take the vectors from the DataFrame columns.
        if self.columns is None:
            self.columns = df.columns.delete(np.where(df.columns == Module.UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)
    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        # First chunk: the matrix is just the new block.
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        # Fill the two off-diagonal blocks and the new diagonal block.
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
        # truth = pairwise_distances(array[0:n1], metric=self._metric)
        # import pdb
        # pdb.set_trace()
        # assert np.allclose(mat,truth)
    return self._return_run_step(dfslot.next_state(), steps_run=m)
def _eval_to_ids(self, operator_, limit, input_ids=None):
    """Return the ids whose ``self.column`` value satisfies the comparison.

    Args:
        operator_: Callable applied as ``operator_(values, limit)``; its
            result is used as a mask.
        limit: Right-hand operand given to ``operator_``.
        input_ids: Ids to test; when ``None`` the table index is used.

    Returns:
        A ``bitmap`` of the selected ids.
    """
    table_ = self.get_input_slot('table').data()
    input_ids = table_.index if input_ids is None else fix_loc(input_ids)
    column_values = table_[self.column].loc[input_ids]
    mask_ = operator_(column_values, limit)
    candidates = slice_to_arange(input_ids)
    # Positions where the mask is set map back onto the candidate ids.
    selected = np.nonzero(mask_)[0]
    return bitmap(candidates[selected])
def run_step(self,run_number,step_size,howlong):
    """Filter the next created rows of slot 'df' with the current query.

    The query expression is read from the last row of the optional 'query'
    slot. Without a query the input is passed through unchanged.

    Args:
        run_number: Passed to the slots' ``update`` and written into the
            ``UPDATE_COLUMN`` of selected rows.
        step_size: Maximum number of created rows read from slot 'df'.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step`` (always with
        ``self.state_blocked``).
    """
    query_slot = self.get_input_slot('query')
    df_slot = self.get_input_slot('df')
    if not query_slot:
        query = None
    else:
        query_df = query_slot.data()
        query_slot.update(run_number)
        if query_slot.has_created():  # ignore deleted and updated
            df_slot.reset()  # re-filter
            self._buffer.reset()
        # Read it all; the returned indices are not needed.
        query_slot.next_created()
        with query_slot.lock:
            query = last_row(query_df)[self._query_column]  # query expression
        if query is not None:
            if len(query) == 0:
                query = None
            else:
                # Make sure we have a string.
                # NOTE(review): `unicode` exists only on Python 2 - confirm
                # the target interpreter.
                query = unicode(query)
    df_slot.update(run_number)
    if df_slot.has_deleted() or df_slot.has_updated():
        df_slot.reset()
        self._buffer.reset()
        df_slot.update(run_number)

    indices = df_slot.next_created(step_size)
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=steps)

    if query is None:  # nothing to query, just pass through
        logger.info('No query, passing data through')
        self._df = df_slot.data()
        return self._return_run_step(self.state_blocked, steps_run=steps)

    with df_slot.lock:
        new_df = df_slot.data().loc[fix_loc(indices)]
        try:
            selected_df = new_df.eval(query)
            if isinstance(selected_df, pd.Series):
                if selected_df.index.has_duplicates:
                    # Was an interactive pdb breakpoint, which blocks a
                    # running scheduler; report the condition instead.
                    logger.warning('Query result has duplicate index entries')
                selected_df = new_df.loc[selected_df]
        except Exception as e:
            # Best effort: on any evaluation error fall back to pass-through.
            logger.error('Probably a syntax error in query expression: %s', e)
            self._df = df_slot.data()
            return self._return_run_step(self.state_blocked, steps_run=steps)
        selected_df.loc[:,self.UPDATE_COLUMN] = run_number
        self._buffer.append(selected_df)  # ignore_index=False: TODO later
        self._df = self._buffer.df()
    return self._return_run_step(self.state_blocked, steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Mirror slot 'df' into ``self._df``, stamping rows that changed enough.

    Created rows are appended first; any remaining budget is spent on
    updated rows, which are copied only when their row norm difference
    exceeds ``self._delta * self.get_scale()``.

    Args:
        run_number: Passed to the slot's ``update`` and written into the
            ``UPDATE_COLUMN`` of affected rows.
        step_size: Total number of rows that may be processed.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    slot = self.get_input_slot('df')
    slot.update(run_number, buffer_created=True, buffer_updated=True)
    if slot.has_deleted():
        self.reset()
        slot.reset()
        slot.update(run_number)
    input_df = slot.data()
    # Return value unused here.
    # NOTE(review): presumably called for its effect on self._columns - confirm.
    self.get_columns(input_df)
    if input_df is None or len(input_df)==0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    created = slot.next_created(step_size)
    n_created = indices_len(created)
    step_size -= n_created
    steps_run = n_created
    if n_created != 0:
        created = fix_loc(created)
        self._buffer.append(input_df.loc[created])
        self._df = self._buffer.df()
        self._df.loc[created,self.UPDATE_COLUMN] = run_number

    if step_size > 0 and slot.has_updated():
        changed = slot.next_updated(step_size,as_slice=False)
        n_changed = indices_len(changed)
        if n_changed != 0:
            steps_run += n_changed
            changed = fix_loc(changed)  # no need, but stick to the stereotype
            incoming = self.filter_columns(input_df, changed)
            current = self.filter_columns(self._df, changed)
            norms = row_norms(incoming-current)
            significant = (norms > (self._delta*self.get_scale()))
            changed = changed[significant]
            if not significant.any():
                logger.debug('Not updating at %d', run_number)
            else:
                logger.debug('updating at %d', run_number)
                self._df.loc[changed, self._columns] = incoming.loc[changed, self._columns]
                self._df.loc[changed, self.UPDATE_COLUMN] = run_number
    return self._return_run_step(slot.next_state(), steps_run=steps_run)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Fold the max of the next created rows into ``self.result``.

    Args:
        run_number: Not used by this implementation.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    assert self.context
    with self.context as ctx:
        table_slot = ctx.table
        created = table_slot.created.next(step_size)  # returns a slice
        steps = indices_len(created)
        source = table_slot.data()
        chunk_max = source.loc[fix_loc(created)].max(keepdims=False)
        if self.result is not None:
            # Combine key by key with the values recorded so far.
            for key, current in self.psdict.items():
                self.result[key] = np.maximum(chunk_max[key], current)
        else:
            self.result = PsDict(chunk_max)
        return self._return_run_step(self.next_state(ctx.table), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Track, per column, the running minimum and the row label holding it.

    ``self._min`` stores the minimum values and ``self._df`` the matching
    row labels; one row is appended to each per call.

    Args:
        run_number: Passed to the slot's ``update`` and written into the
            ``UPDATE_COLUMN`` of both result rows.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        # Clear both frames: leaving `_min` set sent the next call into the
        # incremental branch with `self._df` still None.
        self._df = None
        self._min = None
        dfslot.update(run_number)
    indices = dfslot.next_created(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    op = self.filter_columns(input_df, fix_loc(indices)).idxmin()
    if not op.index.equals(self._columns):
        # some columns are not numerical
        self._columns = op.index
    op[self.UPDATE_COLUMN] = run_number

    if self._min is None:
        # Scalar NaN broadcasts over the index (a one-element list does not
        # match a longer index). The UPDATE_COLUMN is included.
        min_ = pd.Series(np.nan, index=op.index)
        min_[self.UPDATE_COLUMN] = run_number
        for col in op.index:
            if col == self.UPDATE_COLUMN:
                continue
            min_[col] = input_df.loc[op[col], col]  # value at the idxmin label
        self._min = pd.DataFrame([min_], columns=op.index)
        self._df = pd.DataFrame([op], columns=op.index)
    else:
        prev_min = last_row(self._min)
        prev_idx = last_row(self._df)
        min_ = pd.Series(prev_min)
        min_[self.UPDATE_COLUMN] = run_number
        for col in op.index:
            if col == self.UPDATE_COLUMN:
                continue
            val = input_df.loc[op[col], col]
            if not np.isnan(val) and (np.isnan(min_[col]) or val < min_[col]):
                # New minimum: `op[col]` already holds its label.
                min_[col] = val
            else:
                # Previous minimum still stands, so keep its label. The old
                # code had these two cases inverted.
                op[col] = prev_idx[col]
        op[self.UPDATE_COLUMN] = run_number
        with self.lock:
            self._df = self._df.append(op, ignore_index=True)
            self._min = self._min.append(min_, ignore_index=True)
            if len(self._df) > self.params.history:
                self._df = self._df.loc[self._df.index[-self.params.history:]]
                self._min = self._min.loc[self._min.index[-self.params.history:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def _eval_to_ids(
    self,
    operator_: Callable[[Any, Any], Any],
    limit: Any,
    input_ids: Optional[slice] = None,
) -> bitmap:
    """Return the ids whose ``self.column`` value satisfies the comparison.

    Args:
        operator_: Callable applied as ``operator_(values, limit)``; its
            result is used as a mask.
        limit: Right-hand operand given to ``operator_``.
        input_ids: Ids to test; when ``None`` the table index is used.

    Returns:
        A ``bitmap`` of the selected ids.
    """
    table_ = self.get_input_slot("table").data()
    input_ids = table_.index if input_ids is None else fix_loc(input_ids)
    column_values = table_[self.column].loc[input_ids]
    mask_ = operator_(column_values, limit)
    assert isinstance(input_ids, slice)
    candidates = slice_to_arange(input_ids)
    # Positions where the mask is set map back onto the candidate ids.
    selected = np.nonzero(mask_)[0]
    return bitmap(candidates[selected])
def run_step(self, run_number, step_size, howlong):
    """Feed the next created rows of slot 'table' to the mini-batch k-means.

    After fitting, the cluster centers are written to ``self._table`` and,
    when ``self._labels`` is set, labels are appended to it per batch.

    Args:
        run_number: Passed to the slot's ``update``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    if dfslot.deleted.any() or dfslot.updated.any():
        logger.debug('has deleted or updated, reseting')
        self.reset()
        dfslot.update(run_number)
    #print('dfslot has buffered %d elements'% dfslot.created_length())
    input_df = dfslot.data()
    # NOTE(review): with `and`, a missing/empty input only blocks when fewer
    # than n_clusters rows are buffered, and a non-empty input never blocks
    # here - confirm whether `or` was intended.
    if (input_df is None or len(input_df) == 0) and dfslot.created_length() < self.mbk.n_clusters:
        # Should add more than k items per loop
        return self._return_run_step(self.state_blocked, steps_run=0)
    indices = dfslot.created.next(step_size) # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    cols = self.get_columns(input_df)
    if len(cols) == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    locs = fix_loc(indices)
    # Labels are appended with explicit ids, so expand a slice into an array
    # that can be indexed by each batch below.
    if self._labels is not None and isinstance(indices, slice):
        indices = np.arange(indices.start, indices.stop)
    X = input_df.to_array(columns=cols, locs=locs)
    # Fall back to batches of 100 when the model's batch_size is falsy.
    batch_size = self.mbk.batch_size or 100
    for batch in gen_batches(steps, batch_size):
        self.mbk.partial_fit(X[batch])
        if self._labels is not None:
            self._labels.append({'labels': self.mbk.labels_}, indices=indices[batch])
    if self._table is None:
        # First fit: create the centers table with one row per center.
        dshape = self.dshape_from_columns(input_df, cols, dshape_from_dtype(X.dtype))
        self._table = Table(self.generate_table_name('centers'), dshape=dshape, create=True)
        self._table.resize(self.mbk.cluster_centers_.shape[0])
    self._table[cols] = self.mbk.cluster_centers_
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep: input_slot = self.get_input_slot("table") # input_slot.update(run_number) steps = 0 if not input_slot.created.any(): return self._return_run_step(self.state_blocked, steps_run=0) created = input_slot.created.next(step_size) steps = indices_len(created) input_table = input_slot.data() if self.result is None: self.result = Table( self.generate_table_name("stirrer"), dshape=input_table.dshape, ) v = input_table.loc[fix_loc(created), :] self.table.append(v) if not self.done: module = self.scheduler()[self.watched] sensitive_ids = bitmap(getattr(module, "_sensitive_ids").values()) if sensitive_ids: if self.proc_sensitive: if self.mode == "delete": # print('delete sensitive', sensitive_ids) del self.table.loc[sensitive_ids] else: # print('update sensitive', sensitive_ids) self.table.loc[sensitive_ids, 0] = self.value self.done = True else: # non sensitive if len(self.result) > 10: for i in range(10): id_ = self.table.index[i] if id_ not in sensitive_ids: if self.mode == "delete": del self.table.loc[id_] else: self.table.loc[id_, 0] = self.value self.done = True return self._return_run_step(self.next_state(input_slot), steps_run=steps)
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Fold the max of the next created rows of slot "table" into the result.

    Args:
        run_number: Passed to the slot's ``update`` after a reset.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    table_slot = self.get_input_slot("table")
    if table_slot.updated.any() or table_slot.deleted.any():
        table_slot.reset()
        if self.result is not None:
            self.psdict.clear()
        table_slot.update(run_number)
    created = table_slot.created.next(step_size)
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    source = table_slot.data()
    chunk_max = source.loc[fix_loc(created)].max(keepdims=False)
    if self.result is not None:
        # Combine key by key with the values recorded so far.
        for key, current in self.psdict.items():
            self.result[key] = np.maximum(chunk_max[key], current)
    else:
        self.result = PsDict(chunk_max)
    return self._return_run_step(self.next_state(table_slot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Record the running min/max of ``self._column`` for this run.

    Args:
        run_number: Passed to the slot's ``update`` and used as the index of
            the row written to ``self._table``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    prev_min = prev_max = np.nan
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    # Bound before branching: it used to be assigned only in the `else`
    # branch, so a reset followed by new rows raised UnboundLocalError below.
    df = self._table
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        dfslot.update(run_number)
    else:
        prev = len(df) - 1
        # `prev` is a position, so 0 is the valid last row of a one-row
        # table; the former `prev > 0` test skipped it.
        if prev >= 0:
            prev_min = df.iat[prev, self._min_column]
            prev_max = df.iat[prev, self._max_column]

    indices = dfslot.created.next(step_size)  # returns a slice
    input_df = dfslot.data()
    steps = indices_len(indices)
    if steps > 0:
        x = input_df.to_array(locs=fix_loc(indices), columns=[self._column])
        new_min = np.nanmin(x)
        new_max = np.nanmax(x)
        # nanmin/nanmax ignore the NaN placeholders used when no previous
        # value was read.
        row = {self._min_column: np.nanmin([prev_min, new_min]),
               self._max_column: np.nanmax([prev_max, new_max])}
        with self.lock:
            if run_number in df.index:
                df.loc[run_number] = row
            else:
                df.add(row, index=run_number)
            # TODO: trimming to self.params.history is not implemented.
    return self._return_run_step(self.next_state(dfslot),
                                 steps_run=steps, reads=steps, updates=len(self._table))
def run_step(self,run_number,step_size,howlong):
    """Apply ``self.op`` to the next created rows and record the result.

    Args:
        run_number: Passed to the slot's ``update``; used as the row label
            and as the ``UPDATE_COLUMN`` value of the recorded row.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        self._df = None
        dfslot.update(run_number)
    indices = dfslot.next_created(step_size)  # returns a slice
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = dfslot.data()
    op = self.op(self.filter_columns(input_df, fix_loc(indices)))
    op[self.UPDATE_COLUMN] = run_number
    if self._df is None:
        self._df = pd.DataFrame([op], index=[run_number])
    else:
        self._df.loc[run_number] = op
    # Parenthesised so the statement parses on Python 3 as well; with a
    # single argument it prints the same on Python 2.
    print(self._df)
    if len(self._df) > self.params.history:
        self._df = self._df.loc[self._df.index[-self.params.history:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Apply ``self.op`` to the next created rows and append the result.

    Args:
        run_number: Passed to the slot's ``update`` and used as the index of
            the appended row.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.
    """
    slot = self.get_input_slot('table')
    slot.update(run_number)
    if slot.updated.any() or slot.deleted.any():
        slot.reset()
        self._table = None
        slot.update(run_number)
    created = slot.created.next(step_size)  # returns a slice
    steps = indices_len(created)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    input_df = slot.data()
    chunk = self.filter_columns(input_df, fix_loc(created))
    result = self.op(chunk)
    if self._table is None:
        self._table = Table(self.generate_table_name('var'),
                            dshape=input_df.dshape,
                            create=True)
    self._table.append(result, indices=[run_number])
    print(self._table)

    if len(self._table) > self.params.history:
        # Keep only the most recent rows.
        self._table = self._table.loc[self._table.index[-self.params.history:]]
    return self._return_run_step(self.next_state(slot), steps_run=steps)
def run_step(self,run_number,step_size,howlong):
    """Update the running sums and the fitted line with new (x, y) rows.

    Row 0 of ``self._df`` holds ``[coef, intercept, sum_x, sum_x_sqr, sum_y,
    sum_xy, run_number]`` and is overwritten on every call.

    Args:
        run_number: Passed to the slot's ``update`` and stored in row 0.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step``.

    Raises:
        ProgressiveError: If the input slot reports updates or deletions.
    """
    dfslot = self.get_input_slot('df')
    input_df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        # Format the message here; the class name used to be passed as a
        # second argument and was never substituted.
        raise ProgressiveError('%s module does not manage updates or deletes'
                               % self.__class__.__name__)
    indices = dfslot.next_created(step_size)
    steps = indices_len(indices)
    if steps == 0:
        return self._return_run_step(self.state_blocked, steps_run=steps)

    # Select the columns by name: tuple-unpacking a DataFrame yields its
    # column labels, not the column data.
    chunk = input_df.loc[fix_loc(indices), [self._x, self._y]]
    x = chunk[self._x]
    y = chunk[self._y]
    df = self._df
    sum_x = df.at[0, 'sum_x'] + x.sum()
    sum_x_sqr = df.at[0, 'sum_x_sqr'] + (x*x).sum()
    sum_y = df.at[0, 'sum_y'] + y.sum()
    sum_xy = df.at[0, 'sum_xy'] + (x*y).sum()
    # NOTE(review): the sums are cumulative but `n` counts only this chunk;
    # a cumulative row count is not stored anywhere visible here, so this is
    # left unchanged - confirm and add a running count.
    n = len(x)
    denom = n * sum_x_sqr - sum_x*sum_x
    # Least-squares line y = intercept + coef * x. The two formulas were
    # previously assigned to the opposite names.
    coef = (n*sum_xy - sum_x*sum_y) / denom
    intercept = (sum_y*sum_x_sqr - sum_x*sum_xy) / denom
    df.loc[0] = [coef, intercept, sum_x, sum_x_sqr, sum_y, sum_xy, run_number]
    return self._return_run_step(dfslot.next_state(),
                                 steps_run=steps, reads=steps, updates=1)
def run_step(self, run_number, step_size, howlong):
    """Extend the pairwise-distance matrix with the next created rows.

    Reads created indices from slot 'table', computes distances among the
    new rows (``Sj``) and, when ``Si`` is not None, between the previously
    processed rows and the new ones (``Sij``), then writes them into the
    resized buffer ``self._buf``.

    Args:
        run_number: Passed to the slot's ``update``.
        step_size: Maximum number of created rows read from the slot.
        howlong: Not used by this implementation.

    Returns:
        The value of ``self._return_run_step`` with the number of rows read.
    """
    dfslot = self.get_input_slot('table')
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.updated.any() or dfslot.deleted.any():
        dfslot.reset()
        logger.info(
            'Reseting history because of changes in the input table')
        dfslot.update(run_number)
        #TODO: be smarter with changed values
    m = step_size
    indices = dfslot.created.next(m)
    m = indices_len(indices)
    # i: rows already in the matrix, j: rows being added.
    i = None
    j = None
    # NOTE(review): the `Si is None` tests below decide between first fill
    # and extension - confirm that this column lookup can actually be None.
    Si = self._table['document']
    arrayslot = self.get_input_slot('array')
    if arrayslot is not None and arrayslot.data() is not None:
        # An 'array' input, when present with data, replaces the table
        # columns as the source of row vectors.
        array = arrayslot.data()
        logger.debug('Using array instead of DataFrame columns')
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]
    if j is None:
        # No array input: take the vectors from the table columns.
        if self.columns is None:
            self.columns = df.columns.delete(
                np.where(df.columns == UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)
    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        # First chunk: the matrix is just the new block.
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        # Fill the two off-diagonal blocks and the new diagonal block.
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
        #truth = pairwise_distances(array[0:n1], metric=self._metric)
        #import pdb
        #pdb.set_trace()
        #assert np.allclose(mat,truth)
    return self._return_run_step(self.next_state(dfslot), steps_run=m)
def run_step(self, run_number, step_size, howlong):
    """Update the 2D histogram with the next chunk of deleted and created rows.

    The step:

    * resets the module when the ``table`` slot reports updated rows;
    * derives the histogram bounds from the ``min`` and ``max`` slots,
      padded by the deltas from ``self.get_delta``, and resets when the
      observed range leaves the stored bounds or shrinks by more than the
      deltas;
    * subtracts the histogram of newly deleted rows and adds the histogram
      of newly created rows;
    * publishes the result to ``self._table`` when ``self._with_output`` is
      set, and passes it to ``self.build_heatmap``.

    Returns the value built by ``self._return_run_step``; the blocked state
    with zero steps is returned when there is no input, no bounds yet, or
    the bounds are invalid.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)
    if dfslot.updated.any():
        logger.debug('reseting histogram')
        self.reset()
        dfslot.update(run_number)

    if not (dfslot.created.any() or
            min_slot.created.any() or
            max_slot.created.any()):
        logger.info('Input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)

    xmin, xmax, ymin, ymax = bounds
    if self._bounds is None:
        (xdelta, ydelta) = self.get_delta(*bounds)
        self._bounds = (xmin - xdelta, xmax + xdelta,
                        ymin - ydelta, ymax + ydelta)
        logger.info("New bounds at run %d: %s", run_number, self._bounds)
    else:
        (dxmin, dxmax, dymin, dymax) = self._bounds
        (xdelta, ydelta) = self.get_delta(*bounds)
        assert xdelta >= 0 and ydelta >= 0

        # Either the min/max has extended, or it has shrunk beyond the deltas
        if ((xmin < dxmin or xmax > dxmax or ymin < dymin or ymax > dymax)
                or (xmin > (dxmin + xdelta) or xmax < (dxmax - xdelta) or
                    ymin > (dymin + ydelta) or ymax < (dymax - ydelta))):
            self._bounds = (xmin - xdelta, xmax + xdelta,
                            ymin - ydelta, ymax + ydelta)
            logger.info('Updated bounds at run %s: %s',
                        run_number, self._bounds)
            self.reset()
            dfslot.update(run_number)

    xmin, xmax, ymin, ymax = self._bounds
    if xmin >= xmax or ymin >= ymax:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Now, we know we have data and bounds, proceed to create a new histogram
    # or to update the previous if is still exists (i.e. no reset)
    p = self.params
    steps = 0

    # if there are new deletions, build the histogram of the deleted pairs
    # then subtract it from the main histogram
    if dfslot.deleted.any() and self._histo is not None:
        input_df = get_physical_base(dfslot.data())
        indices = dfslot.deleted.next(step_size)
        steps += indices_len(indices)
        logger.info('Read %d rows', steps)
        x = input_df[self.x_column]
        y = input_df[self.y_column]
        idx = input_df.id_to_index(fix_loc(indices))
        x = x[idx]
        y = y[idx]
        bins = [p.ybins, p.xbins]
        if len(x) > 0:
            histo = histogram2d(y, x,
                                bins=bins,
                                range=[[ymin, ymax], [xmin, xmax]])
            self._histo -= histo

    # if there are new creations, build a partial histogram with them then
    # add it to the main histogram
    input_df = dfslot.data()
    indices = dfslot.created.next(step_size)
    steps += indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps
    x = input_df[self.x_column]
    y = input_df[self.y_column]
    idx = input_df.id_to_index(fix_loc(indices))
    x = x[idx]
    y = y[idx]
    if self._xedges is not None:
        bins = [self._xedges, self._yedges]
    else:
        bins = [p.ybins, p.xbins]

    histo = None
    if len(x) > 0:
        histo = histogram2d(y, x,
                            bins=bins,
                            range=[[ymin, ymax], [xmin, xmax]])

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    if self._histo is None:
        # Nothing has been binned yet (for instance only the min/max slots
        # changed): there is no array to flip or publish, and np.flip
        # cannot take None with axis=0.
        return self._return_run_step(self.next_state(dfslot),
                                     steps_run=steps)

    cmax = self._histo.max()
    values = {
        'array': np.flip(self._histo, axis=0),
        'cmin': 0,
        'cmax': cmax,
        'xmin': xmin,
        'xmax': xmax,
        'ymin': ymin,
        'ymax': ymax,
        'time': run_number
    }
    if self._with_output:
        with self.lock:
            table = self._table
            table['array'].set_shape([p.ybins, p.xbins])
            nrows = len(table)
            last = table.last()
            # One row per run: add a row for a new run number, otherwise
            # overwrite the row written earlier in the same run.
            if nrows == 0 or last['time'] != run_number:
                table.add(values)
            else:
                table.iloc[last.row] = values
    self.build_heatmap(values)
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Accumulate the next chunk of created rows into the 2D histogram.

    The histogram, its edges and the slot history are reset when the ``df``
    slot reports updated or deleted rows, or when the range observed on the
    ``min`` and ``max`` slots leaves the stored bounds (or shrinks by more
    than the deltas from ``self.get_delta``).  The accumulated histogram is
    then stored in ``self._df`` under ``run_number``, keeping at most
    ``self.params.history`` rows.

    Returns the value built by ``self._return_run_step``; the blocked state
    with zero steps is returned when there is no input, no bounds yet, or
    the bounds are invalid.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)

    if dfslot.has_updated() or dfslot.has_deleted():
        logger.debug('reseting histogram')
        dfslot.reset()
        self._histo = None
        self._xedges = None
        self._yedges = None
        dfslot.update(run_number)

    if not (dfslot.has_created() or
            min_slot.has_created() or
            max_slot.has_created()):
        # nothing to do, just wait
        logger.info('Input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)

    xmin, xmax, ymin, ymax = bounds
    if self._bounds is None:
        (xdelta, ydelta) = self.get_delta(*bounds)
        self._bounds = (xmin - xdelta, xmax + xdelta,
                        ymin - ydelta, ymax + ydelta)
        logger.info('New bounds at run %d: %s', run_number, self._bounds)
    else:
        (dxmin, dxmax, dymin, dymax) = self._bounds
        (xdelta, ydelta) = self.get_delta(*bounds)
        # Either the min/max has extended, or it has shrunk beyond the deltas
        if ((xmin < dxmin or xmax > dxmax or ymin < dymin or ymax > dymax)
                or (xmin > (dxmin + xdelta) or xmax < (dxmax - xdelta) or
                    ymin > (dymin + ydelta) or ymax < (dymax - ydelta))):
            self._bounds = (xmin - xdelta, xmax + xdelta,
                            ymin - ydelta, ymax + ydelta)
            logger.info('Updated bounds at run %s: %s',
                        run_number, self._bounds)
            dfslot.reset()
            dfslot.update(run_number)
            # should recompute the histogram from scratch
            self._histo = None
            self._xedges = None
            self._yedges = None

    xmin, xmax, ymin, ymax = self._bounds
    if xmin >= xmax or ymin >= ymax:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    # Now, we know we have data and bounds, proceed to create a new histogram
    input_df = dfslot.data()
    indices = dfslot.next_created(step_size)
    steps = indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps

    filtered_df = input_df.loc[fix_loc(indices)]
    x = filtered_df[self.x_column]
    y = filtered_df[self.y_column]
    p = self.params
    # Reuse the edges from a previous call so successive partial histograms
    # share the same bins and can be added together.
    if self._xedges is not None:
        bins = [self._xedges, self._yedges]
    else:
        bins = [p.ybins, p.xbins]

    histo = None
    cmax = 0
    if len(x) > 0:
        # Plain counts are the default; the legacy ``normed`` keyword is
        # not passed because recent NumPy releases reject it.
        histo, xedges, yedges = np.histogram2d(
            y, x, bins=bins, range=[[ymin, ymax], [xmin, xmax]])
        self._xedges = xedges
        self._yedges = yedges

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    if self._histo is not None:
        cmax = self._histo.max()
    logger.debug('cmax=%d', cmax)

    values = [self._histo, 0, cmax, xmin, xmax, ymin, ymax, run_number]
    with self.lock:
        self._df.loc[run_number] = values
        if len(self._df) > p.history:
            self._df = self._df.loc[self._df.index[-p.history:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def _eval_to_ids(self, limit, input_ids):
    """Return, as a bitmap, the entries of ``input_ids`` that pass ``self._op``.

    The values of ``self._column`` for the rows selected by ``input_ids``
    are compared to ``limit`` with ``self._op``; the entries of the expanded
    ``input_ids`` at the positions where the comparison is non-zero are
    kept.
    """
    selected_rows = self._table.loc[fix_loc(input_ids)]
    column_values = selected_rows[self._column].values
    keep = self._op(column_values, limit)
    candidate_ids = slice_to_arange(input_ids)
    hit_positions = np.nonzero(keep)[0]
    # maybe fancy indexing ...
    return bitmap(candidate_ids[hit_positions])
def run_step(self, run_number, step_size, howlong):
    """Accumulate the next chunk of created rows into the 1D histogram.

    The histogram, its edges and the slot history are reset when the ``df``
    slot reports updated or deleted rows, or when the range observed on the
    ``min`` and ``max`` slots leaves the stored bounds (or shrinks by more
    than the delta from ``self.get_delta``).  The accumulated histogram is
    stored as the only row of ``self._df``, indexed by ``run_number``.

    Returns the value built by ``self._return_run_step``; the blocked state
    with zero steps is returned when there is no input, no bounds yet, or
    the bounds are invalid.
    """
    dfslot = self.get_input_slot('df')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)

    if dfslot.has_updated() or dfslot.has_deleted():
        logger.debug('resetting histogram')
        dfslot.reset()
        self._histo = None
        self._edges = None
        dfslot.update(run_number)

    if not (dfslot.has_created() or
            min_slot.has_created() or
            max_slot.has_created()):
        logger.info('input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)

    bound_min, bound_max = bounds
    if self._bounds is None:
        delta = self.get_delta(*bounds)
        self._bounds = (bound_min - delta, bound_max + delta)
        logger.info("New bounds at run %d: %s", run_number, self._bounds)
    else:
        (old_min, old_max) = self._bounds
        delta = self.get_delta(*bounds)
        # Either the min/max has extended, or it has shrunk beyond the delta
        if ((bound_min < old_min or bound_max > old_max)
                or bound_min > (old_min + delta)
                or bound_max < (old_max - delta)):
            self._bounds = (bound_min - delta, bound_max + delta)
            logger.info('Updated bounds at run %d: %s',
                        run_number, self._bounds)
            dfslot.reset()
            dfslot.update(run_number)
            self._histo = None
            # The cached edges describe the old range; when an edge array
            # is passed as ``bins`` np.histogram ignores ``range``, so they
            # must be dropped together with the histogram.
            self._edges = None

    (curr_min, curr_max) = self._bounds
    if curr_min >= curr_max:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    input_df = dfslot.data()
    indices = dfslot.next_created(step_size)  # returns a slice or ... ?
    steps = indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps

    filtered_df = input_df.loc[fix_loc(indices)]
    column = filtered_df[self.column]
    # Reuse the edges from a previous call so successive partial histograms
    # share the same bins and can be added together.
    bins = self._edges if self._edges is not None else self.params.bins
    histo = None
    if len(column) > 0:
        # Plain counts are the default; the legacy ``normed`` keyword is
        # not passed because recent NumPy releases reject it.
        histo, self._edges = np.histogram(column,
                                          bins=bins,
                                          range=[curr_min, curr_max])

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    values = [self._histo, curr_min, curr_max, run_number]
    with self.lock:
        self._df.loc[run_number] = values
        # Only the most recent row is kept.
        self._df = self._df.loc[self._df.index[-1:]]
    return self._return_run_step(dfslot.next_state(), steps_run=steps)
def run_step(self, run_number, step_size, howlong):
    """Accumulate the next chunk of created rows into the 1D histogram.

    The histogram, its edges and the slot history are reset when the
    ``table`` slot reports updated or deleted rows, or when the range
    observed on the ``min`` and ``max`` slots leaves the stored bounds (or
    shrinks by more than the delta from ``self.get_delta``).  The
    accumulated histogram is appended to ``self._table`` together with the
    current bounds and ``run_number``.

    Returns the value built by ``self._return_run_step``; the blocked state
    with zero steps is returned when there is no input, no bounds yet, or
    the bounds are invalid.
    """
    dfslot = self.get_input_slot('table')
    dfslot.update(run_number)
    min_slot = self.get_input_slot('min')
    min_slot.update(run_number)
    max_slot = self.get_input_slot('max')
    max_slot.update(run_number)

    if dfslot.updated.any() or dfslot.deleted.any():
        logger.debug('reseting histogram')
        dfslot.reset()
        self._histo = None
        self._edges = None
        dfslot.update(run_number)

    if not (dfslot.created.any() or
            min_slot.created.any() or
            max_slot.created.any()):
        logger.info('Input buffers empty')
        return self._return_run_step(self.state_blocked, steps_run=0)

    bounds = self.get_bounds(min_slot, max_slot)
    if bounds is None:
        logger.debug('No bounds yet at run %d', run_number)
        return self._return_run_step(self.state_blocked, steps_run=0)

    bound_min, bound_max = bounds
    if self._bounds is None:
        delta = self.get_delta(*bounds)
        self._bounds = (bound_min - delta, bound_max + delta)
        logger.info("New bounds at run %d: %s", run_number, self._bounds)
    else:
        (old_min, old_max) = self._bounds
        delta = self.get_delta(*bounds)
        # Either the min/max has extended, or it has shrunk beyond the delta
        if ((bound_min < old_min or bound_max > old_max)
                or bound_min > (old_min + delta)
                or bound_max < (old_max - delta)):
            self._bounds = (bound_min - delta, bound_max + delta)
            logger.info('Updated bounds at run %d: %s',
                        run_number, self._bounds)
            dfslot.reset()
            dfslot.update(run_number)
            self._histo = None
            self._edges = None

    (curr_min, curr_max) = self._bounds
    if curr_min >= curr_max:
        logger.error('Invalid bounds: %s', self._bounds)
        return self._return_run_step(self.state_blocked, steps_run=0)

    input_df = dfslot.data()
    indices = dfslot.created.next(step_size)  # returns a slice or ... ?
    steps = indices_len(indices)
    logger.info('Read %d rows', steps)
    self.total_read += steps

    column = input_df[self.column]
    column = column.loc[fix_loc(indices)]
    # Reuse the edges from a previous call so successive partial histograms
    # share the same bins and can be added together.
    bins = self._edges if self._edges is not None else self.params.bins
    histo = None
    if len(column) > 0:
        # ``density=False`` already requests plain counts; the legacy
        # ``normed`` keyword is not passed because recent NumPy releases
        # reject it.
        histo, self._edges = np.histogram(column,
                                          bins=bins,
                                          range=[curr_min, curr_max],
                                          density=False)

    if self._histo is None:
        self._histo = histo
    elif histo is not None:
        self._histo += histo

    values = {
        'array': [self._histo],
        'min': [curr_min],
        'max': [curr_max],
        'time': [run_number]
    }
    with self.lock:
        self._table['array'].set_shape((self.params.bins, ))
        self._table.append(values)
    return self._return_run_step(self.next_state(dfslot), steps_run=steps)