def __init__(self, n_clusters, columns=None, batch_size=100, tol=0.0,
             is_input=True, random_state=None, **kwds):
    self._add_slots(kwds, 'input_descriptors',
                    [SlotDescriptor('table', type=Table, required=True)])
    self._add_slots(kwds, 'output_descriptors',
                    [SlotDescriptor('labels', type=Table, required=False)])
    super(MBKMeans, self).__init__(**kwds)
    self.mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                               verbose=True, tol=tol, random_state=random_state)
    self.columns = columns
    self.n_clusters = n_clusters
    self.default_step_size = 100
    self._labels = None
    self._remaining_inits = 10
    self._initialization_steps = 0
    self._is_input = is_input
class MBKMeansFilter(TableModule):
    """
    Filters data corresponding to a specific label
    """
    inputs = [
        SlotDescriptor("table", type=Table, required=True),
        SlotDescriptor("labels", type=Table, required=True),
    ]

    def __init__(self, sel: Any, **kwds: Any) -> None:
        self._sel = sel
        super().__init__(**kwds)

    @process_slot("table", "labels")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        assert self.context
        with self.context as ctx:
            indices_t = ctx.table.created.next(length=step_size)  # returns a slice
            steps_t = indices_len(indices_t)
            ctx.table.clear_buffers()
            indices_l = ctx.labels.created.next(length=step_size)  # returns a slice
            steps_l = indices_len(indices_l)
            ctx.labels.clear_buffers()
            steps = steps_t + steps_l
            if steps == 0:
                return self._return_run_step(self.state_blocked, steps_run=0)
            if self.result is None:
                self.result = TableSelectedView(ctx.table.data(),
                                                ctx.labels.data().selection)
            else:
                self.selected.selection = ctx.labels.data().selection
            return self._return_run_step(self.next_state(ctx.table),
                                         steps_run=steps)

    def create_dependent_modules(self, mbkmeans: MBKMeans, data_module: Module,
                                 data_slot: str) -> None:
        with self.grouped():
            scheduler = self.scheduler()
            filter_ = FilterMod(expr=f"labels=={self._sel}", scheduler=scheduler)
            filter_.input.table = mbkmeans.output.labels
            self.filter = filter_
            self.input.labels = filter_.output.result
            self.input.table = data_module.output[data_slot]
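# Standalone sketch of the filtering idea used by MBKMeansFilter above: keep only
# the rows whose mini-batch k-means label matches a chosen value.  This is an
# illustration with plain pandas/scikit-learn, not the progressivis slot-based
# pipeline; the names X and SEL are hypothetical.
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(1000, 2)), columns=["_1", "_2"])
SEL = 3  # label to keep, analogous to the `sel` argument of MBKMeansFilter

mbk = MiniBatchKMeans(n_clusters=8, batch_size=100, n_init=3, random_state=0)
labels = mbk.fit_predict(X.values)
selected = X[labels == SEL]  # rows whose cluster label equals SEL
print(len(selected), "rows kept out of", len(X))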
def __init__(self, **kwds):
    self._add_slots(kwds, 'input_descriptors',
                    [SlotDescriptor('table', type=Table)])
    self._add_slots(
        kwds, 'output_descriptors',
        [SlotDescriptor('select', type=bitmap, required=False)])
    super(Sample, self).__init__(**kwds)
    self._tmp_table = Table(
        self.generate_table_name('sample'),
        dshape='{select: int64}',
        # scheduler=self.scheduler(),
        create=True)
    self._size = 0  # holds the size consumed from the input table so far
    self._bitmap = None
    self._table = None
def __init__(self, **kwds):
    self._add_slots(kwds, 'input_descriptors',
                    [SlotDescriptor('table', type=Table, required=True), ])
    self._kde = None
    self._json_cache = {}
    self._inserted = 0
    self._lately_inserted = 0
    super(KernelDensity, self).__init__(**kwds)
def __init__(self, filepath_or_buffer=None, filter_=None, force_valid_ids=True,
             fillvalues=None, timeout=None, save_context=None, recovery=0,
             recovery_table_size=3, save_step_size=100000, **kwds):
    self._add_slots(
        kwds, 'input_descriptors',
        [SlotDescriptor('filenames', type=Table, required=False)])
    super(CSVLoader, self).__init__(**kwds)
    self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
    kwds.setdefault('chunksize', self.default_step_size)
    # Filter out the module keywords from the csv loader keywords
    csv_kwds = self._filter_kwds(kwds, pd.read_csv)
    # When called with a specified chunksize, it returns a parser
    self.filepath_or_buffer = filepath_or_buffer
    self.force_valid_ids = force_valid_ids
    self.parser = None
    self.csv_kwds = csv_kwds
    self._compression = csv_kwds.get('compression', "infer")
    csv_kwds['compression'] = None
    self._encoding = csv_kwds.get('encoding', None)
    csv_kwds['encoding'] = None
    self._rows_read = 0
    if filter_ is not None and not callable(filter_):
        raise ProgressiveError('filter parameter should be callable or None')
    self._filter = filter_
    # stream that returns a position through the 'tell()' method
    self._input_stream = None
    self._input_encoding = None
    self._input_compression = None
    self._input_size = 0  # length of the file or input stream when available
    self._timeout = timeout
    self._table_params = dict(name=self.name, fillvalues=fillvalues)
    self._save_context = True if save_context is None and is_recoverable(
        filepath_or_buffer) else False
    self._recovery = recovery
    self._recovery_table_size = recovery_table_size
    self._recovery_table = None
    self._recovery_table_inv = None
    self._save_step_size = save_step_size
    self._last_saved_id = 0
    self._table = None
class Variable(Constant):
    inputs = [SlotDescriptor("like", type=(Table, PsDict), required=False)]

    def __init__(self, table: Optional[Table] = None, **kwds: Any) -> None:
        super(Variable, self).__init__(table, **kwds)
        self.tags.add(self.TAG_INPUT)

    async def from_input(self, input_: JSon) -> str:
        if not isinstance(input_, dict):
            raise ProgressiveError("Expecting a dictionary")
        if self.result is None and self.get_input_slot("like") is None:
            error = f"Variable {self.name} with no initial value and no input slot"
            logger.error(error)
            return error
        if self.result is None:
            error = f"Variable {self.name} has to run once before receiving input"
            logger.error(error)
            return error
        last: PsDict = copy.copy(self.psdict)
        error = ""
        for (k, v) in input_.items():
            if k in last:
                last[k] = v
            else:
                error += f"Invalid key {k} ignored. "
        await self.scheduler().for_input(self)
        self.psdict.update(last)
        return error

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if self.result is None:
            slot = self.get_input_slot("like")
            if slot is not None:
                like = slot.data()
                if like is not None:
                    if isinstance(like, Table):
                        last = like.last()
                        assert last is not None
                        like = last.to_dict(ordered=True)
                    self.result = copy.copy(like)
                    self._ignore_inputs = True
        return self._return_run_step(self.state_blocked, steps_run=1)
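# Minimal standalone sketch of the key-validation logic in Variable.from_input():
# only keys already present in the current state are accepted, unknown keys are
# reported in an error string.  Plain Python, no progressivis scheduler involved;
# `merge_input`, `current` and `update` are hypothetical names.
from typing import Any, Dict, Tuple

def merge_input(current: Dict[str, Any], update: Dict[str, Any]) -> Tuple[Dict[str, Any], str]:
    merged = dict(current)
    error = ""
    for k, v in update.items():
        if k in merged:
            merged[k] = v          # accept known key
        else:
            error += f"Invalid key {k} ignored. "
    return merged, error

state, err = merge_input({"xmin": 0.0, "xmax": 1.0}, {"xmax": 2.0, "bogus": 5})
print(state)  # {'xmin': 0.0, 'xmax': 2.0}
print(err)    # Invalid key bogus ignored.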
def __init__(self, table=None, **kwds):
    self._add_slots(kwds, 'input_descriptors',
                    [SlotDescriptor('like', type=Table, required=False)])
    super(Variable, self).__init__(table, **kwds)
class MBKMeans(TableModule):
    """
    Mini-batch k-means using the sklearn implementation.
    """
    parameters = [("samples", np.dtype(int), 50)]
    inputs = [
        SlotDescriptor("table", type=Table, required=True),
        SlotDescriptor("var", type=Table, required=True),
        SlotDescriptor("moved_center", type=PsDict, required=False),
    ]
    outputs = [
        SlotDescriptor("labels", type=Table, required=False),
        SlotDescriptor("conv", type=PsDict, required=False),
    ]

    def __init__(
        self,
        n_clusters: int,
        columns: Optional[List[str]] = None,
        batch_size: int = 100,
        tol: float = 0.01,
        is_input: bool = True,
        is_greedy: bool = True,
        random_state: Union[int, np.random.RandomState, None] = None,
        **kwds: Any,
    ):
        super().__init__(columns=columns, **kwds)
        self.mbk = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            verbose=True,
            tol=tol,
            random_state=random_state,
        )
        self.n_clusters = n_clusters
        self.default_step_size = 100
        self._labels: Optional[Table] = None
        self._remaining_inits = 10
        self._initialization_steps = 0
        self._is_input = is_input
        self._tol = tol
        self._conv_out = PsDict({"convergence": "unknown"})
        self.params.samples = n_clusters
        self._is_greedy: bool = is_greedy
        self._arrays: Optional[Dict[int, np.ndarray[Any, Any]]] = None
        # self.convergence_context = {}

    def predict_step_size(self, duration: float) -> int:
        p = super().predict_step_size(duration)
        return max(p, self.n_clusters)

    def reset(self, init: str = "k-means++") -> None:
        self.mbk = MiniBatchKMeans(
            n_clusters=self.mbk.n_clusters,
            batch_size=self.mbk.batch_size,
            init=init,
            random_state=self.mbk.random_state,
        )
        dfslot = self.get_input_slot("table")
        dfslot.reset()
        self.set_state(self.state_ready)
        # self.convergence_context = {}
        # do not resize result to zero
        # it contains 1 row per centroid
        if self._labels is not None:
            self._labels.truncate()

    def starting(self) -> None:
        super().starting()
        opt_slot = self.get_output_slot("labels")
        if opt_slot:
            logger.debug("Maintaining labels")
            self.maintain_labels(True)
        else:
            logger.debug("Not maintaining labels")
            self.maintain_labels(False)

    def maintain_labels(self, yes: bool = True) -> None:
        if yes and self._labels is None:
            self._labels = Table(
                self.generate_table_name("labels"),
                dshape="{labels: int64}",
                create=True,
            )
        elif not yes:
            self._labels = None

    def labels(self) -> Optional[Table]:
        return self._labels

    def get_data(self, name: str) -> Any:
        if name == "labels":
            return self.labels()
        if name == "conv":
            return self._conv_out
        return super().get_data(name)

    def is_greedy(self) -> bool:
        return self._is_greedy

    def _process_labels(self, locs: bitmap) -> None:
        labels = self.mbk.labels_
        assert self._labels is not None
        u_locs = locs & self._labels.index  # ids to update
        if not u_locs:  # shortcut
            self._labels.append({"labels": labels}, indices=locs)
            return
        a_locs = locs - u_locs  # ids to append
        if not a_locs:  # 2nd shortcut
            assert self._labels is not None
            return
        df = pd.DataFrame({"labels": labels}, index=locs)
        u_labels = df.loc[u_locs, "labels"]
        a_labels = df.loc[a_locs, "labels"]
        self._labels.loc[u_locs, "labels"] = u_labels
        self._labels.append({"labels": a_labels}, indices=a_locs)

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        dfslot = self.get_input_slot("table")
        # TODO varslot is only required if we have tol > 0
        varslot = self.get_input_slot("var")
        moved_center = self.get_input_slot("moved_center")
        init_centers = "k-means++"
        if moved_center is not None:
            if moved_center.has_buffered():
                print("Moved center!!")
                moved_center.clear_buffers()
                msg = moved_center.data()
                for c in msg:
                    self.set_centroid(c, msg[c][:2])
                init_centers = self.mbk.cluster_centers_
                self.reset(init=init_centers)
                dfslot.clear_buffers()  # No need to re-reset next
                varslot.clear_buffers()
        if dfslot.has_buffered() or varslot.has_buffered():
            logger.debug("has deleted or updated, resetting")
            self.reset(init=init_centers)
            dfslot.clear_buffers()
            varslot.clear_buffers()
        # print('dfslot has buffered %d elements'% dfslot.created_length())
        input_df = dfslot.data()
        var_data = varslot.data()
        batch_size = self.mbk.batch_size or 100
        if (input_df is None or var_data is None
                or len(input_df) < max(self.mbk.n_clusters, batch_size)):
            # Not enough data yet ...
            return self._return_run_step(self.state_blocked, 0)
        cols = self.get_columns(input_df, "table")
        dtype = input_df.columns_common_dtype(cols)
        n_features = len(cols)
        n_samples = len(input_df)
        if self._arrays is None:
            def _array_factory() -> np.ndarray[Any, Any]:
                return np.empty((self._key, n_features), dtype=dtype)
            self._arrays = defaultdict(_array_factory)
        is_conv = False
        if self._tol > 0:
            # v = np.array(list(var_data.values()), dtype=np.float64)
            # tol = np.mean(v) * self._tol
            prev_centers = np.zeros((self.n_clusters, n_features), dtype=dtype)
        else:
            # tol = 0
            prev_centers = np.zeros(0, dtype=dtype)
        random_state = check_random_state(self.mbk.random_state)
        X: Optional[np.ndarray[Any, Any]] = None
        # Attributes to monitor the convergence
        self.mbk._ewa_inertia = None
        self.mbk._ewa_inertia_min = None
        self.mbk._no_improvement = 0
        for iter_ in range(step_size):
            mb_ilocs = random_state.randint(0, n_samples, batch_size)
            mb_locs = input_df.index[mb_ilocs]
            self._key = len(mb_locs)
            arr = self._arrays[self._key]
            X = input_df.to_array(columns=cols, locs=mb_locs, ret=arr)
            if hasattr(self.mbk, "cluster_centers_"):
                prev_centers[:, :] = self.mbk.cluster_centers_
            self.mbk.partial_fit(X)
            if self._labels is not None:
                self._process_labels(mb_locs)
            centers = self.mbk.cluster_centers_
            nearest_center, batch_inertia = self.mbk.labels_, self.mbk.inertia_
            k = centers.shape[0]
            squared_diff = 0.0
            for ci in range(k):
                center_mask = nearest_center == ci
                if np.count_nonzero(center_mask) > 0:
                    diff = centers[ci].ravel() - prev_centers[ci].ravel()
                    squared_diff += np.dot(diff, diff)  # type: ignore
            if self.mbk._mini_batch_convergence(iter_, step_size, n_samples,
                                                squared_diff, batch_inertia):
                is_conv = True
                break
        if self.result is None:
            assert X is not None
            dshape = dshape_from_columns(input_df, cols,
                                         dshape_from_dtype(X.dtype))
            self.result = Table(self.generate_table_name("centers"),
                                dshape=dshape, create=True)
            self.result.resize(self.mbk.cluster_centers_.shape[0])
        self.psdict[cols] = self.mbk.cluster_centers_  # type: ignore
        if is_conv:
            return self._return_run_step(self.state_blocked, iter_)
        return self._return_run_step(self.state_ready, iter_)

    def to_json(self, short: bool = False, with_speed: bool = True) -> JSon:
        json = super().to_json(short, with_speed)
        if short:
            return json
        return self._centers_to_json(json)

    def _centers_to_json(self, json: JSon) -> JSon:
        json["cluster_centers"] = self.table.to_json()
        return json

    def set_centroid(self, c: int, values: List[float]) -> List[float]:
        try:
            c = int(c)
        except ValueError:
            pass
        centroids = self.table
        # idx = centroids.id_to_index(c)
        dfslot = self.get_input_slot("table")
        input_df = dfslot.data()
        columns = self.get_columns(input_df, "table")
        if len(values) != len(columns):
            raise ProgressiveError(
                f"Expected {len(columns)} values, received {values}")
        centroids.loc[c, columns] = values
        # TODO unpack the table
        centers = centroids.loc[c, columns]
        assert isinstance(centers, BaseTable)
        self.mbk.cluster_centers_[c] = list(centers)
        return self.mbk.cluster_centers_.tolist()

    def create_dependent_modules(self, input_module: Module,
                                 input_slot: str = "result") -> None:
        with self.grouped():
            s = self.scheduler()
            self.input_module = input_module
            self.input.table = input_module.output[input_slot]
            self.input_slot = input_slot
            c = DynVar(group=self.name, scheduler=s)
            self.moved_center = c
            self.input.moved_center = c.output.result
            v = Var(group=self.name, scheduler=s)
            self.variance = v
            v.input.table = input_module.output[input_slot]
            self.input.var = v.output.result
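# Standalone sketch of the incremental training loop that MBKMeans.run_step()
# performs on each scheduler step: repeatedly draw a random mini-batch, call
# MiniBatchKMeans.partial_fit(), and track how far the centroids moved.  This
# uses only numpy/scikit-learn and a simple tolerance test instead of the
# internal sklearn convergence helper used above; sizes are arbitrary.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
data = rng.normal(size=(10_000, 2))           # stand-in for the input table
mbk = MiniBatchKMeans(n_clusters=8, batch_size=100, n_init=3, random_state=0)
tol = 1e-4

prev_centers = None
for step in range(200):                       # analogous to `step_size` iterations
    batch = data[rng.randint(0, len(data), 100)]
    mbk.partial_fit(batch)                    # incremental update on one mini-batch
    centers = mbk.cluster_centers_
    if prev_centers is not None:
        squared_diff = float(((centers - prev_centers) ** 2).sum())
        if squared_diff < tol:                # centroids barely moved: stop early
            print("converged at step", step)
            break
    prev_centers = centers.copy()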
class KernelDensity(TableModule):
    parameters = [
        ("samples", np.dtype(object), 1),
        ("bins", np.dtype(int), 1),
        ("threshold", np.dtype(int), 1000),
        ("knn", np.dtype(int), 100),
    ]
    inputs = [SlotDescriptor("table", type=Table, required=True)]

    def __init__(self, **kwds: Any) -> None:
        self._kde: Optional[KNNKernelDensity] = None
        self._json_cache: JSon = {}
        self._inserted: int = 0
        self._lately_inserted: int = 0
        super(KernelDensity, self).__init__(**kwds)
        self.tags.add(self.TAG_VISUALIZATION)

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        dfslot = self.get_input_slot("table")
        assert dfslot is not None
        if dfslot.deleted.any():
            raise ValueError("Not implemented yet")
        if not dfslot.created.any():
            return self._return_run_step(self.state_blocked, steps_run=0)
        indices = dfslot.created.next(length=step_size, as_slice=False)
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=0)
        if self._kde is None:
            self._kde = KNNKernelDensity(dfslot.data(), online=True)
        res = self._kde.run_ids(indices.to_array())
        self._inserted += res["numPointsInserted"]
        self._lately_inserted += steps
        samples = self.params.samples
        sample_num = self.params.bins
        threshold = self.params.threshold
        knn = self.params.knn
        if self._lately_inserted > threshold:
            scores = self._kde.score_samples(samples.astype(np.float32), k=knn)
            self._lately_inserted = 0
            self._json_cache = {
                "points": np.array(
                    dfslot.data().loc[:500, :].to_dict(orient="split")["data"]
                ).tolist(),
                "bins": sample_num,
                "inserted": self._inserted,
                "total": len(dfslot.data()),
                "samples": [
                    (sample, score)
                    for sample, score in zip(samples.tolist(), scores.tolist())  # type: ignore
                ],
            }
        return self._return_run_step(self.state_ready, steps_run=steps)

    def get_visualization(self) -> Optional[str]:
        return "knnkde"

    def to_json(self, short: bool = False, with_speed: bool = True) -> JSon:
        json = super(KernelDensity, self).to_json(short, with_speed)
        if short:
            return json
        return self.knnkde_to_json(json)

    def knnkde_to_json(self, json: JSon) -> JSon:
        json.update(self._json_cache)
        return json
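# The module above relies on KNNKernelDensity (an approximate, online KDE).  As a
# rough standalone analogue, the same "insert points, then score a grid of sample
# locations" pattern can be written with sklearn.neighbors.KernelDensity; exact
# KDE is slower, but the data flow is the same.  The grid size and bandwidth
# below are arbitrary illustrative choices.
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
points = rng.normal(size=(5_000, 2)).astype(np.float32)    # inserted points
gx, gy = np.meshgrid(np.linspace(-3, 3, 32), np.linspace(-3, 3, 32))
samples = np.column_stack([gx.ravel(), gy.ravel()])        # sample locations

kde = KernelDensity(kernel="gaussian", bandwidth=0.3).fit(points)
scores = kde.score_samples(samples)                        # log-density per sample
print(samples.shape, scores.shape)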
class SimpleCSVLoader(TableModule):
    inputs = [SlotDescriptor("filenames", type=Table, required=False)]

    def __init__(self,
                 filepath_or_buffer: Optional[Any] = None,
                 filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
                 force_valid_ids: bool = True,
                 fillvalues: Optional[Dict[str, Any]] = None,
                 throttle: Union[bool, int, float] = False,
                 **kwds: Any) -> None:
        super().__init__(**kwds)
        self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
        kwds.setdefault("chunksize", self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds: Dict[str, Any] = filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        if throttle and isinstance(throttle, integer_types + (float, )):
            self.throttle = throttle
        else:
            self.throttle = False
        self.parser: Optional[pd.TextReader] = None
        self.csv_kwds = csv_kwds
        self._compression: Any = csv_kwds.get("compression", "infer")
        csv_kwds["compression"] = None
        self._encoding: Any = csv_kwds.get("encoding", None)
        csv_kwds["encoding"] = None
        self._nrows = csv_kwds.get("nrows")
        csv_kwds["nrows"] = None  # nrows clashes with chunksize
        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError("filter parameter should be callable or None")
        self._filter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = filter_
        # stream that returns a position through the 'tell()' method
        self._input_stream: Optional[io.IOBase] = None
        self._input_encoding: Optional[str] = None
        self._input_compression: Optional[str] = None
        self._input_size = 0  # length of the file or input stream when available
        self._file_mode = False
        self._table_params: Dict[str, Any] = dict(name=self.name,
                                                  fillvalues=fillvalues)

    def rows_read(self) -> int:
        return self._rows_read

    def is_ready(self) -> bool:
        if self.has_input_slot("filenames"):
            # Can be called before the first update so fn.created can be None
            fn = self.get_input_slot("filenames")
            if fn.created is None or fn.created.any():
                return True
        return super().is_ready()

    def is_data_input(self) -> bool:
        # pylint: disable=no-self-use
        "Return True if this module brings new data"
        return True

    def open(self, filepath: Any) -> io.IOBase:
        if self._input_stream is not None:
            self.close()
        compression: Optional[str] = _infer_compression(filepath,
                                                        self._compression)
        istream: io.IOBase
        encoding: Optional[str]
        size: int
        (istream, encoding, compression, size) = filepath_to_buffer(
            filepath, encoding=self._encoding, compression=compression)
        self._input_stream = istream
        self._input_encoding = encoding
        self._input_compression = compression
        self._input_size = size
        self.csv_kwds["encoding"] = encoding
        self.csv_kwds["compression"] = compression
        return istream

    def close(self) -> None:
        if self._input_stream is None:
            return
        try:
            self._input_stream.close()
        # pylint: disable=bare-except
        except Exception:
            pass
        self._input_stream = None
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0

    def get_progress(self) -> Tuple[int, int]:
        if self._input_size == 0:
            return (0, 0)
        if self._input_stream is None:
            return (0, 0)
        pos = self._input_stream.tell()
        return (pos, self._input_size)

    def validate_parser(self, run_number: int) -> ModuleState:
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                try:
                    self.parser = pd.read_csv(self.open(self.filepath_or_buffer),
                                              **self.csv_kwds)
                except IOError as e:
                    logger.error("Cannot open file %s: %s",
                                 self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
                self._file_mode = True
            else:
                if not self.has_input_slot("filenames"):
                    return self.state_terminated
                fn_slot = self.get_input_slot("filenames")
                if fn_slot.output_module is None:
                    return self.state_terminated
                fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError("Cannot handle input file changes")
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(length=1)
                    assert isinstance(indices, slice)
                    if indices.stop == indices.start:
                        return self.state_blocked
                    filename = df.at[indices.start, "filename"]
                    try:
                        self.parser = pd.read_csv(self.open(filename),
                                                  **self.csv_kwds)
                    except IOError as e:
                        logger.error("Cannot open file %s: %s", filename, e)
                        self.parser = None
                    # fall through
        return self.state_ready

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        if self.throttle:
            step_size = np.min([self.throttle, step_size])  # type: ignore
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise ProgressiveStopIteration("no more filenames")
        elif status == self.state_blocked:
            return self._return_run_step(status, steps_run=0)
        elif status != self.state_ready:
            logger.error("Invalid state returned by validate_parser: %d", status)
            self.close()
            raise ProgressiveStopIteration("Unexpected situation")
        logger.info("loading %d lines", step_size)
        try:
            assert self.parser
            df: pd.DataFrame = self.parser.read(
                step_size)  # raises StopIteration at EOF
        except StopIteration:
            self.close()
            if self.has_input_slot("filenames"):
                fn_slot = self.get_input_slot("filenames")
                if (fn_slot is None
                        or fn_slot.output_module is None) and not self._file_mode:
                    raise
            self.parser = None
            return self._return_run_step(self.state_ready, steps_run=0)
        creates = len(df)
        if creates == 0:  # should not happen
            logger.error("Received 0 elements")
            raise ProgressiveStopIteration
        if self._filter is not None:
            df = self._filter(df)
            creates = len(df)
        if creates == 0:
            logger.info("frame has been filtered out")
        else:
            self._rows_read += creates
            logger.info("Loaded %d lines", self._rows_read)
            if self.force_valid_ids:
                force_valid_id_columns(df)
            if self.result is None:
                self._table_params["name"] = self.generate_table_name("table")
                self._table_params["dshape"] = dshape_from_dataframe(df)
                self._table_params["data"] = df
                self._table_params["create"] = True
                self.result = Table(**self._table_params)
            else:
                self.table.append(df)
        return self._return_run_step(self.state_ready, steps_run=creates)
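# Standalone sketch of the chunked parsing that SimpleCSVLoader builds on: calling
# pd.read_csv() with a `chunksize` returns a parser object from which fixed-size
# blocks can be pulled until EOF raises StopIteration.  The file name "data.csv"
# is a placeholder.
import pandas as pd

parser = pd.read_csv("data.csv", chunksize=1000)  # returns a TextFileReader, not a DataFrame
total = 0
try:
    while True:
        df = parser.get_chunk(1000)   # one progressive "step"
        total += len(df)
except StopIteration:
    pass
print("rows loaded:", total)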
class Sample(TableModule):
    parameters = [("samples", np.dtype(int), 50)]
    inputs = [SlotDescriptor("table", type=Table)]
    outputs = [SlotDescriptor("select", type=bitmap, required=False)]

    def __init__(self, required: str = "result", **kwds: Any) -> None:
        assert required in ("result", "select")
        super(Sample, self).__init__(output_required=(required == "result"),
                                     **kwds)
        if required == "select":
            # Change the descriptor so it becomes required.
            # The original SD is kept in the shared outputs/all_outputs
            # class variables
            sd = SlotDescriptor("select", type=Table, required=True)
            self.output_descriptors["select"] = sd
        self._tmp_table = Table(self.generate_table_name("sample"),
                                dshape="{select: int64}", create=True)
        self._size = 0  # holds the size consumed from the input table so far
        self._bitmap: Optional[bitmap] = None
        self.result: Optional[TableSelectedView] = None

    def reset(self) -> None:
        self._tmp_table.resize(0)
        self._size = 0
        self._bitmap = None
        slot = self.get_input_slot("table")
        if slot is not None:
            slot.reset()

    def get_data(self, name: str) -> Any:
        if name == "select":
            return self.get_bitmap()
        if self.result is not None:
            self.result.selection = self.get_bitmap()
        return super(Sample, self).get_data(name)

    def get_bitmap(self) -> bitmap:
        if self._bitmap is None:
            len_ = len(self._tmp_table["select"])
            # Avoid "ValueError: Iteration of zero-sized operands is not enabled"
            self._bitmap = bitmap(self._tmp_table["select"]) if len_ else bitmap()
        return self._bitmap

    @process_slot("table", reset_if="delete", reset_cb="reset")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        assert self.context
        with self.context as ctx:
            if self.result is None:
                self.result = TableSelectedView(ctx.table.data(), bitmap([]))
            indices = ctx.table.created.next(length=step_size, as_slice=False)
            steps = indices_len(indices)
            k = int(self.params.samples)
            reservoir = self._tmp_table
            res = reservoir["select"]
            size = self._size  # cache in local variable
            if size < k:
                logger.info("Filling the reservoir %d/%d", size, k)
                # fill the reservoir array until it contains k elements
                rest = indices.pop(k - size)
                reservoir.append({"select": rest})
                size = len(reservoir)
            if len(indices) == 0:
                # nothing else to do
                self._size = size
                if steps:
                    self._bitmap = None
                return self._return_run_step(self.state_blocked, steps_run=steps)
            t = 4 * k
            # Threshold (t) determines when to start fast sampling
            # logic. The optimal value for (t) may vary depending on RNG
            # performance characteristics.
            if size < t and len(indices) != 0:
                logger.info("Normal sampling from %d to %d", size, t)
                while size < t and len(indices) != 0:
                    # Normal reservoir sampling is fastest up to (t) samples
                    j = np.random.randint(size)
                    if j < k:
                        res[j] = indices.pop()[0]
                    size += 1
            if len(indices) == 0:
                self._size = size
                if steps:
                    self._bitmap = None
                return self._return_run_step(self.state_blocked, steps_run=steps)
            logger.info("Fast sampling with %d indices", len(indices))
            while indices:
                # draw gap size (g) from geometric distribution
                # with probability p = k / size
                p = k / size
                u = np.random.rand()
                g = int(np.floor(np.log(u) / np.log(1 - p)))
                # advance over the gap, and assign next element to the reservoir
                if (g + 1) < len(indices):
                    j = np.random.randint(k)
                    res[j] = indices[g]
                    indices.pop(g + 1)
                    size += g + 1
                else:
                    size += len(indices)
                    break
            self._size = size
            if steps:
                self._bitmap = None
            return self._return_run_step(self.state_blocked, steps_run=steps)
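# Standalone sketch of the classic reservoir sampling scheme ("Algorithm R") that
# Sample.run_step() implements progressively (plus a geometric-skip fast path for
# large inputs).  Every element of the stream ends up in the k-sized reservoir
# with equal probability k/n; `reservoir_sample` is a hypothetical helper name.
import numpy as np

def reservoir_sample(stream, k, seed=0):
    rng = np.random.RandomState(seed)
    reservoir = []
    for i, item in enumerate(stream):
        if i < k:
            reservoir.append(item)          # fill phase
        else:
            j = rng.randint(0, i + 1)       # keep item with probability k/(i+1)
            if j < k:
                reservoir[j] = item
    return reservoir

print(reservoir_sample(range(100_000), k=50))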
class BlobsTableABC(TableModule):
    """Isotropic Gaussian blobs => table

    The purpose of the "reservoir" approach is to ensure the
    reproducibility of the results
    """
    outputs = [SlotDescriptor("labels", type=Table, required=False)]
    kw_fun: Optional[Callable[..., Any]] = None

    def __init__(
        self,
        columns: Union[int, List[str], np.ndarray[Any, Any]],
        rows: int = -1,
        dtype: npt.DTypeLike = np.float64,
        seed: int = 0,
        throttle: Union[int, bool, float] = False,
        **kwds: Any,
    ) -> None:
        super().__init__(**kwds)
        self.tags.add(self.TAG_SOURCE)
        dtype = dshape_from_dtype(np.dtype(dtype))
        self._kwds = {}  # FIXME
        """assert 'centers' in self._kwds
        assert 'n_samples' not in self._kwds
        assert 'n_features' not in self._kwds
        assert 'random_state' not in self._kwds"""
        # self._kwds['n_samples'] = rows
        # self._kwds['n_features']
        self.default_step_size = 1000
        self.columns: Union[List[str], np.ndarray[Any, Any]]
        if isinstance(columns, integer_types):
            self.columns = [f"_{i}" for i in range(1, columns + 1)]
            # self._kwds['n_features'] = columns
        elif isinstance(columns, (list, np.ndarray)):
            self.columns = columns
            # self._kwds['n_features'] = len(columns)
        else:
            raise ProgressiveError("Invalid type for columns")
        self.rows = rows
        self.seed = seed
        self._reservoir: Optional[
            Tuple[np.ndarray[Any, Any], np.ndarray[Any, Any]]
        ] = None
        self._labels: Optional[Table] = None
        self._reservoir_idx = 0
        if throttle and isinstance(throttle, integer_types + (float,)):
            self.throttle: Union[int, bool, float] = throttle
        else:
            self.throttle = False
        dshape = ", ".join([f"{col}: {dtype}" for col in self.columns])
        dshape = "{" + dshape + "}"
        table = Table(self.generate_table_name("table"), dshape=dshape,
                      create=True)
        self.result = table
        self.columns = table.columns

    def starting(self) -> None:
        super().starting()
        opt_slot = self.get_output_slot("labels")
        if opt_slot:
            logger.debug("Maintaining labels")
            self.maintain_labels(True)
        else:
            logger.debug("Not maintaining labels")
            self.maintain_labels(False)

    def maintain_labels(self, yes: bool = True) -> None:
        if yes and self._labels is None:
            self._labels = Table(
                self.generate_table_name("blobs_labels"),
                dshape="{labels: int64}",
                create=True,
            )
        elif not yes:
            self._labels = None

    def labels(self) -> Optional[Table]:
        return self._labels

    def get_data(self, name: str) -> Any:
        if name == "labels":
            return self.labels()
        return super().get_data(name)

    @abstractmethod
    def fill_reservoir(self) -> None:
        pass

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if step_size == 0:
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        logger.info("generating %d lines", step_size)
        if self.throttle:
            step_size = np.min([self.throttle, step_size])  # type: ignore
        if self.rows >= 0 and (len(self.table) + step_size) > self.rows:
            step_size = self.rows - len(self.table)
            logger.info("truncating to %d lines", step_size)
            if step_size <= 0:
                raise ProgressiveStopIteration
        if self._reservoir is None:
            self.fill_reservoir()
        steps = int(step_size)
        while steps > 0:
            assert self._reservoir
            level = len(self._reservoir[0]) - self._reservoir_idx
            assert level >= 0
            if steps >= level:
                blobs_dict, y_ = xy_to_dict(
                    *self._reservoir, self._reservoir_idx, None, self.columns
                )
                steps -= level
                # reservoir was emptied so:
                self.fill_reservoir()
            else:  # steps < level
                blobs_dict, y_ = xy_to_dict(
                    *self._reservoir, self._reservoir_idx, steps, self.columns
                )
                self._reservoir_idx += steps
                steps = 0
            self.table.append(blobs_dict)
            if self._labels is not None:
                self._labels.append({"labels": y_})
        if len(self.table) == self.rows:
            next_state = self.state_zombie
        elif self.throttle:
            next_state = self.state_blocked
        else:
            next_state = self.state_ready
        return self._return_run_step(next_state, steps_run=step_size)
class CSVLoader(TableModule):
    """
    Warning: this module does not wait for "filenames"
    """
    inputs = [SlotDescriptor("filenames", type=Table, required=False)]

    def __init__(
        self,
        filepath_or_buffer: Optional[Any] = None,
        filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
        force_valid_ids: bool = True,
        fillvalues: Optional[Dict[str, Any]] = None,
        as_array: Optional[Any] = None,
        timeout: Optional[float] = None,
        save_context: Optional[Any] = None,  # FIXME seems more like a bool
        recovery: int = 0,  # FIXME seems more like a bool
        recovery_tag: Union[str, int] = "",
        recovery_table_size: int = 3,
        save_step_size: int = 100000,
        **kwds: Any,
    ) -> None:
        super(CSVLoader, self).__init__(**kwds)
        self.tags.add(self.TAG_SOURCE)
        self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
        kwds.setdefault("chunksize", self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds = filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.parser: Optional[Parser] = None
        self.csv_kwds = csv_kwds
        self._compression = csv_kwds.get("compression", "infer")
        csv_kwds["compression"] = None
        self._encoding = csv_kwds.get("encoding", None)
        csv_kwds["encoding"] = None
        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError("filter parameter should be callable or None")
        self._filter = filter_
        # self._input_stream: Optional[Any] = (
        #     None  # stream that returns a position through the 'tell()' method
        # )
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0  # length of the file or input stream when available
        self._timeout_csv = timeout
        self._table_params: Dict[str, Any] = dict(name=self.name,
                                                  fillvalues=fillvalues)
        self._as_array = as_array
        self._save_context = (True if save_context is None
                              and is_recoverable(filepath_or_buffer) else False)
        self._recovery = recovery
        self._recovery_table_size = recovery_table_size
        self._recovery_table: Optional[Table] = None
        self._recovery_table_name = f"csv_loader_recovery_{recovery_tag}"
        self._recovery_table_inv: Optional[Table] = None
        self._recovery_table_inv_name = f"csv_loader_recovery_invariant_{recovery_tag}"
        self._save_step_size = save_step_size
        self._last_saved_id = 0
        if self._recovery and not self.recovery_tables_exist():
            self._recovery = False
        if not self._recovery:
            self.trunc_recovery_tables()

    def recovery_tables_exist(self) -> bool:
        try:
            Table(name=self._recovery_table_name, create=False)
        except ValueError as ve:
            if "exist" in ve.args[0]:
                print("WARNING: recovery table does not exist")
                return False
            raise
        try:
            Table(name=self._recovery_table_inv_name, create=False)
        except Exception as ve:  # FIXME JDF: is that the right way?
            if "exist" in ve.args[0]:  # FIXME
                print("WARNING: recovery table invariant does not exist")
                return False
            raise
        return True

    def trunc_recovery_tables(self) -> None:
        len_ = 0
        rt: Optional[Table] = None
        try:
            rt = Table(name=self._recovery_table_name, create=False)
            len_ = len(rt)
        except Exception:
            pass
        if len_ and rt is not None:
            rt.drop(slice(None, None, None), truncate=True)
        len_ = 0
        try:
            rt = Table(name=self._recovery_table_inv_name, create=False)
            len_ = len(rt)
        except Exception:
            pass
        if len_ and rt is not None:
            rt.drop(slice(None, None, None), truncate=True)

    def rows_read(self) -> int:
        "Return the number of rows read so far."
        return self._rows_read

    def is_ready(self) -> bool:
        if self.has_input_slot("filenames"):
            fn = self.get_input_slot("filenames")
            if fn.created is None or fn.created.any():
                return True
        return super(CSVLoader, self).is_ready()

    def is_data_input(self) -> bool:
        # pylint: disable=no-self-use
        "Return True if this module brings new data"
        return True

    def create_input_source(self, filepath: str) -> InputSource:
        usecols = self.csv_kwds.get("usecols")
        return InputSource.create(
            filepath,
            encoding=self._encoding,
            compression=self._compression,
            timeout=self._timeout_csv,
            start_byte=0,
            usecols=usecols,
        )

    def close(self) -> None:
        # if self._input_stream is None:
        #     return
        # try:
        #     self._input_stream.close()
        # # pylint: disable=bare-except
        # except Exception:
        #     pass
        # self._input_stream = None
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0

    def get_progress(self) -> Tuple[int, int]:
        if self._input_size == 0:
            return (0, 0)
        pos = 0  # self._input_stream.tell()
        return (pos, self._input_size)

    def validate_parser(self, run_number: int) -> ModuleState:
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                if not self._recovery:
                    try:
                        self.parser = read_csv(
                            self.create_input_source(self.filepath_or_buffer),
                            **self.csv_kwds,
                        )
                    except IOError as e:
                        logger.error("Cannot open file %s: %s",
                                     self.filepath_or_buffer, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None
                else:  # do recovery
                    try:
                        if self._recovery_table is None:
                            self._recovery_table = Table(
                                name=self._recovery_table_name, create=False)
                        if self._recovery_table_inv is None:
                            self._recovery_table_inv = Table(
                                name=self._recovery_table_inv_name, create=False)
                        if self.result is None:
                            self._table_params["name"] = self._recovery_table_inv[
                                "table_name"].loc[0]
                            self._table_params["create"] = False
                            table = Table(**self._table_params)
                            self.result = table
                            table.last_id
                    except Exception as e:  # TODO: specify the exception?
                        logger.error(f"Cannot access recovery table {e}")
                        return self.state_terminated
                    table = self.table
                    try:
                        last_ = self._recovery_table.eval(
                            "last_id=={}".format(len(table)), as_slice=False)
                        len_last = len(last_)
                        if len_last > 1:
                            logger.error("Inconsistent recovery table")
                            return self.state_terminated
                        # last_ = self._recovery_table.argmax()['offset']
                        snapshot: Optional[Dict[str, Any]] = None
                        if len_last == 1:
                            row = self._recovery_table.row(last_[0])
                            assert row is not None
                            snapshot = row.to_dict(ordered=True)
                            if not check_snapshot(snapshot):
                                snapshot = None
                        if snapshot is None:
                            # i.e. snapshot not yet found or inconsistent
                            max_ = -1
                            for i in self._recovery_table.eval(
                                    "last_id<{}".format(len(table)),
                                    as_slice=False):
                                row = self._recovery_table.row(i)
                                assert row is not None
                                sn: Dict[str, Any] = row.to_dict(ordered=True)
                                if check_snapshot(sn) and sn["last_id"] > max_:
                                    max_, snapshot = sn["last_id"], sn
                            if max_ < 0:
                                # logger.error('Cannot access recovery table (max_<0)')
                                return self.state_terminated
                            table.drop(slice(max_ + 1, None, None), truncate=True)
                        assert snapshot
                        self._recovered_csv_table_name = snapshot["table_name"]
                    except Exception as e:
                        logger.error("Cannot read the snapshot %s", e)
                        return self.state_terminated
                    try:
                        self.parser = recovery(snapshot, self.filepath_or_buffer,
                                               **self.csv_kwds)
                    except Exception as e:
                        logger.error("Cannot recover from snapshot %s, %s",
                                     snapshot, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None
            else:  # this case does not support recovery
                fn_slot = None
                if self.has_input_slot("filenames"):
                    fn_slot = self.get_input_slot("filenames")
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                # fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError("Cannot handle input file changes")
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(length=1)
                    assert isinstance(indices, slice)
                    if indices.stop == indices.start:
                        return self.state_blocked
                    filename = df.at[indices.start, "filename"]
                    try:
                        self.parser = read_csv(
                            self.create_input_source(filename), **self.csv_kwds)
                    except IOError as e:
                        logger.error("Cannot open file %s: %s", filename, e)
                        self.parser = None
                    # fall through
        return self.state_ready

    def _data_as_array(self, df: pd.DataFrame) -> Tuple[Any, DataShape]:
        if not self._as_array:
            return (df, dshape_from_dataframe(df))
        if callable(self._as_array):
            self._as_array = self._as_array(list(df.columns))  # FIXME
        if isinstance(self._as_array, str):
            data = df.values
            dshape = array_dshape(data, self._as_array)
            return ({self._as_array: data}, dshape)
        if not isinstance(self._as_array, dict):
            raise ValueError(
                f"Unexpected parameter specified to as_array: {self._as_array}")
        columns = set(df.columns)
        ret = {}
        for colname, cols in self._as_array.items():
            if colname in ret:
                raise KeyError(f"Duplicate column {colname} in as_array")
            colset = set(cols)
            assert colset.issubset(columns)
            columns -= colset
            view = df[cols]
            values = view.values
            ret[colname] = values
        for colname in columns:
            if colname in ret:
                raise KeyError(f"Duplicate column {colname} in as_array")
            ret[colname] = df[colname].values
        return ret, dshape_from_dict(ret)

    def _needs_save(self) -> bool:
        table = self.table
        if table is None:
            return False
        return table.last_id >= self._last_saved_id + self._save_step_size

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise ProgressiveStopIteration("no more filenames")
        elif status == self.state_blocked:
            return self._return_run_step(status, steps_run=0)
        elif status != self.state_ready:
            logger.error("Invalid state returned by validate_parser: %d", status)
            self.close()
            raise ProgressiveStopIteration("Unexpected situation")
        logger.info("loading %d lines", step_size)
        needs_save = self._needs_save()
        assert self.parser
        df_list: List[pd.DataFrame]
        try:
            df_list = self.parser.read(
                step_size, flush=needs_save)  # raises StopIteration at EOF
            if not df_list:
                raise ProgressiveStopIteration
        except ProgressiveStopIteration:
            self.close()
            if self.has_input_slot("filenames"):
                fn_slot = self.get_input_slot("filenames")
                assert fn_slot.output_module is not None
            self.parser = None
            return self._return_run_step(self.state_ready, 0)
        df_len = sum([len(df) for df in df_list])
        creates = df_len
        if creates == 0:  # should not happen
            logger.error("Received 0 elements")
            raise ProgressiveStopIteration
        if self._filter is not None:
            df_list = [self._filter(df) for df in df_list]
            creates = sum([len(df) for df in df_list])
        if creates == 0:
            logger.info("frame has been filtered out")
        else:
            self._rows_read += creates
            logger.info("Loaded %d lines", self._rows_read)
            if self.force_valid_ids:
                for df in df_list:
                    force_valid_id_columns(df)
            if self.result is None:
                table = self.table
                data, dshape = self._data_as_array(pd.concat(df_list))
                if not self._recovery:
                    self._table_params["name"] = self.generate_table_name("table")
                    self._table_params["data"] = data
                    self._table_params["dshape"] = dshape
                    self._table_params["create"] = True
                    self.result = Table(**self._table_params)
                else:
                    self._table_params["name"] = self._recovered_csv_table_name
                    # self._table_params['dshape'] = dshape
                    self._table_params["create"] = False
                    table = Table(**self._table_params)
                    self.result = table
                    table.append(self._data_as_array(pd.concat(df_list)))
            else:
                table = self.table
                for df in df_list:
                    data, dshape = self._data_as_array(df)
                    table.append(data)
            if (self.parser.is_flushed() and needs_save
                    and self._recovery_table is None and self._save_context):
                table = self.table
                snapshot = self.parser.get_snapshot(
                    run_number=run_number,
                    table_name=table.name,
                    last_id=table.last_id,
                )
                self._recovery_table = Table(
                    name=self._recovery_table_name,
                    data=pd.DataFrame(snapshot, index=[0]),
                    create=True,
                )
                self._recovery_table_inv = Table(
                    name=self._recovery_table_inv_name,
                    data=pd.DataFrame(
                        dict(
                            table_name=table.name,
                            csv_input=self.filepath_or_buffer,
                        ),
                        index=[0],
                    ),
                    create=True,
                )
                self._last_saved_id = table.last_id
            elif self.parser.is_flushed() and needs_save and self._save_context:
                snapshot = self.parser.get_snapshot(
                    run_number=run_number,
                    last_id=table.last_id,
                    table_name=table.name,
                )
                assert self._recovery_table
                self._recovery_table.add(snapshot)
                if len(self._recovery_table) > self._recovery_table_size:
                    oldest = self._recovery_table.argmin()["offset"]
                    self._recovery_table.drop(oldest)
                self._last_saved_id = table.last_id
        return self._return_run_step(self.state_ready, steps_run=creates)
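# Standalone sketch of the snapshot/recovery idea used by CSVLoader: periodically
# record how far the load has progressed, and after a crash resume parsing from
# that point instead of reloading the whole file.  The real module keeps byte
# offsets and parser state in dedicated recovery tables; this simplified version
# just counts rows and uses pandas' skiprows.  "data.csv" and `load_chunk` are
# placeholders.
import pandas as pd

def load_chunk(path, rows_done, nrows=1000):
    """Read the next `nrows` data rows after the first `rows_done` ones."""
    return pd.read_csv(
        path,
        nrows=nrows,
        skiprows=range(1, rows_done + 1),   # skip rows already loaded, keep the header
    )

rows_done = 0
df = load_chunk("data.csv", rows_done)      # first batch
rows_done += len(df)                        # this counter plays the role of the snapshot
# ... process restarts; rows_done was persisted somewhere ...
df = load_chunk("data.csv", rows_done)      # resume without re-appending earlier rows
rows_done += len(df)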