Example 1
    def __init__(self,
                 n_clusters,
                 columns=None,
                 batch_size=100,
                 tol=0.0,
                 is_input=True,
                 random_state=None,
                 **kwds):
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('table', type=Table, required=True)])
        self._add_slots(kwds, 'output_descriptors',
                        [SlotDescriptor('labels', type=Table, required=False)])
        super(MBKMeans, self).__init__(**kwds)
        self.mbk = MiniBatchKMeans(n_clusters=n_clusters,
                                   batch_size=batch_size,
                                   verbose=True,
                                   tol=tol,
                                   random_state=random_state)
        self.columns = columns
        self.n_clusters = n_clusters
        self.default_step_size = 100
        self._labels = None
        self._remaining_inits = 10
        self._initialization_steps = 0
        self._is_input = is_input
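
Note: this excerpt registers its slots by mutating kwds inside __init__, the older progressivis idiom; later code in this listing (Examples 2 and 9) declares slots as class attributes instead. A minimal sketch of that newer style, using a hypothetical class name:

class MBKMeansSketch(TableModule):
    # Class-level slot declaration, as in Examples 2 and 9 below.
    # MBKMeansSketch is a hypothetical name used only for illustration;
    # TableModule, SlotDescriptor and Table come from progressivis.
    inputs = [SlotDescriptor("table", type=Table, required=True)]
    outputs = [SlotDescriptor("labels", type=Table, required=False)]
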
Example 2
class MBKMeansFilter(TableModule):
    """
    Filters data corresponding to a specific label
    """

    inputs = [
        SlotDescriptor("table", type=Table, required=True),
        SlotDescriptor("labels", type=Table, required=True),
    ]

    def __init__(self, sel: Any, **kwds: Any) -> None:
        self._sel = sel
        super().__init__(**kwds)

    @process_slot("table", "labels")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        assert self.context
        with self.context as ctx:
            indices_t = ctx.table.created.next(
                length=step_size)  # returns a slice
            steps_t = indices_len(indices_t)
            ctx.table.clear_buffers()
            indices_l = ctx.labels.created.next(
                length=step_size)  # returns a slice
            steps_l = indices_len(indices_l)
            ctx.labels.clear_buffers()
            steps = steps_t + steps_l
            if steps == 0:
                return self._return_run_step(self.state_blocked, steps_run=0)
            if self.result is None:
                self.result = TableSelectedView(ctx.table.data(),
                                                ctx.labels.data().selection)
            else:
                self.selected.selection = ctx.labels.data().selection
            return self._return_run_step(self.next_state(ctx.table),
                                         steps_run=steps)

    def create_dependent_modules(self, mbkmeans: MBKMeans, data_module: Module,
                                 data_slot: str) -> None:
        with self.grouped():
            scheduler = self.scheduler()
            filter_ = FilterMod(expr=f"labels=={self._sel}",
                                scheduler=scheduler)
            filter_.input.table = mbkmeans.output.labels
            self.filter = filter_
            self.input.labels = filter_.output.result
            self.input.table = data_module.output[data_slot]
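
A plausible way to wire this module, assuming an existing MBKMeans module and a CSV source on the same scheduler, and assuming modules accept a scheduler keyword the way FilterMod does above; this is a sketch, not code taken from the library:

# Hypothetical usage: `scheduler`, `mbkmeans` and `csv` are assumed to
# exist already; the arguments mirror create_dependent_modules() above.
filter_module = MBKMeansFilter(sel=0, scheduler=scheduler)
filter_module.create_dependent_modules(mbkmeans, csv, "result")
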
Example 3
    def __init__(self, **kwds):
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('table', type=Table)])
        self._add_slots(
            kwds, 'output_descriptors',
            [SlotDescriptor('select', type=bitmap, required=False)])

        super(Sample, self).__init__(**kwds)
        self._tmp_table = Table(
            self.generate_table_name('sample'),
            dshape='{select: int64}',
            #                            scheduler=self.scheduler(),
            create=True)
        self._size = 0  # holds the size consumed from the input table so far
        self._bitmap = None
        self._table = None
Example 4
    def __init__(self, **kwds):
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('table', type=Table, required=True)])
        self._kde = None
        self._json_cache = {}
        self._inserted = 0
        self._lately_inserted = 0
        super(KernelDensity, self).__init__(**kwds)
Example 5
    def __init__(self,
                 filepath_or_buffer=None,
                 filter_=None,
                 force_valid_ids=True,
                 fillvalues=None,
                 timeout=None,
                 save_context=None,
                 recovery=0,
                 recovery_table_size=3,
                 save_step_size=100000,
                 **kwds):
        self._add_slots(
            kwds, 'input_descriptors',
            [SlotDescriptor('filenames', type=Table, required=False)])
        super(CSVLoader, self).__init__(**kwds)
        self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
        kwds.setdefault('chunksize', self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds = self._filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.parser = None
        self.csv_kwds = csv_kwds
        self._compression = csv_kwds.get('compression', "infer")
        csv_kwds['compression'] = None
        self._encoding = csv_kwds.get('encoding', None)
        csv_kwds['encoding'] = None
        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError(
                'filter parameter should be callable or None')
        self._filter = filter_
        self._input_stream = None  # stream that returns a position through the 'tell()' method
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0  # length of the file or input stream when available
        self._timeout = timeout
        self._table_params = dict(name=self.name, fillvalues=fillvalues)
        self._save_context = (True if save_context is None
                              and is_recoverable(filepath_or_buffer)
                              else False)
        self._recovery = recovery
        self._recovery_table_size = recovery_table_size
        self._recovery_table = None
        self._recovery_table_inv = None
        self._save_step_size = save_step_size
        self._last_saved_id = 0
        self._table = None
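
The _filter_kwds call above splits the keyword arguments between the module and pd.read_csv. Its implementation is not shown in this listing, but a minimal sketch based on inspect.signature captures the idea (the library's real helper may differ):

import inspect

def filter_kwds(kwds, func):
    # Keep only the keyword arguments that `func` accepts.
    valid = set(inspect.signature(func).parameters)
    return {k: v for k, v in kwds.items() if k in valid}

# e.g. filter_kwds({"sep": ";", "chunksize": 1000, "timeout": 3}, pd.read_csv)
# keeps "sep" and "chunksize" but drops the module-level "timeout".
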
Example 6
    def __init__(self, required: str = "result", **kwds: Any) -> None:
        assert required in ("result", "select")
        super(Sample, self).__init__(output_required=(required == "result"),
                                     **kwds)
        if required == "select":
            # Change the descriptor so that "select" is required.
            # The original SD is kept in the shared outputs/all_outputs
            # class variables
            sd = SlotDescriptor("select", type=Table, required=True)
            self.output_descriptors["select"] = sd

        self._tmp_table = Table(self.generate_table_name("sample"),
                                dshape="{select: int64}",
                                create=True)
        self._size = 0  # holds the size consumed from the input table so far
        self._bitmap: Optional[bitmap] = None
        self.result: Optional[TableSelectedView] = None
Example 7
class Variable(Constant):
    inputs = [SlotDescriptor("like", type=(Table, PsDict), required=False)]

    def __init__(self, table: Optional[Table] = None, **kwds: Any) -> None:
        super(Variable, self).__init__(table, **kwds)
        self.tags.add(self.TAG_INPUT)

    async def from_input(self, input_: JSon) -> str:
        if not isinstance(input_, dict):
            raise ProgressiveError("Expecting a dictionary")
        if self.result is None and self.get_input_slot("like") is None:
            error = f"Variable {self.name} with no initial value and no input slot"
            logger.error(error)
            return error
        if self.result is None:
            error = f"Variable {self.name} has to run once before receiving input"
            logger.error(error)
            return error
        last: PsDict = copy.copy(self.psdict)
        error = ""
        for (k, v) in input_.items():
            if k in last:
                last[k] = v
            else:
                error += f"Invalid key {k} ignored. "
        await self.scheduler().for_input(self)
        self.psdict.update(last)
        return error

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if self.result is None:
            slot = self.get_input_slot("like")
            if slot is not None:
                like = slot.data()
                if like is not None:
                    if isinstance(like, Table):
                        last = like.last()
                        assert last is not None
                        like = last.to_dict(ordered=True)
                    self.result = copy.copy(like)
                    self._ignore_inputs = True
        return self._return_run_step(self.state_blocked, steps_run=1)
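
from_input is a coroutine returning an error string (empty on success), so interactive code would await it; a hypothetical call site, assuming a Variable instance on a running scheduler:

async def steer(var: Variable) -> None:
    # Hypothetical interactive update; unknown keys are not applied and
    # are reported back in the returned error string, as shown above.
    error = await var.from_input({"threshold": 0.5})
    if error:
        logger.warning(error)
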
Example 8
 def __init__(self, table=None, **kwds):
     self._add_slots(kwds, 'input_descriptors',
                     [SlotDescriptor('like', type=Table, required=False)])
     super(Variable, self).__init__(table, **kwds)
Example 9
class MBKMeans(TableModule):
    """
    Mini-batch k-means using the sklearn implementation.
    """

    parameters = [("samples", np.dtype(int), 50)]
    inputs = [
        SlotDescriptor("table", type=Table, required=True),
        SlotDescriptor("var", type=Table, required=True),
        SlotDescriptor("moved_center", type=PsDict, required=False),
    ]
    outputs = [
        SlotDescriptor("labels", type=Table, required=False),
        SlotDescriptor("conv", type=PsDict, required=False),
    ]

    def __init__(
        self,
        n_clusters: int,
        columns: Optional[List[str]] = None,
        batch_size: int = 100,
        tol: float = 0.01,
        is_input: bool = True,
        is_greedy: bool = True,
        random_state: Union[int, np.random.RandomState, None] = None,
        **kwds: Any,
    ):
        super().__init__(columns=columns, **kwds)
        self.mbk = MiniBatchKMeans(
            n_clusters=n_clusters,
            batch_size=batch_size,
            verbose=True,
            tol=tol,
            random_state=random_state,
        )
        self.n_clusters = n_clusters
        self.default_step_size = 100
        self._labels: Optional[Table] = None
        self._remaining_inits = 10
        self._initialization_steps = 0
        self._is_input = is_input
        self._tol = tol
        self._conv_out = PsDict({"convergence": "unknown"})
        self.params.samples = n_clusters
        self._is_greedy: bool = is_greedy
        self._arrays: Optional[Dict[int, np.ndarray[Any, Any]]] = None
        # self.convergence_context = {}

    def predict_step_size(self, duration: float) -> int:
        p = super().predict_step_size(duration)
        return max(p, self.n_clusters)

    def reset(self, init: str = "k-means++") -> None:
        self.mbk = MiniBatchKMeans(
            n_clusters=self.mbk.n_clusters,
            batch_size=self.mbk.batch_size,
            init=init,
            random_state=self.mbk.random_state,
        )
        dfslot = self.get_input_slot("table")
        dfslot.reset()
        self.set_state(self.state_ready)
        # self.convergence_context = {}
        # do not resize result to zero
        # it contains 1 row per centroid
        if self._labels is not None:
            self._labels.truncate()

    def starting(self) -> None:
        super().starting()
        opt_slot = self.get_output_slot("labels")
        if opt_slot:
            logger.debug("Maintaining labels")
            self.maintain_labels(True)
        else:
            logger.debug("Not maintaining labels")
            self.maintain_labels(False)

    def maintain_labels(self, yes: bool = True) -> None:
        if yes and self._labels is None:
            self._labels = Table(
                self.generate_table_name("labels"),
                dshape="{labels: int64}",
                create=True,
            )
        elif not yes:
            self._labels = None

    def labels(self) -> Optional[Table]:
        return self._labels

    def get_data(self, name: str) -> Any:
        if name == "labels":
            return self.labels()
        if name == "conv":
            return self._conv_out
        return super().get_data(name)

    def is_greedy(self) -> bool:
        return self._is_greedy

    def _process_labels(self, locs: bitmap) -> None:
        labels = self.mbk.labels_
        assert self._labels is not None
        u_locs = locs & self._labels.index  # ids to update
        if not u_locs:  # shortcut
            self._labels.append({"labels": labels}, indices=locs)
            return
        a_locs = locs - u_locs  # ids to append
        if not a_locs:  # 2nd shortcut
            assert self._labels is not None
            return
        df = pd.DataFrame({"labels": labels}, index=locs)
        u_labels = df.loc[u_locs, "labels"]
        a_labels = df.loc[a_locs, "labels"]
        self._labels.loc[u_locs, "labels"] = u_labels
        self._labels.append({"labels": a_labels}, indices=a_locs)

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        dfslot = self.get_input_slot("table")
        # TODO varslot is only required if we have tol > 0
        varslot = self.get_input_slot("var")
        moved_center = self.get_input_slot("moved_center")
        init_centers = "k-means++"
        if moved_center is not None:
            if moved_center.has_buffered():
                print("Moved center!!")
                moved_center.clear_buffers()
                msg = moved_center.data()
                for c in msg:
                    self.set_centroid(c, msg[c][:2])
                init_centers = self.mbk.cluster_centers_
                self.reset(init=init_centers)
                dfslot.clear_buffers()  # No need to re-reset next
                varslot.clear_buffers()
        if dfslot.has_buffered() or varslot.has_buffered():
            logger.debug("has deleted or updated, reseting")
            self.reset(init=init_centers)
            dfslot.clear_buffers()
            varslot.clear_buffers()
        # print('dfslot has buffered %d elements'% dfslot.created_length())
        input_df = dfslot.data()
        var_data = varslot.data()
        batch_size = self.mbk.batch_size or 100
        if (input_df is None or var_data is None
                or len(input_df) < max(self.mbk.n_clusters, batch_size)):
            # Not enough data yet ...
            return self._return_run_step(self.state_blocked, 0)
        cols = self.get_columns(input_df, "table")
        dtype = input_df.columns_common_dtype(cols)
        n_features = len(cols)
        n_samples = len(input_df)
        if self._arrays is None:

            def _array_factory() -> np.ndarray[Any, Any]:
                return np.empty((self._key, n_features), dtype=dtype)

            self._arrays = defaultdict(_array_factory)
        is_conv = False
        if self._tol > 0:
            # v = np.array(list(var_data.values()), dtype=np.float64)
            # tol = np.mean(v) * self._tol
            prev_centers = np.zeros((self.n_clusters, n_features), dtype=dtype)
        else:
            # tol = 0
            prev_centers = np.zeros(0, dtype=dtype)
        random_state = check_random_state(self.mbk.random_state)
        X: Optional[np.ndarray[Any, Any]] = None
        # Attributes to monitor the convergence
        self.mbk._ewa_inertia = None
        self.mbk._ewa_inertia_min = None
        self.mbk._no_improvement = 0
        for iter_ in range(step_size):
            mb_ilocs = random_state.randint(0, n_samples, batch_size)
            mb_locs = input_df.index[mb_ilocs]
            self._key = len(mb_locs)
            arr = self._arrays[self._key]
            X = input_df.to_array(columns=cols, locs=mb_locs, ret=arr)
            if hasattr(self.mbk, "cluster_centers_"):
                prev_centers[:, :] = self.mbk.cluster_centers_
            self.mbk.partial_fit(X)
            if self._labels is not None:
                self._process_labels(mb_locs)
            centers = self.mbk.cluster_centers_
            nearest_center, batch_inertia = self.mbk.labels_, self.mbk.inertia_
            k = centers.shape[0]
            squared_diff = 0.0
            for ci in range(k):
                center_mask = nearest_center == ci
                if np.count_nonzero(center_mask) > 0:
                    diff = centers[ci].ravel() - prev_centers[ci].ravel()
                    squared_diff += np.dot(diff, diff)  # type: ignore
            if self.mbk._mini_batch_convergence(iter_, step_size, n_samples,
                                                squared_diff, batch_inertia):
                is_conv = True
                break
        if self.result is None:
            assert X is not None
            dshape = dshape_from_columns(input_df, cols,
                                         dshape_from_dtype(X.dtype))
            self.result = Table(self.generate_table_name("centers"),
                                dshape=dshape,
                                create=True)
            self.result.resize(self.mbk.cluster_centers_.shape[0])
        self.psdict[cols] = self.mbk.cluster_centers_  # type: ignore
        if is_conv:
            return self._return_run_step(self.state_blocked, iter_)
        return self._return_run_step(self.state_ready, iter_)

    def to_json(self, short: bool = False, with_speed: bool = True) -> JSon:
        json = super().to_json(short, with_speed)
        if short:
            return json
        return self._centers_to_json(json)

    def _centers_to_json(self, json: JSon) -> JSon:
        json["cluster_centers"] = self.table.to_json()
        return json

    def set_centroid(self, c: int, values: List[float]) -> List[float]:
        try:
            c = int(c)
        except ValueError:
            pass

        centroids = self.table
        # idx = centroids.id_to_index(c)

        dfslot = self.get_input_slot("table")
        input_df = dfslot.data()
        columns = self.get_columns(input_df, "table")
        if len(values) != len(columns):
            raise ProgressiveError(
                f"Expected {len(columns)} values, received {values}")
        centroids.loc[c, columns] = values
        # TODO unpack the table
        centers = centroids.loc[c, columns]
        assert isinstance(centers, BaseTable)
        self.mbk.cluster_centers_[c] = list(centers)
        return self.mbk.cluster_centers_.tolist()

    def create_dependent_modules(self,
                                 input_module: Module,
                                 input_slot: str = "result") -> None:
        with self.grouped():
            s = self.scheduler()
            self.input_module = input_module
            self.input.table = input_module.output[input_slot]
            self.input_slot = input_slot
            c = DynVar(group=self.name, scheduler=s)
            self.moved_center = c
            self.input.moved_center = c.output.result
            v = Var(group=self.name, scheduler=s)
            self.variance = v
            v.input.table = input_module.output[input_slot]
            self.input.var = v.output.result
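
The core of run_step above is scikit-learn's incremental API: repeated partial_fit calls on random mini-batches, with a convergence test on the center shift. Stripped of the progressivis machinery, the loop reduces to this standalone, runnable sketch:

import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 2))                  # toy data
mbk = MiniBatchKMeans(n_clusters=8, batch_size=100, random_state=0)
for _ in range(50):
    batch = X[rng.integers(0, len(X), size=100)]  # random mini-batch
    mbk.partial_fit(batch)                        # one progressive step
print(mbk.cluster_centers_.shape)                 # (8, 2)
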
Example 10
class KernelDensity(TableModule):
    parameters = [
        ("samples", np.dtype(object), 1),
        ("bins", np.dtype(int), 1),
        ("threshold", np.dtype(int), 1000),
        ("knn", np.dtype(int), 100),
    ]
    inputs = [SlotDescriptor("table", type=Table, required=True)]

    def __init__(self, **kwds: Any) -> None:
        self._kde: Optional[KNNKernelDensity] = None
        self._json_cache: JSon = {}
        self._inserted: int = 0
        self._lately_inserted: int = 0
        super(KernelDensity, self).__init__(**kwds)
        self.tags.add(self.TAG_VISUALIZATION)

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        dfslot = self.get_input_slot("table")
        assert dfslot is not None
        if dfslot.deleted.any():
            raise ValueError("Not implemented yet")
        if not dfslot.created.any():
            return self._return_run_step(self.state_blocked, steps_run=0)
        indices = dfslot.created.next(length=step_size, as_slice=False)
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=0)
        if self._kde is None:
            self._kde = KNNKernelDensity(dfslot.data(), online=True)
        res = self._kde.run_ids(indices.to_array())
        self._inserted += res["numPointsInserted"]
        self._lately_inserted += steps
        samples = self.params.samples
        sample_num = self.params.bins
        threshold = self.params.threshold
        knn = self.params.knn
        if self._lately_inserted > threshold:
            scores = self._kde.score_samples(samples.astype(np.float32), k=knn)
            self._lately_inserted = 0
            self._json_cache = {
                "points": np.array(
                    dfslot.data().loc[:500, :].to_dict(
                        orient="split")["data"]).tolist(),
                "bins": sample_num,
                "inserted": self._inserted,
                "total": len(dfslot.data()),
                "samples": [
                    (sample, score)
                    for sample, score in zip(samples.tolist(),
                                             scores.tolist())  # type: ignore
                ],
            }
        return self._return_run_step(self.state_ready, steps_run=steps)

    def get_visualization(self) -> Optional[str]:
        return "knnkde"

    def to_json(self, short: bool = False, with_speed: bool = True) -> JSon:
        json = super(KernelDensity, self).to_json(short, with_speed)
        if short:
            return json
        return self.knnkde_to_json(json)

    def knnkde_to_json(self, json: JSon) -> JSon:
        json.update(self._json_cache)
        return json
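
KNNKernelDensity (an online, KNN-accelerated estimator) is not shown in this listing; as a rough standalone analogue, scikit-learn's exact KernelDensity exhibits the same fit/score workflow used above:

import numpy as np
from sklearn.neighbors import KernelDensity

# Exact KDE stand-in for the module's approximate score_samples() call.
X = np.random.default_rng(0).normal(size=(1_000, 2)).astype(np.float32)
kde = KernelDensity(bandwidth=0.5).fit(X)
log_density = kde.score_samples(X[:10])  # log-density of the first 10 points
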
Example 11
class SimpleCSVLoader(TableModule):
    inputs = [SlotDescriptor("filenames", type=Table, required=False)]

    def __init__(self,
                 filepath_or_buffer: Optional[Any] = None,
                 filter_: Optional[Callable[[pd.DataFrame],
                                            pd.DataFrame]] = None,
                 force_valid_ids: bool = True,
                 fillvalues: Optional[Dict[str, Any]] = None,
                 throttle: Union[bool, int, float] = False,
                 **kwds: Any) -> None:
        super().__init__(**kwds)
        self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
        kwds.setdefault("chunksize", self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds: Dict[str, Any] = filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        if throttle and isinstance(throttle, integer_types + (float, )):
            self.throttle = throttle
        else:
            self.throttle = False
        self.parser: Optional[pd.TextReader] = None
        self.csv_kwds = csv_kwds
        self._compression: Any = csv_kwds.get("compression", "infer")
        csv_kwds["compression"] = None
        self._encoding: Any = csv_kwds.get("encoding", None)
        csv_kwds["encoding"] = None
        self._nrows = csv_kwds.get("nrows")
        csv_kwds["nrows"] = None  # nrows clashes with chunksize

        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError(
                "filter parameter should be callable or None")
        self._filter: Optional[Callable[[pd.DataFrame],
                                        pd.DataFrame]] = filter_
        # stream that returns a position through the 'tell()' method
        self._input_stream: Optional[io.IOBase] = None
        self._input_encoding: Optional[str] = None
        self._input_compression: Optional[str] = None
        self._input_size = 0  # length of the file or input stream when available
        self._file_mode = False
        self._table_params: Dict[str, Any] = dict(name=self.name,
                                                  fillvalues=fillvalues)

    def rows_read(self) -> int:
        return self._rows_read

    def is_ready(self) -> bool:
        if self.has_input_slot("filenames"):
            # Can be called before the first update so fn.created can be None
            fn = self.get_input_slot("filenames")
            if fn.created is None or fn.created.any():
                return True
        return super().is_ready()

    def is_data_input(self) -> bool:
        # pylint: disable=no-self-use
        "Return True if this module brings new data"
        return True

    def open(self, filepath: Any) -> io.IOBase:
        if self._input_stream is not None:
            self.close()
        compression: Optional[str] = _infer_compression(
            filepath, self._compression)
        istream: io.IOBase
        encoding: Optional[str]
        size: int
        (istream, encoding, compression,
         size) = filepath_to_buffer(filepath,
                                    encoding=self._encoding,
                                    compression=compression)
        self._input_stream = istream
        self._input_encoding = encoding
        self._input_compression = compression
        self._input_size = size
        self.csv_kwds["encoding"] = encoding
        self.csv_kwds["compression"] = compression
        return istream

    def close(self) -> None:
        if self._input_stream is None:
            return
        try:
            self._input_stream.close()
            # pylint: disable=bare-except
        except Exception:
            pass
        self._input_stream = None
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0

    def get_progress(self) -> Tuple[int, int]:
        if self._input_size == 0:
            return (0, 0)
        if self._input_stream is None:
            return (0, 0)
        pos = self._input_stream.tell()
        return (pos, self._input_size)

    def validate_parser(self, run_number: int) -> ModuleState:
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                try:
                    self.parser = pd.read_csv(
                        self.open(self.filepath_or_buffer), **self.csv_kwds)
                except IOError as e:
                    logger.error("Cannot open file %s: %s",
                                 self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
                self._file_mode = True
            else:
                if not self.has_input_slot("filenames"):
                    return self.state_terminated
                fn_slot = self.get_input_slot("filenames")
                if fn_slot.output_module is None:
                    return self.state_terminated
                fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError("Cannot handle input file changes")
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(length=1)
                    assert isinstance(indices, slice)
                    if indices.stop == indices.start:
                        return self.state_blocked
                    filename = df.at[indices.start, "filename"]
                    try:
                        self.parser = pd.read_csv(self.open(filename),
                                                  **self.csv_kwds)
                    except IOError as e:
                        logger.error("Cannot open file %s: %s", filename, e)
                        self.parser = None
                        # fall through
        return self.state_ready

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        if self.throttle:
            step_size = np.min([self.throttle, step_size])  # type: ignore
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise ProgressiveStopIteration("no more filenames")
        elif status == self.state_blocked:
            return self._return_run_step(status, steps_run=0)
        elif status != self.state_ready:
            logger.error("Invalid state returned by validate_parser: %d",
                         status)
            self.close()
            raise ProgressiveStopIteration("Unexpected situation")
        logger.info("loading %d lines", step_size)
        try:
            assert self.parser
            df: pd.DataFrame = self.parser.read(
                step_size)  # raises StopIteration at EOF
        except StopIteration:
            self.close()
            if self.has_input_slot("filenames"):
                fn_slot = self.get_input_slot("filenames")
                if (fn_slot is None or
                        fn_slot.output_module is None) and not self._file_mode:
                    raise
            self.parser = None
            return self._return_run_step(self.state_ready, steps_run=0)
        creates = len(df)
        if creates == 0:  # should not happen
            logger.error("Received 0 elements")
            raise ProgressiveStopIteration
        if self._filter is not None:
            df = self._filter(df)
        creates = len(df)
        if creates == 0:
            logger.info("frame has been filtered out")
        else:
            self._rows_read += creates
            logger.info("Loaded %d lines", self._rows_read)
            if self.force_valid_ids:
                force_valid_id_columns(df)
            if self.result is None:
                self._table_params["name"] = self.generate_table_name("table")
                self._table_params["dshape"] = dshape_from_dataframe(df)
                self._table_params["data"] = df
                self._table_params["create"] = True
                self.result = Table(**self._table_params)
            else:
                self.table.append(df)
        return self._return_run_step(self.state_ready, steps_run=creates)
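
Both CSV loaders rely on pandas' chunked parser: calling pd.read_csv with a chunksize returns a TextFileReader whose read(n) yields the next n rows, which is what validate_parser and run_step exploit. A minimal standalone sketch (the file name is a placeholder):

import pandas as pd

parser = pd.read_csv("data.csv", chunksize=1000)  # returns a TextFileReader
df = parser.read(1000)  # next 1000 rows; raises StopIteration at EOF
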
Example 12
class Sample(TableModule):
    parameters = [("samples", np.dtype(int), 50)]
    inputs = [SlotDescriptor("table", type=Table)]
    outputs = [SlotDescriptor("select", type=bitmap, required=False)]

    def __init__(self, required: str = "result", **kwds: Any) -> None:
        assert required in ("result", "select")
        super(Sample, self).__init__(output_required=(required == "result"),
                                     **kwds)
        if required == "select":
            # Change the descriptor so that "select" is required.
            # The original SD is kept in the shared outputs/all_outputs
            # class variables
            sd = SlotDescriptor("select", type=Table, required=True)
            self.output_descriptors["select"] = sd

        self._tmp_table = Table(self.generate_table_name("sample"),
                                dshape="{select: int64}",
                                create=True)
        self._size = 0  # holds the size consumed from the input table so far
        self._bitmap: Optional[bitmap] = None
        self.result: Optional[TableSelectedView] = None

    def reset(self) -> None:
        self._tmp_table.resize(0)
        self._size = 0
        self._bitmap = None
        slot = self.get_input_slot("table")
        if slot is not None:
            slot.reset()

    def get_data(self, name: str) -> Any:
        if name == "select":
            return self.get_bitmap()
        if self.result is not None:
            self.result.selection = self.get_bitmap()
        return super(Sample, self).get_data(name)

    def get_bitmap(self) -> bitmap:
        if self._bitmap is None:
            len_ = len(self._tmp_table["select"])
            # Avoid "ValueError: Iteration of zero-sized operands is not enabled"
            self._bitmap = bitmap(
                self._tmp_table["select"]) if len_ else bitmap()
        return self._bitmap

    @process_slot("table", reset_if="delete", reset_cb="reset")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        assert self.context
        with self.context as ctx:
            if self.result is None:
                self.result = TableSelectedView(ctx.table.data(), bitmap([]))
            indices = ctx.table.created.next(length=step_size, as_slice=False)
            steps = indices_len(indices)
            k = int(self.params.samples)
            reservoir = self._tmp_table
            res = reservoir["select"]
            size = self._size  # cache in local variable
            if size < k:
                logger.info("Filling the reservoir %d/%d", size, k)
                # fill the reservoir array until it contains k elements
                rest = indices.pop(k - size)
                reservoir.append({"select": rest})
                size = len(reservoir)

            if len(indices) == 0:  # nothing else to do
                self._size = size
                if steps:
                    self._bitmap = None
                return self._return_run_step(self.state_blocked,
                                             steps_run=steps)

            t = 4 * k
            # Threshold (t) determines when to start fast sampling
            # logic. The optimal value for (t) may vary depending on RNG
            # performance characteristics.

            if size < t and len(indices) != 0:
                logger.info("Normal sampling from %d to %d", size, t)
            while size < t and len(indices) != 0:
                # Normal reservoir sampling is fastest up to (t) samples
                j = np.random.randint(size)
                if j < k:
                    res[j] = indices.pop()[0]
                size += 1

            if len(indices) == 0:
                self._size = size
                if steps:
                    self._bitmap = None
                return self._return_run_step(self.state_blocked,
                                             steps_run=steps)

            logger.info("Fast sampling with %d indices", len(indices))
            while indices:
                # draw gap size (g) from geometric distribution with probability p = k / size
                p = k / size
                u = np.random.rand()
                g = int(np.floor(np.log(u) / np.log(1 - p)))
                # advance over the gap, and assign next element to the reservoir
                if (g + 1) < len(indices):
                    j = np.random.randint(k)
                    res[j] = indices[g]
                    indices.pop(g + 1)
                    size += g + 1
                else:
                    size += len(indices)
                    break

            self._size = size
            if steps:
                self._bitmap = None
            return self._return_run_step(self.state_blocked, steps_run=steps)
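
The run_step above implements reservoir sampling: classic Algorithm R until the threshold t, then geometric gaps to skip over most of the stream. For reference, a compact standalone version of the classic algorithm:

import numpy as np

def reservoir_sample(stream, k, seed=0):
    # Algorithm R: after i items, each item is in `res` with probability
    # k/i. The gap-based loop above is an optimization of the same scheme.
    rng = np.random.default_rng(seed)
    res = []
    for i, x in enumerate(stream):
        if i < k:
            res.append(x)
        else:
            j = rng.integers(0, i + 1)
            if j < k:
                res[j] = x
    return res
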
Example 13
class BlobsTableABC(TableModule):
    """Isotropic Gaussian blobs => table
    The purpose of the "reservoir" approach is to ensure the reproducibility of the results
    """

    outputs = [SlotDescriptor("labels", type=Table, required=False)]
    kw_fun: Optional[Callable[..., Any]] = None

    def __init__(
        self,
        columns: Union[int, List[str], np.ndarray[Any, Any]],
        rows: int = -1,
        dtype: npt.DTypeLike = np.float64,
        seed: int = 0,
        throttle: Union[int, bool, float] = False,
        **kwds: Any,
    ) -> None:
        super().__init__(**kwds)
        self.tags.add(self.TAG_SOURCE)
        dtype = dshape_from_dtype(np.dtype(dtype))
        self._kwds = {}  # FIXME
        """assert 'centers' in self._kwds
        assert 'n_samples' not in self._kwds
        assert 'n_features' not in self._kwds
        assert 'random_state' not in self._kwds"""
        # self._kwds['n_samples'] = rows
        # self._kwds['n_features']
        self.default_step_size = 1000
        self.columns: Union[List[str], np.ndarray[Any, Any]]
        if isinstance(columns, integer_types):
            self.columns = [f"_{i}" for i in range(1, columns + 1)]
            # self._kwds['n_features'] = columns
        elif isinstance(columns, (list, np.ndarray)):
            self.columns = columns
            # self._kwds['n_features'] = len(columns)
        else:
            raise ProgressiveError("Invalid type for columns")
        self.rows = rows
        self.seed = seed
        self._reservoir: Optional[
            Tuple[np.ndarray[Any, Any], np.ndarray[Any, Any]]
        ] = None
        self._labels: Optional[Table] = None
        self._reservoir_idx = 0
        if throttle and isinstance(throttle, integer_types + (float,)):
            self.throttle: Union[int, bool, float] = throttle
        else:
            self.throttle = False
        dshape = ", ".join([f"{col}: {dtype}" for col in self.columns])
        dshape = "{" + dshape + "}"
        table = Table(self.generate_table_name("table"), dshape=dshape, create=True)
        self.result = table
        self.columns = table.columns

    def starting(self) -> None:
        super().starting()
        opt_slot = self.get_output_slot("labels")
        if opt_slot:
            logger.debug("Maintaining labels")
            self.maintain_labels(True)
        else:
            logger.debug("Not maintaining labels")
            self.maintain_labels(False)

    def maintain_labels(self, yes: bool = True) -> None:
        if yes and self._labels is None:
            self._labels = Table(
                self.generate_table_name("blobs_labels"),
                dshape="{labels: int64}",
                create=True,
            )
        elif not yes:
            self._labels = None

    def labels(self) -> Optional[Table]:
        return self._labels

    def get_data(self, name: str) -> Any:
        if name == "labels":
            return self.labels()
        return super().get_data(name)

    @abstractmethod
    def fill_reservoir(self) -> None:
        pass

    def run_step(
        self, run_number: int, step_size: int, howlong: float
    ) -> ReturnRunStep:
        if step_size == 0:
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        logger.info("generating %d lines", step_size)
        if self.throttle:
            step_size = np.min([self.throttle, step_size])  # type: ignore
        if self.rows >= 0 and (len(self.table) + step_size) > self.rows:
            step_size = self.rows - len(self.table)
            logger.info("truncating to %d lines", step_size)
            if step_size <= 0:
                raise ProgressiveStopIteration
        if self._reservoir is None:
            self.fill_reservoir()
        steps = int(step_size)
        while steps > 0:
            assert self._reservoir
            level = len(self._reservoir[0]) - self._reservoir_idx
            assert level >= 0
            if steps >= level:
                blobs_dict, y_ = xy_to_dict(
                    *self._reservoir, self._reservoir_idx, None, self.columns
                )
                steps -= level
                # reservoir was emptied so:
                self.fill_reservoir()
            else:  # steps < level
                blobs_dict, y_ = xy_to_dict(
                    *self._reservoir, self._reservoir_idx, steps, self.columns
                )
                self._reservoir_idx += steps
                steps = 0
            self.table.append(blobs_dict)
            if self._labels is not None:
                self._labels.append({"labels": y_})
        if len(self.table) == self.rows:
            next_state = self.state_zombie
        elif self.throttle:
            next_state = self.state_blocked
        else:
            next_state = self.state_ready
        return self._return_run_step(next_state, steps_run=step_size)
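
fill_reservoir is abstract here; concrete subclasses presumably delegate to a blob generator such as scikit-learn's make_blobs, whose documentation matches the "Isotropic Gaussian blobs" wording above (an assumption, since kw_fun's binding is not shown in this excerpt). Re-seeding that generator on each refill is what makes the progressive output reproducible:

from sklearn.datasets import make_blobs

# With a fixed random_state the generated chunk is deterministic, so a
# refilled "reservoir" always contains the same points in the same order.
X, y = make_blobs(n_samples=1_000, centers=3, n_features=2, random_state=0)
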
Example 14
class CSVLoader(TableModule):
    """
    Warning : this module do not wait for "filenames"
    """

    inputs = [SlotDescriptor("filenames", type=Table, required=False)]

    def __init__(
        self,
        filepath_or_buffer: Optional[Any] = None,
        filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
        force_valid_ids: bool = True,
        fillvalues: Optional[Dict[str, Any]] = None,
        as_array: Optional[Any] = None,
        timeout: Optional[float] = None,
        save_context: Optional[Any] = None,  # FIXME seems more like a bool
        recovery: int = 0,  # FIXME seems more like a bool
        recovery_tag: Union[str, int] = "",
        recovery_table_size: int = 3,
        save_step_size: int = 100000,
        **kwds: Any,
    ) -> None:
        super(CSVLoader, self).__init__(**kwds)
        self.tags.add(self.TAG_SOURCE)
        self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
        kwds.setdefault("chunksize", self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds = filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.parser: Optional[Parser] = None
        self.csv_kwds = csv_kwds
        self._compression = csv_kwds.get("compression", "infer")
        csv_kwds["compression"] = None
        self._encoding = csv_kwds.get("encoding", None)
        csv_kwds["encoding"] = None
        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError(
                "filter parameter should be callable or None")
        self._filter = filter_
        # self._input_stream: Optional[Any] = (
        #     None  # stream that returns a position through the 'tell()' method
        # )
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0  # length of the file or input stream when available
        self._timeout_csv = timeout
        self._table_params: Dict[str, Any] = dict(name=self.name,
                                                  fillvalues=fillvalues)
        self._as_array = as_array
        self._save_context = (True if save_context is None
                              and is_recoverable(filepath_or_buffer) else
                              False)
        self._recovery = recovery
        self._recovery_table_size = recovery_table_size
        self._recovery_table: Optional[Table] = None
        self._recovery_table_name = f"csv_loader_recovery_{recovery_tag}"
        self._recovery_table_inv: Optional[Table] = None
        self._recovery_table_inv_name = f"csv_loader_recovery_invariant_{recovery_tag}"
        self._save_step_size = save_step_size
        self._last_saved_id = 0
        if self._recovery and not self.recovery_tables_exist():
            self._recovery = False
        if not self._recovery:
            self.trunc_recovery_tables()

    def recovery_tables_exist(self) -> bool:
        try:
            Table(name=self._recovery_table_name, create=False)
        except ValueError as ve:
            if "exist" in ve.args[0]:
                print("WARNING: recovery table does not exist")
                return False
            raise
        try:
            Table(name=self._recovery_table_inv_name, create=False)
        except Exception as ve:
            # FIXME JDF: is that the right way?
            if "exist" in ve.args[0]:  # FIXME
                print("WARNING: recovery table invariant does not exist")
                return False
            raise
        return True

    def trunc_recovery_tables(self) -> None:
        len_ = 0
        rt: Optional[Table] = None
        try:
            rt = Table(name=self._recovery_table_name, create=False)
            len_ = len(rt)
        except Exception:
            pass
        if len_ and rt is not None:
            rt.drop(slice(None, None, None), truncate=True)
        len_ = 0
        try:
            rt = Table(name=self._recovery_table_inv_name, create=False)
            len_ = len(rt)
        except Exception:
            pass
        if len_ and rt is not None:
            rt.drop(slice(None, None, None), truncate=True)

    def rows_read(self) -> int:
        "Return the number of rows read so far."
        return self._rows_read

    def is_ready(self) -> bool:
        if self.has_input_slot("filenames"):
            fn = self.get_input_slot("filenames")
            if fn.created is None or fn.created.any():
                return True
        return super(CSVLoader, self).is_ready()

    def is_data_input(self) -> bool:
        # pylint: disable=no-self-use
        "Return True if this module brings new data"
        return True

    def create_input_source(self, filepath: str) -> InputSource:
        usecols = self.csv_kwds.get("usecols")
        return InputSource.create(
            filepath,
            encoding=self._encoding,
            compression=self._compression,
            timeout=self._timeout_csv,
            start_byte=0,
            usecols=usecols,
        )

    def close(self) -> None:
        # if self._input_stream is None:
        #     return
        # try:
        #     self._input_stream.close()
        #     # pylint: disable=bare-except
        # except Exception:
        #     pass
        # self._input_stream = None
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0

    def get_progress(self) -> Tuple[int, int]:
        if self._input_size == 0:
            return (0, 0)
        pos = 0  # self._input_stream.tell()
        return (pos, self._input_size)

    def validate_parser(self, run_number: int) -> ModuleState:
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                if not self._recovery:
                    try:
                        self.parser = read_csv(
                            self.create_input_source(self.filepath_or_buffer),
                            **self.csv_kwds,
                        )
                    except IOError as e:
                        logger.error("Cannot open file %s: %s",
                                     self.filepath_or_buffer, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None
                else:  # do recovery
                    try:
                        if self._recovery_table is None:
                            self._recovery_table = Table(
                                name=self._recovery_table_name, create=False)
                        if self._recovery_table_inv is None:
                            self._recovery_table_inv = Table(
                                name=self._recovery_table_inv_name,
                                create=False)
                        if self.result is None:
                            self._table_params["name"] = (
                                self._recovery_table_inv["table_name"].loc[0])
                            self._table_params["create"] = False
                            table = Table(**self._table_params)
                            self.result = table
                            table.last_id
                    except Exception as e:  # TODO: specify the exception?
                        logger.error(f"Cannot acces recovery table {e}")
                        return self.state_terminated
                    table = self.table
                    try:
                        last_ = self._recovery_table.eval(
                            f"last_id=={len(table)}", as_slice=False)
                        len_last = len(last_)
                        if len_last > 1:
                            logger.error("Inconsistent recovery table")
                            return self.state_terminated
                        # last_ = self._recovery_table.argmax()['offset']
                        snapshot: Optional[Dict[str, Any]] = None
                        if len_last == 1:
                            row = self._recovery_table.row(last_[0])
                            assert row is not None
                            snapshot = row.to_dict(ordered=True)
                            if not check_snapshot(snapshot):
                                snapshot = None
                        if snapshot is None:
                            # i.e. snapshot not yet found or inconsistent
                            max_ = -1
                            for i in self._recovery_table.eval(
                                    f"last_id<{len(table)}", as_slice=False):
                                row = self._recovery_table.row(i)
                                assert row is not None
                                sn: Dict[str, Any] = row.to_dict(ordered=True)
                                if check_snapshot(sn) and sn["last_id"] > max_:
                                    max_, snapshot = sn["last_id"], sn
                            if max_ < 0:
                                # logger.error('Cannot acces recovery table (max_<0)')
                                return self.state_terminated
                            table.drop(slice(max_ + 1, None, None),
                                       truncate=True)
                        assert snapshot
                        self._recovered_csv_table_name = snapshot["table_name"]
                    except Exception as e:
                        logger.error("Cannot read the snapshot %s", e)
                        return self.state_terminated
                    try:
                        self.parser = recovery(snapshot,
                                               self.filepath_or_buffer,
                                               **self.csv_kwds)
                    except Exception as e:
                        logger.error("Cannot recover from snapshot %s, %s",
                                     snapshot, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None

            else:  # this case does not support recovery
                fn_slot = None
                if self.has_input_slot("filenames"):
                    fn_slot = self.get_input_slot("filenames")
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                # fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError("Cannot handle input file changes")
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(length=1)
                    assert isinstance(indices, slice)
                    if indices.stop == indices.start:
                        return self.state_blocked
                    filename = df.at[indices.start, "filename"]
                    try:
                        self.parser = read_csv(
                            self.create_input_source(filename),
                            **self.csv_kwds)
                    except IOError as e:
                        logger.error("Cannot open file %s: %s", filename, e)
                        self.parser = None
                        # fall through
        return self.state_ready

    def _data_as_array(self, df: pd.DataFrame) -> Tuple[Any, DataShape]:
        if not self._as_array:
            return (df, dshape_from_dataframe(df))
        if callable(self._as_array):
            self._as_array = self._as_array(list(df.columns))  # FIXME
        if isinstance(self._as_array, str):
            data = df.values
            dshape = array_dshape(data, self._as_array)
            return ({self._as_array: data}, dshape)
        if not isinstance(self._as_array, dict):
            raise ValueError(
                f"Unexpected parameter specified to as_array: {self._as_array}"
            )
        columns = set(df.columns)
        ret = {}
        for colname, cols in self._as_array.items():
            if colname in ret:
                raise KeyError(f"Duplicate column {colname} in as_array")
            colset = set(cols)
            assert colset.issubset(columns)
            columns -= colset
            view = df[cols]
            values = view.values
            ret[colname] = values
        for colname in columns:
            if colname in ret:
                raise KeyError(f"Duplicate column {colname} in as_array")
            ret[colname] = df[colname].values
        return ret, dshape_from_dict(ret)

    def _needs_save(self) -> bool:
        table = self.table
        if table is None:
            return False
        return table.last_id >= self._last_saved_id + self._save_step_size

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise ProgressiveStopIteration("no more filenames")
        elif status == self.state_blocked:
            return self._return_run_step(status, steps_run=0)
        elif status != self.state_ready:
            logger.error("Invalid state returned by validate_parser: %d",
                         status)
            self.close()
            raise ProgressiveStopIteration("Unexpected situation")
        logger.info("loading %d lines", step_size)
        needs_save = self._needs_save()
        assert self.parser
        df_list: List[pd.DataFrame]
        try:
            df_list = self.parser.read(
                step_size, flush=needs_save)  # raises StopIteration at EOF
            if not df_list:
                raise ProgressiveStopIteration
        except ProgressiveStopIteration:
            self.close()
            if self.has_input_slot("filenames"):
                fn_slot = self.get_input_slot("filenames")
                assert fn_slot.output_module is not None
            self.parser = None
            return self._return_run_step(self.state_ready, 0)
        df_len = sum([len(df) for df in df_list])
        creates = df_len
        if creates == 0:  # should not happen
            logger.error("Received 0 elements")
            raise ProgressiveStopIteration
        if self._filter is not None:
            df_list = [self._filter(df) for df in df_list]
        creates = sum([len(df) for df in df_list])
        if creates == 0:
            logger.info("frame has been filtered out")
        else:
            self._rows_read += creates
            logger.info("Loaded %d lines", self._rows_read)
            if self.force_valid_ids:
                for df in df_list:
                    force_valid_id_columns(df)
            if self.result is None:
                table = self.table
                data, dshape = self._data_as_array(pd.concat(df_list))
                if not self._recovery:
                    self._table_params["name"] = self.generate_table_name(
                        "table")
                    self._table_params["data"] = data
                    self._table_params["dshape"] = dshape
                    self._table_params["create"] = True
                    self.result = Table(**self._table_params)
                else:
                    self._table_params["name"] = self._recovered_csv_table_name
                    # self._table_params['dshape'] = dshape
                    self._table_params["create"] = False
                    table = Table(**self._table_params)
                    self.result = table
                    # _data_as_array returns (data, dshape); reuse the
                    # `data` unpacked above rather than appending the tuple
                    table.append(data)
            else:
                table = self.table
                for df in df_list:
                    data, dshape = self._data_as_array(df)
                    table.append(data)
            if (self.parser.is_flushed() and needs_save
                    and self._recovery_table is None and self._save_context):
                table = self.table
                snapshot = self.parser.get_snapshot(
                    run_number=run_number,
                    table_name=table.name,
                    last_id=table.last_id,
                )
                self._recovery_table = Table(
                    name=self._recovery_table_name,
                    data=pd.DataFrame(snapshot, index=[0]),
                    create=True,
                )
                self._recovery_table_inv = Table(
                    name=self._recovery_table_inv_name,
                    data=pd.DataFrame(
                        dict(
                            table_name=table.name,
                            csv_input=self.filepath_or_buffer,
                        ),
                        index=[0],
                    ),
                    create=True,
                )
                self._last_saved_id = table.last_id
            elif (self.parser.is_flushed() and needs_save
                  and self._save_context):
                snapshot = self.parser.get_snapshot(
                    run_number=run_number,
                    last_id=table.last_id,
                    table_name=table.name,
                )
                assert self._recovery_table
                self._recovery_table.add(snapshot)
                if len(self._recovery_table) > self._recovery_table_size:
                    oldest = self._recovery_table.argmin()["offset"]
                    self._recovery_table.drop(oldest)
                self._last_saved_id = table.last_id
        return self._return_run_step(self.state_ready, steps_run=creates)
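
The as_array option handled by _data_as_array above regroups dataframe columns into named multi-column arrays before the table is created. A minimal sketch of that regrouping, with illustrative column names:

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0], "label": [0, 1]})
spec = {"xy": ["x", "y"]}  # illustrative as_array mapping
ret = {name: df[cols].values for name, cols in spec.items()}
used = {c for cols in spec.values() for c in cols}
ret.update({c: df[c].values for c in df.columns if c not in used})
# ret now maps "xy" to a (2, 2) array and "label" to a 1-D array.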