def test_assign(self) -> None:
    """Table.eval with an assignment expression must match DataFrame.eval."""
    tbl = Table("table_eval_assign",
                dshape="{a: int, b: float32}",
                create=True)
    tbl.resize(20)
    tbl["a"] = np.random.randint(100, size=20)
    rand_col: np.ndarray[Any, Any] = np.random.rand(20) * 100
    tbl["b"] = rand_col
    frame = pd.DataFrame(tbl.to_dict())
    # Out-of-place evaluation: compare the returned objects.
    tbl2 = tbl.eval("a = a+2*b", inplace=False)
    frame2 = cast(pd.DataFrame, frame.eval("a = a+2*b", inplace=False))
    self.assertTrue(np.allclose(tbl2["a"], frame2["a"]))
    self.assertTrue(np.allclose(tbl2["b"], frame2["b"]))
    # In-place evaluation: compare the mutated containers.
    tbl.eval("b = a+2*b", inplace=True)
    frame.eval("b = a+2*b", inplace=True)
    self.assertTrue(np.allclose(tbl["a"].values, frame["a"].values))
    self.assertTrue(np.allclose(tbl["b"].values, frame["b"].values))
Example #2
0
 def test_assign(self):
     """Table.eval with an assignment expression must match DataFrame.eval."""
     t = Table('table_eval_assign',
               dshape="{a: int, b: float32}",
               create=True)
     t.resize(20)
     ivalues = np.random.randint(100, size=20)
     t['a'] = ivalues
     fvalues = np.random.rand(20) * 100
     t['b'] = fvalues
     df = pd.DataFrame(t.to_dict())
     # Out-of-place evaluation.
     t2 = t.eval('a = a+2*b', inplace=False)
     df2 = df.eval('a = a+2*b', inplace=False)
     self.assertTrue(np.allclose(t2['a'], df2['a']))
     self.assertTrue(np.allclose(t2['b'], df2['b']))
     # In-place evaluation.
     t.eval('b = a+2*b', inplace=True)
     df.eval('b = a+2*b', inplace=True)
     # Compare raw ndarrays (consistent with the typed variant of this
     # test) to avoid pandas index-alignment surprises.
     self.assertTrue(np.allclose(t['a'], df['a'].values))
     self.assertTrue(np.allclose(t['b'], df['b'].values))
 def test_user_dict(self) -> None:
     """Smoke-test eval() expressions that use .loc accessors on columns."""
     tbl = Table("table_user_dict",
                 dshape="{a: int, b: float32}",
                 create=True)
     tbl.resize(20)
     tbl["a"] = np.random.randint(100, size=20)
     rand_col: np.ndarray[Any, Any] = np.random.rand(20) * 100
     tbl["b"] = rand_col
     frame = pd.DataFrame(tbl.to_dict())
     # Results are discarded on purpose: only check the expressions run.
     _ = tbl.eval("a = a+2*b", inplace=False)
     _ = frame.eval("x = a.loc[3]+2*b.loc[3]", inplace=False)
Example #4
0
 def test_user_dict(self):
     """Evaluate expressions that use .loc accessors on DataFrame columns."""
     tbl = Table('table_user_dict',
                 dshape="{a: int, b: float32}",
                 create=True)
     tbl.resize(20)
     tbl['a'] = np.random.randint(100, size=20)
     tbl['b'] = np.random.rand(20) * 100
     frame = pd.DataFrame(tbl.to_dict())
     # Result intentionally unused; this only exercises Table.eval.
     t2 = tbl.eval('a = a+2*b', inplace=False)
     frame2 = frame.eval('x = a.loc[3]+2*b.loc[3]', inplace=False)
     print(frame2.x)
Example #5
0
class CSVLoader(TableModule):
    """
    Progressive CSV loader module.

    Warning: this module does not wait for the "filenames" input slot
    before running.
    """

    # Optional input slot supplying a table of file names to load.
    inputs = [SlotDescriptor("filenames", type=Table, required=False)]

    def __init__(
        self,
        filepath_or_buffer: Optional[Any] = None,
        filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
        force_valid_ids: bool = True,
        fillvalues: Optional[Dict[str, Any]] = None,
        as_array: Optional[Any] = None,
        timeout: Optional[float] = None,
        save_context: Optional[Any] = None,  # FIXME seems more like a bool
        recovery: int = 0,  # FIXME seems more like a bool
        recovery_tag: Union[str, int] = "",
        recovery_table_size: int = 3,
        save_step_size: int = 100000,
        **kwds: Any,
    ) -> None:
        """Create a progressive CSV loader.

        :param filepath_or_buffer: input handed to the CSV reader; when
            None, file names are taken from the "filenames" input slot
        :param filter_: optional callable applied to each loaded DataFrame
            chunk (must be callable or None)
        :param force_valid_ids: when True, column names are forced to valid
            identifiers before appending
        :param fillvalues: forwarded to the output Table constructor
        :param as_array: column-grouping spec (str, dict or callable) used
            by ``_data_as_array``
        :param timeout: read timeout forwarded to the input source
        :param save_context: when None, snapshotting is enabled iff the
            input is recoverable  # NOTE(review): looks boolean — confirm
        :param recovery: non-zero to attempt a restart from the recovery
            tables  # NOTE(review): looks boolean — confirm
        :param recovery_tag: suffix distinguishing recovery table names
        :param recovery_table_size: max number of snapshot rows kept
        :param save_step_size: number of rows between two snapshots
        :param kwds: extra keywords; the pandas.read_csv-compatible ones
            are passed to the parser
        """
        super(CSVLoader, self).__init__(**kwds)
        self.tags.add(self.TAG_SOURCE)
        self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
        kwds.setdefault("chunksize", self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds = filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.parser: Optional[Parser] = None
        self.csv_kwds = csv_kwds
        # Compression and encoding are handled by the InputSource, so they
        # are neutralized in the kwds passed to the csv parser.
        self._compression = csv_kwds.get("compression", "infer")
        csv_kwds["compression"] = None
        self._encoding = csv_kwds.get("encoding", None)
        csv_kwds["encoding"] = None
        self._rows_read = 0
        if filter_ is not None and not callable(filter_):
            raise ProgressiveError(
                "filter parameter should be callable or None")
        self._filter = filter_
        # self._input_stream: Optional[Any] = (
        #     None  # stream that returns a position through the 'tell()' method
        # )
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0  # length of the file or input stream when available
        self._timeout_csv = timeout
        self._table_params: Dict[str, Any] = dict(name=self.name,
                                                  fillvalues=fillvalues)
        self._as_array = as_array
        # Snapshotting defaults to on only when the input supports recovery.
        self._save_context = (True if save_context is None
                              and is_recoverable(filepath_or_buffer) else
                              False)
        self._recovery = recovery
        self._recovery_table_size = recovery_table_size
        self._recovery_table: Optional[Table] = None
        self._recovery_table_name = f"csv_loader_recovery_{recovery_tag}"
        self._recovery_table_inv: Optional[Table] = None
        self._recovery_table_inv_name = f"csv_loader_recovery_invariant_{recovery_tag}"
        self._save_step_size = save_step_size
        self._last_saved_id = 0
        # Recovery is only possible when both recovery tables exist;
        # otherwise start from scratch with truncated recovery tables.
        if self._recovery and not self.recovery_tables_exist():
            self._recovery = False
        if not self._recovery:
            self.trunc_recovery_tables()

    def recovery_tables_exist(self) -> bool:
        """Return True when both recovery tables can be opened.

        A missing table is reported by ``Table(..., create=False)`` raising
        a ValueError whose message mentions "exist"; in that case a warning
        is printed and False is returned.  Any other exception propagates.
        """
        # Probe both tables the same way (the original second probe caught
        # a bare Exception, flagged FIXME; ValueError is the documented
        # signal used by the first probe).
        for name, label in (
            (self._recovery_table_name, "recovery table"),
            (self._recovery_table_inv_name, "recovery table invariant"),
        ):
            try:
                Table(name=name, create=False)
            except ValueError as ve:
                # Guard against an empty/non-string args tuple.
                if ve.args and "exist" in str(ve.args[0]):
                    print(f"WARNING: {label} does not exist")
                    return False
                raise
        return True

    def _trunc_one_recovery_table(self, name: str) -> None:
        """Truncate the named table if it exists and is non-empty;
        missing or unreadable tables are silently ignored."""
        length = 0
        rt: Optional[Table] = None
        try:
            rt = Table(name=name, create=False)
            length = len(rt)
        except Exception:
            pass  # best effort: nothing to truncate
        if length and rt is not None:
            rt.drop(slice(None, None, None), truncate=True)

    def trunc_recovery_tables(self) -> None:
        """Empty both recovery tables (used when starting without recovery)."""
        self._trunc_one_recovery_table(self._recovery_table_name)
        self._trunc_one_recovery_table(self._recovery_table_inv_name)

    def rows_read(self) -> int:
        """Return the total number of rows read so far."""
        return self._rows_read

    def is_ready(self) -> bool:
        """Ready when new file names are pending, else defer to the base."""
        if not self.has_input_slot("filenames"):
            return super().is_ready()
        slot = self.get_input_slot("filenames")
        if slot.created is None or slot.created.any():
            return True
        return super().is_ready()

    def is_data_input(self) -> bool:
        """Return True: this module brings new data into the system."""
        # pylint: disable=no-self-use
        return True

    def create_input_source(self, filepath: str) -> InputSource:
        """Build an InputSource for *filepath* using the loader settings."""
        return InputSource.create(
            filepath,
            encoding=self._encoding,
            compression=self._compression,
            timeout=self._timeout_csv,
            start_byte=0,
            usecols=self.csv_kwds.get("usecols"),
        )

    def close(self) -> None:
        """Reset the cached input-stream metadata."""
        self._input_encoding = None
        self._input_compression = None
        self._input_size = 0

    def get_progress(self) -> Tuple[int, int]:
        """Return (position, total size); (0, 0) when the size is unknown."""
        size = self._input_size
        if not size:
            return (0, 0)
        # Position tracking is currently disabled (no input stream handle),
        # so the position is always reported as 0.
        return (0, size)

    def validate_parser(self, run_number: int) -> ModuleState:
        """Make sure ``self.parser`` is ready to produce rows.

        Three paths are possible when no parser exists yet:

        * plain load: open ``filepath_or_buffer`` and build a parser;
        * recovery: open the recovery tables, locate a consistent snapshot,
          truncate the output table to match it, and resume parsing;
        * slot-driven: pull the next file name from the "filenames" slot.

        :param run_number: current run number (currently unused here)
        :returns: ``state_ready`` when a parser is available,
            ``state_blocked`` while waiting for a file name, or
            ``state_terminated`` on missing input / unrecoverable error.
        """
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                if not self._recovery:
                    # Fresh start: open the input and build the parser once,
                    # then forget the path so it is not reopened next time.
                    try:
                        self.parser = read_csv(
                            self.create_input_source(self.filepath_or_buffer),
                            **self.csv_kwds,
                        )
                    except IOError as e:
                        logger.error("Cannot open file %s: %s",
                                     self.filepath_or_buffer, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None
                else:  # do recovery
                    try:
                        if self._recovery_table is None:
                            self._recovery_table = Table(
                                name=self._recovery_table_name, create=False)
                        if self._recovery_table_inv is None:
                            self._recovery_table_inv = Table(
                                name=self._recovery_table_inv_name,
                                create=False)
                        if self.result is None:
                            # Reopen the output table whose name was recorded
                            # in the invariant table.
                            self._table_params[
                                "name"] = self._recovery_table_inv[
                                    "table_name"].loc[0]
                            self._table_params["create"] = False
                            table = Table(**self._table_params)
                            self.result = table
                            table.last_id
                    except Exception as e:  # TODO: specify the exception?
                        logger.error(f"Cannot acces recovery table {e}")
                        return self.state_terminated
                    table = self.table
                    try:
                        # First look for a snapshot taken exactly at the
                        # current end of the output table.
                        last_ = self._recovery_table.eval("last_id=={}".format(
                            len(table)),
                                                          as_slice=False)
                        len_last = len(last_)
                        if len_last > 1:
                            logger.error("Inconsistent recovery table")
                            return self.state_terminated
                        # last_ = self._recovery_table.argmax()['offset']
                        snapshot: Optional[Dict[str, Any]] = None
                        if len_last == 1:
                            row = self._recovery_table.row(last_[0])
                            assert row is not None
                            snapshot = row.to_dict(ordered=True)
                            if not check_snapshot(snapshot):
                                snapshot = None
                        if (snapshot is None
                            ):  # i.e. snapshot not yet found or inconsistent
                            # Fall back to the newest valid snapshot strictly
                            # older than the table end, then truncate the
                            # table to match that snapshot.
                            max_ = -1
                            for i in self._recovery_table.eval(
                                    "last_id<{}".format(len(table)),
                                    as_slice=False):
                                row = self._recovery_table.row(i)
                                assert row is not None
                                sn: Dict[str, Any] = row.to_dict(ordered=True)
                                if check_snapshot(sn) and sn["last_id"] > max_:
                                    max_, snapshot = sn["last_id"], sn
                            if max_ < 0:
                                # logger.error('Cannot acces recovery table (max_<0)')
                                return self.state_terminated
                            table.drop(slice(max_ + 1, None, None),
                                       truncate=True)
                        assert snapshot
                        self._recovered_csv_table_name = snapshot["table_name"]
                    except Exception as e:
                        logger.error("Cannot read the snapshot %s", e)
                        return self.state_terminated
                    try:
                        # Resume the CSV parse from the snapshot position.
                        self.parser = recovery(snapshot,
                                               self.filepath_or_buffer,
                                               **self.csv_kwds)
                    except Exception as e:
                        logger.error("Cannot recover from snapshot %s, %s",
                                     snapshot, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None

            else:  # this case does not support recovery
                fn_slot = None
                if self.has_input_slot("filenames"):
                    fn_slot = self.get_input_slot("filenames")
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                # fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError("Cannot handle input file changes")
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(length=1)
                    assert isinstance(indices, slice)
                    if indices.stop == indices.start:
                        # No new file name available yet.
                        return self.state_blocked
                    filename = df.at[indices.start, "filename"]
                    try:
                        self.parser = read_csv(
                            self.create_input_source(filename),
                            **self.csv_kwds)
                    except IOError as e:
                        logger.error("Cannot open file %s: %s", filename, e)
                        self.parser = None
                        # fall through
        return self.state_ready

    def _data_as_array(self, df: pd.DataFrame) -> Tuple[Any, DataShape]:
        """Convert *df* according to the ``as_array`` spec.

        Returns a pair (data, datashape) where data is the DataFrame
        itself, a single named array, or a dict of grouped column arrays.
        """
        spec = self._as_array
        if not spec:
            # No spec: keep the DataFrame as-is.
            return (df, dshape_from_dataframe(df))
        if callable(spec):
            # A callable spec is resolved once from the column names.
            spec = spec(list(df.columns))  # FIXME
            self._as_array = spec
        if isinstance(spec, str):
            # Single name: the whole frame becomes one 2-D array.
            values = df.values
            return ({spec: values}, array_dshape(values, spec))
        if not isinstance(spec, dict):
            raise ValueError(
                f"Unexpected parameter specified to as_array: {spec}"
            )
        remaining = set(df.columns)
        grouped = {}
        for name, group_cols in spec.items():
            if name in grouped:
                raise KeyError(f"Duplicate column {name} in as_array")
            group = set(group_cols)
            assert group.issubset(remaining)
            remaining -= group
            grouped[name] = df[group_cols].values
        # Ungrouped columns are carried over individually.
        for name in remaining:
            if name in grouped:
                raise KeyError(f"Duplicate column {name} in as_array")
            grouped[name] = df[name].values
        return grouped, dshape_from_dict(grouped)

    def _needs_save(self) -> bool:
        """True when enough rows were appended since the last snapshot."""
        tbl = self.table
        return (tbl is not None
                and tbl.last_id >= self._last_saved_id + self._save_step_size)

    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        """Read up to *step_size* rows and append them to the output table.

        Also takes a recovery snapshot when one is due and the parser has
        flushed its buffers.

        :raises ProgressiveStopIteration: when there is no more input.
        """
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0)
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise ProgressiveStopIteration("no more filenames")
        elif status == self.state_blocked:
            return self._return_run_step(status, steps_run=0)
        elif status != self.state_ready:
            logger.error("Invalid state returned by validate_parser: %d",
                         status)
            self.close()
            raise ProgressiveStopIteration("Unexpected situation")
        logger.info("loading %d lines", step_size)
        needs_save = self._needs_save()
        assert self.parser
        df_list: List[pd.DataFrame]
        try:
            df_list = self.parser.read(
                step_size, flush=needs_save)  # raises StopIteration at EOF
            if not df_list:
                raise ProgressiveStopIteration
        except ProgressiveStopIteration:
            # End of current input: drop the parser so validate_parser can
            # pick the next file name on the following run.
            self.close()
            if self.has_input_slot("filenames"):
                fn_slot = self.get_input_slot("filenames")
                assert fn_slot.output_module is not None
            self.parser = None
            return self._return_run_step(self.state_ready, 0)
        df_len = sum(len(df) for df in df_list)
        creates = df_len
        if creates == 0:  # should not happen
            logger.error("Received 0 elements")
            raise ProgressiveStopIteration
        if self._filter is not None:
            df_list = [self._filter(df) for df in df_list]
        creates = sum(len(df) for df in df_list)
        if creates == 0:
            logger.info("frame has been filtered out")
        else:
            self._rows_read += creates
            logger.info("Loaded %d lines", self._rows_read)
            if self.force_valid_ids:
                for df in df_list:
                    force_valid_id_columns(df)
            if self.result is None:
                table = self.table
                data, dshape = self._data_as_array(pd.concat(df_list))
                if not self._recovery:
                    self._table_params["name"] = self.generate_table_name(
                        "table")
                    self._table_params["data"] = data
                    self._table_params["dshape"] = dshape
                    self._table_params["create"] = True
                    self.result = Table(**self._table_params)
                else:
                    self._table_params["name"] = self._recovered_csv_table_name
                    # self._table_params['dshape'] = dshape
                    self._table_params["create"] = False
                    table = Table(**self._table_params)
                    self.result = table
                    # Bug fix: _data_as_array returns (data, dshape) and
                    # append expects only the data part (see the established
                    # pattern in the branch below); also reuse the conversion
                    # already computed above instead of converting twice.
                    table.append(data)
            else:
                table = self.table
                for df in df_list:
                    data, _dshape = self._data_as_array(df)
                    table.append(data)
            if (self.parser.is_flushed() and needs_save
                    and self._recovery_table is None and self._save_context):
                # First snapshot: create both recovery tables.
                table = self.table
                snapshot = self.parser.get_snapshot(
                    run_number=run_number,
                    table_name=table.name,
                    last_id=table.last_id,
                )
                self._recovery_table = Table(
                    name=self._recovery_table_name,
                    data=pd.DataFrame(snapshot, index=[0]),
                    create=True,
                )
                self._recovery_table_inv = Table(
                    name=self._recovery_table_inv_name,
                    data=pd.DataFrame(
                        dict(
                            table_name=table.name,
                            csv_input=self.filepath_or_buffer,
                        ),
                        index=[0],
                    ),
                    create=True,
                )
                self._last_saved_id = table.last_id
            elif self.parser.is_flushed(
            ) and needs_save and self._save_context:
                # Subsequent snapshots: append, then evict the oldest row
                # once the recovery table exceeds its size limit.
                snapshot = self.parser.get_snapshot(
                    run_number=run_number,
                    last_id=table.last_id,
                    table_name=table.name,
                )
                assert self._recovery_table
                self._recovery_table.add(snapshot)
                if len(self._recovery_table) > self._recovery_table_size:
                    oldest = self._recovery_table.argmin()["offset"]
                    self._recovery_table.drop(oldest)
                self._last_saved_id = table.last_id
        return self._return_run_step(self.state_ready, steps_run=creates)