Example #1
0
 def __init__(self, **kwds: Any) -> None:
     """Create the Wait module; require a stopping criterion.

     Raises:
         ProgressiveError: when neither ``delay`` nor ``reads`` is
             provided — i.e. ``delay`` is NaN (unset) and ``reads`` is
             -1 (unset) — because the module then has no way to decide
             when to stop waiting.
     """
     super(Wait, self).__init__(**kwds)
     if np.isnan(self.params.delay) and self.params.reads == -1:
         # The condition detects that *neither* parameter was given, so
         # the message must not claim "not both" (that was misleading).
         raise ProgressiveError(
             "Module %s needs either a delay or a number of reads",
             self.pretty_typename(),
         )
Example #2
0
 def __init__(
     self,
     filepath_or_buffer: Optional[Any] = None,
     filter_: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
     force_valid_ids: bool = True,
     fillvalues: Optional[Dict[str, Any]] = None,
     as_array: Optional[Any] = None,
     timeout: Optional[float] = None,
     save_context: Optional[Any] = None,  # FIXME seems more like a bool
     recovery: int = 0,  # FIXME seems more like a bool
     recovery_tag: Union[str, int] = "",
     recovery_table_size: int = 3,
     save_step_size: int = 100000,
     **kwds: Any,
 ) -> None:
     """Progressive CSV loader module.

     Args:
         filepath_or_buffer: path/URL/stream forwarded to pandas'
             ``read_csv``; when None, filenames are expected on the
             "filenames" input slot instead.
         filter_: optional callable applied to each loaded DataFrame chunk;
             must be callable or None.
         force_valid_ids: sanitize column names into valid identifiers.
         fillvalues: default values stored in the result-table parameters.
         as_array: column-grouping spec (opaque here; stored as-is).
         timeout: read timeout for the CSV source.
         save_context: whether to snapshot the loading context for
             recovery (see NOTE below on how it is actually interpreted).
         recovery: truthy to attempt resuming from a previous snapshot.
         recovery_tag: suffix distinguishing recovery tables of runs.
         recovery_table_size: kept on the instance; usage not visible here.
         save_step_size: rows between successive recovery snapshots.
         **kwds: module keywords mixed with pandas ``read_csv`` keywords;
             they are separated by ``filter_kwds`` below.

     Raises:
         ProgressiveError: if ``filter_`` is neither callable nor None.
     """
     super(CSVLoader, self).__init__(**kwds)
     self.tags.add(self.TAG_SOURCE)
     self.default_step_size = kwds.get("chunksize", 1000)  # initial guess
     kwds.setdefault("chunksize", self.default_step_size)
     # Filter out the module keywords from the csv loader keywords
     csv_kwds = filter_kwds(kwds, pd.read_csv)
     # When called with a specified chunksize, it returns a parser
     self.filepath_or_buffer = filepath_or_buffer
     self.force_valid_ids = force_valid_ids
     self.parser: Optional[Parser] = None
     self.csv_kwds = csv_kwds
     # Compression/encoding are handled by our own input source, so they
     # are remembered here and blanked in the kwds handed to pandas.
     self._compression = csv_kwds.get("compression", "infer")
     csv_kwds["compression"] = None
     self._encoding = csv_kwds.get("encoding", None)
     csv_kwds["encoding"] = None
     self._rows_read = 0
     if filter_ is not None and not callable(filter_):
         raise ProgressiveError(
             "filter parameter should be callable or None")
     self._filter = filter_
     # self._input_stream: Optional[Any] = (
     #     None  # stream that returns a position through the 'tell()' method
     # )
     self._input_encoding = None
     self._input_compression = None
     self._input_size = 0  # length of the file or input stream when available
     self._timeout_csv = timeout
     self._table_params: Dict[str, Any] = dict(name=self.name,
                                               fillvalues=fillvalues)
     self._as_array = as_array
     # NOTE(review): an explicitly passed save_context value is ignored;
     # the result is True only when save_context is None AND the source
     # is recoverable. Confirm this is intended (see FIXME on parameter).
     self._save_context = (True if save_context is None
                           and is_recoverable(filepath_or_buffer) else
                           False)
     self._recovery = recovery
     self._recovery_table_size = recovery_table_size
     self._recovery_table: Optional[Table] = None
     self._recovery_table_name = f"csv_loader_recovery_{recovery_tag}"
     self._recovery_table_inv: Optional[Table] = None
     self._recovery_table_inv_name = f"csv_loader_recovery_invariant_{recovery_tag}"
     self._save_step_size = save_step_size
     self._last_saved_id = 0
     # Recovery only makes sense when the snapshot tables actually exist;
     # otherwise fall back to a clean start and truncate stale tables.
     if self._recovery and not self.recovery_tables_exist():
         self._recovery = False
     if not self._recovery:
         self.trunc_recovery_tables()
Example #3
0
 def _validate_descriptors(descriptor_list: List[SlotDescriptor]) -> Dict[str, Any]:
     slots: Dict[str, Any] = {}
     for desc in descriptor_list:
         if desc.name in slots:
             raise ProgressiveError(
                 "Duplicate slot name %s" f" in slot descriptor {desc.name}"
             )
         slots[desc.name] = None
     return slots
 def _add_class(
     self,
     name: str,
     x_column: str,
     y_column: str,
     sample: Union[Literal["default"], Module] = "default",
     sample_slot: str = "result",
     input_module: Optional[Module] = None,
     input_slot: Optional[str] = None,
 ) -> None:
     """Register a data class under *name* and wire its modules into self.

     Exactly one source for the input module must be available (either
     already set on self or passed as an argument), and the input slot
     must be defined once (or consistently twice).
     """
     own_module = self.input_module
     own_slot = self.input_slot
     if own_module is None and input_module is None:
         raise ProgressiveError("Input module is not defined!")
     if not (own_module is None or input_module is None):
         raise ProgressiveError("Input module is defined twice!")
     if own_slot is None and input_slot is None:
         raise ProgressiveError("Input slot is not defined!")
     if (own_slot is not None and input_slot is not None
             and own_slot != input_slot):
         raise ProgressiveError("Input slot is defined twice!")
     dclass = _DataClass(
         name,
         self,
         x_column,
         y_column,
         approximate=self._approximate,
         scheduler=self._scheduler,
     )
     dclass.sample = sample
     module = input_module or own_module
     slot_name = input_slot or own_slot
     if module is not None and slot_name is not None:
         dclass.create_dependent_modules(module, slot_name)
     columns = {self._x_label: x_column, self._y_label: y_column}
     if dclass.histogram2d is not None:
         hist_key = dict(inp="hist", class_=name, **columns)
         self.input["table", hist_key] = dclass.histogram2d.output.result
     if isinstance(dclass.sample, Module):
         sample_key = dict(inp="sample", class_=name, **columns)
         self.input["table", sample_key] = dclass.sample.output[sample_slot]
     self._data_class_dict[name] = dclass
Example #5
0
 def collect_dependencies(self) -> Dict[str, Set[str]]:
     """Map each valid module name to the names of modules it depends on."""
     errors = self.validate()
     if errors:
         raise ProgressiveError(f"Invalid dataflow: {errors}")
     # A module depends on the output modules feeding its input slots.
     return {
         node.name: {
             slot.output_module.name
             for slot in self.inputs[node.name].values()
         }
         for node in self.valid
     }
Example #6
0
 def connect(
     self,
     output_module: Module,
     output_name: str,
     input_module: Module,
     input_name: str,
 ) -> None:
     """Create a slot joining an output to an input and register it."""
     new_slot = output_module.create_slot(output_name, input_module, input_name)
     # Reject type-incompatible connections before registering anything.
     if new_slot.validate_types():
         self.add_connection(new_slot)
     else:
         raise ProgressiveError(
             "Incompatible types for slot (%s,%s) in %s" %
             (output_name, input_name, str(new_slot)))
Example #7
0
 def add_connection(self,
                    slot: Optional[Slot],
                    rename: bool = True) -> None:
     """Register *slot* in this dataflow's input/output tables.

     Args:
         slot: connection to register; None/falsy slots are ignored.
         rename: when the target input accepts multiple slots, give this
             one a unique suffixed name (otherwise keep slot.input_name).

     Raises:
         ProgressiveError: when a *different* slot is already connected
             to the same input.
     """
     if not slot:
         return
     output_module = slot.output_module
     output_name = slot.output_name
     input_module = slot.input_module
     input_name = slot.original_name or slot.input_name
     if input_module is None:
         return
     assert input_name is not None
     if input_module.input_slot_multiple(input_name):
         if rename:
             # Several slots may target the same logical input; suffix a
             # counter to make the stored name unique, keep the original.
             slot.original_name = input_name
             input_name += f".{self.multiple_slots_name_generator:04}"
             self.multiple_slots_name_generator += 1
             logger.info(f"{slot.original_name} renamed {input_name}")
             slot.input_name = input_name
         else:
             input_name = slot.input_name
     if input_name in self.inputs[input_module.name]:
         if slot is self.inputs[input_module.name][input_name]:
             # Same slot registered twice: warn but tolerate.
             # (logger.warn is deprecated; use logger.warning.)
             logger.warning(
                 "redundant connection: "
                 "Input slot %s already connected to "
                 "slot %s in module %s",
                 input_name,
                 self.inputs[input_module.name][input_name],
                 input_module.name,
             )
         else:
             # Fixed missing space: the concatenation used to produce
             # "connected toslot".
             raise ProgressiveError(
                 "Input slot %s already connected to "
                 "slot %s in module %s" % (
                     input_name,
                     self.inputs[input_module.name][input_name],
                     input_module.name,
                 ))
     assert input_name is not None
     self.inputs[input_module.name][input_name] = slot
     if output_module.name not in self.outputs:
         self.outputs[output_module.name] = {output_name: [slot]}
     elif output_name not in self.outputs[output_module.name]:
         self.outputs[output_module.name][output_name] = [slot]
     else:
         self.outputs[output_module.name][output_name].append(slot)
     self.valid = []  # Not sure
Example #8
0
 def _add_module(self, module: Module) -> None:
     if module.name in self.inputs:
         raise ProgressiveError("Module %s already exists" % module.name)
     self._modules[module.name] = module
     self.inputs[module.name] = {}
     self.outputs[module.name] = {}
Example #9
0
    def __init__(
        self,
        name: Optional[str] = None,
        group: Optional[str] = None,
        scheduler: Optional[Scheduler] = None,
        storagegroup: Optional[Group] = None,
        **kwds: Any,
    ) -> None:
        """Initialize the module and register it in the scheduler's dataflow.

        Args:
            name: unique module name; generated from the type name when None.
            group: logical group label; falls back to GroupContext.group.
            scheduler: owning scheduler; Scheduler.default when None.
            storagegroup: storage for the tracer; an internal group is
                created when None.
            **kwds: extra parameters handed to _parse_parameters.

        Raises:
            ProgressiveError: when the scheduler has no dataflow context or
                the name already exists in that dataflow.
        """
        # Declared (not assigned) here; filled in by _parse_parameters.
        self._args: Sequence[Tuple[str, Any]]
        self._kwds: Dict[str, Any]
        if scheduler is None:
            scheduler = Scheduler.default
        self._scheduler: Scheduler = scheduler
        if scheduler.dataflow is None:
            raise ProgressiveError("No valid context in scheduler")
        dataflow: Dataflow = scheduler.dataflow
        if name is None:
            name = dataflow.generate_name(self.pretty_typename())
        elif name in dataflow:
            raise ProgressiveError(
                "module already exists in scheduler," " delete it first"
            )
        self.name = name  # need to set the name so exception can remove it
        predictor = TimePredictor.default()
        predictor.name = name
        self.predictor = predictor
        storage = StorageManager.default
        self.storage = storage
        if storagegroup is None:
            assert Group.default_internal is not None
            storagegroup = Group.default_internal(get_random_name(name + "_tracer"))
        self.storagegroup: Group = storagegroup
        tracer = Tracer.default(name, storagegroup)

        # Run-time bookkeeping initialized to neutral values.
        self.tags = set(ModuleTag.tags)
        self.order: int = -1
        self.group: Optional[str] = group or GroupContext.group
        self.tracer = tracer
        self._start_time: float = 0
        self._end_time: float = 0
        self._last_update: int = 0
        self._state: ModuleState = Module.state_created
        self._saved_state: ModuleState = Module.state_invalid
        self._had_error = False
        self._parse_parameters(kwds)

        # always present
        input_descriptors = self.all_inputs
        output_descriptors = self.all_outputs
        # Maps slot name -> slot (None until connected); raises on
        # duplicate descriptor names.
        self._input_slots: Dict[str, Optional[Slot]] = self._validate_descriptors(
            input_descriptors
        )
        self.input_descriptors: Dict[str, SlotDescriptor] = {
            d.name: d for d in input_descriptors
        }
        # self.input_multiple: Dict[str, int] = {
        #     d.name: 0 for d in input_descriptors if d.multiple
        # }
        self._output_slots: Dict[
            str, Optional[List[Slot]]
        ] = self._validate_descriptors(output_descriptors)
        self.output_descriptors: Dict[str, SlotDescriptor] = {
            d.name: d for d in output_descriptors
        }
        self.default_step_size: int = 100
        # Proxies implementing the self.input[...] / self.output[...] syntax.
        self.input = InputSlots(self)
        self.output = OutputSlots(self)
        self.steps_acc: int = 0
        # self.wait_expr = aio.FIRST_COMPLETED
        self.context: Optional[_Context] = None
        # callbacks
        self._start_run = ModuleCallbackList()
        self._after_run = ModuleCallbackList()
        self._ending: List[ModuleCb] = []
        # Register module
        dataflow.add_module(self)
Example #10
0
 def __setattr__(self, name: str, slot: Slot) -> None:
     """Forbid attribute assignment on this output-slots proxy.

     Raises:
         ProgressiveError: always; output slots are read through this
             object, never assigned to it.
     """
     raise ProgressiveError("Output slots cannot be assigned, only read")
Example #11
0
 def __getitem__(self, name: str) -> Slot:
     """Forbid subscription reads on this input-slots proxy.

     Raises:
         ProgressiveError: always; input slots are assigned through this
             object, never read back.
     """
     raise ProgressiveError("Input slots cannot be read, only assigned to")
Example #12
0
    def validate_parser(self, run_number: int) -> ModuleState:
        """Ensure ``self.parser`` exists, creating or recovering it on demand.

        Returns state_ready once a parser is available, state_blocked when
        waiting for a filename on the "filenames" slot, and
        state_terminated on unrecoverable errors.
        """
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                if not self._recovery:
                    # Fresh start: open the configured source directly.
                    try:
                        self.parser = read_csv(
                            self.create_input_source(self.filepath_or_buffer),
                            **self.csv_kwds,
                        )
                    except IOError as e:
                        logger.error("Cannot open file %s: %s",
                                     self.filepath_or_buffer, e)
                        self.parser = None
                        return self.state_terminated
                    # Source consumed: prevents re-opening on later calls.
                    self.filepath_or_buffer = None
                else:  # do recovery
                    # Reopen the snapshot tables and reattach the previously
                    # created result table.
                    try:
                        if self._recovery_table is None:
                            self._recovery_table = Table(
                                name=self._recovery_table_name, create=False)
                        if self._recovery_table_inv is None:
                            self._recovery_table_inv = Table(
                                name=self._recovery_table_inv_name,
                                create=False)
                        if self.result is None:
                            self._table_params[
                                "name"] = self._recovery_table_inv[
                                    "table_name"].loc[0]
                            self._table_params["create"] = False
                            table = Table(**self._table_params)
                            self.result = table
                            # NOTE(review): bare attribute access; presumably
                            # forces lazy initialization of last_id — confirm.
                            table.last_id
                    except Exception as e:  # TODO: specify the exception?
                        logger.error(f"Cannot acces recovery table {e}")
                        return self.state_terminated
                    table = self.table
                    try:
                        # A snapshot whose last_id equals the current table
                        # length means the table is exactly in that state.
                        last_ = self._recovery_table.eval("last_id=={}".format(
                            len(table)),
                                                          as_slice=False)
                        len_last = len(last_)
                        if len_last > 1:
                            logger.error("Inconsistent recovery table")
                            return self.state_terminated
                        # last_ = self._recovery_table.argmax()['offset']
                        snapshot: Optional[Dict[str, Any]] = None
                        if len_last == 1:
                            row = self._recovery_table.row(last_[0])
                            assert row is not None
                            snapshot = row.to_dict(ordered=True)
                            if not check_snapshot(snapshot):
                                snapshot = None
                        if (snapshot is None
                            ):  # i.e. snapshot not yet found or inconsistent
                            # Fall back to the newest valid snapshot strictly
                            # older than the current table, then truncate the
                            # table back to that point.
                            max_ = -1
                            for i in self._recovery_table.eval(
                                    "last_id<{}".format(len(table)),
                                    as_slice=False):
                                row = self._recovery_table.row(i)
                                assert row is not None
                                sn: Dict[str, Any] = row.to_dict(ordered=True)
                                if check_snapshot(sn) and sn["last_id"] > max_:
                                    max_, snapshot = sn["last_id"], sn
                            if max_ < 0:
                                # logger.error('Cannot acces recovery table (max_<0)')
                                return self.state_terminated
                            table.drop(slice(max_ + 1, None, None),
                                       truncate=True)
                        assert snapshot
                        self._recovered_csv_table_name = snapshot["table_name"]
                    except Exception as e:
                        logger.error("Cannot read the snapshot %s", e)
                        return self.state_terminated
                    # Resume parsing from the chosen snapshot.
                    try:
                        self.parser = recovery(snapshot,
                                               self.filepath_or_buffer,
                                               **self.csv_kwds)
                    except Exception as e:
                        logger.error("Cannot recover from snapshot %s, %s",
                                     snapshot, e)
                        self.parser = None
                        return self.state_terminated
                    self.filepath_or_buffer = None

            else:  # this case does not support recovery
                # Pull the next filename from the "filenames" input slot.
                fn_slot = None
                if self.has_input_slot("filenames"):
                    fn_slot = self.get_input_slot("filenames")
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                # fn_slot.update(run_number)
                if fn_slot.deleted.any() or fn_slot.updated.any():
                    raise ProgressiveError("Cannot handle input file changes")
                df = fn_slot.data()
                while self.parser is None:
                    indices = fn_slot.created.next(length=1)
                    assert isinstance(indices, slice)
                    if indices.stop == indices.start:
                        # No new filename yet; wait for more input.
                        return self.state_blocked
                    filename = df.at[indices.start, "filename"]
                    try:
                        self.parser = read_csv(
                            self.create_input_source(filename),
                            **self.csv_kwds)
                    except IOError as e:
                        logger.error("Cannot open file %s: %s", filename, e)
                        self.parser = None
                        # fall through
        return self.state_ready