Example 1
    def _record_explanation(self, cid: CandidateId, cfg_key: ConfigKey,
                            name: str, candidates: List[Configuration],
                            loss: List[float]):
        # Record all sampled candidates, their losses and the current
        # marginalization for this candidate id
        self.explanations[cid.external_name] = {
            'candidates': [PartialConfig(cfg_key, c, name, None) for c in candidates],
            'loss': loss,
            'marginalization': self._compute_marginalization()
        }
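The 'candidates' and 'loss' lists are parallel: the i-th loss belongs to the i-th sampled candidate. A minimal sketch of a hypothetical consumer of such an entry (the helper name and the plain values standing in for PartialConfig objects are illustrative):

def best_candidate(entry):
    # 'candidates' and 'loss' are parallel lists (cf. _record_explanation above)
    idx = min(range(len(entry['loss'])), key=entry['loss'].__getitem__)
    return entry['candidates'][idx]

print(best_candidate({'candidates': ['a', 'b'], 'loss': [0.4, 0.2]}))  # -> 'b'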
Example 2
    def _get_config_for_step(self, idx: int, prefix: str, name: str,
                             logger: ProcessLogger) -> Configuration:
        start = timeit.default_timer()

        cfg_key = self.cfg_keys[idx]
        config, cfg_key = self.cfg_cache.sample_configuration(cid=self.cid,
                                                              name=name,
                                                              cfg_key=cfg_key)

        # Record the sampled configuration as a partial config for this step
        intermediate = PartialConfig(cfg_key, config, name, None)
        logger.new_step(prefixed_name(prefix, name), intermediate)

        # Track the cumulative time spent sampling configurations
        self.config_time += timeit.default_timer() - start
        return config
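The method also shows a simple pattern for accumulating time spent in configuration sampling across calls. A stdlib-only sketch of that pattern, with a stand-in workload instead of cfg_cache.sample_configuration:

import timeit

class Timed:
    # Minimal sketch: accumulate the time spent in a repeatedly-called section
    def __init__(self):
        self.config_time = 0.0

    def sample(self):
        start = timeit.default_timer()
        result = sum(i * i for i in range(1000))  # stand-in for sampling work
        self.config_time += timeit.default_timer() - start
        return result

t = Timed()
for _ in range(3):
    t.sample()
print(f'cumulative sampling time: {t.config_time:.6f}s')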
Example 3
    def restore_config(
        self, pipeline: FlexiblePipeline
    ) -> Tuple[Configuration, List[PartialConfig]]:
        partial_configs: List[PartialConfig] = []

        # Replay the JSON-lines log written by new_step, one record per line
        with open(self.file) as fh:
            for line in fh:
                name, partial_config, origin = json.loads(line)
                partial_config = PartialConfig.from_dict(partial_config, origin)
                partial_configs.append(partial_config)

        partial_configs = self._add_missing_configs(partial_configs, pipeline)
        config = self._merge_configs(partial_configs, pipeline)
        # The log is consumed exactly once
        os.remove(self.file)
        return config, partial_configs
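Together with new_step from Example 7, this implements a JSON-lines log: one [name, serialized config, origin] record per line is appended during search and replayed here. A stdlib-only sketch of that round-trip (file name and payloads are illustrative):

import json
import os

path = 'partial_configs.jsonl'  # illustrative file name

# Write side (cf. new_step in Example 7): one JSON record per line
with open(path, 'a') as fh:
    for record in [['step1', {'C': 1.0}, 'Default'],
                   ['step2', {'k': 5}, 'Random Search']]:
        fh.write(json.dumps(record))
        fh.write('\n')

# Read side (cf. restore_config above): replay the log, then consume it
with open(path) as fh:
    records = [json.loads(line) for line in fh]
os.remove(path)
print(records)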
Example 4
    def _add_missing_configs(self, partial_configs: List[PartialConfig],
                             pipeline: FlexiblePipeline) -> List[PartialConfig]:
        if len(partial_configs) == 0:
            self.logger.warning(
                'Encountered job without any partial configurations. Simulating complete config'
            )

        missing_steps = set(pipeline.all_names())
        for partial_config in partial_configs:
            missing_steps.remove(partial_config.name)

        # Create random configurations for the missing steps, conditioned on the
        # meta-features of the last recorded partial config (if any)
        latest_mf = partial_configs[-1].mf if len(partial_configs) > 0 else None
        for name in missing_steps:
            config = pipeline.get_step(name) \
                .get_hyperparameter_search_space(mf=latest_mf) \
                .sample_configuration()
            config.origin = 'Random Search'
            # noinspection PyTypeChecker
            partial_configs.append(PartialConfig(None, config, name, latest_mf))
        return partial_configs
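The missing steps are found by set difference between all pipeline step names and the steps already covered by partial configs. A stdlib-only sketch of that pattern (step names are illustrative):

all_steps = {'imputation', 'scaling', 'classifier'}  # illustrative step names
covered = ['scaling']  # steps that already have a partial config

missing_steps = set(all_steps)
for name in covered:
    missing_steps.remove(name)  # raises KeyError for unknown names, like the code above

print(sorted(missing_steps))  # ['classifier', 'imputation']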
Example 5
    def _record_explanation(self, cid: CandidateId, cfg_key: ConfigKey, name: str, config: Configuration):
        self.explanations[cid.external_name] = {
            'candidates': [PartialConfig(cfg_key, config, name, None)],
            'loss': [0.5],
            'marginalization': self._compute_marginalization()
        }
Example 6
    def _expand(self, nodes: List[Node],
                worker: Worker,
                cid: CandidateId,
                max_distance: float = 0.05,
                max_failures: int = 3,
                timeout: Optional[float] = None,
                include_preprocessing: bool = True) -> Tuple[Optional[Node], Optional[Result], int]:
        node = nodes[-1]

        failure_count = 0
        n_actions = len(node.available_actions())
        while True:
            with self.lock:
                if node.is_terminal() and self.tree.fully_expanded(node):
                    return None, None, failure_count
                if timeout is not None and timeit.default_timer() > timeout:
                    self.logger.warning('Aborting expansion due to timeout')
                    return None, None, max_failures

                if not node.expanded:
                    self.tree.expand_node(node)
                n_children = len(self.tree.get_children(node.id))
                if n_children >= n_actions:
                    break

                children = self.tree.get_children(node.id, include_unvisited=True)
                action = self.policy.get_next_action(node, children, cid, include_preprocessing=include_preprocessing)
                if action is None:
                    return None, None, failure_count
                if failure_count >= max_failures:
                    self.logger.warning(f'Aborting expansion due to {failure_count} failed expansions')
                    return None, None, failure_count

                new_node = self.tree.inflate_node(estimator=action, parent_node=node)
                component = new_node.component
                self.logger.debug(f'\tExpanding with {component.name()}. Option {n_children + 1}/{n_actions}')

            ds = node.ds
            config, key = self.cfg_cache.sample_configuration(
                cid=cid.with_config(0),
                name=new_node.steps[-1][0],
                configspace=component.get_hyperparameter_search_space(),
                mf=ds.meta_features,
                default=True)

            job = EvaluationJob(ds, cid.with_config(f'{len(node.steps)}_{component.name(short=True)}'),
                                cs=component, cutoff=self.cutoff, config=config, cfg_keys=[key])
            result = worker.start_transform_dataset(job)

            if result.status.value == StatusType.SUCCESS.value:
                ds = Dataset(result.transformed_X, ds.y, ds.metric, ds.cutoff)
                new_node.partial_config = PartialConfig(key, config, str(new_node.id), ds.meta_features)
                new_node.ds = ds

                # Add training time of all previous nodes to the new node
                result.runtime.training_time += sum([n.runtime.training_time for n in nodes])
                new_node.runtime = result.runtime

                if ds.meta_features is None:
                    result.status = StatusType.CRASHED
                    result.structure_loss = util.worst_score(ds.metric)[-1]
                    new_node.failure_message = 'Missing MF'
                    # Currently only missing MF is counted as a failure
                    failure_count += 1
                else:
                    # Check if any node in the tree is similar to the new dataset
                    distance, _, idx = self.store.get_similar(ds.meta_features)
                    if np.allclose(node.ds.meta_features, ds.meta_features, equal_nan=True):
                        self.logger.debug(f'\t{component.name()} did not modify dataset')
                        result.status = StatusType.INEFFECTIVE
                        result.structure_loss = util.worst_score(ds.metric)[-1]
                        new_node.failure_message = 'Ineffective'
                    elif distance <= max_distance:
                        # TODO: currently the existing node is always selected. The new node could represent a simpler model
                        self.logger.debug(f'\t{component.name()} produced a dataset similar to {idx}')
                        result.status = StatusType.DUPLICATE
                        result.structure_loss = util.worst_score(ds.metric)[-1]
                        new_node.failure_message = f'Duplicate {idx}'
                    else:
                        self.store.add(ds.meta_features, data=new_node.id)
                        # Enter the node, as enter was not called yet during tree traversal
                        new_node.enter(cid)
                        new_node.failure_message = None

                        if self.store_ds:
                            with open(os.path.join(self.workdir, f'{new_node.id}.pkl'), 'wb') as f:
                                pickle.dump(ds, f)
            else:
                self.logger.debug(f'\t{component.name()} failed with default hyperparameters: {result.status}')
                result.structure_loss = util.worst_score(ds.metric)[-1]
                if result.status == StatusType.TIMEOUT:
                    new_node.failure_message = 'Timeout'
                else:
                    new_node.failure_message = 'Crashed'

            if result.structure_loss is not None:
                n_children += 1
                self._backpropagate(new_node, result.structure_loss)
                if result.structure_loss < util.worst_score(ds.metric)[-1]:
                    result.partial_configs = [n.partial_config for n in nodes[1:]]
                    result.partial_configs.append(new_node.partial_config)
                    config = FlexiblePipeline(new_node.steps).configuration_space.get_default_configuration()
                    config.origin = 'Default'
                    result.config = config

                    job.result = result
                    self.cfg_cache.register_result(job)
                    # Successful classifiers
                    return new_node, result, failure_count

            else:
                # Successful preprocessors
                return new_node, result, failure_count
        return None, None, failure_count
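Expansion rejects a child if the transformation left the meta-features unchanged (INEFFECTIVE) or if they fall within max_distance of a node already in the store (DUPLICATE). A minimal numpy sketch of both checks (the vectors are illustrative, and Euclidean distance is an assumption for what store.get_similar computes):

import numpy as np

max_distance = 0.05  # same default as in _expand

parent_mf = np.array([0.1, 0.2, 0.3])
child_mf = np.array([0.1, 0.2, 0.3])

# 'Ineffective': the step did not modify the dataset's meta-features at all
if np.allclose(parent_mf, child_mf, equal_nan=True):
    print('ineffective step')

# 'Duplicate': the closest stored node is within max_distance
# (Euclidean distance is an assumption for store.get_similar)
stored = np.array([[0.1, 0.2, 0.34], [0.9, 0.8, 0.7]])
distances = np.linalg.norm(stored - child_mf, axis=1)
idx = int(np.argmin(distances))
if distances[idx] <= max_distance:
    print(f'duplicate of node {idx}')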
Example 7
    def new_step(self, name: str, config: PartialConfig) -> None:
        # Append one JSON record per step: [name, serialized config, origin]
        self.partial_configs.append(config)
        with open(self.file, 'a') as fh:
            fh.write(json.dumps([name, config.as_dict(), config.config.origin]))
            fh.write('\n')
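new_step serializes each config via as_dict, while restore_config in Example 3 rebuilds it via PartialConfig.from_dict. A stub sketch of that symmetry, assuming a dict-based round-trip (the real field layout of PartialConfig is not shown in these examples):

import json

class StubPartialConfig:
    # Stand-in for PartialConfig; only the round-trip surface visible in the
    # examples above is sketched, the real field layout is unknown
    def __init__(self, name, values, origin=None):
        self.name, self.values, self.origin = name, values, origin

    def as_dict(self):
        return {'name': self.name, 'values': self.values}

    @staticmethod
    def from_dict(d, origin):
        return StubPartialConfig(d['name'], d['values'], origin)

line = json.dumps(['step1', StubPartialConfig('step1', {'C': 1.0}).as_dict(), 'Default'])
name, payload, origin = json.loads(line)
restored = StubPartialConfig.from_dict(payload, origin)
print(restored.name, restored.values, restored.origin)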