Ejemplo n.º 1
0
def main(args):
    """Upgrade a scenario XML and write the result to ``args.out``.

    The schema and scenario XMLs are always loaded; when ``args.launch`` is
    truthy the launch XML is loaded as well and the upgrading stage is told
    that a launch XML is available.

    Args:
        args: Parsed command-line arguments providing ``schema``,
            ``scenario``, ``launch`` and ``out``.
    """
    with_launch = bool(args.launch)

    # Initial data the pipeline expects; the launch path is only required
    # when a launch XML was given.
    initial_data = ["schema_path", "scenario_path"]
    if with_launch:
        initial_data.append("launch_path")
    pipeline = PipelineEngine(initial_data)

    stages = [LXMLLoadStage("schema"), LXMLLoadStage("scenario")]
    if with_launch:
        stages.append(LXMLLoadStage("launch"))
    stages.append(SlicingSchemaByVMTypeStage())
    if with_launch:
        stages.append(UpgradingScenarioStage(has_launch_xml=True))
    else:
        stages.append(UpgradingScenarioStage())
    pipeline.add_stages(stages)

    obj = PipelineObject(
        schema_path=args.schema,
        scenario_path=args.scenario,
        launch_path=args.launch,
    )
    pipeline.run(obj)

    # The scenario XML is parsed with lxml, so lxml-specific write options
    # (pretty_print) are safe to use here.
    scenario_etree = obj.get("scenario_etree")
    scenario_etree.write(args.out, pretty_print=True)
Ejemplo n.º 2
0
def main(args):
    """Populate default values in a scenario XML and write it to ``args.out``.

    Args:
        args: Parsed command-line arguments providing ``schema``,
            ``scenario`` and ``out``.
    """
    from xml_loader import XMLLoadStage
    from lxml_loader import LXMLLoadStage

    engine = PipelineEngine(["schema_path", "scenario_path"])
    stages = [
        LXMLLoadStage("schema"),
        XMLLoadStage("scenario"),
        SlicingSchemaByVMTypeStage(),
        DefaultValuePopulatingStage(),
    ]
    engine.add_stages(stages)

    data = PipelineObject(
        schema_path=args.schema,
        scenario_path=args.scenario,
    )
    engine.run(data)

    scenario_etree = data.get("scenario_etree")
    scenario_etree.write(args.out)
Ejemplo n.º 3
0
def main(args):
    """Slice the schema by VM type and by view, then write it to ``args.out``.

    Args:
        args: Parsed command-line arguments providing ``schema`` and ``out``.
    """
    from lxml_loader import LXMLLoadStage

    engine = PipelineEngine(["schema_path"])
    stages = [
        LXMLLoadStage("schema"),
        SlicingSchemaByVMTypeStage(),
        SlicingSchemaByViewStage(),
    ]
    engine.add_stages(stages)

    data = PipelineObject(schema_path=args.schema)
    engine.run(data)

    schema_etree = data.get("schema_etree")
    schema_etree.write(args.out)

    print(f"Sliced schema written to {args.out}")
Ejemplo n.º 4
0
def main(args):
    """Build a validator, validate the selected XMLs and exit with a status.

    Two pipelines are set up: one constructs the validator from the schema
    and data checks, the other validates board/scenario XMLs with it. What
    gets validated depends on which of ``args.board`` and ``args.scenario``
    are given. The process exits with status 1 if any error was counted and
    0 otherwise.

    Args:
        args: Parsed command-line arguments providing ``schema``,
            ``datachecks``, ``board`` and ``scenario``.
    """
    from xml_loader import XMLLoadStage
    from lxml_loader import LXMLLoadStage

    # Pipeline 1: load the schema and data checks and build the validator.
    construction = PipelineEngine(["schema_path", "datachecks_path"])
    construction_stages = [
        LXMLLoadStage("schema"),
        LXMLLoadStage("datachecks"),
        SlicingSchemaByVMTypeStage(),
        ValidatorConstructionStage(),
    ]
    construction.add_stages(construction_stages)

    # Pipeline 2: load a board/scenario pair and validate it.
    validation = PipelineEngine(
        ["board_path", "scenario_path", "schema_etree", "validator"])
    validation_stages = [
        XMLLoadStage("board"),
        XMLLoadStage("scenario"),
        DefaultValuePopulatingStage(),
        SyntacticValidationStage(),
        SemanticValidationStage(),
        ReportValidationResultStage(),
    ]
    validation.add_stages(validation_stages)

    obj = PipelineObject(schema_path=args.schema,
                         datachecks_path=args.datachecks)
    construction.run(obj)

    if not args.board:
        # No board given: validate everything under the data directory.
        data_dir = os.path.join(config_tools_dir, "data")
        error_count = validate_all(validation, obj, data_dir)
    elif args.scenario:
        error_count = validate_one(validation, obj, args.board, args.scenario)
    else:
        error_count = validate_board(validation, obj, args.board)

    exit_status = 1 if error_count > 0 else 0
    sys.exit(exit_status)
Ejemplo n.º 5
0
    def _greedy_initial_selection(self, x_train, y_train, t_predicted,
                                  runtime_limit):
        """K-fold fit an initial batch of fast, on-average-good pipelines.

        A pipeline is a candidate when its predicted runtime is at most
        ``runtime_limit / 8`` and it is among the first
        ``int(len(self.pipeline_settings_on_dataset) / 50)`` pipelines when
        sorted by the NaN-ignoring median of the mode-0 unfolding of
        ``self.error_tensor_imputed``. Candidates are fitted in a process
        pool; successful fits are recorded on ``self`` and added to the
        ensemble's candidate learners, and the ensemble is fitted if at least
        one candidate learner exists.

        Args:
            x_train: Features of the training dataset.
            y_train: Labels of the training dataset.
            t_predicted: Predicted running time per pipeline.
            runtime_limit: Time budget; ``runtime_limit / 4`` is passed as
                the limit to each k-fold fit.
        """
        if self.verbose:
            print("Fitting fast pipelines that perform well on average.")

        # Pipelines predicted to be fast enough.
        fast_enough = set(np.where(t_predicted <= runtime_limit / 8)[0])

        # Pipelines with the lowest median error.
        median_errors = np.nanmedian(
            tl.unfold(self.error_tensor_imputed, mode=0), axis=0)
        n_top = int(len(self.pipeline_settings_on_dataset) / 50)
        best_on_average = set(np.argsort(median_errors)[:n_top])

        candidate_pipeline_indices = list(
            fast_enough.intersection(best_on_average))

        candidate_pipelines = []
        for idx in candidate_pipeline_indices:
            candidate_pipelines.append(
                PipelineObject(p_type=self.p_type,
                               config=self.pipeline_settings_on_dataset[idx],
                               index=idx,
                               verbose=self.verbose))

        pool = mp.Pool(self.n_cores)
        pending_results = []
        for pipeline in candidate_pipelines:
            fit_args = [
                pipeline, x_train, y_train, self.n_folds, runtime_limit / 4,
                self.random_state
            ]
            pending_results.append(
                pool.apply_async(PipelineObject.kfold_fit_validate,
                                 args=fit_args))
        pool.close()
        pool.join()

        if self.verbose:
            print("Initial greedy fitting completed")

        for idx, pipeline, pending in zip(candidate_pipeline_indices,
                                          candidate_pipelines,
                                          pending_results):
            cv_error, cv_predictions, t_elapsed = pending.get()

            if np.isnan(cv_error):
                # Failed fit: only raise the runtime estimate, never lower it.
                self._t_predicted[idx] = max(t_elapsed,
                                             self._t_predicted[idx])
                continue

            pipeline.cv_error = cv_error
            pipeline.cv_predictions = cv_predictions
            pipeline.sampled = True
            self.new_row[:, idx] = cv_error
            self.sampled_pipelines[idx] = pipeline
            self.ensemble.candidate_learners.append(pipeline)
            # update sampled indices
            self.sampled_indices = self.sampled_indices.union({idx})
            self._t_predicted[idx] = t_elapsed

        if len(self.ensemble.candidate_learners) > 0:
            self.ensemble.fitted = True
            self.ensemble.fit(x_train, y_train)
        elif self.verbose:
            print(
                "Insufficient time to fit fast and on average best-performing pipelines."
            )
Ejemplo n.º 6
0
    def _fit(self,
             x_train,
             y_train,
             t_predicted,
             ranks=None,
             runtime_limit=None,
             remaining_global=None):
        """Run a single round of the doubling process on a new dataset.

        The round samples the performance of several pipelines on the new
        dataset, predicts the performance of the remaining ones, and then
        fits an ensemble from the resulting candidate learners.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
            t_predicted (np.ndarray): Predicted running time per pipeline.
            ranks (sequence of int): Ranks of the error tensor factorization;
                only ``ranks[0]`` is used here. When falsy, the result of
                ``linalg.approx_tensor_rank`` on the imputed error tensor is
                used instead.
            runtime_limit (float): Maximum time to allocate to this round.
                When falsy, ``self.runtime_limit`` is used.
            remaining_global (float): The remaining global runtime; half of
                it is passed as the time limit for k-fold fitting the
                pipelines predicted to be the best.
                NOTE(review): the default of None raises a TypeError as soon
                as that fitting step runs -- confirm callers always pass it.

        Returns:
            None. Results are stored on ``self``. The method returns early,
            with ``self.ensemble.fitted`` set to False, when nothing has been
            sampled so far and no pipeline fits the time budget.

        Raises:
            ValueError: If ``ED.pivot_columns_time`` reports an
                initialization case other than ``'greedy_initialization'``
                or ``'qr_initialization'``.
        """
        if self.verbose:
            print("\nSingle round runtime target: {}".format(runtime_limit))

        # set to defaults if not provided
        ranks = ranks or linalg.approx_tensor_rank(self.error_tensor_imputed,
                                                   threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with maximum runtime {} seconds'.format(
                runtime_limit))

        # cold-start: pick the initial set of models to fit on the new dataset
        valid = np.where(t_predicted <= self.n_cores * runtime_limit / 8)[0]
        Y = self.Y[:ranks[0], valid]

        if self.verbose:
            print("Selecting an initial set of models to evaluate ...")

        selected_columns_qr, t_sum, case = ED.pivot_columns_time(
            Y,
            t_predicted[valid],
            runtime_limit / 8,
            columns_to_avoid=None,
            rank=Y.shape[0])
        if case == 'greedy_initialization':
            if self.verbose:
                print(case)
            to_sample = valid[selected_columns_qr]
        elif case == 'qr_initialization':
            to_sample = valid[ED.greedy_stepwise_selection_with_time(
                Y=Y,
                t=t_predicted[valid],
                initialization=selected_columns_qr,
                t_elapsed=t_sum,
                t_max=runtime_limit / 8,
                idx_to_exclude=None)]
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `to_sample`; fail here with a clear message.
            raise ValueError(
                "Unexpected initialization case: {!r}".format(case))
        # TODO: check if Y is rank-deficient, i.e. will ED problem fail

        if self.verbose:
            print(t_predicted[to_sample])

        if np.isnan(to_sample).any():
            to_sample = np.argsort(t_predicted)[:ranks[0]]

        if len(to_sample) == 0 and len(self.sampled_indices) == 0:
            # if no columns are selected in first iteration (log det instability), sample n fastest columns
            n = len(
                np.where(
                    np.cumsum(np.sort(t_predicted)) <= runtime_limit / 4)[0])
            if n > 0:
                to_sample = np.argsort(t_predicted)[:n]
            else:
                self.ensemble.fitted = False
                return

        start = time.time()

        # Defaults for names that only some of the branches below assign, so
        # the second fitting stage never reads an unbound local.
        first = False
        to_fit = []

        # Compare by value: identity comparison against a string literal is
        # implementation-dependent.
        if self.selection_method != 'random':
            candidate_indices = []
            # we only need to fit models on the new dataset if it has not been fitted already
            to_sample = list(set(to_sample) - self.sampled_indices)
            if self.verbose:
                print('Sampling {} entries of new row...'.format(
                    len(to_sample)))

            sampled_pipelines_single_round = [
                PipelineObject(p_type=self.p_type,
                               config=self.pipeline_settings_on_dataset[i],
                               index=i,
                               verbose=self.verbose) for i in to_sample
            ]

            p1 = mp.Pool(self.n_cores)
            sampled_pipeline_errors_single_round = [
                p1.apply_async(PipelineObject.kfold_fit_validate,
                               args=[
                                   p, x_train, y_train, self.n_folds,
                                   runtime_limit / 4, self.random_state
                               ]) for p in sampled_pipelines_single_round
            ]
            p1.close()
            p1.join()

            if self.verbose:
                print("pool fitting completed")

            # record the results of the pipelines fitted in this round
            for i, error in enumerate(sampled_pipeline_errors_single_round):
                cv_error, cv_predictions, t_elapsed = error.get()

                if not np.isnan(cv_error):
                    sampled_pipelines_single_round[
                        i].cv_error, sampled_pipelines_single_round[
                            i].cv_predictions = cv_error, cv_predictions
                    sampled_pipelines_single_round[i].sampled = True
                    self.new_row[:, to_sample[i]] = cv_error
                    self.sampled_pipelines[
                        to_sample[i]] = sampled_pipelines_single_round[i]
                    self.sampled_indices = self.sampled_indices.union(
                        {to_sample[i]})
                    candidate_indices.append(to_sample[i])
                    self._t_predicted[to_sample[i]] = t_elapsed
                else:
                    # failed fit: only raise the runtime estimate
                    self._t_predicted[to_sample[i]] = max(
                        t_elapsed, self._t_predicted[to_sample[i]])

            # predict performance of models not actually fitted on the new dataset
            self.new_row_pred = linalg.impute_with_coefficients(
                self.Y[:ranks[0], :], self.new_row, list(self.sampled_indices))

            for idx in np.argsort(
                    self.new_row[0, :]
            )[:5]:  # np.argsort automatically put nans at the end of the list
                if not np.isnan(self.new_row[0, idx]):
                    if self.verbose:
                        print(self.sampled_pipelines[idx])
                    self.ensemble.candidate_learners.append(
                        self.sampled_pipelines[idx])

            # k-fold fit candidate learners of ensemble
            remaining = (runtime_limit - (time.time() - start)) * self.n_cores
            first = (case == 'qr_initialization'
                     ) and not self.ever_once_selected_best

            if remaining > 0 or first:
                # add models predicted to be the best to list of candidate learners
                if self.verbose:
                    if remaining < 0 and first:
                        print(
                            "Insufficient time in this doubling round, but we add models predicted to be the best at least once."
                        )
                    print("length of sampled indices: {}".format(
                        len(self.sampled_indices)))

                for i in np.argsort(self.new_row_pred[0]):
                    if (
                            first and len(candidate_indices) <= 3
                    ) or t_predicted[i] + t_predicted[candidate_indices].sum(
                    ) <= remaining / 4:
                        candidate_indices.append(i)
                        # if model has already been k-fold fitted, immediately add to candidate learners
                        if i in self.sampled_indices:
                            assert self.sampled_pipelines[i] is not None
                            self.ensemble.candidate_learners.append(
                                self.sampled_pipelines[i])
                # candidate learners that need to be k-fold fitted
                to_fit = list(set(candidate_indices) - self.sampled_indices)
                if self.verbose:
                    print("{} candidate learners need to be k-fold fitted".
                          format(to_fit))
            else:
                if self.verbose:
                    print("Insufficient time in this doubling round.")
        else:
            # random selection: fit exactly the sampled pipelines
            to_fit = to_sample.copy()

        remaining = (runtime_limit - (time.time() - start)) * self.n_cores

        if remaining > 0 or first:
            if len(to_fit) > 0:
                # fit models predicted to have good performance and thus going to be added to the ensemble
                if self.verbose:
                    print("Fitting {} pipelines predicted to be the best ...".
                          format(len(to_fit)))

                candidate_pipelines = [
                    PipelineObject(p_type=self.p_type,
                                   config=self.pipeline_settings_on_dataset[i],
                                   index=i,
                                   verbose=self.verbose) for i in to_fit
                ]

                p2 = mp.Pool(self.n_cores)
                candidate_pipeline_errors = [
                    p2.apply_async(PipelineObject.kfold_fit_validate,
                                   args=[
                                       p, x_train, y_train, self.n_folds,
                                       remaining_global / 2, self.random_state
                                   ]) for p in candidate_pipelines
                ]  # set a not-quite-small limit for promising models
                p2.close()
                p2.join()

                for i, error in enumerate(candidate_pipeline_errors):
                    cv_error, cv_predictions, t_elapsed = error.get()

                    if not np.isnan(cv_error):
                        candidate_pipelines[i].cv_error, candidate_pipelines[
                            i].cv_predictions = cv_error, cv_predictions
                        candidate_pipelines[i].sampled = True
                        self.new_row[:, to_fit[i]] = cv_error
                        self.sampled_pipelines[
                            to_fit[i]] = candidate_pipelines[i]
                        self.ensemble.candidate_learners.append(
                            candidate_pipelines[i])
                        # update sampled indices
                        self.sampled_indices = self.sampled_indices.union(
                            {to_fit[i]})
                        self._t_predicted[to_fit[i]] = t_elapsed
                    else:
                        # failed fit: only raise the runtime estimate
                        self._t_predicted[to_fit[i]] = max(
                            t_elapsed, self._t_predicted[to_fit[i]])

                self.ever_once_selected_best = True

        if len(self.ensemble.candidate_learners) > 0:
            self.ensemble.fitted = True

            if self.verbose:
                print('\nFitting ensemble of maximum size {}...'.format(
                    len(self.ensemble.candidate_learners)))
            # ensemble selection and fitting in the remaining time budget
            self.ensemble.fit(x_train, y_train, remaining,
                              self.sampled_pipelines)
            for pipeline in self.ensemble.base_learners:
                assert pipeline.index is not None
                self.sampled_indices.add(pipeline.index)
                self.sampled_pipelines[pipeline.index] = pipeline

            if self.verbose:
                print('\nAutoLearner fitting complete.')
        else:
            if self.verbose:
                print("Insufficient time in this round.")