Example #1
    def produce(self, specification):
        configuration = specification.get('configuration', {})
        predict_type = configuration.get('predict_type', 'RAW')

        # REFIT
        dataframe_train = Dataset(specification['train']).get_dataframe().dropna()

        stimulus = self.make_stimulus(dataframe_train[self.predictors])
        self.fit(stimulus, dataframe_train[self.targets[0]], specification['train'])

        # PRODUCE
        dataframe = Dataset(specification['input']).get_dataframe().dropna()
        dataframe.reset_index(drop=True, inplace=True)

        stimulus = self.make_stimulus(dataframe[self.predictors])
        output_directory_path = specification['output']['resource_uri'].replace('file://', '')
        output_path = '/' + os.path.join(
            *output_directory_path.split('/'),
            str(uuid.uuid4()) + '.csv')

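        # the mljar model's predict() here returns class probability columns (p_0, p_1, ...); for RAW output, reduce them to a binary label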
        if self.system == 'mljar-supervised':
            predictions = self.model.predict(stimulus)
            if predict_type == 'RAW':
                predictions = pandas.DataFrame((predictions.idxmax(axis=1) == 'p_1').astype(int))
                predictions.columns = [self.targets[0]]

        else:
            if predict_type == 'RAW':
                predictions = self.model.predict(stimulus)
                if len(predictions.shape) > 1:
                    predictions = np.argmax(predictions, axis=-1)
                predictions = pandas.DataFrame(predictions, columns=[self.targets[0]]).astype(int)
            else:
                predictions = self.model.predict_proba(stimulus)
                # TODO: standardize probability column names
                predictions = pandas.DataFrame(predictions, columns=[f'p_{i}' for i in range(predictions.shape[1])])

        predictions.reset_index(drop=True, inplace=True)
        predictions.insert(0, 'd3mIndex', dataframe['d3mIndex'])

        if not os.path.exists(output_directory_path):
            os.makedirs(output_directory_path)

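        # write the predictions CSV from the filesystem root, then restore the original working directory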
        cwd = os.getcwd()
        try:
            os.chdir('/')
            predictions.to_csv(output_path, index=False)
        finally:
            os.chdir(cwd)

        return {
            'produce': {
                'input': specification['input'],
                'configuration': configuration,
                'data_pointer': output_path
            },
            'search_id': self.search_id,
            'model_id': self.model_id,
            'system': self.system
        }
Example #2
    def run(self):
        from supervised.automl import AutoML

        dataset = Dataset(self.specification['input'])

        dataframe = dataset.get_dataframe().dropna()
        X = self.specification['problem']['predictors']
        y = self.specification['problem']['targets'][0]

        stimulus, preprocessor = preprocess(dataframe, self.specification)

        if self.specification.get('timeBoundSearch'):
            self.system_params['total_time_limit'] = self.specification[
                'timeBoundSearch']

        if self.specification.get('timeBoundRun'):
            self.system_params['learner_time_limit'] = self.specification[
                'timeBoundRun']

        automl = AutoML(**self.system_params)

        # mljar-supervised is particular about its input: pass a DataFrame with stripped string column names
        stimulus = pandas.DataFrame(stimulus)
        stimulus.columns = [str(i).strip() for i in stimulus.columns]

        automl.fit(stimulus, dataframe[y])

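        # register the four models with the lowest final loss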
        for model_mljar in sorted(automl._models,
                                  key=lambda m: m.get_final_loss())[:4]:
            model = ModelSklearn(
                model_mljar,
                system='mljar-supervised',
                search_id=self.search_id,
                predictors=X,
                targets=[y],
                preprocess=preprocessor,
                task=self.specification['problem']['taskType'])

            model.save()

            from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
            FOUND_MODEL_CALLBACKS[self.callback_found](
                model, **(self.callback_arguments or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'mljar-supervised'
            }
        }
Example #3
    def score(self, specification):
        # TODO: refitting -> respect configuration
        configuration = specification['configuration']

        dataframe = Dataset(specification['input']).get_dataframe()

        target = self.targets[0]
        if self.task == 'CLASSIFICATION':
            dataframe[target] = dataframe[target].astype(str)

        predicted = self.model.predict(dataframe)

        scores = []
        for metric in specification['performanceMetrics']:
            scores.append({
                'value': get_metric(metric)(dataframe[target], predicted[f'{target}_predictions']),
                'metric': metric,
                'target': target
            })

        return {
            'search_id': self.search_id,
            'model_id': self.model_id,
            'scores': scores,
            'system': self.system
        }
Example #4
    def produce(self, produce_specification):
        configuration = produce_specification.get('configuration', {})
        predict_type = configuration.get('predict_type', 'RAW')

        dataframe = Dataset(produce_specification['input']).get_dataframe()

        if self.task in ['REGRESSION', 'CLASSIFICATION']:
            dataframe_train = Dataset(produce_specification['train']).get_dataframe().dropna()
            self.fit(dataframe=dataframe_train, data_specification=produce_specification['train'])

        if predict_type == 'RAW':
            predicted = self.model.predict(dataframe)
            # if len(predicted.columns.values) > 1:
            #     predicted = np.argmax(predicted, axis=-1)
        else:
            predicted = self.model.predict_proba(dataframe)
            # TODO: standardize probability column names
            predicted = pandas.DataFrame(predicted, columns=[f'p_{i}' for i in range(predicted.shape[1])])

        output_directory_path = produce_specification['output']['resource_uri'].replace('file://', '')
        output_path = '/' + os.path.join(
            *output_directory_path.split('/'),
            str(uuid.uuid4()) + '.csv')

        if 'd3mIndex' not in predicted.columns.values:
            predicted.insert(0, 'd3mIndex', dataframe['d3mIndex'])

        if not os.path.exists(output_directory_path):
            os.makedirs(output_directory_path)

        cwd = os.getcwd()
        try:
            os.chdir('/')
            predicted.to_csv(output_path, index=False)
        finally:
            os.chdir(cwd)

        return {
            'produce': {
                'input': produce_specification['input'],
                'configuration': configuration,
                'data_pointer': output_path
            },
            'search_id': self.search_id,
            'model_id': self.model_id,
            'system': self.system
        }
Example #5
    def score(self, specification):
        dataframe = Dataset(specification['input']).get_dataframe()[self.predictors + self.targets].dropna()
        dataframe.reset_index(drop=True, inplace=True)

        configuration = specification['configuration']

        splits = self.make_splits(configuration, dataframe)
        split_scores = defaultdict(list)
        split_weights = defaultdict(list)
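        # refit on each train split, score on its held-out split, and weight each metric by split size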
        for train_split, test_split in splits:
            self.fit(self.make_stimulus(train_split), train_split[self.targets[0]])

            actual = np.array(test_split[self.targets[0]])
            predicted = self.model.predict(self.make_stimulus(test_split))

            if 'CLASSIFICATION' in self.task:
                actual = actual.astype(int)

            if self.system == 'mljar-supervised':
                predicted = pandas.DataFrame((predicted.idxmax(axis=1) == 'p_1').astype(int))
                predicted.columns = [self.targets[0]]

            for metric in specification['performanceMetrics']:
                try:
                    split_scores[json.dumps(metric)].append(get_metric(metric)(actual, predicted))
                    split_weights[json.dumps(metric)].append(test_split.size)
                except ValueError:
                    pass

        scores = []
        for metric in split_scores:
            if split_scores[metric]:
                scores.append({
                    'value': np.average(split_scores[metric], weights=split_weights[metric]),
                    'metric': json.loads(metric),
                    'target': self.targets[0]
                })

        return {
            'search_id': self.search_id,
            'model_id': self.model_id,
            'scores': scores,
            'system': self.system
        }
Example #6
    def produce(self, specification):
        configuration = specification.get('configuration', {})
        predict_type = configuration.get('predict_type', 'RAW')

        dataset = Dataset(specification['input'])
        dataframe = dataset.get_dataframe()
        predictions = self.model.predict(dataframe)

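        # the fitted model returns '<target>_predictions' and '<target>_probabilities_*' columns; keep only the view requested by predict_type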
        if predict_type == 'RAW':
            predictions = predictions[[f'{self.targets[0]}_predictions']]
            predictions.columns = [self.targets[0]]

        if predict_type == 'PROBABILITIES':
            predictions = predictions[[i for i in predictions.columns.values if i.startswith(f'{self.targets[0]}_probabilities_')]]

        predictions.insert(0, 'd3mIndex', dataframe['d3mIndex'])

        output_directory_path = specification['output']['resource_uri'].replace('file://', '')
        output_path = '/' + os.path.join(
            *output_directory_path.split('/'),
            str(uuid.uuid4()) + '.csv')

        if not os.path.exists(output_directory_path):
            os.makedirs(output_directory_path)

        cwd = os.getcwd()
        try:
            os.chdir('/')
            predictions.to_csv(output_path, index=False)
        finally:
            os.chdir(cwd)

        return {
            'produce': {
                'input': specification['input'],
                'configuration': configuration,
                'data_pointer': output_path
            },
            'search_id': self.search_id,
            'model_id': self.model_id,
            "system": self.system
        }
Example #7
    def score(self, score_specification):
        # configuration = score_specification['configuration']
        dataframe = Dataset(score_specification['input']).get_dataframe()

        if self.task == "FORECASTING":
            # dataframe_train = Dataset(score_specification['train']).get_dataframe()
            # horizon = configuration.get('forecastingHorizon', {}).get('value', 1)
            # if len(dataframe) < horizon:
            #     raise ValueError(f'No predictions with a horizon of {horizon} are within range of the test data.')

            predicted = self.model.predict(dataframe)

            # predicted = self.forecast(
            #     dataframe=dataframe_train,
            #     dataframe_rolling=dataframe,
            #     horizon=horizon)[:len(dataframe) - horizon + 1]

        elif self.task in ['CLASSIFICATION', 'REGRESSION']:
            # TODO: respect configuration on holdout vs cross-validation, do refitting, etc.
            if self.task == 'CLASSIFICATION':
                for target in self.targets:
                    dataframe[target] = dataframe[target].astype(str)
            predicted = self.model.predict(dataframe)
            if self.task == 'CLASSIFICATION':
                for target in self.targets:
                    predicted[target] = predicted[target].astype(str)

        else:
            raise NotImplementedError

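        # compute each requested metric per target, using only rows where both actual and predicted values are present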
        scores = []
        for target in self.targets:
            for metric in score_specification['performanceMetrics']:
                results = pandas.DataFrame({'actual': dataframe[target], 'predicted': predicted[target]})
                results.dropna(inplace=True)

                scores.append({
                    'value': get_metric(metric)(results['actual'], results['predicted']),
                    'metric': metric,
                    'target': target
                })

        return {
            'search_id': self.search_id,
            'model_id': self.model_id,
            'scores': scores,
            'system': self.system
        }
Example #8
    def run(self):
        import autosklearn.classification
        import autosklearn.regression

        dataset = Dataset(self.specification['input'])

        dataframe = dataset.get_dataframe().dropna()
        dataframe.reset_index(drop=True, inplace=True)
        stimulus, preprocessor = preprocess(dataframe, self.specification)

        x = self.specification['problem']['predictors']
        y = self.specification['problem']['targets'][0]

        # if os.path.exists(tmp_folder):
        #     shutil.rmtree(tmp_folder)
        # if os.path.exists(output_folder):
        #     shutil.rmtree(output_folder)

        # TODO: auto_sklearn has a bug with weak references when certain non-default options are used.
        #       Just avoiding this bug for now
        # if 'configuration' in self.specification:
        #     config = self.specification['configuration']
        #
        #     self.system_params['resampling_strategy_arguments'] = self.system_params.get('resampling_strategy_arguments', {})
        #     self.system_params['resampling_strategy_arguments']['shuffle'] = config.get('shuffle', False)
        #
        #     if config['method'] == "HOLDOUT":
        #         self.system_params['resampling_strategy'] = 'holdOut'
        #         self.system_params['resampling_strategy_arguments']['train_size'] = max(0, config.get('trainTestRatio')) or .6
        #
        #     if config['method'] == "K_FOLD":
        #         self.system_params['resampling_strategy'] = 'cv'
        #         self.system_params['resampling_strategy_arguments']['folds'] = config.get('folds') or 10

        if self.specification.get('timeBoundSearch'):
            self.system_params[
                'time_left_for_this_task'] = self.specification.get(
                    'timeBoundSearch')

        if self.specification.get('timeBoundRun'):
            self.system_params['per_run_time_limit'] = self.specification.get(
                'timeBoundRun')
        # sklearn_temp_path = '/ravens_volume/solvers/auto_sklearn/temporary/' + str(uuid.uuid4())
        # tmp_folder = os.path.join(*sklearn_temp_path.split('/'), 'temp')
        # output_folder = os.path.join(*sklearn_temp_path.split('/'), 'output')

        # self.system_params['tmp_folder'] = tmp_folder
        # self.system_params['output_folder'] = output_folder
        # self.system_params['delete_tmp_folder_after_terminate'] = False

        # turn off daemon flag from the currently running process, to allow child processes from auto_sklearn fit
        multiprocessing.current_process()._config['daemon'] = False
        self.system_params['n_jobs'] = 1

        # valid system params
        # https://automl.github.io/auto-sklearn/master/api.html#api
        automl = {
            'REGRESSION': autosklearn.regression.AutoSklearnRegressor,
            'CLASSIFICATION': autosklearn.classification.AutoSklearnClassifier
        }[self.specification['problem']['taskType']](**self.system_params)

        automl.fit(stimulus.copy(), dataframe[y].copy())

        # if self.system_params.get('resampling_strategy') == 'cv':
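        # refit the models found during search on the full training data before wrapping and saving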
        automl.refit(stimulus, dataframe[y])

        model = ModelSklearn(automl,
                             system='auto_sklearn',
                             search_id=self.search_id,
                             predictors=x,
                             targets=[y],
                             preprocess=preprocessor,
                             task=self.specification['problem']['taskType'])
        model.save()

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](model,
                                                   **(self.callback_arguments
                                                      or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'auto_sklearn'
            }
        }
Example #9
    def run(self):
        from ludwig.api import LudwigModel

        dataset = Dataset(self.specification['input'])

        dataframe = dataset.get_dataframe()
        predictors = self.specification['problem']['predictors']
        targets = self.specification['problem']['targets']

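        # map the task type onto Ludwig output feature types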
        target_type = {
            "REGRESSION": 'numerical',
            "CLASSIFICATION": 'category'
        }[self.specification['problem']['taskType']]

        if self.specification['problem']['taskType'] == 'CLASSIFICATION':
            dataframe[targets[0]] = dataframe[targets[0]].astype(str)

        # https://github.com/uber/ludwig/blob/master/tests/integration_tests/utils.py
        model_definition = {
            "input_features": [{
                "name": predictor,
                "type": ('category'
                         if predictor in self.specification['problem']['categorical']
                         else 'numerical')
            } for predictor in predictors],
            "output_features": [{
                "name": target,
                "type": target_type
            } for target in targets]
        }

        automl = LudwigModel(model_definition)

        train_statistics = automl.train(dataframe)

        print('train_statistics')
        print(train_statistics)

        model = ModelLudwig(automl,
                            search_id=self.search_id,
                            predictors=predictors,
                            targets=targets,
                            task=self.specification['problem']['taskType'])

        model.save()

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](model,
                                                   **(self.callback_arguments
                                                      or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'ludwig'
            }
        }
Example #10
    def run(self):
        import mlbox.model.classification
        import mlbox.model.regression

        dataset = Dataset(self.specification['input'])

        dataframe = dataset.get_dataframe().dropna()
        X = self.specification['problem']['predictors']
        y = self.specification['problem']['targets'][0]

        stimulus, preprocessor = preprocess(dataframe, self.specification)

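        # fit one candidate model per learner strategy for the given task type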
        strategies = {
            'REGRESSION': [
                "LightGBM", "RandomForest", "ExtraTrees", "Tree", "Bagging",
                "AdaBoost", "Linear"
            ],
            'CLASSIFICATION': [
                "LightGBM", "RandomForest", "ExtraTrees", "Tree", "Bagging",
                "AdaBoost", "Linear"
            ],
        }

        if self.FAST_DEBUG:
            strategies = {
                'REGRESSION': ["RandomForest"],
                'CLASSIFICATION': ["RandomForest"],
            }

        solver = {
            'REGRESSION': mlbox.model.regression.Regressor,
            'CLASSIFICATION': mlbox.model.classification.Classifier
        }

        for strategy in strategies[self.specification['problem']['taskType']]:
            automl = solver[self.specification['problem']['taskType']](
                strategy=strategy, **self.system_params)

            if isinstance(stimulus, csr_matrix):
                stimulus = stimulus.toarray()

            automl.fit(df_train=pandas.DataFrame(stimulus),
                       y_train=dataframe[y])

            model = ModelSklearn(
                automl,
                system='mlbox',
                search_id=self.search_id,
                predictors=X,
                targets=[y],
                preprocess=preprocessor,
                task=self.specification['problem']['taskType'])
            model.save()

            from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
            FOUND_MODEL_CALLBACKS[self.callback_found](
                model, **(self.callback_arguments or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'mlbox'
            }
        }
Example #11
    def run(self):
        import tpot
        dataset = Dataset(self.specification['input'])

        dataframe = dataset.get_dataframe().dropna()
        stimulus, preprocessor = preprocess(dataframe, self.specification)

        X = self.specification['problem']['predictors']
        y = self.specification['problem']['targets'][0]

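        # 'TPOT sparse' is TPOT's built-in configuration that accepts sparse feature matrices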
        self.system_params['config_dict'] = 'TPOT sparse'

        # if 'configuration' in self.specification:
        #     config = self.specification['configuration']
        #
        #     if config['method'] == "HOLDOUT":
        #         self.system_params['cv'] =
        #
        #     if config['method'] == "K_FOLD":
        #         self.system_params['cv'] =

        if self.specification.get('timeBoundSearch'):
            self.system_params['max_time_mins'] = self.specification.get(
                'timeBoundSearch') / 60.

        if self.specification.get('timeBoundRun'):
            self.system_params['max_eval_time_mins'] = self.specification.get(
                'timeBoundRun') / 60.

        # custom scorers cause unidentified SIGSEGV upon exit of search
        # scorer = make_scorer(
        #     get_metric(self.specification['performanceMetric']),
        #     greater_is_better=should_maximize(self.specification['performanceMetric']))
        # self.system_params['scoring'] = scorer
        self.system_params['n_jobs'] = 1

        automl = {
            'REGRESSION': tpot.TPOTRegressor,
            'CLASSIFICATION': tpot.TPOTClassifier
        }[self.specification['problem']['taskType']](**self.system_params)

        automl.fit(stimulus, dataframe[y])

        # selected models along the cost-complexity vs accuracy frontier
        for model_str in automl.pareto_front_fitted_pipelines_:
            model = ModelSklearn(
                automl.pareto_front_fitted_pipelines_[model_str],
                system='tpot',
                search_id=self.search_id,
                predictors=X,
                targets=[y],
                preprocess=preprocessor,
                task=self.specification['problem']['taskType'])
            model.save()

            from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
            FOUND_MODEL_CALLBACKS[self.callback_found](
                model, **(self.callback_arguments or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'tpot'
            }
        }
Example #12
    def run(self):
        import h2o
        from h2o.automl import H2OAutoML

        # ensure backend solver is running
        h2o.init()

        train = h2o.import_file(
            Dataset(self.specification['input']).get_resource_uri())
        test = None

        X = self.specification['problem']['predictors']
        y = self.specification['problem']['targets'][0]

        if self.specification['problem']['taskType'] == 'CLASSIFICATION':
            if train.types[y] == u'real':
                train[y] = train[y].ascharacter()
            # For classification, response should be a factor
            train[y] = train[y].asfactor()

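        # map the evaluation configuration onto h2o: holdout split ratio, number of CV folds, and class balancing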
        if 'configuration' in self.specification:
            config = self.specification['configuration']

            if config['method'] == "HOLDOUT":
                train, test = train.split_frame(
                    ratios=[max(0, config.get('trainTestRatio') or 0) or .6],
                    seed=config.get('randomSeed'))

            if config['method'] == "K_FOLD":
                self.system_params['nfolds'] = config.get('folds') or 10

            self.system_params['balance_classes'] = config.get(
                'stratified', False)

        if 'timeBoundSearch' in self.specification:
            self.system_params['max_runtime_secs'] = self.specification[
                'timeBoundSearch']
        if 'timeBoundRun' in self.specification:
            self.system_params[
                'max_runtime_secs_per_model'] = self.specification[
                    'timeBoundRun']
        if 'rankSolutionsLimit' in self.specification:
            self.system_params['max_models'] = self.specification[
                'rankSolutionsLimit']

        # sort_metrics = {
        #     'ACCURACY': "rmse",
        #     'ROC_AUC': "auc",
        #     'MEAN_SQUARED_ERROR': "mse",
        #     'ROOT_MEAN_SQUARED_ERROR': "rmse",
        #     'MEAN_ABSOLUTE_ERROR': "mae",
        #     'LOSS': "logloss",
        # }
        # if 'performanceMetric' in self.specification:
        #     metric_spec = self.specification['performanceMetric']
        #     if metric_spec['metric'] in sort_metrics:
        #         self.system_params['sort_metric'] = sort_metrics[metric_spec['metric']]
        #         self.system_params['stopping_metric'] = sort_metrics[metric_spec['metric']]

        # CV models are useful for model comparisons
        # self.system_params['keep_cross_validation_models'] = True

        if 'CLASSIFICATION' in self.specification['problem']['taskType']:
            train[y] = train[y].asfactor()

        train_params = {"x": X, "y": y, "training_frame": train}
        if test is not None:
            train_params['leaderboard_frame'] = test

        automl = H2OAutoML(**self.system_params)
        automl.train(**train_params)

        if not automl.leader:
            return {
                KEY_SUCCESS: False,
                KEY_MESSAGE: 'no models found',
                KEY_DATA: {
                    'search_id': self.search_id,
                    'system': 'h2o'
                }
            }

        leaderboard = automl.leaderboard

        # take up to 10 models
        for model_id in leaderboard.head(10).as_data_frame()['model_id']:
            model = ModelH2O(h2o.get_model(model_id),
                             search_id=self.search_id,
                             predictors=X,
                             targets=[y],
                             task=self.specification['problem']['taskType'])

            from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
            FOUND_MODEL_CALLBACKS[self.callback_found](
                model, **(self.callback_arguments or {}))

        return {
            KEY_SUCCESS: True,
            KEY_MESSAGE: 'search complete',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'h2o'
            }
        }
Example #13
    def produce(self, specification):
        import h2o
        configuration = specification.get('configuration', {})
        predict_type = configuration.get('predict_type', 'RAW')

        train = h2o.import_file(Dataset(specification['train']).get_resource_uri())
        y = self.targets[0]
        if 'CLASSIFICATION' in self.task:
            if train.types[y] == u'real':
                train[y] = train[y].ascharacter()
            train[self.targets[0]] = train[self.targets[0]].asfactor()

        self.fit(train, specification['train'])

        test_dataset = Dataset(specification['input'])
        data = h2o.import_file(test_dataset.get_resource_uri())
        if 'CLASSIFICATION' in self.task:
            if data.types[y] == u'real':
                data[y] = data[y].ascharacter()
            data[y] = data[y].asfactor()

        # retry once
        try:
            predictions = self.model.predict(data).as_data_frame()
        except Exception:
            predictions = self.model.predict(data).as_data_frame()

        if predict_type == 'RAW':
            if 'CLASSIFICATION' in self.task:
                if data.types[y] == u'real':
                    data[y] = data[y].ascharacter()
                predictions = predictions[['predict']]
            predictions.columns = [y]
        else:
            # TODO: standardize probability column names
            predictions.drop(columns=['predict'], inplace=True)
        predictions['d3mIndex'] = test_dataset.get_dataframe()['d3mIndex']

        output_directory_path = specification['output']['resource_uri'].replace('file://', '')
        output_path = '/' + os.path.join(
            *output_directory_path.split('/'),
            str(uuid.uuid4()) + '.csv')

        if not os.path.exists(output_directory_path):
            os.makedirs(output_directory_path)

        cwd = os.getcwd()
        try:
            os.chdir('/')
            predictions.to_csv(output_path, index=False)
        finally:
            os.chdir(cwd)

        return {
            'produce': {
                'input': specification['input'],
                'configuration': configuration,
                'data_pointer': output_path
            },
            'search_id': self.search_id,
            'model_id': self.model_id,
            "system": self.system
        }
Example #14
    def score(self, specification):
        import h2o
        configuration = specification['configuration']
        resource_uri = Dataset(specification['input']).get_resource_uri()
        data = h2o.import_file(resource_uri)
        y = self.targets[0]
        if 'CLASSIFICATION' in self.task:
            if data.types[y] == u'real':
                data[y] = data[y].ascharacter()
            data[y] = data[y].asfactor()

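        # pair each prediction with its observed value and drop rows where either is missing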
        results = pandas.DataFrame({
            'predict': self.model.predict(data).as_data_frame()['predict'],
            'actual': data[y].as_data_frame()[y]
        }).dropna()

        if 'CLASSIFICATION' in self.task:
            if data.types[y] == u'real':
                data[y] = data[y].ascharacter()
            results['actual'] = results['actual'].astype(int)

        scores = []
        for metric_schema in specification['performanceMetrics']:
            try:
                scores.append({
                    'value': get_metric(metric_schema)(
                        results['actual'],
                        results['predict']),
                    'metric': metric_schema,
                    'target': y
                })
            except ValueError as err:
                print(f'Could not evaluate metric: {str(metric_schema)}')
                print(err)

        # if configuration.get('stratified'):
        #     # how does h2o know which column to stratify for? weirdness here
        #     folds = data.stratified_kfold_column(n_folds=configuration['folds'])
        # else:
        #     folds = data.kfold_column(n_folds=configuration['folds'])
        #
        # split_scores = defaultdict(list)
        # split_weights = defaultdict(list)
        # for split_id in range(configuration['folds']):
        #     train, test = data[folds != split_id], data[folds == split_id]
        #     self.fit(train)
        #     results = pandas.DataFrame({
        #         'predict': self.model.predict(test).as_data_frame()['predict'],
        #         'actual': test[self.targets[0]].as_data_frame()[self.targets[0]]
        #     }).dropna()
        #
        #     if 'CLASSIFICATION' in self.task:
        #         results['actual'] = results['actual'].astype(int)
        #
        #     for metric_schema in specification['performanceMetrics']:
        #         try:
        #             split_scores[json.dumps(metric_schema)].append(get_metric(metric_schema)(
        #                 results['actual'],
        #                 results['predict']))
        #             split_weights[json.dumps(metric_schema)].append(results.size)
        #         except ValueError as err:
        #             print(f'Could not evaluate metric: {str(metric_schema)}')
        #             print(err)
        # for metric in split_scores:
        #     scores.append({
        #         'value': np.average(split_scores[metric], weights=split_weights[metric]),
        #         'metric': json.loads(metric),
        #         'target': self.targets[0]
        #     })

        return {
            'search_id': self.search_id,
            'model_id': self.model_id,
            'scores': scores,
            "system": self.system
        }