Exemple #1
0
    def preprocess_target_ds(model_path, ds):
        """Extract the target column of ``ds`` as a flat array.

        Reads ``options.json`` and ``target_categoricals.json`` from
        ``model_path`` to find the target feature and how to encode it.

        Args:
            model_path: Folder holding the model's JSON metadata files.
            ds: Dataset wrapper exposing ``columns``, ``df`` and
                ``convertToCategorical``.

        Returns:
            Tuple ``(y_true, target_categoricals)``. ``y_true`` is ``None``
            when no target feature is configured or the dataset does not
            contain it; otherwise it is the raveled (C-order) target column.
        """
        options_file = os.path.join(model_path, "options.json")
        categoricals_file = os.path.join(model_path,
                                         "target_categoricals.json")
        options = fsclient.read_json_file(options_file)
        target_categoricals = fsclient.read_json_file(categoricals_file)

        target = options.get('targetFeature')
        if not target or target not in ds.columns:
            # Nothing to extract: target is unknown or absent from the data.
            return None, target_categoricals

        if options.get('timeSeriesFeatures'):
            # Time-series models take the target as float64 values.
            as_float = ds.df[target].astype(np.float64, copy=False)
            return np.ravel(as_float, order='C'), target_categoricals

        if target_categoricals and target in target_categoricals:
            # Re-encode the target with the categories stored for the model.
            stored = target_categoricals.get(target)
            ds.convertToCategorical(target,
                                    is_target=True,
                                    categories=stored.get('categories'))

        return np.ravel(ds.df[target], order='C'), target_categoricals
Exemple #2
0
    def _predict_locally(self, predict_data, model_id, threshold):
        """Run a prediction with a model file stored on the local machine.

        Args:
            predict_data: Frame-like input; it is indexed by a list of column
                names and passed to the fitted model.
            model_id: Identifier of the model to use.
            threshold: When not ``None`` class probabilities are requested
                via ``predict_proba``; otherwise ``predict`` is used.

        Returns:
            Tuple ``(results, results_proba, proba_classes,
            target_categories)``. Either ``results`` or the
            ``results_proba``/``proba_classes`` pair is ``None`` depending on
            ``threshold``.

        Raises:
            Exception: If the model is not available locally.
        """
        is_loaded, model_path = self.verify_local_model(model_id)
        if not is_loaded:
            raise Exception("Model should be deployed before predict.")

        fitted_model = fsclient.load_object_from_file(model_path)
        try:
            options_file = os.path.join(
                self.ctx.config.get_model_path(model_id), "options.json")
            model_features = fsclient.read_json_file(options_file).get(
                "originalFeatureColumns")

            # Restrict the input to the columns the model was trained on.
            predict_data = predict_data[model_features]
            # NOTE(review): this writes the input to "test_options.csv" in
            # the current working directory on every call and looks like a
            # debugging leftover -- confirm before removing. A failure here
            # is also reported by the handler below.
            predict_data.to_csv("test_options.csv", index=False, compression=None, encoding='utf-8')
        except Exception as e:
            # Best effort: fall back to whatever columns predict_data has.
            self.ctx.log('Cannot get columns from model.Use original columns from predicted data: %s'%e)

        if threshold is None:
            results = fitted_model.predict(predict_data)
            results_proba = None
            proba_classes = None
        else:
            results = None
            results_proba = fitted_model.predict_proba(predict_data)
            proba_classes = list(fitted_model.classes_)

        categoricals_file = os.path.join(
            self.ctx.config.get_model_path(model_id),
            "target_categoricals.json")
        target_categoricals = fsclient.read_json_file(categoricals_file)
        target_info = target_categoricals.get(self.ctx.config.get('target'), {})
        target_categories = target_info.get("categories")

        return results, results_proba, proba_classes, target_categories
Exemple #3
0
    def test_process_prediction(self):
        """process_prediction with plain class results restores the fixture.

        The target column is dropped from the iris fixture, predictions equal
        to the expected labels are applied, and the resulting dataset is
        compared (dtypes and values) with a freshly loaded fixture.
        """
        fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
        data_file = os.path.join(fixture_dir, "iris_test.csv")
        options = fsclient.read_json_file(
            os.path.join(fixture_dir, "options.json"))
        categories = ["setosa", "versicolor", "virginica"]

        ds = DataFrame.create_dataframe(data_file)
        target_feature = options['targetFeature']
        ds.drop([target_feature])

        # Six predicted labels: the three classes, repeated twice.
        predicted_labels = ["setosa", "versicolor", "virginica"] * 2

        ModelHelper.process_prediction(
            ds,
            predicted_labels,
            None,  # results_proba
            None,  # proba_classes
            None,  # threshold
            options.get('minority_target_class'),
            target_feature,
            categories)

        expected = DataFrame.create_dataframe(data_file)
        self.assertEqual(ds.dtypes, expected.dtypes)
        self.assertEqual(ds.df.values.tolist(), expected.df.values.tolist())
Exemple #4
0
    def test_calculate_scores(self):
        """calculate_scores returns one score per configured score name.

        The true and predicted label sets differ in exactly one of six rows,
        and the test expects the reported accuracy to stay above 0.8.
        """
        fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
        options = fsclient.read_json_file(
            os.path.join(fixture_dir, "options.json"))

        true_records = [["setosa"], ["versicolor"], ["virginica"],
                        ["setosa"], ["versicolor"], ["virginica"]]
        # Same as above except the third row, which is mispredicted.
        predicted_records = [["setosa"], ["versicolor"], ["versicolor"],
                             ["setosa"], ["versicolor"], ["virginica"]]

        y_test, _ = ModelHelper.preprocess_target(fixture_dir,
                                                  records=true_records,
                                                  features=["species"])
        y_pred, _ = ModelHelper.preprocess_target(fixture_dir,
                                                  records=predicted_records,
                                                  features=["species"])

        scores = ModelHelper.calculate_scores(options,
                                              y_test=y_test,
                                              y_pred=y_pred)

        self.assertEqual(len(scores), len(options['scoreNames']))
        self.assertTrue(scores['accuracy'] > 0.8)
Exemple #5
0
    def test_process_prediction_proba(self):
        """process_prediction with probabilities adds per-class columns.

        With a 0.5 threshold and a probability matrix instead of class
        results, the dataset is expected to gain ``proba_<class>`` columns
        and a target column matching the fixture's own target values.
        """
        fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
        data_file = os.path.join(fixture_dir, "iris_test.csv")
        options = fsclient.read_json_file(
            os.path.join(fixture_dir, "options.json"))
        categories = ["setosa", "versicolor", "virginica"]

        ds = DataFrame.create_dataframe(data_file)
        target_feature = options['targetFeature']
        ds.drop([target_feature])

        # One row of class probabilities per fixture record.
        probabilities = np.array([
            [0.8, 0.1, 0.1],
            [0.4, 0.6, 0.1],
            [0.1, 0.2, 0.7],
            [0.7, 0.2, 0.1],
            [0.3, 0.7, 0.1],
            [0.1, 0.3, 0.6],
        ])
        class_codes = [0, 1, 2]

        ModelHelper.process_prediction(
            ds,
            None,  # results: only probabilities are supplied
            probabilities,
            class_codes,
            0.5,  # threshold
            None,  # minority target class
            target_feature,
            categories)

        expected = DataFrame.create_dataframe(data_file)
        expected_columns = expected.columns + [
            "proba_setosa", "proba_versicolor", "proba_virginica"
        ]
        self.assertEqual(ds.columns, expected_columns)
        self.assertEqual(ds.df[target_feature].values.tolist(),
                         expected.df[target_feature].values.tolist())
Exemple #6
0
    def _get_remote_model_features(self, remote_run):
        """Fetch feature names and class labels from a remote run's artifacts.

        Both lookups are best effort: any failure is logged through
        ``self.ctx.log`` and the corresponding result stays ``None``.

        Args:
            remote_run: Run object exposing ``download_file(name, path)``.

        Returns:
            Tuple ``(model_features, target_categories)``.
            ``model_features`` is the column list of the ``input_sample``
            found in the run's scoring file, or ``None``.
            ``target_categories`` is the ``class_labels`` list of the run's
            confusion matrix (classification models only), or ``None``.
        """
        from a2ml.api.utils import fsclient, local_fsclient
        # pandas is not referenced directly: it must be in scope as ``pd``
        # for the expression evaluated below.
        import pandas as pd  # noqa: F401

        model_features = None
        target_categories = None

        temp_dir = local_fsclient.LocalFSClient().get_temp_folder()
        try:
            try:
                file_name = 'scoring_file_v_1_0_0.py'
                local_path = os.path.join(temp_dir, file_name)
                remote_run.download_file('outputs/%s'%file_name, local_path)
                text = fsclient.read_text_file(local_path)
                to_find = "input_sample ="
                start = text.find(to_find)
                # str.find returns -1 on a miss; 0 is a valid match position.
                if start != -1:
                    end = text.find("\n", start)
                    if end == -1:
                        # Assignment is on the last line with no newline.
                        end = len(text)
                    code_to_run = text[start+len(to_find):end]

                    # SECURITY(review): this evaluates text downloaded from
                    # the remote run. It is only safe while the scoring file
                    # is trusted; consider parsing the expression instead.
                    input_sample = eval(code_to_run)
                    model_features = input_sample.columns.tolist()
            except Exception as e:
                self.ctx.log('Cannot get columns from remote model.Use original columns from predicted data: %s'%e)

            if self.ctx.config.get("model_type") == "classification":
                try:
                    file_name = 'confusion_matrix'
                    local_path = os.path.join(temp_dir, file_name)
                    remote_run.download_file('%s'%file_name, local_path)
                    cm_data = fsclient.read_json_file(local_path)
                    target_categories = cm_data.get('data', {}).get('class_labels')
                except Exception as e:
                    self.ctx.log('Cannot get categorical target class labels from remote model.Use class codes: %s'%e)
        finally:
            # Remove the temp folder even if logging or config access raised.
            fsclient.remove_folder(temp_dir)

        return model_features, target_categories
Exemple #7
0
    def predict(self, filename, model_id, threshold=None, locally=False,
                data=None, columns=None, output=None, json_result=False,
                count_in_result=False, prediction_date=None,
                prediction_id=None):
        """Predict for a file or in-memory data and save the result.

        Args:
            filename: Path of the input data; may be empty when ``data`` is
                given.
            model_id: Identifier of the model to use.
            threshold: Passed to the prediction backend and to
                ``ModelHelper.process_prediction``.
            locally: Use ``_predict_locally`` when true, otherwise
                ``_predict_remotely``.
            data: In-memory records, used together with ``columns``.
            columns: Column names for ``data``.
            output: Forwarded to ``ModelHelper.save_prediction``.
            json_result: Forwarded to ``ModelHelper.save_prediction``.
            count_in_result: Forwarded to ``ModelHelper.save_prediction``.
            prediction_date: Forwarded to ``ModelHelper.save_prediction``.
            prediction_id: Forwarded to ``ModelHelper.save_prediction``.

        Returns:
            ``{'predicted': <value returned by save_prediction>}``.
        """
        ds = DataFrame.create_dataframe(filename, data, columns)
        model_path = self.ctx.config.get_model_path(model_id)
        options = fsclient.read_json_file(
            os.path.join(model_path, "options.json"))

        if locally:
            outcome = self._predict_locally(ds.df, model_id, threshold)
        else:
            outcome = self._predict_remotely(ds.df, model_id, threshold)
        results, results_proba, proba_classes, target_categories = outcome

        # For a two-class target, turn the string labels "True"/"False" into
        # real booleans (in place).
        if target_categories and len(target_categories) == 2:
            for position, label in enumerate(target_categories):
                if label == "False":
                    target_categories[position] = False
                elif label == "True":
                    target_categories[position] = True

        # Model options win; the project config supplies the fallbacks.
        minority_target_class = options.get(
            'minority_target_class',
            self.ctx.config.get('minority_target_class'))
        target_feature = options.get(
            'targetFeature', self.ctx.config.get('target', None))

        ModelHelper.process_prediction(
            ds, results, results_proba, proba_classes, threshold,
            minority_target_class, target_feature, target_categories)

        support_review_model = options.get('support_review_model', True)
        predicted = ModelHelper.save_prediction(
            ds, prediction_id, support_review_model, json_result,
            count_in_result, prediction_date, model_path, model_id, output)

        if filename:
            self.ctx.log('Predictions stored in %s' % predicted)

        return {'predicted': predicted}
Exemple #8
0
    def _predict_locally(self, filename_arg, model_id, threshold, data,
                         columns, output):
        """Predict with a locally downloaded model via ``_docker_run_predict``.

        Args:
            filename_arg: Path of the input file. When empty, ``data`` and
                ``columns`` are written to ``.augerml/predict_data.csv``
                under the project path and that file is used instead.
            model_id: Identifier of the model to use.
            threshold: Forwarded to ``_docker_run_predict``.
            data: In-memory records used when ``filename_arg`` is empty.
            columns: Column names for ``data``.
            output: Destination for the result; forwarded to
                ``ModelHelper.save_prediction_result`` for in-memory input,
                otherwise the predicted file is moved there.

        Returns:
            For file input, the path of the predicted file (``output`` when
            given). For in-memory input, whatever
            ``ModelHelper.save_prediction_result`` returns.

        Raises:
            AugerException: If the model is not loaded locally.
        """
        deployer = ModelDeploy(self.ctx, None)
        is_model_loaded, model_path, model_name = \
            deployer.verify_local_model(model_id)
        if not is_model_loaded:
            raise AugerException(
                "Model isn't loaded locally. "
                "Please use a2ml deploy command to download model.")

        model_path, model_existed = self._extract_model(model_name)
        options_file = os.path.join(model_path, "model", "options.json")
        model_options = fsclient.read_json_file(options_file)

        filename = filename_arg
        if not filename:
            # No input file: persist the in-memory data for the predictor.
            ds = DataFrame.create_dataframe(filename, data, columns)
            filename = os.path.join(self.ctx.config.get_path(), '.augerml',
                                    'predict_data.csv')
            ds.saveToCsvFile(filename, compression=None)

        try:
            predicted = self._docker_run_predict(filename, threshold,
                                                 model_path)
        finally:
            # Remove the unzipped model unless it was already unzipped
            # before this call.
            if not model_existed:
                shutil.rmtree(model_path, ignore_errors=True)
                model_path = None

        if filename_arg:
            # File input: optionally move the predicted file to `output`.
            if output:
                fsclient.move_file(predicted, output)
                predicted = output
            return predicted

        ds_result = DataFrame.create_dataframe(predicted)
        ds_result.options['data_path'] = None
        ds_result.loaded_columns = columns

        # model_path is None here when the extracted model was just removed.
        if model_path:
            support_review_model = model_options.get("support_review_model")
        else:
            support_review_model = False

        return ModelHelper.save_prediction_result(
            ds_result,
            prediction_id=None,
            support_review_model=support_review_model,
            json_result=False,
            count_in_result=False,
            prediction_date=None,
            model_path=model_path,
            model_id=model_id,
            output=output)
Exemple #9
0
    def _get_feature_importances(self):
        """Return cached feature importances as ``{feature: score}``.

        Looks for ``feature_importance_data`` in ``metrics.json`` under the
        metric cache path, then in ``metric_names_feature_importance.json``
        when the first file has none.

        Returns:
            Dict mapping each entry of ``features`` to the matching entry of
            ``scores``; an empty dict (with a logged warning) when no
            importance data is found.
        """
        cache_path = ModelHelper.get_metric_path(self.options)

        importance_data = None
        if cache_path:
            importance_data = fsclient.read_json_file(
                os.path.join(cache_path,
                             "metrics.json")).get('feature_importance_data')
            if not importance_data:
                # Fall back to the dedicated feature-importance file.
                importance_data = fsclient.read_json_file(
                    os.path.join(cache_path,
                                 "metric_names_feature_importance.json")).get(
                                     'feature_importance_data')

        if importance_data:
            return dict(
                zip(importance_data['features'], importance_data['scores']))

        # logging.warn was deprecated and removed in Python 3.13; use
        # logging.warning with lazy %-style arguments.
        logging.warning("No feature importance in cache: for model %s",
                        cache_path)
        return {}
Exemple #10
0
    def __init__(self, params):
        """Load model options for the model described by ``params``.

        Args:
            params: Dict with either ``model_path`` or an ``augerInfo`` dict
                holding ``pipeline_id`` (and optionally ``projectPath``)
                from which the model path is resolved.
        """
        self.model_path = params.get('model_path')
        if not self.model_path:
            # No explicit path: resolve it from the pipeline information.
            auger_info = params['augerInfo']
            self.model_path = ModelHelper.get_model_path(
                auger_info['pipeline_id'], auger_info.get('projectPath'))

        options_file = os.path.join(self.model_path, "options.json")
        self.options = fsclient.read_json_file(options_file)

        # A caller-supplied augerInfo overrides the one stored in options.
        supplied_info = params.get('augerInfo')
        if supplied_info:
            self.options['augerInfo'] = supplied_info

        self.target_feature = self.options.get('targetFeature')
Exemple #11
0
    def test_save_prediction(self):
        """Exercise ModelHelper.save_prediction in four result modes.

        Each scenario reloads the iris fixture and removes any previously
        written result files before saving:

        1. Review model on, file-backed data: returns the predicted CSV path
           and writes both the CSV and the results file.
        2. ``json_result=True``: returns a JSON string with ``columns`` and
           ``data``.
        3. Review model off, no data path: returns a list of dict records.
        4. As 3 with ``loaded_columns`` set: returns a dict with ``columns``
           and list-based ``data`` rows.
        """
        model_path = 'tests/fixtures/test_predict_by_model/iris'
        options = fsclient.read_json_file(
            os.path.join(model_path, "options.json"))

        prediction_id = "123"
        prediction_date = "today"
        predictions_dir = os.path.join(model_path, "predictions")
        results_file_path = os.path.join(
            predictions_dir,
            prediction_date + '_' + prediction_id + "_results.feather.zstd")
        predicted_file_path = os.path.join(
            predictions_dir, "iris_test_" + prediction_id + "_" +
            options.get('uid') + "_predicted.csv")

        def load_clean_dataset():
            # Reload the fixture and make sure no stale output files remain.
            dataset = DataFrame.create_dataframe(
                os.path.join(model_path, "iris_test.csv"))
            for stale_path in (results_file_path, predicted_file_path):
                fsclient.remove_file(stale_path)
                self.assertFalse(fsclient.is_file_exists(stale_path))
            return dataset

        def save(dataset, support_review_model, json_result):
            # Shared save_prediction call; only two flags vary per scenario.
            return ModelHelper.save_prediction(
                dataset,
                prediction_id,
                support_review_model=support_review_model,
                json_result=json_result,
                count_in_result=False,
                prediction_date=prediction_date,
                model_path=model_path,
                model_id=options.get('uid'))

        # Scenario 1: file result plus review-model results file.
        ds = load_clean_dataset()
        res = save(ds, support_review_model=True, json_result=False)
        self.assertEqual(res, predicted_file_path)
        self.assertTrue(fsclient.is_file_exists(predicted_file_path))
        self.assertTrue(fsclient.is_file_exists(results_file_path))

        # Scenario 2: JSON string result.
        ds = load_clean_dataset()
        res = save(ds, support_review_model=True, json_result=True)
        res = json.loads(res)
        self.assertEqual(res['columns'], ds.columns)
        self.assertEqual(len(res['data']), 6)

        # Scenario 3: in-memory data (no data path) -> list of records.
        ds = load_clean_dataset()
        ds.options['data_path'] = None
        res = save(ds, support_review_model=False, json_result=False)
        self.assertEqual(type(res[0]), dict)
        self.assertEqual(res[0][options['targetFeature']], 'setosa')

        # Scenario 4: in-memory data with loaded columns -> columns + rows.
        ds = load_clean_dataset()
        ds.options['data_path'] = None
        ds.loaded_columns = ds.columns
        res = save(ds, support_review_model=False, json_result=False)
        self.assertEqual(res['columns'], ds.columns)
        self.assertEqual(len(res['data']), 6)
        self.assertEqual(type(res['data'][0]), list)