def preprocess_target_ds(model_path, ds):
    """Extract the target column of ``ds`` as a flattened array.

    ``options.json`` and ``target_categoricals.json`` are both read from
    ``model_path``.

    Returns:
        A ``(y_true, target_categoricals)`` tuple.  ``y_true`` is ``None``
        when the options name no target feature or ``ds`` has no such
        column; otherwise it is the raveled (C order) target column.
    """
    options = fsclient.read_json_file(
        os.path.join(model_path, "options.json"))
    target_categoricals = fsclient.read_json_file(
        os.path.join(model_path, "target_categoricals.json"))

    target = options.get('targetFeature')
    if not target or target not in ds.columns:
        return None, target_categoricals

    if options.get('timeSeriesFeatures'):
        # Time-series targets are cast to float64 before flattening.
        numeric_target = ds.df[target].astype(np.float64, copy=False)
        return np.ravel(numeric_target, order='C'), target_categoricals

    if target_categoricals and target in target_categoricals:
        # Re-apply the stored category list to the target column.
        stored_categories = target_categoricals.get(target).get('categories')
        ds.convertToCategorical(
            target, is_target=True, categories=stored_categories)

    return np.ravel(ds.df[target], order='C'), target_categoricals
def _predict_locally(self, predict_data, model_id, threshold):
    """Run a prediction with the locally available model.

    Args:
        predict_data: input rows; indexed with the model's
            ``originalFeatureColumns`` list when ``options.json`` can be
            read (presumably a pandas DataFrame -- confirm with callers).
        model_id: identifier used to locate the model and its metadata.
        threshold: when not ``None``, class probabilities are requested
            via ``predict_proba`` instead of labels via ``predict``.

    Returns:
        ``(results, results_proba, proba_classes, target_categories)``.
        ``results`` is ``None`` in probability mode; ``results_proba`` and
        ``proba_classes`` are ``None`` in label mode.

    Raises:
        Exception: if ``verify_local_model`` reports the model as not
            loaded.
    """
    is_loaded, model_path = self.verify_local_model(model_id)
    if not is_loaded:
        raise Exception("Model should be deployed before predict.")

    fitted_model = fsclient.load_object_from_file(model_path)
    # Resolved once; both metadata files are read from this directory.
    model_dir = self.ctx.config.get_model_path(model_id)

    try:
        options = fsclient.read_json_file(
            os.path.join(model_dir, "options.json"))
        model_features = options.get("originalFeatureColumns")
        # The previous implementation also dumped this frame to
        # "test_options.csv" in the working directory; that debug leftover
        # was removed so predicting no longer writes stray files.
        predict_data = predict_data[model_features]
    except Exception as e:
        # Best effort: fall back to the columns supplied by the caller.
        self.ctx.log('Cannot get columns from model.Use original columns from predicted data: %s'%e)

    results_proba = None
    proba_classes = None
    results = None
    if threshold is not None:
        results_proba = fitted_model.predict_proba(predict_data)
        proba_classes = list(fitted_model.classes_)
    else:
        results = fitted_model.predict(predict_data)

    target_categoricals = fsclient.read_json_file(
        os.path.join(model_dir, "target_categoricals.json"))
    target_categories = target_categoricals.get(
        self.ctx.config.get('target'), {}).get("categories")

    return results, results_proba, proba_classes, target_categories
def test_process_prediction(self):
    """Label predictions written back into the frame reproduce the fixture.

    The target column is dropped, ``process_prediction`` is fed the label
    list, and the result is compared with a fresh load of the same CSV.
    """
    fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
    dataset_path = os.path.join(fixture_dir, "iris_test.csv")
    options = fsclient.read_json_file(
        os.path.join(fixture_dir, "options.json"))
    species = ["setosa", "versicolor", "virginica"]

    ds = DataFrame.create_dataframe(dataset_path)
    target = options['targetFeature']
    ds.drop([target])

    # Two passes over the three classes, six rows in total.
    predicted_labels = species * 2
    ModelHelper.process_prediction(
        ds, predicted_labels, None, None, None,
        options.get('minority_target_class'), target, list(species))

    expected = DataFrame.create_dataframe(dataset_path)
    self.assertEqual(ds.dtypes, expected.dtypes)
    self.assertEqual(ds.df.values.tolist(), expected.df.values.tolist())
def test_calculate_scores(self):
    """Scores are produced for every configured score name.

    The predicted labels differ from the actual ones in a single row out
    of six, and the test requires accuracy above 0.8.
    """
    fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
    options = fsclient.read_json_file(
        os.path.join(fixture_dir, "options.json"))

    actual_labels = ["setosa", "versicolor", "virginica",
                     "setosa", "versicolor", "virginica"]
    predicted_labels = ["setosa", "versicolor", "versicolor",
                        "setosa", "versicolor", "virginica"]

    y_test, _ = ModelHelper.preprocess_target(
        fixture_dir,
        records=[[label] for label in actual_labels],
        features=["species"])
    y_pred, _ = ModelHelper.preprocess_target(
        fixture_dir,
        records=[[label] for label in predicted_labels],
        features=["species"])

    scores = ModelHelper.calculate_scores(
        options, y_test=y_test, y_pred=y_pred)

    self.assertEqual(len(scores), len(options['scoreNames']))
    self.assertTrue(scores['accuracy'] > 0.8)
def test_process_prediction_proba(self):
    """Probability predictions add ``proba_*`` columns and restore the target.

    Only probabilities are supplied (no label results); the test expects
    one extra column per class and the original target values.
    """
    fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
    dataset_path = os.path.join(fixture_dir, "iris_test.csv")
    options = fsclient.read_json_file(
        os.path.join(fixture_dir, "options.json"))
    target = options['targetFeature']

    ds = DataFrame.create_dataframe(dataset_path)
    ds.drop([target])

    # No label results in this scenario (original note: [0, 1, 2, 0, 1, 2]).
    class_probabilities = np.array([
        [0.8, 0.1, 0.1],
        [0.4, 0.6, 0.1],
        [0.1, 0.2, 0.7],
        [0.7, 0.2, 0.1],
        [0.3, 0.7, 0.1],
        [0.1, 0.3, 0.6],
    ])
    class_codes = [0, 1, 2]

    ModelHelper.process_prediction(
        ds, None, class_probabilities, class_codes, 0.5, None,
        target, ["setosa", "versicolor", "virginica"])

    expected = DataFrame.create_dataframe(dataset_path)
    proba_columns = ["proba_setosa", "proba_versicolor", "proba_virginica"]
    self.assertEqual(ds.columns, expected.columns + proba_columns)
    self.assertEqual(ds.df[target].values.tolist(),
                     expected.df[target].values.tolist())
def _get_remote_model_features(self, remote_run):
    """Recover feature columns and class labels from a remote run's artifacts.

    Feature names come from the ``input_sample = ...`` line of the run's
    ``scoring_file_v_1_0_0.py``; class labels (classification models only)
    come from the ``confusion_matrix`` artifact.  Both lookups are best
    effort: failures are logged and leave the corresponding value ``None``.

    Args:
        remote_run: object exposing ``download_file(remote_name, local_path)``.

    Returns:
        ``(model_features, target_categories)``; either may be ``None``.
    """
    from a2ml.api.utils import fsclient, local_fsclient
    # Not referenced directly below; presumably the evaluated
    # ``input_sample`` expression refers to ``pd`` -- keep it in scope.
    import pandas as pd  # noqa: F401

    model_features = None
    target_categories = None

    temp_dir = local_fsclient.LocalFSClient().get_temp_folder()
    try:
        try:
            file_name = 'scoring_file_v_1_0_0.py'
            local_path = os.path.join(temp_dir, file_name)
            remote_run.download_file('outputs/%s'%file_name, local_path)
            text = fsclient.read_text_file(local_path)

            to_find = "input_sample ="
            start = text.find(to_find)
            # str.find returns -1 when absent; offset 0 is a valid match.
            if start != -1:
                end = text.find("\n", start)
                if end == -1:
                    # Marker sits on the last line without a trailing newline.
                    end = len(text)

                code_to_run = text[start+len(to_find):end]
                # SECURITY NOTE(review): this evaluates text downloaded from
                # the remote run.  It is only safe while that artifact is
                # trusted; consider a restricted parser instead of eval.
                input_sample = eval(code_to_run)
                model_features = input_sample.columns.tolist()
        except Exception as e:
            self.ctx.log('Cannot get columns from remote model.Use original columns from predicted data: %s'%e)

        if self.ctx.config.get("model_type") == "classification":
            try:
                file_name = 'confusion_matrix'
                local_path = os.path.join(temp_dir, file_name)
                remote_run.download_file('%s'%file_name, local_path)
                cm_data = fsclient.read_json_file(local_path)
                target_categories = cm_data.get('data', {}).get('class_labels')
            except Exception as e:
                self.ctx.log('Cannot get categorical target class labels from remote model.Use class codes: %s'%e)
    finally:
        # Always discard the downloaded artifacts, even on unexpected errors.
        fsclient.remove_folder(temp_dir)

    return model_features, target_categories
def predict(self, filename, model_id, threshold=None, locally=False, data=None, columns=None, output = None,
        json_result=False, count_in_result=False, prediction_date=None, prediction_id=None):
    """Predict for a file or in-memory records and save the outcome.

    The input is loaded into a data frame, run through the local or remote
    predictor (selected by ``locally``), post-processed with
    ``ModelHelper.process_prediction`` and stored with
    ``ModelHelper.save_prediction``.

    Returns:
        ``{'predicted': <value returned by ModelHelper.save_prediction>}``.
    """
    ds = DataFrame.create_dataframe(filename, data, columns)
    model_path = self.ctx.config.get_model_path(model_id)
    options = fsclient.read_json_file(os.path.join(model_path, "options.json"))

    if locally:
        outcome = self._predict_locally(ds.df, model_id, threshold)
    else:
        outcome = self._predict_remotely(ds.df, model_id, threshold)
    results, results_proba, proba_classes, target_categories = outcome

    if target_categories and len(target_categories) == 2:
        # Two-class targets stored as the strings "True"/"False" are turned
        # back into real booleans, in place.
        for position, label in enumerate(target_categories):
            if label == "False":
                target_categories[position] = False
            elif label == "True":
                target_categories[position] = True

    # Model options take precedence; the project config supplies defaults.
    minority_class = options.get(
        'minority_target_class', self.ctx.config.get('minority_target_class'))
    target_feature = options.get(
        'targetFeature', self.ctx.config.get('target', None))

    ModelHelper.process_prediction(
        ds, results, results_proba, proba_classes, threshold,
        minority_class, target_feature, target_categories)

    predicted = ModelHelper.save_prediction(
        ds, prediction_id, options.get('support_review_model', True),
        json_result, count_in_result, prediction_date,
        model_path, model_id, output)

    if filename:
        self.ctx.log('Predictions stored in %s' % predicted)

    return {'predicted': predicted}
def _predict_locally(self, filename_arg, model_id, threshold, data, columns, output):
    """Predict with a locally downloaded model via ``_docker_run_predict``.

    When no input file is given, ``data``/``columns`` are first written to
    ``.augerml/predict_data.csv`` under the project path.  A model folder
    that ``_extract_model`` reports as not previously existing is removed
    again once the prediction has run.

    Returns:
        For in-memory input, whatever ``ModelHelper.save_prediction_result``
        returns; for file input, the path of the predicted file (``output``
        if one was requested).

    Raises:
        AugerException: if the model is not available locally.
    """
    deployer = ModelDeploy(self.ctx, None)
    is_model_loaded, model_path, model_name = \
        deployer.verify_local_model(model_id)
    if not is_model_loaded:
        raise AugerException(
            'Model isn\'t loaded locally. '
            'Please use a2ml deploy command to download model.')

    model_path, model_existed = self._extract_model(model_name)
    model_options = fsclient.read_json_file(
        os.path.join(model_path, "model", "options.json"))

    if filename_arg:
        filename = filename_arg
    else:
        # In-memory input: persist it so it can be handed over as a file.
        ds = DataFrame.create_dataframe(filename_arg, data, columns)
        filename = os.path.join(
            self.ctx.config.get_path(), '.augerml', 'predict_data.csv')
        ds.saveToCsvFile(filename, compression=None)

    try:
        predicted = self._docker_run_predict(filename, threshold, model_path)
    finally:
        # Clean up the unzipped model if it wasn't unzipped before.
        if not model_existed:
            shutil.rmtree(model_path, ignore_errors=True)
            model_path = None

    if filename_arg:
        if output:
            fsclient.move_file(predicted, output)
            return output
        return predicted

    ds_result = DataFrame.create_dataframe(predicted)
    ds_result.options['data_path'] = None
    ds_result.loaded_columns = columns

    # model_path is None here when the extracted model was just removed.
    if model_path:
        support_review = model_options.get("support_review_model")
    else:
        support_review = False

    return ModelHelper.save_prediction_result(
        ds_result,
        prediction_id=None,
        support_review_model=support_review,
        json_result=False,
        count_in_result=False,
        prediction_date=None,
        model_path=model_path,
        model_id=model_id,
        output=output)
def _get_feature_importances(self):
    """Load cached feature importances as a ``{feature: score}`` mapping.

    ``metrics.json`` is consulted first; when it holds no
    ``feature_importance_data`` entry, ``metric_names_feature_importance.json``
    is tried.  Both files are only read when ``ModelHelper.get_metric_path``
    yields a path for ``self.options``.

    Returns:
        dict pairing ``features`` with ``scores``; an empty dict (after
        logging a warning) when nothing is cached.
    """
    cache_path = ModelHelper.get_metric_path(self.options)

    importance_data = None
    if cache_path:
        # Both lookups stay under the cache_path guard so a missing path
        # can never reach os.path.join(None, ...).
        candidate_files = (
            "metrics.json",
            "metric_names_feature_importance.json",
        )
        for file_name in candidate_files:
            importance_data = fsclient.read_json_file(
                os.path.join(cache_path, file_name)
            ).get('feature_importance_data')
            if importance_data:
                break

    if importance_data:
        return dict(
            zip(importance_data['features'], importance_data['scores']))

    # logging.warn is a deprecated alias; use warning with lazy %-args.
    logging.warning(
        "No feature importance in cache: for model %s", cache_path)
    return {}
def __init__(self, params):
    """Resolve the model folder and load its ``options.json``.

    Args:
        params: mapping that may carry ``model_path`` and ``augerInfo``.
            Without a ``model_path`` the folder is derived from
            ``augerInfo['pipeline_id']`` and the optional
            ``augerInfo['projectPath']``.
    """
    self.model_path = params.get('model_path') or ModelHelper.get_model_path(
        params['augerInfo']['pipeline_id'],
        params['augerInfo'].get('projectPath'))

    options_file = os.path.join(self.model_path, "options.json")
    self.options = fsclient.read_json_file(options_file)

    # Caller-supplied augerInfo replaces whatever the options file holds.
    auger_info = params.get('augerInfo')
    if auger_info:
        self.options['augerInfo'] = auger_info

    self.target_feature = self.options.get('targetFeature')
def test_save_prediction(self):
    """Exercise ``ModelHelper.save_prediction`` in its four output modes.

    Covered: file output with review data, JSON output, list-of-dicts
    output (no data path), and columns/data output (no data path, loaded
    columns set).  Output files are removed before each scenario.
    """
    fixture_dir = 'tests/fixtures/test_predict_by_model/iris'
    options = fsclient.read_json_file(
        os.path.join(fixture_dir, "options.json"))
    model_uid = options.get('uid')

    prediction_id = "123"
    prediction_date = "today"
    predictions_dir = os.path.join(fixture_dir, "predictions")
    results_file_path = os.path.join(
        predictions_dir,
        prediction_date + '_' + prediction_id + "_results.feather.zstd")
    predicted_file_path = os.path.join(
        predictions_dir,
        "iris_test_" + prediction_id + "_" + model_uid + "_predicted.csv")

    def fresh_dataset():
        # Reload the fixture and verify no earlier output files remain.
        dataset = DataFrame.create_dataframe(
            os.path.join(fixture_dir, "iris_test.csv"))
        for stale_file in (results_file_path, predicted_file_path):
            fsclient.remove_file(stale_file)
            self.assertFalse(fsclient.is_file_exists(stale_file))
        return dataset

    def save(dataset, review, as_json):
        # Shared call; only the review and JSON switches vary per scenario.
        return ModelHelper.save_prediction(
            dataset, prediction_id,
            support_review_model=review,
            json_result=as_json,
            count_in_result=False,
            prediction_date=prediction_date,
            model_path=fixture_dir,
            model_id=model_uid)

    # Scenario 1: file input -> predicted CSV plus review results file.
    ds = fresh_dataset()
    res = save(ds, True, False)
    self.assertEqual(res, predicted_file_path)
    self.assertTrue(fsclient.is_file_exists(predicted_file_path))
    self.assertTrue(fsclient.is_file_exists(results_file_path))

    # Scenario 2: JSON string result.
    ds = fresh_dataset()
    res = json.loads(save(ds, True, True))
    self.assertEqual(res['columns'], ds.columns)
    self.assertEqual(len(res['data']), 6)

    # Scenario 3: no data path -> list of row dicts.
    ds = fresh_dataset()
    ds.options['data_path'] = None
    res = save(ds, False, False)
    self.assertEqual(type(res[0]), dict)
    self.assertEqual(res[0][options['targetFeature']], 'setosa')

    # Scenario 4: no data path, loaded columns set -> columns/data mapping.
    ds = fresh_dataset()
    ds.options['data_path'] = None
    ds.loaded_columns = ds.columns
    res = save(ds, False, False)
    self.assertEqual(res['columns'], ds.columns)
    self.assertEqual(len(res['data']), 6)
    self.assertEqual(type(res['data'][0]), list)