def test_process_prediction_proba(self):
    model_path = 'tests/fixtures/test_predict_by_model/iris'
    options = fsclient.read_json_file(
        os.path.join(model_path, "options.json"))
    target_categories = ["setosa", "versicolor", "virginica"]

    ds = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))
    ds.drop([options['targetFeature']])

    results = None  # [0, 1, 2, 0, 1, 2]
    results_proba = [[0.8, 0.1, 0.1], [0.4, 0.6, 0.1], [0.1, 0.2, 0.7],
                     [0.7, 0.2, 0.1], [0.3, 0.7, 0.1], [0.1, 0.3, 0.6]]
    results_proba = np.array(results_proba)
    proba_classes = [0, 1, 2]

    ModelHelper.process_prediction(ds, results, results_proba, proba_classes,
                                   0.5, None, options['targetFeature'],
                                   target_categories)

    ds_test = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))
    self.assertEqual(
        ds.columns, ds_test.columns +
        ["proba_setosa", "proba_versicolor", "proba_virginica"])
    self.assertEqual(ds.df[options['targetFeature']].values.tolist(),
                     ds_test.df[options['targetFeature']].values.tolist())
def build_review_data(self, data_path=None, output=None):
    if not data_path:
        data_path = self.options['data_path']

    ds_train = DataFrame.create_dataframe(data_path)

    all_files = fsclient.list_folder(
        os.path.join(self.model_path, "predictions/*_actuals.feather.zstd"),
        wild=True, remove_folder_name=False, meta_info=True)
    all_files.sort(key=lambda f: f['last_modified'], reverse=True)

    for (file, ds_actuals) in DataFrame.load_from_files(all_files):
        if not ds_actuals.df.empty:
            ds_actuals.drop(['prediction_id', 'prediction_group_id'])
            ds_train.df = pd.concat(
                [ds_train.df, ds_actuals.df[ds_train.columns]],
                ignore_index=True)

    ds_train.drop_duplicates()

    if not output:
        output = os.path.splitext(
            data_path)[0] + "_review_%s.feather.zstd" % (get_uid())

    ds_train.saveToFile(output)
    return output
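# Hedged usage sketch (not from the original source): how build_review_data()
# might be called on a ModelReview-like object. The paths are hypothetical
# placeholders.
def example_build_review_data():
    review = ModelReview({'model_path': 'models/iris'})
    # Merges the training data with every stored *_actuals.feather.zstd file
    # (newest first), deduplicates, and writes something like
    # 'data/iris_review_<uid>.feather.zstd'.
    return review.build_review_data(data_path='data/iris.csv')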
def test_process_prediction(self):
    model_path = 'tests/fixtures/test_predict_by_model/iris'
    options = fsclient.read_json_file(
        os.path.join(model_path, "options.json"))
    target_categories = ["setosa", "versicolor", "virginica"]

    ds = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))
    ds.drop([options['targetFeature']])

    results = [
        "setosa", "versicolor", "virginica", "setosa", "versicolor",
        "virginica"
    ]
    results_proba = None
    proba_classes = None

    ModelHelper.process_prediction(ds, results, results_proba, proba_classes,
                                   None, options.get('minority_target_class'),
                                   options['targetFeature'],
                                   target_categories)

    ds_test = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))
    self.assertEqual(ds.dtypes, ds_test.dtypes)
    self.assertEqual(ds.df.values.tolist(), ds_test.df.values.tolist())
def test_score_actuals_for_candidate_prediction():
    # Prediction data:
    # { 'prediction_id':'bef9be07-5534-434e-ab7c-c379d8fcfe77', 'species':'versicolor' },
    # { 'prediction_id':'f61b1bbc-6f7b-4e7e-9a3b-6acb6e1462cd', 'species':'virginica' }
    model_path = 'tests/fixtures/test_score_actuals/pr_can/candidate'
    prediction_group_id = '272B088D17A7490'

    # Primary prediction data:
    # { 'prediction_id':'09aaa96b-5d9c-4c45-ab04-726da868624b', 'species':'virginica' },
    # { 'prediction_id':'5e5ad22b-6789-47c6-9a4d-a3a998065127', 'species':'virginica' }
    primary_model_path = 'tests/fixtures/test_score_actuals/pr_can/primary'
    primary_prediction_group_id = 'A4FD5B64FEE5434'

    for actuals_path in glob.glob(model_path +
                                  '/predictions/*_actuals.feather.zstd'):
        os.remove(actuals_path)

    actuals = [{
        'prediction_id': '09aaa96b-5d9c-4c45-ab04-726da868624b',
        'actual': 'versicolor'
    }, {
        'prediction_id': '5e5ad22b-6789-47c6-9a4d-a3a998065127',
        'actual': 'virginica'
    }]
    res = ModelReview({
        'model_path': model_path
    }).add_actuals(actual_records=actuals,
                   prediction_group_id=prediction_group_id,
                   primary_prediction_group_id=primary_prediction_group_id,
                   primary_model_path=primary_model_path,
                   calc_score=True)
    assert type(res) == dict
    assert res['accuracy'] == 1.0

    actual_files = glob.glob(model_path +
                             '/predictions/*_actuals.feather.zstd')
    assert len(actual_files) == 1
    actual_file = actual_files[0]
    assert str(datetime.date.today()) in actual_file

    stored_actuals = DataFrame({})
    stored_actuals.loadFromFeatherFile(actual_file)
    assert 'prediction_group_id' in stored_actuals.columns

    stored_actuals = json.loads(
        stored_actuals.df.sort_values(by=['prediction_id']).to_json(
            orient='records'))

    assert stored_actuals[0][
        'prediction_id'] == 'bef9be07-5534-434e-ab7c-c379d8fcfe77'
    assert stored_actuals[0]['prediction_group_id'] == prediction_group_id
    assert stored_actuals[0]['species'] == 'versicolor'

    assert stored_actuals[1][
        'prediction_id'] == 'f61b1bbc-6f7b-4e7e-9a3b-6acb6e1462cd'
    assert stored_actuals[1]['prediction_group_id'] == prediction_group_id
    assert stored_actuals[1]['species'] == 'virginica'
def _predict_locally(self, filename_arg, model_id, threshold, data, columns,
                     output):
    model_deploy = ModelDeploy(self.ctx, None)
    is_model_loaded, model_path, model_name = \
        model_deploy.verify_local_model(model_id)

    if not is_model_loaded:
        raise AugerException(
            'Model isn\'t loaded locally. '
            'Please use a2ml deploy command to download model.')

    model_path, model_existed = self._extract_model(model_name)
    model_options = fsclient.read_json_file(
        os.path.join(model_path, "model", "options.json"))

    filename = filename_arg
    if not filename:
        ds = DataFrame.create_dataframe(filename, data, columns)
        filename = os.path.join(self.ctx.config.get_path(), '.augerml',
                                'predict_data.csv')
        ds.saveToCsvFile(filename, compression=None)

    try:
        predicted = \
            self._docker_run_predict(filename, threshold, model_path)
    finally:
        # Clean up the unzipped model, but only if it wasn't unzipped before.
        if not model_existed:
            shutil.rmtree(model_path, ignore_errors=True)
            model_path = None

    if not filename_arg:
        ds_result = DataFrame.create_dataframe(predicted)
        ds_result.options['data_path'] = None
        ds_result.loaded_columns = columns

        return ModelHelper.save_prediction_result(
            ds_result,
            prediction_id=None,
            support_review_model=model_options.get("support_review_model")
            if model_path else False,
            json_result=False,
            count_in_result=False,
            prediction_date=None,
            model_path=model_path,
            model_id=model_id,
            output=output)
    elif output:
        fsclient.move_file(predicted, output)
        predicted = output

    return predicted
def _predict_on_cloud(self, filename, model_id, threshold=None):
    target = self.ctx.config.get('target', None)
    records, features = DataFrame.load_records(filename, target)

    pipeline_api = AugerPipelineApi(self.ctx, None, model_id)
    predictions = pipeline_api.predict(records, features, threshold)

    predicted = os.path.splitext(filename)[0] + "_predicted.csv"
    DataFrame.save(predicted, predictions)

    return predicted
def count_actuals_by_prediction_id(self):
    res = {}
    features = ['prediction_group_id', 'prediction_id', self.target_feature]
    counter = ProbabilisticCounter()

    all_files = fsclient.list_folder(
        os.path.join(self.model_path, "predictions/*_actuals.feather.zstd"),
        wild=True, remove_folder_name=False, meta_info=False)

    for (file, df) in DataFrame.load_from_files(all_files, features):
        ModelReview._remove_duplicates_by(df, 'prediction_id', counter)

        agg = df.df.groupby(['prediction_group_id', 'prediction_id']).count()
        # Exclude duplicated prediction_id's inside groups.
        agg[self.target_feature] = 1
        agg = agg.groupby('prediction_group_id').count()

        for prediction_group_id, row in agg.iterrows():
            count = row[0]
            if prediction_group_id not in res:
                res[prediction_group_id] = count
            else:
                res[prediction_group_id] += count

    return res
def predict(self, filename, model_id, threshold=None, locally=False,
            data=None, columns=None, output=None, json_result=False,
            count_in_result=False, prediction_date=None, prediction_id=None):
    ds = DataFrame.create_dataframe(filename, data, columns)
    model_path = self.ctx.config.get_model_path(model_id)
    options = fsclient.read_json_file(
        os.path.join(model_path, "options.json"))

    if locally:
        results, results_proba, proba_classes, target_categories = \
            self._predict_locally(ds.df, model_id, threshold)
    else:
        results, results_proba, proba_classes, target_categories = \
            self._predict_remotely(ds.df, model_id, threshold)

    if target_categories and len(target_categories) == 2:
        for idx, item in enumerate(target_categories):
            if item == "False":
                target_categories[idx] = False
            if item == "True":
                target_categories[idx] = True

    ModelHelper.process_prediction(
        ds, results, results_proba, proba_classes, threshold,
        options.get('minority_target_class',
                    self.ctx.config.get('minority_target_class')),
        options.get('targetFeature', self.ctx.config.get('target', None)),
        target_categories)

    predicted = ModelHelper.save_prediction(
        ds, prediction_id, options.get('support_review_model', True),
        json_result, count_in_result, prediction_date, model_path, model_id,
        output)

    if filename:
        self.ctx.log('Predictions stored in %s' % predicted)

    return {'predicted': predicted}
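# Hedged sketch (an assumption, not from the original source): calling
# predict() with inline records instead of a file, mirroring the signature
# above. The model id and feature names are hypothetical.
def example_predict_inline(model):
    # Returns {'predicted': ...}; the payload format depends on json_result
    # and on whether the DataFrame carries a data_path.
    return model.predict(
        filename=None,
        model_id='A1B2C3D4E5F6',  # hypothetical deployed model id
        threshold=0.5,
        locally=True,
        data=[[5.1, 3.5, 1.4, 0.2]],
        columns=['sepal_length', 'sepal_width', 'petal_length',
                 'petal_width'])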
def preprocess_target(model_path, data_path=None, records=None,
                      features=None):
    ds = DataFrame.create_dataframe(data_path, records, features)
    return ModelHelper.preprocess_target_ds(model_path, ds)
def predict(self, filename, model_id, threshold, locally):
    ws = AzureProject(self.ctx)._get_ws()
    experiment_name = self.ctx.config.get('experiment/name', None)
    if experiment_name is None:
        raise AzureException('Please specify Experiment name...')
    experiment = Experiment(ws, experiment_name)

    target = self.ctx.config.get('target', None)
    predict_data = DataFrame.load(filename, target)

    if locally:
        y_pred, y_proba, proba_classes = self._predict_locally(
            experiment, predict_data, model_id, threshold)
    else:
        y_pred, y_proba, proba_classes = self._predict_remotely(
            ws, experiment, predict_data, model_id, threshold)

    predict_data[target] = y_pred

    if y_proba is not None:
        for idx, name in enumerate(proba_classes):
            predict_data['proba_' + str(name)] = list(y_proba[:, idx])

    predicted = self._save_predictions(predict_data, filename)
    return {'predicted': predicted}
def score_model_performance_daily(self, date_from, date_to):
    features = ['prediction_id', self.target_feature]
    res = {}

    for (curr_date, files) in ModelReview._prediction_files_by_day(
            self.model_path, date_from, date_to, "_*_actuals.feather.zstd"):
        df_actuals = DataFrame({})
        for (file, df) in DataFrame.load_from_files(files, features):
            df_actuals.df = pd.concat([df_actuals.df, df.df])

        if df_actuals.count() > 0:
            df_actuals.df.rename(
                columns={self.target_feature: 'a2ml_actual'}, inplace=True)
            scores = self._process_actuals(ds_actuals=df_actuals,
                                           calc_score=True)
            res[str(curr_date)] = scores[self.options.get('score_name')]

    return res
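# Hedged usage sketch (the model path and exact date argument types are
# assumptions): daily scores for the last week, keyed by date string.
def example_daily_scores():
    date_to = datetime.date.today()
    date_from = date_to - datetime.timedelta(days=7)
    review = ModelReview({'model_path': 'models/iris'})
    # Returns something like {'2020-05-01': 0.95, '2020-05-02': 0.97, ...}
    # using the metric named by options['score_name'].
    return review.score_model_performance_daily(date_from, date_to)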
def _predict_on_cloud(self, filename, model_id, threshold, data, columns,
                      output):
    ds = DataFrame.create_dataframe(filename, data, columns)
    pipeline_api = AugerPipelineApi(self.ctx, None, model_id)
    predictions = pipeline_api.predict(ds.get_records(), ds.columns,
                                       threshold)

    ds_result = DataFrame.create_dataframe(None,
                                           records=predictions['data'],
                                           features=predictions['columns'])
    ds_result.options['data_path'] = filename

    return ModelHelper.save_prediction_result(ds_result,
                                              prediction_id=None,
                                              support_review_model=False,
                                              json_result=False,
                                              count_in_result=False,
                                              prediction_date=None,
                                              model_path=None,
                                              model_id=model_id,
                                              output=output)
def add_actuals(self, actuals_path=None, actual_records=None,
                prediction_group_id=None, primary_prediction_group_id=None,
                primary_model_path=None, actual_date=None, actuals_id=None,
                calc_score=True):
    features = None
    if actuals_path or (actual_records and type(actual_records[0]) == list):
        features = ['prediction_id', 'actual']

    ds_actuals = DataFrame.create_dataframe(actuals_path, actual_records,
                                            features=features)
    result = self._process_actuals(ds_actuals, prediction_group_id,
                                   primary_prediction_group_id,
                                   primary_model_path, actual_date,
                                   actuals_id, calc_score,
                                   raise_not_found=True)

    ds_actuals.drop(self.target_feature)
    ds_actuals.df = ds_actuals.df.rename(
        columns={'a2ml_actual': self.target_feature})

    if not actuals_id:
        actuals_id = get_uid()

    file_name = str(actual_date or datetime.date.today()) + \
        '_' + actuals_id + "_actuals.feather.zstd"
    ds_actuals.saveToFeatherFile(
        os.path.join(self.model_path, "predictions", file_name))

    return result
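# Hedged sketch (not from the original source) of the two actual_records
# shapes add_actuals() accepts, per the type(actual_records[0]) == list check
# above; ids and labels are hypothetical.
def example_add_actuals(review):
    # 1) A list of dicts with explicit keys:
    review.add_actuals(actual_records=[
        {'prediction_id': 'id-1', 'actual': 'setosa'},
    ])
    # 2) A list of [prediction_id, actual] pairs, which triggers the implicit
    #    features=['prediction_id', 'actual'] mapping:
    review.add_actuals(actual_records=[['id-2', 'virginica']])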
def test_save_prediction(self):
    model_path = 'tests/fixtures/test_predict_by_model/iris'
    options = fsclient.read_json_file(
        os.path.join(model_path, "options.json"))

    prediction_id = "123"
    prediction_date = "today"
    results_file_path = os.path.join(
        model_path, "predictions",
        prediction_date + '_' + prediction_id + "_results.feather.zstd")
    predicted_file_path = os.path.join(
        model_path, "predictions", "iris_test_" + prediction_id + "_" +
        options.get('uid') + "_predicted.csv")

    # Case 1: data_path set, json_result=False -> result is the predicted CSV path.
    ds = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))

    fsclient.remove_file(results_file_path)
    self.assertFalse(fsclient.is_file_exists(results_file_path))
    fsclient.remove_file(predicted_file_path)
    self.assertFalse(fsclient.is_file_exists(predicted_file_path))

    res = ModelHelper.save_prediction(ds,
                                      prediction_id,
                                      support_review_model=True,
                                      json_result=False,
                                      count_in_result=False,
                                      prediction_date=prediction_date,
                                      model_path=model_path,
                                      model_id=options.get('uid'))
    self.assertEqual(res, predicted_file_path)
    self.assertTrue(fsclient.is_file_exists(predicted_file_path))
    self.assertTrue(fsclient.is_file_exists(results_file_path))

    # Case 2: json_result=True -> result is a JSON string with columns/data.
    ds = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))

    fsclient.remove_file(results_file_path)
    self.assertFalse(fsclient.is_file_exists(results_file_path))
    fsclient.remove_file(predicted_file_path)
    self.assertFalse(fsclient.is_file_exists(predicted_file_path))

    res = ModelHelper.save_prediction(ds,
                                      prediction_id,
                                      support_review_model=True,
                                      json_result=True,
                                      count_in_result=False,
                                      prediction_date=prediction_date,
                                      model_path=model_path,
                                      model_id=options.get('uid'))
    res = json.loads(res)
    self.assertEqual(res['columns'], ds.columns)
    self.assertEqual(len(res['data']), 6)

    # Case 3: no data_path -> result is a list of record dicts.
    ds = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))

    fsclient.remove_file(results_file_path)
    self.assertFalse(fsclient.is_file_exists(results_file_path))
    fsclient.remove_file(predicted_file_path)
    self.assertFalse(fsclient.is_file_exists(predicted_file_path))

    ds.options['data_path'] = None
    res = ModelHelper.save_prediction(ds,
                                      prediction_id,
                                      support_review_model=False,
                                      json_result=False,
                                      count_in_result=False,
                                      prediction_date=prediction_date,
                                      model_path=model_path,
                                      model_id=options.get('uid'))
    self.assertEqual(type(res[0]), dict)
    self.assertEqual(res[0][options['targetFeature']], 'setosa')

    # Case 4: no data_path, loaded_columns set -> result is a columns/data dict.
    ds = DataFrame.create_dataframe(
        os.path.join(model_path, "iris_test.csv"))

    fsclient.remove_file(results_file_path)
    self.assertFalse(fsclient.is_file_exists(results_file_path))
    fsclient.remove_file(predicted_file_path)
    self.assertFalse(fsclient.is_file_exists(predicted_file_path))

    ds.options['data_path'] = None
    ds.loaded_columns = ds.columns
    res = ModelHelper.save_prediction(ds,
                                      prediction_id,
                                      support_review_model=False,
                                      json_result=False,
                                      count_in_result=False,
                                      prediction_date=prediction_date,
                                      model_path=model_path,
                                      model_id=options.get('uid'))
    self.assertEqual(res['columns'], ds.columns)
    self.assertEqual(len(res['data']), 6)
    self.assertEqual(type(res['data'][0]), list)
def _distribution_stats(self, date_from, date_to, path_suffix, features,
                        categoricalFeatures=[], feature_mapper={}):
    res = {}
    feature_importances = self._get_feature_importances()
    counter = ProbabilisticCounter()
    second_pass_counter = ProbabilisticCounter()

    for (curr_date, files) in ModelReview._prediction_files_by_day(
            self.model_path, date_from, date_to, path_suffix):
        stats = {}
        for feature in features:
            stats[feature] = {
                'count': 0,
                'sum': 0,
                'sq_sum': 0,
                'dist': None,
                'imp': feature_importances.get(feature, 0)
            }

        df_list = []
        for (file, df) in DataFrame.load_from_files(files, features):
            ModelReview._remove_duplicates_by(df, 'prediction_id', counter)
            df_list.append(df)

        # First pass: accumulate per-column sum and count for the average.
        for df in df_list:
            for feature in features:
                stats[feature]['count'] += df.df[feature].count()

                if df.df[feature].dtype.name in [
                        'category', 'string', 'object'
                ] or feature in categoricalFeatures:
                    stats[feature]['dist'] = merge_dicts(
                        stats[feature]['dist'] or {},
                        dict(df.df[feature].value_counts()),
                        lambda v, ov: v + ov)
                else:
                    stats[feature]['sum'] += df.df[feature].sum()

        # Calculate the average.
        for feature in features:
            if stats[feature]['count'] > 0 and stats[feature]['dist'] is None:
                stats[feature]['average'] = \
                    stats[feature]['sum'] / stats[feature]['count']

        # Second pass: accumulate squared deviations from the average for the
        # standard deviation.
        for df in df_list:
            ModelReview._remove_duplicates_by(df, 'prediction_id',
                                              second_pass_counter)
            for feature in features:
                if 'average' in stats[feature]:
                    avg = stats[feature]['average']
                    stats[feature]['sq_sum'] += \
                        ((df.df[feature] - avg)**2).sum()

        # Calculate the standard deviation.
        if len(files) > 0:
            res[str(curr_date)] = ModelReview._calc_stddev_for_features(
                stats, features, feature_mapper)

    return res
def _process_actuals(self, ds_actuals, prediction_group_id=None,
                     primary_prediction_group_id=None,
                     primary_model_path=None, actual_date=None,
                     actuals_id=None, calc_score=False,
                     raise_not_found=False):
    ds_actuals.df.rename(columns={"actual": 'a2ml_actual'}, inplace=True)
    actuals_count = ds_actuals.count()

    primary_ds = None
    if primary_prediction_group_id:
        files = ModelReview._get_prediction_files(
            primary_model_path, primary_prediction_group_id)

        for (_, df) in DataFrame.load_from_files(
                files, features=['prediction_id']):
            primary_ds = df
            break  # should be only one file

    origin_dtypes = []
    origin_columns = []
    prediction_files = ModelReview._get_prediction_files(
        self.model_path, prediction_group_id)
    actual_index = False
    match_count = 0

    for (file, df_prediction_results) in DataFrame.load_from_files(
            prediction_files):
        origin_dtypes = df_prediction_results.df.dtypes
        origin_columns = df_prediction_results.df.columns

        if primary_ds is not None:
            ds_actuals.df['prediction_id'] = \
                ModelReview._map_primary_prediction_id_to_candidate(
                    ds_actuals.df['prediction_id'],
                    primary_ds.df['prediction_id'],
                    df_prediction_results.df['prediction_id'])

        if not actual_index:
            ds_actuals.df.set_index('prediction_id', inplace=True)
            actual_index = True

        underscore_split = os.path.basename(file['path']).split('_')
        if len(underscore_split) == 3:
            # date_group-id_suffix (new file name with date)
            prediction_group_id = underscore_split[1]
        else:
            # group-id_suffix (old file name without date)
            prediction_group_id = underscore_split[0]

        df_prediction_results.df['prediction_group_id'] = prediction_group_id

        matched_scope = df_prediction_results.df[
            df_prediction_results.df['prediction_id'].isin(
                ds_actuals.df.index)]
        matched_scope.set_index('prediction_id', inplace=True)
        ds_actuals.df = ds_actuals.df.combine_first(matched_scope)

        match_count = ds_actuals.df.count()[self.target_feature]
        if actuals_count == match_count or primary_ds is not None:
            break

    if raise_not_found and match_count == 0 and primary_ds is None:
        raise Exception(
            "Actual Prediction IDs not found in model predictions.")

    ds_actuals.df.reset_index(inplace=True)
    ds_actuals.dropna(columns=[self.target_feature, 'a2ml_actual'])

    # combine_first changes original non-float64 types to float64 when NaN
    # values appear while merging the tables. A good explanation:
    # https://stackoverflow.com/a/15353297/898680
    # Fix: store the original dtypes and force them back after merging.
    for col in origin_columns:
        if col != 'prediction_id':
            ds_actuals.df[col] = ds_actuals.df[col].astype(
                origin_dtypes[col], copy=False)

    ds_actuals.df['a2ml_actual'] = ds_actuals.df['a2ml_actual'].astype(
        origin_dtypes[self.target_feature], copy=False)

    result = True
    if calc_score:
        ds_true = DataFrame({})
        ds_true.df = ds_actuals.df[['a2ml_actual']].rename(
            columns={'a2ml_actual': self.target_feature})

        y_pred, _ = ModelHelper.preprocess_target_ds(self.model_path,
                                                     ds_actuals)
        y_true, _ = ModelHelper.preprocess_target_ds(self.model_path,
                                                     ds_true)

        result = ModelHelper.calculate_scores(self.options,
                                              y_test=y_true,
                                              y_pred=y_pred)

    return result
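# Standalone illustration of the combine_first dtype pitfall handled above
# (plain pandas, not from the original source): integer columns are upcast to
# float64 once NaN values appear during the merge, so the original dtypes
# have to be restored afterwards.
import numpy as np
import pandas as pd

def example_combine_first_dtype_fix():
    predictions = pd.DataFrame({'class_id': [0, 1, 2]},
                               index=['a', 'b', 'c'])  # int64 column
    actuals = pd.DataFrame({'class_id': [np.nan, 1, np.nan]},
                           index=['a', 'b', 'c'])      # NaN gaps -> float64
    merged = actuals.combine_first(predictions)
    assert merged['class_id'].dtype == 'float64'  # int64 silently upcast

    # Force the original dtype back, as _process_actuals does:
    merged['class_id'] = merged['class_id'].astype(
        predictions['class_id'].dtype, copy=False)
    assert merged['class_id'].dtype == 'int64'
    return merged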
def test_score_actuals_with_not_full_actuals():
    model_path = 'tests/fixtures/test_score_actuals'

    for actuals_path in glob.glob(model_path +
                                  '/predictions/*_actuals.feather.zstd'):
        os.remove(actuals_path)

    actuals = [
        {
            'prediction_id': '5c93079c-00c9-497a-8967-53fa0dd02054',
            'actual': False
        },
        {
            'prediction_id': 'b1bf9ebf-0277-4771-9bc5-236690a21194',
            'actual': False
        },
        {
            'prediction_id': 'f61b1bbc-6f7b-4e7e-9a3b-6acb6e1462cd',
            'actual': True
        },
    ]
    actual_date = datetime.date.today() - datetime.timedelta(days=1)
    res = ModelReview({
        'model_path': model_path
    }).add_actuals(actuals_path=None,
                   actual_records=actuals,
                   actual_date=actual_date)

    actual_files = glob.glob(model_path +
                             '/predictions/*_actuals.feather.zstd')
    assert len(actual_files) > 0
    assert str(actual_date) in actual_files[0]

    stored_actuals = DataFrame({})
    stored_actuals.loadFromFeatherFile(actual_files[0])
    assert 'prediction_group_id' in stored_actuals.columns

    stored_actuals = json.loads(
        stored_actuals.df.sort_values(by=['prediction_id']).to_json(
            orient='records'))
    assert len(stored_actuals) == len(actuals)  #+ 1

    assert stored_actuals[0][
        'prediction_id'] == '5c93079c-00c9-497a-8967-53fa0dd02054'
    assert stored_actuals[0][
        'prediction_group_id'] == '2ab1e430-6082-4465-b057-3408d36de144'
    assert stored_actuals[0]['feature1'] == 1
    assert stored_actuals[0]['income'] == False

    assert stored_actuals[1][
        'prediction_id'] == 'b1bf9ebf-0277-4771-9bc5-236690a21194'
    assert stored_actuals[1][
        'prediction_group_id'] == '2ab1e430-6082-4465-b057-3408d36de144'
    assert stored_actuals[1]['feature1'] == 1.1
    assert stored_actuals[1]['income'] == False

    assert stored_actuals[2][
        'prediction_id'] == 'f61b1bbc-6f7b-4e7e-9a3b-6acb6e1462cd'
    assert stored_actuals[2][
        'prediction_group_id'] == '03016c26-f69a-416f-817f-4c58cd69d675'
    assert stored_actuals[2]['feature1'] == 1.3
    assert stored_actuals[2]['income'] == True