def saved_model_identity(pparams):
    """Train a model, then verify that predictions made from the saved model
    tarball reproduce the metrics recorded in model_metrics.json."""
    script_path = os.path.dirname(os.path.realpath(__file__))
    if not pparams.previously_split:
        split_uuid = split(pparams)
        pparams.split_uuid = split_uuid
        pparams.previously_split = True

    train_pipe = train(pparams)
    split_csv = os.path.join(script_path, '../../test_datasets/',
                             train_pipe.data._get_split_key())
    test_df = get_test_set(pparams.dataset_key, split_csv, pparams.id_col)

    # Verify against the metrics saved during training
    with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f:
        model_metrics = json.load(f)

    # Only compare test results, since there's only one fold for test
    metrics = find_best_test_metric(model_metrics)
    id_col = metrics['input_dataset']['id_col']
    response_col = metrics['input_dataset']['response_cols'][0]
    smiles_col = metrics['input_dataset']['smiles_col']
    test_length = metrics['prediction_results']['num_compounds']

    # Predict twice from the same tarball to confirm the results are reproducible
    model_tar = train_pipe.params.model_tarball_path
    pred_df = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col,
                                          smiles_col=smiles_col,
                                          response_col=response_col)
    pred_df2 = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col,
                                           smiles_col=smiles_col,
                                           response_col=response_col)

    X = pred_df[response_col + '_actual'].values
    y = pred_df[response_col + '_pred'].values
    y2 = pred_df2[response_col + '_pred'].values

    r2 = skmetrics.r2_score(X, y)
    rms = np.sqrt(skmetrics.mean_squared_error(X, y))
    mae = skmetrics.mean_absolute_error(X, y)

    saved_r2 = metrics['prediction_results']['r2_score']
    saved_rms = metrics['prediction_results']['rms_score']
    saved_mae = metrics['prediction_results']['mae_score']

    print(metrics['subset'])
    print(pred_df.columns)
    print(abs(r2 - saved_r2))
    print(abs(rms - saved_rms))
    print(abs(mae - saved_mae))
    print(np.mean(abs(y2 - y)))

    # Recomputed metrics must match the saved ones, the two prediction runs
    # must agree, and the test set size must match the recorded compound count
    assert abs(r2 - saved_r2) < 1e-5 \
        and abs(rms - saved_rms) < 1e-5 \
        and abs(mae - saved_mae) < 1e-5 \
        and np.mean(abs(y2 - y)) < 1e-5 \
        and (test_length == len(test_df))
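# ---------------------------------------------------------------------------
# The find_best_test_metric() helper called above is defined elsewhere in the
# test module. A minimal sketch of what it needs to do, assuming
# model_metrics.json holds a list of per-subset metric records carrying the
# 'subset' and 'prediction_results' keys accessed above (a hypothetical
# reconstruction, not the repo's actual implementation):
# ---------------------------------------------------------------------------
def find_best_test_metric(model_metrics):
    # Keep only records scored on the held-out test subset, then return the
    # one with the best r2_score (with a single test fold there is usually
    # exactly one such record).
    test_metrics = [m for m in model_metrics if m['subset'] == 'test']
    if not test_metrics:
        return None
    return max(test_metrics, key=lambda m: m['prediction_results']['r2_score'])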
def test_predict_from_model():
    """Test that predict_from_model makes predictions in the same order as the input."""
    model_path = '../../examples/BSEP/models/bsep_classif_scaffold_split.tar.gz'
    csv_path = '../../examples/BSEP/data/ChEMBL25_BSEP_curated_data.csv'
    id_col = 'compound_id'
    smiles_col = 'base_rdkit_smiles'
    response_col = 'active'

    df = pd.read_csv(csv_path, dtype={id_col: str})
    # Shuffle the rows so that order preservation is actually exercised
    shuffled_df = df.sample(frac=1)
    pred_df = pfm.predict_from_model_file(model_path, shuffled_df, id_col=id_col,
                                          smiles_col=smiles_col,
                                          response_col=response_col)

    old_id_col = shuffled_df[id_col].values
    new_id_col = pred_df[id_col].values
    match_rows = all([n == o for n, o in zip(new_id_col, old_id_col)])
    print(match_rows)
    assert match_rows

    score = skm.accuracy_score(shuffled_df[response_col].values,
                               pred_df[response_col + '_pred'].values)
    print(score)
    assert score > 0.5
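# A small reusable helper capturing the invariant checked above: predictions
# must come back in the same row order as the input frame. This is a
# hypothetical convenience utility, not part of the repo.
def assert_same_order(input_df, pred_df, id_col):
    assert list(input_df[id_col].values) == list(pred_df[id_col].values), \
        'prediction rows are not in input order'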
def test():
    """
    Test full model pipeline: curate data, fit model, and predict property for new compounds.
    """
    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_RF.json", "r") as f:
        hp_params = json.load(f)

    # Locate the package script directory from the parameter_parser module path
    # (more robust than stripping the file name from the path string)
    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(parse.__file__)))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir, params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print("Train an RF model with ECFP features")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Calculate AD index with the just-trained model.")
    pred_df_mp = pl.predict_on_dataframe(train_df[:10], contains_responses=True,
                                         AD_method="z_score")
    assert ("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column in pred_df_mp'

    print("Calculate AD index with the saved model tarball file.")
    pred_df_file = pfm.predict_from_model_file(model_path=pl.params.model_tarball_path,
                                               input_df=train_df[:10],
                                               id_col="compound_id",
                                               smiles_col="rdkit_smiles",
                                               response_col="pKi_mean",
                                               dont_standardize=True,
                                               AD_method="z_score")
    assert ("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'
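# For reference, a minimal sketch of the kind of applicability-domain score
# that AD_method="z_score" refers to: how far each query compound's feature
# vector lies from the training-set distribution, in standard-deviation
# units. This is an illustrative toy, not AMPL's actual implementation.
import numpy as np

def z_score_ad_index(train_feats, test_feats):
    mu = train_feats.mean(axis=0)
    sigma = train_feats.std(axis=0) + 1e-9   # guard against zero variance
    # Average absolute z-score across features; larger values mean the
    # compound lies farther outside the training domain.
    return np.abs((test_feats - mu) / sigma).mean(axis=1)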
def predict_activity(args):
    # Prepare inputs
    input_df = pd.read_csv(args.input_file, index_col=False)
    colnames = set(input_df.columns.values)
    if args.id_col not in colnames:
        # Generate sequential compound IDs if none were provided
        input_df['compound_id'] = ['compound_%.6d' % i for i in range(input_df.shape[0])]
        args.id_col = 'compound_id'
    if args.smiles_col not in colnames:
        raise ValueError('smiles_col parameter not specified or column not in input file.')

    model_files = dict(random='bsep_classif_random_split.tar.gz',
                       scaffold='bsep_classif_scaffold_split.tar.gz')
    if args.model_type not in model_files:
        raise ValueError("model_type %s is not a recognized value." % args.model_type)

    if args.external_training_data is not None:
        data_file = os.path.join(os.getcwd(), args.external_training_data)
    else:
        data_file = None
    print("Data file:", data_file)

    # Load the model from its tarball and run predictions
    models_dir = os.path.join(os.path.dirname(__file__), 'models')
    model_tarfile = os.path.join(models_dir, model_files[args.model_type])

    # Predict
    pred_df = pfm.predict_from_model_file(model_path=model_tarfile,
                                          input_df=input_df,
                                          id_col=args.id_col,
                                          smiles_col=args.smiles_col,
                                          response_col=args.response_col,
                                          dont_standardize=args.dont_standardize,
                                          is_featurized=args.is_featurized,
                                          AD_method=args.AD_method,
                                          external_training_data=data_file)

    # Delete temporary split files created during prediction
    for root, dirs, files in os.walk(os.getcwd(), topdown=False):
        for file in files:
            if 'train_valid_test' in file:
                os.remove(os.path.join(root, file))

    # Write predictions to output file
    pred_df.to_csv(args.output_file, index=False)
    print("Wrote predictions to file %s" % args.output_file)

    # If measured activity values are provided, print some performance metrics
    if args.response_col is not None:
        actual_vals = pred_df['%s_actual' % args.response_col].values
        pred_classes = pred_df['%s_pred' % args.response_col].values
        pred_probs = pred_df['%s_prob' % args.response_col].values
        conf_matrix = metrics.confusion_matrix(actual_vals, pred_classes)
        roc_auc = metrics.roc_auc_score(actual_vals, pred_probs)
        prc_auc = metrics.average_precision_score(actual_vals, pred_probs)
        accuracy = metrics.accuracy_score(actual_vals, pred_classes)
        precision = metrics.precision_score(actual_vals, pred_classes)
        npv = negative_predictive_value(actual_vals, pred_classes)
        recall = metrics.recall_score(actual_vals, pred_classes)
        mcc = metrics.matthews_corrcoef(actual_vals, pred_classes)
        ncorrect = sum(actual_vals == pred_classes)

        print("Performance metrics:\n")
        print("%d out of %d predictions correct." % (ncorrect, pred_df.shape[0]))
        print("Accuracy: %.3f" % accuracy)
        print("Precision: %.3f" % precision)
        print("Recall: %.3f" % recall)
        print("NPV: %.3f" % npv)
        print("ROC AUC: %.3f" % roc_auc)
        print("PRC AUC: %.3f" % prc_auc)
        print("Matthews correlation coefficient: %.3f" % mcc)
        print("Confusion matrix:")
        print("\t\tpredicted activity")
        print("actual\nactivity\t0\t1\n")
        print("   0\t\t%d\t%d" % (conf_matrix[0][0], conf_matrix[0][1]))
        print("   1\t\t%d\t%d" % (conf_matrix[1][0], conf_matrix[1][1]))
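# Sketch of the negative_predictive_value() helper referenced above, assuming
# binary 0/1 labels; NPV = TN / (TN + FN). A hypothetical reconstruction of a
# helper defined elsewhere in this script.
import numpy as np

def negative_predictive_value(actual_vals, pred_classes):
    actual = np.asarray(actual_vals)
    pred = np.asarray(pred_classes)
    tn = np.sum((actual == 0) & (pred == 0))   # true negatives
    fn = np.sum((actual == 1) & (pred == 0))   # false negatives
    return tn / (tn + fn) if (tn + fn) > 0 else float('nan')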
def train_and_predict(train_json_f, prefix='delaney-processed'):
    # Train model
    # -----------
    # Read parameter JSON file
    with open(train_json_f) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
    model_type = params.model_type
    prediction_type = params.prediction_type
    descriptor_type = params.descriptor_type
    featurizer = params.featurizer
    splitter = params.splitter
    model_dir = 'result/%s_curated_fit/%s_%s_%s_%s' % (
        prefix, model_type, featurizer, splitter, prediction_type)
    uuid = model.params.model_uuid
    tar_f = 'result/%s_curated_fit_model_%s.tar.gz' % (prefix, uuid)
    reload_dir = model_dir + '/' + uuid

    # Check training statistics
    # -------------------------
    if prediction_type == 'regression':
        threshold = 0.6
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])
        integrative_utilities.training_statistics_file(reload_dir, 'test', threshold, 'r2_score')
        score = integrative_utilities.read_training_statistics_file(reload_dir, 'test', 'r2_score')
    else:
        threshold = 0.7
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])
        integrative_utilities.training_statistics_file(reload_dir, 'test', threshold, 'accuracy_score')
        score = integrative_utilities.read_training_statistics_file(reload_dir, 'test', 'accuracy_score')

    print("Final test score:", score)

    # Load second test set
    # --------------------
    data = pd.read_csv('%s_curated_external.csv' % prefix)

    predict = pfm.predict_from_model_file(tar_f, data, id_col=params.id_col,
                                          smiles_col=params.smiles_col,
                                          response_col=params.response_cols)
    pred_cols = [f for f in predict.columns if f.endswith('_pred')]
    pred = predict[pred_cols].to_numpy()

    # Check predictions
    # -----------------
    assert (pred.shape[0] == len(data)), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(pred))), 'Error: Predictions are not numbers'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data, predict, on=params.id_col, how='inner')
    pred_csv_name = '%s_curated_%s_%s_%s_%s_%d_%s_predict.csv' % (
        prefix, model_type, prediction_type, descriptor_type, featurizer,
        len(model.params.response_cols), model.params.splitter)
    combined.to_csv(pred_csv_name)
    assert (os.path.isfile(pred_csv_name)
            and os.path.getsize(pred_csv_name) > 0), 'Error: Prediction file not created'

    return tar_f
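# Example invocation, assuming a training-parameter JSON like the one read
# above (the file name here is hypothetical):
if __name__ == '__main__':
    tar_file = train_and_predict('delaney-processed_train.json', prefix='delaney-processed')
    print("Saved model tarball:", tar_file)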