def test_command_line_namespace_and_dict_input():
    """Check that namespace-style and dict-style inputs parse identically.

    ``parse.wrapper`` is applied first to ``command_line_namespace_inputs``
    and then to ``command_line_dict_inputs``.  The same attribute checks are
    run on each result; every outcome is collected and asserted at the end.
    """
    expected_descriptor_key = '/ds/projdata/gsk_data/GSK_Descriptors/all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi.csv'
    expected_model_filter = {
        'model_uuid': 'uuid_1',
        'ModelMetadata.TrainingDataset.dataset_key': "['=', 2]",
        'ModelMetadata.TrainingDataset.dataset_bucket': "['>', 2]",
        'ModelMetadata.TrainingDataset.dataset_oid': "['>=', 2]",
        'ModelMetadata.TrainingDataset.class_names': "['in', [1,2,3]]",
        'ModelMetadata.TrainingDataset.num_classes': "['<', 2]",
        'ModelMetadata.TrainingDataset.feature_transform_type': "['<=', 2]",
        'ModelMetadata.TrainingDataset.response_transform_type': "['!=', 3]",
        'ModelMetadata.TrainingDataset.id_col': "['nin', [0,1,3,4]]",
    }

    def _checks(parsed):
        # One entry per expectation, in the same order for both input kinds.
        return [
            parsed.dataset_key == '/ds/data/public/delaney/delaney-processed.csv',
            parsed.layer_sizes == [[42, 42]],
            parsed.batch_size == 63,
            parsed.previously_split,
            parsed.descriptor_type == 'moe',
            parsed.descriptor_key == expected_descriptor_key,
            not parsed.datastore,
            parsed.response_cols == ['task1', 'task2', 'task3'],
            not parsed.transformers,
            parsed.model_filter == expected_model_filter,
        ]

    outcomes = []
    for raw_inputs in (command_line_namespace_inputs, command_line_dict_inputs):
        outcomes.extend(_checks(parse.wrapper(raw_inputs)))

    assert all(outcomes)
def test_synonyms():
    """Check that synonymous parameter names yield the same extracted model parameters.

    Three configs are parsed: one using only ``AttentiveFPModel_``-prefixed
    keys, one mixing prefixed keys with ``prediction_type``/``learning_rate``,
    and a two-task classification variant.  The parameters extracted by an
    ``AutoArgumentAdder`` for ``AttentiveFPModel`` are then compared.
    """
    expected_regression = {
        "mode": "regression",
        "num_layers": 3,
        "learning_rate": 0.0007,
        "n_tasks": 1,
    }
    expected_classification = {
        "mode": "classification",
        "num_layers": 3,
        "learning_rate": 0.0007,
        "n_tasks": 2,
    }

    prefixed_cfg = {
        "AttentiveFPModel_mode": "regression",
        "AttentiveFPModel_num_layers": "3",
        "AttentiveFPModel_learning_rate": "0.0007",
        "response_cols": "asdf",
    }
    synonym_cfg = {
        "prediction_type": "regression",
        "AttentiveFPModel_num_layers": "3",
        "learning_rate": "0.0007",
        "response_cols": "asdf",
    }
    classification_cfg = {
        "prediction_type": "classification",
        "AttentiveFPModel_num_layers": "3",
        "learning_rate": "0.0007",
        "response_cols": ["asdf1", "asdf2"],
    }

    parsed_prefixed = pp.wrapper(prefixed_cfg)
    parsed_synonym = pp.wrapper(synonym_cfg)
    parsed_classification = pp.wrapper(classification_cfg)

    adder = pp.AutoArgumentAdder(pp.model_wl['AttentiveFPModel'], 'AttentiveFPModel')

    # The two regression configs must agree with each other and with the expected values.
    assert adder.extract_params(parsed_prefixed) == adder.extract_params(parsed_synonym)
    assert adder.extract_params(parsed_prefixed, strip_prefix=True) == expected_regression
    assert expected_classification == adder.extract_params(parsed_classification, strip_prefix=True)

    # The classification config must differ from both regression configs.
    assert not (adder.extract_params(parsed_prefixed) == adder.extract_params(parsed_classification))
    assert not (adder.extract_params(parsed_synonym) == adder.extract_params(parsed_classification))
def confirm_perf_table(json_f, df):
    """Assert that a one-row performance table matches the config in json_f.

    Args:
        json_f: path to the JSON config file of the trained model.
        df: performance DataFrame; must contain exactly one row, for the
            model described by json_f.

    Which columns are compared depends on ``config['model_type']``.
    """
    # Only one model should have been trained, so there is exactly one row.
    assert len(df) == 1
    perf = df.iloc[0]

    with open(json_f) as cfg_file:
        config = json.load(cfg_file)

    model_type = config['model_type']

    if model_type == 'NN':
        assert perf['best_epoch'] > 0
        assert perf['max_epochs'] == int(config['max_epochs'])
        assert perf['learning_rate'] == float(config['learning_rate'])
        assert perf['layer_sizes'] == config['layer_sizes']
        assert perf['dropouts'] == config['dropouts']
        return

    if model_type == 'RF':
        rf_columns = [col for col in df.columns if col.startswith('rf_')]
        print(perf[rf_columns])
        for key in ('rf_estimators', 'rf_max_features', 'rf_max_depth'):
            assert perf[key] == int(config[key])
        return

    if model_type == 'xgboost':
        xgb_columns = [col for col in df.columns if col.startswith('xgb_')]
        print(perf[xgb_columns])
        for key in ('xgb_gamma', 'xgb_learning_rate'):
            assert perf[key] == float(config[key])
        return

    # Any other model type must be one of the whitelisted models in pp.model_wl.
    assert model_type in pp.model_wl
    assert perf['best_epoch'] > 0
    parsed = pp.wrapper(config)
    assert perf['learning_rate'] == float(parsed.learning_rate)
def create_scaffold_split(dset_key, res_dir):
    """Create a scaffold train/valid/test split for a dataset and return its split UUID.

    Args:
        dset_key: value used for the ``dataset_key`` parameter.
        res_dir: value used for the ``result_dir`` parameter.

    Returns:
        Whatever ``ModelPipeline.split_dataset()`` returns (the split UUID).
    """
    params = dict(
        dataset_key=dset_key,
        datastore="False",
        uncertainty="False",
        splitter="scaffold",
        split_valid_frac="0.1",
        split_test_frac="0.1",
        split_strategy="train_valid_test",
        previously_split="False",
        prediction_type="classification",
        model_choice_score_type="roc_auc",
        response_cols="active",
        id_col="compound_id",
        smiles_col="base_rdkit_smiles",
        result_dir=res_dir,
        system="LC",
        transformers="True",
        model_type="NN",
        featurizer="computed_descriptors",
        descriptor_type="rdkit_raw",
        learning_rate=".0007",
        layer_sizes="512,128",
        dropouts="0.3,0.3",
        save_results="False",
        max_epochs="500",
        early_stopping_patience="50",
        verbose="False",
    )

    pipeline = mp.ModelPipeline(parse.wrapper(params))
    return pipeline.split_dataset()
def main():
    """Entry point when script is run.

    Parses the command-line arguments, prunes parameters left at their
    defaults (except those in ``keep_params``), builds the search object
    matching ``params.search_type`` and either generates the split shortlist
    or runs the search.  Exits with status 1 on an unknown search type.
    """
    print(sys.argv[1:])
    params = parse.wrapper(sys.argv[1:])

    always_kept = {
        'model_type', 'featurizer', 'splitter', 'datastore', 'save_results',
        'previously_featurized', 'descriptor_key', 'descriptor_type',
        'split_valid_frac', 'split_test_frac', 'bucket', 'lc_account',
        'slurm_time_limit', 'slurm_partition',
    }
    keep_params = always_kept | excluded_keys
    params.__dict__ = parse.prune_defaults(params, keep_params=keep_params)

    search_types = (
        ('grid', GridSearch),
        ('random', RandomSearch),
        ('geometric', GeometricSearch),
        ('user_specified', UserSpecifiedSearch),
    )
    for type_name, search_cls in search_types:
        if params.search_type == type_name:
            hs = search_cls(params)
            break
    else:
        # No known search type matched.
        print("Incorrect search type specified")
        sys.exit(1)

    if params.split_only:
        hs.generate_split_shortlist()
    else:
        hs.run_search()
def uncurated_objects(y=None):
    """Build the parameter namespace, dataset object and full data frame for the uncurated config.

    Args:
        y (list, optional): response column names assigned to
            ``params.response_cols``.  Defaults to ``["VALUE_NUM"]``.

    Returns:
        tuple: ``(params_from_ds, data, uncurated_df)`` - the parsed
        parameters, the model dataset object, and the result of
        ``data.load_full_dataset()``.
    """
    # A fresh list per call: the value is stored on the params object, so a
    # shared mutable default would leak modifications between calls.
    if y is None:
        y = ["VALUE_NUM"]

    params_from_ds = parse.wrapper(currentdir + '/config_uncurated_bp.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    uncurated_df = data.load_full_dataset()
    return params_from_ds, data, uncurated_df
def test():
    """Run the ECFP NN hyperparameter search through maestro and check the best test R^2.

    Cleans up, launches the maestro run, reads the performance results from
    the result directory named in ``nn_ecfp.json``, asserts on them, waits a
    minute for maestro to finish, and cleans up again.
    """
    # Start from a clean state.
    clean()

    # Load the hyperparameter search config.
    json_file = "nn_ecfp.json"
    with open(json_file, "r") as cfg_file:
        hp_params = json.load(cfg_file)
    pparams = parse.wrapper(hp_params)

    print('launch maestro')
    two_hours = 2 * 60 * 60
    _ = wait_to_finish("maestro run -y -p custom_gen.py run_nn_ecfp.yaml", max_time=two_hours)

    result_df = cm.get_filesystem_perf_results(pparams.result_dir, pparams.prediction_type)
    # A missing result table means the run timed out.
    assert result_df is not None
    # Should do at least this well; values like 0.687 have been observed.
    best_r2 = max(result_df['test_r2_score'].values)
    assert best_r2 > 0.6

    print('waiting for maestro to finish')
    time.sleep(60)

    # Leave a clean state behind.
    clean()
def featurize_from_shortlist(shortlist_path=None, split_json=None):
    """Featurize and split every dataset listed in a shortlist CSV.

    For each shortlist row, the base configuration is updated with that row's
    ``dataset_key`` and ``response_cols``, a ModelPipeline is built, and
    ``split_dataset()`` is called.  The resulting split file
    ``<dataset key without '.csv'>_train_valid_test_scaffold_<split_uuid>.csv``
    is then deleted.

    Args:
        shortlist_path (str): path to the shortlist CSV; it must provide
            ``dataset_key`` and ``response_cols`` columns.
        split_json (str): path to the JSON file holding the base parameters.

    Returns:
        None
    """
    sl = pd.read_csv(shortlist_path)
    with open(split_json, "r") as f:
        hp_params = json.load(f)

    print('Featurizing shortlist')
    # Drop the shortlist settings before parsing; presumably so each row is
    # handled as a single dataset - confirm against parse.wrapper.  A default
    # is supplied so a config that lacks these keys does not raise KeyError.
    hp_params.pop('use_shortlist', None)
    hp_params.pop('shortlist_key', None)

    for _, row in sl.iterrows():
        hp_params['dataset_key'] = row.dataset_key
        hp_params['response_cols'] = row.response_cols
        pparams = parse.wrapper(hp_params)

        print('-----------------------------------------------')
        print(hp_params['dataset_key'])
        print(pparams.dataset_key)
        print('-----------------------------------------------')

        # Create a ModelPipeline object, then featurize and split the dataset.
        pipe = mp.ModelPipeline(pparams)
        split_uuid = pipe.split_dataset()

        # Delete the split file to keep the dataset directory clean.
        dkey = row.dataset_key.replace('.csv', '')
        os.remove(f'{dkey}_train_valid_test_scaffold_{split_uuid}.csv')
def train_model(input, output):
    """Retrain a model described by a model_metadata.json file.

    Args:
        input (str): path to model_metadata.json file
        output (str): path to output directory

    Returns:
        The ModelPipeline object, after ``train_model()`` has been called on it.
    """
    # Read the parameter JSON file.
    with open(input) as metadata_file:
        config = json.load(metadata_file)

    params = parse.wrapper(config)
    params.result_dir = output
    # Clear the UUID; otherwise this would have the same uuid as the source model.
    params.model_uuid = None
    # Reuse the split recorded in the source model's metadata.
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']

    logger.debug(f"model params {params!s}")

    pipeline = mp.ModelPipeline(params)
    pipeline.train_model()
    return pipeline
def test_default_params_json():
    """Check that parsing the required-inputs config yields the default parameters.

    The expected namespace is ``default_parameters()`` with ``config_file``
    set to the config path; ``transformers`` must also be truthy.
    """
    config_path = currentdir + '/config_required_inputs.json'

    params = parse.wrapper(config_path)
    defaults = default_parameters()
    defaults.config_file = config_path

    checks = [
        params == defaults,
        params.transformers,
    ]
    assert all(checks)
def train_model_from_tracker(model_uuid, output_dir):
    """Retrain a model saved in the model tracker, writing results to output_dir.

    Args:
        model_uuid (str): model tracker model_uuid
        output_dir (str): path to output directory

    Returns:
        The model pipeline object with the trained model, or None when the
        model tracker is not supported in this environment.
    """
    if not mlmt_supported:
        logger.debug(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()
    collection_name = mt.get_model_collection_by_uuid(model_uuid, mlmt_client=mlmt_client)

    # Get the model metadata from the tracker.
    config = mt.get_metadata_by_uuid(model_uuid)

    # Best-effort check whether the training dataset lives in the datastore.
    # Failures (missing keys, lookup errors) are tolerated, but only ordinary
    # exceptions are caught so KeyboardInterrupt/SystemExit still propagate,
    # and the reason is logged instead of being silently discarded.
    try:
        result = dsf.retrieve_dataset_by_datasetkey(
            config['training_dataset']['dataset_key'],
            bucket=config['training_dataset']['bucket'])
        if result is not None:
            config['datastore'] = True
    except Exception as err:
        logger.debug("Datastore lookup for training dataset failed: %s", err)

    # Parse parameters.
    params = parse.wrapper(config)
    params.result_dir = output_dir
    # Clear the UUID; otherwise this would have the same uuid as the source model.
    params.model_uuid = None
    # Use the same split as the source model.
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']
    # Specify the collection the source model came from.
    params.collection_name = collection_name

    logger.debug("model params %s" % str(params))

    # Create the model pipeline and train.
    model = mp.ModelPipeline(params)
    model.train_model()

    return model
def test_train_valid_test():
    """Run ``saved_model_identity`` with the ``nn_ecfp_random.json`` config.

    The dataset key and result directory are overridden with paths relative
    to this script's directory.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(here, 'nn_ecfp_random.json')

    pparams = parse.wrapper(['--config_file', config_path])
    pparams.dataset_key = os.path.join(
        here, '../../test_datasets/aurka_chembl_base_smiles_union.csv')
    pparams.result_dir = here

    saved_model_identity(pparams)
def datastore_objects(y=None):
    """Build the parameters, a featurized and split dataset object, and its full data frame.

    Args:
        y (list, optional): response column names assigned to
            ``params.response_cols``.  Defaults to ``["PIC50"]``.

    Returns:
        tuple: ``(params_from_ds, data, dset_df)`` - the parsed parameters,
        the model dataset object (after ``get_featurized_data()`` and
        ``split_dataset()``), and the result of ``data.load_full_dataset()``.
    """
    # A fresh list per call: the value is stored on the params object, so a
    # shared mutable default would leak modifications between calls.
    if y is None:
        y = ["PIC50"]

    params_from_ds = parse.wrapper(currentdir + '/config_datastore_dset_cav12.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    dset_df = data.load_full_dataset()
    data.get_featurized_data()
    data.split_dataset()
    return params_from_ds, data, dset_df
def test_defaults():
    """Check that a synonym parameter left unset still gets its default value.

    ``learning_rate`` is not given in the config, so the parsed value is
    expected to be 0.0005.
    """
    config = {
        "prediction_type": "classification",
        "AttentiveFPModel_num_layers": "3",
        "response_cols": ["asdf1", "asdf2"],
    }
    parsed = pp.wrapper(config)

    # Make sure the default values of synonyms are still set correctly.
    expected_lr = 0.0005
    actual_lr = parsed.learning_rate
    assert actual_lr == expected_lr, f'{actual_lr} should be {expected_lr}'
def test_ecfp_nn():
    """Run ``saved_model_identity`` with the ``nn_ecfp.json`` config.

    The dataset key and result directory are overridden with paths relative
    to this script's directory, and the split UUID is set to ``'test-split'``.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    config_path = os.path.join(here, 'nn_ecfp.json')

    pparams = parse.wrapper(['--config_file', config_path])
    pparams.dataset_key = os.path.join(
        here, '../../test_datasets/aurka_chembl_base_smiles_union.csv')
    pparams.result_dir = here
    pparams.split_uuid = 'test-split'

    saved_model_identity(pparams)
def delaney_pipeline(y=None, featurizer="ecfp", split_strategy="train_valid_test", splitter="random"):
    """Create a ModelPipeline for the Delaney config with selected parameters overridden.

    Args:
        y (list, optional): response column names assigned to
            ``response_cols``.  Defaults to
            ``["measured log solubility in mols per litre"]``.
        featurizer (str): value assigned to ``featurizer``.
        split_strategy (str): value assigned to ``split_strategy``.
        splitter (str): value assigned to ``splitter``.

    Returns:
        The ``MP.ModelPipeline`` built from the adjusted parameters.
    """
    # A fresh list per call: the value is stored on the params object, so a
    # shared mutable default would leak modifications between calls.
    if y is None:
        y = ["measured log solubility in mols per litre"]

    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter

    # Named `pipeline` rather than `mp` to avoid shadowing a module alias of that name.
    pipeline = MP.ModelPipeline(inp_params)
    return pipeline
def test_create_model_wrapper():
    """Test ``model_wrapper.create_model_wrapper`` for NN, RF and invalid model types.

    Checks attribute initialization of the wrapper built from
    ``general_params`` (model_type 'NN'), that switching ``model_type`` to
    'RF' yields a ``DCRFModelWrapper``, and that an unsupported model type
    raises ``ValueError``.
    """
    inp_params = parse.wrapper(general_params)
    nn_featurization = feat.create_featurization(inp_params)
    nn_wrapper = model_wrapper.create_model_wrapper(inp_params, nn_featurization)
    nn_wrapper.setup_model_dirs()

    # Attribute initialization with model_type == "NN".
    out_dir = inp_params.output_dir
    nn_checks = [
        nn_wrapper.params.model_type == 'NN',
        isinstance(nn_wrapper.featurization, feat.DynamicFeaturization),
        nn_wrapper.output_dir == out_dir,
        nn_wrapper.model_dir == out_dir + '/' + 'model',
        nn_wrapper.best_model_dir == out_dir + '/' + 'best_model',
        nn_wrapper.baseline_model_dir == out_dir + '/' + 'baseline_epoch_model',
        nn_wrapper.transformers == [],
        nn_wrapper.transformers_x == [],
        isinstance(nn_wrapper, model_wrapper.DCNNModelWrapper),
    ]

    # Attribute initialization with model_type == "RF".
    temp_params = copy.deepcopy(inp_params)
    temp_params.model_type = 'RF'
    featurization = feat.create_featurization(temp_params)
    rf_wrapper = model_wrapper.create_model_wrapper(temp_params, featurization)
    rf_checks = [
        isinstance(rf_wrapper, MP.model_wrapper.DCRFModelWrapper),
        rf_wrapper.params.model_type == 'RF',
    ]

    # Single assertion covering every check above.
    assert all(nn_checks + rf_checks)

    # A model_type outside the supported set must raise ValueError.
    with pytest.raises(ValueError):
        temp_params.model_type = 'wrong'
        model_wrapper.create_model_wrapper(temp_params, featurization)
def wait_to_finish(json_file, max_time=600):
    """Run a hyperparameter search and return its performance table.

    Builds the hyperparam search command from the parsed parameters, runs it,
    counts the 'Submitted batch job' lines in its output, and then polls
    ``cm.get_filesystem_perf_results`` until at least that many results are
    found or ``max_time`` is exceeded.

    Args:
        json_file (str): Path to the JSON config file to run.

        max_time (int): Max wait time in seconds. Default 600. -1 is
            unlimited wait time.

    Returns:
        DataFrame or None: the result of the last poll.  None if no poll was
        made (no jobs submitted, or ``max_time`` of 0) or if the last poll
        failed.  On timeout the table may hold fewer rows than jobs submitted.
    """
    with open(json_file, "r") as f:
        hp_params = json.load(f)

    pparams = parse.wrapper(hp_params)
    script_dir = pparams.script_dir
    python_path = pparams.python_path
    result_dir = pparams.result_dir
    pred_type = pparams.prediction_type

    run_cmd = f"{python_path} {script_dir}/utils/hyperparam_search_wrapper.py --config_file {json_file}"
    # NOTE(review): splitting on spaces breaks if any of the paths contains a
    # space; kept as-is because python_path may carry extra arguments - confirm.
    # subprocess.run waits for the child to exit, so it is not left unreaped.
    completed = subprocess.run(run_cmd.split(' '), stdout=subprocess.PIPE)
    out = completed.stdout.decode("utf-8")

    num_jobs = out.count('Submitted batch job')
    num_found = 0
    time_waited = 0
    wait_interval = 30
    # Defined up front so the return below cannot hit an unbound name when
    # the loop body never runs.
    result_df = None

    print("Waiting %d jobs to finish. Checks every 30 seconds" % num_jobs)

    while (num_found < num_jobs) and ((max_time == -1) or (time_waited < max_time)):
        # Wait, then check whether the training jobs have produced results.
        time.sleep(wait_interval)
        time_waited += wait_interval

        try:
            result_df = cm.get_filesystem_perf_results(result_dir, pred_type=pred_type)
            num_found = result_df.shape[0]
        except Exception:
            # Results are not readable yet; keep polling.
            num_found = 0
            result_df = None

    return result_df
def split_and_save_dataset(self, assay_params):
    """Featurize, split and save the dataset described by assay_params.

    After the split is saved, ``assay_params`` is updated in place with
    ``previously_split = True`` and the new ``split_uuid``.

    Args:
        assay_params: parameter mapping passed to ``get_dataset_metadata``
            and ``parse.wrapper``; modified in place.
    """
    self.get_dataset_metadata(assay_params)

    # TODO: check usage with defaults
    parsed_params = parse.wrapper(assay_params)

    # TODO: Don't want to recreate each time
    dataset = model_datasets.create_model_dataset(
        parsed_params, feat.create_featurization(parsed_params))

    dataset.get_featurized_data()
    dataset.split_dataset()
    dataset.save_split_dataset()

    # Record the split so later runs can reuse it.
    assay_params['previously_split'] = True
    assay_params['split_uuid'] = dataset.split_uuid
def moe_descriptors(datastore=False):
    """Load the MAOA MOE-descriptor config and build its dataset object and data frame.

    Args:
        datastore (bool): when equal to True, the ``_ds`` variant of the
            config file is used; otherwise the plain config file is used.

    Returns:
        tuple: ``(params_desc, dataset_obj_for_desc, df)`` - the parsed
        parameters, the model dataset object, and the result of its
        ``load_full_dataset()``.
    """
    # Equality comparison (not truthiness) is kept so that truthy non-bool
    # arguments select the same config as before.
    if datastore == True:
        config_name = "/config_MAOA_moe_descriptors_ds.json"
    else:
        config_name = "/config_MAOA_moe_descriptors.json"
    params_desc = parse.wrapper(currentdir + config_name)

    featurization = feat.create_featurization(params_desc)
    dataset_obj_for_desc = model_dataset.create_model_dataset(
        params_desc, featurization, ds_client=None)
    df = dataset_obj_for_desc.load_full_dataset()

    return params_desc, dataset_obj_for_desc, df
def test():
    """Train an RF model from H1_RF.json and check that AD index columns are produced.

    The AD index is computed twice with the ``z_score`` method: once through
    the freshly trained pipeline and once through the saved model tarball.
    """
    # Start from a clean state.
    clean()

    with open("H1_RF.json", "r") as f:
        hp_params = json.load(f)

    # The script dir is the parent of the directory holding parameter_parser.py.
    # The previous str.strip("parameter_parser.py") call stripped a *set of
    # characters* from both ends rather than the file name, and only produced
    # the right path by accident.
    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(parse.__file__)))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    # Fall back to a path relative to the script dir when the key is not a file as given.
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir, params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print("Train a RF models with ECFP")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Calculate AD index with the just trained model.")
    pred_df_mp = pl.predict_on_dataframe(train_df[:10],
                                         contains_responses=True,
                                         AD_method="z_score")

    assert ("AD_index" in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'

    print("Calculate AD index with the saved model tarball file.")
    pred_df_file = pfm.predict_from_model_file(
        model_path=pl.params.model_tarball_path,
        input_df=train_df[:10],
        id_col="compound_id",
        smiles_col="rdkit_smiles",
        response_col="pKi_mean",
        dont_standardize=True,
        AD_method="z_score")

    assert ("AD_index" in pred_df_file.columns.values), 'Error: No AD_index column in pred_df_file'
def train_model_w_balan(dset_key, split_uuid, res_dir):
    """Train NN classifiers with balancing weights on a previously split dataset.

    The model is trained ``nreps`` times.  After each run, the 'best' results
    on the validation and test subsets are appended, one entry per metric, to
    the ``subset``, ``balanced``, ``metrics`` and ``vals`` lists defined
    outside this function.

    Args:
        dset_key: value used for the ``dataset_key`` parameter.
        split_uuid: UUID of the existing split to reuse.
        res_dir: value used for the ``result_dir`` parameter.
    """
    params = {
        # dataset and split
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "True",
        "split_uuid": split_uuid,
        # prediction task
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        # model and features
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "weight_transform_type": "balancing",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False",
    }
    metric_names = (
        'roc_auc_score', 'prc_auc_score', 'cross_entropy', 'precision',
        'recall_score', 'npv', 'accuracy_score', 'bal_accuracy', 'kappa',
        'matthews_cc',
    )

    for _ in range(nreps):
        pipeline = mp.ModelPipeline(parse.wrapper(params))
        pipeline.train_model()
        fitted = pipeline.model_wrapper

        for split_name in ('valid', 'test'):
            scores = fitted.get_pred_results(split_name, 'best')
            for metric in metric_names:
                subset.append(split_name)
                balanced.append('yes')
                metrics.append(metric)
                vals.append(scores[metric])
def test_predict_on_dataframe():
    '''
    Test that predictions come back in the same order as the (shuffled) input,
    and that their accuracy against the actual responses exceeds 0.5.
    '''
    model_path = '../../examples/BSEP/models/bsep_classif_scaffold_split.tar.gz'
    csv_path = '../../examples/BSEP/data/ChEMBL25_BSEP_curated_data.csv'

    # Column names in the input CSV.
    id_col = 'compound_id'
    smiles_col = 'base_rdkit_smiles'
    response_col = 'active'
    conc_col = None

    # Prediction settings.
    is_featurized = False
    dont_standardize = False
    AD_method = None
    k = 5
    dist_metric = "euclidean"

    # Shuffle the rows so that an order-preserving prediction is a meaningful check.
    shuffled_df = pd.read_csv(csv_path, dtype={id_col: str}).sample(frac=1)

    input_df, pred_params = pfm._prepare_input_data(
        shuffled_df, id_col, smiles_col, response_col, conc_col, dont_standardize)

    has_responses = ('response_cols' in pred_params)
    pred_params = parse.wrapper(pred_params)

    pipeline = mp.create_prediction_pipeline_from_file(
        pred_params, reload_dir=None, model_path=model_path)
    pred_df = pipeline.predict_on_dataframe(
        input_df,
        contains_responses=has_responses,
        is_featurized=is_featurized,
        AD_method=AD_method,
        k=k,
        dist_metric=dist_metric)

    input_ids = shuffled_df[id_col].values
    predicted_ids = pred_df[id_col].values

    rows_match = all(new == old for new, old in zip(predicted_ids, input_ids))
    print(rows_match)
    assert rows_match

    score = skm.accuracy_score(shuffled_df[response_col].values, pred_df['pred'].values)
    print(score)
    assert score > 0.5
def verify_saved_params(original_json_f, tar_f):
    '''
    Compare the parameters saved in a model tarball with the original JSON config.

    Args:
        original_json_f: path to the original JSON config file.
        tar_f: path to the gzipped model tarball; it must contain
            model_metadata.json at its top level.

    Asserts that the extracted model parameters and the extracted featurizer
    parameters are equal between the two.
    '''
    # The temporary directory is removed on exit and the tar handle is closed
    # even if extraction fails; previously the mkdtemp directory was leaked.
    with tempfile.TemporaryDirectory() as reload_dir:
        with tarfile.open(tar_f, mode='r:gz') as model_fp:
            # NOTE(review): extractall trusts member paths inside the archive;
            # only use this on tarballs from a trusted source.
            model_fp.extractall(path=reload_dir)

        # Read the config stored in the tar file.
        config_file_path = os.path.join(reload_dir, 'model_metadata.json')
        with open(config_file_path) as f:
            tar_config = json.load(f)

    # Read the original config.
    with open(original_json_f) as f:
        original_config = json.load(f)

    original_pp = parse.wrapper(original_config)
    original_model_params = parse.extract_model_params(original_pp)
    original_feat_params = parse.extract_featurizer_params(original_pp)

    tar_pp = parse.wrapper(tar_config)
    tar_model_params = parse.extract_model_params(tar_pp)
    tar_feat_params = parse.extract_featurizer_params(tar_pp)

    print('-----------------------------------')
    print('model params')
    print(original_model_params)
    print(tar_model_params)
    assert original_model_params == tar_model_params

    print('-----------------------------------')
    print('feat params')
    print(original_feat_params)
    print(tar_feat_params)
    assert original_feat_params == tar_feat_params
def test_correct_input_type_json():
    """Check the values and types parsed from ``config_list_inputs.json``.

    All comparisons are collected into one list and asserted together.
    """
    params = parse.wrapper(currentdir + '/config_list_inputs.json')

    checks = [
        params.system == 'twintron-blue',
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv',
        params.layer_sizes == [42, 42],
        params.batch_size == 63,
        params.previously_split,
        params.descriptor_type == 'moe',
        not params.datastore,
        params.model_type == ['NN', 'RF'],
        params.response_cols == ['task1', 'task2', 'task3'],
        not params.transformers,
    ]
    assert all(checks)
def test_hierarchical_dict():
    """Check the values parsed from ``hierarchical_input_dict``.

    All comparisons are collected into one list and asserted together.
    """
    params = parse.wrapper(hierarchical_input_dict)

    checks = [
        params.system == 'twintron-blue',
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv',
        params.layer_sizes == [[42, 42]],
        params.batch_size == 63,
        params.previously_split,
        params.descriptor_type == 'moe',
        not params.datastore,
        params.model_type == ['NN', 'RF'],
        params.response_cols == ['task1', 'task2', 'task3'],
        not params.transformers,
    ]
    assert all(checks)
def test():
    """Train a hybrid model from H1_hybrid.json and check its performance and predictions.

    The validation r2_score of the best epoch must exceed 0.4, and predictions
    for the first 10 training rows must be finite, one per row.
    """
    # Start from a clean state.
    clean()

    with open("H1_hybrid.json", "r") as f:
        hp_params = json.load(f)

    # The script dir is the parent of the directory holding parameter_parser.py.
    # The previous str.strip("parameter_parser.py") call stripped a *set of
    # characters* from both ends rather than the file name, and only produced
    # the right path by accident.
    script_dir = os.path.dirname(os.path.dirname(os.path.abspath(parse.__file__)))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    # Fall back to a path relative to the script dir when the key is not a file as given.
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir, params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print("Train a hybrid models with MOE descriptors")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Check the model performance on validation data")
    pred_data = pl.model_wrapper.get_perf_data(subset="valid", epoch_label="best")
    pred_results = pred_data.get_prediction_results()
    print(pred_results)

    pred_score = pred_results['r2_score']
    score_threshold = 0.4
    assert pred_score > score_threshold, \
        f'Error: Score is too low {pred_score}. Must be higher than {score_threshold}'

    print("Make predictions with the hyrid model")
    predict = pl.predict_on_dataframe(train_df[:10], contains_responses=False)
    assert (predict['pred'].shape[0] == 10), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(predict['pred'].values))), 'Error: Predictions are not numbers'
def train_and_get_tar(input_json, ds_key_file):
    """Train a model from a config file and return the newest tarball it produced.

    Args:
        input_json (str): config file name, relative to this script's directory.
        ds_key_file (str): dataset file path, relative to this script's directory.

    Returns:
        str: path of the most recently changed ``*.gz`` file (by
        ``os.path.getctime``) in the ``result`` directory next to this script.
    """
    script_path = os.path.dirname(os.path.realpath(__file__))
    result_dir = os.path.join(script_path, 'result')
    json_file = os.path.join(script_path, input_json)

    pparams = parse.wrapper(['--config_file', json_file])
    pparams.dataset_key = os.path.join(script_path, ds_key_file)
    pparams.result_dir = result_dir

    train_pipe = mp.ModelPipeline(pparams)
    train_pipe.train_model()

    # Search the directory the pipeline was told to write to.  The previous
    # './result/*.gz' pattern depended on the current working directory and
    # only matched when the caller ran from the script's own directory.
    list_of_files = glob.glob(os.path.join(result_dir, '*.gz'))
    latest_file = max(list_of_files, key=os.path.getctime)
    return latest_file
def test_correct_input_mixed_command_line_types():
    """Check the values parsed from ``config_inputs``.

    All comparisons are collected into one list and asserted together.
    """
    params = parse.wrapper(config_inputs)

    checks = [
        params.system == 'twintron-blue',
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv',
        params.layer_sizes == [[42, 42]],
        params.batch_size == 63,
        params.previously_split,
        params.descriptor_type == 'moe',
        not params.datastore,
        params.response_cols == ['one1', 'two2', 'three3'],
        params.baseline_epoch == 78,
        params.splitter == 'random',
        not params.transformers,
    ]
    assert all(checks)
def predict_from_model_file(model_path, input_df, id_col='compound_id', smiles_col='rdkit_smiles',
                            response_col=None, is_featurized=False, dont_standardize=False,
                            AD_method=None, k=5, dist_metric="euclidean"):
    """Load a pretrained model from a tarball and run predictions on an input data frame.

    Args:
        model_path (str): File path of the model tarball file.

        input_df (DataFrame): Input data to run predictions on; must at minimum
            contain SMILES strings.

        id_col (str): Name of the column containing compound IDs. If none is
            provided, sequential IDs will be generated.

        smiles_col (str): Name of the column containing SMILES strings; required.

        response_col (str): Name of an optional column containing actual
            response values; if it is provided, the actual values will be
            included in the returned data frame to make it easier to assess
            performance.

        is_featurized (bool): Passed through to ``predict_full_dataset``;
            presumably indicates that input_df already contains computed
            features - confirm against the pipeline.

        dont_standardize (bool): By default, SMILES strings are salt-stripped
            and standardized using RDKit; if you have already done this, or
            don't want them to be standardized, set dont_standardize to True.

        AD_method (str): With the default (None), the applicability domain (AD)
            index is not calculated; use z_score or local_density to choose
            the method used to calculate the AD index.

        k (int): Number of nearest neighbors used to evaluate the AD index;
            default is 5.

        dist_metric (str): Distance metric; valid values are 'cityblock',
            'cosine', 'euclidean', 'jaccard', 'manhattan'.

    Returns:
        A data frame, sorted by id_col, with compound IDs, SMILES strings and
        predicted response values. Actual response values will be included if
        response_col is provided. Standard prediction error estimates will be
        included if the model was trained with uncertainty=True. Note that the
        predicted and actual response columns will be labeled according to the
        response_col setting in the original training data, not the
        response_col passed to this function; e.g. if the original model
        response_col was 'pIC50', the returned data frame will contain columns
        'pIC50_actual', 'pIC50_pred' and 'pIC50_std'.
    """
    input_df, raw_params = _prepare_input_data(input_df, id_col, smiles_col, response_col,
                                               dont_standardize)

    # Checked on the raw parameter dict, before it is parsed.
    has_responses = ('response_cols' in raw_params)
    pred_params = parse.wrapper(raw_params)

    pipeline = mp.create_prediction_pipeline_from_file(pred_params, reload_dir=None,
                                                       model_path=model_path)
    predictions = pipeline.predict_full_dataset(
        input_df,
        contains_responses=has_responses,
        is_featurized=is_featurized,
        dset_params=pred_params,
        AD_method=AD_method,
        k=k,
        dist_metric=dist_metric)

    return predictions.sort_values(by=id_col)