Example #1
def test_command_line_namespace_and_dict_input():

    params = parse.wrapper(command_line_namespace_inputs)
    test = []
    test.append(
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv')
    test.append(params.layer_sizes == [[42, 42]])
    test.append(params.batch_size == 63)
    test.append(params.previously_split)
    test.append(params.descriptor_type == 'moe')
    test.append(
        params.descriptor_key ==
        '/ds/projdata/gsk_data/GSK_Descriptors/all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi.csv'
    )
    test.append(not params.datastore)
    test.append(params.response_cols == ['task1', 'task2', 'task3'])
    test.append(not params.transformers)
    test.append(
        params.model_filter == {
            'model_uuid': 'uuid_1',
            'ModelMetadata.TrainingDataset.dataset_key': "['=', 2]",
            'ModelMetadata.TrainingDataset.dataset_bucket': "['>', 2]",
            'ModelMetadata.TrainingDataset.dataset_oid': "['>=', 2]",
            'ModelMetadata.TrainingDataset.class_names': "['in', [1,2,3]]",
            'ModelMetadata.TrainingDataset.num_classes': "['<', 2]",
            'ModelMetadata.TrainingDataset.feature_transform_type':
            "['<=', 2]",
            'ModelMetadata.TrainingDataset.response_transform_type':
            "['!=', 3]",
            'ModelMetadata.TrainingDataset.id_col': "['nin', [0,1,3,4]]"
        })
    params = parse.wrapper(command_line_dict_inputs)
    test.append(
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv')
    test.append(params.layer_sizes == [[42, 42]])
    test.append(params.batch_size == 63)
    test.append(params.previously_split)
    test.append(params.descriptor_type == 'moe')
    test.append(
        params.descriptor_key ==
        '/ds/projdata/gsk_data/GSK_Descriptors/all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi.csv'
    )
    test.append(not params.datastore)
    test.append(params.response_cols == ['task1', 'task2', 'task3'])
    test.append(not params.transformers)
    test.append(
        params.model_filter == {
            'model_uuid': 'uuid_1',
            'ModelMetadata.TrainingDataset.dataset_key': "['=', 2]",
            'ModelMetadata.TrainingDataset.dataset_bucket': "['>', 2]",
            'ModelMetadata.TrainingDataset.dataset_oid': "['>=', 2]",
            'ModelMetadata.TrainingDataset.class_names': "['in', [1,2,3]]",
            'ModelMetadata.TrainingDataset.num_classes': "['<', 2]",
            'ModelMetadata.TrainingDataset.feature_transform_type':
            "['<=', 2]",
            'ModelMetadata.TrainingDataset.response_transform_type':
            "['!=', 3]",
            'ModelMetadata.TrainingDataset.id_col': "['nin', [0,1,3,4]]"
        })
    assert all(test)
def test_synonyms():
    answer_a = {
        "mode": "regression",
        "num_layers": 3,
        "learning_rate": 0.0007,
        "n_tasks": 1,
    }

    answer_c = {
        "mode": "classification",
        "num_layers": 3,
        "learning_rate": 0.0007,
        "n_tasks": 2,
    }

    json_a = {
        "AttentiveFPModel_mode": "regression",
        "AttentiveFPModel_num_layers": "3",
        "AttentiveFPModel_learning_rate": "0.0007",
        "response_cols": "asdf"
    }

    json_b = {
        "prediction_type": "regression",
        "AttentiveFPModel_num_layers": "3",
        "learning_rate": "0.0007",
        "response_cols": "asdf"
    }

    json_c = {
        "prediction_type": "classification",
        "AttentiveFPModel_num_layers": "3",
        "learning_rate": "0.0007",
        "response_cols": ["asdf1", "asdf2"]
    }

    params_a = pp.wrapper(json_a)
    params_b = pp.wrapper(json_b)
    params_c = pp.wrapper(json_c)

    aaa = pp.AutoArgumentAdder(pp.model_wl['AttentiveFPModel'],
                               'AttentiveFPModel')

    assert aaa.extract_params(params_a) == aaa.extract_params(params_b)
    assert aaa.extract_params(params_a, strip_prefix=True) == answer_a
    assert answer_c == aaa.extract_params(params_c, strip_prefix=True)
    assert aaa.extract_params(params_a) != aaa.extract_params(params_c)
    assert aaa.extract_params(params_b) != aaa.extract_params(params_c)
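
For reference, a minimal sketch of the synonym handling exercised above, assuming the same `pp` (parameter_parser) import as in this test: model-prefixed keys and their bare synonyms parse to the same extracted parameter dict.

# Minimal sketch; assumes `pp` is the parameter_parser module used above.
json_prefixed = {
    "AttentiveFPModel_mode": "regression",
    "AttentiveFPModel_num_layers": "3",
    "response_cols": "asdf",
}
params = pp.wrapper(json_prefixed)
adder = pp.AutoArgumentAdder(pp.model_wl['AttentiveFPModel'], 'AttentiveFPModel')
# strip_prefix=True removes the 'AttentiveFPModel_' prefix from each key
print(adder.extract_params(params, strip_prefix=True))
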
def confirm_perf_table(json_f, df):
    '''
    df should contain one entry for the model specified by json_f

    checks to see if the parameters extracted match what's in config
    '''
    # should only have trained one model
    assert len(df) == 1
    # the one row
    row = df.iloc[0]

    with open(json_f) as f:
        config = json.load(f)

    model_type = config['model_type']
    if model_type == 'NN':
        assert row['best_epoch'] > 0
        assert row['max_epochs'] == int(config['max_epochs'])
        assert row['learning_rate'] == float(config['learning_rate'])
        assert row['layer_sizes'] == config['layer_sizes']
        assert row['dropouts'] == config['dropouts']
    elif model_type == 'RF':
        print(row[[c for c in df.columns if c.startswith('rf_')]])
        assert row['rf_estimators'] == int(config['rf_estimators'])
        assert row['rf_max_features'] == int(config['rf_max_features'])
        assert row['rf_max_depth'] == int(config['rf_max_depth'])
    elif model_type == 'xgboost':
        print(row[[c for c in df.columns if c.startswith('xgb_')]])
        assert row['xgb_gamma'] == float(config['xgb_gamma'])
        assert row['xgb_learning_rate'] == float(config['xgb_learning_rate'])
    else:
        assert model_type in pp.model_wl
        assert row['best_epoch'] > 0
        pparams = pp.wrapper(config)
        assert row['learning_rate'] == float(pparams.learning_rate)
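
A hedged usage sketch for `confirm_perf_table`: the single-row performance table it expects is what other examples in this listing obtain from `cm.get_filesystem_perf_results` after one training run (the config path below is a placeholder, and the config is assumed to carry `result_dir` and `prediction_type` keys as the configs shown here do).

# Sketch only; 'nn_ecfp.json' is a placeholder config from this listing.
json_f = 'nn_ecfp.json'
with open(json_f) as f:
    config = json.load(f)
perf_df = cm.get_filesystem_perf_results(config['result_dir'],
                                         config['prediction_type'])
confirm_perf_table(json_f, perf_df)  # passes only if the one model matches config
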
def create_scaffold_split(dset_key, res_dir):
    params = {
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "False",
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False"
    }

    pparams = parse.wrapper(params)
    MP = mp.ModelPipeline(pparams)

    split_uuid = MP.split_dataset()
    return split_uuid
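
A short sketch of how the returned `split_uuid` is typically reused, mirroring `train_model_w_balan` later in this listing: flip `previously_split` on and pass the UUID so later models train on the identical scaffold split.

# Sketch only; dset_key and res_dir are placeholders, and `params` is a
# dict like the one built inside create_scaffold_split().
split_uuid = create_scaffold_split(dset_key, res_dir)
params["previously_split"] = "True"
params["split_uuid"] = split_uuid
pparams = parse.wrapper(params)
mp.ModelPipeline(pparams).train_model()
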
def main():
    """Entry point when script is run"""
    print(sys.argv[1:])
    params = parse.wrapper(sys.argv[1:])
    keep_params = {
        'model_type', 'featurizer', 'splitter', 'datastore', 'save_results',
        'previously_featurized', 'descriptor_key', 'descriptor_type',
        'split_valid_frac', 'split_test_frac', 'bucket', 'lc_account',
        'slurm_time_limit', 'slurm_partition'
    } | excluded_keys
    params.__dict__ = parse.prune_defaults(params, keep_params=keep_params)
    if params.search_type == 'grid':
        hs = GridSearch(params)
    elif params.search_type == 'random':
        hs = RandomSearch(params)
    elif params.search_type == 'geometric':
        hs = GeometricSearch(params)
    elif params.search_type == 'user_specified':
        hs = UserSpecifiedSearch(params)
    else:
        print("Incorrect search type specified")
        sys.exit(1)
    if params.split_only:
        hs.generate_split_shortlist()
    else:
        hs.run_search()
Example #6
def uncurated_objects(y=["VALUE_NUM"]):
    params_from_ds = parse.wrapper(currentdir + '/config_uncurated_bp.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    uncurated_df = data.load_full_dataset()
    return params_from_ds, data, uncurated_df
Example #7
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    clean()

    # Run ECFP NN hyperparam search
    # ------------
    json_file = "nn_ecfp.json"
    with open(json_file, "r") as f:
        hp_params = json.load(f)
    pparams = parse.wrapper(hp_params)

    print('launch maestro')
    _ = wait_to_finish(f"maestro run -y -p custom_gen.py run_nn_ecfp.yaml",
                       max_time=2 * 60 * 60)  # wait 2 hours.

    result_df = cm.get_filesystem_perf_results(pparams.result_dir,
                                               pparams.prediction_type)
    assert result_df is not None  # Timed out
    assert max(result_df['test_r2_score'].values
               ) > 0.6  # should do at least this well. I saw values like 0.687

    print('waiting for maestro to finish')
    time.sleep(60)

    # Clean
    # -----
    clean()
def featurize_from_shortlist(shortlist_path=None, split_json=None):
    """
    Featurize and split the ChEMBL hERG pIC50 dataset. Then create a config
    file for running a hyperparameter search to model this dataset.
    """
    sl = pd.read_csv(shortlist_path)
    with open(split_json, "r") as f:
        hp_params = json.load(f)

    print('Featurizing shortlist')
    hp_params.pop('use_shortlist')
    hp_params.pop('shortlist_key')

    for i, row in sl.iterrows():
        hp_params['dataset_key'] = row.dataset_key
        hp_params['response_cols'] = row.response_cols
        pparams = parse.wrapper(hp_params)

        print('-----------------------------------------------')
        print(hp_params['dataset_key'])
        print(pparams.dataset_key)
        print('-----------------------------------------------')

        # Create a ModelPipeline object
        pipe = mp.ModelPipeline(pparams)

        # Featurize and split the dataset
        split_uuid = pipe.split_dataset()

        # Delete split file to keep it cleaner
        rdir = hp_params['result_dir']
        dkey = row.dataset_key.replace('.csv', '')
        os.remove(f'{dkey}_train_valid_test_scaffold_{split_uuid}.csv')
Example #9
def train_model(input, output):
    """ Retrain a model saved in a model_metadata.json file

    Args:
        input (str): path to model_metadata.json file

        output (str): path to output directory

    Returns:
        the ModelPipeline object with the trained model
    """
    # Train model
    # -----------
    # Read parameter JSON file
    with open(input) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
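
A minimal usage sketch, assuming a model_metadata.json saved by a previous run (both paths are placeholders):

# Hypothetical paths; point these at a real model_metadata.json and an
# output directory of your choice.
retrained = train_model('model_metadata.json', './retrained_model')
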
Example #10
def test_default_params_json():
    params = parse.wrapper(currentdir + '/config_required_inputs.json')
    defaults = default_parameters()
    defaults.config_file = currentdir + '/config_required_inputs.json'
    test = []
    test.append(params == defaults)
    test.append(params.transformers)
    assert all(test)
Example #11
def train_model_from_tracker(model_uuid, output_dir):
    """ Retrain a model saved in the model tracker, but save it to output_dir and don't insert it into the model tracker

    Args:
        model_uuid (str): model tracker model_uuid file

        output_dir (str): path to output directory

    Returns:
        the model pipeline object with trained model
    """

    if not mlmt_supported:
        logger.debug(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    collection_name = mt.get_model_collection_by_uuid(model_uuid,
                                                      mlmt_client=mlmt_client)

    # get metadata from tracker
    config = mt.get_metadata_by_uuid(model_uuid)

    # check if datastore dataset
    try:
        result = dsf.retrieve_dataset_by_datasetkey(
            config['training_dataset']['dataset_key'],
            bucket=config['training_dataset']['bucket'])
        if result is not None:
            config['datastore'] = True
    except Exception:
        pass
    # fix weird old parameters
    #if config[]
    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output_dir
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']
    # specify collection
    params.collection_name = collection_name

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
Example #12
def test_train_valid_test():
    script_path = os.path.dirname(os.path.realpath(__file__))
    json_file = os.path.join(script_path, 'nn_ecfp_random.json')

    pparams = parse.wrapper(['--config_file', json_file])
    pparams.dataset_key = os.path.join(script_path, 
        '../../test_datasets/aurka_chembl_base_smiles_union.csv')
    pparams.result_dir = script_path

    saved_model_identity(pparams)
Example #13
def datastore_objects(y=["PIC50"]):
    params_from_ds = parse.wrapper(currentdir +
                                   '/config_datastore_dset_cav12.json')
    params_from_ds.response_cols = y
    featurization = feat.create_featurization(params_from_ds)
    data = model_dataset.create_model_dataset(params_from_ds, featurization)
    dset_df = data.load_full_dataset()
    data.get_featurized_data()
    data.split_dataset()
    return params_from_ds, data, dset_df
def test_defaults():
    json_d = {
        "prediction_type": "classification",
        "AttentiveFPModel_num_layers": "3",
        "response_cols": ["asdf1", "asdf2"]
    }

    params_d = pp.wrapper(json_d)
    # make sure that the default value of synonyms are still set correctly
    expected_lr = 0.0005
    assert params_d.learning_rate == expected_lr, f'{params_d.learning_rate} should be {expected_lr}'
Example #15
def test_ecfp_nn():
    script_path = os.path.dirname(os.path.realpath(__file__))
    json_file = os.path.join(script_path, 'nn_ecfp.json')

    pparams = parse.wrapper(['--config_file', json_file])
    pparams.dataset_key = os.path.join(script_path, 
        '../../test_datasets/aurka_chembl_base_smiles_union.csv')
    pparams.result_dir = script_path
    pparams.split_uuid = 'test-split'

    saved_model_identity(pparams)
Example #16
def delaney_pipeline(y=["measured log solubility in mols per litre"],
                     featurizer="ecfp",
                     split_strategy="train_valid_test",
                     splitter="random"):
    delaney_inp_file = currentdir + '/config_delaney.json'
    inp_params = parse.wrapper(delaney_inp_file)
    inp_params.response_cols = y
    inp_params.featurizer = featurizer
    inp_params.split_strategy = split_strategy
    inp_params.splitter = splitter
    mp = MP.ModelPipeline(inp_params)
    return mp
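
A hedged usage sketch: the fixture above returns an unsplit ModelPipeline, so a typical next step, as in the other examples here, is to split the dataset.

# Sketch only; argument values mirror those used elsewhere in this listing.
pipe = delaney_pipeline(featurizer="ecfp", splitter="scaffold")
split_uuid = pipe.split_dataset()
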
def test_create_model_wrapper():
    """
        Args:
        params (Namespace) : Parameters passed to the model pipeline
                     featurizer (Featurization): Object managing the featurization of compounds
                                 ds_client (DatastoreClient): Interface to the file datastore

                                                  Returns:
                                                  model (pipeline.Model): Wrapper for DeepChem, sklearn or other model.

                                                              Raises:
ValueError: Only params.model_type = 'NN' or 'RF' is supported. 

Dependencies:
None

Calls:
DCNNModelWrapper, DCRFModelWrapper
    """
    inp_params = parse.wrapper(general_params)
    featurization = feat.create_featurization(inp_params)
    mdl = model_wrapper.create_model_wrapper(inp_params, featurization)
    mdl.setup_model_dirs()
    # testing for correct attribute initialization with model_type == "NN"
    test = []
    test.append(mdl.params.model_type == 'NN')
    test.append(isinstance(mdl.featurization, feat.DynamicFeaturization))
    test.append(mdl.output_dir == inp_params.output_dir)
    test.append(mdl.model_dir == inp_params.output_dir + '/' + 'model')
    test.append(mdl.best_model_dir == inp_params.output_dir + '/' +
                'best_model')
    test.append(mdl.baseline_model_dir == inp_params.output_dir + '/' +
                'baseline_epoch_model')
    test.append(mdl.transformers == [])
    test.append(mdl.transformers_x == [])
    test.append(isinstance(mdl, model_wrapper.DCNNModelWrapper))

    # testing for correct attribute initialization with model_type == "RF"
    temp_params = copy.deepcopy(inp_params)
    temp_params.model_type = 'RF'
    featurization = feat.create_featurization(temp_params)
    mdl_RF = model_wrapper.create_model_wrapper(temp_params, featurization)
    test.append(isinstance(mdl_RF, MP.model_wrapper.DCRFModelWrapper))
    test.append(mdl_RF.params.model_type == 'RF')

    # assertion for all tests
    assert all(test)

    #testing for Exception with model_type not in ['NN','RF']
    with pytest.raises(ValueError):
        temp_params.model_type = 'wrong'
        mdl_wrong = model_wrapper.create_model_wrapper(temp_params,
                                                       featurization)
def wait_to_finish(json_file, max_time=600):
    """ Run hyperparam search and return pref_df

    Given parased parameter namespace build the hyperparam search command and
    wait for training to complete. Once training is complete, retrun the perf_df.
    This function repeatedly calls get_filesystem_perf_results until it sees
    at least the number of jobs generated by pparams.

    Args:
        json_file (str): Path to json_file to run.

        max_type (int): Max wait time in seconds. Default 600. -1 is unlimited
            wait time.

    Returns:
        DataFrame or None: returns perf_df if training completes in time. 

    """
    with open(json_file, "r") as f:
        hp_params = json.load(f)

    pparams = parse.wrapper(hp_params)

    script_dir = pparams.script_dir
    python_path = pparams.python_path
    result_dir = pparams.result_dir
    pred_type = pparams.prediction_type

    run_cmd = f"{python_path} {script_dir}/utils/hyperparam_search_wrapper.py --config_file {json_file}"
    #    os.system(run_cmd)
    p = subprocess.Popen(run_cmd.split(' '), stdout=subprocess.PIPE)
    out = p.stdout.read().decode("utf-8")

    num_jobs = out.count('Submitted batch job')
    num_found = 0
    time_waited = 0
    wait_interval = 30

    print("Waiting %d jobs to finish. Checks every 30 seconds" % num_jobs)
    while (num_found < num_jobs) and ((max_time == -1) or
                                      (time_waited < max_time)):
        # wait until the training jobs have finished
        time.sleep(wait_interval)  # check for results every 30 seconds
        time_waited += wait_interval
        try:
            result_df = cm.get_filesystem_perf_results(result_dir,
                                                       pred_type=pred_type)
            num_found = result_df.shape[0]
        except Exception:
            num_found = 0
            result_df = None

    return result_df
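
A usage sketch, assuming a hyperparameter search config like the `nn_ecfp.json` used elsewhere in this listing:

# Wait up to an hour for the search launched from nn_ecfp.json to finish.
perf_df = wait_to_finish('nn_ecfp.json', max_time=3600)
assert perf_df is not None, 'hyperparameter search timed out'
# test_r2_score is the column checked in the maestro example above
print(perf_df['test_r2_score'].max())
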
Example #19
def split_and_save_dataset(self, assay_params):
    self.get_dataset_metadata(assay_params)
    #TODO: check usage with defaults
    namespace_params = parse.wrapper(assay_params)
    #TODO: Don't want to recreate each time
    featurization = feat.create_featurization(namespace_params)
    data = model_datasets.create_model_dataset(namespace_params,
                                               featurization)
    data.get_featurized_data()
    data.split_dataset()
    data.save_split_dataset()
    assay_params['previously_split'] = True
    assay_params['split_uuid'] = data.split_uuid
Example #20
def moe_descriptors(datastore=False):
    if datastore:
        params_ds = parse.wrapper(currentdir +
                                  "/config_MAOA_moe_descriptors_ds.json")
    else:
        params_file = parse.wrapper(currentdir +
                                    "/config_MAOA_moe_descriptors.json")


#         if not os.path.isfile(params_file.dataset_key):
#             os.makedirs('pytest/config_MAOA_moe_descriptors/moe_descriptors', exist_ok=True)
#             copyfile(params_ds.dataset_key, params_file.dataset_key)
    if datastore:
        params_desc = params_ds
    else:
        params_desc = params_file
    featurization = feat.create_featurization(params_desc)
    dataset_obj_for_desc = model_dataset.create_model_dataset(params_desc,
                                                              featurization,
                                                              ds_client=None)
    df = dataset_obj_for_desc.load_full_dataset()
    return params_desc, dataset_obj_for_desc, df
Example #21
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_RF.json", "r") as f:
        hp_params = json.load(f)

    # str.strip() removes a character set, not a suffix; derive the path safely
    script_dir = os.path.dirname(parse.__file__).replace("/pipeline", "")
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir,
                                          params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print(f"Train a RF models with ECFP")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Calculate AD index with the just trained model.")
    pred_df_mp = pl.predict_on_dataframe(train_df[:10],
                                         contains_responses=True,
                                         AD_method="z_score")

    assert (
        "AD_index"
        in pred_df_mp.columns.values), 'Error: No AD_index column pred_df_mp'

    print("Calculate AD index with the saved model tarball file.")
    pred_df_file = pfm.predict_from_model_file(
        model_path=pl.params.model_tarball_path,
        input_df=train_df[:10],
        id_col="compound_id",
        smiles_col="rdkit_smiles",
        response_col="pKi_mean",
        dont_standardize=True,
        AD_method="z_score")
    assert ("AD_index" in pred_df_file.columns.values
            ), 'Error: No AD_index column in pred_df_file'
def train_model_w_balan(dset_key, split_uuid, res_dir):
    # Now train models on the same dataset with balancing weights
    params = {
        "dataset_key": dset_key,
        "datastore": "False",
        "uncertainty": "False",
        "splitter": "scaffold",
        "split_valid_frac": "0.1",
        "split_test_frac": "0.1",
        "split_strategy": "train_valid_test",
        "previously_split": "True",
        "split_uuid": split_uuid,
        "prediction_type": "classification",
        "model_choice_score_type": "roc_auc",
        "response_cols": "active",
        "id_col": "compound_id",
        "smiles_col": "base_rdkit_smiles",
        "result_dir": res_dir,
        "system": "LC",
        "transformers": "True",
        "model_type": "NN",
        "featurizer": "computed_descriptors",
        "descriptor_type": "rdkit_raw",
        "weight_transform_type": "balancing",
        "learning_rate": ".0007",
        "layer_sizes": "512,128",
        "dropouts": "0.3,0.3",
        "save_results": "False",
        "max_epochs": "500",
        "early_stopping_patience": "50",
        "verbose": "False"
    }

    for i in range(nreps):
        pparams = parse.wrapper(params)
        MP = mp.ModelPipeline(pparams)
        MP.train_model()
        wrapper = MP.model_wrapper

        for ss in ['valid', 'test']:
            metvals = wrapper.get_pred_results(ss, 'best')
            for metric in [
                    'roc_auc_score', 'prc_auc_score', 'cross_entropy',
                    'precision', 'recall_score', 'npv', 'accuracy_score',
                    'bal_accuracy', 'kappa', 'matthews_cc'
            ]:
                subset.append(ss)
                balanced.append('yes')
                metrics.append(metric)
                vals.append(metvals[metric])
def test_predict_on_dataframe():
    '''
    test that predict_from_model makes predictions in the same
    order as the input
    '''

    model_path = '../../examples/BSEP/models/bsep_classif_scaffold_split.tar.gz'
    csv_path = '../../examples/BSEP/data/ChEMBL25_BSEP_curated_data.csv'

    id_col = 'compound_id'
    smiles_col = 'base_rdkit_smiles'
    response_col = 'active'
    conc_col = None
    is_featurized = False
    dont_standardize = False
    AD_method = None
    k = 5
    dist_metric = "euclidean"

    df = pd.read_csv(csv_path, dtype={id_col: str})
    shuffled_df = df.sample(frac=1)

    input_df, pred_params = pfm._prepare_input_data(shuffled_df, id_col,
                                                    smiles_col, response_col,
                                                    conc_col, dont_standardize)

    has_responses = ('response_cols' in pred_params)
    pred_params = parse.wrapper(pred_params)

    pipe = mp.create_prediction_pipeline_from_file(pred_params,
                                                   reload_dir=None,
                                                   model_path=model_path)
    pred_df = pipe.predict_on_dataframe(input_df,
                                        contains_responses=has_responses,
                                        is_featurized=is_featurized,
                                        AD_method=AD_method,
                                        k=k,
                                        dist_metric=dist_metric)

    old_id_col = shuffled_df[id_col].values
    new_id_col = pred_df[id_col].values

    match_rows = all(n == o for n, o in zip(new_id_col, old_id_col))
    print(match_rows)
    assert match_rows

    score = skm.accuracy_score(shuffled_df[response_col].values,
                               pred_df['pred'].values)
    print(score)
    assert score > 0.5
def verify_saved_params(original_json_f, tar_f):
    '''
    compares saved params in a tar file with original json
    '''
    reload_dir = tempfile.mkdtemp()
    model_fp = tarfile.open(tar_f, mode='r:gz')
    model_fp.extractall(path=reload_dir)
    model_fp.close()

    # read config from tar file
    config_file_path = os.path.join(reload_dir, 'model_metadata.json')
    with open(config_file_path) as f:
        tar_config = json.loads(f.read())

    # read original config
    with open(original_json_f) as f:
        original_config = json.loads(f.read())

    original_pp = parse.wrapper(original_config)
    original_model_params = parse.extract_model_params(original_pp)
    original_feat_params = parse.extract_featurizer_params(original_pp)

    tar_pp = parse.wrapper(tar_config)
    tar_model_params = parse.extract_model_params(tar_pp)
    tar_feat_params = parse.extract_featurizer_params(tar_pp)

    print('-----------------------------------')
    print('model params')
    print(original_model_params)
    print(tar_model_params)
    assert original_model_params == tar_model_params
    print('-----------------------------------')
    print('feat params')
    print(original_feat_params)
    print(tar_feat_params)
    assert original_feat_params == tar_feat_params
Example #25
def test_correct_input_type_json():
    params = parse.wrapper(currentdir + '/config_list_inputs.json')
    test = []
    test.append(params.system == 'twintron-blue')
    test.append(
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv')
    test.append(params.layer_sizes == [42, 42])
    test.append(params.batch_size == 63)
    test.append(params.previously_split)
    test.append(params.descriptor_type == 'moe')
    test.append(not params.datastore)
    test.append(params.model_type == ['NN', 'RF'])
    test.append(params.response_cols == ['task1', 'task2', 'task3'])
    test.append(not params.transformers)
    assert all(test)
Example #26
def test_hierarchical_dict():
    params = parse.wrapper(hierarchical_input_dict)
    test = []
    test.append(params.system == 'twintron-blue')
    test.append(
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv')
    test.append(params.layer_sizes == [[42, 42]])
    test.append(params.batch_size == 63)
    test.append(params.previously_split)
    test.append(params.descriptor_type == 'moe')
    test.append(not params.datastore)
    test.append(params.model_type == ['NN', 'RF'])
    test.append(params.response_cols == ['task1', 'task2', 'task3'])
    test.append(not params.transformers)
    assert all(test)
Example #27
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_hybrid.json", "r") as f:
        hp_params = json.load(f)

    # str.strip() removes a character set, not a suffix; derive the path safely
    script_dir = os.path.dirname(parse.__file__).replace("/pipeline", "")
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir,
                                          params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print(f"Train a hybrid models with MOE descriptors")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Check the model performance on validation data")
    pred_data = pl.model_wrapper.get_perf_data(subset="valid",
                                               epoch_label="best")
    pred_results = pred_data.get_prediction_results()
    print(pred_results)

    pred_score = pred_results['r2_score']
    score_threshold = 0.4
    assert pred_score > score_threshold, \
        f'Error: Score is too low {pred_score}. Must be higher than {score_threshold}'

    print("Make predictions with the hyrid model")
    predict = pl.predict_on_dataframe(train_df[:10], contains_responses=False)
    assert (predict['pred'].shape[0] == 10
            ), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(
        predict['pred'].values))), 'Error: Predictions are not numbers'
def train_and_get_tar(input_json, ds_key_file):

    script_path = os.path.dirname(os.path.realpath(__file__))
    json_file = os.path.join(script_path, input_json)

    pparams = parse.wrapper(['--config_file', json_file])
    pparams.dataset_key = os.path.join(script_path, ds_key_file)
    pparams.result_dir = os.path.join(script_path, 'result')

    train_pipe = mp.ModelPipeline(pparams)
    train_pipe.train_model()

    list_of_files = glob.glob('./result/*.gz')  # check all *.gz
    latest_file = max(list_of_files, key=os.path.getctime)  # get the latest gz

    return latest_file
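
Combining this helper with `verify_saved_params` from earlier gives a compact round-trip check (the JSON and dataset paths are placeholders mirroring the other tests in this listing):

# Sketch only; input_json and the dataset path are placeholders.
input_json = 'nn_ecfp.json'
tar_f = train_and_get_tar(input_json,
                          '../../test_datasets/aurka_chembl_base_smiles_union.csv')
verify_saved_params(input_json, tar_f)  # model and featurizer params must survive
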
Example #29
def test_correct_input_mixed_command_line_types():
    params = parse.wrapper(config_inputs)
    test = []
    test.append(params.system == 'twintron-blue')
    test.append(
        params.dataset_key == '/ds/data/public/delaney/delaney-processed.csv')
    test.append(params.layer_sizes == [[42, 42]])
    test.append(params.batch_size == 63)
    test.append(params.previously_split)
    test.append(params.descriptor_type == 'moe')
    test.append(not params.datastore)
    test.append(params.response_cols == ['one1', 'two2', 'three3'])
    test.append(params.baseline_epoch == 78)
    test.append(params.splitter == 'random')
    test.append(not params.transformers)
    assert all(test)
Example #30
def predict_from_model_file(model_path, input_df, id_col='compound_id', smiles_col='rdkit_smiles',
                     response_col=None, is_featurized=False, dont_standardize=False, AD_method=None, k=5, dist_metric="euclidean"):
    """
    Loads a pretrained model from a model tarball file and runs predictions on compounds in an input
    data frame.

    Args:
        model_path (str): File path of the model tarball file.

        input_df (DataFrame): Input data to run predictions on; must at minimum contain SMILES strings.

        id_col (str): Name of the column containing compound IDs. If none is provided, sequential IDs will be
        generated.

        smiles_col (str): Name of the column containing SMILES strings; required.

        response_col (str): Name of an optional column containing actual response values; if it is provided, 
        the actual values will be included in the returned data frame to make it easier for you to assess performance.

        dont_standardize (bool): By default, SMILES strings are salt-stripped and standardized using RDKit; 
        if you have already done this, or don't want them to be standardized, set dont_standardize to True.

        AD_method (str): By default, the applicability domain (AD) index is not calculated; set this
        to 'z_score' or 'local_density' to choose the method used to calculate the AD index.

        k (int): number of nearest neighbors used to evaluate the AD index; default is 5.

        dist_metric (str): distance metric; valid values are 'cityblock', 'cosine', 'euclidean', 'jaccard', 'manhattan'.

    Returns:
        A data frame with compound IDs, SMILES strings and predicted response values. Actual response values
        will be included if response_col is provided. Standard prediction error estimates will be included
        if the model was trained with uncertainty=True. Note that the predicted and actual response
        columns will be labeled according to the response_col setting in the original training data,
        not the response_col passed to this function; e.g. if the original model response_col was 'pIC50',
        the returned data frame will contain columns 'pIC50_actual', 'pIC50_pred' and 'pIC50_std'.
    """

    input_df, pred_params = _prepare_input_data(input_df, id_col, smiles_col, response_col, dont_standardize)

    has_responses = ('response_cols' in pred_params)
    pred_params = parse.wrapper(pred_params)

    pipe = mp.create_prediction_pipeline_from_file(pred_params, reload_dir=None, model_path=model_path)
    pred_df = pipe.predict_full_dataset(input_df, contains_responses=has_responses, is_featurized=is_featurized,
                                        dset_params=pred_params, AD_method=AD_method, k=k, dist_metric=dist_metric)
    pred_df = pred_df.sort_values(by=id_col)
    return pred_df
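
A minimal usage sketch; the tarball path and column names below are placeholders modeled on the BSEP example earlier in this listing.

# Hypothetical inputs; swap in your own model tarball and data file.
input_df = pd.read_csv('ChEMBL25_BSEP_curated_data.csv')
pred_df = predict_from_model_file(
    model_path='bsep_classif_scaffold_split.tar.gz',
    input_df=input_df,
    id_col='compound_id',
    smiles_col='base_rdkit_smiles',
    response_col='active')
print(pred_df.head())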