Example #1
def saved_model_identity(pparams):
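    """Train a model, then predict from the saved model tarball on the test split
    and verify that the recomputed metrics match those recorded during training
    (and that repeated predictions from the tarball are identical).
    """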
    script_path = os.path.dirname(os.path.realpath(__file__))
    if not pparams.previously_split:
        split_uuid = split(pparams)

        pparams.split_uuid = split_uuid
        pparams.previously_split = True

    train_pipe = train(pparams)
    split_csv = os.path.join(script_path, '../../test_datasets/', 
                    train_pipe.data._get_split_key())
    test_df = get_test_set(pparams.dataset_key, split_csv, pparams.id_col)

    # verify: compare metrics recomputed from the saved model against those recorded at training time
    with open(os.path.join(pparams.output_dir, 'model_metrics.json'), 'r') as f:
        model_metrics = json.load(f)

    # only compare test results since there's only one fold for test
    metrics = find_best_test_metric(model_metrics)
    id_col = metrics['input_dataset']['id_col']
    response_col = metrics['input_dataset']['response_cols'][0]
    smiles_col = metrics['input_dataset']['smiles_col']
    test_length = metrics['prediction_results']['num_compounds']

    model_tar = train_pipe.params.model_tarball_path
    pred_df = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col,
                smiles_col=smiles_col, response_col=response_col)
    pred_df2 = pfm.predict_from_model_file(model_tar, test_df, id_col=id_col,
                smiles_col=smiles_col, response_col=response_col)

    X = pred_df[response_col+'_actual'].values
    y = pred_df[response_col+'_pred'].values
    X2 = pred_df2[response_col+'_actual'].values
    y2 = pred_df2[response_col+'_pred'].values

    r2 = skmetrics.r2_score(X, y)
    rms = np.sqrt(skmetrics.mean_squared_error(X, y))
    mae = skmetrics.mean_absolute_error(X, y)

    saved_r2 = metrics['prediction_results']['r2_score']
    saved_rms = metrics['prediction_results']['rms_score']
    saved_mae = metrics['prediction_results']['mae_score']

    print(metrics['subset'])
    print(pred_df.columns)
    print(abs(r2-saved_r2))
    print(abs(rms-saved_rms))
    print(abs(mae-saved_mae))
    print(np.mean(abs(y2-y)))

    assert abs(r2-saved_r2)<1e-5 \
            and abs(rms-saved_rms)<1e-5 \
            and abs(mae-saved_mae)<1e-5 \
            and np.mean(abs(y2-y))<1e-5 \
            and (test_length == len(test_df))
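
The helper find_best_test_metric() used above is not shown in this excerpt. A minimal sketch of what it plausibly does, assuming model_metrics.json holds a list of per-subset metric records with the 'subset' and 'prediction_results' keys that saved_model_identity() reads (an illustration, not the actual implementation):

def find_best_test_metric(model_metrics):
    # Hypothetical sketch: keep only records scored on the held-out test subset
    # and return the one with the best r2_score.
    test_metrics = [m for m in model_metrics if m.get('subset') == 'test']
    if not test_metrics:
        return None
    return max(test_metrics, key=lambda m: m['prediction_results']['r2_score'])
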
def test_predict_from_model():
    '''
    test that predict_from_model makes predictions in the same
    order as the input
    '''

    model_path = '../../examples/BSEP/models/bsep_classif_scaffold_split.tar.gz'
    csv_path = '../../examples/BSEP/data/ChEMBL25_BSEP_curated_data.csv'

    id_col = 'compound_id'
    smiles_col = 'base_rdkit_smiles'
    response_col = 'active'

    df = pd.read_csv(csv_path, dtype={id_col: str})
    #df = pd.concat([df.head(25),df.head(25)])
    #df = df.head(50)
    shuffled_df = df.sample(frac=1)

    pred_df = pfm.predict_from_model_file(model_path,
                                          shuffled_df,
                                          id_col=id_col,
                                          smiles_col=smiles_col,
                                          response_col=response_col)

    old_id_col = shuffled_df[id_col].values
    new_id_col = pred_df[id_col].values

    match_rows = all([n == o for n, o in zip(new_id_col, old_id_col)])
    print(match_rows)
    assert match_rows, 'Predictions are not in the same order as the input rows'

    score = skm.accuracy_score(shuffled_df[response_col].values,
                               pred_df[response_col + '_pred'].values)
    print(score)
    assert score > 0.5
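
If row order were ever not preserved, an order-independent variant of the same check could join the predictions back to the input on the ID column instead of relying on positional alignment. A sketch using the same column names as test_predict_from_model() above:

# Order-independent check (sketch): align predictions to the input by compound ID.
merged = shuffled_df[[id_col, response_col]].merge(
    pred_df[[id_col, response_col + '_pred']], on=id_col, how='inner')
assert len(merged) == len(shuffled_df)
score = skm.accuracy_score(merged[response_col].values,
                           merged[response_col + '_pred'].values)
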
Example #3
def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    clean()

    # Run HyperOpt
    # ------------
    with open("H1_RF.json", "r") as f:
        hp_params = json.load(f)

    # AMPL source directory: the parent of the pipeline/ package containing parameter_parser.py
    script_dir = os.path.dirname(os.path.dirname(parse.__file__))
    python_path = sys.executable
    hp_params["script_dir"] = script_dir
    hp_params["python_path"] = python_path

    params = parse.wrapper(hp_params)
    if not os.path.isfile(params.dataset_key):
        params.dataset_key = os.path.join(params.script_dir,
                                          params.dataset_key)

    train_df = pd.read_csv(params.dataset_key)

    print(f"Train a RF models with ECFP")
    pl = mp.ModelPipeline(params)
    pl.train_model()

    print("Calculate AD index with the just trained model.")
    pred_df_mp = pl.predict_on_dataframe(train_df[:10],
                                         contains_responses=True,
                                         AD_method="z_score")

    assert (
        "AD_index"
        in pred_df_mp.columns.values), 'Error: No AD_index column in pred_df_mp'

    print("Calculate AD index with the saved model tarball file.")
    pred_df_file = pfm.predict_from_model_file(
        model_path=pl.params.model_tarball_path,
        input_df=train_df[:10],
        id_col="compound_id",
        smiles_col="rdkit_smiles",
        response_col="pKi_mean",
        dont_standardize=True,
        AD_method="z_score")
    assert ("AD_index" in pred_df_file.columns.values
            ), 'Error: No AD_index column in pred_df_file'
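
The contents of H1_RF.json are not shown here. For illustration only, the kind of parameter dictionary that parse.wrapper() accepts might look like the sketch below; the keys are parameter names that appear elsewhere in these examples, and the values are placeholders rather than the real configuration:

# Illustrative parameter dictionary (placeholders only -- not the real H1_RF.json).
hp_params_example = {
    "dataset_key": "path/to/H1_curated.csv",   # placeholder dataset path
    "id_col": "compound_id",
    "smiles_col": "rdkit_smiles",
    "response_cols": "pKi_mean",
    "prediction_type": "regression",
    "model_type": "RF",
    "featurizer": "ecfp",
    "splitter": "scaffold",
}
params_example = parse.wrapper(hp_params_example)  # parsed as in test() above
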
def predict_activity(args):
    # prepare inputs
    input_df = pd.read_csv(args.input_file, index_col=False)
    colnames = set(input_df.columns.values)
    if args.id_col not in colnames:
        input_df['compound_id'] = [
            'compound_%.6d' % i for i in range(input_df.shape[0])
        ]
        args.id_col = 'compound_id'
    if args.smiles_col not in colnames:
        raise ValueError(
            'smiles_col parameter not specified or column not in input file.')
    model_files = dict(random='bsep_classif_random_split.tar.gz',
                       scaffold='bsep_classif_scaffold_split.tar.gz')
    if args.model_type not in model_files:
        raise ValueError("model_type %s is not a recognizied value." %
                         args.model_type)
    if args.external_training_data is not None:
        data_file = os.path.join(os.getcwd(), args.external_training_data)
    else:
        data_file = None
    print("Data file:", data_file)
    # Test loading model from tarball and running predictions
    models_dir = os.path.join(os.path.dirname(__file__), 'models')
    model_tarfile = os.path.join(models_dir, model_files[args.model_type])

    # predict
    pred_df = pfm.predict_from_model_file(
        model_path=model_tarfile,
        input_df=input_df,
        id_col=args.id_col,
        smiles_col=args.smiles_col,
        response_col=args.response_col,
        dont_standardize=args.dont_standardize,
        is_featurized=args.is_featurized,
        AD_method=args.AD_method,
        external_training_data=data_file)

    # delete files created during prediction
    for root, dirs, files in os.walk(os.getcwd(), topdown=False):
        for file in files:
            if 'train_valid_test' in file:
                os.remove(os.path.join(root, file))

    # Write predictions to output file
    pred_df.to_csv(args.output_file, index=False)
    print("Wrote predictions to file %s" % args.output_file)

    # If measured activity values are provided, print some performance metrics
    if args.response_col is not None:
        actual_vals = pred_df['%s_actual' % args.response_col].values
        pred_classes = pred_df['%s_pred' % args.response_col].values
        pred_probs = pred_df['%s_prob' % args.response_col].values
        conf_matrix = metrics.confusion_matrix(actual_vals, pred_classes)
        roc_auc = metrics.roc_auc_score(actual_vals, pred_probs)
        prc_auc = metrics.average_precision_score(actual_vals, pred_probs)
        accuracy = metrics.accuracy_score(actual_vals, pred_classes)
        precision = metrics.precision_score(actual_vals, pred_classes)
        npv = negative_predictive_value(actual_vals, pred_classes)
        recall = metrics.recall_score(actual_vals, pred_classes)
        mcc = metrics.matthews_corrcoef(actual_vals, pred_classes)
        ncorrect = sum(actual_vals == pred_classes)
        print("Performance metrics:\n")
        print("%d out of %d predictions correct." %
              (ncorrect, pred_df.shape[0]))
        print("Accuracy: %.3f" % accuracy)
        print("Precision: %.3f" % precision)
        print("Recall: %.3f" % recall)
        print("NPV: %.3f" % npv)
        print("ROC AUC: %.3f" % roc_auc)
        print("PRC AUC: %.3f" % prc_auc)
        print("Matthews correlation coefficient: %.3f" % mcc)
        print("Confusion matrix:")
        print("\t\tpredicted activity")
        print("actual\nactivity\t0\t1\n")
        print("   0\t\t%d\t%d" % (conf_matrix[0][0], conf_matrix[0][1]))
        print("   1\t\t%d\t%d" % (conf_matrix[1][0], conf_matrix[1][1]))
def train_and_predict(train_json_f, prefix='delaney-processed'):
    # Train model
    # -----------
    # Read parameter JSON file
    with open(train_json_f) as f:
        config = json.loads(f.read())

    # Parse parameters
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
    model_type = params.model_type
    prediction_type = params.prediction_type
    descriptor_type = params.descriptor_type
    featurizer = params.featurizer
    splitter = params.splitter
    model_dir = 'result/%s_curated_fit/%s_%s_%s_%s' % (
        prefix, model_type, featurizer, splitter, prediction_type)
    uuid = model.params.model_uuid
    tar_f = 'result/%s_curated_fit_model_%s.tar.gz' % (prefix, uuid)
    reload_dir = model_dir + '/' + uuid

    # Check training statistics
    # -------------------------
    if prediction_type == 'regression':
        threshold = 0.6
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])

        integrative_utilities.training_statistics_file(reload_dir, 'test',
                                                       threshold, 'r2_score')
        score = integrative_utilities.read_training_statistics_file(
            reload_dir, 'test', 'r2_score')
    else:
        threshold = 0.7
        if 'perf_threshold' in config:
            threshold = float(config['perf_threshold'])
        integrative_utilities.training_statistics_file(reload_dir, 'test',
                                                       threshold,
                                                       'accuracy_score')
        score = integrative_utilities.read_training_statistics_file(
            reload_dir, 'test', 'accuracy_score')

    print("Final test score:", score)

    # Load second test set
    # --------------------
    data = pd.read_csv('%s_curated_external.csv' % prefix)

    predict = pfm.predict_from_model_file(tar_f,
                                          data,
                                          id_col=params.id_col,
                                          smiles_col=params.smiles_col,
                                          response_col=params.response_cols)
    pred_cols = [f for f in predict.columns if f.endswith('_pred')]

    pred = predict[pred_cols].to_numpy()

    # Check predictions
    # -----------------
    assert (
        pred.shape[0] == len(data)), 'Error: Incorrect number of predictions'
    assert (np.all(np.isfinite(pred))), 'Error: Predictions are not numbers'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data, predict, on=params.id_col, how='inner')
    pred_csv_name = '%s_curated_%s_%s_%s_%s_%d_%s_predict.csv' % (
        prefix, model_type, prediction_type, descriptor_type, featurizer,
        len(model.params.response_cols), model.params.splitter)
    combined.to_csv(pred_csv_name)
    assert (os.path.isfile(pred_csv_name)
            and os.path.getsize(pred_csv_name) > 0
            ), 'Error: Prediction file not created'

    return tar_f
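
A hypothetical driver for train_and_predict(); the JSON file name below is a placeholder and would need to point at a real training-parameter file, with the matching curated CSVs already present:

# Hypothetical usage sketch -- the JSON file name is a placeholder, not part of the examples above.
if __name__ == '__main__':
    tarball = train_and_predict('delaney_RF_train.json', prefix='delaney-processed')
    print('Saved model tarball:', tarball)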