import json
import os

import numpy as np
import pandas as pd

# AMPL pipeline modules plus the local integrative test helpers.
# clean(), download(), curate(), and H1_curate() are defined elsewhere in this
# test suite; the import paths below assume the standard AMPL package layout.
import atomsci.ddm.pipeline.model_pipeline as mp
import atomsci.ddm.pipeline.parameter_parser as parse
import integrative_utilities


def H1_init():
    """
    Initialize the H1 test: clean output from previous runs and curate the data
    """

    # Clean
    # -----
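    # clean_fit_predict() and clean() remove models, predictions, and other
    # output left over from earlier runs so the test starts from a clean slate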
    integrative_utilities.clean_fit_predict()
    clean('H1')

    # Curate
    # ------
    H1_curate()


def init():
    """
    Initialize the test: clean output from previous runs, download the dataset, and curate it
    """

    # Clean
    # -----
    integrative_utilities.clean_fit_predict()
    clean()

    # Download
    # --------
    download()

    # Curate
    # ------
    curate()


def test():
    """
    Test full model pipeline: Curate data, fit model, and predict property for new compounds
    """

    # Clean
    # -----
    integrative_utilities.clean_fit_predict()
    clean()

    # Download
    # --------
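    # Fetch the Delaney aqueous solubility dataset (delaney-processed) that
    # the rest of the test curates, fits, and predicts against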
    download()

    # Curate
    # ------
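    # Produces the delaney-processed_curated_* files consumed by the fit and
    # predict steps below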
    curate()

    # Train model
    # -----------
    # Read parameter JSON file
    with open('config_delaney_train_NN.json') as f:
        config = json.load(f)

    # Parse parameters
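    # parse.wrapper() converts the raw config dict into a parameter namespace,
    # filling in pipeline defaults for anything the JSON file omits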
    params = parse.wrapper(config)

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    # Get uuid and reload directory
    # -----------------------------
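    # Each training run writes its output under a fresh model-uuid subdirectory,
    # so the uuid has to be discovered after the fit to reload the model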
    uuid = integrative_utilities.get_subdirectory(
        'result/delaney-processed_curated_fit/NN_graphconv_scaffold_regression'
    )
    reload_dir = os.path.join(
        'result/delaney-processed_curated_fit/NN_graphconv_scaffold_regression', uuid)

    # Check training statistics
    # -------------------------
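    # Asserts that the score recorded for the 'test' subset in the saved
    # training statistics file meets the 0.6 threshold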
    integrative_utilities.training_statistics_file(reload_dir, 'test', 0.6)

    # Make prediction parameters
    # --------------------------
    # Read prediction parameter JSON file
    with open('config_delaney_predict_NN.json', 'r') as f:
        predict_parameters_dict = json.load(f)

    # Set transformer key here because model uuid is not known before fit
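    # transformers.pkl holds the feature/response transformers fitted during
    # training; the prediction pipeline needs them to untransform its output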
    predict_parameters_dict['transformer_key'] = os.path.join(
        reload_dir, 'transformers.pkl')

    predict_parameters = parse.wrapper(predict_parameters_dict)

    # Load second test set
    # --------------------
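    # This external file was written by the curate() step and serves as an
    # independent hold-out set for the fitted model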
    data = pd.read_csv('delaney-processed_curated_external.csv')

    # Select columns and rename response column
    data = data[[
        predict_parameters.id_col, predict_parameters.smiles_col,
        predict_parameters.response_cols[0]
    ]]
    data = data.rename(
        columns={predict_parameters.response_cols[0]: 'experimental_values'})

    # Make prediction pipeline
    # ------------------------
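    # create_prediction_pipeline_from_file() rebuilds the trained model and its
    # transformers from reload_dir so it can be run on new data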
    pp = mp.create_prediction_pipeline_from_file(predict_parameters,
                                                 reload_dir)

    # Predict
    # -------
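    # Returns a DataFrame of per-compound results with predictions in the
    # 'pred' column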
    predict = pp.predict_on_dataframe(data)

    # Check predictions
    # -----------------
    assert predict['pred'].shape[0] == 117, 'Error: Incorrect number of predictions'
    assert np.all(np.isfinite(predict['pred'].values)), \
        'Error: Predictions contain non-finite values'

    # Save predictions with experimental values
    # -----------------------------------------
    predict.reset_index(level=0, inplace=True)
    combined = pd.merge(data,
                        predict,
                        on=predict_parameters.id_col,
                        how='inner')
    combined.to_csv('delaney-processed_curated_predict.csv')
    assert (os.path.isfile('delaney-processed_curated_predict.csv')
            and os.path.getsize('delaney-processed_curated_predict.csv') > 0
            ), 'Error: Prediction file not created'