Beispiel #1
0
def main(path, task, representation, use_pca):
    """
    Run a hyperopt-sklearn search over random forest regressors for one task and save the best model found.

    The best model is printed to stdout and also written to 'saved_hypers/RF/tuning_for_<task>'.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    """

    smiles_list, y = TaskDataLoader(task, path).load_property_data()
    X = featurise_mols(smiles_list, representation)

    # Number of principal components handed to transform_data (None when PCA is off)
    n_components = 50 if use_pca else None

    # The random state is set to be different to the splits used for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=30)

    # Targets are passed to transform_data as column vectors; only the transformed
    # training inputs and targets are needed for the search.
    X_train, y_train, _, _, _ = transform_data(X_train,
                                               y_train.reshape(-1, 1),
                                               X_test,
                                               y_test.reshape(-1, 1),
                                               n_components,
                                               use_pca)

    estim = HyperoptEstimator(regressor=random_forest_regression('my_RF'))
    estim.fit(X_train, y_train, valid_size=0.1, n_folds=5, cv_shuffle=True)

    # Report the winning configuration on stdout and persist it to disk
    print(estim.best_model())
    with open(f'saved_hypers/RF/tuning_for_{task}', 'w') as f:
        print(estim.best_model(), file=f)
Beispiel #2
0
def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf):
    """
    Evaluate an equal-weight average of a Tanimoto-kernel GP and a random forest over random train/test splits.

    For every trial the per-split R^2, RMSE and MAE on the test set are printed, followed by the
    mean and standard error of each metric across trials.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence-
    error curves. True is the option for rmse. Not used by this function (no confidence curves are computed here);
    kept so the signature stays unchanged.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    n_components = 100 if use_pca else None

    # The GP model is rebuilt on every trial; the closure reads whichever model `m` currently refers to.

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1)

        # e_iso_pi best params:
        # {'learner': RandomForestRegressor(max_features=0.9348473830061558, n_estimators=381,
        #                       n_jobs=1, random_state=2, verbose=False)}
        # e_iso_n best params:
        # {'learner': RandomForestRegressor(bootstrap=False, max_features=0.09944870853556087,
        #                                   min_samples_leaf=3, n_estimators=1295, n_jobs=1,
        #                                   random_state=0, verbose=False)}
        # z_iso_pi best params:
        # {'learner': RandomForestRegressor(max_depth=4, max_features=0.33072121415416944,
        #                                   n_estimators=2755, n_jobs=1, random_state=2,
        #                                   verbose=False)}
        # z_iso_n best params:
        # {'learner': RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1,
        #                                   random_state=3, verbose=False)}
        #
        # NOTE(review): the settings below are the z_iso_n ones and are used whatever `task` is;
        # swap them by hand when running a different task.

        regr_rf = RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1,
                                        random_state=3, verbose=False)
        # The forest expects a 1-D target; flattening here avoids sklearn's column-vector warning.
        regr_rf.fit(X_train, np.ravel(y_train))

        # Optimise the kernel variance and noise level by the marginal likelihood

        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
        print_summary(m)

        # GP mean prediction and RF prediction, averaged with equal weight in the standardised space

        y_pred, _ = m.predict_f(X_test)
        y_pred_rf = regr_rf.predict(X_test)
        y_pred_av = (y_pred + y_pred_rf.reshape(-1, 1)) / 2.0
        y_pred = y_scaler.inverse_transform(y_pred_av)
        y_test = y_scaler.inverse_transform(y_test)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        y_pred_train_rf = regr_rf.predict(X_train)
        y_pred_train = (y_pred_train + y_pred_train_rf.reshape(-1, 1)) / 2.0
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # Test-set metrics in the original (unstandardised) units

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    # Mean and standard error of each metric across trials

    print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
Beispiel #3
0
def main(path, task, representation, use_pca, n_trials, test_set_size,
         use_rmse_conf):
    """
    Fit a Tanimoto-kernel GP over random train/test splits, print test metrics and plot a confidence-error curve.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence-
    error curves. True is the option for rmse.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    n_components = 100 if use_pca else None

    # The GP model is rebuilt on every trial; the closure reads whichever model `m` currently refers to.

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves

    _, _, _, y_test = train_test_split(
        X, y, test_size=test_set_size)  # To get test set size
    n_test = len(y_test)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(
            X_train,
            y_train,
            X_test,
            y_test,
            n_components=n_components,
            use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        kernel = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train),
                              mean_function=Constant(np.mean(y_train)),
                              kernel=kernel,
                              noise_variance=1)

        # Optimise the kernel variance and noise level by the marginal likelihood

        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure,
                     m.trainable_variables,
                     options=dict(maxiter=10000))
        print_summary(m)

        # mean and variance GP prediction

        y_pred, y_var = m.predict_f(X_test)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Compute scores for confidence curve plotting.
        # Test points are ranked by increasing predictive variance (most confident first).

        ranked_confidence_list = np.argsort(y_var, axis=0).flatten()

        for n_kept in range(len(y_test)):

            # Errors restricted to the (n_kept + 1) most confident predictions

            conf = ranked_confidence_list[0:n_kept + 1]
            rmse_confidence_list[i, n_kept] = np.sqrt(
                mean_squared_error(y_test[conf], y_pred[conf]))
            mae_confidence_list[i, n_kept] = mean_absolute_error(
                y_test[conf], y_pred[conf])

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_scaler.inverse_transform(y_train),
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    # Mean and standard error of each metric across trials

    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))

    # Plot confidence-error curves.
    # linspace with endpoint=False always yields exactly n_test points (0, 100/n, 2*100/n, ...);
    # np.arange with a fractional step can return one element too many because of rounding.

    confidence_percentiles = np.linspace(0, 100, n_test, endpoint=False)

    if use_rmse_conf:
        _plot_confidence_curve(
            confidence_percentiles,
            rmse_confidence_list,
            'RMSE (nm)',
            task +
            '/results/gpr/{}_confidence_curve_rmse.png'.format(representation),
            n_test)
    else:
        # We plot the Mean-absolute error confidence-error curves
        _plot_confidence_curve(
            confidence_percentiles,
            mae_confidence_list,
            'MAE (nm)',
            task +
            '/results/gpr/{}_confidence_curve_mae.png'.format(representation),
            n_test)


def _plot_confidence_curve(confidence_percentiles, errors_per_trial, ylabel,
                           save_path, n_test):
    """
    Plot the across-trial mean of an error curve with a one-sigma band, save the figure and show it.

    :param confidence_percentiles: x-axis values, one per test point.
    :param errors_per_trial: array of shape (n_trials, n_test) holding the error at each confidence level.
    :param ylabel: str used as the y-axis label.
    :param save_path: str path the figure is saved to.
    :param n_test: int number of test points, used to set the x-axis limit.
    """

    # We flip because we want the most confident predictions on the right-hand side of the plot

    err_mean = np.flip(np.mean(errors_per_trial, axis=0))
    err_std = np.flip(np.std(errors_per_trial, axis=0))

    # One-sigma error bars

    lower = err_mean - err_std
    upper = err_mean + err_std

    plt.plot(confidence_percentiles, err_mean, label='mean')
    plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
    plt.xlabel('Confidence Percentile')
    plt.ylabel(ylabel)
    plt.ylim([0, np.max(upper) + 1])
    plt.xlim([0, 100 * ((n_test - 1) / n_test)])
    plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
    plt.savefig(save_path)
    plt.show()
def main(path, path_to_dft_dataset, task, representation, theory_level):
    """
    Compare a Tanimoto-kernel GP against TD-DFT using leave-one-out prediction on molecules that have DFT values.

    Each molecule with a DFT value is held out in turn; the GP is trained on all the remaining molecules
    (including those without DFT values) and the absolute errors of the GP and of DFT are collected.

    :param path: str specifying path to photoswitches.csv file.
    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
    :param task: str specifying the task. e_iso_pi only supported task for the TD-DFT comparison.
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'].
    Any value other than 'CAM-B3LYP' selects the PBE0 values.
    """

    data_loader = TaskDataLoader(task, path)
    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)

    X = featurise_mols(smiles_list, representation)

    # Keep only non-duplicate entries because we're not considering effects of solvent.
    # A single pass with a set of already-seen SMILES keeps the first occurrence of each molecule.

    seen_smiles = set()
    first_occurrences = []
    for idx, smiles in enumerate(smiles_list):
        if smiles not in seen_smiles:
            seen_smiles.add(smiles)
            first_occurrences.append(idx)
    non_duplicate_indices = np.array(first_occurrences, dtype=int)

    X = X[non_duplicate_indices, :]
    experimental_vals = experimental_vals[non_duplicate_indices]
    pbe0_vals = pbe0_vals[non_duplicate_indices]
    cam_vals = cam_vals[non_duplicate_indices]

    # DFT values for the requested level of theory

    reference_vals = cam_vals if theory_level == 'CAM-B3LYP' else pbe0_vals
    missing_dft = np.argwhere(np.isnan(reference_vals))
    has_dft = np.argwhere(~np.isnan(reference_vals))

    # molecules with dft values to be split into train/test
    X_with_dft = np.delete(X, missing_dft, axis=0)
    y_with_dft = np.delete(experimental_vals, missing_dft)
    dft_vals = np.delete(reference_vals, missing_dft)

    # molecules with no dft vals must go into the training set.
    X_no_dft = np.delete(X, has_dft, axis=0)
    y_no_dft = np.delete(experimental_vals, has_dft)

    mae_list = []
    dft_mae_list = []

    # We define the Gaussian Process optimisation objective.
    # The closure reads whichever model `m` currently refers to; `m` is rebuilt on every iteration.

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        # Hold out molecule i; everything else (plus the molecules without DFT values) is training data

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        k = Tanimoto()
        m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1)

        # Optimise the kernel variance and noise level by the marginal likelihood

        opt = gpflow.optimizers.Scipy()
        opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
        print_summary(m)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # mean GP prediction, mapped back to the original units

        y_pred, _ = m.predict_f(X_test)
        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Output MAE for this trial

        mae = abs(y_test - y_pred)

        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials

        mae_list.append(mae)

        # DFT prediction scores on the same trial

        dft_mae = abs(y_test - dft_test)

        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))

    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
# Script fragment: sets up the training data and candidate molecules for a GP prediction run.
# Task whose property values are loaded as training targets, and the dataset they are read from.
task = 'e_iso_pi'
path = '../dataset/photoswitches.csv'

# New candidates to predict wavelength values for

# candidate_list = ['O=C(OC)C(C=C1)=CC=C1C2=C[N-][N+]3=C(C=CN32)/N=N/C4=CC=CC=C4',
#                   'O=C(OC)C(C=C1)=CC=C1C2=CN3[N+]([N-]2)=CC=C3/N=N/C4=CC=CC=C4']

# Candidate SMILES are read from the 'SMILES' column of the purchasable-switch CSV.
# Note this read happens at import time, not only when run as a script.
df = pd.read_csv('../dataset/purchasable_switch.csv')
candidate_list = df['SMILES'].to_list()

if __name__ == '__main__':

    # Training set: every molecule of the task; test set: the candidate molecules.
    # NOTE(review): `representation` is not assigned in the visible part of this script - confirm it is defined elsewhere.
    data_loader = TaskDataLoader(task, path)
    smiles_list, y_train = data_loader.load_property_data()
    X_train = featurise_mols(smiles_list, representation)
    X_test = featurise_mols(candidate_list, representation)

    # Number of feature columns in the training inputs
    num_features = np.shape(X_train)[1]

    # Placeholder for the GP model; the closure below returns the negative log marginal
    # likelihood of whichever model `m` refers to when it is called.

    m = None

    def objective_closure():
        return -m.log_marginal_likelihood()

    #  We standardise the outputs but leave the inputs unchanged. Equivalent to transform data used in other scripts.

    # Targets are reshaped to a column vector before scaling
    y_train = y_train.reshape(-1, 1)
    y_scaler = StandardScaler()
Beispiel #6
0
def main(task, path, representation, use_pca, test_set_size, r_size,
         det_encoder_n_hidden, lat_encoder_n_hidden, decoder_n_hidden):
    """
    Grid-search Attentive Neural Process hyperparameters on a held-out split and save the best setting.

    The configuration with the lowest held-out RMSE is printed and written to
    'cross_val_hypers/<task>/ANP/hypers_<representation>.txt'.

    :param task: str specifying the task name. One of [Photoswitch, ESOL, FreeSolv, Lipophilicity].
    :param path: str specifying the path to the photoswitches.csv file
    :param representation: str specifying the representation. One of [fingerprints, fragments, fragprints]
    :param use_pca: bool specifying whether or not to use PCA to perform Principal Components Regression
    :param test_set_size: float specifying the train/test split ratio. e.g. 0.2 is 80/20 train/test split
    :param r_size: Dimensionality of context encoding r.
    :param det_encoder_n_hidden: Number of deterministic encoder hidden layers.
    :param lat_encoder_n_hidden: Number of latent encoder hidden layers.
    :param decoder_n_hidden: Number of decoder hidden layers.
    :return: None
    """
    from itertools import product  # local import so the block does not depend on file-level imports

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)
    y_size = 1

    # If True we perform Principal Components Regression

    n_components = 50 if use_pca else None

    print('\nBeginning training loop...')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_set_size, random_state=42)

    if task != 'Photoswitch':
        # Artificially create a 80/10/10 train/validation/test split discarding the validation set.
        split_in_two = int(len(y_test) / 2)
        X_test = X_test[0:split_in_two]
        y_test = y_test[0:split_in_two]

    else:
        # We subdivide the train set in order to run cross-validation.
        X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.1,
                                                            random_state=42)

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # The transformed y_test is discarded: metrics below compare the raw y_test against
    # predictions mapped back through y_scaler.

    X_train, y_train, X_test, _, y_scaler = transform_data(
        X_train,
        y_train,
        X_test,
        y_test,
        n_components=n_components,
        use_pca=use_pca)

    # A leading dimension of size 1 is added to every tensor handed to the model

    X_train = torch.from_numpy(X_train).float().unsqueeze(dim=0)
    X_test = torch.from_numpy(X_test).float().unsqueeze(dim=0)
    y_train = torch.from_numpy(y_train).float().unsqueeze(dim=0)

    det_encoder_hidden_sizes = [8, 16]
    lat_encoder_hidden_sizes = [8, 16]
    decoder_hidden_sizes = [8, 16]
    learning_rates = [0.01, 0.001]
    batch_sizes = [16, 32]
    iteration_numbers = [250, 500]

    # Infinity guarantees the first finite RMSE is always recorded
    best_rmse = float('inf')
    best_params = {
        'det_encs': 0,
        'lat_encs': 0,
        'dec_hid': 0,
        'lr': 0,
        'batch_size': 0,
        'iterations': 0
    }

    # product() walks the grid in the same order as six nested loops (rightmost list varies fastest)
    search_grid = product(det_encoder_hidden_sizes, lat_encoder_hidden_sizes,
                          decoder_hidden_sizes, learning_rates, batch_sizes,
                          iteration_numbers)

    for det_encs, lat_encs, dec_hid, l_rate, batch_s, iter_num in search_grid:

        m = AttentiveNP(x_size=X_train.shape[2],
                        y_size=y_size,
                        r_size=r_size,
                        det_encoder_hidden_size=det_encs,
                        det_encoder_n_hidden=det_encoder_n_hidden,
                        lat_encoder_hidden_size=lat_encs,
                        lat_encoder_n_hidden=lat_encoder_n_hidden,
                        decoder_hidden_size=dec_hid,
                        decoder_n_hidden=decoder_n_hidden,
                        lr=l_rate,
                        attention_type="multihead")

        print('...training.')

        m.train(X_train,
                y_train,
                batch_size=batch_s,
                iterations=iter_num,
                print_freq=None)

        # Now, the context set comprises the training x / y values, the target set comprises the test x values.

        y_pred, y_var = m.predict(X_train, y_train, X_test, n_samples=100)

        # Held-out metrics in the original units; the inverse transform is computed once and reused

        y_pred_unscaled = y_scaler.inverse_transform(y_pred)
        score = r2_score(y_test, y_pred_unscaled)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred_unscaled))
        mae = mean_absolute_error(y_test, y_pred_unscaled)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {
                'det_encs': det_encs,
                'lat_encs': lat_encs,
                'dec_hid': dec_hid,
                'lr': l_rate,
                'batch_size': batch_s,
                'iterations': iter_num
            }
        print('Best parameters are \n')
        print(best_params)

    print('Final best parameters are \n')
    print(best_params)

    with open(f'cross_val_hypers/{task}/ANP/hypers_{representation}.txt',
              'w') as f:
        f.write(str(best_params))
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

from data_utils import TaskDataLoader, featurise_mols
from kernels import Tanimoto

# Script fragment: loads all four property datasets plus the candidate molecules to predict for.
# Molecular representation passed to featurise_mols, and the dataset the properties are read from.
representation = 'fragprints'
task = 'e_iso_pi'
path = '../dataset/photoswitches.csv'
# Candidate SMILES come from the 'SMILES' column of the purchasable-switch CSV.
# Note this read happens at import time, not only when run as a script.
df = pd.read_csv('../dataset/purchasable_switch.csv')
candidate_list = df['SMILES'].to_list()

if __name__ == '__main__':

    # Featurised candidate molecules (the inputs to predict for)
    X_test = featurise_mols(candidate_list, representation)

    # One data loader per property, all reading from the same dataset file
    data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path)
    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    # SMILES and property values for each of the four properties
    smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data()
    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    # Reshape every target array to a column vector
    y_e_iso_pi = y_e_iso_pi.reshape(-1, 1)
    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)
Beispiel #8
0
def main(path, path_to_dft_dataset, representation, theory_level):
    """
    Compare a multitask (coregionalised) Tanimoto GP against TD-DFT via leave-one-out prediction.

    Each molecule with a DFT value is held out in turn. The model is trained on the remaining e_iso_pi
    molecules together with the z_iso_pi, e_iso_n and z_iso_n property data, and the absolute errors of
    the GP and of DFT on the held-out molecule are collected.

    :param path: str specifying path to photoswitches.csv file.
    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
    :param representation: str specifying the molecular representation. One of ['fingerprints', 'fragments', 'fragprints']
    :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'].
    Any value other than 'CAM-B3LYP' selects the PBE0 values.
    """

    task = 'e_iso_pi'  # e_iso_pi only task supported for TD-DFT comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)

    X = featurise_mols(smiles_list, representation)

    # Keep only non-duplicate entries because we're not considering effects of solvent.
    # A single pass with a set of already-seen SMILES keeps the first occurrence of each molecule.

    seen_smiles = set()
    first_occurrences = []
    for idx, smiles in enumerate(smiles_list):
        if smiles not in seen_smiles:
            seen_smiles.add(smiles)
            first_occurrences.append(idx)
    non_duplicate_indices = np.array(first_occurrences, dtype=int)

    X = X[non_duplicate_indices, :]
    experimental_vals = experimental_vals[non_duplicate_indices]
    pbe0_vals = pbe0_vals[non_duplicate_indices]
    cam_vals = cam_vals[non_duplicate_indices]

    # DFT values for the requested level of theory

    reference_vals = cam_vals if theory_level == 'CAM-B3LYP' else pbe0_vals
    missing_dft = np.argwhere(np.isnan(reference_vals))
    has_dft = np.argwhere(~np.isnan(reference_vals))

    # molecules with dft values to be split into train/test
    X_with_dft = np.delete(X, missing_dft, axis=0)
    y_with_dft = np.delete(experimental_vals, missing_dft)
    dft_vals = np.delete(reference_vals, missing_dft)

    # molecules with no dft vals must go into the training set.
    X_no_dft = np.delete(X, has_dft, axis=0)
    y_no_dft = np.delete(experimental_vals, has_dft)

    # Load in the other property values for multitask learning. e_iso_pi is always the task in this instance.

    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)

    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = X_no_dft.shape[1]

    tanimoto_active_dims = list(range(feature_dim))  # active dims for Tanimoto base kernel.

    # The auxiliary-task rows do not depend on the held-out molecule, so they are built once.
    # Inputs get an extra column of ones, twos, threes to indicate the required output dimension;
    # targets get a matching column that selects a likelihood from the list of likelihoods.
    X_aux = np.vstack((np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
                       np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
                       np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))
    Y_aux = np.vstack((np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
                       np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
                       np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

    mae_list = []
    dft_mae_list = []

    print('\nBeginning training loop...')

    for i in range(len(y_with_dft)):

        # Hold out molecule i; everything else (plus the molecules without DFT values) is training data

        X_train = np.delete(X_with_dft, i, axis=0)
        y_train = np.delete(y_with_dft, i)
        X_test = X_with_dft[i].reshape(1, -1)
        y_test = y_with_dft[i]
        dft_test = dft_vals[i]

        X_train = np.concatenate((X_train, X_no_dft))
        y_train = np.concatenate((y_train, y_no_dft))
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        # The e_iso_pi rows carry output index zero in the extra column
        X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
        X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

        X_augmented = np.vstack((X_train, X_aux))
        Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))), Y_aux))

        y_test = np.hstack((y_test, np.zeros_like(y_test)))

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        #set_trainable(k.variance, False)

        # Coregion kernel, acting on the appended output-index column
        coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
                                                     gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])

        # now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik)

        # fit the covariance function parameters
        maxiter = ci_niter(1000)
        gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",)
        print_summary(m)

        # Output RMSE on Train Set. No output scaling is applied in this function, so the
        # "standardised" and plain values are the same number.

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = train_rmse_stan
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # mean GP prediction

        y_pred, _ = m.predict_f(X_test)

        # Output MAE for this trial

        mae = abs(y_test[:, 0] - y_pred)

        print("MAE: {}".format(mae))

        # Store values in order to compute the mean and standard error of the statistics across trials

        mae_list.append(mae)

        # DFT prediction scores on the same trial

        dft_mae = abs(y_test[:, 0] - dft_test)

        dft_mae_list.append(dft_mae)

    mae_list = np.array(mae_list)
    dft_mae_list = np.array(dft_mae_list)

    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
def main(path, task, representation, use_pca, n_trials, test_set_size):
    """
    Evaluate a random forest regressor with fixed hyperparameters over repeated random train/test splits.

    For each trial the data is split with the trial index as random seed, passed through transform_data,
    and a RandomForestRegressor is fitted. Train RMSE (standardised and in the original units) and test
    R^2, RMSE and MAE are printed per trial; the mean and standard error of the test metrics across
    trials are printed at the end.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set.
    """

    smiles_list, y = TaskDataLoader(task, path).load_property_data()
    X = featurise_mols(smiles_list, representation)

    # Number of principal components handed to transform_data; None when PCA is switched off
    n_components = 50 if use_pca else None

    r2_scores, rmse_scores, mae_scores = [], [], []

    print('\nBeginning training loop...')

    for seed in range(n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=seed)

        # transform_data receives the targets as column vectors
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)
        X_train, y_train, X_test, y_test, y_scaler = transform_data(
            X_train, y_train, X_test, y_test, n_components, use_pca)

        forest = RandomForestRegressor(n_estimators=1519,
                                       random_state=4,
                                       max_features=0.086,
                                       bootstrap=False,
                                       min_samples_leaf=2)
        forest.fit(X_train, y_train)

        # Train-set error, first in the scaled target space and then mapped back through y_scaler

        train_pred = forest.predict(X_train)
        train_mse_stan = mean_squared_error(y_train, train_pred)
        train_mse = mean_squared_error(y_scaler.inverse_transform(y_train),
                                       y_scaler.inverse_transform(train_pred))
        print("\nStandardised Train RMSE: {:.3f}".format(np.sqrt(train_mse_stan)))
        print("Train RMSE: {:.3f}".format(np.sqrt(train_mse)))

        # Test-set predictions and targets are both mapped back through y_scaler before scoring

        test_pred = y_scaler.inverse_transform(forest.predict(X_test))
        y_test = y_scaler.inverse_transform(y_test)

        trial_r2 = r2_score(y_test, test_pred)
        trial_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
        trial_mae = mean_absolute_error(y_test, test_pred)

        print("\nR^2: {:.3f}".format(trial_r2))
        print("RMSE: {:.3f}".format(trial_rmse))
        print("MAE: {:.3f}".format(trial_mae))

        r2_scores.append(trial_r2)
        rmse_scores.append(trial_rmse)
        mae_scores.append(trial_mae)

    # Mean and standard error (std / sqrt(number of trials)) of each metric across trials

    summary = (
        ("\nmean R^2: {:.4f} +- {:.4f}", np.array(r2_scores)),
        ("mean RMSE: {:.4f} +- {:.4f}", np.array(rmse_scores)),
        ("mean MAE: {:.4f} +- {:.4f}\n", np.array(mae_scores)),
    )
    for template, values in summary:
        print(template.format(np.mean(values),
                              np.std(values) / np.sqrt(len(values))))
Beispiel #10
0
def main(path, task, representation, use_pca, n_trials, test_set_size):
    """
    Fit a Tanimoto-kernel Gaussian process regressor over repeated random train/test splits and time
    its prediction on a replicated copy of the test set.

    For each trial the targets are standardised through transform_data (the inputs are used as
    featurised), a GaussianProcessRegressor is fitted, the wall-clock time of predicting on the test
    set stacked 10000 times is printed, and train RMSE plus test R^2, RMSE and MAE are printed. The
    mean and standard error of the test metrics across trials are printed at the end.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    # Imported once at function scope (only needed for the prediction timing) instead of on every trial
    import time

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    if use_pca:
        n_components = 100
    else:
        n_components = None

    # Number of stacked copies of the test set used for the prediction timing
    n_timing_copies = 10000

    r2_list = []
    rmse_list = []
    mae_list = []

    print('\nBeginning training loop...')

    for i in range(0, n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(
            X_train,
            y_train,
            X_test,
            y_test,
            n_components=n_components,
            use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        gp_kernel = TanimotoKernel()
        gpr = GaussianProcessRegressor(kernel=gp_kernel)
        gpr.fit(X_train, y_train)

        # Time the mean GP prediction on the replicated test set. The replicated inputs live under
        # their own name so that X_test keeps one row per entry of y_test.

        X_test_tiled = np.tile(X_test, (n_timing_copies, 1))

        start = time.time()

        y_pred_tiled = gpr.predict(X_test_tiled, return_std=False)

        end = time.time()
        print(f'time elapsed is {end - start}')

        # Release the replicated inputs straight away; otherwise they would still be alive when the
        # next trial builds its own replicated copy.
        del X_test_tiled

        # np.tile stacks whole copies of X_test, so the first len(X_test) predictions are the ones
        # for the original test rows. Scoring all replicated predictions against y_test would fail
        # because the sample counts differ.
        y_pred = y_pred_tiled[:len(X_test)]

        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Output Standardised RMSE and RMSE on Train Set

        y_pred_train = gpr.predict(X_train, return_std=False)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_scaler.inverse_transform(y_train),
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    # Mean and standard error (std / sqrt(number of trials)) of each metric across trials

    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))
def main(path, task, representation, use_pca, n_trials, test_set_size):
    """
    Train a multioutput GP simultaneously on all tasks of the photoswitch dataset.

    The task named by ``task`` is split into train and test sets in every trial, while the other three
    tasks contribute all of their data points to the training set. Inputs carry an extra column with
    the output index (0 to 3) and targets carry an extra column selecting the likelihood. Test R^2,
    RMSE and MAE are printed per trial, followed by their mean and standard error across trials.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n']
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    """

    def with_output_index(features, index):
        # Append a column holding the output index to a feature matrix
        return np.append(features, np.ones((len(features), 1)) * index, axis=1)

    def with_likelihood_index(targets, index):
        # Append a column holding the likelihood index to a column of targets
        return np.hstack((targets, np.ones_like(targets) * index))

    # Assigned but not read again anywhere in this function
    n_components = 100 if use_pca else None

    # The position of a task in this tuple is its output index in the coregionalised model
    task_names = ('e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n')

    loaders = [TaskDataLoader(name, path) for name in task_names]
    loaded = [loader.load_property_data() for loader in loaders]
    y_all = [y_values.reshape(-1, 1) for _, y_values in loaded]
    X_all = [featurise_mols(smiles, representation) for smiles, _ in loaded]

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_all[0][0, :])

    # active dims for Tanimoto base kernel.
    tanimoto_active_dims = list(range(feature_dim))

    # Output index of the evaluated task; any name other than the first three selects 'z_iso_n'
    task_index = len(task_names) - 1
    for index, name in enumerate(task_names):
        if task == name:
            task_index = index
            break

    r2_scores, rmse_scores, mae_scores = [], [], []

    print('\nBeginning training loop...')

    for seed in range(n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X_all[task_index],
            y_all[task_index],
            test_size=test_set_size,
            random_state=seed)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        # Stack the four tasks in output-index order. The evaluated task contributes only its
        # training split; every other task contributes all of its data.
        X_parts = []
        Y_parts = []
        for index in range(len(task_names)):
            if index == task_index:
                features, targets = X_train, y_train
            else:
                features, targets = X_all[index], y_all[index]
            X_parts.append(with_output_index(features, index))
            Y_parts.append(with_likelihood_index(targets, index))

        X_augmented = np.vstack(X_parts)
        Y_augmented = np.vstack(Y_parts)

        # The held-out and training inputs of the evaluated task get the same index column
        X_test = with_output_index(X_test, task_index)
        X_train = with_output_index(X_train, task_index)
        y_test = with_likelihood_index(y_test, task_index)

        # Base kernel
        k = Tanimoto(active_dims=tanimoto_active_dims)
        #set_trainable(k.variance, False)

        # Coregion kernel acting on the appended output-index column
        coreg = gpflow.kernels.Coregion(output_dim=output_dim,
                                        rank=rank,
                                        active_dims=[feature_dim])

        # Create product kernel
        kern = k * coreg

        # This likelihood switches between Gaussian noise with different variances for each f_i:
        lik = gpflow.likelihoods.SwitchedLikelihood(
            [gpflow.likelihoods.Gaussian() for _ in range(output_dim)])

        # now build the GP model as normal
        m = gpflow.models.VGP((X_augmented, Y_augmented),
                              mean_function=Constant(np.mean(y_train[:, 0])),
                              kernel=kern,
                              likelihood=lik)

        # fit the covariance function parameters
        gpflow.optimizers.Scipy().minimize(
            m.training_loss,
            m.trainable_variables,
            options=dict(maxiter=ci_niter(1000)),
            method="L-BFGS-B",
        )
        print_summary(m)

        # mean and variance GP prediction

        y_pred, y_var = m.predict_f(X_test)

        # Train-set RMSE. The targets are not rescaled in this function, so the figure printed under
        # both labels is the same quantity.

        y_pred_train, _ = m.predict_f(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # Column 0 of y_test holds the targets; column 1 is the likelihood index
        y_true = y_test[:, 0]
        trial_r2 = r2_score(y_true, y_pred)
        trial_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        trial_mae = mean_absolute_error(y_true, y_pred)

        print("\nR^2: {:.3f}".format(trial_r2))
        print("RMSE: {:.3f}".format(trial_rmse))
        print("MAE: {:.3f}".format(trial_mae))

        r2_scores.append(trial_r2)
        rmse_scores.append(trial_rmse)
        mae_scores.append(trial_mae)

        # Learned covariance between the outputs
        B = coreg.output_covariance().numpy()
        print("B =", B)
        _ = plt.imshow(B)

    # Mean and standard error (std / sqrt(number of trials)) of each metric across trials

    summary = (
        ("\nmean R^2: {:.4f} +- {:.4f}", np.array(r2_scores)),
        ("mean RMSE: {:.4f} +- {:.4f}", np.array(rmse_scores)),
        ("mean MAE: {:.4f} +- {:.4f}\n", np.array(mae_scores)),
    )
    for template, values in summary:
        print(template.format(np.mean(values),
                              np.std(values) / np.sqrt(len(values))))
Beispiel #12
0
def main(path, task, representation, use_pca, n_trials, test_set_size,
         use_rmse_conf, precompute_repr):
    """
    Train a BB_alpha model over repeated random train/test splits and plot its confidence-error and
    calibration curves.

    With the 'SMILES' representation the function only writes each train/test split to text files.
    Otherwise it trains a model per split, prints train and test metrics, and afterwards prints the
    mean and standard deviation of the test metrics, saves and shows the confidence-error curve and
    the calibration curve, and writes the calibration statistics to text files.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
    :param representation: str specifying the molecular representation. One of ['SMILES, fingerprints, 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param n_trials: int specifying number of random train/test splits to use
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence-
    error curves. True is the option for rmse.
    :param precompute_repr: bool indicating whether to precompute representations or not.
    """

    smiles_list, y = TaskDataLoader(task, path).load_property_data()
    X = featurise_mols(smiles_list, representation)

    if precompute_repr:
        repr_path = f'precomputed_representations/{task}_{representation}.txt'
        if representation == 'SMILES':
            # One SMILES string per line
            with open(repr_path, 'w') as f:
                f.writelines(smiles + '\n' for smiles in X)
        else:
            np.savetxt(repr_path, X)

    # If True we perform Principal Components Regression
    n_components = 100 if use_pca else None

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves. A throwaway split is used only to
    # learn how many test points each trial will have.

    _, _, _, y_test = train_test_split(X,
                                       y,
                                       test_size=test_set_size,
                                       random_state=42)

    # Photoswitch dataset requires 80/20 splitting. Other datasets are 80/10/10.
    n_test = len(y_test) if task == 'Photoswitch' else int(len(y_test) / 2)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    # For Calibration curve: one list of coverage proportions per trial
    prediction_prop = [[] for _ in range(n_trials)]

    # Interval half-widths, in predictive standard deviations, evaluated for the calibration curve
    interval_widths = (0.13, 0.26, 0.39, 0.53, 0.68, 0.85, 1.04, 1.15, 1.28,
                       1.44, 1.645, 1.96)

    # Model and optimiser settings passed to BB_alpha
    n_samples = 100
    alpha = 0.5
    learning_rate = 0.01
    v_prior = 1.0
    batch_size = 32

    print('\nBeginning training loop...')

    for i in range(n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        if representation == 'SMILES':
            # Only the splits are written out for SMILES; no model is trained on them here
            np.savetxt(f'fixed_train_test_splits/{task}/X_train_split_{i}.txt',
                       X_train,
                       fmt="%s")
            np.savetxt(f'fixed_train_test_splits/{task}/X_test_split_{i}.txt',
                       X_test,
                       fmt="%s")
            np.savetxt(f'fixed_train_test_splits/{task}/y_train_split_{i}.txt',
                       y_train)
            np.savetxt(f'fixed_train_test_splits/{task}/y_test_split_{i}.txt',
                       y_test)
            continue

        if task != 'Photoswitch':
            # Artificially create a 80/10/10 train/validation/test split discarding the validation set.
            half = int(len(y_test) / 2)
            X_test = X_test[:half]
            y_test = y_test[:half]

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        #  We standardise the outputs but leave the inputs unchanged

        _, y_train, _, y_test, y_scaler = transform_data(
            X_train,
            y_train,
            X_test,
            y_test,
            n_components=n_components,
            use_pca=use_pca)

        X_train = X_train.astype(np.float64)
        X_test = X_test.astype(np.float64)

        np.random.seed(42)

        datasets, _, d, mean_y_train, std_y_train = load_reg_data(
            X_train, y_train, X_test, y_test)

        train_set_x, train_set_y = datasets[0]
        test_set_x, test_set_y = datasets[1]

        N_train = train_set_x.get_value(borrow=True).shape[0]
        N_test = test_set_x.get_value(borrow=True).shape[0]
        layer_sizes = [d, 20, 20, len(mean_y_train)]

        print('... building model')
        sys.stdout.flush()
        bb_alpha = BB_alpha(layer_sizes, n_samples, alpha, learning_rate,
                            v_prior, batch_size, train_set_x, train_set_y,
                            N_train, test_set_x, test_set_y, N_test,
                            mean_y_train, std_y_train)
        print('... training')
        sys.stdout.flush()

        test_error, test_ll = bb_alpha.train_ADAM(100)

        print('Test RMSE: ', test_error)
        print('Test ll: ', test_ll)

        # Predictive mean and variance are taken across the drawn samples
        samples = bb_alpha.sample_predictive_distribution(X_test)
        y_pred = np.mean(samples, axis=0)
        var = np.var(samples, axis=0)

        # For producing the calibration curve: fraction of test targets that fall strictly inside
        # the interval y_pred +- width * sqrt(var), compared after mapping back through y_scaler

        for width in interval_widths:
            below_upper = (y_scaler.inverse_transform(y_test) <
                           y_scaler.inverse_transform(y_pred + width * np.sqrt(var)))
            above_lower = (y_scaler.inverse_transform(y_test) >
                           y_scaler.inverse_transform(y_pred - width * np.sqrt(var)))
            n_inside = np.argwhere(below_upper & above_lower).shape[0]
            prediction_prop[i].append(n_inside / len(y_test))

        # We transform the standardised predictions back to the original data space

        y_pred = y_scaler.inverse_transform(y_pred)
        y_test = y_scaler.inverse_transform(y_test)

        # Compute scores for confidence curve plotting: test indices ordered by increasing
        # predictive variance, scored on growing prefixes of that ordering

        ranked_confidence_list = np.argsort(var, axis=0).flatten()

        for n_kept in range(1, len(y_test) + 1):
            conf = ranked_confidence_list[:n_kept]

            # RMSE and MAE over the n_kept lowest-variance predictions
            rmse_confidence_list[i, n_kept - 1] = np.sqrt(
                mean_squared_error(y_test[conf], y_pred[conf]))
            mae_confidence_list[i, n_kept - 1] = mean_absolute_error(
                y_test[conf], y_pred[conf])

        # Output Standardised RMSE and RMSE on Train Set

        train_samples = bb_alpha.sample_predictive_distribution(X_train)
        y_pred_train = np.mean(train_samples, axis=0)

        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_scaler.inverse_transform(y_train),
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

    # Nothing to summarise or plot for SMILES, where the loop only wrote the splits to disk
    if representation == 'SMILES':
        return

    # Mean and standard deviation of each metric across trials

    summary = (
        ("\nmean R^2: {:.4f} +- {:.4f}", np.array(r2_list)),
        ("mean RMSE: {:.4f} +- {:.4f}", np.array(rmse_list)),
        ("mean MAE: {:.4f} +- {:.4f}\n", np.array(mae_list)),
    )
    for template, values in summary:
        print(template.format(np.mean(values), np.std(values)))

    # Plot confidence-error curves

    confidence_percentiles = np.arange(
        1e-14, 100, 100 / len(y_test)
    )  # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29

    # The RMSE and MAE curves differ only in their data, axis label and output file name
    if use_rmse_conf:
        curve_values = rmse_confidence_list
        curve_label = 'RMSE'
        curve_file = '/results/BNN/{}_{}_confidence_curve_rmse.png'
    else:
        curve_values = mae_confidence_list
        curve_label = 'MAE'
        curve_file = '/results/BNN/{}_{}_confidence_curve_mae.png'

    # We flip because we want the most confident predictions on the right-hand side of the plot

    curve_mean = np.flip(np.mean(curve_values, axis=0))
    curve_std = np.flip(np.std(curve_values, axis=0))

    # One-sigma error bars

    lower = curve_mean - curve_std
    upper = curve_mean + curve_std

    plt.plot(confidence_percentiles, curve_mean, label='mean')
    plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
    plt.xlabel('Confidence Percentile')
    plt.ylabel(curve_label)
    plt.ylim([0, np.max(upper) + 1])
    plt.xlim([0, 100 * ((len(y_test) - 1) / len(y_test))])
    plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
    plt.savefig(task + curve_file.format(representation, task))
    plt.show()

    # Plot the calibration curve: observed coverage against nominal level q, with the diagonal
    # drawn in red for reference

    mean_props = np.mean(prediction_prop, axis=0)
    sd_props = np.std(prediction_prop, axis=0)
    lower = mean_props - sd_props
    upper = mean_props + sd_props
    qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
    plt.plot(qs, mean_props, label='mean')
    plt.fill_between(qs, lower, upper, alpha=0.2)
    plt.plot(qs, qs, color="red")
    plt.xlabel('q')
    plt.ylabel('C(q)')
    plt.savefig(task + '/results/BNN/{}_{}_calibration_curve.png'.format(
        representation, task))
    plt.show()

    np.savetxt(
        task +
        '/results/BNN/{}_{}_mean_props'.format(representation, task),
        mean_props)
    np.savetxt(
        task + '/results/BNN/{}_{}_sd_props'.format(representation, task),
        sd_props)
Beispiel #13
0
def main(path, path_to_large_dataset, task, representation, test_set_size,
         augment_photo_dataset, n_trials):
    """
    Train a random forest for the generalization experiment and print R^2, RMSE and MAE.

    If augment_photo_dataset is False, the model is trained on the large comparison dataset and
    tested once on the whole photoswitch dataset (n_trials is forced to 1). Otherwise the photoswitch
    dataset is split n_trials times and the large comparison dataset is appended to every training split.

    :param path: str giving path to the photoswitches.csv file.
    :param path_to_large_dataset: str giving path to paper_allDB.csv file
    :param task: str specifying the task. Always e_iso_pi for the generalization experiment
    :param representation: str specifying the molecular representation. One of [fingerprints, fragments, fragprints].'
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :param augment_photo_dataset: If True augment the photoswitch dataset with the Beard et al. 2019 dataset
    :param n_trials: int specifying the number of random train/test splits.
    """

    data_loader = TaskDataLoader(task, path)

    photo_smiles_list, y_vals_photo = data_loader.load_property_data()
    beard_smiles_list, y_vals_beard = data_loader.load_large_comparison_data(
        path_to_large_dataset)

    r2_list = []
    rmse_list = []
    mae_list = []

    if not augment_photo_dataset:
        # test set is now fixed, so repeated trials would be identical
        n_trials = 1
        # We train on the Beard dataset and test on the photoswitch dataset
        X_train = featurise_mols(beard_smiles_list, representation)
        X_test = featurise_mols(photo_smiles_list, representation)
        y_train = y_vals_beard
        y_test = y_vals_photo

    for i in range(n_trials):

        if augment_photo_dataset:
            # We split the SMILES (not the features) so that the Beard dataset can be
            # appended as additional training data before featurisation.
            X_train, X_test, y_train, y_test = train_test_split(
                photo_smiles_list,
                y_vals_photo,
                test_size=test_set_size,
                random_state=i)
            X_train = X_train + beard_smiles_list
            y_train = np.concatenate((y_train, y_vals_beard))
            X_train = featurise_mols(X_train, representation)
            X_test = featurise_mols(X_test, representation)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)
        X_train, y_train, X_test, y_test, y_scaler = transform_data(
            X_train, y_train, X_test, y_test)

        regr_rf = RandomForestRegressor(n_estimators=1000,
                                        max_depth=300,
                                        random_state=2)
        # The forest expects a 1-D target; passing the (n, 1) column raises a
        # DataConversionWarning on every fit.
        regr_rf.fit(X_train, y_train.ravel())

        # Output Standardised RMSE and RMSE on Train Set.
        # predict() returns a 1-D array; reshape to a column so that it matches the
        # 2-D layout the scaler was given for the targets.

        y_pred_train = regr_rf.predict(X_train).reshape(-1, 1)
        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_scaler.inverse_transform(y_train),
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        # Predict on new data and score in the original (unscaled) units
        y_rf = regr_rf.predict(X_test).reshape(-1, 1)
        y_rf = y_scaler.inverse_transform(y_rf)
        y_test = y_scaler.inverse_transform(y_test)
        score = r2_score(y_test, y_rf)
        rmse = np.sqrt(mean_squared_error(y_test, y_rf))
        mae = mean_absolute_error(y_test, y_rf)

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    # Report the mean over trials together with the standard error of the mean
    print("\nmean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list))))
    print("mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list))))
    print("mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list))))
def main(path, representation):
    """
    Fit a multioutput Gaussian process (Tanimoto kernel x coregionalisation kernel) on all four
    photoswitch tasks and evaluate the e_iso_pi predictions on five held-out molecules.

    The e_iso_pi data minus the five test molecules is output 0; the z_iso_pi, e_iso_n and z_iso_n
    data are outputs 1, 2 and 3. Targets are used in their original units (no standardisation).

    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    """

    task = 'e_iso_pi'  # task always e_iso_pi with human performance comparison
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # 5 test molecules:
    #   'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1'
    #   'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1'
    #   'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2'
    #   'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC'
    #   'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC'
    # and their indices in the loaded data. This list is the single source of truth
    # for both the train/test removal and the test selection below.
    test_smiles_indices = [116, 131, 168, 221, 229]

    X_train = np.delete(X, test_smiles_indices, axis=0)
    y_train = np.delete(y, test_smiles_indices)
    X_test = X[test_smiles_indices]

    # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was
    # under a different solvent
    y_test = y[test_smiles_indices]  # fancy indexing copies, so y itself is not modified
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    # Load the three auxiliary tasks that are modelled jointly with e_iso_pi
    smiles_list_z_iso_pi, y_z_iso_pi = TaskDataLoader(
        'z_iso_pi', path).load_property_data()
    smiles_list_e_iso_n, y_e_iso_n = TaskDataLoader(
        'e_iso_n', path).load_property_data()
    smiles_list_z_iso_n, y_z_iso_n = TaskDataLoader(
        'z_iso_n', path).load_property_data()

    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
    y_z_iso_n = y_z_iso_n.reshape(-1, 1)

    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)

    output_dim = 4  # Number of outputs
    rank = 1  # Rank of W
    feature_dim = len(X_train[0, :])

    # active dims for Tanimoto base kernel: every column except the appended output index
    tanimoto_active_dims = list(range(feature_dim))

    # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
    X_augmented = np.vstack((np.append(X_train,
                                       np.zeros((len(X_train), 1)),
                                       axis=1),
                             np.append(X_z_iso_pi,
                                       np.ones((len(X_z_iso_pi), 1)),
                                       axis=1),
                             np.append(X_e_iso_n,
                                       np.ones((len(X_e_iso_n), 1)) * 2,
                                       axis=1),
                             np.append(X_z_iso_n,
                                       np.ones((len(X_z_iso_n), 1)) * 3,
                                       axis=1)))

    # Both evaluation sets belong to output 0 (e_iso_pi)
    X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
    X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)

    # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
    Y_augmented = np.vstack(
        (np.hstack((y_train, np.zeros_like(y_train))),
         np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
         np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
         np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))

    y_test = np.hstack((y_test, np.zeros_like(y_test)))

    # Base kernel
    k = Tanimoto(active_dims=tanimoto_active_dims)
    # set_trainable(k.variance, False)

    # Coregion kernel acts only on the appended output-index column
    coreg = gpflow.kernels.Coregion(output_dim=output_dim,
                                    rank=rank,
                                    active_dims=[feature_dim])

    # Create product kernel
    kern = k * coreg

    # This likelihood switches between Gaussian noise with different variances for each f_i:
    lik = gpflow.likelihoods.SwitchedLikelihood(
        [gpflow.likelihoods.Gaussian() for _ in range(output_dim)])

    # now build the GP model as normal
    m = gpflow.models.VGP((X_augmented, Y_augmented),
                          mean_function=Constant(np.mean(y_train[:, 0])),
                          kernel=kern,
                          likelihood=lik)

    # fit the covariance function parameters
    maxiter = ci_niter(1000)
    gpflow.optimizers.Scipy().minimize(
        m.training_loss,
        m.trainable_variables,
        options=dict(maxiter=maxiter),
        method="L-BFGS-B",
    )
    print_summary(m)

    # mean and variance GP prediction; only the mean is used below
    y_pred, _ = m.predict_f(X_test)
    y_pred = np.asarray(y_pred)[:, 0]
    y_true = y_test[:, 0]

    # Output RMSE on Train Set. The targets are not standardised in this script, so the
    # "standardised" and plain train RMSE coincide; both lines are kept so that the
    # printed output stays unchanged.

    y_pred_train, _ = m.predict_f(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse))
    print("Train RMSE: {:.3f}".format(train_rmse))

    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # Element-wise error on matching 1-D arrays (no (5, 5) broadcast + diagonal needed)
    per_molecule = np.abs(y_pred - y_true)

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
Beispiel #15
0
def main(path, representation):
    """
    Human-performance comparison: fit a single-task GP regressor with a Tanimoto kernel on the
    e_iso_pi data and report its errors on five held-out molecules.

    :param path: str specifying path to dataset.
    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
    """

    # The human performance comparison is only defined for the e_iso_pi task
    task = 'e_iso_pi'
    loader = TaskDataLoader(task, path)
    smiles_list, y = loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # The 5 held-out test molecules ...
    test_smiles = [
        'BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1',
        'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2',
        'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC',
        'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC'
    ]
    # ... and where they sit in the loaded data
    test_smiles_indices = [116, 131, 168, 221, 229]
    held_out = np.array(test_smiles_indices)

    # Test rows are picked out first, then removed from the training arrays
    X_test = X[test_smiles_indices]
    y_test = y[test_smiles_indices]
    X_train = np.delete(X, held_out, axis=0)
    y_train = np.delete(y, held_out)

    # Experimental wavelength in EtOH: the main csv file lists 400nm rather than 407nm
    # because that measurement was taken under a different solvent.
    y_test[2] = 407.

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Only the transformed targets are kept; the transformed inputs returned by
    # transform_data are discarded and the raw features are cast to float64 instead.
    _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test,
                                                     y_test)
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    num_features = np.shape(X)[1]

    # containers for confidence-error curves
    rmse_confidence_list = []
    mae_confidence_list = []

    # Gaussian Process Regression model with the Tanimoto kernel
    kernel = Tanimoto()
    m = gpflow.models.GPR(data=(X_train, y_train),
                          mean_function=Constant(np.mean(y_train)),
                          kernel=kernel,
                          noise_variance=1)

    def negative_log_marginal_likelihood():
        return -m.log_marginal_likelihood()

    # Optimise the kernel variance and noise level by the marginal likelihood
    optimiser = gpflow.optimizers.Scipy()
    optimiser.minimize(negative_log_marginal_likelihood,
                       m.trainable_variables,
                       options=dict(maxiter=100))
    print_summary(m)

    # GP predictive mean and variance on the test molecules, mapped back through y_scaler
    y_pred, y_var = m.predict_f(X_test)
    y_pred = y_scaler.inverse_transform(y_pred)
    y_test = y_scaler.inverse_transform(y_test)

    # Train-set RMSE, both on the transformed targets and after inverting y_scaler
    y_pred_train, _ = m.predict_f(X_train)
    train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
    y_train_original = y_scaler.inverse_transform(y_train)
    y_pred_train_original = y_scaler.inverse_transform(y_pred_train)
    train_rmse = np.sqrt(
        mean_squared_error(y_train_original, y_pred_train_original))
    print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
    print("Train RMSE: {:.3f}".format(train_rmse))

    # Test-set statistics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    per_molecule = np.abs(y_pred - y_test)

    print("\n Averaged test statistics are")
    print("\nR^2: {:.3f}".format(r2))
    print("RMSE: {:.3f}".format(rmse))
    print("MAE: {:.3f}".format(mae))
    print("\nAbsolute error per molecule is {} ".format(per_molecule))
Beispiel #16
0
def main(path, task, representation, use_pca, test_set_size):
    """
    Grid-search the black-box alpha BNN hyperparameters (layer size, learning rate, batch size,
    training iterations) and write the configuration with the lowest held-out RMSE to
    cross_val_hypers/<task>/BNN/hypers_<representation>.txt.

    :param path: str specifying path to dataset.
    :param task: str specifying the task. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
    :param representation: str specifying the molecular representation. One of [fingerprints, 'fragments', 'fragprints']
    :param use_pca: bool. If True apply PCA to perform Principal Components Regression.
    :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set
    :raises ValueError: if representation is 'SMILES'.
    """
    import itertools
    import os

    if representation == 'SMILES':
        raise ValueError('SMILES is not a valid representation for the BNN')

    # Create the output directory up front so that a missing directory cannot
    # discard the result of the whole search at the very end.
    results_dir = f'cross_val_hypers/{task}/BNN'
    os.makedirs(results_dir, exist_ok=True)

    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    n_components = 100 if use_pca else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_set_size, random_state=42)

    if task != 'Photoswitch':
        # Artificially create a 80/10/10 train/validation/test split discarding the validation set.
        split_in_two = int(len(y_test) / 2)
        X_test = X_test[0:split_in_two]
        y_test = y_test[0:split_in_two]

    else:
        # We subdivide the train set in order to run cross-validation.
        X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.1,
                                                            random_state=42)

    y_train = y_train.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Only the transformed targets are kept; the inputs returned by transform_data are
    # discarded and the raw features are used.

    _, y_train, _, y_test, y_scaler = transform_data(X_train,
                                                     y_train,
                                                     X_test,
                                                     y_test,
                                                     n_components=n_components,
                                                     use_pca=use_pca)

    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)

    datasets, n, d, mean_y_train, std_y_train = load_reg_data(
        X_train, y_train, X_test, y_test)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[1]

    # Hyperparameter grid
    layer_sizes = [10, 20]
    learning_rates = [0.01, 0.001]
    batch_sizes = [16, 32, 64]
    iters = [20, 50, 100]

    # Settings shared by every configuration
    n_samples = 100
    alpha = 0.5
    v_prior = 1.0

    # Loop-invariant quantities, computed once instead of once per configuration
    N_train = train_set_x.get_value(borrow=True).shape[0]
    N_test = test_set_x.get_value(borrow=True).shape[0]
    n_outputs = len(mean_y_train)
    y_train_unscaled = y_scaler.inverse_transform(y_train)
    y_test_unscaled = y_scaler.inverse_transform(y_test)

    best_rmse = float('inf')
    best_params = {'layer_size': 0, 'lr': 0, 'batch_size': 0, 'iterations': 0}

    # itertools.product visits the grid in the same order as the equivalent nested loops
    for layer_size, lr, batch_size, iteration in itertools.product(
            layer_sizes, learning_rates, batch_sizes, iters):

        # Two hidden layers of equal width. Named separately from `layer_sizes`
        # so that the grid being iterated is not rebound.
        network_layer_sizes = [d, layer_size, layer_size, n_outputs]

        print('... building model')
        sys.stdout.flush()
        bb_alpha = BB_alpha(network_layer_sizes, n_samples, alpha, lr,
                            v_prior, batch_size, train_set_x, train_set_y,
                            N_train, test_set_x, test_set_y, N_test,
                            mean_y_train, std_y_train)
        print('... training')
        sys.stdout.flush()

        test_error, test_ll = bb_alpha.train_ADAM(iteration)

        print('Test RMSE: ', test_error)
        print('Test ll: ', test_ll)

        # Point prediction = mean over the sampled predictive distribution
        samples = bb_alpha.sample_predictive_distribution(X_test)
        y_pred = np.mean(samples, axis=0)
        y_pred_unscaled = y_scaler.inverse_transform(y_pred)

        # Output Standardised RMSE and RMSE on Train Set

        train_samples = bb_alpha.sample_predictive_distribution(X_train)
        y_pred_train = np.mean(train_samples, axis=0)

        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
        train_rmse = np.sqrt(
            mean_squared_error(y_train_unscaled,
                               y_scaler.inverse_transform(y_pred_train)))
        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
        print("Train RMSE: {:.3f}".format(train_rmse))

        score = r2_score(y_test_unscaled, y_pred_unscaled)
        rmse = np.sqrt(mean_squared_error(y_test_unscaled, y_pred_unscaled))
        mae = mean_absolute_error(y_test_unscaled, y_pred_unscaled)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        # Keep the configuration with the lowest held-out RMSE seen so far
        if rmse < best_rmse:
            best_rmse = rmse
            best_params['lr'] = lr
            best_params['batch_size'] = batch_size
            best_params['iterations'] = iteration
            best_params['layer_size'] = layer_size
        print('Best parameters are \n')
        print(best_params)

    print('Final best parameters are \n')
    print(best_params)

    with open(f'{results_dir}/hypers_{representation}.txt', 'w') as f:
        f.write(str(best_params))
def _save_confidence_curve(confidence_percentiles, confidence_errors, n_test,
                           ylabel, filename):
    """
    Plot a mean +- one-sigma confidence-error curve on a fresh figure and save it to filename.

    :param confidence_percentiles: x-axis values, one per test point.
    :param confidence_errors: array of shape (n_trials, n_test) holding the error at each confidence level.
    :param n_test: int giving the number of test points (sets the x-axis limit).
    :param ylabel: str used as the y-axis label.
    :param filename: str giving the path of the image file to write.
    """

    # We flip because we want the most confident predictions on the right-hand side of the plot
    error_mean = np.flip(np.mean(confidence_errors, axis=0))
    error_std = np.flip(np.std(confidence_errors, axis=0))

    # One-sigma error bars
    lower = error_mean - error_std
    upper = error_mean + error_std

    # A new figure per curve: otherwise successive curves pile up on pyplot's
    # implicit current figure and end up in each other's image files.
    plt.figure()
    plt.plot(confidence_percentiles, error_mean, label='mean')
    plt.fill_between(confidence_percentiles, lower, upper, alpha=0.2)
    plt.xlabel('Confidence Percentile')
    plt.ylabel(ylabel)
    plt.ylim([0, np.max(upper) + 1])
    plt.xlim([0, 100 * ((n_test - 1) / n_test)])
    plt.yticks(np.arange(0, np.max(upper) + 1, 5.0))
    plt.savefig(filename)
    plt.close()


def main(task, path, representation, use_pca, n_trials, test_set_size,
         batch_size, lr, iterations, r_size, det_encoder_hidden_size,
         det_encoder_n_hidden, lat_encoder_hidden_size, lat_encoder_n_hidden,
         decoder_hidden_size, decoder_n_hidden):
    """
    Train and evaluate an Attentive Neural Process over n_trials random train/test splits, saving
    per-split predictions, a summary text file and the RMSE / MAE confidence-error curves under
    <task>/results/anp/.

    :param task: str specifying the task name. One of [e_iso_pi, e_iso_n, z_iso_pi, z_iso_n]
    :param path: str specifying the path to the photoswitches.csv file
    :param representation: str specifying the representation. One of [fingerprints, fragments, fragprints]
    :param use_pca: bool specifying whether or not to use PCA to perform Principal Components Regression
    :param n_trials: int specifying the number of random train/test splits.
    :param test_set_size: float specifying the train/test split ratio. e.g. 0.2 is 80/20 train/test split
    :param batch_size: int specifying the number of samples to take of the context set, given the number of
    context points that should be selected.
    :param lr: float specifying the learning rate.
    :param iterations: int specifying the number of training iterations
    :param r_size: Dimensionality of context encoding r.
    :param det_encoder_hidden_size: Dimensionality of deterministic encoder hidden layers.
    :param det_encoder_n_hidden: Number of deterministic encoder hidden layers.
    :param lat_encoder_hidden_size: Dimensionality of latent encoder hidden layers.
    :param lat_encoder_n_hidden: Number of latent encoder hidden layers.
    :param decoder_hidden_size: Dimensionality of decoder hidden layers.
    :param decoder_n_hidden: Number of decoder hidden layers.
    :return:
    """

    path_to_save = task + '/results/anp/'
    data_loader = TaskDataLoader(task, path)
    smiles_list, y = data_loader.load_property_data()
    y_size = 1

    # Use the function argument rather than a module-level `args` object, so that
    # main() also works when it is called from other code.
    X = featurise_mols(smiles_list, representation)

    # If True we perform Principal Components Regression

    n_components = 50 if use_pca else None

    r2_list = []
    rmse_list = []
    mae_list = []

    # We pre-allocate arrays for plotting confidence-error curves.
    # A throwaway split is used only to learn the test set size.

    _, _, _, y_size_probe = train_test_split(X, y, test_size=test_set_size)
    n_test = len(y_size_probe)

    rmse_confidence_list = np.zeros((n_trials, n_test))
    mae_confidence_list = np.zeros((n_trials, n_test))

    print('\nBeginning training loop...')

    for i in range(n_trials):

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_set_size, random_state=i)

        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        # The transformed inputs and training targets are used for the model; the
        # transformed test targets are discarded, so y_test stays in its original
        # units and predictions are mapped back through y_scaler before scoring.

        X_train, y_train, X_test, _, y_scaler = transform_data(
            X_train,
            y_train,
            X_test,
            y_test,
            n_components=n_components,
            use_pca=use_pca)

        # Add a leading dimension of size 1 to every tensor passed to the model
        X_train = torch.from_numpy(X_train).float().unsqueeze(dim=0)
        X_test = torch.from_numpy(X_test).float().unsqueeze(dim=0)
        y_train = torch.from_numpy(y_train).float().unsqueeze(dim=0)

        m = AttentiveNP(x_size=X_train.shape[2],
                        y_size=y_size,
                        r_size=r_size,
                        det_encoder_hidden_size=det_encoder_hidden_size,
                        det_encoder_n_hidden=det_encoder_n_hidden,
                        lat_encoder_hidden_size=lat_encoder_hidden_size,
                        lat_encoder_n_hidden=lat_encoder_n_hidden,
                        decoder_hidden_size=decoder_hidden_size,
                        decoder_n_hidden=decoder_n_hidden,
                        lr=lr,
                        attention_type="multihead")

        print('...training.')

        m.train(X_train,
                y_train,
                batch_size=batch_size,
                iterations=iterations,
                print_freq=None)

        # Now, the context set comprises the training x / y values, the target set comprises the test x values.

        y_pred, y_var = m.predict(X_train, y_train, X_test, n_samples=100)

        y_pred = y_scaler.inverse_transform(y_pred)

        # Compute scores for confidence curve plotting.
        # Test points are ranked from lowest to highest predictive variance.

        ranked_confidence_list = np.argsort(y_var.numpy(), axis=0).flatten()

        for n_kept in range(1, n_test + 1):
            # Errors restricted to the n_kept most confident predictions
            conf = ranked_confidence_list[0:n_kept]

            rmse_confidence_list[i, n_kept - 1] = np.sqrt(
                mean_squared_error(y_test[conf], y_pred[conf]))
            mae_confidence_list[i, n_kept - 1] = mean_absolute_error(
                y_test[conf], y_pred[conf])

        # Output R^2, RMSE and MAE on the test set

        score = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)

        print("\nR^2: {:.3f}".format(score))
        print("RMSE: {:.3f}".format(rmse))
        print("MAE: {:.3f}".format(mae))

        r2_list.append(score)
        rmse_list.append(rmse)
        mae_list.append(mae)

        # Per-split outputs are indexed by the split's random seed
        np.savetxt(
            path_to_save + '_seed_' + str(i) + '_ypred_' + representation +
            '.txt', y_pred)
        np.savetxt(path_to_save + '_seed_' + str(i) + '_ytest.txt', y_test)
        np.savetxt(
            path_to_save + '_seed_' + str(i) + '_ystd_' + representation +
            '.txt', np.sqrt(y_var))

    r2_list = np.array(r2_list)
    rmse_list = np.array(rmse_list)
    mae_list = np.array(mae_list)

    # Mean over trials together with the standard error of the mean
    r2_summary = "mean R^2: {:.4f} +- {:.4f}".format(
        np.mean(r2_list),
        np.std(r2_list) / np.sqrt(len(r2_list)))
    rmse_summary = "mean RMSE: {:.4f} +- {:.4f}".format(
        np.mean(rmse_list),
        np.std(rmse_list) / np.sqrt(len(rmse_list)))
    mae_summary = "mean MAE: {:.4f} +- {:.4f}\n".format(
        np.mean(mae_list),
        np.std(mae_list) / np.sqrt(len(mae_list)))

    print("\n" + r2_summary)
    print(rmse_summary)
    print(mae_summary)

    with open(path_to_save + representation + '.txt', 'w+') as f:
        f.write('\n Representation = ' + str(representation))
        f.write('\n Task = ' + str(task))
        f.write('\n Use PCA? = ' + str(use_pca))
        f.write('\n Number of trials = {} \n'.format(n_trials))
        f.write('\n Deterministic encoder hidden size = ' +
                str(det_encoder_hidden_size))
        f.write('\n Deterministic encoder number of layers = ' +
                str(det_encoder_n_hidden))
        f.write('\n Latent encoder hidden size = ' +
                str(lat_encoder_hidden_size))
        f.write('\n Latent encoder number of layers = ' +
                str(lat_encoder_n_hidden))
        f.write('\n Decoder hidden size = ' + str(decoder_hidden_size))
        f.write('\n Decoder number of layers = ' + str(decoder_n_hidden))
        f.write('\n Latent variable size = ' + str(r_size))
        f.write('\n Batch size = {}'.format(batch_size))
        f.write('\n Learning rate = {}'.format(lr))
        f.write('\n Number of iterations = {} \n'.format(iterations))
        f.write("\n" + r2_summary)
        f.write("\n" + rmse_summary)
        f.write("\n" + mae_summary)

    # Plot confidence-error curves

    # 1e-14 instead of 0 to stop weirdness with len(y_test) = 29
    confidence_percentiles = np.arange(1e-14, 100, 100 / n_test)

    _save_confidence_curve(confidence_percentiles, rmse_confidence_list,
                           n_test, 'RMSE (nm)',
                           path_to_save + 'confidence_curve_rmse.png')

    # We plot the Mean-absolute error confidence-error curves

    _save_confidence_curve(confidence_percentiles, mae_confidence_list,
                           n_test, 'MAE (nm)',
                           path_to_save + 'confidence_curve_mae.png')