def build_predictor(net_type, fp_length, fp_depth, conv_width, h1_size, L2_reg,
                    nll_func):
    if net_type == 'mean':
        return build_mean_predictor(nll_func)
    elif net_type == 'conv_plus_linear':
        vanilla_net_params = dict(layer_sizes=[fp_length],
                                  normalize=True,
                                  L2_reg=L2_reg,
                                  nll_func=nll_func)
        conv_params = dict(num_hidden_features=[conv_width] * fp_depth,
                           fp_length=fp_length)
        return build_conv_deep_net(conv_params, vanilla_net_params)
    elif net_type == 'morgan_plus_linear':
        vanilla_net_params = dict(layer_sizes=[fp_length],
                                  normalize=True,
                                  L2_reg=L2_reg,
                                  nll_func=nll_func)
        return build_morgan_deep_net(fp_length, fp_depth, vanilla_net_params)
    elif net_type == 'conv_plus_net':
        vanilla_net_params = dict(layer_sizes=[fp_length, h1_size],
                                  normalize=True,
                                  L2_reg=L2_reg,
                                  nll_func=nll_func)
        conv_params = dict(num_hidden_features=[conv_width] * fp_depth,
                           fp_length=fp_length)
        return build_conv_deep_net(conv_params, vanilla_net_params)
    elif net_type == 'morgan_plus_net':
        vanilla_net_params = dict(layer_sizes=[fp_length, h1_size],
                                  normalize=True,
                                  L2_reg=L2_reg,
                                  nll_func=nll_func)
        return build_morgan_deep_net(fp_length, fp_depth, vanilla_net_params)
    else:
        raise ValueError("Unknown network type: {0}".format(net_type))
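
For context, a minimal sketch of how build_predictor might be called. The hyperparameter values are purely hypothetical, and rmse is assumed to be importable from the surrounding codebase; for the conv and morgan types the return value is build_conv_deep_net's usual (loss_fun, pred_fun, parser) triple.

import numpy as np

# Hypothetical hyperparameters; the names mirror the arguments above.
loss_fun, pred_fun, parser = build_predictor(
    net_type='conv_plus_net',
    fp_length=50, fp_depth=4, conv_width=20, h1_size=100,
    L2_reg=np.exp(-2), nll_func=rmse)
num_weights = len(parser)  # the parser tracks the total weight count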
Example #2
def run_conv_experiment(model_params):
    # Assumes the data splits (train/val/test) and train_params are
    # defined in the enclosing scope, as in the neural-fingerprint demos.

    # Define the architecture of the network that sits on top of the fingerprints.
    vanilla_net_params = dict(
        layer_sizes=[model_params['fp_length'],
                     model_params['h1_size']],  # One hidden layer.
        normalize=True,
        L2_reg=model_params['L2_reg'],
        nll_func=rmse)

    conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
    conv_arch_params = {
        'num_hidden_features': conv_layer_sizes,
        'fp_length': model_params['fp_length'],
        'normalize': 1
    }
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
        train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    test_predictions = predict_func(test_inputs)
    return r2(test_predictions, test_targets)
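
Both run_conv_experiment and compute_fingerprints below read a module-level model_params dict. A plausible definition, with values borrowed from the neural-fingerprint regression demo (treat them as assumptions), assuming numpy is imported as np:

model_params = dict(fp_length=50,   # size of the final fingerprint vector
                    fp_depth=4,     # number of graph-convolution layers (fingerprint radius)
                    conv_width=20,  # hidden features per convolution layer
                    h1_size=100,    # hidden layer on top of the fingerprints
                    L2_reg=np.exp(-2))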
def compute_fingerprints(dataset, train_file, test_file, learning_rate):
    train, val, test = dataset
    X_train, y_train = train
    X_val, y_val = val
    X_test, y_test = test

    X_train_val = np.concatenate((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))

    global train_params
    # train_params["num_iters"] = int(len(X_train)/train_params["batch_size"])
    train_params["step_size"] = learning_rate

    smiles_to_fps = {}
    conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
    conv_arch_params = {
        'num_hidden_features': conv_layer_sizes,
        'fp_length': model_params['fp_length'],
        'normalize': 1,
        'smiles_to_fps': smiles_to_fps
    }

    loss_fun, pred_fun, conv_parser = build_conv_deep_net(
        conv_arch_params, vanilla_net_params, model_params['L2_reg'])
    num_weights = len(conv_parser)

    predict_func, trained_weights, conv_training_curve = train_nn(
        pred_fun,
        loss_fun,
        num_weights,
        X_train,
        y_train,
        train_params,
        validation_smiles=X_val,
        validation_raw_targets=y_val)

    # Running the trained net over train+val fills smiles_to_fps as a side effect.
    pred_fun(trained_weights, X_train_val)

    with open(train_file, "w", newline="") as smiles_fps_file:
        header = ["smiles", "fingerprints", "target"]
        file_info = [[smile, smiles_to_fps[smile], target]
                     for smile, target in zip(X_train_val, y_train_val)]

        writer = csv.writer(smiles_fps_file)
        writer.writerow(header)
        for line in file_info:
            writer.writerow(line)

    # Same trick for the test set: predicting populates smiles_to_fps.
    predict_func(X_test)
    with open(test_file, "w", newline="") as smiles_fps_file:
        header = ["smiles", "fingerprints", "target"]
        file_info = [[smile, smiles_to_fps[smile], target]
                     for smile, target in zip(X_test, y_test)]

        writer = csv.writer(smiles_fps_file)
        writer.writerow(header)
        for line in file_info:
            writer.writerow(line)
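
A small companion reader for the CSVs written above (a hypothetical helper, not part of the original code). Note the fingerprint column holds whatever string form csv.writer produced for the vectors; parsing it back into an array is left to the caller.

import csv

def load_fingerprints(path):
    # Returns parallel lists of SMILES strings, raw fingerprint strings,
    # and float targets from a file written by compute_fingerprints.
    smiles, fps, targets = [], [], []
    with open(path, newline="") as f:
        reader = csv.reader(f)
        next(reader)  # skip the "smiles,fingerprints,target" header
        for s, fp, t in reader:
            smiles.append(s)
            fps.append(fp)
            targets.append(float(t))
    return smiles, fps, targets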
Example #4
def train_neural_fingerprint(train_directory, labels_mapping, tmp_dir, n_epochs=15):
    global task_params
    task_params['N_train'] = int(len(os.listdir(train_directory)) * 0.7)
    task_params['N_valid'] = int(len(os.listdir(train_directory)) * 0.01)
    task_params['N_test']  = int(len(os.listdir(train_directory)) * 0.01)
    task_params['data_file'] = tmp_dir

    global num_epochs 
    num_epochs = n_epochs

    directory = train_directory
    files = os.listdir(directory)
    # Write a small index CSV mapping each graph file to its label.
    with open(tmp_dir, 'w') as output:
        output.write('graph,label\n')
        for f in files:
            output.write(directory + '/' + f + ',' + str(labels_mapping[f]) + '\n')
    
    print("Loading data...")
    traindata, valdata, testdata = load_data(task_params['data_file'],
                        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
                        input_name='graph', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata

    print("Regression on", task_params['N_train'], "training points.")
    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print("\nPerformance (RMSE) on " + task_params['target_name'] + ":")
        print("Train:", rmse(train_preds, train_targets))
        print("Validation:", rmse(val_preds, val_targets))
        print("-" * 80)
        return rmse(val_preds, val_targets)

    print("-" * 80)
    print("Mean predictor")
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x: y_train_mean)

    print("Task params", params)
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)

    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)

    print_performance(predict_func)
    return trained_weights
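
A hypothetical call, assuming a directory of graph files and a labels dict keyed by file name (my_label_lookup is a stand-in for whatever supplies the labels):

train_dir = 'data/train_graphs'  # hypothetical path
labels_mapping = {f: my_label_lookup[f] for f in os.listdir(train_dir)}
trained_weights = train_neural_fingerprint(train_dir, labels_mapping,
                                           'tmp_graph_index.csv', n_epochs=15)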
Example #5
def run_conv_experiment():
    # Assumes model_params, vanilla_net_params, the data splits, and
    # train_params are defined in the enclosing scope.
    conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
    conv_arch_params = {'num_hidden_features': conv_layer_sizes,
                        'fp_length': model_params['fp_length'], 'normalize': 1}
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
        train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    test_predictions = predict_func(test_inputs)
    return rmse(test_predictions, test_targets)
Example #7
    def fit(self, smiles_list, logS_list, seed=0):
        train_smiles = list(smiles_list)
        train_logS = list(logS_list)

        conv_layer_sizes = [self.model_params['conv_width']] * self.model_params['fp_depth']
        conv_arch_params = {
            'num_hidden_features': conv_layer_sizes,
            'fp_length': self.model_params['fp_length'],
            'normalize': 1
        }

        # Neural net architecture on top of the fingerprints.
        net_arch_params = dict(
            layer_sizes=[self.model_params['fp_length'], self.model_params['h1_size']],
            normalize=True,
            L2_reg=self.model_params['L2_reg'],
            nll_func=rmse)

        loss_fun, pred_fun, conv_parser = build_conv_deep_net(
            conv_arch_params, net_arch_params, self.model_params['L2_reg'])

        num_weights = len(conv_parser)
        init_weights = npr.RandomState(seed).randn(
            num_weights) * self.train_params['init_scale']

        # Normalize targets; undo_norm maps network outputs back to the original scale.
        train_logS_norm, undo_norm = normalize_array(train_logS)

        # Build gradient using autograd.
        grad_fun = grad(loss_fun)
        grad_fun_with_data = build_batched_grad(
            grad_fun, self.train_params['batch_size'], train_smiles,
            train_logS_norm)

        # Optimize weights.
        trained_weights = adam(grad_fun_with_data,
                               init_weights,
                               num_iters=self.train_params['num_iters'],
                               step_size=self.train_params['step_size'])

        self.model = (undo_norm, trained_weights, pred_fun)
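
The stored triple makes prediction a one-liner. A minimal companion predict method, sketched after the pattern in the neural-fingerprint regression example:

    def predict(self, smiles_list):
        # Unpack the triple saved by fit() and map the network's
        # normalized outputs back to the original logS scale.
        undo_norm, trained_weights, pred_fun = self.model
        return undo_norm(pred_fun(trained_weights, smiles_list))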
Example #8
def train_neural_fingerprint():
    print("Loading data...")
    traindata, valdata, testdata = load_data(
        task_params['data_file'],
        (task_params['N_train'], task_params['N_valid'],
         task_params['N_test']),
        input_name='smiles',
        target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata

    print("Regression on", task_params['N_train'], "training points.")

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print("\nPerformance (RMSE) on " + task_params['target_name'] + ":")
        print("Train:", rmse(train_preds, train_targets))
        print("Validation:", rmse(val_preds, val_targets))
        print("-" * 80)
        return rmse(val_preds, val_targets)

    print("-" * 80)
    print("Mean predictor")
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x: y_train_mean)

    print("Task params", params)
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    print("Convnet fingerprints with neural net")
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights
Example #9
def train_neural_fingerprint():
    print("Loading data...")
    traindata, valdata, testdata = load_data(task_params['data_file'],
                        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
                        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata

    print("Regression on", task_params['N_train'], "training points.")
    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print("\nPerformance (RMSE) on " + task_params['target_name'] + ":")
        print("Train:", rmse(train_preds, train_targets))
        print("Validation:", rmse(val_preds, val_targets))
        print("-" * 80)
        return rmse(val_preds, val_targets)

    print("-" * 80)
    print("Mean predictor")
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x: y_train_mean)

    print("Task params", params)
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    print("Convnet fingerprints with neural net")
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights
def plot(trained_weights):
    print("Loading training data...")
    traindata, valdata, testdata = load_data(task_params['data_file'],
                        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
                        input_name='smiles', target_name=task_params['target_name'])
    train_smiles, train_targets = traindata

    print("Convnet fingerprints with neural net")
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
       build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)

    os.makedirs('figures', exist_ok=True)

    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix

    atom_neighbor_list = construct_atom_neighbor_list(array_rep)

    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms in a certain radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1))
            return cur_set

    # Recreate trained network.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])

    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))

    for fp_ix in range(params['fp_length']):
        print("FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0]))
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)]

        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        combined_list = sorted(unique_list, key=lambda x: -x[0])

        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index, across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius)
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs]

            print("radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation)
            draw_molecule_with_highlights(
                "figures/fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)
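
plot() also reads a few globals this snippet never defines (all_radii, num_figs_per_fp). Plausible placeholder values, offered purely as assumptions:

num_figs_per_fp = 3          # top-activating atoms to draw per fingerprint index
all_radii = [0, 1, 2, 3, 4]  # activation depths to scan: radius 0 plus one per convolution layer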
Example #11
def conv_fp_func(conv_params):
    loss, _, parser = build_conv_deep_net(conv_params, vanilla_net_params, fp_l2_penalty=0.0)
    return lambda weights: loss(weights, smiles, targets), parser
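
A minimal sketch of how conv_fp_func might be used, assuming conv_params, vanilla_net_params, smiles, and targets are in scope; the returned closure is differentiable with autograd, and the init scale is hypothetical.

from autograd import grad
import autograd.numpy.random as npr

loss_of_weights, parser = conv_fp_func(conv_params)
init_weights = npr.RandomState(0).randn(len(parser)) * 0.1  # hypothetical init scale
loss_grad = grad(loss_of_weights)                           # d(loss)/d(weights) via autograd
print(loss_of_weights(init_weights), loss_grad(init_weights))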