Example 1
def train_neural_fingerprint(train_directory, labels_mapping, tmp_dir, n_epochs=15):
    global task_params
    task_params['N_train'] = int(len(os.listdir(train_directory)) * 0.7)
    task_params['N_valid'] = int(len(os.listdir(train_directory)) * 0.01)
    task_params['N_test']  = int(len(os.listdir(train_directory)) * 0.01)
    task_params['data_file'] = tmp_dir

    global num_epochs 
    num_epochs = n_epochs

    # Write a CSV index mapping each training file to its label.
    with open(tmp_dir, 'w') as output:  # text mode: we write strings, not bytes
        output.write('graph,label\n')
        for f in os.listdir(train_directory):
            output.write(train_directory + '/' + f + ',' + str(labels_mapping[f]) + '\n')
    
    print "Loading data..."
    traindata, valdata, testdata = load_data(task_params['data_file'],
                        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
                        input_name='graph', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata

    print "Regression on", task_params['N_train'], "training points."
    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(val_preds,  val_targets)
        print "-" * 80
        return rmse(val_preds,  val_targets)

    print "-" * 80
    print "Mean predictor"
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x : y_train_mean)

    print "Task params", params
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)

    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)

    print_performance(predict_func)
    return trained_weights
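
Example 1 relies on module-level globals (task_params, params, conv_arch_params) and an rmse helper that its source repository defines elsewhere. A minimal sketch of plausible definitions follows; every name is taken from the code above, but every value is an assumption, not taken from the source:

import numpy as np

# Hypothetical globals assumed by Example 1; values are illustrative only.
task_params = {'target_name': 'label', 'data_file': '/tmp/graphs.csv'}
params = dict(fp_length=50, fp_depth=4, conv_width=20, h1_size=100,
              l2_penalty=np.exp(-2))
conv_arch_params = {'num_hidden_features': [params['conv_width']] * params['fp_depth'],
                    'fp_length': params['fp_length'],
                    'normalize': 1}

def rmse(predictions, targets):
    # Root-mean-square error between two equal-length arrays.
    predictions, targets = np.asarray(predictions), np.asarray(targets)
    return np.sqrt(np.mean((predictions - targets) ** 2))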
Example 2
def main(_):
    print("Loading data...")
    # Note: unlike the other examples, this load_data variant takes no
    # (N_train, N_valid, N_test) sizes tuple and presumably splits the data itself.
    traindata, valdata, testdata = load_data(
        task_params['data_file'],
        input_name='smile',
        target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata
    test_inputs, test_targets = testdata

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print("\nPerformance (RMSE) on " + task_params['target_name'] + ":")
        print("Train:", rmse(train_preds, train_targets))
        print("Test: ", rmse(val_preds, val_targets))
        print("-" * 80)
        return rmse(val_preds, val_targets)

    def run_morgan_experiment():
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment():
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {
            'num_hidden_features': conv_layer_sizes,
            'fp_length': model_params['fp_length'],
            'normalize': 1
        }
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return rmse(test_predictions, test_targets)

    print("Task params", task_params)
    print()
    print("Starting Morgan fingerprint experiment...")
    #test_loss_morgan = run_morgan_experiment()
    print("Starting neural fingerprint experiment...")
    test_loss_neural = run_conv_experiment()
    print()
    #print("Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural)
    print("{} Neural test RMSE:".format(p_i), test_loss_neural)
Example 3
def main():
    print("Loading data...")
    traindata, valdata, testdata = load_data(
        task_params['data_file'], (N_train, N_val, N_test),
        input_name='smiles', target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs,   val_targets   = valdata
    test_inputs,  test_targets  = testdata

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(val_preds,  val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    def run_morgan_experiment():
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment():
        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        print("conv_layer_sizes ",conv_layer_sizes)
        conv_arch_params = {'num_hidden_features' : conv_layer_sizes,
                            'fp_length' : model_params['fp_length'], 'normalize' : 1}
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return rmse(test_predictions, test_targets)

    print "Task params", task_params
    print
    print "Starting Morgan fingerprint experiment..."
    test_loss_morgan = run_morgan_experiment()
    # test_loss_morgan = 0.0
    print "Starting neural fingerprint experiment..."
    test_loss_neural = run_conv_experiment()
    print
    print "Morgan test RMSE:", test_loss_morgan, "Neural test RMSE:", test_loss_neural
Example 4
def train_neural_fingerprint():
    print "Loading data..."
    traindata, valdata, testdata = load_data(
        task_params['data_file'],
        (task_params['N_train'], task_params['N_valid'],
         task_params['N_test']),
        input_name='smiles',
        target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata

    print "Regression on", task_params['N_train'], "training points."

    def print_performance(pred_func):
        train_preds = pred_func(train_inputs)
        val_preds = pred_func(val_inputs)
        print "\nPerformance (RMSE) on " + task_params['target_name'] + ":"
        print "Train:", rmse(train_preds, train_targets)
        print "Test: ", rmse(val_preds, val_targets)
        print "-" * 80
        return rmse(val_preds, val_targets)

    print "-" * 80
    print "Mean predictor"
    y_train_mean = np.mean(train_targets)
    print_performance(lambda x: y_train_mean)

    print "Task params", params
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False

    print "Convnet fingerprints with neural net"
    loss_fun, pred_fun, conv_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])
    num_weights = len(conv_parser)
    predict_func, trained_weights, conv_training_curve = \
         train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                 nn_train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
    print_performance(predict_func)
    return trained_weights
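
train_neural_fingerprint() returns the flat weight vector that Example 6's plot(trained_weights) unpacks with the same weight parser, so the two functions compose directly. A hedged usage sketch:

if __name__ == '__main__':
    # Hypothetical end-to-end run pairing Examples 4 and 6.
    trained_weights = train_neural_fingerprint()  # fit the conv fingerprint net
    plot(trained_weights)                         # writes highlight PDFs under figures/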
Example 5
def neural_graph_fps(target_name, input_path, len_smi):

    task_params = {
        'target_name': target_name,
        'data_file': input_path,
    }
    # Round the 70% training split up to a multiple of 100; the training loop
    # seems to require N_train to be divisible by batch_size (100 below).
    N_train = int(len_smi * 0.7) - int(len_smi * 0.7) % 100 + 100
    N_val = int(len_smi * 0.1)
    N_test = len_smi - N_train - N_val

    train_params = dict(num_iters=100,
                        batch_size=100,
                        init_scale=np.exp(-4),
                        step_size=np.exp(-6))

    traindata, valdata, testdata = load_data(
        task_params['data_file'], (N_train, N_val, N_test),
        input_name='smiles',
        target_name=task_params['target_name'])
    train_inputs, train_targets = traindata
    val_inputs, val_targets = valdata
    test_inputs, test_targets = testdata

    def print_performance(pred_func):
        # Despite the name, this variant prints nothing; it just returns
        # the validation R^2.
        val_preds = pred_func(val_inputs)
        return r2(val_preds, val_targets)

    def run_morgan_experiment():
        loss_fun, pred_fun, net_parser = \
            build_morgan_deep_net(model_params['fp_length'],
                                  model_params['fp_depth'], vanilla_net_params)
        num_weights = len(net_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        return print_performance(predict_func)

    def run_conv_experiment(model_params):

        # Define the architecture of the network that sits on top of the fingerprints.
        vanilla_net_params = dict(
            layer_sizes=[model_params['fp_length'],
                         model_params['h1_size']],  # One hidden layer.
            normalize=True,
            L2_reg=model_params['L2_reg'],
            nll_func=rmse)

        conv_layer_sizes = [model_params['conv_width']] * model_params['fp_depth']
        conv_arch_params = {
            'num_hidden_features': conv_layer_sizes,
            'fp_length': model_params['fp_length'],
            'normalize': 1
        }
        loss_fun, pred_fun, conv_parser = \
            build_conv_deep_net(conv_arch_params, vanilla_net_params, model_params['L2_reg'])
        num_weights = len(conv_parser)
        predict_func, trained_weights, conv_training_curve = \
            train_nn(pred_fun, loss_fun, num_weights, train_inputs, train_targets,
                     train_params, validation_smiles=val_inputs, validation_raw_targets=val_targets)
        test_predictions = predict_func(test_inputs)
        return r2(test_predictions, test_targets)

    fp_lengths = [7, 8, 9]
    fp_depths = [6, 7, 8]
    conv_widths = [30, 40, 50, 60]
    h1_sizes = [100]

    #fp_length: 7 fp_depth: 8 conv_width: 60 h1_size: 100
    #Neural test R2: 0.9314066399774756
    max_r2 = 0
    fp_length_opt, fp_depth_opt, conv_width_opt, h1_size_opt = \
        fp_lengths[0], fp_depths[0], conv_widths[0], h1_sizes[0]
    for fp_length in fp_lengths:
        for fp_depth in fp_depths:
            for conv_width in conv_widths:
                for h1_size in h1_sizes:
                    model_params = dict(
                        fp_length=fp_length,    # neural fps usually need far fewer dimensions than Morgan
                        fp_depth=fp_depth,      # the depth of the network equals the fingerprint radius
                        conv_width=conv_width,  # only the neural fps need this parameter
                        h1_size=h1_size,        # size of hidden layer of network on top of fps
                        L2_reg=np.exp(-2))
                    test_r2_neural = run_conv_experiment(model_params)
                    if max_r2 < test_r2_neural:
                        fp_length_opt = fp_length
                        fp_depth_opt = fp_depth
                        conv_width_opt = conv_width
                        h1_size_opt = h1_size
                    max_r2 = max(test_r2_neural, max_r2)

    print("fp_length:", fp_length_opt, "fp_depth:", fp_depth_opt,
          "conv_width:", conv_width_opt, "h1_size:", h1_size)
    print("Neural test R2:", max_r2)
Example 6
def plot(trained_weights):
    print("Loading training data...")
    traindata, valdata, testdata = load_data(task_params['data_file'],
                        (task_params['N_train'], task_params['N_valid'], task_params['N_test']),
                        input_name='smiles', target_name=task_params['target_name'])
    train_smiles, train_targets = traindata

    print "Convnet fingerprints with neural net"
    conv_arch_params['return_atom_activations'] = True
    output_layer_fun, parser, compute_atom_activations = \
       build_convnet_fingerprint_fun(**conv_arch_params)
    atom_activations, array_rep = compute_atom_activations(trained_weights, train_smiles)

    if not os.path.exists('figures'): os.makedirs('figures')

    parent_molecule_dict = {}
    for mol_ix, atom_ixs in enumerate(array_rep['atom_list']):
        for atom_ix in atom_ixs:
            parent_molecule_dict[atom_ix] = mol_ix

    atom_neighbor_list = construct_atom_neighbor_list(array_rep)

    def get_neighborhood_ixs(array_rep, cur_atom_ix, radius):
        # Recursive function to get indices of all atoms in a certain radius.
        if radius == 0:
            return set([cur_atom_ix])
        else:
            cur_set = set([cur_atom_ix])
            for n_ix in atom_neighbor_list[cur_atom_ix]:
                cur_set.update(get_neighborhood_ixs(array_rep, n_ix, radius-1))
            return cur_set

    # Recreate trained network.
    nn_train_params, vanilla_net_params = parse_training_params(params)
    conv_arch_params['return_atom_activations'] = False
    _, _, combined_parser = \
        build_conv_deep_net(conv_arch_params, vanilla_net_params, params['l2_penalty'])

    net_loss_fun, net_pred_fun, net_parser = build_standard_net(**vanilla_net_params)
    net_weights = combined_parser.get(trained_weights, 'net weights')
    last_layer_weights = net_parser.get(net_weights, ('weights', 0))

    for fp_ix in range(params['fp_length']):
        print "FP {0} has linear regression coefficient {1}".format(fp_ix, last_layer_weights[fp_ix][0])
        combined_list = []
        for radius in all_radii:
            fp_activations = atom_activations[radius][:, fp_ix]
            combined_list += [(fp_activation, atom_ix, radius) for atom_ix, fp_activation in enumerate(fp_activations)]

        unique_list = remove_duplicates(combined_list, key_lambda=lambda x: x[0])
        combined_list = sorted(unique_list, key=lambda x: -x[0])

        for fig_ix in range(num_figs_per_fp):
            # Find the most-activating atoms for this fingerprint index, across all molecules and depths.
            activation, most_active_atom_ix, cur_radius = combined_list[fig_ix]
            most_activating_mol_ix = parent_molecule_dict[most_active_atom_ix]
            highlight_list_our_ixs = get_neighborhood_ixs(array_rep, most_active_atom_ix, cur_radius)
            highlight_list_rdkit = [array_rep['rdkit_ix'][our_ix] for our_ix in highlight_list_our_ixs]

            print "radius:", cur_radius, "atom list:", highlight_list_rdkit, "activation", activation
            draw_molecule_with_highlights(
                "figures/fp_{0}_highlight_{1}.pdf".format(fp_ix, fig_ix),
                train_smiles[most_activating_mol_ix],
                highlight_atoms=highlight_list_rdkit)
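
plot() calls a remove_duplicates(lst, key_lambda=...) helper that is not shown in this collection. One plausible implementation, keeping the first item seen for each key:

def remove_duplicates(items, key_lambda):
    # Keep the first occurrence of each key, preserving input order.
    seen = set()
    unique = []
    for item in items:
        key = key_lambda(item)
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique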
Example 7
def fit_fingerprints(task_params, model_params, train_params, verbose):
    if verbose:
        print("Loading data from '{data_fname}' with\n"
              "\tsmiles column: '{smiles_column}'\n"
              "\ttarget column: '{target_column}'\n"
              "\tN_train: {N_train}\n"
              "\tN_validate: {N_validate}\n"
              "\tN_test: {N_test}\n".format(**task_params))

    data = load_data(filename=task_params['data_fname'],
                     sizes=(task_params['N_train'], task_params['N_validate'],
                            task_params['N_test']),
                     input_name=task_params['smiles_column'],
                     target_name=task_params['target_column'])

    if verbose:
        print(
            "Building fingerprint function of length {fp_length} as a convolutional network with width {fp_width} and depth {fp_depth} ..."
            .format(**model_params))

    # Build a deep convolutional neural network that, when instantiated
    # with weights, takes a list of smiles and produces a fingerprint
    # vector for each.
    #   weights type: WeightsParser
    #   smiles type: Iterable[str]
    #   output type: ndarray    # see the output_layer_fun_and_atom_activations function
    #   fp_func type: Callable[[weights, smiles], output]
    #   fp_parser type: WeightsParser
    fp_func, fp_parser = \
        build_convnet_fingerprint_fun(
            num_hidden_features = [model_params['fp_width']] * model_params['fp_depth'],
            fp_length = model_params['fp_length'],
            normalize = True)

    if verbose:
        print("Building regression network ... ")

    # Build a deep network that stacks the neural fingerprint network on
    # top of a vanilla feed-forward network, with an L2-regularized loss
    # function underneath.
    #   loss_fun type: Callable[[weights, smiles, targets], numeric]
    #   pred_fun type: Callable[[weights, smiles], np.array]
    #   combined_parser: WeightsParser
    net_params = dict(
        layer_sizes=[model_params['fp_length'], model_params['h1_size']],
        normalize=True,
        L2_reg=np.exp(model_params['log_l2_penalty']),
        nll_func=model_params['nll_func'])
    loss_fun, pred_fun, combined_parser = \
        build_fingerprint_deep_net(
            net_params=net_params,
            fingerprint_func=fp_func,
            fp_parser=fp_parser,
            fp_l2_penalty=np.exp(model_params['log_l2_penalty']))

    if verbose:
        print("Training model ...")
    # Train the full network on the training data, monitoring the loss
    # on the validation data.
    #   predict_func type: Callable[[smiles], np.array]
    #   trained_weights type: np.ndarray
    #   training_curve type: Iterable[numeric]
    predict_func, trained_weights, training_curve = \
        train_nn(
            pred_fun=pred_fun,
            loss_fun=loss_fun,
            nll_func_name=model_params['nll_func_name'],
            nll_func=model_params['nll_func'],
            num_weights=len(combined_parser),
            train_smiles=data[0][0],
            train_raw_targets=data[0][1],
            train_params=train_params,
            seed=task_params['seed'],
            validation_smiles=data[1][0],
            validation_raw_targets=data[1][1])

    if verbose:
        print_performance(target_name=task_params['target_column'],
                          predict_func=predict_func,
                          nll_func_name=model_params['nll_func_name'],
                          data=data)

    trained_fp_weights = combined_parser.get(trained_weights,
                                             'fingerprint weights')
    return trained_fp_weights, training_curve
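
fit_fingerprints takes its configuration as three explicit dicts instead of globals. A hedged call sketch; every key below appears in the function body above, but the values (and the rmse helper from Example 1's sketch) are illustrative only:

import numpy as np

task_params = dict(data_fname='data.csv', smiles_column='smiles',
                   target_column='activity', N_train=800, N_validate=100,
                   N_test=100, seed=0)
model_params = dict(fp_length=50, fp_width=20, fp_depth=4, h1_size=100,
                    log_l2_penalty=-2.0, nll_func_name='rmse', nll_func=rmse)
train_params = dict(num_iters=100, batch_size=100,
                    init_scale=np.exp(-4), step_size=np.exp(-6))

trained_fp_weights, training_curve = fit_fingerprints(
    task_params, model_params, train_params, verbose=True)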