Example #1
def f(radius,
      T,
      fingerprint_dim,
      weight_decay,
      learning_rate,
      p_dropout,
      direction=False):

    loss_function = nn.MSELoss()
    loss_function.cuda()
    model = Fingerprint(int(round(radius)), int(round(T)),
                        num_atom_features, num_bond_features,
                        int(round(fingerprint_dim)), output_units_num,
                        p_dropout)
    model.cuda()
    # learning_rate and weight_decay arrive as base-10 exponents:
    # the effective Adam lr is 10**-learning_rate.
    optimizer = optim.Adam(model.parameters(),
                           lr=10**-learning_rate,
                           weight_decay=10**-weight_decay)

    best_param = {}
    best_param["train_epoch"] = 0
    best_param["test_epoch"] = 0
    best_param["train_MSE"] = 9e8
    best_param["test_MSE"] = 9e8
    for epoch in range(800):
        train(model, train_df, optimizer, loss_function, epoch + 1)
        train_MAE, train_MSE = eval(model, train_df)  # project-defined eval(), not the builtin
        test_MAE, test_MSE = eval(model, test_df)
        if train_MSE < best_param["train_MSE"]:
            best_param["train_epoch"] = epoch
            best_param["train_MSE"] = train_MSE
        if test_MSE < best_param["test_MSE"]:
            best_param["test_epoch"] = epoch
            best_param["test_MSE"] = test_MSE
        if (epoch - best_param["train_epoch"] >
                6) and (epoch - best_param["test_epoch"] > 8):
            break
    # print(best_param["test_epoch"], best_param["test_MSE"])
    # append one CSV row per evaluated configuration
    with open(log_file, 'a') as log_f:  # 'f' would shadow this function's name
        log_f.write(','.join([
            str(int(round(radius))),
            str(int(round(T))),
            str(int(round(fingerprint_dim))),
            str(p_dropout),
            str(weight_decay),
            str(learning_rate)
        ]))
        log_f.write(',' + str(best_param["test_epoch"]) + ',' +
                    str(best_param["test_MSE"]) + '\n')

    # GPGO maximizes the objective by default, so return the negative MSE for minimization.
    if direction:
        return best_param["test_MSE"]
    else:
        return -best_param["test_MSE"]
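
A minimal driver sketch for the objective above, assuming the pyGPGO package (GaussianProcess surrogate, Expected Improvement acquisition); the search bounds and iteration budget are illustrative assumptions, not taken from the original.

# Hypothetical pyGPGO driver for f; bounds and max_iter are illustrative.
from pyGPGO.covfunc import matern32
from pyGPGO.acquisition import Acquisition
from pyGPGO.surrogates.GaussianProcess import GaussianProcess
from pyGPGO.GPGO import GPGO

surrogate = GaussianProcess(matern32())
acquisition = Acquisition(mode='ExpectedImprovement')
param_space = {
    'radius': ('int', [1, 4]),
    'T': ('int', [1, 4]),
    'fingerprint_dim': ('int', [50, 300]),
    'weight_decay': ('cont', [2.0, 6.0]),    # actual decay = 10**-weight_decay
    'learning_rate': ('cont', [2.0, 5.0]),   # actual lr = 10**-learning_rate
    'p_dropout': ('cont', [0.0, 0.5]),
}
gpgo = GPGO(surrogate, acquisition, f, param_space)  # f returns -MSE, so maximizing works
gpgo.run(max_iter=30)
print(gpgo.getResult())  # best hyperparameters and best (negated) MSE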
Example #2
remained_df = smiles_tasks_df[smiles_tasks_df["cano_smiles"].isin(
    feature_dicts['smiles_to_atom_mask'].keys())]
uncovered_df = smiles_tasks_df.drop(remained_df.index)
uncovered_df  # notebook display: molecules without precomputed features

test_df = remained_df.sample(frac=0.2, random_state=random_seed)
train_df = remained_df.drop(test_df.index)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# featurize a single molecule to discover the atom/bond feature dimensionalities
x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
    [canonical_smiles_list[0]], feature_dicts)
num_atom_features = x_atom.shape[-1]
num_bond_features = x_bonds.shape[-1]
loss_function = nn.MSELoss()
model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                    fingerprint_dim, output_units_num, p_dropout)
model.cuda()

optimizer = optim.Adam(model.parameters(),
                       10**-learning_rate,
                       weight_decay=10**-weight_decay)


def train(model, dataset, optimizer, loss_function, epoch):
    model.train()
    np.random.seed(epoch)  # deterministic shuffle per epoch
    valList = np.arange(0, dataset.shape[0])
    # shuffle the sample indices, then cut them into mini-batches
    np.random.shuffle(valList)
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
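        batch_list.append(valList[i:i + batch_size])
    # The snippet is truncated above. A plausible continuation, following the
    # AttentiveFP reference training loop (an assumption, not part of the source;
    # it also assumes torch.set_default_tensor_type('torch.cuda.FloatTensor') so
    # torch.Tensor allocates on the GPU, and that `tasks` lists the target columns):
    for batch_idx in batch_list:
        batch_df = dataset.loc[batch_idx, :]
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, _ = \
            get_smiles_array(batch_df.cano_smiles.values, feature_dicts)
        _, mol_prediction = model(torch.Tensor(x_atom), torch.Tensor(x_bonds),
                                  torch.cuda.LongTensor(x_atom_index),
                                  torch.cuda.LongTensor(x_bond_index),
                                  torch.Tensor(x_mask))
        loss = loss_function(
            mol_prediction,
            torch.Tensor(batch_df[tasks[0]].values).view(-1, 1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()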
Example #3
        p_dropout = 0.2
        fingerprint_dim = 200

        weight_decay = 5  # also known as l2_regularization_lambda
        learning_rate = 2.5
        radius = 2
        T = 2
        per_task_output_units_num = 1  # for regression model
        output_units_num = len(tasks) * per_task_output_units_num

        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
            [canonical_smiles_list[0]], feature_dicts)
        num_atom_features = x_atom.shape[-1]
        num_bond_features = x_bonds.shape[-1]
        loss_function = nn.MSELoss()
        model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                            fingerprint_dim, output_units_num, p_dropout)
        model.cuda()

        # optimizer = optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)
        optimizer = optim.Adam(model.parameters(),
                               10**-learning_rate,
                               weight_decay=10**-weight_decay)
        # optimizer = optim.SGD(model.parameters(), 10**-learning_rate, weight_decay=10**-weight_decay)
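        # NOTE: learning_rate and weight_decay are stored as base-10 exponents:
        # the effective lr is 10**-2.5 ≈ 3.2e-3 and the L2 lambda is 10**-5 = 1e-5.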

        tensorboard = SummaryWriter(
            log_dir="runs/" + start_time + "_" + prefix_filename + "_" +
            str(fingerprint_dim) + "_" + str(p_dropout))

        model_parameters = filter(lambda p: p.requires_grad,
                                  model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
Example #4
            canonical_smiles_list.append(
                Chem.MolToSmiles(Chem.MolFromSmiles(smiles), isomericSmiles=True))
        except Exception:
            print(smiles, "### could not be canonicalized, skipped")
    feature_filename = 'lipop/Lipophilicity'
    # if os.path.isfile(feature_filename):
    #     print("NO lipop/delaney-processed.pickle")
    #     feature_dicts = pickle.load(open(feature_filename, "rb"))
    # else:
    feature_dicts = save_smiles_dicts(smilesList, feature_filename)

    x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
        [canonical_smiles_list[0]], feature_dicts)
    num_atom_features = x_atom.shape[-1]
    num_bond_features = x_bonds.shape[-1]
    model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                        fingerprint_dim, output_units_num, p_dropout)
    model.to(device)
    rnn = LSTM(model).to(device)


    # Optimize the parameters rnn.parameters() with learning rate learning_rate
    # (the Adam variant is kept commented out; SGD with momentum is used instead).
    # optimizer = torch.optim.Adam(list(rnn.parameters()), lr=learning_rate)
    optimizer = torch.optim.SGD(list(rnn.parameters()),
                                lr=learning_rate, weight_decay=weight_decay,
                                momentum=momentum)
    loss_function = nn.MSELoss().to(device)

    # Train the parameters with the following loop:
    for epoch in range(epoch_num):
        avg_loss = 0
        sum_loss = 0
Example #5
def AttentiveFP_regressor_training(df_filename, feature_filename, tasks,
                                   fingerprint_dim, radius, T, output_dir,
                                   smiles_field='cano_smiles', normalizeFlag='_normalized',
                                   test_fraction=10, random_seed=8,
                                   batch_size=128, epochs=300, p_dropout=0.5,
                                   weight_decay=4.9, learning_rate=3.4,
                                   batch_normalization=False):
    '''
    INPUT:
    df_filename - path to a .csv file recording values for the tasks;
    feature_filename - .p (pickle) file storing the chemical feature dictionary;
    tasks - a list; must be a subset of the dataframe's columns;
    fingerprint_dim - the number of nodes in the hidden layer;
    radius - the number of recurrent layers on the molecular graph;
    T - the number of recurrent layers on the virtual graph;
    test_fraction - 1/test_fraction of the data is held out as the test set.
    '''

    #1 prepare dataset (extract the needed subset: id + targets + smiles) and split into train/test
    print('============ Training data loading =================')
    df = pd.read_csv(df_filename)
    feature_dicts = pickle.load(open(feature_filename, 'rb'))
    remained_df = df[df[smiles_field].isin(feature_dicts['smiles_to_atom_mask'].keys())]
    uncovered_df = df.drop(remained_df.index)
    if len(uncovered_df) > 0:
        print('The following entries lack precomputed features and are dropped:')
        print(uncovered_df)

    test_df = remained_df.sample(frac = 1/test_fraction, random_state = random_seed)
    training_data = remained_df.drop(test_df.index)
    # Compute stats of the training data; they are used to normalize the targets (and hence the loss).
    columns = ['Task', 'Mean', 'Standard deviation', 'Mean absolute deviation', 'ratio']
    mean_list = []
    std_list = []
    mad_list = []
    ratio_list = []
    
    for task in tasks:
        mean = training_data[task].mean()
        mean_list.append(mean)
        std = training_data[task].std()
        std_list.append(std)
        mad = training_data[task].mad()  # mean absolute deviation (.mad() was removed in pandas 2.0)
        mad_list.append(mad)
        ratio_list.append(std/mad)
        training_data[task+normalizeFlag] = (training_data[task] - mean) / std
        test_df[task+normalizeFlag] = (test_df[task] - mean) / std
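        # To report predictions in original units later, invert with: y = y_norm * std + mean.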

    list_of_tuples = list(zip(tasks, mean_list, std_list, mad_list, ratio_list))
    stats = pd.DataFrame(list_of_tuples, columns=columns)
    stats.to_csv(output_dir + 'trainset_stats.csv', index=False)

    train_df = training_data.reset_index(drop = True)
    test_df = test_df.reset_index(drop=True)
    print('Data loading finished:')
    print('Train set size: %i' % len(train_df))
    print('Test set size: %i' % len(test_df))

    #2 model initialization
    print('============ Model initialization =================')
    per_task_output_units_num = 1
    output_units_num = len(tasks) * per_task_output_units_num

    x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = \
                    get_smiles_array([remained_df[smiles_field].iloc[0]], feature_dicts)
    num_atom_features = x_atom.shape[-1]
    num_bond_features = x_bonds.shape[-1]
    loss_function = nn.MSELoss()

    model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                        fingerprint_dim, output_units_num, p_dropout, batch_normalization)
    model.cuda()
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print('Total number of parameters: %i' % params)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data.shape)

    optimizer = optim.Adam(model.parameters(), 10**-learning_rate, 
                           weight_decay=10**-weight_decay)

    print('============ Saving params =================')
    #5 write all params to json file
    model_params = {}
    model_params['radius'] = radius
    model_params['fingerprint_dim'] = fingerprint_dim
    model_params['T'] = T
    model_params['output_units_num'] = output_units_num
    model_params['num_atom_features'] = num_atom_features
    model_params['num_bond_features'] = num_bond_features
    model_params['p_dropout'] = p_dropout
    model_params['batch_normalization'] = batch_normalization
    model_params['mol_length'] = x_atom.shape[1]

    data_stats = {}
    data_stats['tasks'] = tasks
    data_stats['smiles_field'] = smiles_field
    data_stats['test_fraction'] = test_fraction
    data_stats['random_seed'] = random_seed
    data_stats['mean'] = mean_list
    data_stats['std'] = std_list
    data_stats['mad'] = mad_list
    data_stats['ratio'] = ratio_list

    training_params = {}
    training_params['batch_size'] = batch_size
    training_params['epochs'] = epochs
    training_params['weight_decay'] = weight_decay
    training_params['learning_rate'] = learning_rate
    training_params['normalizeFlag'] = normalizeFlag

    json_output = {}
    json_output['model_params'] = model_params
    json_output['data_stats'] = data_stats
    json_output['training_params'] = training_params

    with open(output_dir + 'params.json', 'w') as outfile:
        json.dump(json_output, outfile)


    #3 model training
    print('============ Start model training =================')
    # Parameter initialization: Xavier-uniform for linear layers,
    # orthogonal for the GRU recurrence weights.
    for m in model.modules():
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
        if isinstance(m, nn.GRUCell):
            nn.init.orthogonal_(m.weight_ih)
            nn.init.orthogonal_(m.weight_hh)

    best_param = {}
    best_param["train_epoch"] = 0
    best_param["valid_epoch"] = 0
    best_param["train_MSE_normalized"] = 9e8
    best_param["valid_MSE_normalized"] = 9e8

    for epoch in range(epochs):
        print(train_regressor(model, train_df, tasks, optimizer, loss_function, 
                              batch_size, smiles_field, normalizeFlag, feature_dicts, stats))
        train_r2, train_MSE_normalized, train_MSE, train_MAE_normalized, \
                    train_MAE = eval_regressor(model, train_df, smiles_field, tasks, normalizeFlag, \
                                               batch_size, feature_dicts, stats)
        valid_r2, valid_MSE_normalized, valid_MSE, valid_MAE_normalized, \
                    valid_MAE = eval_regressor(model, test_df, smiles_field, tasks, normalizeFlag, \
                                               batch_size, feature_dicts, stats)

        #4 evaluation and log tracking
        print("EPOCH:\t" + str(epoch) + '\n' \
            +"train_MAE: \n" + str(train_MAE) + '\n' \
            +"valid_MAE: \n" + str(valid_MAE) + '\n' \
            +"train_r2: \n" + str(train_r2) + '\n' \
            +"valid_r2: \n" + str(valid_r2) + '\n' \
            +"train_MSE_normalized_mean: " + str(train_MSE_normalized.mean()) + '\n' \
            +"valid_MSE_normalized_mean: " + str(valid_MSE_normalized.mean()) + '\n' \
            +"train_r2_mean: " + str(train_r2.mean()) + '\n' \
            +"valid_r2_mean: " + str(valid_r2.mean()) + '\n')
        if train_MSE_normalized.mean() < best_param["train_MSE_normalized"]:
            best_param["train_epoch"] = epoch
            best_param["train_MSE_normalized"] = train_MSE_normalized.mean()
        if valid_MSE_normalized.mean() < best_param["valid_MSE_normalized"]:
            best_param["valid_epoch"] = epoch
            best_param["valid_MSE_normalized"] = valid_MSE_normalized.mean()
            if valid_r2.mean() > 0.6:
                torch.save(model, output_dir + 'model-' + str(epoch) + '.pt')
        if (epoch - best_param["train_epoch"] > 3) and (epoch - best_param["valid_epoch"] > 5): # early stopping
            torch.save(model, output_dir + 'model-' + str(epoch) + '.pt')
            break
    print("Training finished.")

    return 
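
A minimal invocation sketch for the trainer above; the file paths and task name are hypothetical placeholders, not taken from the original.

# Hypothetical call; the CSV, pickle, and task names are placeholders.
AttentiveFP_regressor_training(
    df_filename='data/lipop.csv',               # CSV containing smiles_field and the task columns
    feature_filename='data/lipop_features.p',   # pickle produced by save_smiles_dicts
    tasks=['Lipophilicity'],
    fingerprint_dim=200, radius=2, T=2,
    output_dir='runs/lipop/')                   # receives trainset_stats.csv, params.json, model-*.pt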
Example #6
    df["cano_smiles"] = canonical_smiles_list
    feature_dicts = save_smiles_dicts(smilesList, 'tmp')
    remained_df = df[df["cano_smiles"].isin(feature_dicts['smiles_to_atom_mask'].keys())]
    uncovered_idx = set(df.index) - set(remained_df.index)
    train_idx = set(train_idx) - set(uncovered_idx)
    valid_idx = set(valid_idx) - set(uncovered_idx)
    print(len(train_idx), len(valid_idx))
    
    train_df = remained_df.loc[list(train_idx)].reset_index(drop=True)  # .loc needs a list, not a set
    valid_df = remained_df.loc[list(valid_idx)].reset_index(drop=True)

    x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
        [canonical_smiles_list[0]], feature_dicts)
    num_atom_features = x_atom.shape[-1]
    num_bond_features = x_bonds.shape[-1]
    loss_function = nn.MSELoss()
    model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                        fingerprint_dim, output_units_num, p_dropout)
    model.cuda()
    optimizer = optim.Adam(model.parameters(), 10**-learning_rate,
                           weight_decay=10**-weight_decay)

    for epoch in range(epochs):
        # evaluate first, so epoch 0 reports the untrained model
        train_MAE, train_MSE = eval(model, train_df)
        valid_MAE, valid_MSE = eval(model, valid_df)
        print(epoch, np.sqrt(train_MSE), np.sqrt(valid_MSE))  # RMSE
        train(model, train_df, optimizer, loss_function)

    end = time.time()
    total = end - start
    print('total epochs: %s, total time: %.2f s' % (epochs, total))
    res.append([epochs, total])

x = pd.DataFrame(res, columns=['epochs', 'total_time(s)'])