def eval(model, dataset):
    model.eval()
    eval_MAE_list = []
    eval_MSE_list = []
    valList = np.arange(0, dataset.shape[0])
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
        batch = valList[i:i + batch_size]
        batch_list.append(batch)
    for counter, batch in enumerate(batch_list):
        batch_df = dataset.loc[batch, :]
        smiles_list = batch_df.cano_smiles.values
        y_val = batch_df[tasks[0]].values
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
            smiles_list, feature_dicts)
        atoms_prediction, mol_prediction = model(
            torch.Tensor(x_atom), torch.Tensor(x_bonds),
            torch.cuda.LongTensor(x_atom_index),
            torch.cuda.LongTensor(x_bond_index), torch.Tensor(x_mask))
        MAE = F.l1_loss(mol_prediction,
                        torch.Tensor(y_val).view(-1, 1),
                        reduction='none')
        MSE = F.mse_loss(mol_prediction,
                         torch.Tensor(y_val).view(-1, 1),
                         reduction='none')
        eval_MAE_list.extend(MAE.data.squeeze().cpu().numpy())
        eval_MSE_list.extend(MSE.data.squeeze().cpu().numpy())
    return np.array(eval_MAE_list).mean(), np.array(eval_MSE_list).mean()
def train(model, dataset, optimizer, loss_function, epoch):
    model.train()
    np.random.seed(epoch)
    valList = np.arange(0, dataset.shape[0])
    # shuffle the sample indices
    np.random.shuffle(valList)
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
        batch = valList[i:i + batch_size]
        batch_list.append(batch)
    for counter, batch in enumerate(batch_list):
        batch_df = dataset.loc[batch, :]
        smiles_list = batch_df.cano_smiles.values
        y_val = batch_df[tasks[0]].values
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
            smiles_list, feature_dicts)
        atoms_prediction, mol_prediction = model(
            torch.Tensor(x_atom), torch.Tensor(x_bonds),
            torch.cuda.LongTensor(x_atom_index),
            torch.cuda.LongTensor(x_bond_index), torch.Tensor(x_mask))
        optimizer.zero_grad()
        loss = loss_function(mol_prediction, torch.Tensor(y_val).view(-1, 1))
        loss.backward()
        optimizer.step()
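# A minimal driver sketch (assumed usage, not from the original source) for the
# single-task train/eval pair above. `train_df`, `test_df`, `model`, `optimizer`,
# `loss_function`, and `epochs` are created by the setup code further down in
# this file; kept commented out so importing this module has no side effects.
#
# for epoch in range(epochs):
#     train(model, train_df, optimizer, loss_function, epoch)
#     train_MAE, train_MSE = eval(model, train_df)
#     test_MAE, test_MSE = eval(model, test_df)
#     print(epoch, train_MAE, train_MSE, test_MAE, test_MSE)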
def forward(self, x, x_lens, tmp_smi):
    # x_lens is only needed if sequence packing (pack_padded_sequence /
    # pad_packed_sequence) is re-enabled.
    x = x.to(device)
    # learned weighted sum of the three input channels
    x = self.matrix[0] * x[:, 0, :, :] + self.matrix[1] * x[:, 1, :, :] \
        + self.matrix[2] * x[:, 2, :, :]
    out, (hidden, cell) = self.lstm(x)  # hidden/cell carry the previous hidden state
    x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
        tmp_smi, feature_dicts)
    atoms_prediction, mol_prediction, mol_feature = self.model(
        torch.Tensor(x_atom).to(device), torch.Tensor(x_bonds).to(device),
        torch.cuda.LongTensor(x_atom_index),
        torch.cuda.LongTensor(x_bond_index), torch.Tensor(x_mask).to(device))
    alpha_n = 0
    att = 0
    # average the LSTM outputs over the sequence dimension
    out = torch.mean(out, dim=1).squeeze()
    # fully connected head: LSTM summary concatenated with the molecular feature
    out_tmp = self.fc3(out)
    out_tmp = F.leaky_relu(out_tmp)
    out_tmp = self.dropout(out_tmp)
    out_tmp = torch.cat((out_tmp.view(-1, 512), mol_feature.view(-1, 200)), dim=1)
    out_tmp = self.fc4(out_tmp)
    return out_tmp, alpha_n, att
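# Shape sketch for the forward pass above (dimensions inferred from the
# .view() calls: fc3 appears to output 512 units and mol_feature 200):
#   x            : (batch, 3, seq_len, feat)   three input channels
#   x (mixed)    : (batch, seq_len, feat)      learned weighted channel sum
#   out (LSTM)   : (batch, seq_len, hidden)    -> mean over seq_len -> (batch, hidden)
#   fc3(out)     : (batch, 512)
#   concat with mol_feature (batch, 200) -> (batch, 712) -> fc4 -> prediction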
def train_regressor(model, dataset, tasks, optimizer, loss_function, batch_size,
                    smiles_field, normalizeFlag, feature_dicts, stats):
    ratio_list = stats['ratio'].values
    model.train()
    valList = np.arange(0, dataset.shape[0])
    # shuffle dataset
    np.random.shuffle(valList)
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
        batch = valList[i:i + batch_size]
        batch_list.append(batch)
    for counter, train_batch in enumerate(batch_list):
        batch_df = dataset.loc[train_batch, :]
        smiles_list = batch_df[smiles_field].values
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = \
            get_smiles_array(smiles_list, feature_dicts)
        atoms_prediction, mol_prediction, _, _ = model(
            torch.Tensor(x_atom), torch.Tensor(x_bonds),
            torch.cuda.LongTensor(x_atom_index),
            torch.cuda.LongTensor(x_bond_index), torch.Tensor(x_mask))
        optimizer.zero_grad()
        loss = 0.0
        # accumulate the loss over tasks
        for i, task in enumerate(tasks):
            y_pred = mol_prediction[:, i]
            # note: check how normalization deals with NAs
            y_val = batch_df[task + normalizeFlag].values
            # filter out NAs
            validInds = np.where(~np.isnan(y_val))[0]
            if len(validInds) == 0:
                continue
            y_val_adjust = np.array([y_val[v] for v in validInds]).astype(float)
            validInds = torch.cuda.LongTensor(validInds).squeeze()
            y_pred_adjust = torch.index_select(y_pred, 0, validInds)
            loss += loss_function(
                y_pred_adjust,
                torch.Tensor(y_val_adjust).squeeze()) * ratio_list[i] ** 2
        loss.backward()
        optimizer.step()
    return loss
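# The helper below is a minimal, self-contained sketch (not part of the
# original pipeline) of the NA-masking used in train_regressor: NaN labels are
# dropped from a task's loss and the task loss is scaled by (std/MAD)**2. It
# uses CPU tensors and invented toy values; the pipeline above uses
# torch.cuda.LongTensor instead.
def _masked_loss_example():
    y_pred = torch.tensor([0.1, 0.4, -0.2, 0.9])  # model output for one task
    y_val = np.array([0.0, np.nan, -0.5, 1.0])    # labels with one missing entry
    ratio = 1.3                                   # hypothetical std/MAD for this task
    validInds = np.where(~np.isnan(y_val))[0]     # -> array([0, 2, 3])
    y_pred_valid = torch.index_select(y_pred, 0, torch.LongTensor(validInds))
    y_val_valid = torch.Tensor(y_val[validInds].astype(float))
    return F.mse_loss(y_pred_valid, y_val_valid) * ratio ** 2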
if os.path.isfile(feature_filename):
    feature_dicts = pickle.load(open(feature_filename, "rb"))
else:
    feature_dicts = save_smiles_dicts(smilesList, filename)

remained_df = smiles_tasks_df[smiles_tasks_df["cano_smiles"].isin(
    feature_dicts['smiles_to_atom_mask'].keys())]
uncovered_df = smiles_tasks_df.drop(remained_df.index)

test_df = remained_df.sample(frac=0.2, random_state=random_seed)
train_df = remained_df.drop(test_df.index)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(
    [canonical_smiles_list[0]], feature_dicts)
num_atom_features = x_atom.shape[-1]
num_bond_features = x_bonds.shape[-1]

loss_function = nn.MSELoss()
model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                    fingerprint_dim, output_units_num, p_dropout)
model.cuda()
optimizer = optim.Adam(model.parameters(), 10**-learning_rate,
                       weight_decay=10**-weight_decay)
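# Note on the optimizer convention above (a clarification, not a behaviour
# change): `learning_rate` and `weight_decay` are negative base-10 exponents,
# so learning_rate = 3.4 yields an Adam lr of 10**-3.4 ≈ 4.0e-4 and
# weight_decay = 4.9 yields 10**-4.9 ≈ 1.3e-5.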
def generate_RUNKEY_dataframe_AttentiveFP(df_filename, feature_filename,
                                          param_file_name, prev_model, id_field,
                                          layer=0, batch_size=128):
    # load parameters
    print('-------------- Load parameters --------------')
    with open(param_file_name, 'r') as myfile:
        data = myfile.read()
    obj = json.loads(data)
    normalizeFlag = obj['training_params']['normalizeFlag']
    smiles_field = obj['data_stats']['smiles_field']
    tasks = obj['data_stats']['tasks']

    # load the full dataset
    print('-------------- Load dataset --------------')
    df = pd.read_csv(df_filename)
    feature_dicts = pickle.load(open(feature_filename, 'rb'))
    remained_df = df[df[smiles_field].isin(feature_dicts['smiles_to_atom_mask'].keys())]
    uncovered_df = df.drop(remained_df.index)
    if len(uncovered_df) > 0:
        print('The following data is missing:')
        print(uncovered_df)
    df = remained_df[tasks + [obj['data_stats']['smiles_field']] + [id_field]]
    if not isinstance(df[id_field].iloc[0], str):
        df['chembl_id'] = df[id_field].astype(int)
    else:
        df['chembl_id'] = df[id_field]
    switch_field = lambda item: 'canonical_smiles' if item == smiles_field else item
    df.columns = [switch_field(item) for item in df.columns.tolist()]

    # load previously trained model
    print('-------------- Load prev model --------------')
    best_model = torch.load(prev_model)
    best_model_dict = best_model.state_dict()
    best_model_wts = copy.deepcopy(best_model_dict)
    model_for_viz = Fingerprint_viz(
        obj['model_params']['radius'], obj['model_params']['T'],
        obj['model_params']['num_atom_features'],
        obj['model_params']['num_bond_features'],
        obj['model_params']['fingerprint_dim'],
        obj['model_params']['output_units_num'],
        obj['model_params']['p_dropout'],
        obj['model_params']['batch_normalization'])
    model_for_viz.load_state_dict(best_model_wts)

    # calculate predictions and coordinates (may need batch processing)
    print('-------------- prepare compound df --------------')
    valList = np.arange(0, df.shape[0])
    np.random.shuffle(valList)
    df = df.loc[valList, :]
    df = df.reset_index(drop=True)
    N_training = min([df.shape[0], 5000])
    valList = np.arange(0, N_training)
    compound_df = df.loc[valList, :]
    batch_list = []
    for i in range(0, N_training, batch_size):
        batch = valList[i:(i + batch_size)]
        batch_list.append(batch)
    pred_mat = np.zeros((N_training, len(tasks)))
    atom_weight_mat = np.zeros((N_training, obj['model_params']['mol_length']))
    mol_feature_mat = np.zeros((N_training, obj['model_params']['fingerprint_dim']))
    for counter, train_batch in enumerate(batch_list):
        temp_df = compound_df.loc[train_batch, :]
        smiles_list = temp_df['canonical_smiles'].tolist()
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = \
            get_smiles_array(smiles_list, feature_dicts)
        atom_feature_viz, atom_attention_weight_viz, mol_feature_viz, \
            mol_feature_unbounded_viz, mol_attention_weight_viz, mol_prediction = model_for_viz(
                torch.Tensor(x_atom), torch.Tensor(x_bonds),
                torch.cuda.LongTensor(x_atom_index),
                torch.cuda.LongTensor(x_bond_index), torch.Tensor(x_mask))
        mol_pred = np.array(mol_prediction.data.squeeze().cpu().numpy())
        # translate normalized predictions back to the original scale
        mol_pred_translate = np.zeros(mol_pred.shape)
        for ii, task in enumerate(tasks):
            pos = int(np.where(np.array(obj['data_stats']['tasks']) == task)[0])
            mol_pred_translate[:, ii] = (mol_pred[:, pos] * obj['data_stats']['std'][pos]) \
                + obj['data_stats']['mean'][pos]
        pred_mat[train_batch, :] = mol_pred_translate
        atom_weight = np.stack([mol_attention_weight_viz[t].cpu().detach().numpy()
                                for t in range(obj['model_params']['T'])])[layer, :, :, 0]
        # arrange the atom weights according to the RDKit atom index
        for m in range(len(temp_df)):
            smiles = smiles_list[m]
            ind_mask = x_mask[m]
            ind_atom = smiles_to_rdkit_list[smiles]
            ind_weight = atom_weight[m, :]
            out_weight = []
            for j, one_or_zero in enumerate(list(ind_mask)):
                if one_or_zero == 1.0:
                    out_weight.append(ind_weight[j])
            out_weight_sorted = np.array(
                [out_weight[idx] for idx in np.argsort(ind_atom)]).flatten()
            atom_weight_mat[train_batch[m], 0:len(out_weight_sorted)] = out_weight_sorted
        mol_feature_mat[train_batch, :] = np.stack(
            [mol_feature_viz[t].cpu().detach().numpy()
             for t in range(obj['model_params']['T'])])[layer, :, :]

    # prepare compound_df (dimensionality reduction + minibatch clustering)
    # note: the original referenced an undefined `transfer_values` here; the
    # molecular feature matrix built above appears to be the intended input
    coord_values = dim_reduce_op(mol_feature_mat, type='seq')
    compound_df['x'] = coord_values[:, 0]
    compound_df['y'] = coord_values[:, 1]
    for ii, task in enumerate(tasks):
        compound_df['pred_' + task] = pred_mat[:, ii]
    mbk = cluster_MiniBatch(coord_values)
    mbk.means_labels_unique = np.unique(mbk.labels_)
    compound_df['label'] = mbk.labels_

    # prepare batch_df
    print('-------------- prepare batch df --------------')
    n_row = len(mbk.means_labels_unique)
    n_col = len(tasks)
    cluster_info_mat = np.zeros((n_row, (n_col + 3)))
    for k in range(n_row):
        mask = mbk.labels_ == mbk.means_labels_unique[k]
        cluster_info_mat[k, 0:n_col] = np.nanmean(pred_mat[mask], axis=0)
        cluster_info_mat[k, n_col] = sum(mask)
        cluster_info_mat[k, (n_col + 1):(n_col + 3)] = np.nanmean(coord_values[mask, :], axis=0)
    batch_df = pd.DataFrame(cluster_info_mat)
    batch_df.columns = ['avg_' + task for task in tasks] + ['size', 'coordx', 'coordy']
    batch_df['Label_id'] = mbk.means_labels_unique

    # prepare task_df
    print('-------------- prepare task df --------------')
    task_df = pd.DataFrame(atom_weight_mat)

    ### generate color labels by default
    print('------- Generate color labels with default K of 5 --------')
    batch_df, task_df, compound_df = update_bicluster(batch_df, task_df,
                                                      compound_df, mode='ST')

    ### wrapping up
    print('-------------- Saving datasets ----------------')
    compound_df.to_csv(output_prefix + 'compound_df.csv', index=False)
    batch_df.to_csv(output_prefix + 'batch_df.csv', index=False)
    task_df.to_csv(output_prefix + 'task_df.csv', index=False)
    return
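# Usage sketch (hypothetical file names, not from the original source) for the
# visualization-dataframe generator above; note it also relies on a
# module-level `output_prefix` for the output CSV paths:
# generate_RUNKEY_dataframe_AttentiveFP(
#     df_filename='data/compounds.csv',
#     feature_filename='data/compounds.p',
#     param_file_name='runs/exp1/params.json',
#     prev_model='runs/exp1/model-42.pt',
#     id_field='molregno',
#     layer=0, batch_size=128)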
def AttentiveFP_regressor_training(df_filename, feature_filename, tasks,
                                   fingerprint_dim, radius, T, output_dir,
                                   smiles_field='cano_smiles',
                                   normalizeFlag='_normalized',
                                   test_fraction=10, random_seed=8,
                                   batch_size=128, epochs=300, p_dropout=0.5,
                                   weight_decay=4.9, learning_rate=3.4,
                                   batch_normalization=False):
    '''
    INPUT:
        df_filename - path to a .csv file recording values for the tasks;
        feature_filename - .p file name of the stored chemical feature dictionary;
        tasks - a list; must be a subset of df.columns;
        fingerprint_dim - the number of nodes in the hidden layer;
        radius - the number of recurrent layers on the molecular graph;
        T - the number of recurrent layers on the virtual graph.
    '''
    #1 prepare dataset (extract the needed subset: id + targets + smiles) and split into train/test
    print('============ Training data loading =================')
    df = pd.read_csv(df_filename)
    feature_dicts = pickle.load(open(feature_filename, 'rb'))
    remained_df = df[df[smiles_field].isin(feature_dicts['smiles_to_atom_mask'].keys())]
    uncovered_df = df.drop(remained_df.index)
    if len(uncovered_df) > 0:
        print('The following data is missing:')
        print(uncovered_df)
    test_df = remained_df.sample(frac=1 / test_fraction, random_state=random_seed)
    training_data = remained_df.drop(test_df.index)

    # get the stats of the training data, which will be used to normalize the loss
    columns = ['Task', 'Mean', 'Standard deviation', 'Mean absolute deviation', 'ratio']
    mean_list = []
    std_list = []
    mad_list = []
    ratio_list = []
    for task in tasks:
        mean = training_data[task].mean()
        mean_list.append(mean)
        std = training_data[task].std()
        std_list.append(std)
        mad = training_data[task].mad()
        mad_list.append(mad)
        ratio_list.append(std / mad)
        training_data[task + normalizeFlag] = (training_data[task] - mean) / std
        test_df[task + normalizeFlag] = (test_df[task] - mean) / std
    list_of_tuples = list(zip(tasks, mean_list, std_list, mad_list, ratio_list))
    stats = pd.DataFrame(list_of_tuples, columns=columns)
    stats.to_csv(output_dir + 'trainset_stats.csv', index=None)

    train_df = training_data.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    print('Data loading finished:')
    print('Train set size: %i' % len(train_df))
    print('Test set size: %i' % len(test_df))

    #2 model initialization
    print('============ Model initialization =================')
    per_task_output_units_num = 1
    output_units_num = len(tasks) * per_task_output_units_num
    x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = \
        get_smiles_array([remained_df[smiles_field].iloc[0]], feature_dicts)
    num_atom_features = x_atom.shape[-1]
    num_bond_features = x_bonds.shape[-1]
    loss_function = nn.MSELoss()
    model = Fingerprint(radius, T, num_atom_features, num_bond_features,
                        fingerprint_dim, output_units_num, p_dropout,
                        batch_normalization)
    model.cuda()

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print('Total number of parameters: %i' % params)
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name, param.data.shape)
    optimizer = optim.Adam(model.parameters(), 10**-learning_rate,
                           weight_decay=10**-weight_decay)

    print('============ Saving params =================')
    #5 write all params to a json file
    model_params = {}
    model_params['radius'] = radius
    model_params['fingerprint_dim'] = fingerprint_dim
    model_params['T'] = T
    model_params['output_units_num'] = output_units_num
    model_params['num_atom_features'] = num_atom_features
    model_params['num_bond_features'] = num_bond_features
    model_params['p_dropout'] = p_dropout
    model_params['batch_normalization'] = batch_normalization
    model_params['mol_length'] = x_atom.shape[1]

    data_stats = {}
    data_stats['tasks'] = tasks
    data_stats['smiles_field'] = smiles_field
    data_stats['test_fraction'] = test_fraction
    data_stats['random_seed'] = random_seed
    data_stats['mean'] = mean_list
    data_stats['std'] = std_list
    data_stats['mad'] = mad_list
    data_stats['ratio'] = ratio_list

    training_params = {}
    training_params['batch_size'] = batch_size
    training_params['epochs'] = epochs
    training_params['weight_decay'] = weight_decay
    training_params['learning_rate'] = learning_rate
    training_params['normalizeFlag'] = normalizeFlag

    json_output = {}
    json_output['model_params'] = model_params
    json_output['data_stats'] = data_stats
    json_output['training_params'] = training_params
    with open(output_dir + 'params.json', 'w') as outfile:
        json.dump(json_output, outfile)

    #3 model training
    print('============ Start model training =================')
    # parameter initialization
    for m in model.modules():
        if isinstance(m, (nn.Linear)):
            nn.init.xavier_uniform_(m.weight)
        if isinstance(m, (nn.GRUCell)):
            nn.init.orthogonal_(m.weight_ih)
            nn.init.orthogonal_(m.weight_hh)

    best_param = {}
    best_param["train_epoch"] = 0
    best_param["valid_epoch"] = 0
    best_param["train_MSE_normalized"] = 9e8
    best_param["valid_MSE_normalized"] = 9e8
    for epoch in range(epochs):
        print(train_regressor(model, train_df, tasks, optimizer, loss_function,
                              batch_size, smiles_field, normalizeFlag,
                              feature_dicts, stats))
        # unpack in the order eval_regressor actually returns:
        # r2, MAE_normalized, MAE, MSE_normalized, MSE
        # (the original unpacked MAE and MSE in swapped positions)
        train_r2, train_MAE_normalized, train_MAE, train_MSE_normalized, \
            train_MSE = eval_regressor(model, train_df, smiles_field, tasks,
                                       normalizeFlag, batch_size, feature_dicts, stats)
        valid_r2, valid_MAE_normalized, valid_MAE, valid_MSE_normalized, \
            valid_MSE = eval_regressor(model, test_df, smiles_field, tasks,
                                       normalizeFlag, batch_size, feature_dicts, stats)

        #4 evaluation and log tracking
        print("EPOCH:\t" + str(epoch) + '\n'
              + "train_MAE: \n" + str(train_MAE) + '\n'
              + "valid_MAE: \n" + str(valid_MAE) + '\n'
              + "train_r2: \n" + str(train_r2) + '\n'
              + "valid_r2: \n" + str(valid_r2) + '\n'
              + "train_MSE_normalized_mean: " + str(train_MSE_normalized.mean()) + '\n'
              + "valid_MSE_normalized_mean: " + str(valid_MSE_normalized.mean()) + '\n'
              + "train_r2_mean: " + str(train_r2.mean()) + '\n'
              + "valid_r2_mean: " + str(valid_r2.mean()) + '\n')

        if train_MSE_normalized.mean() < best_param["train_MSE_normalized"]:
            best_param["train_epoch"] = epoch
            best_param["train_MSE_normalized"] = train_MSE_normalized.mean()
        if valid_MSE_normalized.mean() < best_param["valid_MSE_normalized"]:
            best_param["valid_epoch"] = epoch
            best_param["valid_MSE_normalized"] = valid_MSE_normalized.mean()
            if valid_r2.mean() > 0.6:
                torch.save(model, output_dir + 'model-' + str(epoch) + '.pt')
        if (epoch - best_param["train_epoch"] > 3) and (epoch - best_param["valid_epoch"] > 5):
            # early stopping
            torch.save(model, output_dir + 'model-' + str(epoch) + '.pt')
            break
    print("Training finished.")
    return
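# Usage sketch (hypothetical file names and task list, not from the original
# source) for the training entry point above; `tasks` must be a subset of the
# CSV's columns, and the feature dictionary must cover its SMILES:
# AttentiveFP_regressor_training(
#     df_filename='data/train.csv',
#     feature_filename='data/train_features.p',
#     tasks=['logD', 'solubility'],
#     fingerprint_dim=200, radius=2, T=2,
#     output_dir='runs/exp1/')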
def eval_regressor(model, dataset, smiles_field, tasks, normalizeFlag,
                   batch_size, feature_dicts, stats, plot_flag=False):
    std_list = stats['Standard deviation'].values
    ratio_list = stats['ratio'].values
    model.eval()
    y_val_list = {}
    y_pred_list = {}
    eval_MAE_list = {}
    eval_MSE_list = {}
    for task in tasks:
        y_pred_list[task] = np.array([])
        y_val_list[task] = np.array([])
        eval_MAE_list[task] = np.array([])
        eval_MSE_list[task] = np.array([])
    valList = np.arange(0, dataset.shape[0])
    batch_list = []
    for i in range(0, dataset.shape[0], batch_size):
        batch = valList[i:(i + batch_size)]
        batch_list.append(batch)
    for counter, eval_batch in enumerate(batch_list):
        batch_df = dataset.loc[eval_batch, :]
        smiles_list = batch_df[smiles_field].values
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = \
            get_smiles_array(smiles_list, feature_dicts)
        atoms_prediction, mol_prediction, _, _ = model(
            torch.Tensor(x_atom), torch.Tensor(x_bonds),
            torch.cuda.LongTensor(x_atom_index),
            torch.cuda.LongTensor(x_bond_index), torch.Tensor(x_mask))
        for i, task in enumerate(tasks):
            y_pred = mol_prediction[:, i]
            y_val = batch_df[task + normalizeFlag].values
            # filter out NAs
            validInds = np.where(~np.isnan(y_val))[0]
            valid_len = len(validInds)
            if valid_len == 0:
                continue
            y_val_adjust = np.array([y_val[v] for v in validInds]).astype(float)
            validInds = torch.cuda.LongTensor(validInds).squeeze()
            y_pred_adjust = torch.index_select(y_pred, 0, validInds)
            MAE = F.l1_loss(y_pred_adjust, torch.Tensor(y_val_adjust).squeeze(),
                            reduction='none')
            MSE = F.mse_loss(y_pred_adjust, torch.Tensor(y_val_adjust).squeeze(),
                             reduction='none')
            y_pred_list[task] = np.concatenate(
                [y_pred_list[task], y_pred_adjust.cpu().detach().numpy()])
            y_val_list[task] = np.concatenate([y_val_list[task], y_val_adjust])
            if valid_len == 1:
                # squeeze() drops the batch dim for a single sample; restore it
                eval_MAE_list[task] = np.concatenate(
                    [eval_MAE_list[task], MAE.data.squeeze().cpu().numpy().reshape((1,))])
                eval_MSE_list[task] = np.concatenate(
                    [eval_MSE_list[task], MSE.data.squeeze().cpu().numpy().reshape((1,))])
            else:
                eval_MAE_list[task] = np.concatenate(
                    [eval_MAE_list[task], MAE.data.squeeze().cpu().numpy()])
                eval_MSE_list[task] = np.concatenate(
                    [eval_MSE_list[task], MSE.data.squeeze().cpu().numpy()])
    eval_r2_score = np.array([r2_score(y_val_list[task], y_pred_list[task])
                              for task in tasks])
    eval_MSE_normalized = np.array([eval_MSE_list[task].mean() for task in tasks])
    eval_MAE_normalized = np.array([eval_MAE_list[task].mean() for task in tasks])
    # map errors on the z-scored targets back toward the original units
    eval_MAE = np.multiply(eval_MAE_normalized, np.array(std_list))
    eval_MSE = np.multiply(eval_MSE_normalized, np.array(std_list))
    if plot_flag:
        return eval_r2_score, eval_MAE_normalized, eval_MAE, \
            eval_MSE_normalized, eval_MSE, y_pred_list, y_val_list
    return eval_r2_score, eval_MAE_normalized, eval_MAE, eval_MSE_normalized, eval_MSE
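# Note on the de-normalization at the end of eval_regressor (an observation,
# not a change to the code): multiplying the normalized MAE by the training
# std recovers the MAE in original units, but squared error scales with
# std**2, so `eval_MSE` above remains on a mixed scale. A sketch of the
# strictly unit-correct version would be:
#     eval_MSE = np.multiply(eval_MSE_normalized, np.array(std_list) ** 2)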