# NOTE: the imports below cover every dependency used in this module; the
# `SMILESX` imports assume the SMILES-X package layout (utils, token, augm, model).
import os
from itertools import cycle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem.Draw.SimilarityMaps import GetSimilarityMapFromWeights

from scipy.ndimage import shift

from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import MinMaxScaler

from adjustText import adjust_text

from keras import metrics
from keras.models import load_model, Model

from SMILESX import utils, token, augm, model


def Inference(data_name,
              smiles_list = ['CC','CCC','C=O'],
              data_units = '',
              k_fold_number = 8,
              augmentation = False,
              outdir = "../data/"):
    """Infer a property for a list of SMILES from an ensemble of k trained models.

    Returns a pandas DataFrame with columns ['SMILES', 'ens_pred_mean', 'ens_pred_sd'],
    or None if no input SMILES could be validated by RDKit.
    """

    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'

    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Inference/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for inference starts...***\n\n")
    print("***Checking the SMILES list for inference***\n")
    smiles_checked = list()
    smiles_rejected = list()
    for ismiles in smiles_list:
        mol_tmp = Chem.MolFromSmiles(ismiles)
        if mol_tmp is not None:
            smiles_can = Chem.MolToSmiles(mol_tmp)
            smiles_checked.append(smiles_can)
        else:
            smiles_rejected.append(ismiles)

    if len(smiles_rejected) > 0:
        with open(save_dir+'rejected_smiles.txt','w') as f:
            for ismiles in smiles_rejected:
                f.write("%s\n" % ismiles)

    if len(smiles_checked) == 0:
        print("***Process of inference automatically aborted!***")
        print("None of the provided SMILES could be validated by RDKit.")
        return

    smiles_x = np.array(smiles_checked)
    # Placeholder property values (unknown at inference time)
    smiles_y = np.array([np.nan]*len(smiles_checked))

    # Data augmentation or not
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation requested.***\n")
        canonical = True
        rotation = False

    smiles_x_enum, smiles_x_enum_card, smiles_y_enum = \
        augm.Augmentation(smiles_x, smiles_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES: {}\n".format(smiles_x_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES
    smiles_x_enum_tokens = token.get_tokens(smiles_x_enum)

    # Model ensembling: average the per-fold predictions
    smiles_y_pred_mean_array = np.empty(shape=(0,len(smiles_checked)), dtype='float')
    for ifold in range(k_fold_number):
        # Tokens as a list
        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
        # Add 'pad', 'unk' tokens to the existing list
        vocab_size = len(tokens)
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

        # Transformation of tokenized SMILES to vector of integers and vice-versa
        token_to_int = token.get_tokentoint(tokens)
        int_to_token = token.get_inttotoken(tokens)

        # Trained model of the current fold
        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5',
                                 custom_objects={'AttentionM': model.AttentionM()})

        if ifold == 0:
            # Maximum length of tokenized SMILES the model accepts
            max_length = model_train.layers[0].output_shape[-1]
            print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
            print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        # Predict the property for the checked SMILES with the current fold's model
        smiles_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = smiles_x_enum_tokens,
                                                             max_length = max_length,
                                                             vocab = tokens)
        smiles_y_pred = model_train.predict(smiles_x_enum_tokens_tointvec)

        # Compute a mean per set of augmented SMILES
        smiles_y_pred_mean, _ = utils.mean_median_result(smiles_x_enum_card, smiles_y_pred)

        smiles_y_pred_mean_array = np.append(smiles_y_pred_mean_array,
                                             smiles_y_pred_mean.reshape(1,-1),
                                             axis = 0)

        if ifold == (k_fold_number-1):
            smiles_y_pred_mean_ensemble = np.mean(smiles_y_pred_mean_array, axis = 0)
            smiles_y_pred_sd_ensemble = np.std(smiles_y_pred_mean_array, axis = 0)

            pred_from_ens = pd.DataFrame(data=[smiles_x,
                                               smiles_y_pred_mean_ensemble,
                                               smiles_y_pred_sd_ensemble]).T
            pred_from_ens.columns = ['SMILES', 'ens_pred_mean', 'ens_pred_sd']

            print("***Inference of SMILES property done.***")

            return pred_from_ens
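

# --- Usage sketch (illustrative only, not part of the SMILES-X API) ---
# A minimal call assuming k trained folds already exist under outdir+'Main/'
# for the given dataset name; 'FreeSolv' and the SMILES below are placeholders.
def _example_inference():
    preds = Inference(data_name='FreeSolv',
                      smiles_list=['CCO', 'c1ccccc1', 'CC(=O)O'],
                      k_fold_number=8,
                      augmentation=True)
    # One row per valid input SMILES:
    #   SMILES | ens_pred_mean | ens_pred_sd
    print(preds)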
def Embedding_Vis(data,
                  data_name,
                  data_units='',
                  k_fold_number=8,
                  k_fold_index=0,
                  augmentation=False,
                  outdir="../data/",
                  affinity_propn=True,
                  verbose=0):
    """Project the token embedding of a trained model to 2D (PCA) and plot it,
    optionally coloring the tokens by affinity propagation clusters."""

    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'

    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name, p_dir_temp)
    save_dir = outdir+'Embedding_Vis/'+'{}/{}/'.format(data_name, p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for embedding visualization starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    # Reproduce the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:,1]),
                           random_state=seed_list[k_fold_index],
                           scaling=True)

    # Data augmentation or not
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation requested.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)
    x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)
    x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
          format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    print("Examples of tokenized SMILES from the training set:\n{}\n".\
          format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens + x_valid_enum_tokens + x_test_enum_tokens
    tokens = token.extract_vocab(all_smiles_tokens)
    vocab_size = len(tokens)

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of unique tokens in the training set: {}\n".format(len(train_unique_tokens)))
    train_unique_tokens.insert(0, 'pad')

    # Tokens as a list
    tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(k_fold_index)+'.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum length of tokenized SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print("Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n".format(max_length))

    # Transformation of tokenized SMILES to vector of integers and vice-versa
    token_to_int = token.get_tokentoint(tokens)
    int_to_token = token.get_inttotoken(tokens)

    model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(k_fold_index)+'.hdf5',
                             custom_objects={'AttentionM': model.AttentionM()})

    print("Chosen model summary:\n")
    print(model_train.summary())
    print("\n")

    print("***Embedding of the individual tokens from the chosen model.***\n")
    model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae, metrics.mse])

    # Embedding matrix of shape (vocab_size, embedding_dim)
    model_embed_weights = model_train.layers[1].get_weights()[0]
    # 2D projection of the embedding (t-SNE kept as a commented alternative)
    #tsne = TSNE(perplexity=30, early_exaggeration=120, n_components=2, random_state=123, verbose=0)
    #transformed_weights = tsne.fit_transform(model_embed_weights)
    pca = PCA(n_components=2, random_state=123)
    transformed_weights = pca.fit_transform(model_embed_weights)

    f = plt.figure(figsize=(9, 9))
    ax = plt.subplot(aspect='equal')
    if affinity_propn:
        # Cluster the tokens in the original embedding space
        af = AffinityPropagation().fit(model_embed_weights)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        # Plot one color per cluster; tokens absent from the training set get a cross
        colors = cycle('bgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = np.where(labels == k)[0].tolist()
            for ilabpt in class_members:
                line_tmp = 1 if tokens[ilabpt] in train_unique_tokens else 5
                marker_tmp = 'o' if tokens[ilabpt] in train_unique_tokens else 'x'
                edge_color_tmp = 'black' if tokens[ilabpt] in train_unique_tokens else col
                ax.plot(transformed_weights[ilabpt, 0],
                        transformed_weights[ilabpt, 1],
                        col,
                        marker=marker_tmp,
                        markeredgecolor=edge_color_tmp,
                        markeredgewidth=line_tmp,
                        alpha=0.5,
                        markersize=10)
    else:
        # Black and white plot
        for ilabpt in range(vocab_size):
            alpha_tmp = 0.5 if tokens[ilabpt] in train_unique_tokens else 0.2
            size_tmp = 40 if tokens[ilabpt] in train_unique_tokens else 20
            ax.scatter(transformed_weights[ilabpt, 0],
                       transformed_weights[ilabpt, 1],
                       lw=1, s=size_tmp, facecolor='black', marker='o', alpha=alpha_tmp)

    # Annotate every token, avoiding label overlaps
    annotations = []
    for ilabpt, (x_i, y_i) in enumerate(zip(transformed_weights[:, 0].tolist(),
                                            transformed_weights[:, 1].tolist())):
        weight_tmp = 'black' if tokens[ilabpt] in train_unique_tokens else 'normal'
        tokens_tmp = tokens[ilabpt]
        if tokens_tmp == ' ':
            tokens_tmp = 'space'
        elif tokens_tmp == '.':
            tokens_tmp = 'dot'
        annotations.append(plt.text(x_i, y_i, tokens_tmp, fontsize=12, weight=weight_tmp))
    adjust_text(annotations,
                x=transformed_weights[:, 0].tolist(),
                y=transformed_weights[:, 1].tolist(),
                arrowprops=dict(arrowstyle="-", color='k', lw=0.5))

    plt.xticks([])
    plt.yticks([])
    ax.axis('tight')
    plt.savefig(save_dir+'Visualization_'+data_name+'_Embedding_fold_'+str(k_fold_index)+'.png',
                bbox_inches='tight')
    plt.show()
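

# --- Usage sketch (illustrative only) ---
# Assumes the dataset CSV has a 'smiles' column followed by a property column,
# and that fold k_fold_index was already trained by the main SMILES-X pipeline.
# The file path and dataset name are placeholders.
def _example_embedding_vis():
    data = pd.read_csv('../data/FreeSolv.csv')
    Embedding_Vis(data=data,
                  data_name='FreeSolv',
                  k_fold_number=8,
                  k_fold_index=0,
                  augmentation=True,
                  affinity_propn=True)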
def Interpretation(data,
                   data_name,
                   data_units = '',
                   k_fold_number = 8,
                   k_fold_index = 0,
                   augmentation = False,
                   outdir = "../data/",
                   smiles_toviz = 'CCC',
                   font_size = 15,
                   font_rotation = 'horizontal'):
    """Visualize what a trained model attends to for a given SMILES:
    a 1D attention map over tokens, a 2D similarity map on the molecule,
    and the temporal evolution of the prediction along the SMILES string."""

    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'

    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Interpretation/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X Interpreter starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    # Train/validation/test data splitting - 80/10/10 % at random with a different seed per fold
    selection_seed = seed_list[k_fold_index]

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:,1]),
                           random_state=selection_seed,
                           scaling = True)

    np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_valid.txt', np.asarray(x_valid), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_test.txt', np.asarray(x_test), newline="\n", fmt='%s')

    mol_toviz = Chem.MolFromSmiles(smiles_toviz)
    if mol_toviz is not None:
        smiles_toviz_can = Chem.MolToSmiles(mol_toviz)
    else:
        print("***Process of visualization automatically aborted!***")
        print("The given smiles_toviz is invalid and cannot be canonicalized by RDKit.")
        return

    smiles_toviz_x = np.array([smiles_toviz_can])
    # Retrieve the true property value if the molecule belongs to the dataset
    if smiles_toviz_can in np.array(data.smiles):
        smiles_toviz_y = np.array([[data.iloc[np.where(data.smiles == smiles_toviz_x[0])[0][0],1]]])
    else:
        smiles_toviz_y = np.array([[np.nan]])

    # Data augmentation or not
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation requested.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)
    x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)
    x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)
    smiles_toviz_x_enum, smiles_toviz_x_enum_card, smiles_toviz_y_enum = \
        augm.Augmentation(smiles_toviz_x, smiles_toviz_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
          format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)
    smiles_toviz_x_enum_tokens = token.get_tokens(smiles_toviz_x_enum)

    print("Examples of tokenized SMILES from the training set:\n{}\n".\
          format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens
    tokens = token.extract_vocab(all_smiles_tokens)
    vocab_size = len(tokens)

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of unique tokens in the training set: {}\n".format(len(train_unique_tokens)))
    train_unique_tokens.insert(0,'pad')

    # Tokens as a list
    tokens = token.get_vocab(input_dir+data_name+'_Vocabulary.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum length of tokenized SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

    # Transformation of tokenized SMILES to vector of integers and vice-versa
    token_to_int = token.get_tokentoint(tokens)
    int_to_token = token.get_inttotoken(tokens)

    # Best architecture to visualize from
    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5',
                                 custom_objects={'AttentionM': model.AttentionM()})
    best_arch = [model_topredict.layers[2].output_shape[-1]/2,
                 model_topredict.layers[3].output_shape[-1],
                 model_topredict.layers[1].output_shape[-1]]

    # Same architecture, but set up to return the attention weights
    model_att = model.LSTMAttModel.create(inputtokens = max_length+1,
                                          vocabsize = vocab_size,
                                          lstmunits = int(best_arch[0]),
                                          denseunits = int(best_arch[1]),
                                          embedding = int(best_arch[2]),
                                          return_proba = True)

    print("Best model summary:\n")
    print(model_att.summary())
    print("\n")
    print("***Interpretation from the best model.***\n")

    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
    model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = smiles_toviz_x_enum_tokens,
                                                               max_length = max_length+1,
                                                               vocab = tokens)

    # Attention weights are read from the penultimate layer
    intermediate_layer_model = Model(inputs=model_att.input,
                                     outputs=model_att.layers[-2].output)
    intermediate_output = intermediate_layer_model.predict(smiles_toviz_x_enum_tokens_tointvec)

    smiles_toviz_x_card_cumsum_viz = np.cumsum(smiles_toviz_x_enum_card)
    smiles_toviz_x_card_cumsum_shift_viz = shift(smiles_toviz_x_card_cumsum_viz, 1, cval=0)

    # Index of the first augmented SMILES of the molecule to visualize
    mols_id = 0
    ienumcard = smiles_toviz_x_card_cumsum_shift_viz[mols_id]

    # 1D attention map over the tokens of the SMILES
    smiles_len_tmp = len(smiles_toviz_x_enum_tokens[ienumcard])
    intermediate_output_tmp = intermediate_output[ienumcard,-smiles_len_tmp+1:-1].flatten().reshape(1,-1)

    plt.matshow(intermediate_output_tmp, cmap='Reds')
    plt.tick_params(axis='x', bottom = False)
    plt.xticks(range(smiles_len_tmp-2),
               [int_to_token[iint].replace('pad','') \
                for iint in smiles_toviz_x_enum_tokens_tointvec[ienumcard,-smiles_len_tmp+1:-1]],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks([])
    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
    #plt.show()

    # 2D attention map drawn on the molecule
    smiles_tmp = smiles_toviz_x_enum[ienumcard]
    mol_tmp = Chem.MolFromSmiles(smiles_tmp)
    mol_df_tmp = pd.DataFrame([smiles_toviz_x_enum_tokens[ienumcard][1:-1],
                               intermediate_output[ienumcard].flatten().tolist()[-smiles_len_tmp+1:-1]]).transpose()
    # Keep atom tokens only (drop bonds, branches and ring-closure digits)
    bond = ['-','=','#','$','/','\\','.','(',')']
    mol_df_tmp = mol_df_tmp[~mol_df_tmp.iloc[:,0].isin(bond)]
    mol_df_tmp = mol_df_tmp[[not itoken.isdigit() for itoken in mol_df_tmp.iloc[:,0].values.tolist()]]

    minmaxscaler = MinMaxScaler(feature_range=(0,1))
    norm_weights = minmaxscaler.fit_transform(mol_df_tmp.iloc[:,1].values.reshape(-1,1)).flatten().tolist()
    fig = GetSimilarityMapFromWeights(mol=mol_tmp,
                                      size=(250,250),
                                      scale=-1,
                                      sigma=0.05,
                                      weights=norm_weights,
                                      colorMap='Reds',
                                      contourLines=10,
                                      alpha=0.25)
    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
    #fig.show()

    # Temporal evolution of the prediction along the SMILES string
    model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    y_pred_test_tmp = model_topredict.predict(smiles_toviz_x_enum_tokens_tointvec[ienumcard].reshape(1,-1))[0,0]
    y_test_tmp = smiles_toviz_y_enum[ienumcard,0]
    if not np.isnan(y_test_tmp):
        print("True value: {0:.2f} Predicted: {1:.2f}".\
              format(y_test_tmp,
                     scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]))
    else:
        print("Predicted: {0:.2f}".format(scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]))

    # Predict on growing prefixes of the tokenized SMILES
    diff_topred_list = list()
    diff_totrue_list = list()
    for csubsmiles in range(1,smiles_len_tmp):
        isubsmiles = smiles_toviz_x_enum_tokens[ienumcard][:csubsmiles]+[' ']
        isubsmiles_tointvec = token.int_vec_encode(tokenized_smiles_list = [isubsmiles],
                                                   max_length = max_length+1,
                                                   vocab = tokens)
        predict_prop_tmp = model_topredict.predict(isubsmiles_tointvec)[0,0]
        diff_topred_tmp = (predict_prop_tmp-y_pred_test_tmp)/np.abs(y_pred_test_tmp)
        diff_topred_list.append(diff_topred_tmp)
        diff_totrue_tmp = (predict_prop_tmp-y_test_tmp)/np.abs(y_test_tmp)
        diff_totrue_list.append(diff_totrue_tmp)

    plt.figure(figsize=(15,7))
    markers, stemlines, baseline = plt.stem(range(smiles_len_tmp-1),
                                            diff_topred_list,
                                            'k.-',
                                            use_line_collection=True)
    plt.setp(baseline, color='k', linewidth=2, linestyle='--')
    plt.setp(markers, linewidth=1, marker='o', markersize=10, markeredgecolor='black')
    plt.setp(stemlines, color='k', linewidth=0.5, linestyle='-')
    plt.xticks(range(smiles_len_tmp-1),
               smiles_toviz_x_enum_tokens[ienumcard][:-1],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks(fontsize = 20)
    plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
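

# --- Usage sketch (illustrative only) ---
# Visualize the attention for one molecule of a dataset; the CSV path, dataset
# name and SMILES are placeholders, and the fold must have been trained first.
def _example_interpretation():
    data = pd.read_csv('../data/FreeSolv.csv')
    Interpretation(data=data,
                   data_name='FreeSolv',
                   k_fold_number=8,
                   k_fold_index=0,
                   augmentation=True,
                   smiles_toviz='Cc1ccccc1O',
                   font_size=12,
                   font_rotation='vertical')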