from sklearn.decomposition import PCA
import rdkit.Chem as Chem
from rdkit.Chem import Draw
# os / pandas / matplotlib are needed by the calls below; print_df is a project
# helper for inspecting DataFrames (shape plus a short preview).
import os
import pandas as pd
import matplotlib.pyplot as plt

if __name__ == '__main__':
    root_dir = '../../../big_data'
    # result_dir = os.path.join(root_dir, 'figures', 'chapter3_figures')
    # sub_dir = '06_model_Parallel2vec'  # 05_model_Tandem2vec or 06_model_Parallel2vec
    # input_dir = os.path.join(root_dir, sub_dir)
    need_plot_md = ['nN', 'nS', 'nBondsD', 'naRing']
    frag2info = pd.read_csv(os.path.join(root_dir, '03_fragment', 'frag_smiles2md.csv'),
                            index_col=0)
    frag2info = frag2info.loc[:, need_plot_md]
    print_df(frag2info)
    # if not os.path.exists(result_dir):
    #     os.makedirs(result_dir)
    for frag_sentence_type in ['parallel']:  # only plot parallel
        print('Deal with fragment sentence type: {}'.format(frag_sentence_type))
        # frag_sentence_type = 'tandem'  # parallel or tandem
        if frag_sentence_type == 'parallel':
            sub_dir2 = '06_model_Parallel2vec'
        else:
            sub_dir2 = '05_model_Tandem2vec'
        # sub_dir_tandem = '05_model_Tandem2vec'
        # sub_dir_parallel = '06_model_Parallel2vec'
        # minn = 1
        # maxn = 2
        # find each aromatic ring and its corresponding non-aromatic ring pair
        # (frag_info, bond_pair_file_path and frag_smiles2vec_file_path are set up
        # in code omitted from this excerpt)
        bond_pair = find_aromatic_non_aroma_ring_pair(frag_df=frag_info)
        with open(bond_pair_file_path, 'a') as f_handle:
            if bond_pair:
                for bp in bond_pair:
                    # print(bp)
                    f_handle.write('\t'.join(list(bp) + ['aromatic_ring']) + '\n')
        frag_pairs = pd.read_csv(bond_pair_file_path, sep='\t')
        frag2vec = pd.read_csv(frag_smiles2vec_file_path, index_col='fragment')

        # project the fragment vectors onto the first two principal components
        pca = PCA(n_components=2)
        x_reduced_pca = pd.DataFrame(data=pca.fit_transform(frag2vec), index=frag2vec.index)
        print('>>> x_reduced_pca')
        print_df(x_reduced_pca)
        # frag_pairs = frag_pairs.loc[frag_pairs['keep'] == 1]
        print('>>> frag_pairs')
        print_df(frag_pairs)

        for bond_type in frag_pairs['bond_type'].unique():
            print('>>> Deal with {}...'.format(bond_type))
            plt.figure(figsize=(8, 6))
            current_bond_pairs = frag_pairs.loc[frag_pairs['bond_type'] == bond_type].copy()
            # keep at most 100 pairs per bond type to avoid over-plotting
            if current_bond_pairs.shape[0] > 100:
                current_bond_pairs = current_bond_pairs.sample(n=100, random_state=42)
            current_frag1 = current_bond_pairs.loc[:, 'frag1']
            current_frag2 = current_bond_pairs.loc[:, 'frag2']
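# ---------------------------------------------------------------------------
# The per-bond-type plotting itself is not shown above. A minimal, hedged
# sketch (not the original plotting code): given the PCA coordinates and the
# two columns of paired fragment SMILES, the pairs could be overlaid on the
# 2D plane as below. `plot_ring_pairs` and all of its parameters are
# hypothetical names; it assumes every fragment SMILES is a row of `x2d`.
import matplotlib.pyplot as plt


def plot_ring_pairs(x2d, frags_a, frags_b, title=''):
    """Scatter two sets of paired fragments on a 2D embedding and connect each pair."""
    fig, ax = plt.subplots(figsize=(8, 6))
    a = x2d.loc[list(frags_a)]
    b = x2d.loc[list(frags_b)]
    ax.scatter(a.iloc[:, 0], a.iloc[:, 1], c='tab:red', s=12, label='aromatic ring')
    ax.scatter(b.iloc[:, 0], b.iloc[:, 1], c='tab:blue', s=12, label='non-aromatic ring')
    for (_, p), (_, q) in zip(a.iterrows(), b.iterrows()):
        ax.plot([p.iloc[0], q.iloc[0]], [p.iloc[1], q.iloc[1]], c='grey', lw=0.5, alpha=0.5)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_title(title)
    ax.legend()
    return fig

# Possible usage inside the bond_type loop above:
# fig = plot_ring_pairs(x_reduced_pca, current_frag1, current_frag2, title=bond_type)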
print(frag_vec_file_path)
get_mol_vec(frag2vec_file_path=frag_vec_file_path,
            data_set=cid2frag_smiles_file_path,
            result_path=mol_vector_file_path)

train_set_file_path = os.path.join(result_dir, 'train_set.csv')
test_set_file_path = os.path.join(result_dir, 'test_set.csv')
if not (os.path.exists(train_set_file_path) and os.path.exists(test_set_file_path)):
    print('Split down-sampled dataset...')
    split_data = split_data_set(os.path.join(root_dir, subdir1, down_sampled_mol_file))
    train_set = split_data['train_set']
    test_set = split_data['test_set']
    print('>>> Training set')
    print_df(train_set)
    print('>>> Test set')
    print_df(test_set)
    train_set.to_csv(train_set_file_path)
    test_set.to_csv(test_set_file_path)

# --------------------------------------------------------------------------------------
# train model
print('Start to train MLP model...')
train_set = pd.read_csv(train_set_file_path, index_col='cid')
selected_mol2md = pd.read_csv(selected_mol2md_file_path, index_col='cid')
md = get_ordered_md()
selected_mol2md = selected_mol2md.loc[:, md].copy()
y = selected_mol2md.loc[selected_mol2md.index.isin(train_set.index)]
for frag_type in ['tandem', 'parallel', 'random']:  # ['tandem', 'parallel']
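# ---------------------------------------------------------------------------
# `split_data_set` above is a project helper that is not shown here. Judging
# only from how it is used (it returns a dict with 'train_set' and 'test_set'
# DataFrames), a minimal sketch could be a plain random split; the test_size,
# random_state and 'cid' index column are assumptions.
import pandas as pd
from sklearn.model_selection import train_test_split


def split_data_set(file_path, test_size=0.2, random_state=42):
    """Split a molecule table (indexed by cid) into train / test subsets."""
    data = pd.read_csv(file_path, index_col='cid')
    train_set, test_set = train_test_split(data, test_size=test_size, random_state=random_state)
    return {'train_set': train_set, 'test_set': test_set}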
def archive():
    # deal with y
    print(' > Start to deal with y...')
    frag2md_info_file = ''
    frag2vec_file = ''
    frag2md_info = pd.read_csv(os.path.join(root_dir, frag2md_info_file), index_col='fragment')
    if model_type == 'classification':
        # binarize descriptor counts: any count >= 1 becomes class 1
        frag2md_info[frag2md_info >= 1] = 1
    all_x = pd.read_csv(os.path.join(root_dir, frag2vec_file), index_col=0)
    x = all_x.loc[all_x.index != 'UNK'].copy()
    y = frag2md_info.loc[x.index, SELECTED_MD].copy()

    # train model
    print(' > Start to train model...')
    for md in SELECTED_MD:
        if md == 'naRing':
            result_dir_new = os.path.join(result_dir, md + '_aromaticity')
        else:
            result_dir_new = os.path.join(result_dir, md)
        m_part1 = nn_model_regression(x=x, y=y.loc[:, [md]], epochs=100,
                                      result_dir=result_dir_new, callback=True)

        # get new frag_id2vec in 30D
        frag2vec_30d = pd.DataFrame(data=m_part1.predict(all_x), index=all_x.index)
        print_df(frag2vec_30d)
        frag2vec_new_fp = os.path.join(result_dir_new, 'frag2vec_new_30d.csv')
        frag2vec_30d.to_csv(frag2vec_new_fp)

        # plot by t-SNE
        x_reduced = reduce_by_tsne(frag2vec_30d)
        x_reduced = pd.DataFrame(data=x_reduced, index=frag2vec_30d.index)
        print(' > Start to plot t-SNE vis of fragment vector...')
        # save_fig_path = os.path.join('./chapter4_figure/', 't-SNE_vis_new_30d.png')
        need_plot_md = [md] + list(np.random.choice(SELECTED_MD, 3, replace=False))
        fig = show_each_md(x_reduced=x_reduced, frag_info=frag2md_info.loc[:, need_plot_md])
        fig.savefig(os.path.join(result_dir_new, 't-SNE_vis_sorted_by_{}.png'.format(md)), dpi=200)

        # plot top n nearest-neighbor fragments
        topn = 4
        print(' > Start to plot top {} nearest neighbor of selected fragment vector...'.format(topn))
        q_frags = ['C1=COCO1', 'C1=CCNN=C1', 'C1=CCC1', 'OBr', 'S=S', 'C1#CNCC1']
        q_mol2vec = frag2vec_30d.loc[q_frags, :]
        nn = find_nearest_neighbor(training_mol_vec_fp=frag2vec_new_fp,
                                   query_mol_vec_df=q_mol2vec, top_n=topn, mol2md_fp='')

        # collect the neighbours' SMILES and distances, then draw them in a grid
        smiles_list = []
        dis = []
        legends = []
        for inx in range(len(q_frags)):
            neighbors = nn[inx][q_frags[inx]].split('; ')
            smiles_list += [i.split(': ')[0] for i in neighbors]
            dis += ['{:.8f}'.format(float(i.split(': ')[1])) for i in neighbors]
            # print(dis)
            # print(inx, smiles_list)
        legends += ['{}({})'.format(smiles_list[i], dis[i]) for i in range(len(smiles_list))]
        fig = draw_multiple_mol(smiles_list=smiles_list, mols_per_row=topn, legends=legends)
        # print(type(fig))
        # print(fig)
        save_fig(fig, file_path=os.path.join(
            result_dir_new, 'top{}_nearest_neighbor_sorted_by_{}.svg'.format(topn, md)))
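# ---------------------------------------------------------------------------
# `reduce_by_tsne` used in archive() is a project helper whose body is not
# shown. A minimal sketch, assuming it simply maps the fragment vectors to a
# low-dimensional space with scikit-learn's t-SNE (the n_components and
# random_state defaults are assumptions):
import numpy as np
from sklearn.manifold import TSNE


def reduce_by_tsne(x, n_components=2, random_state=42):
    """Project fragment vectors (DataFrame or array) into n_components dimensions."""
    tsne = TSNE(n_components=n_components, random_state=random_state)
    return tsne.fit_transform(np.asarray(x))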
# collect the CIDs present in the cid2smiles file, then restrict cid2md to them
cid_list_new = {}
with open(cid2smiles_file_name, 'r') as f:
    for i in tqdm(f):
        cid = i.split('\t')[0]
        if cid != 'cid':  # skip the header line
            cid_list_new[int(cid)] = 1
cid2md.loc[list(cid_list_new.keys()), :].to_csv(cid2md_file_path_new, index_label='cid')

print('Start to classify molecules by MD combination...')
class_by_md_combination_file_path = os.path.join(current_dir, 'class_by_md_combination.csv')
if not os.path.exists(class_by_md_combination_file_path):
    cid2md = pd.read_csv(cid2md_file_path_new, index_col='cid')
    # assert cid2md.shape[1] == 9
    print_df(cid2md)
    class_by_md_comb = get_class_md_combination(cid2md)
    print(' > check again')
    print(sum(class_by_md_comb.index.isin(cid2md.index)))
    print_df(class_by_md_comb)
    class_by_md_comb.to_csv(class_by_md_combination_file_path, index_label='cid')
else:
    print('>>> Using previous result...')
    # class_by_md_comb = pd.read_csv(class_by_md_combination_file_path, index_col=0)

print('Start down-sampling...')
selected_cid2md_class_file_path = os.path.join(current_dir, 'selected_cid2md_class.csv')
if not os.path.exists(selected_cid2md_class_file_path):
    down_sampling_mol(class_by_md_combination_file_path, result_dir=current_dir,
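# ---------------------------------------------------------------------------
# `get_class_md_combination` and `down_sampling_mol` are project helpers whose
# bodies are not shown, and the down_sampling_mol call above is truncated in
# this excerpt. A hedged sketch of the first helper, assuming each molecule is
# labelled by the binarized 0/1 pattern of its MD columns (the 'md_class'
# column name is an assumption):
import pandas as pd


def get_class_md_combination(cid2md):
    """Label each molecule by the 0/1 pattern of its molecular descriptors."""
    binary = (cid2md >= 1).astype(int).astype(str)
    md_class = binary.apply(lambda row: ''.join(row), axis=1)
    return pd.DataFrame({'md_class': md_class}, index=cid2md.index)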