Example #1
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.decomposition import PCA

import rdkit.Chem as Chem
from rdkit.Chem import Draw
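
# `print_df` is used throughout but never defined in this excerpt; a minimal
# sketch, assuming it only prints a quick summary of a DataFrame.
def print_df(df):
    """Show the shape and the first few rows of a DataFrame."""
    print(df.shape)
    print(df.head())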

if __name__ == '__main__':
    root_dir = '../../../big_data'
    # result_dir = os.path.join(root_dir, 'figures', 'chapter3_figures')
    # sub_dir = '06_model_Parallel2vec'  # 05_model_Tandem2vec or 06_model_Parallel2vec
    # input_dir = os.path.join(root_dir, sub_dir)
    need_plot_md = ['nN', 'nS', 'nBondsD', 'naRing']
    frag2info = pd.read_csv(os.path.join(root_dir, '03_fragment',
                                         'frag_smiles2md.csv'),
                            index_col=0)
    frag2info = frag2info.loc[:, need_plot_md]
    print_df(frag2info)
    # if not os.path.exists(result_dir):
    #     os.makedirs(result_dir)

    for frag_sentence_type in ['parallel']:  # only plot parallel
        print(
            'Deal with fragment sentence type: {}'.format(frag_sentence_type))
        # frag_sentence_type = 'tandem'  # parallel or tandem
        if frag_sentence_type == 'parallel':
            sub_dir2 = '06_model_Parallel2vec'
        else:
            sub_dir2 = '05_model_Tandem2vec'
        # sub_dir_tandem = '05_model_Tandem2vec'
        # sub_dir_parallel = '06_model_Parallel2vec'
        # minn = 1
        # maxn = 2
Example #2
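
# `find_aromatic_non_aroma_ring_pair` is called below but not defined in this
# excerpt. A minimal sketch, assuming frag_df is indexed by fragment SMILES and
# that a pair is an aromatic single-ring fragment matched with a saturated
# single-ring fragment sharing the same heavy-atom composition and ring size.
def find_aromatic_non_aroma_ring_pair(frag_df):
    from collections import Counter, defaultdict
    aromatic, saturated = defaultdict(list), defaultdict(list)
    for smi in frag_df.index:
        mol = Chem.MolFromSmiles(smi)
        if mol is None or mol.GetRingInfo().NumRings() != 1:
            continue
        # key = heavy-atom element counts + ring size
        key = (frozenset(Counter(a.GetSymbol()
                                 for a in mol.GetAtoms()).items()),
               len(mol.GetRingInfo().AtomRings()[0]))
        if any(a.GetIsAromatic() for a in mol.GetAtoms()):
            aromatic[key].append(smi)
        else:
            saturated[key].append(smi)
    return [(a, s) for key in aromatic
            for a in aromatic[key] for s in saturated.get(key, [])]
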
        # find aromatic ring and corresponding non-aromatic ring pair
        bond_pair = find_aromatic_non_aroma_ring_pair(frag_df=frag_info)
        with open(bond_pair_file_path, 'a') as f_handle:
            if bond_pair:
                for bp in bond_pair:
                    # print(bp)
                    f_handle.write('\t'.join(list(bp) + ['aromatic_ring']) +
                                   '\n')

    frag_pairs = pd.read_csv(bond_pair_file_path, sep='\t')
    frag2vec = pd.read_csv(frag_smiles2vec_file_path, index_col='fragment')
    # project fragment vectors to 2D with PCA for visualisation
    pca = PCA(n_components=2)
    x_reduced_pca = pd.DataFrame(data=pca.fit_transform(frag2vec),
                                 index=frag2vec.index)
    print('>>> x_reduced_pca')
    print_df(x_reduced_pca)

    # frag_pairs = frag_pairs.loc[frag_pairs['keep'] == 1]
    print('>>> frag_pairs')
    print_df(frag_pairs)

    for bond_type in frag_pairs['bond_type'].unique():
        print('>>> Deal with {}...'.format(bond_type))
        plt.figure(figsize=(8, 6))
        current_bond_pairs = frag_pairs.loc[frag_pairs['bond_type'] ==
                                            bond_type].copy()
        if current_bond_pairs.shape[0] > 100:
            current_bond_pairs = current_bond_pairs.sample(n=100,
                                                           random_state=42)
        current_frag1 = current_bond_pairs.loc[:, 'frag1']
        current_frag2 = current_bond_pairs.loc[:, 'frag2']
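
    # `get_mol_vec` (used just below) is not defined in this excerpt. A minimal
    # sketch, assuming data_set is tab-separated as "cid<TAB>frag1,frag2,..."
    # and that a molecule vector is the sum of its fragment vectors.
    def get_mol_vec(frag2vec_file_path, data_set, result_path):
        frag2vec_df = pd.read_csv(frag2vec_file_path, index_col='fragment')
        mol_vecs = {}
        with open(data_set) as fin:
            for line in fin:
                cid, frags = line.rstrip('\n').split('\t')[:2]
                if cid == 'cid':
                    continue  # skip header line
                frag_list = [f for f in frags.split(',')
                             if f in frag2vec_df.index]
                if frag_list:
                    mol_vecs[cid] = frag2vec_df.loc[frag_list].sum(axis=0)
        pd.DataFrame(mol_vecs).T.to_csv(result_path, index_label='cid')
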
    # build molecule-level vectors by aggregating fragment vectors
    print(frag_vec_file_path)
    get_mol_vec(frag2vec_file_path=frag_vec_file_path,
                data_set=cid2frag_smiles_file_path,
                result_path=mol_vector_file_path)

    train_set_file_path = os.path.join(result_dir, 'train_set.csv')
    test_set_file_path = os.path.join(result_dir, 'test_set.csv')
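
    # `split_data_set` is not defined in this excerpt. A minimal sketch based
    # on sklearn's train_test_split; the 'cid' index and the 80/20 split ratio
    # are assumptions.
    def split_data_set(mol_file_path, test_size=0.2, random_state=42):
        from sklearn.model_selection import train_test_split
        mol_df = pd.read_csv(mol_file_path, index_col='cid')
        train_set, test_set = train_test_split(mol_df, test_size=test_size,
                                               random_state=random_state)
        return {'train_set': train_set, 'test_set': test_set}
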
    if not (os.path.exists(train_set_file_path)
            and os.path.exists(test_set_file_path)):
        print('Split down-sampled dataset...')
        split_data = split_data_set(
            os.path.join(root_dir, subdir1, down_sampled_mol_file))
        train_set = split_data['train_set']
        test_set = split_data['test_set']
        print('>>> Training set')
        print_df(train_set)
        print('>>> Test set')
        print_df(test_set)
        train_set.to_csv(train_set_file_path)
        test_set.to_csv(test_set_file_path)

    # --------------------------------------------------------------------------------------
    # train model
    print('Start to train MLP model...')
    train_set = pd.read_csv(train_set_file_path, index_col='cid')
    selected_mol2md = pd.read_csv(selected_mol2md_file_path, index_col='cid')
    md = get_ordered_md()
    selected_mol2md = selected_mol2md.loc[:, md].copy()
    y = selected_mol2md.loc[selected_mol2md.index.isin(train_set.index)]
    for frag_type in ['tandem', 'parallel', 'random']:  # or only ['tandem', 'parallel']
        pass  # per-fragment-type training steps go here (not shown in this excerpt)

def archive():
    # deal with y
    print('  > Start to deal with y...')
    frag2md_info_file = ''
    frag2vec_file = ''
    frag2md_info = pd.read_csv(os.path.join(root_dir, frag2md_info_file),
                               index_col='fragment')
    if model_type == 'classification':
        frag2md_info[frag2md_info >= 1] = 1
    all_x = pd.read_csv(os.path.join(root_dir, frag2vec_file), index_col=0)
    x = all_x.loc[all_x.index != 'UNK'].copy()
    y = frag2md_info.loc[x.index, SELECTED_MD].copy()

    # train model
    print('  > Start to train model...')
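
    # `nn_model_regression` is not defined in this excerpt. A minimal Keras
    # sketch: a small MLP regressor whose 30-unit hidden layer is returned as
    # a sub-model, so that .predict() yields the new 30-d fragment vectors
    # used below. Layer sizes, optimizer and callbacks are assumptions.
    def nn_model_regression(x, y, epochs, result_dir, callback=True):
        from tensorflow import keras
        inputs = keras.Input(shape=(x.shape[1],))
        hidden = keras.layers.Dense(64, activation='relu')(inputs)
        embed = keras.layers.Dense(30, activation='relu', name='embed')(hidden)
        outputs = keras.layers.Dense(y.shape[1])(embed)
        model = keras.Model(inputs, outputs)
        model.compile(optimizer='adam', loss='mse')
        callbacks = None
        if callback:
            callbacks = [keras.callbacks.EarlyStopping(
                patience=5, restore_best_weights=True)]
        model.fit(x.values, y.values, epochs=epochs, validation_split=0.1,
                  callbacks=callbacks, verbose=0)
        os.makedirs(result_dir, exist_ok=True)
        model.save(os.path.join(result_dir, 'model.h5'))
        # sub-model that maps input vectors to the 30-d embedding layer
        return keras.Model(inputs, embed)
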
    for md in SELECTED_MD:
        # only the output directory differs between the two branches
        if md == 'naRing':
            result_dir_new = os.path.join(result_dir, md + '_aromaticity')
        else:
            result_dir_new = os.path.join(result_dir, md)
        m_part1 = nn_model_regression(x=x,
                                      y=y.loc[:, [md]],
                                      epochs=100,
                                      result_dir=result_dir_new,
                                      callback=True)

        # get new frag_id2vec in 30D
        frag2vec_30d = pd.DataFrame(data=m_part1.predict(all_x),
                                    index=all_x.index)
        print_df(frag2vec_30d)
        frag2vec_new_fp = os.path.join(result_dir_new, 'frag2vec_new_30d.csv')
        frag2vec_30d.to_csv(frag2vec_new_fp)
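
        # `reduce_by_tsne` and `show_each_md` (used in the t-SNE step below)
        # are not defined in this excerpt. Minimal sketches: a thin t-SNE
        # wrapper and a 2x2 scatter grid coloured by each descriptor; in the
        # real project they would live in a utility module.
        def reduce_by_tsne(vec_df, n_components=2):
            from sklearn.manifold import TSNE
            return TSNE(n_components=n_components,
                        random_state=42).fit_transform(vec_df.values)

        def show_each_md(x_reduced, frag_info):
            common = x_reduced.index.intersection(frag_info.index)
            fig, axes = plt.subplots(2, 2, figsize=(12, 10))
            for ax, col in zip(axes.ravel(), frag_info.columns):
                sc = ax.scatter(x_reduced.loc[common, 0],
                                x_reduced.loc[common, 1],
                                c=frag_info.loc[common, col], s=5,
                                cmap='viridis')
                ax.set_title(col)
                fig.colorbar(sc, ax=ax)
            return fig
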

        # plot by t-SNE
        x_reduced = reduce_by_tsne(frag2vec_30d)
        x_reduced = pd.DataFrame(data=x_reduced, index=frag2vec_30d.index)
        print('  > Start to plot t-SNE vis of fragment vectors...')
        # save_fig_path = os.path.join('./chapter4_figure/', 't-SNE_vis_new_30d.png')
        need_plot_md = [md] + list(
            np.random.choice(SELECTED_MD, 3, replace=False))
        fig = show_each_md(x_reduced=x_reduced,
                           frag_info=frag2md_info.loc[:, need_plot_md])
        fig.savefig(os.path.join(result_dir_new,
                                 't-SNE_vis_sorted_by_{}.png'.format(md)),
                    dpi=200)

        # plot top n fragment
        topn = 4
        print(
            '  > Start to plot top {} nearest neighbors of selected fragment vectors...'
            .format(topn))
        q_frags = [
            "C1=COCO1", "C1=CCNN=C1", "C1=CCC1", "OBr", "S=S", "C1#CNCC1"
        ]
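
        # `find_nearest_neighbor` is not defined in this excerpt. A minimal
        # sketch that matches how its result is parsed below: for each query
        # it returns {query_smiles: "smiles: dist; smiles: dist; ..."} ranked
        # by cosine distance (the mol2md_fp argument is ignored here).
        def find_nearest_neighbor(training_mol_vec_fp, query_mol_vec_df,
                                  top_n, mol2md_fp=''):
            from sklearn.metrics.pairwise import cosine_distances
            train_vec = pd.read_csv(training_mol_vec_fp, index_col=0)
            dist = cosine_distances(query_mol_vec_df.values, train_vec.values)
            results = []
            for i, q_smiles in enumerate(query_mol_vec_df.index):
                order = np.argsort(dist[i])[:top_n]
                formatted = '; '.join(
                    '{}: {}'.format(train_vec.index[j], dist[i][j])
                    for j in order)
                results.append({q_smiles: formatted})
            return results
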
        q_mol2vec = frag2vec_30d.loc[q_frags, :]
        nn = find_nearest_neighbor(training_mol_vec_fp=frag2vec_new_fp,
                                   query_mol_vec_df=q_mol2vec,
                                   top_n=topn,
                                   mol2md_fp='')
        # plot
        smiles_list = []
        dis = []
        legends = []
        for inx in range(len(q_frags)):
            smiles_list += [
                i.split(": ")[0] for i in nn[inx][q_frags[inx]].split('; ')
            ]
            dis += [
                '{:.8f}'.format(float(i.split(": ")[1]))
                for i in nn[inx][q_frags[inx]].split('; ')
            ]
            # print(dis)
            # print(inx, smiles_list)
        legends += [
            '{}({})'.format(smiles_list[i], dis[i])
            for i in range(len(smiles_list))
        ]
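
        # `draw_multiple_mol` and `save_fig` are not defined in this excerpt.
        # Minimal sketches: an RDKit SVG grid image, and a writer that accepts
        # either a raw SVG string or an IPython SVG object.
        def draw_multiple_mol(smiles_list, mols_per_row, legends):
            mols = [Chem.MolFromSmiles(s) for s in smiles_list]
            return Draw.MolsToGridImage(mols, molsPerRow=mols_per_row,
                                        legends=legends, useSVG=True)

        def save_fig(fig, file_path):
            svg_text = fig.data if hasattr(fig, 'data') else str(fig)
            with open(file_path, 'w') as f_svg:
                f_svg.write(svg_text)
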
        fig = draw_multiple_mol(smiles_list=smiles_list,
                                mols_per_row=topn,
                                legends=legends)
        # print(type(fig))
        # print(fig)
        save_fig(fig,
                 file_path=os.path.join(
                     result_dir_new,
                     'top{}_nearest_neighbor_sorted_by_{}.svg'.format(
                         topn, md)))
        # keep only the CIDs that actually appear in the cid2smiles file
        cid_list_new = {}
        with open(cid2smiles_file_name, 'r') as f:
            for i in tqdm(f):
                cid = i.split('\t')[0]
                if cid != 'cid':
                    cid_list_new[int(cid)] = 1
        cid2md.loc[list(cid_list_new.keys()), :].to_csv(cid2md_file_path_new,
                                                        index_label='cid')

    print('Start to classify molecules by MD combination...')
    class_by_md_combination_file_path = os.path.join(
        current_dir, 'class_by_md_combination.csv')
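
    # `get_class_md_combination` is not defined in this excerpt. A minimal
    # sketch, assuming each molecule's class is the pattern of which selected
    # descriptors are present (values binarised to 0/1 and concatenated).
    def get_class_md_combination(cid2md_df):
        binary = (cid2md_df >= 1).astype(int)
        md_class = binary.astype(str).apply(''.join, axis=1)
        return pd.DataFrame({'md_class': md_class}, index=cid2md_df.index)
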
    if not os.path.exists(class_by_md_combination_file_path):
        cid2md = pd.read_csv(cid2md_file_path_new, index_col='cid')
        # assert cid2md.shape[1] == 9
        print_df(cid2md)
        class_by_md_comb = get_class_md_combination(cid2md)
        print('   > sanity check: classified CIDs are all present in cid2md')
        print(sum(class_by_md_comb.index.isin(cid2md.index)))
        print_df(class_by_md_comb)
        class_by_md_comb.to_csv(class_by_md_combination_file_path,
                                index_label='cid')
    else:
        print('>>> Using previous result...')
        # class_by_md_comb = pd.read_csv(class_by_md_combination_file_path, index_col=0)
    print('Start down-sampling...')
    selected_cid2md_class_file_path = os.path.join(
        current_dir, 'selected_cid2md_class.csv')
    if not os.path.exists(selected_cid2md_class_file_path):
        down_sampling_mol(class_by_md_combination_file_path,
                          result_dir=current_dir,