Beispiel #1
0
def Inference(data_name, 
              smiles_list = ['CC','CCC','C=O'], 
              data_units = '',
              k_fold_number = 8,
              augmentation = False, 
              outdir = "../data/"):
    
    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'
        
    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Inference/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)
    
    print("***SMILES_X for inference starts...***\n\n")
    print("***Checking the SMILES list for inference***\n")
    smiles_checked = list()
    smiles_rejected = list()
    for ismiles in smiles_list:
        mol_tmp = Chem.MolFromSmiles(ismiles)
        if mol_tmp != None:
            smiles_can = Chem.MolToSmiles(mol_tmp)
            smiles_checked.append(smiles_can)
        else:
            smiles_rejected.append(ismiles)
            
    if len(smiles_rejected) > 0:
        with open(save_dir+'rejected_smiles.txt','w') as f:
            for ismiles in smiles_rejected:
                f.write("%s\n" % ismiles)
                
    if len(smiles_checked) == 0:
        print("***Process of inference automatically aborted!***")
        print("The provided SMILES are all incorrect and could not be verified via RDKit.")
        return
    
    smiles_x = np.array(smiles_checked)
    smiles_y = np.array([[np.nan]*len(smiles_checked)]).flatten()
     
    # data augmentation or not
    if augmentation == True:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    smiles_x_enum, smiles_x_enum_card, smiles_y_enum = \
    augm.Augmentation(smiles_x, smiles_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES: {}\n".format(smiles_x_enum.shape[0]))
    
    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES 
    smiles_x_enum_tokens = token.get_tokens(smiles_x_enum)

    # models ensembling
    smiles_y_pred_mean_array = np.empty(shape=(0,len(smiles_checked)), dtype='float')
    for ifold in range(k_fold_number):
        
        # Tokens as a list
        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
        # Add 'pad', 'unk' tokens to the existing list
        vocab_size = len(tokens)
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

        # Transformation of tokenized SMILES to vector of intergers and vice-versa
        token_to_int = token.get_tokentoint(tokens)
        int_to_token = token.get_inttotoken(tokens)
        
        # Best architecture to visualize from
        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5', 
                                 custom_objects={'AttentionM': model.AttentionM()})

        if ifold == 0:
            # Maximum of length of SMILES to process
            max_length = model_train.layers[0].output_shape[-1]
            print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
            print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        # predict and compare for the training, validation and test sets
        smiles_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = smiles_x_enum_tokens, 
                                                             max_length = max_length, 
                                                             vocab = tokens)

        smiles_y_pred = model_train.predict(smiles_x_enum_tokens_tointvec)

        # compute a mean per set of augmented SMILES
        smiles_y_pred_mean, _ = utils.mean_median_result(smiles_x_enum_card, smiles_y_pred)
        
        smiles_y_pred_mean_array = np.append(smiles_y_pred_mean_array, smiles_y_pred_mean.reshape(1,-1), axis = 0)
        
        if ifold == (k_fold_number-1):
            smiles_y_pred_mean_ensemble = np.mean(smiles_y_pred_mean_array, axis = 0)
            smiles_y_pred_sd_ensemble = np.std(smiles_y_pred_mean_array, axis = 0)

            pred_from_ens = pd.DataFrame(data=[smiles_x,
                                               smiles_y_pred_mean_ensemble,
                                               smiles_y_pred_sd_ensemble]).T
            pred_from_ens.columns = ['SMILES', 'ens_pred_mean', 'ens_pred_sd']
            
            print("***Inference of SMILES property done.***")
            
            return pred_from_ens
Beispiel #2
0
def Embedding_Vis(data,
                  data_name,
                  data_units='',
                  k_fold_number=8,
                  k_fold_index=0,
                  augmentation=False,
                  outdir="../data/",
                  affinity_propn=True,
                  verbose=0):

    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'

    input_dir = outdir + 'Main/' + '{}/{}/'.format(data_name, p_dir_temp)
    save_dir = outdir + 'Embedding_Vis/' + '{}/{}/'.format(
        data_name, p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for embedding visualization starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    # Reproducing the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
    utils.random_split(smiles_input=data.smiles,
                       prop_input=np.array(data.iloc[:,1]),
                       random_state=seed_list[k_fold_index],
                       scaling = True)

    # data augmentation or not
    if augmentation == True:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, x_valid_enum_card, y_valid_enum = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, x_test_enum_card, y_test_enum = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens + x_valid_enum_tokens + x_test_enum_tokens
    tokens = token.extract_vocab(all_smiles_tokens)
    vocab_size = len(tokens)

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(
        len(train_unique_tokens)))
    train_unique_tokens.insert(0, 'pad')

    # Tokens as a list
    tokens = token.get_vocab(input_dir + data_name + '_tokens_set_fold_' +
                             str(k_fold_index) + '.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print(
        "Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n"
        .format(max_length))

    # Transformation of tokenized SMILES to vector of intergers and vice-versa
    token_to_int = token.get_tokentoint(tokens)
    int_to_token = token.get_inttotoken(tokens)

    model_train = load_model(input_dir + 'LSTMAtt_' + data_name +
                             '_model.best_fold_' + str(k_fold_index) + '.hdf5',
                             custom_objects={'AttentionM': model.AttentionM()})

    print("Chosen model summary:\n")
    print(model_train.summary())
    print("\n")

    print("***Embedding of the individual tokens from the chosen model.***\n")
    model_train.compile(loss="mse",
                        optimizer='adam',
                        metrics=[metrics.mae, metrics.mse])

    model_embed_weights = model_train.layers[1].get_weights()[0]
    #print(model_embed_weights.shape)
    #tsne = TSNE(perplexity=30, early_exaggeration=120 , n_components=2, random_state=123, verbose=0)
    pca = PCA(n_components=2, random_state=123)
    transformed_weights = pca.fit_transform(model_embed_weights)
    #transformed_weights = tsne.fit_transform(model_embed_weights)

    f = plt.figure(figsize=(9, 9))
    ax = plt.subplot(aspect='equal')

    if affinity_propn:
        # Compute Affinity Propagation
        af = AffinityPropagation().fit(model_embed_weights)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        # Plot it
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = np.where(np.array(labels == k) == True)[0].tolist()
            for ilabpt in class_members:
                alpha_tmp = 0.5 if tokens[
                    ilabpt] in train_unique_tokens else 0.5
                line_tmp = 1 if tokens[ilabpt] in train_unique_tokens else 5
                marker_tmp = 'o' if tokens[
                    ilabpt] in train_unique_tokens else 'x'
                edge_color_tmp = 'black' if tokens[
                    ilabpt] in train_unique_tokens else col
                ax.plot(transformed_weights[ilabpt, 0],
                        transformed_weights[ilabpt, 1],
                        col,
                        marker=marker_tmp,
                        markeredgecolor=edge_color_tmp,
                        markeredgewidth=line_tmp,
                        alpha=alpha_tmp,
                        markersize=10)
    else:
        # Black and white plot
        for ilabpt in range(vocab_size):
            alpha_tmp = 0.5 if tokens[ilabpt] in train_unique_tokens else 0.2
            size_tmp = 40 if tokens[ilabpt] in train_unique_tokens else 20
            ax.scatter(transformed_weights[ilabpt, 0],
                       transformed_weights[ilabpt, 1],
                       lw=1,
                       s=size_tmp,
                       facecolor='black',
                       marker='o',
                       alpha=alpha_tmp)

    annotations = []
    weight_tmp = 'bold'
    ilabpt = 0
    for ilabpt, (x_i, y_i) in enumerate(
            zip(transformed_weights[:, 0].tolist(),
                transformed_weights[:, 1].tolist())):
        weight_tmp = 'black' if tokens[
            ilabpt] in train_unique_tokens else 'normal'
        tokens_tmp = tokens[ilabpt]
        if tokens_tmp == ' ':
            tokens_tmp = 'space'
        elif tokens_tmp == '.':
            tokens_tmp = 'dot'
        annotations.append(
            plt.text(x_i, y_i, tokens_tmp, fontsize=12, weight=weight_tmp))
    adjust_text(annotations,
                x=transformed_weights[:, 0].tolist(),
                y=transformed_weights[:, 1].tolist(),
                arrowprops=dict(arrowstyle="-", color='k', lw=0.5))

    plt.xticks([])
    plt.yticks([])
    ax.axis('tight')

    plt.savefig(save_dir + 'Visualization_' + data_name + '_Embedding_fold_' +
                str(k_fold_index) + '.png',
                bbox_inches='tight')
    plt.show()
Beispiel #3
0
def Interpretation(data, 
                   data_name, 
                   data_units = '',
                   k_fold_number = 8,
                   k_fold_index=0,
                   augmentation = False, 
                   outdir = "../data/", 
                   smiles_toviz = 'CCC', 
                   font_size = 15, 
                   font_rotation = 'horizontal'):
    
    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'
        
    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Interpretation/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)
    
    print("***SMILES_X Interpreter starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
    selection_seed = seed_list[k_fold_index]
        
    print("******")
    print("***Fold #{} initiated...***".format(selection_seed))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
    utils.random_split(smiles_input=data.smiles, 
                       prop_input=np.array(data.iloc[:,1]), 
                       random_state=selection_seed, 
                       scaling = True)

    np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_valid.txt', np.asarray(x_valid), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_test.txt', np.asarray(x_test), newline="\n", fmt='%s')
    
    mol_toviz = Chem.MolFromSmiles(smiles_toviz)
    if mol_toviz != None:
        smiles_toviz_can = Chem.MolToSmiles(mol_toviz)
    else:
        print("***Process of visualization automatically aborted!***")
        print("The smiles_toviz is incorrect and cannot be canonicalized by RDKit.")
        return
    smiles_toviz_x = np.array([smiles_toviz_can])
    if smiles_toviz_can in np.array(data.smiles):
        smiles_toviz_y = np.array([[data.iloc[np.where(data.smiles == smiles_toviz_x[0])[0][0],1]]])
    else:
        smiles_toviz_y = np.array([[np.nan]])

    # data augmentation or not
    if augmentation == True:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, x_valid_enum_card, y_valid_enum = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, x_test_enum_card, y_test_enum = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)
    
    smiles_toviz_x_enum, smiles_toviz_x_enum_card, smiles_toviz_y_enum = \
    augm.Augmentation(smiles_toviz_x, smiles_toviz_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)
    
    smiles_toviz_x_enum_tokens = token.get_tokens(smiles_toviz_x_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens
    tokens = token.extract_vocab(all_smiles_tokens)
    vocab_size = len(tokens)

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(len(train_unique_tokens)))
    train_unique_tokens.insert(0,'pad')
    
    # Tokens as a list
    tokens = token.get_vocab(input_dir+data_name+'_Vocabulary.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
    
    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

    # Transformation of tokenized SMILES to vector of intergers and vice-versa
    token_to_int = token.get_tokentoint(tokens)
    int_to_token = token.get_inttotoken(tokens)

    # Best architecture to visualize from
    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5', 
                                          custom_objects={'AttentionM': model.AttentionM()})
    best_arch = [model_topredict.layers[2].output_shape[-1]/2, 
                 model_topredict.layers[3].output_shape[-1], 
                 model_topredict.layers[1].output_shape[-1]]

    # Architecture to return attention weights
    model_att = model.LSTMAttModel.create(inputtokens = max_length+1, 
                                          vocabsize = vocab_size, 
                                          lstmunits= int(best_arch[0]), 
                                          denseunits = int(best_arch[1]), 
                                          embedding = int(best_arch[2]), 
                                          return_proba = True)

    print("Best model summary:\n")
    print(model_att.summary())
    print("\n")

    print("***Interpretation from the best model.***\n")
    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
    model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens, 
                                                               max_length = max_length+1,
                                                               vocab = tokens)
    
    intermediate_layer_model = Model(inputs=model_att.input,
                                     outputs=model_att.layers[-2].output)
    intermediate_output = intermediate_layer_model.predict(smiles_toviz_x_enum_tokens_tointvec)
    
    smiles_toviz_x_card_cumsum_viz = np.cumsum(smiles_toviz_x_enum_card)
    smiles_toviz_x_card_cumsum_shift_viz = shift(smiles_toviz_x_card_cumsum_viz, 1, cval=0)

    mols_id = 0
    ienumcard = smiles_toviz_x_card_cumsum_shift_viz[mols_id]
    
    smiles_len_tmp = len(smiles_toviz_x_enum_tokens[ienumcard])
    intermediate_output_tmp = intermediate_output[ienumcard,-smiles_len_tmp+1:-1].flatten().reshape(1,-1)
    max_intermediate_output_tmp = np.max(intermediate_output_tmp)

    plt.matshow(intermediate_output_tmp, 
                cmap='Reds')
    plt.tick_params(axis='x', bottom = False)
    plt.xticks([ix for ix in range(smiles_len_tmp-2)])
    plt.xticks(range(smiles_len_tmp-2), 
               [int_to_token[iint].replace('pad','') \
                for iint in smiles_toviz_x_enum_tokens_tointvec[ienumcard,-smiles_len_tmp+1:-1]], 
               fontsize = font_size, 
               rotation = font_rotation)
    plt.yticks([])
    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
    #plt.show()
    
    smiles_tmp = smiles_toviz_x_enum[ienumcard]
    mol_tmp = Chem.MolFromSmiles(smiles_tmp)
    smiles_len_tmp = len(smiles_toviz_x_enum_tokens[ienumcard])
    mol_df_tmp = pd.DataFrame([smiles_toviz_x_enum_tokens[ienumcard][1:-1],intermediate_output[ienumcard].\
                               flatten().\
                               tolist()[-smiles_len_tmp+1:-1]]).transpose()
    bond = ['-','=','#','$','/','\\','.','(',')']
    mol_df_tmp = mol_df_tmp[~mol_df_tmp.iloc[:,0].isin(bond)]
    mol_df_tmp = mol_df_tmp[[not itoken.isdigit() for itoken in mol_df_tmp.iloc[:,0].values.tolist()]]

    minmaxscaler = MinMaxScaler(feature_range=(0,1))
    norm_weights = minmaxscaler.fit_transform(mol_df_tmp.iloc[:,1].values.reshape(-1,1)).flatten().tolist()
    fig = GetSimilarityMapFromWeights(mol=mol_tmp, 
                                      size = (250,250), 
                                      scale=-1,  
                                      sigma=0.05,
                                      weights=norm_weights, 
                                      colorMap='Reds', 
                                      contourLines = 10,
                                      alpha = 0.25)
    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
    #fig.show()
    
    model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])
    
    y_pred_test_tmp = model_topredict.predict(smiles_toviz_x_enum_tokens_tointvec[ienumcard].reshape(1,-1))[0,0]
    y_test_tmp = smiles_toviz_y_enum[ienumcard,0]
    if not np.isnan(y_test_tmp):
        print("True value: {0:.2f} Predicted: {1:.2f}".format(y_test_tmp,
                                                    scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]))
    else:
        print("Predicted: {0:.2f}".format(scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]))
    
    smiles_len_tmp = len(smiles_toviz_x_enum_tokens[ienumcard])
    diff_topred_list = list()
    diff_totrue_list = list()
    for csubsmiles in range(1,smiles_len_tmp):
        isubsmiles = smiles_toviz_x_enum_tokens[ienumcard][:csubsmiles]+[' ']
        isubsmiles_tointvec= token.int_vec_encode(tokenized_smiles_list = [isubsmiles], 
                                                  max_length = max_length+1, 
                                                  vocab = tokens)
        predict_prop_tmp = model_topredict.predict(isubsmiles_tointvec)[0,0]
        diff_topred_tmp = (predict_prop_tmp-y_pred_test_tmp)/np.abs(y_pred_test_tmp)
        diff_topred_list.append(diff_topred_tmp)
        diff_totrue_tmp = (predict_prop_tmp-y_test_tmp)/np.abs(y_test_tmp)
        diff_totrue_list.append(diff_totrue_tmp)
    max_diff_topred_tmp = np.max(diff_topred_list)
    max_diff_totrue_tmp = np.max(diff_totrue_list)

    plt.figure(figsize=(15,7))
    markers, stemlines, baseline = plt.stem([ix for ix in range(smiles_len_tmp-1)], 
                                            diff_topred_list, 
                                            'k.-', 
                                             use_line_collection=True)
    plt.setp(baseline, color='k', linewidth=2, linestyle='--')
    plt.setp(markers, linewidth=1, marker='o', markersize=10, markeredgecolor = 'black')
    plt.setp(stemlines, color = 'k', linewidth=0.5, linestyle='-')
    plt.xticks(range(smiles_len_tmp-1), 
               smiles_toviz_x_enum_tokens[ienumcard][:-1],
               fontsize = font_size, 
               rotation = font_rotation)
    plt.yticks(fontsize = 20)
    plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')