コード例 #1
0
def Embedding_Vis(data,
                  data_name,
                  data_units='',
                  k_fold_number=8,
                  k_fold_index=0,
                  augmentation=False,
                  outdir="../data/",
                  affinity_propn=True,
                  verbose=0):
    """Plot a 2D PCA projection of the token embedding of one trained fold.

    The data split of fold ``k_fold_index`` is reproduced (same seeding as
    the training run), the SMILES are tokenized, the saved model and
    vocabulary of that fold are loaded from ``<outdir>/Main/...`` and the
    weights of ``model.layers[1]`` are projected on two principal components.
    The figure is saved under ``<outdir>/Embedding_Vis/...`` and shown.

    Parameters
    ----------
    data : object exposing ``.smiles`` and ``.iloc[:, 1]``
        Dataset holding the SMILES and the property (second column).
        Presumably a pandas DataFrame -- confirm against the caller.
    data_name : str
        Dataset name used to build input/output paths and file names.
    data_units : str
        Unused here; kept for interface compatibility.
    k_fold_number : int
        Number of folds used at training time (fixes the list of seeds).
    k_fold_index : int
        Index of the fold whose model/vocabulary are visualized.
    augmentation : bool
        Truthy selects the 'Augm' directory and non-canonical, rotated
        SMILES; falsy selects 'Can' and canonical SMILES.
    outdir : str
        Root directory (with trailing separator) of inputs and outputs.
    affinity_propn : bool
        If truthy, colour the tokens by Affinity Propagation cluster;
        otherwise draw a black scatter plot.
    verbose : int
        Unused here; kept for interface compatibility.

    Returns
    -------
    None
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'

    input_dir = outdir + 'Main/' + '{}/{}/'.format(data_name, p_dir_temp)
    save_dir = outdir + 'Embedding_Vis/' + '{}/{}/'.format(
        data_name, p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for embedding visualization starts...***\n\n")
    # Same global seeding as the training run so that the per-fold seeds
    # (and therefore the data split) are reproduced.
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    # Reproducing the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, _ = \
    utils.random_split(smiles_input=data.smiles,
                       prop_input=np.array(data.iloc[:,1]),
                       random_state=seed_list[k_fold_index],
                       scaling = True)

    # Data augmentation or not. The flag is interpreted by truthiness, the
    # same way it was interpreted to select the directory above.
    if augmentation:
        print("***Data augmentation.***\n")
    else:
        print("***No data augmentation has been required.***\n")
    canonical = not augmentation
    rotation = bool(augmentation)

    x_train_enum, _, _ = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, _, _ = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, _, _ = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens + x_valid_enum_tokens + x_test_enum_tokens
    vocab_size = len(token.extract_vocab(all_smiles_tokens))

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(
        len(train_unique_tokens)))
    # Set used for the membership tests in the plotting loops; 'pad' is
    # treated as a token known from training.
    train_token_set = set(train_unique_tokens)
    train_token_set.add('pad')

    # Tokens as a list
    tokens = token.get_vocab(input_dir + data_name + '_tokens_set_fold_' +
                             str(k_fold_index) + '.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print(
        "Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n"
        .format(max_length))

    model_train = load_model(input_dir + 'LSTMAtt_' + data_name +
                             '_model.best_fold_' + str(k_fold_index) + '.hdf5',
                             custom_objects={'AttentionM': model.AttentionM()})

    print("Chosen model summary:\n")
    print(model_train.summary())
    print("\n")

    print("***Embedding of the individual tokens from the chosen model.***\n")
    # Kept from the original workflow; only layer weights are read below.
    model_train.compile(loss="mse",
                        optimizer='adam',
                        metrics=[metrics.mae, metrics.mse])

    # NOTE(review): layers[1] is assumed to be the embedding layer -- confirm
    # against the model definition.
    model_embed_weights = model_train.layers[1].get_weights()[0]
    pca = PCA(n_components=2, random_state=123)
    transformed_weights = pca.fit_transform(model_embed_weights)
    xs = transformed_weights[:, 0].tolist()
    ys = transformed_weights[:, 1].tolist()

    plt.figure(figsize=(9, 9))
    ax = plt.subplot(aspect='equal')

    if affinity_propn:
        # Compute Affinity Propagation on the full-dimensional embedding
        af = AffinityPropagation().fit(model_embed_weights)
        labels = af.labels_
        n_clusters_ = len(af.cluster_centers_indices_)
        # One colour per cluster, cycling through the base colours
        colors = cycle('bgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            for ilabpt in np.flatnonzero(labels == k).tolist():
                in_train = tokens[ilabpt] in train_token_set
                # Tokens absent from the training set are drawn as thick
                # crosses in the cluster colour.
                ax.plot(transformed_weights[ilabpt, 0],
                        transformed_weights[ilabpt, 1],
                        col,
                        marker='o' if in_train else 'x',
                        markeredgecolor='black' if in_train else col,
                        markeredgewidth=1 if in_train else 5,
                        alpha=0.5,
                        markersize=10)
    else:
        # Black and white plot
        for ilabpt in range(vocab_size):
            in_train = tokens[ilabpt] in train_token_set
            ax.scatter(transformed_weights[ilabpt, 0],
                       transformed_weights[ilabpt, 1],
                       lw=1,
                       s=40 if in_train else 20,
                       facecolor='black',
                       marker='o',
                       alpha=0.5 if in_train else 0.2)

    # Printable names for tokens that would otherwise be hard to read
    readable = {' ': 'space', '.': 'dot'}
    annotations = []
    for ilabpt, (x_i, y_i) in enumerate(zip(xs, ys)):
        tokens_tmp = tokens[ilabpt]
        weight_tmp = 'black' if tokens_tmp in train_token_set else 'normal'
        annotations.append(
            plt.text(x_i,
                     y_i,
                     readable.get(tokens_tmp, tokens_tmp),
                     fontsize=12,
                     weight=weight_tmp))
    adjust_text(annotations,
                x=xs,
                y=ys,
                arrowprops=dict(arrowstyle="-", color='k', lw=0.5))

    plt.xticks([])
    plt.yticks([])
    ax.axis('tight')

    plt.savefig(save_dir + 'Visualization_' + data_name + '_Embedding_fold_' +
                str(k_fold_index) + '.png',
                bbox_inches='tight')
    plt.show()
コード例 #2
0
ファイル: interpret.py プロジェクト: egracheva/SMILES-X
def Interpretation(data,
                   data_name,
                   data_units = '',
                   k_fold_number = 8,
                   k_fold_index=0,
                   augmentation = False,
                   outdir = "../data/",
                   smiles_toviz = 'CCC',
                   font_size = 15,
                   font_rotation = 'horizontal'):
    """Visualize how a trained fold model attends to one SMILES.

    The data split of fold ``k_fold_index`` is reproduced, the saved model
    of that fold is loaded from ``<outdir>/Main/...`` and three figures are
    written to ``<outdir>/Interpretation/...``: a 1D attention map over the
    tokens, a 2D similarity map over the molecule, and a stem plot of the
    relative change of the prediction as the SMILES is read token by token.
    The train/valid/test SMILES of the split are also saved as text files.

    Parameters
    ----------
    data : object exposing ``.smiles`` and ``.iloc[:, 1]``
        Dataset holding the SMILES and the property (second column).
        Presumably a pandas DataFrame -- confirm against the caller.
    data_name : str
        Dataset name used to build input/output paths and file names.
    data_units : str
        Unused here; kept for interface compatibility.
    k_fold_number : int
        Number of folds used at training time (fixes the list of seeds).
    k_fold_index : int
        Index of the fold (i.e. of the seed) to interpret.
    augmentation : bool
        Truthy selects the 'Augm' directory and non-canonical, rotated
        SMILES; falsy selects 'Can' and canonical SMILES.
    outdir : str
        Root directory (with trailing separator) of inputs and outputs.
    smiles_toviz : str
        SMILES to interpret; it is canonicalized with RDKit first.
    font_size : int
        Font size of the token labels on the x axis.
    font_rotation : str or float
        Rotation of the token labels on the x axis.

    Returns
    -------
    None
        Also returns early (after printing a message) when ``smiles_toviz``
        cannot be parsed by RDKit.
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'

    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Interpretation/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X Interpreter starts...***\n\n")
    # Same global seeding as the training run so that the per-fold seeds
    # (and therefore the data split) are reproduced.
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    selection_seed = seed_list[k_fold_index]

    print("******")
    print("***Fold #{} initiated...***".format(selection_seed))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
    utils.random_split(smiles_input=data.smiles,
                       prop_input=np.array(data.iloc[:,1]),
                       random_state=selection_seed,
                       scaling = True)

    np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_valid.txt', np.asarray(x_valid), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_test.txt', np.asarray(x_test), newline="\n", fmt='%s')

    mol_toviz = Chem.MolFromSmiles(smiles_toviz)
    if mol_toviz is None:
        print("***Process of visualization automatically aborted!***")
        print("The smiles_toviz is incorrect and cannot be canonicalized by RDKit.")
        return
    smiles_toviz_can = Chem.MolToSmiles(mol_toviz)
    smiles_toviz_x = np.array([smiles_toviz_can])

    # Use the property stored in the data when the canonical SMILES is part
    # of it (first occurrence), NaN otherwise.
    matches = np.flatnonzero(np.array(data.smiles) == smiles_toviz_can)
    if matches.size > 0:
        smiles_toviz_y = np.array([[data.iloc[matches[0],1]]])
    else:
        smiles_toviz_y = np.array([[np.nan]])

    # Data augmentation or not. The flag is interpreted by truthiness, the
    # same way it was interpreted to select the directory above.
    if augmentation:
        print("***Data augmentation.***\n")
    else:
        print("***No data augmentation has been required.***\n")
    canonical = not augmentation
    rotation = bool(augmentation)

    x_train_enum, _, _ = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, _, _ = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, _, _ = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    smiles_toviz_x_enum, smiles_toviz_x_enum_card, smiles_toviz_y_enum = \
    augm.Augmentation(smiles_toviz_x, smiles_toviz_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    smiles_toviz_x_enum_tokens = token.get_tokens(smiles_toviz_x_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens
    vocab_size = len(token.extract_vocab(all_smiles_tokens))

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(len(train_unique_tokens)))

    # Tokens as a list
    tokens = token.get_vocab(input_dir+data_name+'_Vocabulary.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

    # Mapping from integer codes back to tokens (used for the tick labels)
    int_to_token = token.get_inttotoken(tokens)

    # Best architecture to visualize from
    model_file = input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5'
    model_topredict = load_model(model_file,
                                 custom_objects={'AttentionM': model.AttentionM()})
    # Layer sizes are read back from the saved model; the output width of
    # layers[2] is halved to obtain the number of LSTM units.
    # NOTE(review): presumably because that layer is bidirectional -- confirm.
    lstm_units = int(model_topredict.layers[2].output_shape[-1]/2)
    dense_units = int(model_topredict.layers[3].output_shape[-1])
    embedding_dim = int(model_topredict.layers[1].output_shape[-1])

    # Architecture to return attention weights
    model_att = model.LSTMAttModel.create(inputtokens = max_length+1,
                                          vocabsize = vocab_size,
                                          lstmunits= lstm_units,
                                          denseunits = dense_units,
                                          embedding = embedding_dim,
                                          return_proba = True)

    print("Best model summary:\n")
    print(model_att.summary())
    print("\n")

    print("***Interpretation from the best model.***\n")
    model_att.load_weights(model_file)
    model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens,
                                                               max_length = max_length+1,
                                                               vocab = tokens)

    # Output of the second-to-last layer of the attention model
    intermediate_layer_model = Model(inputs=model_att.input,
                                     outputs=model_att.layers[-2].output)
    intermediate_output = intermediate_layer_model.predict(smiles_toviz_x_enum_tokens_tointvec)

    # Index of the first enumerated SMILES of the (single) molecule
    smiles_toviz_x_card_cumsum_viz = np.cumsum(smiles_toviz_x_enum_card)
    smiles_toviz_x_card_cumsum_shift_viz = shift(smiles_toviz_x_card_cumsum_viz, 1, cval=0)

    mols_id = 0
    ienumcard = smiles_toviz_x_card_cumsum_shift_viz[mols_id]

    tokens_toviz = smiles_toviz_x_enum_tokens[ienumcard]
    smiles_len_tmp = len(tokens_toviz)

    # --- 1D attention map ---------------------------------------------------
    # The slice keeps the trailing positions of the inner tokens, i.e. the
    # tokenized SMILES without its first and last token.
    intermediate_output_tmp = intermediate_output[ienumcard,-smiles_len_tmp+1:-1].flatten().reshape(1,-1)

    plt.matshow(intermediate_output_tmp,
                cmap='Reds')
    plt.tick_params(axis='x', bottom = False)
    plt.xticks(range(smiles_len_tmp-2),
               [int_to_token[iint].replace('pad','') \
                for iint in smiles_toviz_x_enum_tokens_tointvec[ienumcard,-smiles_len_tmp+1:-1]],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks([])
    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')

    # --- 2D similarity map --------------------------------------------------
    smiles_tmp = smiles_toviz_x_enum[ienumcard]
    mol_tmp = Chem.MolFromSmiles(smiles_tmp)
    mol_df_tmp = pd.DataFrame([tokens_toviz[1:-1],intermediate_output[ienumcard].\
                               flatten().\
                               tolist()[-smiles_len_tmp+1:-1]]).transpose()
    # Keep only the tokens that are neither bond/branch symbols nor digits,
    # so that one weight is left per remaining token.
    bond = ['-','=','#','$','/','\\','.','(',')']
    mol_df_tmp = mol_df_tmp[~mol_df_tmp.iloc[:,0].isin(bond)]
    mol_df_tmp = mol_df_tmp[[not itoken.isdigit() for itoken in mol_df_tmp.iloc[:,0].values.tolist()]]

    minmaxscaler = MinMaxScaler(feature_range=(0,1))
    norm_weights = minmaxscaler.fit_transform(mol_df_tmp.iloc[:,1].values.reshape(-1,1)).flatten().tolist()
    fig = GetSimilarityMapFromWeights(mol=mol_tmp,
                                      size = (250,250),
                                      scale=-1,
                                      sigma=0.05,
                                      weights=norm_weights,
                                      colorMap='Reds',
                                      contourLines = 10,
                                      alpha = 0.25)
    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')

    # --- Prediction for the full SMILES -------------------------------------
    model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    y_pred_test_tmp = model_topredict.predict(smiles_toviz_x_enum_tokens_tointvec[ienumcard].reshape(1,-1))[0,0]
    y_pred_unscaled = scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]
    y_test_tmp = smiles_toviz_y_enum[ienumcard,0]
    if not np.isnan(y_test_tmp):
        print("True value: {0:.2f} Predicted: {1:.2f}".format(y_test_tmp, y_pred_unscaled))
    else:
        print("Predicted: {0:.2f}".format(y_pred_unscaled))

    # --- Temporal relative distance -----------------------------------------
    # Prediction on growing prefixes of the tokenized SMILES (each closed by
    # a ' ' token), relative to the prediction on the full SMILES. Both
    # values are raw model outputs, so they are directly comparable.
    diff_topred_list = list()
    for csubsmiles in range(1,smiles_len_tmp):
        isubsmiles = tokens_toviz[:csubsmiles]+[' ']
        isubsmiles_tointvec= token.int_vec_encode(tokenized_smiles_list = [isubsmiles],
                                                  max_length = max_length+1,
                                                  vocab = tokens)
        predict_prop_tmp = model_topredict.predict(isubsmiles_tointvec)[0,0]
        diff_topred_list.append((predict_prop_tmp-y_pred_test_tmp)/np.abs(y_pred_test_tmp))

    plt.figure(figsize=(15,7))
    markers, stemlines, baseline = plt.stem([ix for ix in range(smiles_len_tmp-1)],
                                            diff_topred_list,
                                            'k.-',
                                            use_line_collection=True)
    plt.setp(baseline, color='k', linewidth=2, linestyle='--')
    plt.setp(markers, linewidth=1, marker='o', markersize=10, markeredgecolor = 'black')
    plt.setp(stemlines, color = 'k', linewidth=0.5, linestyle='-')
    plt.xticks(range(smiles_len_tmp-1),
               tokens_toviz[:-1],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks(fontsize = 20)
    plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
コード例 #3
0
def Inference(data_name,
              smiles_list = ['CC','CCC','C=O'],
              data_units = '',
              k_fold_number = 8,
              augmentation = False,
              outdir = "../data/"):
    """Predict a property for a list of SMILES with an ensemble of fold models.

    Every SMILES is validated and canonicalized with RDKit; the rejected
    ones are written to ``<outdir>/Inference/.../rejected_smiles.txt``.
    For each fold, the saved vocabulary and model are loaded from
    ``<outdir>/Main/...``, predictions are averaged per molecule over its
    enumerated SMILES, and the per-fold means are finally combined into an
    ensemble mean and standard deviation (``np.std``).

    NOTE(review): the predictions are returned as output by the models; no
    inverse scaling is applied here. If the models were trained on scaled
    targets the values are in scaled units -- confirm.

    Parameters
    ----------
    data_name : str
        Dataset name used to build input/output paths and file names.
    smiles_list : list of str
        SMILES to run the inference on. The default list is never mutated.
    data_units : str
        Unused here; kept for interface compatibility.
    k_fold_number : int
        Number of fold models to ensemble.
    augmentation : bool
        Truthy selects the 'Augm' directory and non-canonical, rotated
        SMILES; falsy selects 'Can' and canonical SMILES.
    outdir : str
        Root directory (with trailing separator) of inputs and outputs.

    Returns
    -------
    pandas.DataFrame or None
        One row per valid SMILES with the columns 'SMILES', 'ens_pred_mean'
        and 'ens_pred_sd'. ``None`` when no SMILES is valid or when no fold
        has been evaluated (``k_fold_number`` < 1).
    """
    p_dir_temp = 'Augm' if augmentation else 'Can'

    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Inference/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for inference starts...***\n\n")
    print("***Checking the SMILES list for inference***\n")
    smiles_checked = list()
    smiles_rejected = list()
    for ismiles in smiles_list:
        mol_tmp = Chem.MolFromSmiles(ismiles)
        if mol_tmp is None:
            smiles_rejected.append(ismiles)
        else:
            smiles_checked.append(Chem.MolToSmiles(mol_tmp))

    if smiles_rejected:
        with open(save_dir+'rejected_smiles.txt','w') as f:
            f.writelines("%s\n" % ismiles for ismiles in smiles_rejected)

    if not smiles_checked:
        print("***Process of inference automatically aborted!***")
        print("The provided SMILES are all incorrect and could not be verified via RDKit.")
        return

    smiles_x = np.array(smiles_checked)
    # The true property is unknown at inference time: NaN placeholders
    smiles_y = np.full(len(smiles_checked), np.nan)

    # Data augmentation or not. The flag is interpreted by truthiness, the
    # same way it was interpreted to select the directory above.
    if augmentation:
        print("***Data augmentation.***\n")
    else:
        print("***No data augmentation has been required.***\n")
    canonical = not augmentation
    rotation = bool(augmentation)

    smiles_x_enum, smiles_x_enum_card, _ = \
    augm.Augmentation(smiles_x, smiles_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES: {}\n".format(smiles_x_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES
    smiles_x_enum_tokens = token.get_tokens(smiles_x_enum)

    # models ensembling: one row of per-molecule mean predictions per fold
    smiles_y_pred_mean_array = np.empty(shape=(0,len(smiles_checked)), dtype='float')
    for ifold in range(k_fold_number):

        # Tokens as a list
        tokens = token.get_vocab(input_dir+data_name+'_tokens_set_fold_'+str(ifold)+'.txt')
        # Add 'pad', 'unk' tokens to the existing list
        vocab_size = len(tokens)
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

        # Model trained on this fold
        model_train = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5',
                                 custom_objects={'AttentionM': model.AttentionM()})

        if ifold == 0:
            # Maximum length of SMILES to process, read from the input layer
            # of the first fold model and reused for the following folds.
            max_length = model_train.layers[0].output_shape[-1]
            print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
            print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        # Encode the tokenized SMILES with the vocabulary of this fold
        smiles_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = smiles_x_enum_tokens,
                                                             max_length = max_length,
                                                             vocab = tokens)

        smiles_y_pred = model_train.predict(smiles_x_enum_tokens_tointvec)

        # compute a mean per set of augmented SMILES
        smiles_y_pred_mean, _ = utils.mean_median_result(smiles_x_enum_card, smiles_y_pred)

        smiles_y_pred_mean_array = np.append(smiles_y_pred_mean_array, smiles_y_pred_mean.reshape(1,-1), axis = 0)

    if smiles_y_pred_mean_array.shape[0] == 0:
        # No fold has been evaluated: nothing to ensemble.
        print("***Process of inference automatically aborted!***")
        print("No fold model has been evaluated (k_fold_number must be at least 1).")
        return

    smiles_y_pred_mean_ensemble = np.mean(smiles_y_pred_mean_array, axis = 0)
    smiles_y_pred_sd_ensemble = np.std(smiles_y_pred_mean_array, axis = 0)

    pred_from_ens = pd.DataFrame(data=[smiles_x,
                                       smiles_y_pred_mean_ensemble,
                                       smiles_y_pred_sd_ensemble]).T
    pred_from_ens.columns = ['SMILES', 'ens_pred_mean', 'ens_pred_sd']

    print("***Inference of SMILES property done.***")

    return pred_from_ens
コード例 #4
0
ファイル: main.py プロジェクト: qoffee/SMILES-X
def Main(data,
         data_name,
         bayopt_bounds,
         data_units='',
         k_fold_number=8,
         augmentation=False,
         outdir="../data/",
         bayopt_n_epochs=10,
         bayopt_n_rounds=25,
         bayopt_it_factor=1,
         bayopt_on=True,
         lstmunits_ref=512,
         denseunits_ref=512,
         embedding_ref=512,
         batch_size_ref=64,
         alpha_ref=3,
         n_gpus=1,
         bridge_type='None',
         patience=25,
         n_epochs=1000):

    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'

    save_dir = outdir + 'Main/' + '{}/{}/'.format(data_name, p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X starts...***\n\n")
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()
    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
    for ifold in range(k_fold_number):

        print("******")
        print("***Fold #{} initiated...***".format(ifold))
        print("******")

        print("***Sampling and splitting of the dataset.***\n")
        x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles,
                           prop_input=np.array(data.iloc[:,1]),
                           random_state=seed_list[ifold],
                           scaling = True)

        # data augmentation or not
        if augmentation == True:
            print("***Data augmentation to {}***\n".format(augmentation))
            canonical = False
            rotation = True
        else:
            print("***No data augmentation has been required.***\n")
            canonical = True
            rotation = False

        x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

        x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

        x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

        print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
        format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

        print("***Tokenization of SMILES.***\n")
        # Tokenize SMILES per dataset
        x_train_enum_tokens = token.get_tokens(x_train_enum)
        x_valid_enum_tokens = token.get_tokens(x_valid_enum)
        x_test_enum_tokens = token.get_tokens(x_test_enum)

        print("Examples of tokenized SMILES from a training set:\n{}\n".\
        format(x_train_enum_tokens[:5]))

        # Vocabulary size computation
        all_smiles_tokens = x_train_enum_tokens + x_valid_enum_tokens + x_test_enum_tokens
        tokens = token.extract_vocab(all_smiles_tokens)
        vocab_size = len(tokens)

        train_unique_tokens = token.extract_vocab(x_train_enum_tokens)
        print("Number of tokens only present in a training set: {}\n".format(
            len(train_unique_tokens)))
        valid_unique_tokens = token.extract_vocab(x_valid_enum_tokens)
        print("Number of tokens only present in a validation set: {}".format(
            len(valid_unique_tokens)))
        print("Is the validation set a subset of the training set: {}".\
              format(valid_unique_tokens.issubset(train_unique_tokens)))
        print("What are the tokens by which they differ: {}\n".\
              format(valid_unique_tokens.difference(train_unique_tokens)))
        test_unique_tokens = token.extract_vocab(x_test_enum_tokens)
        print("Number of tokens only present in a test set: {}".format(
            len(test_unique_tokens)))
        print("Is the test set a subset of the training set: {}".\
              format(test_unique_tokens.issubset(train_unique_tokens)))
        print("What are the tokens by which they differ: {}".\
              format(test_unique_tokens.difference(train_unique_tokens)))
        print("Is the test set a subset of the validation set: {}".\
              format(test_unique_tokens.issubset(valid_unique_tokens)))
        print("What are the tokens by which they differ: {}\n".\
              format(test_unique_tokens.difference(valid_unique_tokens)))

        print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

        # Save the vocabulary for re-use
        token.save_vocab(
            tokens,
            save_dir + data_name + '_tokens_set_fold_' + str(ifold) + '.txt')
        # Tokens as a list
        tokens = token.get_vocab(save_dir + data_name + '_tokens_set_fold_' +
                                 str(ifold) + '.txt')
        # Add 'pad', 'unk' tokens to the existing list
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

        # Maximum of length of SMILES to process
        max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
        print(
            "Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n"
            .format(max_length))

        print("***Bayesian Optimization of the SMILESX's architecture.***\n")
        # Transformation of tokenized SMILES to vector of intergers and vice-versa
        token_to_int = token.get_tokentoint(tokens)
        int_to_token = token.get_inttotoken(tokens)

        if bayopt_on:
            # Operate the bayesian optimization of the neural architecture
            def create_mod(params):
                print('Model: {}'.format(params))

                model_tag = data_name

                K.clear_session()

                if n_gpus > 1:
                    if bridge_type == 'NVLink':
                        model_opt = model.LSTMAttModel.create(
                            inputtokens=max_length + 1,
                            vocabsize=vocab_size,
                            lstmunits=int(params[:, 0][0]),
                            denseunits=int(params[:, 1]),
                            embedding=int(params[:, 2][0]))
                    else:
                        with tf.device(
                                '/cpu'):  # necessary to multi-GPU scaling
                            model_opt = model.LSTMAttModel.create(
                                inputtokens=max_length + 1,
                                vocabsize=vocab_size,
                                lstmunits=int(params[:, 0][0]),
                                denseunits=int(params[:, 1]),
                                embedding=int(params[:, 2][0]))

                    multi_model = model.ModelMGPU(model_opt,
                                                  gpus=n_gpus,
                                                  bridge_type=bridge_type)
                else:  # single GPU
                    model_opt = model.LSTMAttModel.create(
                        inputtokens=max_length + 1,
                        vocabsize=vocab_size,
                        lstmunits=int(params[:, 0][0]),
                        denseunits=int(params[:, 1]),
                        embedding=int(params[:, 2][0]))

                    multi_model = model_opt

                batch_size = int(params[:, 3][0])
                custom_adam = Adam(lr=math.pow(10, -float(params[:, 4][0])))
                multi_model.compile(loss='mse',
                                    optimizer=custom_adam,
                                    metrics=[metrics.mae, metrics.mse])

                history = multi_model.fit_generator(
                    generator=DataSequence(x_train_enum_tokens,
                                           vocab=tokens,
                                           max_length=max_length,
                                           props_set=y_train_enum,
                                           batch_size=batch_size),
                    steps_per_epoch=math.ceil(
                        len(x_train_enum_tokens) / batch_size) //
                    bayopt_it_factor,
                    validation_data=DataSequence(x_valid_enum_tokens,
                                                 vocab=tokens,
                                                 max_length=max_length,
                                                 props_set=y_valid_enum,
                                                 batch_size=min(
                                                     len(x_valid_enum_tokens),
                                                     batch_size)),
                    validation_steps=math.ceil(
                        len(x_valid_enum_tokens) /
                        min(len(x_valid_enum_tokens), batch_size)) //
                    bayopt_it_factor,
                    epochs=bayopt_n_epochs,
                    shuffle=True,
                    initial_epoch=0,
                    verbose=0)

                best_epoch = np.argmin(history.history['val_loss'])
                mae_valid = history.history['val_mean_absolute_error'][
                    best_epoch]
                mse_valid = history.history['val_mean_squared_error'][
                    best_epoch]
                if math.isnan(
                        mse_valid
                ):  # discard diverging architectures (rare event)
                    mae_valid = math.inf
                    mse_valid = math.inf
                print('Valid MAE: {0:0.4f}, RMSE: {1:0.4f}'.format(
                    mae_valid, mse_valid))

                return mse_valid

            print("Random initialization:\n")
            Bayes_opt = GPyOpt.methods.BayesianOptimization(
                f=create_mod,
                domain=bayopt_bounds,
                acquisition_type='EI',
                initial_design_numdata=bayopt_n_rounds,
                exact_feval=False,
                normalize_Y=True,
                num_cores=multiprocessing.cpu_count() - 1)
            print("Optimization:\n")
            Bayes_opt.run_optimization(max_iter=bayopt_n_rounds)
            best_arch = Bayes_opt.x_opt
        else:
            best_arch = [
                lstmunits_ref, denseunits_ref, embedding_ref, batch_size_ref,
                alpha_ref
            ]

        print("\nThe architecture for this datatset is:\n\tLSTM units: {}\n\tDense units: {}\n\tEmbedding dimensions {}".\
             format(int(best_arch[0]), int(best_arch[1]), int(best_arch[2])))
        print("\tBatch size: {0:}\n\tLearning rate: 10^-({1:.1f})\n".format(
            int(best_arch[3]), float(best_arch[4])))

        print("***Training of the best model.***\n")
        # Train the model and predict
        K.clear_session()
        # Define the multi-gpus model if necessary
        if n_gpus > 1:
            if bridge_type == 'NVLink':
                model_train = model.LSTMAttModel.create(
                    inputtokens=max_length + 1,
                    vocabsize=vocab_size,
                    lstmunits=int(best_arch[0]),
                    denseunits=int(best_arch[1]),
                    embedding=int(best_arch[2]))
            else:
                with tf.device('/cpu'):
                    model_train = model.LSTMAttModel.create(
                        inputtokens=max_length + 1,
                        vocabsize=vocab_size,
                        lstmunits=int(best_arch[0]),
                        denseunits=int(best_arch[1]),
                        embedding=int(best_arch[2]))
            print("Best model summary:\n")
            print(model_train.summary())
            print("\n")
            multi_model = model.ModelMGPU(model_train,
                                          gpus=n_gpus,
                                          bridge_type=bridge_type)
        else:
            model_train = model.LSTMAttModel.create(
                inputtokens=max_length + 1,
                vocabsize=vocab_size,
                lstmunits=int(best_arch[0]),
                denseunits=int(best_arch[1]),
                embedding=int(best_arch[2]))

            print("Best model summary:\n")
            print(model_train.summary())
            print("\n")
            multi_model = model_train

        batch_size = int(best_arch[3])
        custom_adam = Adam(lr=math.pow(10, -float(best_arch[4])))
        # Compile the model
        multi_model.compile(loss="mse",
                            optimizer=custom_adam,
                            metrics=[metrics.mae, metrics.mse])

        # Checkpoint, Early stopping and callbacks definition
        filepath = save_dir + 'LSTMAtt_' + data_name + '_model.best_fold_' + str(
            ifold) + '.hdf5'

        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_loss',
                                     verbose=0,
                                     save_best_only=True,
                                     mode='min')

        earlystopping = EarlyStopping(monitor='val_loss',
                                      min_delta=0,
                                      patience=patience,
                                      verbose=0,
                                      mode='min')

        callbacks_list = [checkpoint, earlystopping]

        # Fit the model
        history = multi_model.fit_generator(
            generator=DataSequence(x_train_enum_tokens,
                                   vocab=tokens,
                                   max_length=max_length,
                                   props_set=y_train_enum,
                                   batch_size=batch_size),
            validation_data=DataSequence(x_valid_enum_tokens,
                                         vocab=tokens,
                                         max_length=max_length,
                                         props_set=y_valid_enum,
                                         batch_size=min(
                                             len(x_valid_enum_tokens),
                                             batch_size)),
            epochs=n_epochs,
            shuffle=True,
            initial_epoch=0,
            callbacks=callbacks_list)

        # Summarize history for losses per epoch
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper right')
        plt.savefig(save_dir + 'History_fit_LSTMAtt_' + data_name +
                    '_model_weights.best_fold_' + str(ifold) + '.png',
                    bbox_inches='tight')
        plt.close()

        print("Best val_loss @ Epoch #{}\n".format(
            np.argmin(history.history['val_loss']) + 1))

        print("***Predictions from the best model.***\n")
        model_train.load_weights(save_dir + 'LSTMAtt_' + data_name +
                                 '_model.best_fold_' + str(ifold) + '.hdf5')
        model_train.compile(loss="mse",
                            optimizer='adam',
                            metrics=[metrics.mae, metrics.mse])

        # predict and compare for the training, validation and test sets
        x_train_enum_tokens_tointvec = token.int_vec_encode(
            tokenized_smiles_list=x_train_enum_tokens,
            max_length=max_length + 1,
            vocab=tokens)
        x_valid_enum_tokens_tointvec = token.int_vec_encode(
            tokenized_smiles_list=x_valid_enum_tokens,
            max_length=max_length + 1,
            vocab=tokens)
        x_test_enum_tokens_tointvec = token.int_vec_encode(
            tokenized_smiles_list=x_test_enum_tokens,
            max_length=max_length + 1,
            vocab=tokens)

        y_pred_train = model_train.predict(x_train_enum_tokens_tointvec)
        y_pred_valid = model_train.predict(x_valid_enum_tokens_tointvec)
        y_pred_test = model_train.predict(x_test_enum_tokens_tointvec)

        # compute a mean per set of augmented SMILES
        y_pred_train_mean, _ = utils.mean_median_result(
            x_train_enum_card, y_pred_train)
        y_pred_valid_mean, _ = utils.mean_median_result(
            x_valid_enum_card, y_pred_valid)
        y_pred_test_mean, _ = utils.mean_median_result(x_test_enum_card,
                                                       y_pred_test)

        # Inverse transform the scaling of the property and compute the errors (MAE, RMSE, R^2) of predictions vs. observations for each set
        y_pred_VS_true_train = scaler.inverse_transform(y_train) - \
                               scaler.inverse_transform(y_pred_train_mean.reshape(-1,1))
        mae_train = np.mean(np.absolute(y_pred_VS_true_train))
        mse_train = np.mean(np.square(y_pred_VS_true_train))
        corrcoef_train = r2_score(scaler.inverse_transform(y_train), \
                                 scaler.inverse_transform(y_pred_train_mean.reshape(-1,1)))
        print("For the training set:\nMAE: {0:0.4f} RMSE: {1:0.4f} R^2: {2:0.4f}\n".\
              format(mae_train, np.sqrt(mse_train), corrcoef_train))

        y_pred_VS_true_valid = scaler.inverse_transform(y_valid) - \
                               scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1))
        mae_valid = np.mean(np.absolute(y_pred_VS_true_valid))
        mse_valid = np.mean(np.square(y_pred_VS_true_valid))
        corrcoef_valid = r2_score(scaler.inverse_transform(y_valid), \
                                  scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1)))
        print("For the validation set:\nMAE: {0:0.4f} RMSE: {1:0.4f} R^2: {2:0.4f}\n".\
              format(mae_valid, np.sqrt(mse_valid), corrcoef_valid))

        y_pred_VS_true_test = scaler.inverse_transform(y_test) - \
                              scaler.inverse_transform(y_pred_test_mean.reshape(-1,1))
        mae_test = np.mean(np.absolute(y_pred_VS_true_test))
        mse_test = np.mean(np.square(y_pred_VS_true_test))
        corrcoef_test = r2_score(scaler.inverse_transform(y_test), \
                                 scaler.inverse_transform(y_pred_test_mean.reshape(-1,1)))
        print("For the test set:\nMAE: {0:0.4f} RMSE: {1:0.4f} R^2: {2:0.4f}\n".\
              format(mae_test, np.sqrt(mse_test), corrcoef_test))

        # Plot the final result
        # Unscaling the data
        y_train = scaler.inverse_transform(y_train)
        y_pred_train_mean = scaler.inverse_transform(
            y_pred_train_mean.reshape(-1, 1))
        y_valid = scaler.inverse_transform(y_valid)
        y_pred_valid_mean = scaler.inverse_transform(
            y_pred_valid_mean.reshape(-1, 1))
        y_test = scaler.inverse_transform(y_test)
        y_pred_test_mean = scaler.inverse_transform(
            y_pred_test_mean.reshape(-1, 1))

        # Scatter plot of predictions vs. observations (custom colors, scaling and sizes)
        plt.figure(figsize=(12, 8))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Setting plot limits
        y_true_min = min(np.min(y_train), np.min(y_valid), np.min(y_test))
        y_true_max = max(np.max(y_train), np.max(y_valid), np.max(y_test))
        y_pred_min = min(np.min(y_pred_train_mean), np.min(y_pred_valid_mean),
                         np.min(y_pred_test_mean))
        y_pred_max = max(np.max(y_pred_train_mean), np.max(y_pred_valid_mean),
                         np.max(y_pred_test_mean))
        # Expanding slightly the canvas around the data points (by 10%)
        axmin = y_true_min - 0.1 * (y_true_max - y_true_min)
        axmax = y_true_max + 0.1 * (y_true_max - y_true_min)
        aymin = y_pred_min - 0.1 * (y_pred_max - y_pred_min)
        aymax = y_pred_max + 0.1 * (y_pred_max - y_pred_min)

        plt.xlim(min(axmin, aymin), max(axmax, aymax))
        plt.ylim(min(axmin, aymin), max(axmax, aymax))

        plt.errorbar(y_train,
                     y_pred_train_mean,
                     fmt='o',
                     label="Train",
                     elinewidth=0,
                     ms=5,
                     mfc='#519fc4',
                     markeredgewidth=0,
                     alpha=0.7)
        plt.errorbar(y_valid,
                     y_pred_valid_mean,
                     elinewidth=0,
                     fmt='o',
                     label="Validation",
                     ms=5,
                     mfc='#db702e',
                     markeredgewidth=0,
                     alpha=0.7)
        plt.errorbar(y_test,
                     y_pred_test_mean,
                     elinewidth=0,
                     fmt='o',
                     label="Test",
                     ms=5,
                     mfc='#cc1b00',
                     markeredgewidth=0,
                     alpha=0.7)

        # Plot X=Y line
        plt.plot([
            max(plt.xlim()[0],
                plt.ylim()[0]),
            min(plt.xlim()[1],
                plt.ylim()[1])
        ], [
            max(plt.xlim()[0],
                plt.ylim()[0]),
            min(plt.xlim()[1],
                plt.ylim()[1])
        ],
                 ':',
                 color='#595f69')

        plt.xlabel('Observations ' + data_units, fontsize=12)
        plt.ylabel('Predictions ' + data_units, fontsize=12)
        plt.legend()

        # Save the figure; the file name includes the fold number
        plt.savefig(save_dir + 'TrainValid_Plot_LSTMAtt_' + data_name +
                    '_model_weights.best_fold_' + str(ifold) + '.png',
                    bbox_inches='tight',
                    dpi=80)
        plt.close()