예제 #1
0
def _create_lstmatt_model(max_length, vocab_size, lstmunits, denseunits, embedding, 
                          n_gpus = 1, bridge_type = 'None'):
    """Build the (not yet compiled) LSTM-attention template model.

    The model is created through ``model.LSTMAttModel.create`` with
    ``max_length+1`` input tokens. When several GPUs are requested and the
    bridge is not 'NVLink', the template is instantiated under
    ``tf.device('/cpu')``; otherwise it is created on the default device.

    Parameters
    ----------
    max_length : int
        Length of the longest tokenized SMILES.
    vocab_size : int
        Vocabulary size handed to the model as ``vocabsize``.
    lstmunits, denseunits, embedding : int
        Architecture hyperparameters forwarded as-is.
    n_gpus : int
        Number of GPUs requested by the caller.
    bridge_type : str
        GPU bridge type; only the value 'NVLink' is treated specially.

    Returns
    -------
    The object returned by ``model.LSTMAttModel.create``.
    """
    build_kwargs = dict(inputtokens = max_length+1, 
                        vocabsize = vocab_size, 
                        lstmunits = lstmunits, 
                        denseunits = denseunits, 
                        embedding = embedding)
    if n_gpus > 1 and bridge_type != 'NVLink':
        with tf.device('/cpu'): # necessary to multi-GPU scaling
            return model.LSTMAttModel.create(**build_kwargs)
    return model.LSTMAttModel.create(**build_kwargs)


def Main(data, 
         data_name, 
         bayopt_bounds, 
         data_units = '',
         k_fold_number = 8, 
         augmentation = False, 
         outdir = "../data/", 
         bayopt_n_epochs = 10,
         bayopt_n_rounds = 25, 
         bayopt_it_factor = 1, 
         bayopt_on = True, 
         lstmunits_ref = 512, 
         denseunits_ref = 512, 
         embedding_ref = 512, 
         batch_size_ref = 64, 
         alpha_ref = 3, 
         n_gpus = 1, 
         bridge_type = 'None', 
         patience = 25, 
         n_epochs = 1000):
    """Train and evaluate one LSTM-attention model per random data split.

    For each of ``k_fold_number`` seeds (drawn reproducibly from a fixed
    NumPy seed) the function splits the data, optionally augments the SMILES,
    tokenizes them, optionally searches the architecture with GPyOpt, trains
    the selected architecture with checkpointing and early stopping, and
    finally writes a loss-history plot and a predictions-vs-observations plot.
    All artefacts go to ``<outdir>Main/<data_name>/<Augm|Can>/``.

    Parameters
    ----------
    data :
        Object exposing a ``smiles`` attribute and ``iloc[:, 1]`` for the
        property values (presumably a pandas DataFrame -- TODO confirm).
    data_name : str
        Name used for the output sub-directory and in every file name.
    bayopt_bounds :
        Search domain passed unchanged as ``domain`` to
        ``GPyOpt.methods.BayesianOptimization``. The objective reads its
        columns in this order: LSTM units, dense units, embedding size,
        batch size, and ``a`` such that the learning rate is ``10**(-a)``.
    data_units : str
        Text appended to the axis labels of the final plot.
    k_fold_number : int
        Number of random splits to process.
    augmentation : bool
        If truthy, SMILES are augmented (``canon=False, rotate=True``) and the
        output directory is 'Augm'; otherwise canonical SMILES are used
        ('Can').
    outdir : str
        Root output directory (string-concatenated, so keep the trailing '/').
    bayopt_n_epochs : int
        Training epochs for each architecture tried during the search.
    bayopt_n_rounds : int
        Used both as the number of initial design points and as ``max_iter``
        of the optimization.
    bayopt_it_factor : int
        Divisor applied to the steps per epoch during the search; at least
        one step is always run.
    bayopt_on : bool
        If falsy, skip the search and use the ``*_ref`` values below.
    lstmunits_ref, denseunits_ref, embedding_ref, batch_size_ref, alpha_ref :
        Reference architecture used when ``bayopt_on`` is falsy
        (``alpha_ref`` is the negative base-10 exponent of the learning rate).
    n_gpus : int
        Number of GPUs; values above 1 wrap the model in ``model.ModelMGPU``.
    bridge_type : str
        Forwarded to ``model.ModelMGPU``; 'NVLink' additionally avoids
        building the template model on the CPU.
    patience : int
        Early-stopping patience (epochs) on the validation loss.
    n_epochs : int
        Maximum number of epochs for the final training.

    Returns
    -------
    None
    """
    
    p_dir_temp = 'Augm' if augmentation else 'Can'
        
    save_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)
        
    print("***SMILES_X starts...***\n\n")
    # Fixed seed so that the per-fold seeds (and hence the splits) can be reproduced
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
    # (ratio as stated by the original authors; it is decided inside utils.random_split)
    for ifold in range(k_fold_number):
        
        print("******")
        print("***Fold #{} initiated...***".format(ifold))
        print("******")
        
        print("***Sampling and splitting of the dataset.***\n")
        x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
        utils.random_split(smiles_input=data.smiles, 
                           prop_input=np.array(data.iloc[:,1]), 
                           random_state=seed_list[ifold], 
                           scaling = True)
              
        # data augmentation or not
        # (truthiness test, consistent with the choice of the output directory above)
        if augmentation:
            print("***Data augmentation to {}***\n".format(augmentation))
            canonical = False
            rotation = True
        else:
            print("***No data augmentation has been required.***\n")
            canonical = True
            rotation = False
            
        x_train_enum, x_train_enum_card, y_train_enum = \
        augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

        x_valid_enum, x_valid_enum_card, y_valid_enum = \
        augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

        x_test_enum, x_test_enum_card, y_test_enum = \
        augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)
        
        print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
        format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))
        
        print("***Tokenization of SMILES.***\n")
        # Tokenize SMILES per dataset
        x_train_enum_tokens = token.get_tokens(x_train_enum)
        x_valid_enum_tokens = token.get_tokens(x_valid_enum)
        x_test_enum_tokens = token.get_tokens(x_test_enum)
        
        print("Examples of tokenized SMILES from a training set:\n{}\n".\
        format(x_train_enum_tokens[:5]))
        
        # Vocabulary size computation
        all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens

        # Create the vocabulary file once, then always read it back from disk
        # so that every fold works with the same stored vocabulary
        vocab_file = save_dir+data_name+'_Vocabulary.txt'
        if not os.path.exists(vocab_file):
            token.save_vocab(token.extract_vocab(all_smiles_tokens), vocab_file)
        tokens = token.get_vocab(vocab_file)

        vocab_size = len(tokens)
        
        # Diagnostics: how the token sets of the three splits overlap
        train_unique_tokens = token.extract_vocab(x_train_enum_tokens)
        print("Number of tokens only present in a training set: {}\n".format(len(train_unique_tokens)))
        valid_unique_tokens = token.extract_vocab(x_valid_enum_tokens)
        print("Number of tokens only present in a validation set: {}".format(len(valid_unique_tokens)))
        print("Is the validation set a subset of the training set: {}".\
              format(valid_unique_tokens.issubset(train_unique_tokens)))
        print("What are the tokens by which they differ: {}\n".\
              format(valid_unique_tokens.difference(train_unique_tokens)))
        test_unique_tokens = token.extract_vocab(x_test_enum_tokens)
        print("Number of tokens only present in a test set: {}".format(len(test_unique_tokens)))
        print("Is the test set a subset of the training set: {}".\
              format(test_unique_tokens.issubset(train_unique_tokens)))
        print("What are the tokens by which they differ: {}".\
              format(test_unique_tokens.difference(train_unique_tokens)))
        print("Is the test set a subset of the validation set: {}".\
              format(test_unique_tokens.issubset(valid_unique_tokens)))
        print("What are the tokens by which they differ: {}\n".\
              format(test_unique_tokens.difference(valid_unique_tokens)))
        
        print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))
        
        # Add 'pad', 'unk' tokens to the existing list
        tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
        
        # Maximum of length of SMILES to process
        max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
        print("Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n".format(max_length))
        
        print("***Bayesian Optimization of the SMILESX's architecture.***\n")        
        if bayopt_on:
            # Operate the bayesian optimization of the neural architecture
            def create_mod(params):
                """Objective for GPyOpt: train briefly, return the best validation MSE."""
                print('Model: {}'.format(params))

                K.clear_session()

                # params is indexed as a 2D array holding a single row;
                # every hyperparameter is read as the scalar params[:, k][0]
                model_opt = _create_lstmatt_model(max_length, 
                                                  vocab_size, 
                                                  lstmunits = int(params[:,0][0]), 
                                                  denseunits = int(params[:,1][0]), 
                                                  embedding = int(params[:,2][0]), 
                                                  n_gpus = n_gpus, 
                                                  bridge_type = bridge_type)
                if n_gpus > 1:
                    multi_model = model.ModelMGPU(model_opt, gpus=n_gpus, bridge_type=bridge_type)
                else: # single GPU
                    multi_model = model_opt

                batch_size = int(params[:,3][0])
                valid_batch_size = min(len(x_valid_enum_tokens), batch_size)
                custom_adam = Adam(lr=math.pow(10,-float(params[:,4][0])))
                multi_model.compile(loss='mse', optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])

                # The floor division may reach 0 for a large bayopt_it_factor;
                # always run at least one training and one validation step
                train_steps = max(1, math.ceil(len(x_train_enum_tokens)/batch_size)//bayopt_it_factor)
                valid_steps = max(1, math.ceil(len(x_valid_enum_tokens)/valid_batch_size)//bayopt_it_factor)

                history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                             vocab = tokens, 
                                                                             max_length = max_length, 
                                                                             props_set = y_train_enum, 
                                                                             batch_size = batch_size), 
                                                    steps_per_epoch = train_steps, 
                                                    validation_data = DataSequence(x_valid_enum_tokens,
                                                                                   vocab = tokens, 
                                                                                   max_length = max_length, 
                                                                                   props_set = y_valid_enum, 
                                                                                   batch_size = valid_batch_size),
                                                    validation_steps = valid_steps, 
                                                    epochs = bayopt_n_epochs, 
                                                    shuffle = True,
                                                    initial_epoch = 0, 
                                                    verbose = 0)

                best_epoch = np.argmin(history.history['val_loss'])
                mae_valid = history.history['val_mean_absolute_error'][best_epoch]
                mse_valid = history.history['val_mean_squared_error'][best_epoch]
                if math.isnan(mse_valid): # discard diverging architectures (rare event)
                    mae_valid = math.inf
                    mse_valid = math.inf
                # Report a true RMSE (square root of the MSE); the optimizer still minimizes the MSE
                print('Valid MAE: {0:0.4f}, RMSE: {1:0.4f}'.format(mae_valid, math.sqrt(mse_valid)))

                return mse_valid

            print("Random initialization:\n")
            Bayes_opt = GPyOpt.methods.BayesianOptimization(f=create_mod, 
                                                            domain=bayopt_bounds, 
                                                            acquisition_type = 'EI',
                                                            initial_design_numdata = bayopt_n_rounds,
                                                            exact_feval = False,
                                                            normalize_Y = True,
                                                            num_cores = multiprocessing.cpu_count()-1)
            print("Optimization:\n")
            Bayes_opt.run_optimization(max_iter=bayopt_n_rounds)
            best_arch = Bayes_opt.x_opt
        else:
            best_arch = [lstmunits_ref, denseunits_ref, embedding_ref, batch_size_ref, alpha_ref]
            
        print("\nThe architecture for this datatset is:\n\tLSTM units: {}\n\tDense units: {}\n\tEmbedding dimensions {}".\
             format(int(best_arch[0]), int(best_arch[1]), int(best_arch[2])))
        print("\tBatch size: {0:}\n\tLearning rate: 10^-({1:.1f})\n".format(int(best_arch[3]), float(best_arch[4])))
        
        print("***Training of the best model.***\n")
        # Train the model and predict
        K.clear_session()   
        model_train = _create_lstmatt_model(max_length, 
                                            vocab_size, 
                                            lstmunits = int(best_arch[0]), 
                                            denseunits = int(best_arch[1]), 
                                            embedding = int(best_arch[2]), 
                                            n_gpus = n_gpus, 
                                            bridge_type = bridge_type)
        print("Best model summary:\n")
        print(model_train.summary())
        print("\n")
        # Define the multi-gpus model if necessary
        if n_gpus > 1:
            multi_model = model.ModelMGPU(model_train, gpus=n_gpus, bridge_type=bridge_type)
        else:
            multi_model = model_train

        batch_size = int(best_arch[3])
        custom_adam = Adam(lr=math.pow(10,-float(best_arch[4])))
        # Compile the model
        multi_model.compile(loss="mse", optimizer=custom_adam, metrics=[metrics.mae,metrics.mse])
        
        # Checkpoint, Early stopping and callbacks definition
        filepath=save_dir+'LSTMAtt_'+data_name+'_model.best_fold_'+str(ifold)+'.hdf5'
        
        checkpoint = ModelCheckpoint(filepath, 
                                     monitor='val_loss', 
                                     verbose=0, 
                                     save_best_only=True, 
                                     mode='min')

        earlystopping = EarlyStopping(monitor='val_loss', 
                                      min_delta=0, 
                                      patience=patience, 
                                      verbose=0, 
                                      mode='min')
                
        callbacks_list = [checkpoint, earlystopping]

        # Fit the model
        history = multi_model.fit_generator(generator = DataSequence(x_train_enum_tokens,
                                                                     vocab = tokens, 
                                                                     max_length = max_length, 
                                                                     props_set = y_train_enum, 
                                                                     batch_size = batch_size), 
                                            validation_data = DataSequence(x_valid_enum_tokens,
                                                                           vocab = tokens, 
                                                                           max_length = max_length, 
                                                                           props_set = y_valid_enum, 
                                                                           batch_size = min(len(x_valid_enum_tokens), batch_size)),
                                            epochs = n_epochs, 
                                            shuffle = True,
                                            initial_epoch = 0, 
                                            callbacks = callbacks_list)

        # Summarize history for losses per epoch
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper right')
        plt.savefig(save_dir+'History_fit_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight')
        plt.close()
        
        print("Best val_loss @ Epoch #{}\n".format(np.argmin(history.history['val_loss'])+1))

        print("***Predictions from the best model.***\n")
        # Reload the checkpointed (best validation loss) weights into the template model
        model_train.load_weights(filepath)
        model_train.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

        # predict and compare for the training, validation and test sets
        x_train_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_train_enum_tokens, 
                                                            max_length = max_length+1, 
                                                            vocab = tokens)
        x_valid_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_valid_enum_tokens, 
                                                            max_length = max_length+1, 
                                                            vocab = tokens)
        x_test_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list = x_test_enum_tokens, 
                                                           max_length = max_length+1, 
                                                           vocab = tokens)

        y_pred_train = model_train.predict(x_train_enum_tokens_tointvec)
        y_pred_valid = model_train.predict(x_valid_enum_tokens_tointvec)
        y_pred_test = model_train.predict(x_test_enum_tokens_tointvec)

        # compute a mean per set of augmented SMILES
        y_pred_train_mean, _ = utils.mean_median_result(x_train_enum_card, y_pred_train)
        y_pred_valid_mean, _ = utils.mean_median_result(x_valid_enum_card, y_pred_valid)
        y_pred_test_mean, _ = utils.mean_median_result(x_test_enum_card, y_pred_test)

        # inverse transform the scaling of the property once; the metrics and
        # the 'predictions VS observations' plot both use the unscaled values
        y_train = scaler.inverse_transform(y_train)
        y_pred_train_mean = scaler.inverse_transform(y_pred_train_mean.reshape(-1,1))
        y_valid = scaler.inverse_transform(y_valid)
        y_pred_valid_mean = scaler.inverse_transform(y_pred_valid_mean.reshape(-1,1))
        y_test = scaler.inverse_transform(y_test)
        y_pred_test_mean = scaler.inverse_transform(y_pred_test_mean.reshape(-1,1))

        for set_name, y_true_set, y_pred_set in (("training", y_train, y_pred_train_mean), 
                                                 ("validation", y_valid, y_pred_valid_mean), 
                                                 ("test", y_test, y_pred_test_mean)):
            residuals = y_true_set - y_pred_set
            mae_set = np.mean(np.absolute(residuals))
            mse_set = np.mean(np.square(residuals))
            corrcoef_set = r2_score(y_true_set, y_pred_set)
            print("For the {0} set:\nMAE: {1:0.4f} RMSE: {2:0.4f} R^2: {3:0.4f}\n".\
                  format(set_name, mae_set, np.sqrt(mse_set), corrcoef_set))

        # Plot the final result
        # Changed colors, scaling and sizes
        plt.figure(figsize=(12, 8))

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Setting plot limits
        y_true_min = min(np.min(y_train), np.min(y_valid), np.min(y_test))
        y_true_max = max(np.max(y_train), np.max(y_valid), np.max(y_test))
        y_pred_min = min(np.min(y_pred_train_mean), np.min(y_pred_valid_mean), np.min(y_pred_test_mean))
        y_pred_max = max(np.max(y_pred_train_mean), np.max(y_pred_valid_mean), np.max(y_pred_test_mean))
        # Expanding slightly the canvas around the data points (by 10%)
        axmin = y_true_min-0.1*(y_true_max-y_true_min)
        axmax = y_true_max+0.1*(y_true_max-y_true_min)
        aymin = y_pred_min-0.1*(y_pred_max-y_pred_min)
        aymax = y_pred_max+0.1*(y_pred_max-y_pred_min)

        # Same limits on both axes so that the X=Y line is the diagonal
        plt.xlim(min(axmin, aymin), max(axmax, aymax))
        plt.ylim(min(axmin, aymin), max(axmax, aymax))
                        
        for set_label, y_true_set, y_pred_set, set_color in (("Train", y_train, y_pred_train_mean, '#519fc4'), 
                                                             ("Validation", y_valid, y_pred_valid_mean, '#db702e'), 
                                                             ("Test", y_test, y_pred_test_mean, '#cc1b00')):
            plt.errorbar(y_true_set, 
                         y_pred_set,
                         fmt='o',
                         label=set_label,
                         elinewidth = 0, 
                         ms=5,
                         mfc=set_color,
                         markeredgewidth = 0,
                         alpha=0.7)

        # Plot X=Y line
        plt.plot([max(plt.xlim()[0], plt.ylim()[0]), 
                  min(plt.xlim()[1], plt.ylim()[1])],
                 [max(plt.xlim()[0], plt.ylim()[0]), 
                  min(plt.xlim()[1], plt.ylim()[1])],
                 ':', color = '#595f69')
        
        plt.xlabel('Observations ' + data_units, fontsize = 12)
        plt.ylabel('Predictions ' + data_units, fontsize = 12)
        plt.legend()

        # Added fold number
        plt.savefig(save_dir+'TrainValid_Plot_LSTMAtt_'+data_name+'_model_weights.best_fold_'+str(ifold)+'.png', bbox_inches='tight', dpi=80)
        plt.close()
예제 #2
0
def Embedding_Vis(data,
                  data_name,
                  data_units='',
                  k_fold_number=8,
                  k_fold_index=0,
                  augmentation=False,
                  outdir="../data/",
                  affinity_propn=True,
                  verbose=0):
    """Plot a 2D PCA projection of the token embedding of a trained model.

    The data split of fold ``k_fold_index`` is regenerated with the same
    fixed seeding scheme used at training time, the model saved for that fold
    is loaded from ``<outdir>Main/<data_name>/<Augm|Can>/``, the weights of
    ``model.layers[1]`` are projected onto two principal components, and each
    token is drawn and annotated. The figure is written to
    ``<outdir>Embedding_Vis/<data_name>/<Augm|Can>/`` and then shown.

    Parameters
    ----------
    data :
        Object exposing a ``smiles`` attribute and ``iloc[:, 1]`` for the
        property values (presumably a pandas DataFrame -- TODO confirm).
    data_name : str
        Dataset name used to locate the trained model and to name the output.
    data_units : str
        Not used by this function.
    k_fold_number : int
        Number of seeds to draw; must match the value used for training so
        that the split of fold ``k_fold_index`` is reproduced.
    k_fold_index : int
        Index of the fold whose model is visualized.
    augmentation : bool
        Selects the 'Augm' or 'Can' sub-directory and whether SMILES are
        augmented before tokenization.
    outdir : str
        Root directory (string-concatenated, so keep the trailing '/').
    affinity_propn : bool
        If truthy, colour tokens by Affinity Propagation cluster of the
        embedding weights; otherwise draw a black scatter plot.
    verbose : int
        Not used by this function.

    Returns
    -------
    None
    """

    p_dir_temp = 'Augm' if augmentation else 'Can'

    input_dir = outdir + 'Main/' + '{}/{}/'.format(data_name, p_dir_temp)
    save_dir = outdir + 'Embedding_Vis/' + '{}/{}/'.format(
        data_name, p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X for embedding visualization starts...***\n\n")
    # Same fixed seed as the training routine, so seed_list is identical
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size=k_fold_number).tolist()

    print("******")
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    # Reproducing the data split of the requested fold (k_fold_index)
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
    utils.random_split(smiles_input=data.smiles,
                       prop_input=np.array(data.iloc[:,1]),
                       random_state=seed_list[k_fold_index],
                       scaling = True)

    # data augmentation or not
    # (truthiness test, consistent with the choice of the directory above)
    if augmentation:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, x_valid_enum_card, y_valid_enum = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, x_test_enum_card, y_test_enum = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    all_smiles_tokens = x_train_enum_tokens + x_valid_enum_tokens + x_test_enum_tokens

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(
        len(train_unique_tokens)))
    # Set for constant-time membership tests; 'pad' is counted as a
    # training token, as in the original list-based implementation
    train_token_set = set(train_unique_tokens) | {'pad'}

    # Tokens as a list. Prefer the shared vocabulary file written by Main;
    # fall back to the per-fold file name this function originally expected.
    vocab_file = input_dir + data_name + '_Vocabulary.txt'
    if not os.path.exists(vocab_file):
        vocab_file = (input_dir + data_name + '_tokens_set_fold_' +
                      str(k_fold_index) + '.txt')
    tokens = token.get_vocab(vocab_file)
    # Size of the vocabulary actually loaded, so that it always agrees
    # with the token list used to label the embedding rows below
    vocab_size = len(tokens)
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print(
        "Maximum length of tokenized SMILES: {} tokens (termination spaces included)\n"
        .format(max_length))

    # Transformation of tokenized SMILES to vector of integers and vice-versa
    # (kept from the original code; the mappings are not used further down)
    token_to_int = token.get_tokentoint(tokens)
    int_to_token = token.get_inttotoken(tokens)

    model_train = load_model(input_dir + 'LSTMAtt_' + data_name +
                             '_model.best_fold_' + str(k_fold_index) + '.hdf5',
                             custom_objects={'AttentionM': model.AttentionM()})

    print("Chosen model summary:\n")
    print(model_train.summary())
    print("\n")

    print("***Embedding of the individual tokens from the chosen model.***\n")
    model_train.compile(loss="mse",
                        optimizer='adam',
                        metrics=[metrics.mae, metrics.mse])

    # First weight array of layer #1 (presumably the embedding matrix,
    # one row per token index -- TODO confirm against the model definition)
    model_embed_weights = model_train.layers[1].get_weights()[0]
    pca = PCA(n_components=2, random_state=123)
    transformed_weights = pca.fit_transform(model_embed_weights)

    plt.figure(figsize=(9, 9))
    ax = plt.subplot(aspect='equal')

    if affinity_propn:
        # Compute Affinity Propagation
        af = AffinityPropagation().fit(model_embed_weights)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        # Plot it: one colour per cluster, cycling through the basic colours
        colors = cycle('bgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = np.where(labels == k)[0].tolist()
            for ilabpt in class_members:
                # Training tokens: filled circle with a thin black edge;
                # other tokens: thick cross in the cluster colour
                in_train = tokens[ilabpt] in train_token_set
                ax.plot(transformed_weights[ilabpt, 0],
                        transformed_weights[ilabpt, 1],
                        col,
                        marker='o' if in_train else 'x',
                        markeredgecolor='black' if in_train else col,
                        markeredgewidth=1 if in_train else 5,
                        alpha=0.5,
                        markersize=10)
    else:
        # Black and white plot
        for ilabpt in range(vocab_size):
            in_train = tokens[ilabpt] in train_token_set
            ax.scatter(transformed_weights[ilabpt, 0],
                       transformed_weights[ilabpt, 1],
                       lw=1,
                       s=40 if in_train else 20,
                       facecolor='black',
                       marker='o',
                       alpha=0.5 if in_train else 0.2)

    annotations = []
    for ilabpt, (x_i, y_i) in enumerate(
            zip(transformed_weights[:, 0].tolist(),
                transformed_weights[:, 1].tolist())):
        # 'black' is the heaviest matplotlib font weight
        weight_tmp = 'black' if tokens[ilabpt] in train_token_set else 'normal'
        tokens_tmp = tokens[ilabpt]
        # Spell out tokens that would be invisible or ambiguous as labels
        if tokens_tmp == ' ':
            tokens_tmp = 'space'
        elif tokens_tmp == '.':
            tokens_tmp = 'dot'
        annotations.append(
            plt.text(x_i, y_i, tokens_tmp, fontsize=12, weight=weight_tmp))
    adjust_text(annotations,
                x=transformed_weights[:, 0].tolist(),
                y=transformed_weights[:, 1].tolist(),
                arrowprops=dict(arrowstyle="-", color='k', lw=0.5))

    plt.xticks([])
    plt.yticks([])
    ax.axis('tight')

    plt.savefig(save_dir + 'Visualization_' + data_name + '_Embedding_fold_' +
                str(k_fold_index) + '.png',
                bbox_inches='tight')
    plt.show()
예제 #3
0
def Interpretation(data,
                   data_name,
                   data_units = '',
                   k_fold_number = 8,
                   k_fold_index=0,
                   augmentation = False,
                   outdir = "../data/",
                   smiles_toviz = 'CCC',
                   font_size = 15,
                   font_rotation = 'horizontal'):
    """Visualize what a trained model attends to for one SMILES string.

    The function rebuilds the train/validation/test split of fold
    ``k_fold_index``, reloads the vocabulary and the best model saved for
    that fold's seed, and produces three figures for ``smiles_toviz``:

    * ``Interpretation_1D_<data_name>_seed_<seed>.png``: a one-row heat map
      of the output of the model's second-to-last layer over the tokens.
    * ``Interpretation_2D_<data_name>_seed_<seed>.png``: an RDKit similarity
      map drawn from the same values after min-max normalisation.
    * ``Interpretation_temporal_<data_name>_seed_<seed>.png``: a stem plot of
      the relative change in prediction as the SMILES is fed token by token.

    It also writes ``smiles_train.txt``, ``smiles_valid.txt`` and
    ``smiles_test.txt`` (the split SMILES) to the output directory and prints
    progress information to stdout.

    Args:
        data: Table exposing a ``smiles`` attribute and positional access via
            ``iloc``; column 1 is read as the property values (presumably a
            pandas DataFrame -- confirm against the caller).
        data_name: Dataset name; used to build directory and file names.
        data_units: Not used by the visible part of this function.
        k_fold_number: Number of seeds drawn (with NumPy seed 123).
        k_fold_index: Index into the seed list selecting the fold.
        augmentation: Selects the ``Augm`` (truthy) or ``Can`` sub-directory
            and, when equal to ``True``, enables SMILES rotation instead of
            canonical SMILES.
        outdir: Base directory. Inputs are read from
            ``outdir + 'Main/<data_name>/<Augm|Can>/'`` and outputs written to
            ``outdir + 'Interpretation/<data_name>/<Augm|Can>/'``.
        smiles_toviz: SMILES string to interpret.
        font_size: Font size of the token tick labels.
        font_rotation: Rotation of the token tick labels.

    Returns:
        None. Returns early (after writing the split files) when RDKit cannot
        parse ``smiles_toviz``.
    """

    if augmentation:
        p_dir_temp = 'Augm'
    else:
        p_dir_temp = 'Can'

    input_dir = outdir+'Main/'+'{}/{}/'.format(data_name,p_dir_temp)
    save_dir = outdir+'Interpretation/'+'{}/{}/'.format(data_name,p_dir_temp)
    os.makedirs(save_dir, exist_ok=True)

    print("***SMILES_X Interpreter starts...***\n\n")
    # Same seeding as Main, so seed_list[k_fold_index] matches the seed that
    # appears in the saved model file name for that fold.
    np.random.seed(seed=123)
    seed_list = np.random.randint(int(1e6), size = k_fold_number).tolist()
    # Train/validation/test data splitting - 80/10/10 % at random with diff. seeds for k_fold_number times
    selection_seed = seed_list[k_fold_index]

    print("******")
    # Report the fold index (as Main does), not the random seed.
    print("***Fold #{} initiated...***".format(k_fold_index))
    print("******")

    print("***Sampling and splitting of the dataset.***\n")
    x_train, x_valid, x_test, y_train, y_valid, y_test, scaler = \
    utils.random_split(smiles_input=data.smiles,
                       prop_input=np.array(data.iloc[:,1]),
                       random_state=selection_seed,
                       scaling = True)

    np.savetxt(save_dir+'smiles_train.txt', np.asarray(x_train), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_valid.txt', np.asarray(x_valid), newline="\n", fmt='%s')
    np.savetxt(save_dir+'smiles_test.txt', np.asarray(x_test), newline="\n", fmt='%s')

    # Canonicalize the requested SMILES; abort when RDKit rejects it.
    mol_toviz = Chem.MolFromSmiles(smiles_toviz)
    if mol_toviz is None:
        print("***Process of visualization automatically aborted!***")
        print("The smiles_toviz is incorrect and cannot be canonicalized by RDKit.")
        return
    smiles_toviz_can = Chem.MolToSmiles(mol_toviz)

    smiles_toviz_x = np.array([smiles_toviz_can])
    # Look up the reference property value when the SMILES belongs to the
    # dataset; otherwise mark it as unknown with NaN.
    if smiles_toviz_can in np.array(data.smiles):
        smiles_toviz_y = np.array([[data.iloc[np.where(data.smiles == smiles_toviz_x[0])[0][0],1]]])
    else:
        smiles_toviz_y = np.array([[np.nan]])

    # data augmentation or not
    # The `== True` test is kept as-is: Main uses the same test when training.
    if augmentation == True:
        print("***Data augmentation.***\n")
        canonical = False
        rotation = True
    else:
        print("***No data augmentation has been required.***\n")
        canonical = True
        rotation = False

    x_train_enum, x_train_enum_card, y_train_enum = \
    augm.Augmentation(x_train, y_train, canon=canonical, rotate=rotation)

    x_valid_enum, x_valid_enum_card, y_valid_enum = \
    augm.Augmentation(x_valid, y_valid, canon=canonical, rotate=rotation)

    x_test_enum, x_test_enum_card, y_test_enum = \
    augm.Augmentation(x_test, y_test, canon=canonical, rotate=rotation)

    smiles_toviz_x_enum, smiles_toviz_x_enum_card, smiles_toviz_y_enum = \
    augm.Augmentation(smiles_toviz_x, smiles_toviz_y, canon=canonical, rotate=rotation)

    print("Enumerated SMILES:\n\tTraining set: {}\n\tValidation set: {}\n\tTest set: {}\n".\
    format(x_train_enum.shape[0], x_valid_enum.shape[0], x_test_enum.shape[0]))

    print("***Tokenization of SMILES.***\n")
    # Tokenize SMILES per dataset
    x_train_enum_tokens = token.get_tokens(x_train_enum)
    x_valid_enum_tokens = token.get_tokens(x_valid_enum)
    x_test_enum_tokens = token.get_tokens(x_test_enum)

    smiles_toviz_x_enum_tokens = token.get_tokens(smiles_toviz_x_enum)

    print("Examples of tokenized SMILES from a training set:\n{}\n".\
    format(x_train_enum_tokens[:5]))

    # Vocabulary size computation
    all_smiles_tokens = x_train_enum_tokens+x_valid_enum_tokens+x_test_enum_tokens
    tokens = token.extract_vocab(all_smiles_tokens)
    vocab_size = len(tokens)

    train_unique_tokens = list(token.extract_vocab(x_train_enum_tokens))
    print(train_unique_tokens)
    print("Number of tokens only present in a training set: {}\n".format(len(train_unique_tokens)))
    train_unique_tokens.insert(0,'pad')

    # Tokens as a list
    # NOTE(review): `tokens` is replaced by the vocabulary saved on disk while
    # `vocab_size` still comes from the freshly extracted one; they presumably
    # agree because the split is reproduced with the same seed -- confirm.
    tokens = token.get_vocab(input_dir+data_name+'_Vocabulary.txt')
    # Add 'pad', 'unk' tokens to the existing list
    tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)

    print("Full vocabulary: {}\nOf size: {}\n".format(tokens, vocab_size))

    # Maximum of length of SMILES to process
    max_length = np.max([len(ismiles) for ismiles in all_smiles_tokens])
    print("Maximum length of tokenized SMILES: {} tokens\n".format(max_length))

    # Transformation of tokenized SMILES to vector of integers and vice-versa
    token_to_int = token.get_tokentoint(tokens)
    int_to_token = token.get_inttotoken(tokens)

    # Best architecture to visualize from
    model_topredict = load_model(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5',
                                          custom_objects={'AttentionM': model.AttentionM()})
    # Layer sizes are read back from the saved model. The halving of layer 2's
    # width presumably undoes a bidirectional concatenation -- TODO confirm.
    best_arch = [model_topredict.layers[2].output_shape[-1]/2,
                 model_topredict.layers[3].output_shape[-1],
                 model_topredict.layers[1].output_shape[-1]]

    # Architecture to return attention weights
    model_att = model.LSTMAttModel.create(inputtokens = max_length+1,
                                          vocabsize = vocab_size,
                                          lstmunits= int(best_arch[0]),
                                          denseunits = int(best_arch[1]),
                                          embedding = int(best_arch[2]),
                                          return_proba = True)

    print("Best model summary:\n")
    print(model_att.summary())
    print("\n")

    print("***Interpretation from the best model.***\n")
    model_att.load_weights(input_dir+'LSTMAtt_'+data_name+'_model.best_seed_'+str(selection_seed)+'.hdf5')
    model_att.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    smiles_toviz_x_enum_tokens_tointvec = token.int_vec_encode(tokenized_smiles_list= smiles_toviz_x_enum_tokens,
                                                               max_length = max_length+1,
                                                               vocab = tokens)

    # Sub-model stopping at the second-to-last layer (presumably the
    # per-token attention weights -- confirm against model.LSTMAttModel).
    intermediate_layer_model = Model(inputs=model_att.input,
                                     outputs=model_att.layers[-2].output)
    intermediate_output = intermediate_layer_model.predict(smiles_toviz_x_enum_tokens_tointvec)

    # Cumulative cardinalities shifted by one position (0 inserted in front);
    # entry `mols_id` is used below as the row index of that molecule's first
    # enumerated SMILES.
    smiles_toviz_x_card_cumsum_viz = np.cumsum(smiles_toviz_x_enum_card)
    smiles_toviz_x_card_cumsum_shift_viz = shift(smiles_toviz_x_card_cumsum_viz, 1, cval=0)

    mols_id = 0
    ienumcard = smiles_toviz_x_card_cumsum_shift_viz[mols_id]

    # --- 1D interpretation: heat map over the tokens -----------------------
    smiles_len_tmp = len(smiles_toviz_x_enum_tokens[ienumcard])
    # Keep the smiles_len_tmp-2 values matching tokens[1:-1]; the slice counts
    # from the end, which presumes the encoded vector is left-padded and that
    # the first/last tokens are delimiters -- TODO confirm.
    intermediate_output_tmp = intermediate_output[ienumcard,-smiles_len_tmp+1:-1].flatten().reshape(1,-1)
    max_intermediate_output_tmp = np.max(intermediate_output_tmp)

    plt.matshow(intermediate_output_tmp,
                cmap='Reds')
    plt.tick_params(axis='x', bottom = False)
    # One tick per displayed token, labelled with the token itself.
    plt.xticks(range(smiles_len_tmp-2),
               [int_to_token[iint].replace('pad','') \
                for iint in smiles_toviz_x_enum_tokens_tointvec[ienumcard,-smiles_len_tmp+1:-1]],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks([])
    plt.savefig(save_dir+'Interpretation_1D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
    #plt.show()

    # --- 2D interpretation: similarity map on the molecule -----------------
    smiles_tmp = smiles_toviz_x_enum[ienumcard]
    mol_tmp = Chem.MolFromSmiles(smiles_tmp)
    # Two-column table: token, value taken from the intermediate output.
    mol_df_tmp = pd.DataFrame([smiles_toviz_x_enum_tokens[ienumcard][1:-1],intermediate_output[ienumcard].\
                               flatten().\
                               tolist()[-smiles_len_tmp+1:-1]]).transpose()
    # Drop bond, branch and separator symbols and purely numeric tokens so
    # that the remaining rows can be passed as one weight per atom.
    bond = ['-','=','#','$','/','\\','.','(',')']
    mol_df_tmp = mol_df_tmp[~mol_df_tmp.iloc[:,0].isin(bond)]
    mol_df_tmp = mol_df_tmp[[not itoken.isdigit() for itoken in mol_df_tmp.iloc[:,0].values.tolist()]]

    minmaxscaler = MinMaxScaler(feature_range=(0,1))
    norm_weights = minmaxscaler.fit_transform(mol_df_tmp.iloc[:,1].values.reshape(-1,1)).flatten().tolist()
    fig = GetSimilarityMapFromWeights(mol=mol_tmp,
                                      size = (250,250),
                                      scale=-1,
                                      sigma=0.05,
                                      weights=norm_weights,
                                      colorMap='Reds',
                                      contourLines = 10,
                                      alpha = 0.25)
    fig.savefig(save_dir+'Interpretation_2D_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')
    #fig.show()

    # --- Prediction for the full SMILES -------------------------------------
    model_topredict.compile(loss="mse", optimizer='adam', metrics=[metrics.mae,metrics.mse])

    y_pred_test_tmp = model_topredict.predict(smiles_toviz_x_enum_tokens_tointvec[ienumcard].reshape(1,-1))[0,0]
    y_test_tmp = smiles_toviz_y_enum[ienumcard,0]
    # The prediction is mapped back through the scaler before printing.
    if not np.isnan(y_test_tmp):
        print("True value: {0:.2f} Predicted: {1:.2f}".format(y_test_tmp,
                                                    scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]))
    else:
        print("Predicted: {0:.2f}".format(scaler.inverse_transform(y_pred_test_tmp.reshape(1, -1))[0][0]))

    # --- Temporal interpretation: prediction on growing prefixes -----------
    diff_topred_list = list()
    diff_totrue_list = list()
    for csubsmiles in range(1,smiles_len_tmp):
        # Prefix of the tokenized SMILES, closed with a ' ' token.
        isubsmiles = smiles_toviz_x_enum_tokens[ienumcard][:csubsmiles]+[' ']
        isubsmiles_tointvec= token.int_vec_encode(tokenized_smiles_list = [isubsmiles],
                                                  max_length = max_length+1,
                                                  vocab = tokens)
        predict_prop_tmp = model_topredict.predict(isubsmiles_tointvec)[0,0]
        # Relative distance between the prefix and full-SMILES predictions.
        diff_topred_tmp = (predict_prop_tmp-y_pred_test_tmp)/np.abs(y_pred_test_tmp)
        diff_topred_list.append(diff_topred_tmp)
        # NOTE(review): predict_prop_tmp is the raw model output while
        # y_test_tmp comes straight from `data`, so this ratio appears to mix
        # scaled and unscaled values; it is NaN when the SMILES is not in
        # `data`. The list is not plotted below.
        diff_totrue_tmp = (predict_prop_tmp-y_test_tmp)/np.abs(y_test_tmp)
        diff_totrue_list.append(diff_totrue_tmp)
    max_diff_topred_tmp = np.max(diff_topred_list)
    max_diff_totrue_tmp = np.max(diff_totrue_list)

    plt.figure(figsize=(15,7))
    markers, stemlines, baseline = plt.stem([ix for ix in range(smiles_len_tmp-1)],
                                            diff_topred_list,
                                            'k.-',
                                             use_line_collection=True)
    plt.setp(baseline, color='k', linewidth=2, linestyle='--')
    plt.setp(markers, linewidth=1, marker='o', markersize=10, markeredgecolor = 'black')
    plt.setp(stemlines, color = 'k', linewidth=0.5, linestyle='-')
    plt.xticks(range(smiles_len_tmp-1),
               smiles_toviz_x_enum_tokens[ienumcard][:-1],
               fontsize = font_size,
               rotation = font_rotation)
    plt.yticks(fontsize = 20)
    plt.ylabel('Temporal relative distance', fontsize = 25, labelpad = 15)
    plt.savefig(save_dir+'Interpretation_temporal_'+data_name+'_seed_'+str(selection_seed)+'.png', bbox_inches='tight')