def __init__(self, experiment_name):
        '''Load the settings of an experiment and the cleaned training molecules
        :param experiment_name:  Name of the experiment; the settings are read
                                 from '../experiments/<experiment_name>.ini'
        :raises FileNotFoundError: If the configuration file cannot be read or
                                 if neither '../data/<data>.csv' nor
                                 '../data/<data>.tar.xz' exists
        '''

        # Read parameter used during training
        config_path = '../experiments/' + experiment_name + '.ini'
        self._config = configparser.ConfigParser()
        # ConfigParser.read() skips files it cannot open and returns only the
        # names it parsed, so an empty result means the file is missing.
        if not self._config.read(config_path):
            raise FileNotFoundError('Configuration file not found: ' +
                                    config_path)

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._config['EVALUATION']['starting_token']

        # Load the first column of the data file (plain csv preferred over
        # the xz-compressed variant)
        data_path = '../data/' + self._file_name
        if os.path.isfile(data_path + '.csv'):
            self._data = pd.read_csv(data_path + '.csv',
                                     header=None).values[:, 0]
        elif os.path.isfile(data_path + '.tar.xz'):
            # Skip first line since empty and last line since nan
            self._data = pd.read_csv(data_path + '.tar.xz',
                                     compression='xz',
                                     header=None).values[1:-1, 0]
        else:
            # Without this branch self._data would stay undefined and the
            # loop below would fail with an unrelated AttributeError.
            raise FileNotFoundError('No data file found: ' + data_path +
                                    '.csv or ' + data_path + '.tar.xz')

        # Clean data from start, end and padding token
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)
    def sample(self,
               N=100,
               stor_dir='../evaluation',
               T=0.7,
               fold=(1,),
               epoch=(9,),
               valid=True,
               novel=True,
               unique=True,
               write_csv=True):
        '''Sample from a model where the number of novel valid unique molecules is fixed
        NOTE: Sampling for one model only stops once N accepted molecules
              were collected; there is no upper bound on the number of tries.
        :param stor_dir:    directory where the generated SMILES are saved
        :param N:        number of samples
        :param T:        Temperature
        :param fold:     Folds to use for sampling
        :param epoch:    Epochs to use for sampling
        :param valid:    If True, only accept valid SMILES
        :param novel:    If True, only accept novel SMILES
        :param unique:   If True, only accept unique SMILES
        :param write_csv If True, the generated SMILES are written in stor_dir
        :return: res_molecules: list with one list of generated SMILES per
                 (fold, epoch) combination, ordered by fold and then by epoch
        '''

        experiment_dir = stor_dir + '/' + self._experiment_name
        molecule_dir = experiment_dir + '/molecules/'

        res_molecules = []
        print('Sampling: started')
        for f in fold:
            for e in epoch:
                self._model.build(experiment_dir + '/models/model_fold_' +
                                  str(f) + '_epochs_' + str(e))

                new_molecules = []
                # Mirror of new_molecules for constant-time duplicate checks
                seen = set()
                while len(new_molecules) < N:
                    new_mol = self._encoder.decode(
                        self._model.sample(self._starting_token, T))

                    # Remove remains from generation
                    new_mol = clean_molecule(new_mol[0], self._model_type)

                    # If not valid, get new molecule
                    if valid and not check_valid(new_mol):
                        continue

                    # If not unique, get new molecule
                    if unique and new_mol in seen:
                        continue

                    # If not novel, get new molecule
                    if novel and (new_mol in self._data):
                        continue

                    # If all conditions checked, add new molecule
                    new_molecules.append(new_mol)
                    seen.add(new_mol)

                # Prepare name for file
                name = ('molecules_fold_' + str(f) + '_epochs_' + str(e) +
                        '_T_' + str(T) + '_N_' + str(N) + '.csv')
                if unique:
                    name = 'unique_' + name
                if valid:
                    name = 'valid_' + name
                if novel:
                    name = 'novel_' + name

                # Store final molecules
                if write_csv:
                    os.makedirs(molecule_dir, exist_ok=True)
                    mol = np.array(new_molecules).reshape(-1)
                    pd.DataFrame(mol).to_csv(molecule_dir + name, header=None)

                # Collect the result of every model, not only of the last
                # epoch of a fold
                res_molecules.append(new_molecules)

        print('Sampling: done')
        return res_molecules
    # Example #3
    # 0
    def fine_tuning(self, stor_dir='../evaluation/', restart=False):
        '''Perform fine-tuning (single fold) and store models, molecules and statistic
        NOTE: The start model is loaded from
              stor_dir/<experiment_name>/<start_model>, so this directory has
              to be prepared with the correct name and model
        NOTE: No validation is performed. After every epoch the model is
              saved and self._samples molecules are sampled and stored
        :param stor_dir:    directory to store data
        :param restart:     If True, epochs for which check_model and
                            check_molecules succeed and a row exists in the
                            stored statistic are not trained again; their
                            model and statistic are loaded instead. If no
                            statistic file exists, training starts from
                            scratch
        :return:
        '''

        experiment_dir = stor_dir + '/' + self._experiment_name

        # Create directories
        for sub_dir in ('/models', '/statistic', '/molecules'):
            os.makedirs(experiment_dir + sub_dir, exist_ok=True)

        # Work on a local reference: overwriting self._data would strip a
        # further column on every additional call
        data = self._data

        # Compute labels
        if self._model_type == 'NADE' and self._generation == 'random':
            # Special preprocessing in the case of NADE random:
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(data[:, 0], axis=-1).astype(int)
            aug = data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            data = data[:, 1:]
        else:
            label = np.argmax(data, axis=-1).astype(int)

        # Build model
        self._model.build(experiment_dir + '/' + self._start_model)

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        model_prefix = experiment_dir + '/models/model_fold_' + str(fold) + '_epochs_'
        stat_file = experiment_dir + '/statistic/stat_fold_' + str(fold) + '.csv'

        # Read existing statistic once: the file is not modified as long as
        # epochs are skipped. A missing file means that no epoch was
        # completed, hence normal training is used.
        tmp_stat_file = None
        if restart:
            if os.path.isfile(stat_file):
                tmp_stat_file = pd.read_csv(stat_file, header=None).to_numpy()
            else:
                restart = False

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Check if current epoch is successfully completed else continue with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir, fold, i) and check_molecules(
                        self._experiment_name, stor_dir, fold, i) and tmp_stat_file.shape[0] > i:
                    # Load model
                    self._model.build(model_prefix + str(i))

                    # Fill statistic list (first column of the file is the index)
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())

                    # Skip this epoch
                    continue

                restart = False

            # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
            # to  (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(data.reshape(-1, self._molecular_size, self._encoding_size),
                                          label.reshape(-1, self._molecular_size), epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(model_prefix + str(i))

            # Sample new molecules
            new_molecules = []
            for _ in range(self._samples):
                mol = self._encoder.decode(self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            pd.DataFrame(np.array(new_molecules)).to_csv(
                experiment_dir + '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) + '.csv',
                header=None)

            # Store statistic (one row per epoch completed so far)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(store_stat).to_csv(stat_file, header=None)
    def cross_validation(self, stor_dir='../evaluation/', restart=False):
        '''Perform cross-validation and store models, molecules, statistic and validation loss
        The data is split into self._n_folds folds. For every fold a new
        model is built and trained for self._epochs epochs; after each epoch
        the model is saved, evaluated on the test split of the fold and
        self._samples molecules are sampled and stored.
        :param stor_dir:    directory to store data
        :param restart:     If True, epochs for which check_model and
                            check_molecules succeed and a row exists in the
                            stored statistic and validation files are not
                            trained again; their model and values are loaded
                            instead. If these files do not exist for a fold,
                            training continues normally from this fold on
        :return:
        '''

        experiment_dir = stor_dir + '/' + self._experiment_name

        # Create directories
        for sub_dir in ('/models', '/molecules', '/statistic', '/validation'):
            os.makedirs(experiment_dir + sub_dir, exist_ok=True)

        self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

        # Work on a local reference: overwriting self._data would strip a
        # further column on every additional call
        data = self._data

        # Compute labels
        if (self._model_type == 'NADE') and self._generation == 'random':
            # Special preprocessing in the case of NADE:
            # First column stores correct SMILES and second column stores SMILES with missing values
            label = np.argmax(data[:, 0], axis=-1).astype(int)
            aug = data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            data = data[:, 1:]
        else:
            label = np.argmax(data, axis=-1).astype(int)

        # Split data into train and test data (folds are counted from 1)
        for fold, (train, test) in enumerate(self._kf.split(data), start=1):

            # Shuffle index within test and train set
            np.random.shuffle(train)
            np.random.shuffle(test)

            self._model.build()

            # Store total statistics
            tot_stat = []

            # Store validation loss
            tot_loss = []

            model_prefix = (experiment_dir + '/models/model_fold_' +
                            str(fold) + '_epochs_')
            val_file = (experiment_dir + '/validation/val_fold_' + str(fold) +
                        '.csv')
            stat_file = (experiment_dir + '/statistic/stat_fold_' +
                         str(fold) + '.csv')

            # Read existing files once per fold: they are not modified as
            # long as epochs are skipped. Missing files mean that no epoch of
            # this fold was completed, hence normal training is used.
            tmp_val_file = None
            tmp_stat_file = None
            if restart:
                if os.path.isfile(val_file) and os.path.isfile(stat_file):
                    tmp_val_file = pd.read_csv(val_file,
                                               header=None).to_numpy()
                    tmp_stat_file = pd.read_csv(stat_file,
                                                header=None).to_numpy()
                else:
                    restart = False

            for i in range(self._epochs):
                print('Fold:', fold)
                print('Epoch:', i)

                if restart:
                    # Check if current epoch is successfully completed else continue with normal training
                    if (check_model(self._model_type, self._experiment_name,
                                    stor_dir, fold, i)
                            and check_molecules(self._experiment_name,
                                                stor_dir, fold, i)
                            and tmp_val_file.shape[0] > i
                            and tmp_stat_file.shape[0] > i):

                        # Load model
                        self._model.build(model_prefix + str(i))

                        # Fill statistic and loss list (first column of the
                        # files is the index)
                        tot_stat.append(
                            tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                        tot_loss.append(tmp_val_file[i, 1])

                        # Skip this epoch
                        continue

                    restart = False

                # Train model (Data reshaped from (N_samples, N_augmentation, molecular_size, encoding_size)
                # to  (all_SMILES, molecular_size, encoding_size))
                statistic = self._model.train(
                    data[train].reshape(-1, self._molecular_size,
                                        self._encoding_size),
                    label[train].reshape(-1, self._molecular_size),
                    epochs=1,
                    batch_size=self._batch_size)

                tot_stat.append(statistic.tolist())

                # Store model
                self._model.save(model_prefix + str(i))

                # Test model on validation set
                tot_loss.append(
                    self._model.validate(
                        data[test].reshape(-1, self._molecular_size,
                                           self._encoding_size),
                        label[test].reshape(-1, self._molecular_size)))

                # Sample new molecules
                new_molecules = []
                for _ in range(self._samples):
                    mol = self._encoder.decode(
                        self._model.sample(self._starting_token, self._T))
                    new_molecules.append(
                        clean_molecule(mol[0], self._model_type))

                # Store new molecules
                pd.DataFrame(np.array(new_molecules)).to_csv(
                    experiment_dir + '/molecules/molecule_fold_' + str(fold) +
                    '_epochs_' + str(i) + '.csv',
                    header=None)

                # Store statistic (one row per epoch completed so far)
                store_stat = np.array(tot_stat).reshape(i + 1, -1)
                pd.DataFrame(store_stat).to_csv(stat_file, header=None)

                # Store validation data
                pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                    val_file, header=None)
    def eval_molecule(self, stor_dir='.'):
        '''Plot percentage of novel, valid and unique SMILES
        For every fold and epoch the sampled molecules are read from
        stor_dir/<experiment_name>/molecules/. The fractions are relative to
        self._samples and nested: valid is computed on the unique molecules
        and novel on the valid unique molecules. Mean and standard deviation
        over the folds are printed, plotted (the figure is saved and shown)
        and written to <experiment_name>_data.csv in the same directory.
        :param stor_dir:    directory containing the experiment folder
        :return:
        '''

        molecule_dir = stor_dir + '/' + self._experiment_name + '/molecules/'

        valid = np.zeros((self._n_folds, self._epochs))
        unique = np.zeros((self._n_folds, self._epochs))
        novel = np.zeros((self._n_folds, self._epochs))

        for i in range(self._n_folds):
            for j in range(self._epochs):

                # Folds are stored starting from 1, epochs starting from 0;
                # column 0 of the file is the index, column 1 the SMILES
                mol = pd.read_csv(molecule_dir + 'molecule_fold_' +
                                  str(i + 1) + '_epochs_' + str(j) + '.csv',
                                  header=None).values[:, 1].astype(str)

                # Remove padding
                for k, m in enumerate(mol):
                    mol[k] = clean_molecule(m, self._model_type)

                # Remove duplicates
                unique_mol = np.array(list(set(mol)))

                # Compute unique molecules
                unique[i, j] = len(unique_mol) / self._samples

                # Check validity and remove non-valid molecules
                valid_mol = [m for m in unique_mol if check_valid(m)]
                valid[i, j] = len(valid_mol) / self._samples

                # Compute molecules unequal to training data
                # (novel stays 0 if there is no valid molecule)
                if valid_mol:
                    new_m = self.check_with_training_data(valid_mol)
                    novel[i, j] = len(new_m) / self._samples

        # Get percentage
        unique *= 100
        novel *= 100
        valid *= 100

        # Get mean values
        mean_unique = np.mean(unique, axis=0)
        mean_valid = np.mean(valid, axis=0)
        mean_novel = np.mean(novel, axis=0)

        # Get standard deviation
        std_unique = np.std(unique, axis=0)
        std_valid = np.std(valid, axis=0)
        std_novel = np.std(novel, axis=0)

        print(mean_unique)
        print(mean_valid)
        print(mean_novel)

        # Plot (epochs are shown starting from 1)
        epoch_axis = np.arange(1, self._epochs + 1)
        plt.figure(1)
        plt.errorbar(epoch_axis,
                     mean_unique,
                     yerr=std_unique,
                     capsize=3,
                     label='unique')
        plt.errorbar(epoch_axis,
                     mean_valid,
                     yerr=std_valid,
                     capsize=3,
                     label='valid & unique')
        plt.errorbar(epoch_axis,
                     mean_novel,
                     yerr=std_novel,
                     capsize=3,
                     label='novel, valid & unique',
                     linestyle=':')
        plt.yticks(np.arange(0, 110, step=10))
        plt.legend()
        plt.ylim(0, 105)
        plt.title('SMILES T=' + str(self._T))
        plt.ylabel('% SMILES')
        plt.xlabel('Epoch')
        plt.savefig(molecule_dir + 'novel_valid_unique_molecules.png')

        # Store data next to the figure. The path has to include stor_dir,
        # otherwise the file is written relative to the working directory
        # whenever stor_dir differs from '.'
        data = np.vstack((mean_unique, std_unique, mean_valid, std_valid,
                          mean_novel, std_novel))
        pd.DataFrame(data).to_csv(molecule_dir + self._experiment_name +
                                  '_data.csv')

        plt.show()