class FineTuner:
    """Fine-tune a pretrained generative SMILES model.

    All hyper-parameters are read from ``../experiments/<experiment_name>.ini``.
    The pretrained weights named by ``[FINETUNING] start_model`` are loaded as
    the starting point; :meth:`fine_tuning` then continues training on the
    data set named in the config, checkpointing the model, sampled molecules
    and training statistics after every epoch.
    """

    def __init__(self, experiment_name='ForwardRNN'):
        self._encoder = SMILESEncoder()

        # Read all parameters from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        # Pretrained model used as the starting point for fine-tuning
        self._start_model = self._config['FINETUNING']['start_model']

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)
        else:
            # Fail fast on a bad config instead of raising AttributeError
            # later when self._model is first used.
            raise ValueError('Unknown model type: ' + self._model_type)

        self._data = self._encoder.encode_from_file(self._file_name)

    def fine_tuning(self, stor_dir='../evaluation/', restart=False):
        '''Perform fine-tuning and store statistics.

        NOTE: The directory should be prepared with the correct name and model.
        NOTE: Molecules are not validated here; to sample molecules the
        Sampler should be used.

        :param stor_dir:  directory to store models, statistics and molecules
        :param restart:   if True, resume from the last completed epoch using
                          the files already present in stor_dir
        :return: None
        '''
        base_dir = stor_dir + '/' + self._experiment_name

        # Create output directories (exist_ok avoids the check-then-create race)
        os.makedirs(base_dir + '/models', exist_ok=True)
        os.makedirs(base_dir + '/statistic', exist_ok=True)
        os.makedirs(base_dir + '/molecules', exist_ok=True)

        # Labels are the index of the one-hot encoding at each position
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE with random generation:
        # first column stores the correct SMILES, the remaining columns store
        # SMILES with missing values.
        if self._model_type == 'NADE' and self._generation == 'random':
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Initialize from the pretrained model
        self._model.build(base_dir + '/' + self._start_model)

        # Accumulated per-epoch training statistics
        tot_stat = []

        # Fine-tuning uses a single fold only
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing statistics file
                tmp_stat_file = pd.read_csv(
                    base_dir + '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # If this epoch completed successfully before, reuse its
                # checkpoint and statistics; otherwise fall back to training.
                if check_model(self._model_type, self._experiment_name, stor_dir,
                               fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_stat_file.shape[0] > i:

                    # Load model checkpoint of this epoch
                    self._model.build(base_dir + '/models/model_fold_' +
                                      str(fold) + '_epochs_' + str(i))

                    # Fill statistics list and skip this epoch
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                    continue
                else:
                    restart = False

            # Train model (data reshaped from
            # (N_samples, N_augmentation, molecular_size, encoding_size)
            # to (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                self._data.reshape(-1, self._molecular_size, self._encoding_size),
                label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model checkpoint
            self._model.save(base_dir + '/models/model_fold_' + str(fold) +
                             '_epochs_' + str(i))

            # Sample new molecules at the configured temperature
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                base_dir + '/molecules/molecule_fold_' + str(fold) +
                '_epochs_' + str(i) + '.csv',
                header=None)

            # Store statistics (rewritten completely every epoch so a restart
            # always sees a consistent file)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                base_dir + '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)
class Sampler:
    """Sample SMILES strings from a trained generative model.

    Reads the training configuration from
    ``../experiments/<experiment_name>.ini`` and the training data (used for
    the novelty check) from ``../data/``.
    """

    def __init__(self, experiment_name):
        self._encoder = SMILESEncoder()

        # Read parameters used during training
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)
        else:
            # Fail fast on a bad config instead of raising AttributeError
            # later when self._model is first used.
            raise ValueError('Unknown model type: ' + self._model_type)

        # Read data used for training (needed for the novelty check)
        if os.path.isfile('../data/' + self._file_name + '.csv'):
            self._data = pd.read_csv(
                '../data/' + self._file_name + '.csv', header=None).values[:, 0]
        elif os.path.isfile('../data/' + self._file_name + '.tar.xz'):
            # Skip first line (empty) and last line (nan)
            self._data = pd.read_csv(
                '../data/' + self._file_name + '.tar.xz', compression='xz',
                header=None).values[1:-1, 0]
        else:
            # Previously this fell through silently and failed later with
            # AttributeError; report the real problem instead.
            raise FileNotFoundError(
                'No data file found for: ../data/' + self._file_name)

        # Clean data from start, end and padding tokens
        for i, mol_dat in enumerate(self._data):
            self._data[i] = clean_molecule(mol_dat, self._model_type)

    def sample(self, N=100, stor_dir='../evaluation', T=0.7, fold=(1,),
               epoch=(9,), valid=True, novel=True, unique=True, write_csv=True):
        '''Sample from a model where the number of novel valid unique molecules is fixed.

        :param N:         number of samples per (fold, epoch) pair
        :param stor_dir:  directory where the generated SMILES are saved
        :param T:         sampling temperature
        :param fold:      folds to use for sampling (any iterable of ints;
                          default (1,) — was the mutable default [1])
        :param epoch:     epochs to use for sampling (any iterable of ints;
                          default (9,) — was the mutable default [9])
        :param valid:     if True, only accept valid SMILES
        :param novel:     if True, only accept SMILES not in the training data
        :param unique:    if True, only accept SMILES not sampled before
        :param write_csv: if True, the generated SMILES are written to stor_dir
        :return: res_molecules: list (one entry per fold/epoch) of lists of SMILES
        '''
        res_molecules = []
        print('Sampling: started')

        # Hoist the novelty reference out of the rejection loop: membership in
        # a set is O(1) versus an O(n) scan of the numpy array per candidate.
        training_molecules = set(self._data) if novel else None

        for f in fold:
            for e in epoch:
                self._model.build(stor_dir + '/' + self._experiment_name +
                                  '/models/model_fold_' + str(f) +
                                  '_epochs_' + str(e))

                new_molecules = []
                seen = set()  # O(1) uniqueness check alongside the ordered list
                while len(new_molecules) < N:
                    new_mol = self._encoder.decode(
                        self._model.sample(self._starting_token, T))

                    # Remove remains from generation
                    new_mol = clean_molecule(new_mol[0], self._model_type)

                    # If not valid, get new molecule
                    if valid and not check_valid(new_mol):
                        continue

                    # If not unique, get new molecule
                    if unique and (new_mol in seen):
                        continue

                    # If not novel, get new molecule
                    if novel and (new_mol in training_molecules):
                        continue

                    # All conditions passed: accept the molecule
                    new_molecules.append(new_mol)
                    seen.add(new_mol)

                # Prepare file name encoding the sampling settings
                name = 'molecules_fold_' + str(f) + '_epochs_' + str(
                    e) + '_T_' + str(T) + '_N_' + str(N) + '.csv'
                if unique:
                    name = 'unique_' + name
                if valid:
                    name = 'valid_' + name
                if novel:
                    name = 'novel_' + name

                # Store final molecules
                if write_csv:
                    # exist_ok avoids the check-then-create race
                    os.makedirs(stor_dir + '/' + self._experiment_name +
                                '/molecules/', exist_ok=True)
                    mol = np.array(new_molecules).reshape(-1)
                    pd.DataFrame(mol).to_csv(
                        stor_dir + '/' + self._experiment_name + '/molecules/' +
                        name, header=None)

                res_molecules.append(new_molecules)

        print('Sampling: done')
        return res_molecules
class Trainer():
    # Trains a generative SMILES model from scratch. Hyper-parameters are read
    # from ../experiments/<experiment_name>.ini; three training modes are
    # offered: complete_run (no validation), single_run (single train/test
    # split) and cross_validation (KFold over self._n_folds folds). All modes
    # checkpoint the model, sampled molecules and statistics every epoch and
    # can resume from those files via restart=True.

    def __init__(self, experiment_name='ForwardRNN'):
        self._encoder = SMILESEncoder()

        # Read all parameter from the .ini file
        self._config = configparser.ConfigParser()
        self._config.read('../experiments/' + experiment_name + '.ini')

        self._model_type = self._config['MODEL']['model']
        self._experiment_name = experiment_name
        self._hidden_units = int(self._config['MODEL']['hidden_units'])

        self._file_name = '../data/' + self._config['DATA']['data']
        self._encoding_size = int(self._config['DATA']['encoding_size'])
        self._molecular_size = int(self._config['DATA']['molecular_size'])

        self._epochs = int(self._config['TRAINING']['epochs'])
        self._n_folds = int(self._config['TRAINING']['n_folds'])
        self._learning_rate = float(self._config['TRAINING']['learning_rate'])
        self._batch_size = int(self._config['TRAINING']['batch_size'])

        self._samples = int(self._config['EVALUATION']['samples'])
        self._T = float(self._config['EVALUATION']['temp'])
        self._starting_token = self._encoder.encode(
            [self._config['EVALUATION']['starting_token']])

        # Construct the model named in the config.
        # NOTE(review): an unrecognized model type leaves self._model unset and
        # only fails later with AttributeError — confirm whether that is intended.
        if self._model_type == 'FBRNN':
            self._model = FBRNN(self._molecular_size, self._encoding_size,
                                self._learning_rate, self._hidden_units)
        elif self._model_type == 'ForwardRNN':
            self._model = ForwardRNN(self._molecular_size, self._encoding_size,
                                     self._learning_rate, self._hidden_units)
        elif self._model_type == 'BIMODAL':
            self._model = BIMODAL(self._molecular_size, self._encoding_size,
                                  self._learning_rate, self._hidden_units)
        elif self._model_type == 'NADE':
            self._generation = self._config['MODEL']['generation']
            self._missing_token = self._encoder.encode(
                [self._config['TRAINING']['missing_token']])
            self._model = NADE(self._molecular_size, self._encoding_size,
                               self._learning_rate, self._hidden_units,
                               self._generation, self._missing_token)

        # One-hot encoded SMILES data
        self._data = self._encoder.encode_from_file(self._file_name)

    def complete_run(self, stor_dir='../evaluation/', restart=False):
        '''Training without validation on complete data

        :param stor_dir: directory to store models, statistics and molecules
        :param restart:  if True, resume from already-completed epochs on disk
        '''
        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')

        # Compute labels (index of the one-hot encoding at each position)
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if self._model_type == 'NADE' and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES
            # with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Build model (fresh weights — no checkpoint path)
        self._model.build()

        # Store total Statistics
        tot_stat = []

        # only single fold
        fold = 1

        # Shuffle data before training (Data reshaped from
        # (N_samples, N_augmentation, molecular_size, encoding_size)
        # to (all_SMILES, molecular_size, encoding_size))
        self._data, label = shuffle(
            self._data.reshape(-1, self._molecular_size, self._encoding_size),
            label.reshape(-1, self._molecular_size))

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            # With restart read existing files
            if restart:
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check if current epoch is successfully completed else continue
                # with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir,
                               fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model
            statistic = self._model.train(self._data, label, epochs=1,
                                          batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistic (rewritten completely every epoch so a restart
            # always sees a consistent file)
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

    def single_run(self, stor_dir='../evaluation/', restart=False):
        '''Training with validation and store data

        Uses a single 80/20 train/test split (random_state=1 for
        reproducibility) and records the validation loss each epoch.

        :param stor_dir: directory to store models, statistics and molecules
        :param restart:  if True, resume from already-completed epochs on disk
        '''
        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/validation')

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        # NOTE(review): this branch also accepts 'NADE_v2', but __init__ and
        # cross_validation only handle 'NADE' — confirm whether 'NADE_v2' is
        # still a supported model type.
        if (self._model_type == 'NADE' or self._model_type == 'NADE_v2') and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES
            # with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        train_data, test_data, train_label, test_label = train_test_split(
            self._data, label, test_size=1. / 5, random_state=1, shuffle=True)

        # Build model
        self._model.build()

        # Store total Statistics
        tot_stat = []

        # Store validation loss
        tot_loss = []

        # only single fold
        fold = 1

        for i in range(self._epochs):
            print('Fold:', fold)
            print('Epoch:', i)

            if restart:
                # Read existing files
                tmp_val_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()
                tmp_stat_file = pd.read_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None).to_numpy()

                # Check if current epoch is successfully completed else continue
                # with normal training
                if check_model(self._model_type, self._experiment_name, stor_dir,
                               fold, i) and check_molecules(
                                   self._experiment_name, stor_dir, fold,
                                   i) and tmp_val_file.shape[
                                       0] > i and tmp_stat_file.shape[0] > i:

                    # Load model
                    self._model.build(stor_dir + '/' + self._experiment_name +
                                      '/models/model_fold_' + str(fold) +
                                      '_epochs_' + str(i))

                    # Fill statistic and loss list
                    tot_stat.append(tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                    tot_loss.append(tmp_val_file[i, 1])

                    # Skip this epoch
                    continue

                # Continue with normal training
                else:
                    restart = False

            # Train model (Data reshaped from
            # (N_samples, N_augmentation, molecular_size, encoding_size)
            # to (all_SMILES, molecular_size, encoding_size))
            statistic = self._model.train(
                train_data.reshape(-1, self._molecular_size, self._encoding_size),
                train_label.reshape(-1, self._molecular_size),
                epochs=1,
                batch_size=self._batch_size)
            tot_stat.append(statistic.tolist())

            # Store model
            self._model.save(stor_dir + '/' + self._experiment_name +
                             '/models/model_fold_' + str(fold) + '_epochs_' +
                             str(i))

            # Test model on validation set
            tot_loss.append(
                self._model.validate(
                    test_data.reshape(-1, self._molecular_size,
                                      self._encoding_size),
                    test_label.reshape(-1, self._molecular_size)))

            # Sample new molecules
            new_molecules = []
            for s in range(self._samples):
                mol = self._encoder.decode(
                    self._model.sample(self._starting_token, self._T))
                new_molecules.append(clean_molecule(mol[0], self._model_type))

            # Store new molecules
            new_molecules = np.array(new_molecules)
            pd.DataFrame(new_molecules).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/molecules/molecule_fold_' + str(fold) + '_epochs_' + str(i) +
                '.csv',
                header=None)

            # Store statistic
            store_stat = np.array(tot_stat).reshape(i + 1, -1)
            pd.DataFrame(np.array(store_stat)).to_csv(
                stor_dir + '/' + self._experiment_name +
                '/statistic/stat_fold_' + str(fold) + '.csv',
                header=None)

            # Store validation data
            pd.DataFrame(np.array(tot_loss).reshape(
                -1, 1)).to_csv(stor_dir + '/' + self._experiment_name +
                               '/validation/val_fold_' + str(fold) + '.csv',
                               header=None)

    def cross_validation(self, stor_dir='../evaluation/', restart=False):
        '''Perform cross-validation and store data

        Runs self._n_folds KFold splits (shuffled, random_state=2 for
        reproducibility); each fold trains a freshly built model.

        :param stor_dir: directory to store models, statistics and molecules
        :param restart:  if True, resume from already-completed epochs on disk
        '''
        # Create directories
        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/models'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/models')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/molecules'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/molecules')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/statistic'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/statistic')

        if not os.path.exists(stor_dir + '/' + self._experiment_name + '/validation'):
            os.makedirs(stor_dir + '/' + self._experiment_name + '/validation')

        self._kf = KFold(n_splits=self._n_folds, shuffle=True, random_state=2)

        # Count iterations
        fold = 0

        # Compute labels
        label = np.argmax(self._data, axis=-1).astype(int)

        # Special preprocessing in the case of NADE
        if (self._model_type == 'NADE') and self._generation == 'random':
            # First column stores correct SMILES and second column stores SMILES
            # with missing values
            label = np.argmax(self._data[:, 0], axis=-1).astype(int)
            aug = self._data.shape[1] - 1
            label = np.repeat(label[:, np.newaxis, :], aug, axis=1)
            self._data = self._data[:, 1:]

        # Split data into train and test data
        for train, test in self._kf.split(self._data):

            # Shuffle index within test and train set
            np.random.shuffle(train)
            np.random.shuffle(test)

            fold += 1

            # Fresh model for every fold
            self._model.build()

            # Store total statistics
            tot_stat = []

            # Store validation loss
            tot_loss = []

            for i in range(self._epochs):
                print('Fold:', fold)
                print('Epoch:', i)

                if restart:
                    tmp_val_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/validation/val_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()
                    tmp_stat_file = pd.read_csv(
                        stor_dir + '/' + self._experiment_name +
                        '/statistic/stat_fold_' + str(fold) + '.csv',
                        header=None).to_numpy()

                    # Check if current epoch is successfully completed else
                    # continue with normal training
                    if check_model(
                            self._model_type, self._experiment_name, stor_dir,
                            fold, i) and check_molecules(
                                self._experiment_name, stor_dir, fold,
                                i) and tmp_val_file.shape[
                                    0] > i and tmp_stat_file.shape[0] > i:

                        # Load model
                        self._model.build(stor_dir + '/' +
                                          self._experiment_name +
                                          '/models/model_fold_' + str(fold) +
                                          '_epochs_' + str(i))

                        # Fill statistic and loss list
                        tot_stat.append(
                            tmp_stat_file[i, 1:].reshape(1, -1).tolist())
                        tot_loss.append(tmp_val_file[i, 1])

                        # Skip this epoch
                        continue

                    else:
                        restart = False

                # Train model (Data reshaped from
                # (N_samples, N_augmentation, molecular_size, encoding_size)
                # to (all_SMILES, molecular_size, encoding_size))
                statistic = self._model.train(
                    self._data[train].reshape(-1, self._molecular_size,
                                              self._encoding_size),
                    label[train].reshape(-1, self._molecular_size),
                    epochs=1,
                    batch_size=self._batch_size)
                tot_stat.append(statistic.tolist())

                # Store model
                self._model.save(stor_dir + '/' + self._experiment_name +
                                 '/models/model_fold_' + str(fold) +
                                 '_epochs_' + str(i))

                # Test model on validation set
                tot_loss.append(
                    self._model.validate(
                        self._data[test].reshape(-1, self._molecular_size,
                                                 self._encoding_size),
                        label[test].reshape(-1, self._molecular_size)))

                # Sample new molecules
                new_molecules = []
                for s in range(self._samples):
                    mol = self._encoder.decode(
                        self._model.sample(self._starting_token, self._T))
                    new_molecules.append(
                        clean_molecule(mol[0], self._model_type))

                # Store new molecules
                new_molecules = np.array(new_molecules)
                pd.DataFrame(new_molecules).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/molecules/molecule_fold_' + str(fold) + '_epochs_' +
                    str(i) + '.csv',
                    header=None)

                # Store statistic
                store_stat = np.array(tot_stat).reshape(i + 1, -1)
                pd.DataFrame(np.array(store_stat)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/statistic/stat_fold_' + str(fold) + '.csv',
                    header=None)

                # Store validation data
                pd.DataFrame(np.array(tot_loss).reshape(-1, 1)).to_csv(
                    stor_dir + '/' + self._experiment_name +
                    '/validation/val_fold_' + str(fold) + '.csv',
                    header=None)