def get_loss(model, batch, inference_only=False):
    answer_lens = [len(a) for _, a in batch]
    questions = pad_seqs([q for q, _ in batch])
    answers = pad_seqs([a for _, a in batch])
    questions = Variable(torch.LongTensor(questions), volatile=inference_only).cuda()
    answers = Variable(torch.LongTensor(answers), volatile=inference_only).cuda()
    # embed both sequences, encode the question into a thought vector,
    # then decode the answer conditioned on it
    q_embedded = model.embedding(questions)
    a_embedded = model.embedding(answers)
    _, thought = model.encoder(q_embedded)
    decoder_output, _ = model.decoder(a_embedded, thought)
    loss = 0
    loss_fn = torch.nn.NLLLoss()
    batch_size = len(batch)
    for i in xrange(batch_size):
        # score predictions against the answer shifted by one position
        loss += loss_fn(decoder_output[i, :answer_lens[i] - 1],
                        answers[i, 1:answer_lens[i]])
    return loss / batch_size
def load_data(self, preprocess=False, stereochem=1., augment=1):
    all_mols = read_smiles_file(self.dataset)
    if preprocess:
        all_mols = preprocess_smiles(all_mols, stereochem)
    self.molecules = all_mols
    self.smiles = all_mols
    del all_mols
    print("%i molecules loaded from %s..." % (len(self.molecules), self.dataset))
    # +2 accounts for the start ('^') and end ('$') tokens added below
    self.maxlen = max([len(m) for m in self.molecules]) + 2
    print("Maximal sequence length: %i" % (self.maxlen - 2))
    if augment > 1:
        print("augmenting SMILES %i-fold..." % augment)
        augmented_mols = randomize_smileslist(self.molecules, num=augment)
        print("%i SMILES strings generated for %i molecules" %
              (len(augmented_mols), len(self.molecules)))
        self.smiles = self.molecules
        self.molecules = augmented_mols
        del augmented_mols
    self.padded = pad_seqs(["^%s$" % m for m in self.molecules], ' ', given_len=self.maxlen)
    self.n_mols = len(self.molecules)
    self.val_mols, self.train_mols = np.split(
        np.random.choice(range(self.n_mols), self.n_mols, replace=False),
        [int(self.validation * self.n_mols)])
    print("Using %i examples for training and %i for validation" %
          (len(self.train_mols), len(self.val_mols)))
    self.build_tokenizer()
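# Hedged sketch (assumption, not the original helper): load_data above calls
# pad_seqs with a pad character and a fixed target length. A minimal version
# consistent with that call signature could look like the following; the name
# pad_seqs_chars is hypothetical and numpy is assumed to be imported as np.
def pad_seqs_chars(seqs, pad_char=' ', given_len=None):
    target = given_len if given_len is not None else max(len(s) for s in seqs)
    # right-pad every string to the target length with the pad character
    return np.array([s + pad_char * (target - len(s)) for s in seqs])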
def _get_feed_dict(self, batch_words, batch_poses, batch_labels=None, training_flag=True):
    feed_dict = {}
    batch_pad_words, batch_words_len = pad_seqs(batch_words)
    batch_pad_poses, batch_poses_len = pad_seqs(batch_poses)
    feed_dict[self.word_inputs] = batch_pad_words
    feed_dict[self.pos_inputs] = batch_pad_poses
    feed_dict[self.batch_sequences_length] = batch_words_len
    if batch_labels:
        batch_pad_labels, _ = pad_seqs(batch_labels)
        feed_dict[self.targets] = batch_pad_labels
    if training_flag:
        feed_dict[self.keep_prob_pl] = self.keep_prob
    else:
        feed_dict[self.keep_prob_pl] = 1.0
    return feed_dict, batch_words_len
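# Hedged usage sketch (not from the original source): assuming the model also
# exposes a TF1 session and ops under the hypothetical names self.sess,
# self.train_op and self.loss, the feed dict built above could drive one
# training step roughly like this.
def _train_step(self, batch_words, batch_poses, batch_labels):
    feed_dict, _ = self._get_feed_dict(batch_words, batch_poses,
                                       batch_labels, training_flag=True)
    _, loss_value = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
    return loss_value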
def get_loss(model, batch, inference_only=False):
    answer_lens = [len(a) for _, a in batch]
    questions = pad_seqs([q for q, _ in batch])
    answers = pad_seqs([a for _, a in batch])
    questions = Variable(torch.LongTensor(questions), volatile=inference_only).cuda()
    answers = Variable(torch.LongTensor(answers), volatile=inference_only).cuda()
    batch_size = len(batch)
    hidden = init_hidden(model.num_layers, batch_size, model.hidden_size)
    _, encoder_hidden = model.encoder(questions)
    decoder_output, _ = model.decoder(answers, encoder_hidden, hidden)
    loss = 0
    loss_fn = torch.nn.NLLLoss()
    for i in xrange(batch_size):
        loss += loss_fn(decoder_output[i, :answer_lens[i] - 1],
                        answers[i, 1:answer_lens[i]])
    return loss / batch_size
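# Hedged sketch (assumption, not the original helper): both get_loss variants
# above expect pad_seqs to right-pad lists of token ids to a common length so
# the batch can be stacked into a LongTensor. A minimal version could look like
# this; the pad id 0 is an assumption.
def pad_seqs(seqs, pad_id=0):
    max_len = max(len(s) for s in seqs)
    # append pad tokens until every sequence has the same length
    return [list(s) + [pad_id] * (max_len - len(s)) for s in seqs]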
def train_model(self, n_sample=100):
    print("Training model...")
    writer = tf.compat.v1.summary.FileWriter('./logs/' + self.run_name, graph=tf.Graph())
    mol_file = open("./generated/" + self.run_name + "_generated.csv", 'a')
    i = 0
    while i < self.num_epochs:
        print("\n------ ITERATION %i ------" % i)
        self.set_lr(i)
        print("\nCurrent learning rate: %.5f" %
              tf.keras.backend.get_value(self.model.optimizer.lr))
        chkpntr = tf.keras.callbacks.ModelCheckpoint(
            filepath=self.checkpoint_dir + 'model_epoch_{:02d}.hdf5'.format(i), verbose=1)
        if self.validation:
            generator_train = DataGenerator(self.padded, self.train_mols, self.maxlen - 1,
                                            self.token_indices, self.step, self.batch_size)
            generator_val = DataGenerator(self.padded, self.val_mols, self.maxlen - 1,
                                          self.token_indices, self.step, self.batch_size)
            history = self.model.fit_generator(generator=generator_train, epochs=1,
                                               validation_data=generator_val,
                                               use_multiprocessing=self.multi,
                                               workers=self.workers, callbacks=[chkpntr])
            val_loss_sum = tf.Summary(value=[
                tf.Summary.Value(tag="val_loss",
                                 simple_value=history.history['val_loss'][-1])])
            writer.add_summary(val_loss_sum, i)
        else:
            generator = DataGenerator(self.padded, range(self.n_mols), self.maxlen - 1,
                                      self.token_indices, self.step, self.batch_size)
            history = self.model.fit_generator(generator=generator, epochs=1,
                                               use_multiprocessing=self.multi,
                                               workers=self.workers, callbacks=[chkpntr])

        # write losses to tensorboard log
        loss_sum = tf.Summary(value=[
            tf.Summary.Value(tag="loss", simple_value=history.history['loss'][-1])])
        writer.add_summary(loss_sum, i)
        lr_sum = tf.Summary(value=[
            tf.Summary.Value(tag="lr",
                             simple_value=tf.keras.backend.get_value(self.model.optimizer.lr))])
        writer.add_summary(lr_sum, i)

        if (i + 1) % self.sample_after == 0:
            valid_mols = self.sample_points(n_sample, self.temp)
            n_valid = len(valid_mols)
            if n_valid:
                print("Comparing novelty...")
                novel = np.array(compare_mollists(valid_mols, np.array(self.smiles), False))
                n_novel = float(len(set(novel))) / n_valid
                mol_file.write("\n----- epoch %i -----\n" % i)
                mol_file.write("\n".join(set(valid_mols)))
            else:
                novel = []
                n_novel = 0

            # write generated compound summary to tensorboard log
            valid_sum = tf.Summary(value=[
                tf.Summary.Value(tag="valid", simple_value=(float(n_valid) / n_sample))])
            novel_sum = tf.Summary(value=[
                tf.Summary.Value(tag="novel (of valid)", simple_value=n_novel)])
            writer.add_summary(valid_sum, i)
            writer.add_summary(novel_sum, i)
            print("\nValid:\t{}/{}".format(n_valid, n_sample))
            print("Unique:\t{}".format(len(set(valid_mols))))
            print("Novel:\t{}\n".format(len(novel)))

            if self.reinforce:
                # reinforce = add most similar generated compounds to training pool
                if len(novel) > (n_sample / 5):
                    if self.mw_filter:
                        # only consider molecules in given MW range
                        mw = np.array([Descriptors.MolWt(MolFromSmiles(s)) if MolFromSmiles(s)
                                       else 0 for s in novel])
                        mw_idx = np.where((int(self.mw_filter[0]) < mw) &
                                          (mw < int(self.mw_filter[1])))[0]
                        novel = np.array(novel)[mw_idx]
                    print("Calculating CATS similarities of novel generated molecules "
                          "to SMILES pool...")
                    fp_novel = cats_descriptor([MolFromSmiles(s) for s in novel])
                    if self.reference:
                        # if a reference mol(s) is given, calculate distance to that one
                        fp_train = cats_descriptor([MolFromSmiles(self.reference)])
                    else:
                        # else calculate the distance to all training mols
                        fp_train = cats_descriptor([MolFromSmiles(s) for s in self.smiles])
                    sims = parallel_pairwise_similarities(fp_novel, fp_train, metric='euclidean')
                    top = sims[range(len(novel)), np.argsort(sims, axis=1)[:, 0, 0]].flatten()
                    # take the three most similar novel molecules and add them to self.padded
                    print("Adding top 3 most similar but novel molecules to SMILES pool")
                    add = randomize_smileslist(novel[np.argsort(top)[:3]], num=3)
                    padd_add = pad_seqs(["^%s$" % m for m in add], ' ', given_len=self.maxlen)
                    self.padded = np.hstack((self.padded, padd_add))
                    self.padded = np.random.choice(self.padded, len(self.padded), False)  # shuffle
        i += 1  # next epoch
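# Hedged sketch (assumption, not the original class): DataGenerator used in
# train_model is not shown here. A minimal stand-in built on tf.keras.utils.Sequence
# that yields one-hot (input, next-character) pairs from the padded SMILES could
# look like this; the original's `step` windowing parameter is omitted and the
# class name DataGeneratorSketch is hypothetical.
class DataGeneratorSketch(tf.keras.utils.Sequence):
    def __init__(self, padded, indices, maxlen, token_indices, batch_size):
        self.padded = padded
        self.indices = list(indices)
        self.maxlen = maxlen
        self.token_indices = token_indices
        self.batch_size = batch_size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.indices) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        n_tokens = len(self.token_indices)
        x = np.zeros((len(batch), self.maxlen, n_tokens), dtype=np.float32)
        y = np.zeros((len(batch), self.maxlen, n_tokens), dtype=np.float32)
        for b, mol_idx in enumerate(batch):
            seq = self.padded[mol_idx]
            # input is the sequence shifted by one against the target
            for t, (char_in, char_out) in enumerate(zip(seq[:-1], seq[1:])):
                x[b, t, self.token_indices[char_in]] = 1.0
                y[b, t, self.token_indices[char_out]] = 1.0
        return x, y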