Example 1
import torch
from torch.autograd import Variable  # legacy pre-0.4 PyTorch API (matches the volatile flag below)


def get_loss(model, batch, inference_only=False):
    # batch is a list of (question, answer) token-id sequences
    answer_lens = [len(a) for _, a in batch]

    # pad questions and answers to a common length within the batch
    questions = pad_seqs([q for q, _ in batch])
    answers = pad_seqs([a for _, a in batch])

    # volatile=True turns off autograd history for pure inference (legacy PyTorch flag)
    questions = Variable(torch.LongTensor(questions),
                         volatile=inference_only).cuda()
    answers = Variable(torch.LongTensor(answers),
                       volatile=inference_only).cuda()

    q_embedded = model.embedding(questions)
    a_embedded = model.embedding(answers)

    _, thought = model.encoder(q_embedded)
    decoder_output, _ = model.decoder(a_embedded, thought)

    loss = 0
    loss_fn = torch.nn.NLLLoss()
    batch_size = len(batch)
    # per-sequence NLL: positions 0..len-2 predict tokens 1..len-1
    for i in range(batch_size):
        loss += loss_fn(decoder_output[i, :answer_lens[i] - 1],
                        answers[i, 1:answer_lens[i]])

    return loss / batch_size
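
pad_seqs is a project-local helper that this page collects usages of; its definition is not shown here. For the way Examples 1 and 4 call it (a list of token-id sequences in, equal-length sequences out), a minimal sketch might look like the following; the function body and the zero padding value are assumptions, not the project's actual implementation:

def pad_seqs(seqs, pad_value=0):
    # hypothetical sketch: right-pad every sequence with pad_value
    # so that all sequences share the length of the longest one
    max_len = max(len(s) for s in seqs)
    return [list(s) + [pad_value] * (max_len - len(s)) for s in seqs]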
Example 2
 def load_data(self, preprocess=False, stereochem=1., augment=1):
     all_mols = read_smiles_file(self.dataset)
     if preprocess:
         all_mols = preprocess_smiles(all_mols, stereochem)
     self.molecules = all_mols
     self.smiles = all_mols
     del all_mols
     print("%i molecules loaded from %s..." %
           (len(self.molecules), self.dataset))
     # +2 accounts for the "^" start and "$" end tokens added before padding
     self.maxlen = max([len(m) for m in self.molecules]) + 2
     print("Maximal sequence length: %i" % (self.maxlen - 2))
     if augment > 1:
         print("augmenting SMILES %i-fold..." % augment)
         augmented_mols = randomize_smileslist(self.molecules, num=augment)
         print("%i SMILES strings generated for %i molecules" %
               (len(augmented_mols), len(self.molecules)))
         self.smiles = self.molecules
         self.molecules = augmented_mols
         del augmented_mols
     # wrap every SMILES in start/end tokens and pad with spaces to maxlen
     self.padded = pad_seqs(["^%s$" % m for m in self.molecules],
                            ' ',
                            given_len=self.maxlen)
     self.n_mols = len(self.molecules)
     # random split of molecule indices into validation and training sets
     self.val_mols, self.train_mols = np.split(
         np.random.choice(range(self.n_mols), self.n_mols, replace=False),
         [int(self.validation * self.n_mols)])
     print("Using %i examples for training and %i for validation" %
           (len(self.train_mols), len(self.val_mols)))
     self.build_tokenizer()
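
Here (and again in Example 5) pad_seqs is called with a padding character and a given_len keyword, i.e. it pads strings rather than token-id lists. A hedged sketch of such a variant follows; the fallback to the longest string when given_len is omitted is an assumption:

def pad_seqs(seqs, pad_char=' ', given_len=None):
    # hypothetical sketch: right-pad each string with pad_char up to given_len
    # (or up to the longest string if given_len is not provided)
    target_len = given_len if given_len is not None else max(len(s) for s in seqs)
    return [s + pad_char * (target_len - len(s)) for s in seqs]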
Example 3
 def _get_feed_dict(self,
                    batch_words,
                    batch_poses,
                    batch_labels=None,
                    training_flag=True):
     feed_dict = {}
     # pad word and POS-tag sequences to a common length;
     # pad_seqs also returns the true (unpadded) sequence lengths
     batch_pad_words, batch_words_len = pad_seqs(batch_words)
     batch_pad_poses, batch_poses_len = pad_seqs(batch_poses)
     feed_dict[self.word_inputs] = batch_pad_words
     feed_dict[self.pos_inputs] = batch_pad_poses
     feed_dict[self.batch_sequences_length] = batch_words_len
     if batch_labels:
         batch_pad_labels, _ = pad_seqs(batch_labels)
         feed_dict[self.targets] = batch_pad_labels
     if training_flag:
         feed_dict[self.keep_prob_pl] = self.keep_prob
     else:
         feed_dict[self.keep_prob_pl] = 1.0  # no dropout at evaluation time
     return feed_dict, batch_words_len
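
In this example pad_seqs returns a (padded, lengths) pair, since the true sequence lengths are fed to the model alongside the padded batch. A minimal sketch of that signature, with zero padding assumed:

def pad_seqs(seqs, pad_value=0):
    # hypothetical sketch: return the padded batch together with the original lengths
    lengths = [len(s) for s in seqs]
    max_len = max(lengths)
    padded = [list(s) + [pad_value] * (max_len - len(s)) for s in seqs]
    return padded, lengths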
Example 4
def get_loss(model, batch, inference_only=False):
    answer_lens = [len(a) for _, a in batch]

    questions = pad_seqs([q for q, _ in batch])
    answers = pad_seqs([a for _, a in batch])

    questions = Variable(torch.LongTensor(questions), volatile=inference_only).cuda()
    answers = Variable(torch.LongTensor(answers), volatile=inference_only).cuda()

    batch_size = len(batch)
    hidden = init_hidden(model.num_layers, batch_size, model.hidden_size)

    _, encoder_hidden = model.encoder(questions)
    decoder_output, _ = model.decoder(answers, encoder_hidden, hidden)

    loss = 0
    loss_fn = torch.nn.NLLLoss()
    for i in range(batch_size):
        loss += loss_fn(decoder_output[i, :answer_lens[i] - 1], answers[i, 1:answer_lens[i]])

    return loss / batch_size
Example 5
    def train_model(self, n_sample=100):
        print("Training model...")
        writer = tf.compat.v1.summary.FileWriter('./logs/' + self.run_name,
                                                 graph=tf.Graph())
        mol_file = open("./generated/" + self.run_name + "_generated.csv", 'a')
        i = 0
        while i < self.num_epochs:
            print("\n------ ITERATION %i ------" % i)
            self.set_lr(i)
            print("\nCurrent learning rate: %.5f" %
                  tf.keras.backend.get_value(self.model.optimizer.lr))
            chkpntr = tf.keras.callbacks.ModelCheckpoint(
                filepath=self.checkpoint_dir +
                'model_epoch_{:02d}.hdf5'.format(i),
                verbose=1)
            if self.validation:
                generator_train = DataGenerator(self.padded, self.train_mols,
                                                self.maxlen - 1,
                                                self.token_indices, self.step,
                                                self.batch_size)
                generator_val = DataGenerator(self.padded, self.val_mols,
                                              self.maxlen - 1,
                                              self.token_indices, self.step,
                                              self.batch_size)
                history = self.model.fit_generator(
                    generator=generator_train,
                    epochs=1,
                    validation_data=generator_val,
                    use_multiprocessing=self.multi,
                    workers=self.workers,
                    callbacks=[chkpntr])
                val_loss_sum = tf.Summary(value=[
                    tf.Summary.Value(tag="val_loss",
                                     simple_value=history.history['val_loss']
                                     [-1])
                ])
                writer.add_summary(val_loss_sum, i)

            else:
                generator = DataGenerator(self.padded, range(self.n_mols),
                                          self.maxlen - 1, self.token_indices,
                                          self.step, self.batch_size)
                history = self.model.fit_generator(
                    generator=generator,
                    epochs=1,
                    use_multiprocessing=self.multi,
                    workers=self.workers,
                    callbacks=[chkpntr])
            # write losses to tensorboard log
            loss_sum = tf.Summary(value=[
                tf.Summary.Value(tag="loss",
                                 simple_value=history.history['loss'][-1])
            ])
            writer.add_summary(loss_sum, i)
            lr_sum = tf.Summary(value=[
                tf.Summary.Value(tag="lr",
                                 simple_value=tf.keras.backend.get_value(
                                     self.model.optimizer.lr))
            ])
            writer.add_summary(lr_sum, i)

            # periodically sample molecules and evaluate their validity and novelty
            if (i + 1) % self.sample_after == 0:
                valid_mols = self.sample_points(n_sample, self.temp)
                n_valid = len(valid_mols)
                if n_valid:
                    print("Comparing novelty...")
                    novel = np.array(
                        compare_mollists(valid_mols, np.array(self.smiles),
                                         False))
                    n_novel = float(len(set(novel))) / n_valid
                    mol_file.write("\n----- epoch %i -----\n" % i)
                    mol_file.write("\n".join(set(valid_mols)))
                else:
                    novel = []
                    n_novel = 0
                # write generated compound summary to tensorboard log
                valid_sum = tf.Summary(value=[
                    tf.Summary.Value(tag="valid",
                                     simple_value=(float(n_valid) / n_sample))
                ])
                novel_sum = tf.Summary(value=[
                    tf.Summary.Value(tag="novel (of valid)",
                                     simple_value=n_novel)
                ])
                writer.add_summary(valid_sum, i)
                writer.add_summary(novel_sum, i)
                print("\nValid:\t{}/{}".format(n_valid, n_sample))
                print("Unique:\t{}".format(len(set(valid_mols))))
                print("Novel:\t{}\n".format(len(novel)))

                if self.reinforce:  # reinforce = add most similar generated compounds to training pool
                    if len(novel) > (n_sample / 5):
                        if self.mw_filter:
                            # only consider molecules in given MW range
                            mw = np.array([
                                Descriptors.MolWt(MolFromSmiles(s))
                                if MolFromSmiles(s) else 0 for s in novel
                            ])
                            mw_idx = np.where((int(self.mw_filter[0]) < mw) &
                                              (mw < int(self.mw_filter[1])))[0]
                            novel = np.array(novel)[mw_idx]

                        print(
                            "Calculating CATS similarities of novel generated molecules to SMILES pool..."
                        )
                        fp_novel = cats_descriptor(
                            [MolFromSmiles(s) for s in novel])
                        if self.reference:  # if a reference mol(s) is given, calculate distance to that one
                            fp_train = cats_descriptor(
                                [MolFromSmiles(self.reference)])
                        else:  # else calculate the distance to all training mols
                            fp_train = cats_descriptor(
                                [MolFromSmiles(s) for s in self.smiles])
                        sims = parallel_pairwise_similarities(
                            fp_novel, fp_train, metric='euclidean')
                        top = sims[range(len(novel)),
                                   np.argsort(sims, axis=1)[:, 0,
                                                            0]].flatten()
                        # take the 3 most similar (but novel) mols and add them to self.padded
                        print(
                            "Adding top 3 most similar but novel molecules to SMILES pool"
                        )
                        add = randomize_smileslist(novel[np.argsort(top)[:3]],
                                                   num=3)
                        padd_add = pad_seqs(["^%s$" % m for m in add],
                                            ' ',
                                            given_len=self.maxlen)
                        self.padded = np.hstack((self.padded, padd_add))
                        self.padded = np.random.choice(self.padded,
                                                       len(self.padded),
                                                       False)  # shuffle

            i += 1  # next epoch