def _time_progress(self, start_time, n_steps, step, smiles, mean_score):
     time_elapsed = int(time.time() - start_time)
     time_left = (time_elapsed * ((n_steps - step) / (step + 1)))
     valid_fraction = fraction_valid_smiles(smiles)
     message = (
         f"\n Step {step}   Fraction valid SMILES: {valid_fraction:4.1f}   Score: {mean_score:.4f}   "
         f"Time elapsed: {time_elapsed}   "
         f"Time left: {time_left:.1f}\n")
     return message
 def timestep_report(self, smiles: [str], likelihoods: np.array):
     fraction_valid_smiles = utils_general.fraction_valid_smiles(smiles)
     fraction_unique_entries = self._get_unique_entires_fraction(
         likelihoods)
     structures_table = self._visualize_structures(smiles)
     data = self._assemble_timestep_report(structures_table,
                                           fraction_valid_smiles,
                                           fraction_unique_entries)
     self._notify_server(data, self._log_config.recipient)
    def timestep_report(self, start_time, n_steps, step, smiles,
                        mean_score: np.array, score_summary: FinalSummary, score,
                        agent_likelihood: torch.tensor, prior_likelihood: torch.tensor,
                        augmented_likelihood: torch.tensor):
        score_components = self._score_summary_breakdown(score_summary, mean_score)
        learning_curves = self._learning_curve_profile(agent_likelihood, prior_likelihood, augmented_likelihood)
        structures_table = self._visualize_structures(smiles, score, score_summary)
        smiles_report = self._create_sample_report(smiles, score, score_summary)

        time_estimation = ul_rl.estimate_run_time(start_time, n_steps, step)
        data = self._assemble_timestep_report(step, score_components, structures_table, learning_curves,
                                              time_estimation, fraction_valid_smiles(smiles), smiles_report)
        self._notify_server(data, self._log_config.recipient)
Beispiel #4
0
    def log_timestep(self, lr, epoch, sampled_smiles, sampled_nlls,
                     validation_nlls, training_nlls, jsd_data, jsd_joined_data,
                     model):

        learning_mean = self._mean_learning_curve_profile(
            sampled_nlls, training_nlls)
        learning_variation = self._variation_learning_curve_profile(
            sampled_nlls, training_nlls)
        fraction_valid_smiles = utils_general.fraction_valid_smiles(
            sampled_smiles)
        structures_table = self._visualize_structures(sampled_smiles)
        data = self._assemble_timestep_report(epoch, fraction_valid_smiles,
                                              structures_table, learning_mean,
                                              learning_variation, sampled_nlls,
                                              training_nlls)
        self._notify_server(data, self._log_config.recipient)
Beispiel #5
0
 def _tensorboard_report(self, step, smiles, score,
                         score_summary: FinalSummary, agent_likelihood,
                         prior_likelihood, augmented_likelihood):
     self._summary_writer.add_scalars(
         "nll/avg", {
             "prior": prior_likelihood.mean(),
             "augmented": augmented_likelihood.mean(),
             "agent": agent_likelihood.mean()
         }, step)
     mean_score = np.mean(score)
     for i, log in enumerate(score_summary.profile):
         self._summary_writer.add_scalar(
             score_summary.profile[i].name,
             np.mean(score_summary.profile[i].score), step)
     self._summary_writer.add_scalar("average score", mean_score, step)
     self._summary_writer.add_scalar("Fraction valid SMILES",
                                     fraction_valid_smiles(smiles), step)
     if step % 10 == 0:
         self._log_out_smiles_sample(smiles, score, step, score_summary)
Beispiel #6
0
def reinforcement_learning(prior: models.reinvent.Model,
                           agent: models.reinvent.Model,
                           scoring_function: Callable,
                           logdir: str,
                           resultdir: str,
                           n_steps=3000,
                           sigma=120,
                           experience_replay=False,
                           lr=0.0001,
                           batch_size=128,
                           save_every=50,
                           keep_max=10,
                           reset=0,
                           temperature=1.0,
                           reset_score_cutoff=0.5):
    assert prior.voc == agent.voc, "The agent and the prior must have the same vocabulary!"
    start_time = time.time()

    # We don't need gradients with respect to Prior
    for param in prior.rnn.parameters():
        param.requires_grad = False

    optimizer = torch.optim.Adam(agent.rnn.parameters(), lr=lr)

    # For policy based RL, we normally train on-policy and correct for the fact that more likely actions
    # occur more often (which means the agent can get biased towards them). Using experience replay is
    # therefor not as theoretically sound as it is for value based RL, but it seems to work well.
    if experience_replay:
        experience = Experience(prior.voc)
        # Can add an initial experience if we want to
        # experience.initiate_from_file('/home/excape/reinvent/tala_xray_lig.smi', scoring_function, Prior)

    reset_countdown = 0

    for step in range(n_steps):

        # Sample from Agent
        seqs, agent_likelihood, entropy = agent.sample(batch_size,
                                                       temperature=temperature)

        # Remove duplicates, ie only consider unique seqs
        unique_idxs = unique(seqs)
        seqs = seqs[unique_idxs]
        agent_likelihood = agent_likelihood[unique_idxs]
        entropy = entropy[unique_idxs]

        # Get prior likelihood and score
        prior_likelihood, _ = prior.likelihood(Variable(seqs),
                                               temperature=temperature)
        smiles = prior.sequence_to_smiles(seqs)
        score_components = scoring_function(smiles)
        # we need to extract the total_score key
        score = score_components.pop('total_score')
        # Calculate augmented likelihood
        augmented_likelihood = prior_likelihood + sigma * Variable(score)
        loss = torch.pow((augmented_likelihood - agent_likelihood), 2)

        # Experience Replay
        # First sample
        if experience_replay and len(experience) > 4:
            exp_seqs, exp_score, exp_prior_likelihood = experience.sample(8)
            exp_agent_likelihood, exp_entropy = agent.likelihood(
                exp_seqs.long(), temperature=temperature)
            exp_augmented_likelihood = exp_prior_likelihood + sigma * exp_score
            exp_loss = torch.pow(
                (Variable(exp_augmented_likelihood) - exp_agent_likelihood), 2)
            loss = torch.cat((loss, exp_loss), 0)
            agent_likelihood = torch.cat(
                (agent_likelihood, exp_agent_likelihood), 0)

        # Then add new experience
        prior_likelihood = prior_likelihood.data.cpu().numpy()
        new_experience = zip(smiles, score, prior_likelihood)
        if experience_replay:
            experience.add_experience(new_experience)

        # Calculate loss
        loss = loss.mean()

        # Add regularizer that penalizes high likelihood for the entire sequence
        # With this regularizer the example where only Celecoxib is generated
        # doesnt work for obvious reasons.
        # loss_p = - (1 / agent_likelihood).mean()
        # loss += 5 * 1e3 * loss_p

        # Calculate gradients and make an update to the network weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Now optimize with respect to the entropy
        entropy = torch.sum(entropy)
        # loss.backward()
        # optimizer.step()

        # Convert to numpy arrays so that we can print them
        augmented_likelihood = augmented_likelihood.data.cpu().numpy()
        agent_likelihood = agent_likelihood.data.cpu().numpy()

        # Print some information for this step
        time_elapsed = int(time.time() - start_time)
        time_left = (time_elapsed * ((n_steps - step) / (step + 1)))
        mean_score = np.mean(score)
        message = (
            "\n Step {}   Fraction valid SMILES: {:4.1f}   Score: {:.4f}   Time elapsed: {}   "
            "Time left: {:.1f}\n").format(step,
                                          fraction_valid_smiles(smiles) * 100,
                                          mean_score, time_elapsed, time_left)
        message += "     ".join(["  Agent", "Prior", "Target", "Score"] +
                                list(score_components.keys()) + ["SMILES\n"])
        for i in range(min(10, len(smiles))):
            print_component_scores = [
                score_components[key][i] for key in score_components
            ]
            message += "  {:6.2f}    {:6.2f}    {:6.2f}    {:6.2f}    ".format(
                agent_likelihood[i], prior_likelihood[i],
                augmented_likelihood[i], score[i])
            message += ("{:6.2f}    " * len(print_component_scores)).format(
                *print_component_scores)
            message += "{}\n".format(smiles[i])

        logging.info(message)
        if step % save_every == 0:
            logging.debug("Write Agent memory")
            agent.save(os.path.join(logdir, 'Agent.{}.ckpt'.format(step)))
            if keep_max > 0:
                for oldsteps in range(0, step - (keep_max * save_every) + 1,
                                      save_every):
                    with contextlib.suppress(FileNotFoundError):
                        os.remove(
                            os.path.join(logdir,
                                         'Agent.{}.ckpt'.format(oldsteps)))

        logging.debug("Entropy: {}".format(entropy))

        # reset the weight of NN to search for diverse solutions
        if reset:
            if reset_countdown:
                reset_countdown += 1
            elif mean_score >= reset_score_cutoff:
                reset_countdown = 1

            if reset_countdown == reset:
                agent.reset()
                reset_countdown = 0
                logging.debug("Agent RNN is reset!")

    # If the entire training finishes, we create a new folder where we save some sampled
    # sequences and the contents of the experinence (which are the highest scored
    # sequences seen during training)
    if not os.path.isdir(resultdir):
        os.makedirs(resultdir)

    agent.save(os.path.join(resultdir, 'Agent.ckpt'))
    if experience_replay:
        experience.print_memory(os.path.join(resultdir, "experience_memory"))

    # copy the output.log as well
    copyfile(os.path.join(logdir, "output.log"),
             os.path.join(resultdir, "output.log"))
    copyfile(os.path.join(logdir, "debug.log"),
             os.path.join(resultdir, "debug.log"))

    # copy metadata
    copyfile(os.path.join(logdir, "metadata.json"),
             os.path.join(resultdir, "metadata.json"))
Beispiel #7
0
def hill_climbing(pattern=None,
                  restore_agent_from='data/Prior.ckpt',
                  scoring_function='tanimoto',
                  scoring_function_kwargs=None,
                  save_dir=None,
                  learning_rate=0.0005,
                  batch_size=64,
                  n_steps=10,
                  num_processes=0,
                  use_custom_voc="data/Voc"):

    voc = Vocabulary(init_from_file=use_custom_voc)

    start_time = time.time()
    if pattern:
        Agent = scaffold_constrained_RNN(voc)
    else:
        Agent = RNN(voc)

    logger = VizardLog('data/logs')

    if torch.cuda.is_available():
        Agent.rnn.load_state_dict(torch.load(restore_agent_from))
    else:
        Agent.rnn.load_state_dict(
            torch.load(restore_agent_from,
                       map_location=lambda storage, loc: storage))

    optimizer = torch.optim.Adam(Agent.rnn.parameters(), lr=learning_rate)

    # Scoring_function
    scoring_function = get_scoring_function(scoring_function=scoring_function,
                                            num_processes=num_processes,
                                            **scoring_function_kwargs)

    # For policy based RL, we normally train on-policy and correct for the fact that more likely actions
    # occur more often (which means the agent can get biased towards them). Using experience replay is
    # therefor not as theoretically sound as it is for value based RL, but it seems to work well.
    experience = Experience(voc)

    # Log some network weights that can be dynamically plotted with the Vizard bokeh app
    logger.log(Agent.rnn.gru_2.weight_ih.cpu().data.numpy()[::100],
               "init_weight_GRU_layer_2_w_ih")
    logger.log(Agent.rnn.gru_2.weight_hh.cpu().data.numpy()[::100],
               "init_weight_GRU_layer_2_w_hh")
    logger.log(Agent.rnn.embedding.weight.cpu().data.numpy()[::30],
               "init_weight_GRU_embedding")
    logger.log(Agent.rnn.gru_2.bias_ih.cpu().data.numpy(),
               "init_weight_GRU_layer_2_b_ih")
    logger.log(Agent.rnn.gru_2.bias_hh.cpu().data.numpy(),
               "init_weight_GRU_layer_2_b_hh")

    # Information for the logger
    step_score = [[], []]

    print("Model initialized, starting training...")

    for step in range(n_steps):

        # Sample from Agent
        if pattern:
            seqs, agent_likelihood, entropy = Agent.sample(pattern, batch_size)
        else:
            seqs, agent_likelihood, entropy = Agent.sample(batch_size)
        gc.collect()
        # Remove duplicates, ie only consider unique seqs
        unique_idxs = unique(seqs)
        seqs = seqs[unique_idxs]
        agent_likelihood = agent_likelihood[unique_idxs]
        entropy = entropy[unique_idxs]

        # Get prior likelihood and score
        smiles = seq_to_smiles(seqs, voc)
        score = scoring_function(smiles)

        new_experience = zip(smiles, score, agent_likelihood)
        experience.add_experience(new_experience)

        indexes = np.flip(np.argsort(np.array(score)))
        # Train the agent for 10 epochs on hill-climbing procedure
        for epoch in range(10):
            loss = Variable(torch.zeros(1))
            counter = 0
            seen_seqs = []
            for j in indexes:
                if counter > 50:
                    break
                seq = seqs[j]
                s = smiles[j]
                if s not in seen_seqs:
                    seen_seqs.append(s)
                    log_p, _ = Agent.likelihood(Variable(seq).view(1, -1))
                    loss -= log_p.mean()
                    counter += 1
            loss /= counter
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Print some information for this step
        time_elapsed = (time.time() - start_time) / 3600
        time_left = (time_elapsed * ((n_steps - step) / (step + 1)))
        print(
            "\n       Step {}   Fraction valid SMILES: {:4.1f}  Time elapsed: {:.2f}h Time left: {:.2f}h"
            .format(step,
                    fraction_valid_smiles(smiles) * 100, time_elapsed,
                    time_left))
        print("  Agent    Prior   Target   Score             SMILES")
        for i in range(10):
            print(" {:6.2f}     {}".format(score[i], smiles[i]))
        # Need this for Vizard plotting
        step_score[0].append(step + 1)
        step_score[1].append(np.mean(score))

        # Log some weights
        logger.log(Agent.rnn.gru_2.weight_ih.cpu().data.numpy()[::100],
                   "weight_GRU_layer_2_w_ih")
        logger.log(Agent.rnn.gru_2.weight_hh.cpu().data.numpy()[::100],
                   "weight_GRU_layer_2_w_hh")
        logger.log(Agent.rnn.embedding.weight.cpu().data.numpy()[::30],
                   "weight_GRU_embedding")
        logger.log(Agent.rnn.gru_2.bias_ih.cpu().data.numpy(),
                   "weight_GRU_layer_2_b_ih")
        logger.log(Agent.rnn.gru_2.bias_hh.cpu().data.numpy(),
                   "weight_GRU_layer_2_b_hh")
        logger.log("\n".join([smiles + "\t" + str(round(score, 2)) for smiles, score in zip \
                            (smiles[:12], score[:12])]), "SMILES", dtype="text", overwrite=True)
        logger.log(np.array(step_score), "Scores")

    # If the entire training finishes, we create a new folder where we save this python file
    # as well as some sampled sequences and the contents of the experinence (which are the highest
    # scored sequences seen during training)
    if not save_dir:
        save_dir = 'data/results/run_' + time.strftime("%Y-%m-%d-%H_%M_%S",
                                                       time.localtime())
    try:
        os.makedirs(save_dir)
    except:
        print("Folder already existing... overwriting previous results")

    copyfile('train_agent.py', os.path.join(save_dir, "train_agent.py"))

    experience.print_memory(os.path.join(save_dir, "memory"))
    torch.save(Agent.rnn.state_dict(), os.path.join(save_dir, 'Agent.ckpt'))
    previous_smiles = []
    with open(os.path.join(save_dir, "memory.smi"), 'w') as f:
        for i, exp in enumerate(experience.memory):
            try:
                if Chem.MolToSmiles(
                        Chem.rdmolops.RemoveStereochemistry(
                            Chem.MolFromSmiles(
                                exp[0]))) not in previous_smiles:
                    f.write("{}\n".format(exp[0]))
                    previous_smiles.append(
                        Chem.MolToSmiles(
                            Chem.rdmolops.RemoveStereochemistry(
                                Chem.MolFromSmiles(exp[0]))))
            except:
                pass
Beispiel #8
0
def train_agent(restore_prior_from='data/Prior.ckpt',
                restore_agent_from='data/Prior.ckpt',
                scoring_function='tanimoto',
                scoring_function_kwargs=None,
                save_dir=None,
                learning_rate=0.0005,
                batch_size=64,
                n_steps=3000,
                num_processes=0,
                sigma=60,
                experience_replay=0):

    voc = Vocabulary(init_from_file="data/Voc")

    start_time = time.time()

    Prior = RNN(voc)
    Agent = RNN(voc)

    logger = VizardLog('data/logs')

    # By default restore Agent to same model as Prior, but can restore from already trained Agent too.
    # Saved models are partially on the GPU, but if we dont have cuda enabled we can remap these
    # to the CPU.
    if torch.cuda.is_available():
        Prior.rnn.load_state_dict(torch.load(restore_prior_from))
        Agent.rnn.load_state_dict(torch.load(restore_agent_from))
    else:
        Prior.rnn.load_state_dict(
            torch.load(restore_prior_from,
                       map_location=lambda storage, loc: storage))
        Agent.rnn.load_state_dict(
            torch.load(restore_agent_from,
                       map_location=lambda storage, loc: storage))

    # We dont need gradients with respect to Prior
    for param in Prior.rnn.parameters():
        param.requires_grad = False

    optimizer = torch.optim.Adam(Agent.rnn.parameters(), lr=0.0005)

    # Scoring_function
    scoring_function = get_scoring_function(scoring_function=scoring_function,
                                            num_processes=num_processes,
                                            **scoring_function_kwargs)

    # For policy based RL, we normally train on-policy and correct for the fact that more likely actions
    # occur more often (which means the agent can get biased towards them). Using experience replay is
    # therefor not as theoretically sound as it is for value based RL, but it seems to work well.
    experience = Experience(voc)

    # Log some network weights that can be dynamically plotted with the Vizard bokeh app
    logger.log(Agent.rnn.gru_2.weight_ih.cpu().data.numpy()[::100],
               "init_weight_GRU_layer_2_w_ih")
    logger.log(Agent.rnn.gru_2.weight_hh.cpu().data.numpy()[::100],
               "init_weight_GRU_layer_2_w_hh")
    logger.log(Agent.rnn.embedding.weight.cpu().data.numpy()[::30],
               "init_weight_GRU_embedding")
    logger.log(Agent.rnn.gru_2.bias_ih.cpu().data.numpy(),
               "init_weight_GRU_layer_2_b_ih")
    logger.log(Agent.rnn.gru_2.bias_hh.cpu().data.numpy(),
               "init_weight_GRU_layer_2_b_hh")

    # Information for the logger
    step_score = [[], []]

    print("Model initialized, starting training...")

    for step in range(n_steps):

        # Sample from Agent
        seqs, agent_likelihood, entropy = Agent.sample(batch_size)

        # Remove duplicates, ie only consider unique seqs
        unique_idxs = unique(seqs)
        seqs = seqs[unique_idxs]
        agent_likelihood = agent_likelihood[unique_idxs]
        entropy = entropy[unique_idxs]

        # Get prior likelihood and score
        prior_likelihood, _ = Prior.likelihood(Variable(seqs))
        smiles = seq_to_smiles(seqs, voc)
        score = scoring_function(smiles)

        # Calculate augmented likelihood
        augmented_likelihood = prior_likelihood + sigma * Variable(score)
        loss = torch.pow((augmented_likelihood - agent_likelihood), 2)

        # Experience Replay
        # First sample
        if experience_replay and len(experience) > 4:
            exp_seqs, exp_score, exp_prior_likelihood = experience.sample(4)
            exp_agent_likelihood, exp_entropy = Agent.likelihood(
                exp_seqs.long())
            exp_augmented_likelihood = exp_prior_likelihood + sigma * exp_score
            exp_loss = torch.pow(
                (Variable(exp_augmented_likelihood) - exp_agent_likelihood), 2)
            loss = torch.cat((loss, exp_loss), 0)
            agent_likelihood = torch.cat(
                (agent_likelihood, exp_agent_likelihood), 0)

        # Then add new experience
        prior_likelihood = prior_likelihood.data.cpu().numpy()
        new_experience = zip(smiles, score, prior_likelihood)
        experience.add_experience(new_experience)

        # Calculate loss
        loss = loss.mean()

        # Add regularizer that penalizes high likelihood for the entire sequence
        loss_p = -(1 / agent_likelihood).mean()
        loss += 5 * 1e3 * loss_p

        # Calculate gradients and make an update to the network weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Convert to numpy arrays so that we can print them
        augmented_likelihood = augmented_likelihood.data.cpu().numpy()
        agent_likelihood = agent_likelihood.data.cpu().numpy()

        # Print some information for this step
        time_elapsed = (time.time() - start_time) / 3600
        time_left = (time_elapsed * ((n_steps - step) / (step + 1)))
        print(
            "\n       Step {}   Fraction valid SMILES: {:4.1f}  Time elapsed: {:.2f}h Time left: {:.2f}h"
            .format(step,
                    fraction_valid_smiles(smiles) * 100, time_elapsed,
                    time_left))
        print("  Agent    Prior   Target   Score             SMILES")
        for i in range(10):
            print(" {:6.2f}   {:6.2f}  {:6.2f}  {:6.2f}     {}".format(
                agent_likelihood[i], prior_likelihood[i],
                augmented_likelihood[i], score[i], smiles[i]))
        # Need this for Vizard plotting
        step_score[0].append(step + 1)
        step_score[1].append(np.mean(score))

        # Log some weights
        logger.log(Agent.rnn.gru_2.weight_ih.cpu().data.numpy()[::100],
                   "weight_GRU_layer_2_w_ih")
        logger.log(Agent.rnn.gru_2.weight_hh.cpu().data.numpy()[::100],
                   "weight_GRU_layer_2_w_hh")
        logger.log(Agent.rnn.embedding.weight.cpu().data.numpy()[::30],
                   "weight_GRU_embedding")
        logger.log(Agent.rnn.gru_2.bias_ih.cpu().data.numpy(),
                   "weight_GRU_layer_2_b_ih")
        logger.log(Agent.rnn.gru_2.bias_hh.cpu().data.numpy(),
                   "weight_GRU_layer_2_b_hh")
        logger.log("\n".join([smiles + "\t" + str(round(score, 2)) for smiles, score in zip \
                            (smiles[:12], score[:12])]), "SMILES", dtype="text", overwrite=True)
        logger.log(np.array(step_score), "Scores")

    # If the entire training finishes, we create a new folder where we save this python file
    # as well as some sampled sequences and the contents of the experinence (which are the highest
    # scored sequences seen during training)
    if not save_dir:
        save_dir = 'data/results/run_' + time.strftime("%Y-%m-%d-%H_%M_%S",
                                                       time.localtime())
    os.makedirs(save_dir)
    copyfile('train_agent.py', os.path.join(save_dir, "train_agent.py"))

    experience.print_memory(os.path.join(save_dir, "memory"))
    torch.save(Agent.rnn.state_dict(), os.path.join(save_dir, 'Agent.ckpt'))

    seqs, agent_likelihood, entropy = Agent.sample(256)
    prior_likelihood, _ = Prior.likelihood(Variable(seqs))
    prior_likelihood = prior_likelihood.data.cpu().numpy()
    smiles = seq_to_smiles(seqs, voc)
    score = scoring_function(smiles)
    with open(os.path.join(save_dir, "sampled"), 'w') as f:
        f.write("SMILES Score PriorLogP\n")
        for smiles, score, prior_likelihood in zip(smiles, score,
                                                   prior_likelihood):
            f.write("{} {:5.2f} {:6.2f}\n".format(smiles, score,
                                                  prior_likelihood))
 def _valid_stats(self, smiles, epoch):
     self._summary_writer.add_scalar("valid", fraction_valid_smiles(smiles), epoch)
Beispiel #10
0
def train_agent(runname='celecoxib',
                priorname='chembl',
                scoring_function='Tanimoto',
                scoring_function_kwargs=None,
                save_dir=None,
                batch_size=64,
                n_steps=3000,
                num_processes=6,
                sigma=60,
                experience_replay=5,
                lr=0.0005):
    print("\nStarting run %s with prior %s ..." % (runname, priorname))
    start_time = time.time()

    voc = Vocabulary(init_from_file="data/Voc_%s" % priorname)

    prior = RNN(voc)
    agent = RNN(voc)

    writer = SummaryWriter('logs/%s' % runname)

    # By default restore Agent to same model as Prior, but can restore from already trained Agent too.
    # Saved models are partially on the GPU, but if we dont have cuda enabled we can remap these
    # to the CPU.
    if torch.cuda.is_available():
        prior.rnn.load_state_dict(torch.load('data/prior_%s.ckpt' % priorname))
        agent.rnn.load_state_dict(torch.load('data/prior_%s.ckpt' % priorname))
    else:
        prior.rnn.load_state_dict(
            torch.load('data/prior_%s.ckpt' % priorname,
                       map_location=lambda storage, loc: storage))
        agent.rnn.load_state_dict(
            torch.load('data/prior_%s.ckpt' % priorname,
                       map_location=lambda storage, loc: storage))

    # We dont need gradients with respect to Prior
    for param in prior.rnn.parameters():
        param.requires_grad = False

    optimizer = torch.optim.Adam(agent.rnn.parameters(), lr=lr)

    # Scoring_function
    scoring_function = get_scoring_function(scoring_function=scoring_function,
                                            num_processes=num_processes,
                                            **scoring_function_kwargs)

    # For policy based RL, we normally train on-policy and correct for the fact that more likely actions
    # occur more often (which means the agent can get biased towards them). Using experience replay is
    # therefor not as theoretically sound as it is for value based RL, but it seems to work well.
    experience = Experience(voc)

    print("Model initialized, starting training...")

    for step in range(n_steps):
        # Sample from Agent
        seqs, agent_likelihood, entropy = agent.sample(batch_size)

        # Remove duplicates, ie only consider unique seqs
        unique_ids = unique(seqs)
        seqs = seqs[unique_ids]
        agent_likelihood = agent_likelihood[unique_ids]
        entropy = entropy[unique_ids]

        # Get prior likelihood and score
        prior_likelihood, _ = prior.likelihood(Variable(seqs))
        smiles = seq_to_smiles(seqs, voc)
        score = scoring_function(smiles)

        # Calculate augmented likelihood
        augmented_likelihood = prior_likelihood + sigma * Variable(score)
        loss = torch.pow((augmented_likelihood - agent_likelihood), 2)

        # Experience Replay
        # First sample
        if experience_replay and len(experience) > 4:
            exp_seqs, exp_score, exp_prior_likelihood = experience.sample(4)
            exp_agent_likelihood, exp_entropy = agent.likelihood(
                exp_seqs.long())
            exp_augmented_likelihood = exp_prior_likelihood + sigma * exp_score
            exp_loss = torch.pow(
                (Variable(exp_augmented_likelihood) - exp_agent_likelihood), 2)
            loss = torch.cat((loss, exp_loss), 0)
            agent_likelihood = torch.cat(
                (agent_likelihood, exp_agent_likelihood), 0)

        # Then add new experience
        prior_likelihood = prior_likelihood.data.cpu().numpy()
        new_experience = zip(smiles, score, prior_likelihood)
        best_memory = experience.add_experience(new_experience)

        # Calculate loss
        loss = loss.mean()

        # Add regularizer that penalizes high likelihood for the entire sequence
        loss_p = -(1 / agent_likelihood).mean()
        loss += 5 * 1e3 * loss_p

        # Calculate gradients and make an update to the network weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Convert to numpy arrays so that we can print them
        augmented_likelihood = augmented_likelihood.data.cpu().numpy()
        agent_likelihood = agent_likelihood.data.cpu().numpy()

        # Print some information for this step
        time_elapsed = (time.time() - start_time) / 3600
        time_left = (time_elapsed * ((n_steps - step) / (step + 1)))
        print(
            "\n       Step {}   Fraction valid SMILES: {:4.1f}  Time elapsed: {:.2f}h Time left: {:.2f}h\n"
            .format(step,
                    fraction_valid_smiles(smiles) * 100, time_elapsed,
                    time_left))
        print("  Agent    Prior   Target   Score             SMILES")
        for i in range(10):
            print(" {:6.2f}   {:6.2f}  {:6.2f}  {:6.2f}     {}".format(
                agent_likelihood[i], prior_likelihood[i],
                augmented_likelihood[i], score[i], smiles[i]))

        # Log
        writer.add_scalar('loss', loss.item(), step)
        writer.add_scalar('score', np.mean(score), step)
        writer.add_scalar('entropy', entropy.mean(), step)
        if best_memory:
            writer.add_scalar('best_memory', best_memory, step)

        # get 4 random valid smiles and scores for logging
        val_ids = np.array(
            [i for i, s in enumerate(smiles) if is_valid_mol(s)])
        val_ids = np.random.choice(val_ids, 4, replace=False)
        smiles = np.array(smiles)[val_ids]
        score = ['%.3f' % s for s in np.array(score)[val_ids]]
        writer.add_image('generated_mols', mol_to_torchimage(smiles, score),
                         step)

    # If the entire training finishes, we create a new folder where we save this python file
    # as well as some sampled sequences and the contents of the experinence (which are the highest
    # scored sequences seen during training)
    if not save_dir:
        save_dir = 'results/%s' % runname + time.strftime(
            "%Y-%m-%d-%H_%M_%S", time.localtime())
    os.makedirs(save_dir)
    copyfile('agent.py', os.path.join(save_dir, "agent_%s.py" % runname))

    experience.print_memory(os.path.join(save_dir, "memory"))
    torch.save(agent.rnn.state_dict(),
               os.path.join(save_dir, 'Agent_%s.ckpt' % runname))

    seqs, agent_likelihood, entropy = agent.sample(256)
    prior_likelihood, _ = prior.likelihood(Variable(seqs))
    prior_likelihood = prior_likelihood.data.cpu().numpy()
    smiles = seq_to_smiles(seqs, voc)
    score = scoring_function(smiles)
    with open(os.path.join(save_dir, "sampled.txt"), 'w') as f:
        f.write("SMILES Score PriorLogP\n")
        for smiles, score, prior_likelihood in zip(smiles, score,
                                                   prior_likelihood):
            f.write("{} {:5.2f} {:6.2f}\n".format(smiles, score,
                                                  prior_likelihood))

    print("\nDONE! Whole run took %s" %
          datetime.timedelta(seconds=time.time() - start_time))
 def _log_timestep(self, smiles: np.array, likelihoods: np.array):
     fraction_valid_smiles = utils_general.fraction_valid_smiles(smiles)
     fraction_unique_entries = self._get_unique_entires_fraction(likelihoods)
     self._visualize_structures(smiles)
     self._summary_writer.add_text('Data', f'Valid SMILES: {fraction_valid_smiles}% '
                                           f'Unique Mols: {fraction_unique_entries}%  ')