Example #1
    def __init__(self, state_size, action_size, action_limits=1.):
        self.state_size = state_size
        self.action_size = action_size
        self.action_limits = action_limits

        self.memory = Memory(MEMORY_SIZE)
        self.noise = Noise(action_size)

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)
Example #2
    def create(self, model):
        parameters = self._parameters

        # number of actions
        actions_count = 2

        # policy (here, "policy" means greedy or ε-greedy)
        policies = self._create_policies(model, parameters, actions_count)

        # memory for storing experience
        memory = Memory(parameters["memory_size"])

        # discount factor γ (gamma)
        gamma = parameters["gamma"]

        # do not start learning until replay_start_memory_size samples have accumulated
        replay_start_memory_size = parameters["replay_start_memory_size"]

        # number of samples drawn per experience replay
        replay_count = parameters["replay_count"]

        # leave a small interval between training updates
        training_interval_steps = parameters["training_interval_steps"]

        # create the optimizer
        optimizer_parameters = (parameters["optimizer"]["alpha"],
                                parameters["optimizer"]["epsilon"])
        optimizer = optimizers.Adam(alpha=optimizer_parameters[0],
                                    eps=optimizer_parameters[1])
        optimizer.setup(model)

        # procedure for updating the model
        model_updater = SoftModelUpdater(parameters["tau"])

        # create the agent
        agent = DQNAgent(gamma, model, optimizer, model_updater, memory,
                         replay_start_memory_size, replay_count,
                         training_interval_steps, policies)

        return agent
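The SoftModelUpdater used above is not shown in this listing; given the tau it receives, it presumably applies the usual soft (Polyak) target update, target = tau * source + (1 - tau) * target. A minimal framework-neutral sketch of that rule, using plain NumPy arrays in place of real model parameters (the class layout here is an assumption, not the project's code):

import numpy as np

class SoftModelUpdater:
    """Hypothetical soft-update helper: blends source parameters into the target parameters."""

    def __init__(self, tau):
        self.tau = tau

    def update(self, target_params, source_params):
        # target <- tau * source + (1 - tau) * target, applied parameter by parameter
        for name, source in source_params.items():
            target_params[name] = (self.tau * np.asarray(source)
                                   + (1.0 - self.tau) * np.asarray(target_params[name]))

# usage sketch: plain numpy arrays stand in for the model's weights
updater = SoftModelUpdater(tau=0.01)
target = {"W": np.zeros((2, 2))}
source = {"W": np.ones((2, 2))}
updater.update(target, source)   # target["W"] is now 0.01 everywhere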
Example #3
from agent.memory import Memory
from agent.agent import Agent  # assumed import path: Agent is used below but never imported in this listing
from functions import *
from preprocess_price import preprocess_price
from keras.models import clone_model
import sys
import numpy as np

if len(sys.argv) != 4:
    print("Usage: python3 train.py [stock] [window] [episodes]")
    exit()

stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(
    sys.argv[3])

max_queue_size = 100
memory = Memory(max_queue_size)
agent = Agent(window_size, memory)

data = preprocess_price(stock_name)
# data = getStockDataVec(stock_name)
l = len(data) - 1
batch_size = 32
budget = 10000
errors = []
profits = []
fee = 0.2 / 100

for e in range(episode_count + 1):
    print("Episode " + str(e) + "/" + str(episode_count))
    state = getState(data, 0, window_size + 1)
    state = state.reshape((state.shape[0], state.shape[1], 1))
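getState is pulled in by the wildcard import from functions and is not shown here; whatever it returns, the reshape above only appends a trailing channel axis so the price window matches a (batch, timesteps, channels) input layout. A tiny self-contained illustration of that reshape (the window shape is an assumption for the example):

import numpy as np

window = np.random.randn(1, 10)                      # assumed: one window of 10 normalized price features
state = window.reshape((window.shape[0], window.shape[1], 1))
print(state.shape)                                   # (1, 10, 1) -- ready for a Conv1D / LSTM style network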
Example #5
class Agent:
    def __init__(self,
                 policy,
                 optimizer,
                 env,
                 writer,
                 pretrained_lm,
                 out_path,
                 gamma=1.,
                 lr=1e-2,
                 grad_clip=None,
                 scheduler=None,
                 pretrain=False,
                 update_every=50,
                 num_truncated=10,
                 p_th=None,
                 truncate_mode="top_k",
                 log_interval=10,
                 test_envs=[],
                 eval_no_trunc=0,
                 alpha_logits=0.,
                 alpha_decay_rate=0.,
                 epsilon_truncated=0.,
                 train_seed=0,
                 epsilon_truncated_rate=1.,
                 is_loss_correction=1,
                 train_metrics=[],
                 test_metrics=[],
                 top_p=1.,
                 temperature=1,
                 temp_factor=1,
                 temperature_step=1,
                 temperature_min=1,
                 temperature_max=10,
                 s_min=10,
                 s_max=200,
                 inv_schedule_step=0,
                 schedule_start=1,
                 curriculum=0,
                 KL_coeff=0.,
                 truncation_optim=0):
        self.device = policy.device
        self.policy = policy.to(self.device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.grad_clip = grad_clip
        self.gamma = gamma
        self.log_interval = log_interval
        self.test_envs = test_envs
        self.truncate_mode = truncate_mode
        self.alpha_logits_lm = alpha_logits
        self.alpha_decay_rate = alpha_decay_rate
        self.temperature = temperature
        self.temp_factor = temp_factor
        self.temperature_step = temperature_step
        self.temperature_min = temperature_min
        self.temperature_max = temperature_max
        self.inv_schedule_step = inv_schedule_step
        self.schedule_start = schedule_start
        self.env = env
        self.pretrain = pretrain
        self.update_every = update_every
        self.memory = Memory()
        self.num_truncated = num_truncated
        self.epsilon_truncated = epsilon_truncated
        self.epsilon_truncated_rate = epsilon_truncated_rate
        self.is_loss_correction = is_loss_correction
        self.curriculum = curriculum
        self.KL_coeff = KL_coeff
        self.truncation_optim = truncation_optim
        if self.curriculum > 0:
            self.env.update_mode(mode=env.mode, answer_sampl="random")
        p_th_ = p_th if p_th is not None else 1 / self.env.dataset.len_vocab

        if self.truncate_mode is not None:
            self.eval_trunc = {
                "no_trunc": False,
                "with_trunc": True
            } if eval_no_trunc else {
                "with_trunc": True
            }
            self.truncation = truncations[truncate_mode](
                self,
                num_truncated=num_truncated,
                p_th=p_th_,
                pretrained_lm=pretrained_lm,
                top_p=top_p,
                s_min=s_min,
                s_max=s_max)  # adding the truncation class.
        else:
            self.eval_trunc = {"no_trunc": False}
            self.truncation = truncations["no_trunc"](
                self,
                num_truncated=num_truncated,
                p_th=p_th_,
                top_p=top_p,
                pretrained_lm=pretrained_lm)

        self.writer = writer
        self.out_path = out_path
        self.checkpoints_path = os.path.join(out_path, "checkpoints")
        if not os.path.isdir(self.checkpoints_path):
            os.makedirs(self.checkpoints_path)
        self.generated_text = []
        self.train_metrics_names = train_metrics
        self.test_metrics_names = test_metrics
        self.init_metrics()
        self.start_episode = 1
        self.train_seed = train_seed
        if self.env.answer_sampling == "inv_frequency":
            inv_freq_answer_decoded = self.env.decode_inv_frequency()
            logger.info(
                "---------------- INV FREQ ANSWERS DISTRIBUTION FOR ANSWER SAMPLING--------------------------------"
            )
            logger.info(inv_freq_answer_decoded)
            logger.info("-" * 100)
        if self.env.answer_sampling == "img_sampling":
            logger.info(
                "---------------- ANSWER / IMG STATS ---------------------------------------------------------------"
            )
            min, mean, max = self.env.dataset.get_answer_img_stats()
            logger.info("number MIN of answers per img:{}".format(min))
            logger.info("number MEAN of answers per img:{}".format(mean))
            logger.info("number MAX of answers per img:{}".format(max))
            logger.info("-" * 100)

    def init_metrics(self):
        self.metrics = {}
        self.metrics["train"] = {
            key: metrics[key](self,
                              train_test="train",
                              env=self.env,
                              trunc="trunc",
                              sampling="sampling")
            for key in self.train_metrics_names if key in metrics
        }
        for env_ in self.test_envs:
            for trunc in self.eval_trunc.keys():
                for sampling_mode in [
                        "sampling", "greedy", "sampling_ranking_lm"
                ]:
                    id = "_".join([env_.mode, trunc, sampling_mode])
                    self.metrics[id] = {
                        key: metrics[key](self,
                                          train_test="test",
                                          trunc=trunc,
                                          sampling=sampling_mode,
                                          env=env_)
                        for key in self.test_metrics_names if key in metrics
                    }

    def get_score_metric(self, metrics):
        score_metric = metrics["language_score"]
        return score_metric

    def get_metrics(self, mode, trunc, sampling_mode):
        id = "{}_{}_{}".format(mode, trunc, sampling_mode)
        return self.metrics[id]

    def update_per_episode(self,
                           i_episode,
                           alpha_min=0.001,
                           update_every=500,
                           num_episodes_train=1000):
        if self.alpha_decay_rate > 0 and self.alpha_logits_lm > alpha_min:
            if i_episode % update_every == 0:
                self.alpha_logits_lm *= (1 - self.alpha_decay_rate)
                logger.info(
                    "decaying alpha logits parameter at Episode #{} - new value: {}"
                    .format(i_episode, self.alpha_logits_lm))
        # if i_episode == int(self.epsilon_truncated_rate * num_episodes_train) + 1:
        # self.epsilon_truncated = 1
        # logger.info("setting epsilon for truncation equal to 1 - starting fine-tuning with all space policy")

        self.update_temperature(i_episode)
        if i_episode == self.curriculum:
            print(self.env.answer_sampling)
            logger.info("UPDATING ANSWER SAMPLING FROM RANDOM TO UNIFORM...")
            self.env.update_mode(mode=self.env.mode, answer_sampl="uniform")
            print(self.env.answer_sampling)

    def update_temperature(self, i_episode):
        if i_episode + 1 == self.inv_schedule_step:
            self.temp_factor = 1 / self.temp_factor
            print("inversing the temperature schedule at episode {}".format(
                i_episode + 1))
        if (i_episode + 1) >= self.schedule_start:
            if (i_episode + 1) == self.schedule_start:
                print(
                    "starting the temperature scheduling at episode {}".format(
                        i_episode + 1))
            if self.temp_factor < 1:
                if (
                        i_episode + 1
                ) % self.temperature_step == 0 and self.temperature > self.temperature_min:
                    self.temperature *= self.temp_factor
                    if self.temperature < self.temperature_min:
                        logger.info(
                            "LAST TEMPERATURE UPDATE at temp {}".format(
                                self.temperature_min))
                        self.temperature = self.temperature_min
            else:
                if (
                        i_episode + 1
                ) % self.temperature_step == 0 and self.temperature < self.temperature_max:
                    self.temperature *= self.temp_factor
                    if self.temperature > self.temperature_max:
                        logger.info(
                            "LAST TEMPERATURE UPDATE at temp {}".format(
                                self.temperature_max))
                        self.temperature = self.temperature_max
        self.writer.add_scalar('temperature', self.temperature, i_episode)

    def act(self,
            state,
            mode='sampling',
            truncation=True,
            forced=None,
            ht=None,
            ct=None):
        valid_actions, action_probs, logits_lm, log_probas_lm, origin_log_probs_lm = self.truncation.get_valid_actions(
            state, truncation, temperature=self.temperature)
        alpha = self.alpha_logits_lm
        policy_dist, policy_dist_truncated, value, ht, ct = self.get_policy_distributions(
            state, valid_actions, logits_lm, alpha=alpha, ht=ht, ct=ct)
        if self.truncation_optim == 1:
            policy_dist = policy_dist_truncated
        action = self.sample_action(
            policy_dist=policy_dist,
            policy_dist_truncated=policy_dist_truncated,
            valid_actions=valid_actions,
            mode=mode,
            forced=forced)
        log_prob = policy_dist.log_prob(action.to(self.device)).view(-1)
        log_prob_truncated = policy_dist_truncated.log_prob(
            action.to(self.device)).view(-1)

        return action, log_prob, value, (
            valid_actions, action_probs, log_prob_truncated
        ), policy_dist, logits_lm, log_probas_lm, origin_log_probs_lm, ht, ct

    def get_policy_distributions(self,
                                 state,
                                 valid_actions,
                                 logits_lm=None,
                                 alpha=0.,
                                 ht=None,
                                 ct=None):
        policy_dist, policy_dist_truncated, value, ht, ct = self.policy(
            state.text,
            state.img,
            state.answer,
            valid_actions=valid_actions,
            logits_lm=logits_lm,
            alpha=alpha,
            ht=ht,
            ct=ct)
        return policy_dist, policy_dist_truncated, value, ht, ct

    def sample_action(self,
                      policy_dist,
                      policy_dist_truncated,
                      valid_actions,
                      mode='sampling',
                      forced=None):
        policy_to_sample_from = policy_dist_truncated
        epsilon_truncated_sample = random.random()
        if epsilon_truncated_sample < self.epsilon_truncated:
            policy_to_sample_from = policy_dist
        if mode == 'forced':
            action = forced
        elif mode == 'sampling':
            action = policy_to_sample_from.sample()
        elif mode == 'greedy':
            action = torch.argmax(policy_to_sample_from.probs).view(1).detach()
        if policy_to_sample_from.probs.size() != policy_dist.probs.size():
            action = torch.gather(valid_actions, 1, action.view(1, 1))
        return action

    def save(self, out_file):
        with open(out_file, 'wb') as f:
            torch.save(self.policy.state_dict(), f)

    def save_ckpt(self, EPOCH, loss):
        torch.save(
            {
                'epoch': EPOCH,
                'model_state_dict': self.policy.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'loss': loss,
            }, os.path.join(self.checkpoints_path, 'model.pt'))

    def load_ckpt(self, ckpt_path):
        checkpoint = torch.load(os.path.join(ckpt_path, 'model.pt'))
        self.policy.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        return epoch, loss

    def test(self,
             num_episodes=10,
             test_mode='sampling',
             test_seed=0,
             num_diversity=1):
        for env in self.test_envs:
            logger.info(
                '-----------------------Starting Evaluation for {} dialog ------------------'
                .format(env.mode))
            self.test_env(env,
                          num_episodes=num_episodes,
                          test_mode=test_mode,
                          test_seed=test_seed)

    def init_hidden(self, state):
        h, c = self.policy.init_hidden_state(state)
        return h, c

    def generate_one_episode(self,
                             timestep,
                             i_episode,
                             env,
                             seed=None,
                             train=True,
                             truncation=True,
                             test_mode='sampling',
                             metrics=[],
                             idx_diversity=0,
                             num_diversity=10):
        if train or seed is None:
            state, ep_reward = env.reset(seed=seed), 0
        else:
            state, ep_reward = env.reset(i_episode=i_episode), 0
        (ht, ct) = self.init_hidden(state)
        for t in range(0, env.max_len):
            forced = env.ref_question[t]
            action, log_probs, value, (
                valid_actions, actions_probs, log_probs_truncated
            ), dist, logits_lm, log_probas_lm, origin_log_probs_lm, new_ht, new_ct = self.act(
                state=state,
                mode=test_mode,
                truncation=truncation,
                forced=forced,
                ht=ht,
                ct=ct)
            new_state, (reward, closest_question,
                        pred_answer), done, _ = env.step(action.cpu().numpy())
            if train:
                # Saving reward and is_terminal:
                self.memory.add_step(action, state.text[0], state.img[0],
                                     log_probs, log_probs_truncated, reward,
                                     done, value, state.answer, ht, ct,
                                     log_probas_lm)
                if self.env.reward_type == "vilbert" and done:
                    self.writer.add_scalar("vilbert_rank", pred_answer,
                                           i_episode)
            timestep += 1
            for key, metric in metrics.items():
                metric.fill(state=state,
                            action=action,
                            done=done,
                            dist=dist,
                            valid_actions=valid_actions,
                            actions_probs=actions_probs,
                            ref_question=env.ref_questions,
                            ref_questions_decoded=env.ref_questions_decoded,
                            reward=reward,
                            closest_question=closest_question,
                            new_state=new_state,
                            log_probs=log_probs,
                            log_probs_truncated=log_probs_truncated,
                            test_mode=test_mode,
                            pred_answer=pred_answer,
                            i_episode=i_episode,
                            ref_question_idx=env.ref_question_idx,
                            logits_lm=logits_lm,
                            log_probas_lm=log_probas_lm,
                            timestep=t,
                            origin_log_probs_lm=origin_log_probs_lm,
                            alpha=self.alpha_logits_lm,
                            ref_answer=env.ref_answer)
            state = new_state
            ht = new_ht
            ct = new_ct
            ep_reward += reward

            # update if its time
            if train:
                if self.update_mode == "step" and timestep % self.update_every == 0:
                    loss = self.update()
                    logger.info("UPDATING POLICY WEIGHTS...")
                    self.memory.clear_memory()
                    timestep = 0
                else:
                    loss = None

            if done:
                if train:
                    if self.update_mode == "episode" and i_episode % self.update_every == 0:
                        loss = self.update()
                        logger.info("UPDATING POLICY WEIGHTS...")
                        self.memory.clear_memory()
                else:
                    loss = None
                break
        for key, metric in metrics.items():
            metric.compute(state=state,
                           closest_question=closest_question,
                           img_idx=env.img_idx,
                           reward=reward,
                           ref_question=env.ref_questions,
                           ref_questions_decoded=env.ref_questions_decoded,
                           question_idx=env.ref_question_idx,
                           test_mode=test_mode,
                           pred_answer=pred_answer,
                           ref_answer=env.ref_answer,
                           idx_diversity=idx_diversity,
                           num_diversity=num_diversity)

        return state, ep_reward, closest_question, valid_actions, timestep, loss

    def test_env(self,
                 env,
                 num_episodes=10,
                 test_mode='sampling',
                 test_seed=0):
        num_diversity = 10 if test_mode == "sampling_ranking_lm" else 1
        test_mode_episode = {
            "greedy": "greedy",
            "sampling": "sampling",
            "sampling_ranking_lm": "sampling"
        }
        print("temperature at test: {}".format(self.temperature))
        env.reset()  # init env.
        timestep = 1
        self.policy.eval()
        for i_episode in range(num_episodes):
            logger.info('-' * 20 + 'Test Episode: {}'.format(i_episode) +
                        '-' * 20)
            seed = i_episode if test_seed else None
            for key_trunc, trunc in self.eval_trunc.items():
                metrics = self.get_metrics(env.mode, key_trunc, test_mode)
                for i in range(
                        num_diversity
                ):  # loop multiple time over the same image to measure langage diversity.
                    with torch.no_grad():
                        state, ep_reward, closest_question, valid_actions, timestep, _ = self.generate_one_episode(
                            timestep=timestep,
                            i_episode=i_episode,
                            env=env,
                            seed=seed,
                            train=False,
                            test_mode=test_mode_episode[test_mode],
                            truncation=trunc,
                            metrics=metrics,
                            idx_diversity=i,
                            num_diversity=num_diversity)
                    for _, metric in metrics.items():
                        metric.write()
                        metric.log(valid_actions=valid_actions)
                for _, metric in metrics.items():
                    metric.write_div()
        for key_trunc in self.eval_trunc.keys():
            metrics = self.get_metrics(env.mode, key_trunc, test_mode)
            idx_to_keep = None
            if test_mode == "sampling_ranking_lm":
                language_score = metrics["language_score"]
                idx_to_keep = language_score.get_min_ppl_idxs(num_diversity)
                pd.Series(idx_to_keep).to_csv(
                    os.path.join(language_score.out_path, "metrics",
                                 "min_ppl_idx.csv"))
            for key_metric, metric in metrics.items():
                metric.post_treatment(num_episodes=num_episodes,
                                      idx_to_keep=idx_to_keep)

    def log_at_train(self, i_episode, ep_reward, state, closest_question,
                     valid_actions):
        logger.info(
            '-' * 20 +
            'Episode {} - Img  {}'.format(i_episode, self.env.img_idx) +
            '-' * 20)
        logger.info('Last reward: {:.2f}'.format(ep_reward))
        for key, metric in self.metrics["train"].items():
            metric.log(valid_actions=valid_actions)
            metric.write()
        logger.info("-" * 100)

    def learn(self, num_episodes=100):
        sampling_mode = "forced" if self.pretrain else "sampling"
        start_time = time.time()
        current_time = time.time()
        timestep = 1
        for i_episode in range(self.start_episode,
                               self.start_episode + num_episodes):
            seed = i_episode if self.train_seed else None
            state, ep_reward, closest_question, valid_actions, timestep, loss = self.generate_one_episode(
                timestep=timestep,
                i_episode=i_episode,
                env=self.env,
                seed=seed,
                metrics=self.metrics["train"],
                test_mode=sampling_mode)
            self.update_per_episode(i_episode=i_episode,
                                    num_episodes_train=num_episodes)
            if i_episode % self.log_interval == 0:
                self.log_at_train(i_episode=i_episode,
                                  ep_reward=ep_reward,
                                  state=state,
                                  closest_question=closest_question,
                                  valid_actions=valid_actions)

            if i_episode % 1000 == 0:
                elapsed = time.time() - current_time
                logger.info(
                    "Training time for 1000 episodes: {:5.2f}".format(elapsed))
                current_time = time.time()
                # saving checkpoint:
                self.save_ckpt(EPOCH=i_episode, loss=loss)

        if valid_actions is not None and "action_probs" in self.metrics[
                "train"] and "action_probs_lm" in self.metrics[
                    "train"]:  # to compare the discrepancy between the 'truncated policy' and the 'all space' policy
            self.writer.add_custom_scalars({
                'Train_all_probs': {
                    'action_probs': [
                        'Multiline',
                        [
                            'train_action_probs',
                            'train_action_probs_truncated',
                            'train_action_probs_lm'
                        ]
                    ]
                }
            })

        for _, metric in self.metrics["train"].items():
            metric.post_treatment(num_episodes=num_episodes)
        logger.info("total training time: {:7.2f}".format(time.time() -
                                                          start_time))
        logger.info(
            "--------------------------------------------END OF TRAINING ----------------------------------------------------"
        )

    def compute_write_all_metrics(self, output_path, logger):
        # write to csv test scalar metrics:
        logger.info(
            "------------------------------------- test metrics statistics -----------------------------------------"
        )
        all_metrics = {trunc: {} for trunc in self.eval_trunc.keys()}
        for key in self.test_metrics_names:
            stats_dict = {trunc: {} for trunc in self.eval_trunc.keys()}
            stats_dict_div = {trunc: {} for trunc in self.eval_trunc.keys()}

            instances_of_metric = [
                self.metrics[key_mode][key]
                for key_mode in self.metrics.keys() if key_mode != "train"
            ]
            # for stats
            for metric in instances_of_metric:
                if metric.stats:
                    for key_stat, stat in metric.stats.items():
                        stats_dict[metric.trunc]["_".join(
                            [metric.env_mode, metric.sampling,
                             key_stat])] = stat[0]
                        if str(stat[0]) != 'nan':
                            all_metrics[metric.trunc].setdefault(
                                key_stat, []).append(stat[0])

                if metric.stats_div:
                    for key_stat, stat in metric.stats.items():
                        stats_dict[metric.trunc]["_".join(
                            [metric.env_mode, metric.sampling,
                             key_stat])] = stat[0]
                        # all_metrics[metric.trunc].setdefault(key_stat, []).append(stat[0])
            stats_path = os.path.join(self.out_path, "stats",
                                      "{}.csv".format(key))
            div_path = os.path.join(self.out_path, "stats",
                                    "{}_div.csv".format(key))

            pd.DataFrame(data=stats_dict).to_csv(stats_path)
            pd.DataFrame(data=stats_dict_div).to_csv(div_path)

        # for all metrics
        for trunc in all_metrics.keys():
            for key_s in all_metrics[trunc].keys():
                if len(all_metrics[trunc][key_s]) > 0:
                    all_metrics[trunc][key_s] = np.round(np.mean(
                        all_metrics[trunc][key_s]),
                                                         decimals=3)

        stats_path = os.path.join(self.out_path, "all_metrics.csv")
        pd.DataFrame(data=all_metrics).to_csv(stats_path)
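One detail worth isolating from the class above is the epsilon-truncation rule in sample_action: with probability epsilon_truncated the action is drawn from the full policy distribution, otherwise from the truncated one. A toy, self-contained restatement of just that rule (the probabilities below are made up, and the real code samples from distributions of different sizes and then maps the action back through valid_actions):

import random
import torch
from torch.distributions import Categorical

epsilon_truncated = 0.2
full_dist = Categorical(probs=torch.tensor([0.1, 0.2, 0.3, 0.4]))       # policy over the whole vocabulary
truncated_dist = Categorical(probs=torch.tensor([0.0, 0.0, 0.4, 0.6]))  # mass kept only on "valid" actions

policy_to_sample_from = truncated_dist
if random.random() < epsilon_truncated:     # occasionally explore the full action space
    policy_to_sample_from = full_dist
action = policy_to_sample_from.sample()
print(int(action))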
Example #6
def train():
    # parse command-line arguments
    args = get_args()
    log_dir = args.log_dir
    model_path = args.model_path
    n_episodes = args.n_episodes
    n_steps = args.n_steps

    os.makedirs(args.log_dir, exist_ok=True)

    # main processing
    # --- PRE-PROCESS ---
    # start a TensorFlow session
    sess = tf.Session()

    # create instances
    env = RubiksCubeEnv()
    st_shape, act_list =\
        env.get_state_shape(), env.get_action_list()
    agent = ActorCriticAgent(st_shape, act_list)
    memory = Memory()
    logger = HistoryLogger(log_dir)

    # initialize network variables
    _init_g = tf.global_variables_initializer()
    sess.run(_init_g)

    # restore network variables from a trained model
    if model_path:
        agent.restore_graph(sess, model_path)

    # define the header for history logging
    _header = [
        'episode', 'avg_reward', 'avg_loss', 'avg_vloss',
        'avg_aloss'
    ]
    logger.set_history_header(_header)

    # --- TRAIN MAIN ---
    # variables for monitoring metrics
    min_metric = 0.0
    list_losses, list_rewards = [], []
    start_time = time.time()

    # episode loop
    for i_episode in range(n_episodes):

        # reset the Cube environment
        env.reset()
        # randomly scramble the Cube
        _, state = env.apply_scramble_w_weight()

        # step loop
        for i_step in range(n_steps):

            # estimate an action with the agent (policy network)
            action = agent.get_action(sess, state)

            # get the reward etc. from the environment for the chosen action
            next_state, reward, done, _ = env.step(action)

            # store the experience in memory
            memory.push(state, action, reward,
                        next_state, done)

            state = next_state

            if done[0]:
                break

        # --- POST-PROCESS (EPISODE) ---
        # fetch the experience data from memory
        memory_data = memory.get_memory_data()

        # update the agent using the experience data
        _args = zip(*memory_data)
        losses = agent.update_model(sess, *_args)

        loss, vloss, aloss = losses
        _, _, _rwd, _, _ = zip(*memory_data)
        reward = _rwd

        list_losses.append([loss, vloss, aloss])
        list_rewards.append(np.mean(reward))

        # reset the memory for the next episode
        memory.reset()

        i_episode += 1
        if not i_episode % 100:

            # compute monitoring metrics
            duration = time.time() - start_time
            avg_loss, avg_vloss, avg_aloss = np.mean(
                list_losses, axis=0)
            avg_reward = np.mean(list_rewards)

            # reset the monitoring buffers
            list_losses, list_rewards = [], []
            start_time = time.time()

            # print
            log_str = 'Episode: {0:6d}/{1:6d}'.format(
                i_episode, n_episodes)
            log_str += ' - Time: {0:3.2f}'.format(
                duration)
            log_str += ' - Avg_Reward: {0:3.3f}'.format(
                avg_reward)
            log_str += ' - Avg_Loss: {0:3.5f}'.format(
                avg_loss)
            log_str += ' - Avg_VLoss: {0:3.5f}'.format(
                avg_vloss)
            log_str += ' - Avg_ALoss: {0:3.5f}'.format(
                avg_aloss)
            print(log_str)

            # model logging: keep track of the best average reward seen so far
            if not min_metric:
                min_metric = avg_reward
            min_metric = max(min_metric, avg_reward)

            if min_metric == avg_reward:
                args = [i_episode, avg_reward, avg_loss]
                agent.save_graph(sess, log_dir, args)

            # log the various monitoring metrics
            log_list = [
                i_episode, avg_reward, avg_loss,
                avg_vloss, avg_aloss
            ]
            logger.history_save(log_list)
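The loop above only relies on Memory exposing push(state, action, reward, next_state, done), get_memory_data() and reset(). A minimal list-backed buffer consistent with those calls (a sketch, not necessarily the project's actual class):

class Memory:
    """Minimal episode buffer matching the push / get_memory_data / reset calls above."""

    def __init__(self):
        self._transitions = []

    def push(self, state, action, reward, next_state, done):
        # store one transition as a tuple
        self._transitions.append((state, action, reward, next_state, done))

    def get_memory_data(self):
        # return the stored transitions; the training loop zips them into per-field batches
        return list(self._transitions)

    def reset(self):
        self._transitions = []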
Example #7
    def train(self):
        tf.reset_default_graph()
        main_net = Q_net(self.height, self.width, self.depth,
                         self.number_of_possible_actions)
        target_net = Q_net(self.height, self.width, self.depth,
                           self.number_of_possible_actions)

        init = tf.global_variables_initializer()

        e = start_e
        steps = 0
        r_all = 0
        with tf.Session() as sess:
            sess.run(init)
            if load_model == True:
                self.load(sess, model_name)
            sess.run(
                self.get_copy_var_ops(dest_scope_name="target_net",
                                      src_scope_name="main_net"))

            for i in range(episodes):
                mem = Memory(1000, self.height, self.width, self.depth)
                self.env.init_env()
                j = 0
                s = self.env.get_state()
                while j < max_episode_length:
                    j += 1
                    if np.random.rand(1) < e:
                        act = np.random.randint(
                            0, self.number_of_possible_actions)
                    else:
                        act = sess.run(
                            main_net.selected_action,
                            feed_dict={main_net.input_data_set: [s]})[0]
                    st, a, r, end = self.env.do_action(act)
                    mem.save(s, a, r, st, end)
                    steps += 1
                    if e > min_e:
                        e *= de
                    elif steps % update_freq == 0:
                        bs = min(batch_size, mem.max_index)
                        state_batch, action_batch, reward_batch, state_new_batch, end_batch = mem.load(
                            bs)

                        Q1 = sess.run(main_net.selected_action,
                                      feed_dict={
                                          main_net.input_data_set:
                                          state_new_batch
                                      })
                        Q2 = sess.run(target_net.Q,
                                      feed_dict={
                                          target_net.input_data_set:
                                          state_new_batch
                                      })
                        dQ = Q2[range(bs), Q1]
                        em = []
                        for k in range(0, bs):
                            if not end_batch[k]:
                                em.append(1)
                            else:
                                em.append(0)
                        tQ = reward_batch + (y * dQ * em)
                        _ = sess.run(main_net.updateModel, \
                            feed_dict={main_net.input_data_set: state_batch, main_net.targetQ: tQ, main_net.actions: action_batch})
                    if e <= min_e and steps % update_freq_target == 0:
                        sess.run(
                            self.get_copy_var_ops(dest_scope_name="target_net",
                                                  src_scope_name="main_net"))
                    r_all += r
                    s = st
                    if end:
                        break

                jList.append(j)
                rList.append(r_all)
                if (i % (episodes // 10) == 0):
                    self.save(sess, model_name)
                if len(rList) % 10 == 0:
                    print(steps, np.mean(rList[-10:]), e)
            self.save(sess, model_name)
        print("완료: " + str(sum(rList) / episodes))
Example #8
#!/usr/bin/env python
# encoding: utf-8
"""
@author: Young
@license: (C) Copyright 2013-2017
@contact: [email protected]
@file: test_memory.py
@time: 2018/1/16 21:37
"""
import numpy as np
from agent.memory import Memory

M = Memory()

state = np.random.normal(size=24)
action = np.random.normal(size=4)
reward = np.random.normal()
done = bool(np.random.randint(0, 2))  # plain bool: np.bool was removed from recent NumPy versions
next_state = state
for _ in range(int(1e6)):
    M(state, action, reward, done, next_state)

states, actions, rewards, next_states = M.sample(128)
print(states.shape)
print(actions.shape)
print(rewards.shape)
print(next_states.shape)
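This smoke test only assumes that a Memory instance can be called with (state, action, reward, done, next_state) to store a transition and that sample(n) returns stacked arrays of states, actions, rewards and next states. One bounded-deque implementation consistent with the test (not necessarily what agent/memory.py actually contains):

import random
from collections import deque

import numpy as np

class Memory:
    """Replay buffer consistent with the smoke test above: call it to store, sample() to draw batches."""

    def __init__(self, maxlen=int(1e6)):
        self._buffer = deque(maxlen=maxlen)

    def __call__(self, state, action, reward, done, next_state):
        self._buffer.append((state, action, reward, done, next_state))

    def sample(self, batch_size):
        batch = random.sample(list(self._buffer), batch_size)
        states, actions, rewards, dones, next_states = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states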
Example #9
            estimate_loss = (t % test_interval == 0)
            training_TD_error = agent.train(estimate_loss=estimate_loss)
        #if

        # has the episode terminated?
        if done:
            break
        #if
    #while

    if t % test_interval == 0:
        # run evaluation
        steps = []
        total_rewards = []

        test_memory = Memory(test_repeat * max_steps)
        agent.set_policy("Greedy")

        # the test is repeated several times
        for _ in range(test_repeat):

            state = environment.reset()
            total_reward = 0

            for step in range(1, max_steps + 1):
                action = agent.action(state)
                (state_dash, reward, done, info) = environment.step(action)

                total_reward += reward
                test_memory.append(state, action, reward, state_dash, done)
Example #10
class Agent(object):
    def __init__(self, state_size, action_size, action_limits=1.):
        self.state_size = state_size
        self.action_size = action_size
        self.action_limits = action_limits

        self.memory = Memory(MEMORY_SIZE)
        self.noise = Noise(action_size)

        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)

        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)

    def append(self, *args):
        self.memory.append(*args)

    def sample(self, *args):
        return self.memory.sample(*args)

    def get_exploitation_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state)))
        action = self.target_actor(state).detach()
        return action.data.numpy()

    def get_exploration_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state)))
        action = self.actor(state).detach()
        return action.data.numpy() + \
               (self.noise() * self.action_limits)

    def optimize(self, batch_size=BATCH_SIZE):
        batch = self.sample(batch_size)
        state, action, reward,  next_state =\
            [Variable(torch.from_numpy(i)) for i in batch]

        next_action = self.target_actor.forward(next_state).detach()
        next_value = torch.squeeze(
            self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))

        loss_critic = nf.smooth_l1_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()

        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        soft_update(self.target_actor, self.actor, TAU)
        soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(
            torch.load("./Models/{}_actor.pkl".format(num_episode)))
        self.critic.load_state_dict(
            torch.load("./Models/{}_critic.pkl".format(num_episode)))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        torch.save(self.target_actor.state_dict(),
                   "actor_{}.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "critic_{}.pkl".format(num_episode))
        print('Models saved successfully')
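soft_update and hard_update are not defined in this snippet; in DDPG code of this shape they are typically the Polyak blend and the full copy sketched below (assuming PyTorch modules, to match how Example #10 calls them):

import torch.nn as nn

def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter (Polyak averaging)
    for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)

def hard_update(target_net, source_net):
    # copy the source network's weights into the target network verbatim
    for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(source_param.data)

# quick check on two tiny linear layers
source, target = nn.Linear(4, 2), nn.Linear(4, 2)
hard_update(target, source)               # target now equals source
soft_update(target, source, tau=0.001)    # target moves 0.1% of the way toward source per call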