def create(self, model):
    parameters = self._parameters
    # number of actions
    actions_count = 2
    # policies (a "policy" here means greedy vs. epsilon-greedy)
    policies = self._create_policies(model, parameters, actions_count)
    # memory that stores experience
    memory = Memory(parameters["memory_size"])
    # discount factor gamma
    gamma = parameters["gamma"]
    # training does not start until replay_start_memory_size samples have accumulated
    replay_start_memory_size = parameters["replay_start_memory_size"]
    # number of samples per experience replay
    replay_count = parameters["replay_count"]
    # leave a small interval between training steps
    training_interval_steps = parameters["training_interval_steps"]
    # create the optimizer
    optimizer_parameters = (parameters["optimizer"]["alpha"],
                            parameters["optimizer"]["epsilon"])
    optimizer = optimizers.Adam(alpha=optimizer_parameters[0],
                                eps=optimizer_parameters[1])
    optimizer.setup(model)
    # procedure that updates the target model
    model_updater = SoftModelUpdater(parameters["tau"])
    # create the agent
    agent = DQNAgent(gamma, model, optimizer, model_updater, memory,
                     replay_start_memory_size, replay_count,
                     training_interval_steps, policies)
    return agent

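SoftModelUpdater is referenced above but not defined in this snippet. A minimal sketch of what a tau-weighted (soft/Polyak) target-model update could look like for Chainer links; the class name comes from the snippet, but the update() signature and body are assumptions:

class SoftModelUpdater(object):
    """Hypothetical sketch: blends online parameters into the target model."""

    def __init__(self, tau):
        self.tau = tau

    def update(self, target_model, model):  # signature assumed, not shown in the source
        # target <- tau * online + (1 - tau) * target, parameter by parameter
        for target_param, param in zip(target_model.params(), model.params()):
            target_param.data = (self.tau * param.data
                                 + (1.0 - self.tau) * target_param.data)
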
from agent.memory import Memory
from agent.agent import Agent  # assumed location of the Agent class (not shown in the original imports)
from functions import *
from preprocess_price import preprocess_price
from keras.models import clone_model
import sys
import numpy as np

if len(sys.argv) != 4:
    print("Usage: python3 train.py [stock] [window] [episodes]")
    exit()

stock_name, window_size, episode_count = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])

max_queue_size = 100
memory = Memory(max_queue_size)
agent = Agent(window_size, memory)
data = preprocess_price(stock_name)
# data = getStockDataVec(stock_name)
l = len(data) - 1
batch_size = 32
budget = 10000
errors = []
profits = []
fee = 0.2 / 100

for e in range(episode_count + 1):
    print("Episode " + str(e) + "/" + str(episode_count))
    state = getState(data, 0, window_size + 1)
    state = state.reshape((state.shape[0], state.shape[1], 1))

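getState is pulled in by the star import from functions and its body is not shown. A plausible sketch under the common formulation for this kind of trading agent, where the state is the sigmoid of consecutive price differences over the window; treat the body as an assumption rather than the repo's actual code:

def getState(data, t, n):
    # hypothetical reimplementation; the real helper lives in functions.py
    d = t - n + 1
    # pad on the left with the first price when the window starts before index 0
    block = list(data[d:t + 1]) if d >= 0 else -d * [data[0]] + list(data[0:t + 1])
    # sigmoid of each consecutive price difference
    res = [1. / (1. + np.exp(-(block[i + 1] - block[i]))) for i in range(n - 1)]
    return np.array([res])
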
class Agent:
    def __init__(self, policy, optimizer, env, writer, pretrained_lm, out_path,
                 gamma=1., lr=1e-2, grad_clip=None, scheduler=None, pretrain=False,
                 update_every=50, num_truncated=10, p_th=None, truncate_mode="top_k",
                 log_interval=10, test_envs=[], eval_no_trunc=0, alpha_logits=0.,
                 alpha_decay_rate=0., epsilon_truncated=0., train_seed=0,
                 epsilon_truncated_rate=1., is_loss_correction=1, train_metrics=[],
                 test_metrics=[], top_p=1., temperature=1, temp_factor=1,
                 temperature_step=1, temperature_min=1, temperature_max=10,
                 s_min=10, s_max=200, inv_schedule_step=0, schedule_start=1,
                 curriculum=0, KL_coeff=0., truncation_optim=0):
        self.device = policy.device
        self.policy = policy.to(self.device)
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.grad_clip = grad_clip
        self.gamma = gamma
        self.log_interval = log_interval
        self.test_envs = test_envs
        self.truncate_mode = truncate_mode
        self.alpha_logits_lm = alpha_logits
        self.alpha_decay_rate = alpha_decay_rate
        self.temperature = temperature
        self.temp_factor = temp_factor
        self.temperature_step = temperature_step
        self.temperature_min = temperature_min
        self.temperature_max = temperature_max
        self.inv_schedule_step = inv_schedule_step
        self.schedule_start = schedule_start
        self.env = env
        self.pretrain = pretrain
        self.update_every = update_every
        self.memory = Memory()
        self.num_truncated = num_truncated
        self.epsilon_truncated = epsilon_truncated
        self.epsilon_truncated_rate = epsilon_truncated_rate
        self.is_loss_correction = is_loss_correction
        self.curriculum = curriculum
        self.KL_coeff = KL_coeff
        self.truncation_optim = truncation_optim
        if self.curriculum > 0:
            self.env.update_mode(mode=env.mode, answer_sampl="random")
        p_th_ = p_th if p_th is not None else 1 / self.env.dataset.len_vocab
        if self.truncate_mode is not None:
            self.eval_trunc = ({"no_trunc": False, "with_trunc": True}
                               if eval_no_trunc else {"with_trunc": True})
            # adding the truncation class
            self.truncation = truncations[truncate_mode](
                self, num_truncated=num_truncated, p_th=p_th_,
                pretrained_lm=pretrained_lm, top_p=top_p, s_min=s_min, s_max=s_max)
        else:
            self.eval_trunc = {"no_trunc": False}
            self.truncation = truncations["no_trunc"](
                self, num_truncated=num_truncated, p_th=p_th_, top_p=top_p,
                pretrained_lm=pretrained_lm)
        self.writer = writer
        self.out_path = out_path
        self.checkpoints_path = os.path.join(out_path, "checkpoints")
        if not os.path.isdir(self.checkpoints_path):
            os.makedirs(self.checkpoints_path)
        self.generated_text = []
        self.train_metrics_names = train_metrics
        self.test_metrics_names = test_metrics
        self.init_metrics()
        self.start_episode = 1
        self.train_seed = train_seed
        if self.env.answer_sampling == "inv_frequency":
            inv_freq_answer_decoded = self.env.decode_inv_frequency()
            logger.info("---------------- INV FREQ ANSWERS DISTRIBUTION FOR ANSWER SAMPLING--------------------------------")
            logger.info(inv_freq_answer_decoded)
            logger.info("-" * 100)
        if self.env.answer_sampling == "img_sampling":
            logger.info("---------------- ANSWER / IMG STATS ---------------------------------------------------------------")
            # renamed from min/mean/max to avoid shadowing the builtins
            min_, mean_, max_ = self.env.dataset.get_answer_img_stats()
            logger.info("number MIN of answers per img:{}".format(min_))
            logger.info("number MEAN of answers per img:{}".format(mean_))
            logger.info("number MAX of answers per img:{}".format(max_))
            logger.info("-" * 100)

    def init_metrics(self):
        self.metrics = {}
        self.metrics["train"] = {
            key: metrics[key](self, train_test="train", env=self.env,
                              trunc="trunc", sampling="sampling")
            for key in self.train_metrics_names if key in metrics
        }
        for env_ in self.test_envs:
            for trunc in self.eval_trunc.keys():
                for sampling_mode in ["sampling", "greedy", "sampling_ranking_lm"]:
                    metrics_id = "_".join([env_.mode, trunc, sampling_mode])
                    self.metrics[metrics_id] = {
                        key: metrics[key](self, train_test="test", trunc=trunc,
                                          sampling=sampling_mode, env=env_)
                        for key in self.test_metrics_names if key in metrics
                    }

    def get_score_metric(self, metrics):
        score_metric = metrics["language_score"]
        return score_metric

    def get_metrics(self, mode, trunc, sampling_mode):
        metrics_id = "{}_{}_{}".format(mode, trunc, sampling_mode)
        return self.metrics[metrics_id]

    def update_per_episode(self, i_episode, alpha_min=0.001, update_every=500,
                           num_episodes_train=1000):
        if self.alpha_decay_rate > 0 and self.alpha_logits_lm > alpha_min:
            if i_episode % update_every == 0:
                self.alpha_logits_lm *= (1 - self.alpha_decay_rate)
                logger.info("decaying alpha logits parameter at Episode #{} - new value: {}".format(i_episode, self.alpha_logits_lm))
        # if i_episode == int(self.epsilon_truncated_rate * num_episodes_train) + 1:
        #     self.epsilon_truncated = 1
        #     logger.info("setting epsilon for truncation equal to 1 - starting fine-tuning with all space policy")
        self.update_temperature(i_episode)
        if i_episode == self.curriculum:
            print(self.env.answer_sampling)
            logger.info("UPDATING ANSWER SAMPLING FROM RANDOM TO UNIFORM...")
            self.env.update_mode(mode=self.env.mode, answer_sampl="uniform")
            print(self.env.answer_sampling)

    def update_temperature(self, i_episode):
        if i_episode + 1 == self.inv_schedule_step:
            self.temp_factor = 1 / self.temp_factor
            print("inversing the temperature schedule at episode {}".format(i_episode + 1))
        if (i_episode + 1) >= self.schedule_start:
            if (i_episode + 1) == self.schedule_start:
                print("starting the temperature scheduling at episode {}".format(i_episode + 1))
            if self.temp_factor < 1:
                # decaying schedule: shrink until temperature_min is reached
                if (i_episode + 1) % self.temperature_step == 0 and self.temperature > self.temperature_min:
                    self.temperature *= self.temp_factor
                    if self.temperature < self.temperature_min:
                        logger.info("LAST TEMPERATURE UPDATE at temp {}".format(self.temperature_min))
                        self.temperature = self.temperature_min
            else:
                # growing schedule: increase until temperature_max is reached
                if (i_episode + 1) % self.temperature_step == 0 and self.temperature < self.temperature_max:
                    self.temperature *= self.temp_factor
                    if self.temperature > self.temperature_max:
                        logger.info("LAST TEMPERATURE UPDATE at temp {}".format(self.temperature_max))
                        self.temperature = self.temperature_max
        self.writer.add_scalar('temperature', self.temperature, i_episode)

    def act(self, state, mode='sampling', truncation=True, forced=None, ht=None, ct=None):
        valid_actions, action_probs, logits_lm, log_probas_lm, origin_log_probs_lm = \
            self.truncation.get_valid_actions(state, truncation, temperature=self.temperature)
        alpha = self.alpha_logits_lm
        policy_dist, policy_dist_truncated, value, ht, ct = self.get_policy_distributions(
            state, valid_actions, logits_lm, alpha=alpha, ht=ht, ct=ct)
        if self.truncation_optim == 1:
            policy_dist = policy_dist_truncated
        action = self.sample_action(policy_dist=policy_dist,
                                    policy_dist_truncated=policy_dist_truncated,
                                    valid_actions=valid_actions,
                                    mode=mode, forced=forced)
        log_prob = policy_dist.log_prob(action.to(self.device)).view(-1)
        log_prob_truncated = policy_dist_truncated.log_prob(action.to(self.device)).view(-1)
        return action, log_prob, value, (valid_actions, action_probs, log_prob_truncated), \
            policy_dist, logits_lm, log_probas_lm, origin_log_probs_lm, ht, ct

    def get_policy_distributions(self, state, valid_actions, logits_lm=None,
                                 alpha=0., ht=None, ct=None):
        policy_dist, policy_dist_truncated, value, ht, ct = self.policy(
            state.text, state.img, state.answer, valid_actions=valid_actions,
            logits_lm=logits_lm, alpha=alpha, ht=ht, ct=ct)
        return policy_dist, policy_dist_truncated, value, ht, ct

    def sample_action(self, policy_dist, policy_dist_truncated, valid_actions,
                      mode='sampling', forced=None):
        policy_to_sample_from = policy_dist_truncated
        epsilon_truncated_sample = random.random()
        if epsilon_truncated_sample < self.epsilon_truncated:
            # with probability epsilon, sample from the full (untruncated) policy
            policy_to_sample_from = policy_dist
        if mode == 'forced':
            action = forced
        elif mode == 'sampling':
            action = policy_to_sample_from.sample()
        elif mode == 'greedy':
            action = torch.argmax(policy_to_sample_from.probs).view(1).detach()
        if policy_to_sample_from.probs.size() != policy_dist.probs.size():
            # map the index within valid_actions back to a vocabulary id
            action = torch.gather(valid_actions, 1, action.view(1, 1))
        return action

    def save(self, out_file):
        with open(out_file, 'wb') as f:
            torch.save(self.policy.state_dict(), f)

    def save_ckpt(self, EPOCH, loss):
        torch.save({
            'epoch': EPOCH,
            'model_state_dict': self.policy.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
        }, os.path.join(self.checkpoints_path, 'model.pt'))

    def load_ckpt(self, ckpt_path):
        checkpoint = torch.load(os.path.join(ckpt_path, 'model.pt'))
        self.policy.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        return epoch, loss

    def test(self, num_episodes=10, test_mode='sampling', test_seed=0, num_diversity=1):
        for env in self.test_envs:
            logger.info('-----------------------Starting Evaluation for {} dialog ------------------'.format(env.mode))
            self.test_env(env, num_episodes=num_episodes, test_mode=test_mode,
                          test_seed=test_seed)

    def init_hidden(self, state):
        h, c = self.policy.init_hidden_state(state)
        return h, c

    def generate_one_episode(self, timestep, i_episode, env, seed=None, train=True,
                             truncation=True, test_mode='sampling', metrics=[],
                             idx_diversity=0, num_diversity=10):
        if train or seed is None:
            state, ep_reward = env.reset(seed=seed), 0
        else:
            state, ep_reward = env.reset(i_episode=i_episode), 0
        (ht, ct) = self.init_hidden(state)
        loss = None  # ensure loss is defined even if no update is triggered this episode
        for t in range(0, env.max_len):
            forced = env.ref_question[t]
            action, log_probs, value, (valid_actions, actions_probs, log_probs_truncated), \
                dist, logits_lm, log_probas_lm, origin_log_probs_lm, new_ht, new_ct = self.act(
                    state=state, mode=test_mode, truncation=truncation,
                    forced=forced, ht=ht, ct=ct)
            new_state, (reward, closest_question, pred_answer), done, _ = env.step(action.cpu().numpy())
            if train:
                # saving reward and is_terminal
                self.memory.add_step(action, state.text[0], state.img[0], log_probs,
                                     log_probs_truncated, reward, done, value,
                                     state.answer, ht, ct, log_probas_lm)
                if self.env.reward_type == "vilbert" and done:
                    self.writer.add_scalar("vilbert_rank", pred_answer, i_episode)
            timestep += 1
            for key, metric in metrics.items():
                metric.fill(state=state, action=action, done=done, dist=dist,
                            valid_actions=valid_actions, actions_probs=actions_probs,
                            ref_question=env.ref_questions,
                            ref_questions_decoded=env.ref_questions_decoded,
                            reward=reward, closest_question=closest_question,
                            new_state=new_state, log_probs=log_probs,
                            log_probs_truncated=log_probs_truncated,
                            test_mode=test_mode, pred_answer=pred_answer,
                            i_episode=i_episode, ref_question_idx=env.ref_question_idx,
                            logits_lm=logits_lm, log_probas_lm=log_probas_lm,
                            timestep=t, origin_log_probs_lm=origin_log_probs_lm,
                            alpha=self.alpha_logits_lm, ref_answer=env.ref_answer)
            state = new_state
            ht = new_ht
            ct = new_ct
            ep_reward += reward
            # update if it is time
            if train:
                if self.update_mode == "step" and timestep % self.update_every == 0:
                    loss = self.update()
                    logger.info("UPDATING POLICY WEIGHTS...")
                    self.memory.clear_memory()
                    timestep = 0
                else:
                    loss = None
            if done:
                if train:
                    if self.update_mode == "episode" and i_episode % self.update_every == 0:
                        loss = self.update()
                        logger.info("UPDATING POLICY WEIGHTS...")
                        self.memory.clear_memory()
                    else:
                        loss = None
                break
        for key, metric in metrics.items():
            metric.compute(state=state, closest_question=closest_question,
                           img_idx=env.img_idx, reward=reward,
                           ref_question=env.ref_questions,
                           ref_questions_decoded=env.ref_questions_decoded,
                           question_idx=env.ref_question_idx, test_mode=test_mode,
                           pred_answer=pred_answer, ref_answer=env.ref_answer,
                           idx_diversity=idx_diversity, num_diversity=num_diversity)
        return state, ep_reward, closest_question, valid_actions, timestep, loss

    def test_env(self, env, num_episodes=10, test_mode='sampling', test_seed=0):
        num_diversity = 10 if test_mode == "sampling_ranking_lm" else 1
        test_mode_episode = {"greedy": "greedy", "sampling": "sampling",
                             "sampling_ranking_lm": "sampling"}
        print("temperature at test: {}".format(self.temperature))
        env.reset()  # init env
        timestep = 1
        self.policy.eval()
        for i_episode in range(num_episodes):
            logger.info('-' * 20 + 'Test Episode: {}'.format(i_episode) + '-' * 20)
            seed = i_episode if test_seed else None
            for key_trunc, trunc in self.eval_trunc.items():
                metrics = self.get_metrics(env.mode, key_trunc, test_mode)
                # loop multiple times over the same image to measure language diversity
                for i in range(num_diversity):
                    with torch.no_grad():
                        state, ep_reward, closest_question, valid_actions, timestep, _ = \
                            self.generate_one_episode(timestep=timestep, i_episode=i_episode,
                                                      env=env, seed=seed, train=False,
                                                      test_mode=test_mode_episode[test_mode],
                                                      truncation=trunc, metrics=metrics,
                                                      idx_diversity=i,
                                                      num_diversity=num_diversity)
                    for _, metric in metrics.items():
                        metric.write()
                        metric.log(valid_actions=valid_actions)
                for _, metric in metrics.items():
                    metric.write_div()
        for key_trunc in self.eval_trunc.keys():
            metrics = self.get_metrics(env.mode, key_trunc, test_mode)
            idx_to_keep = None
            if test_mode == "sampling_ranking_lm":
                language_score = metrics["language_score"]
                idx_to_keep = language_score.get_min_ppl_idxs(num_diversity)
                pd.Series(idx_to_keep).to_csv(
                    os.path.join(language_score.out_path, "metrics", "min_ppl_idx.csv"))
            for key_metric, metric in metrics.items():
                metric.post_treatment(num_episodes=num_episodes, idx_to_keep=idx_to_keep)

    def log_at_train(self, i_episode, ep_reward, state, closest_question, valid_actions):
        logger.info('-' * 20 + 'Episode {} - Img {}'.format(i_episode, self.env.img_idx) + '-' * 20)
        logger.info('Last reward: {:.2f}'.format(ep_reward))
        for key, metric in self.metrics["train"].items():
            metric.log(valid_actions=valid_actions)
            metric.write()
        logger.info("-" * 100)

    def learn(self, num_episodes=100):
        sampling_mode = "forced" if self.pretrain else "sampling"
        start_time = time.time()
        current_time = time.time()
        timestep = 1
        for i_episode in range(self.start_episode, self.start_episode + num_episodes):
            seed = i_episode if self.train_seed else None
            state, ep_reward, closest_question, valid_actions, timestep, loss = \
                self.generate_one_episode(timestep=timestep, i_episode=i_episode,
                                          env=self.env, seed=seed,
                                          metrics=self.metrics["train"],
                                          test_mode=sampling_mode)
            self.update_per_episode(i_episode=i_episode, num_episodes_train=num_episodes)
            if i_episode % self.log_interval == 0:
                self.log_at_train(i_episode=i_episode, ep_reward=ep_reward,
                                  state=state, closest_question=closest_question,
                                  valid_actions=valid_actions)
            if i_episode % 1000 == 0:
                elapsed = time.time() - current_time
                logger.info("Training time for 1000 episodes: {:5.2f}".format(elapsed))
                current_time = time.time()
                # saving checkpoint
                self.save_ckpt(EPOCH=i_episode, loss=loss)
        # to compare the discrepancy between the 'truncated' policy and the 'all space' policy
        if valid_actions is not None and "action_probs" in self.metrics["train"] \
                and "action_probs_lm" in self.metrics["train"]:
            self.writer.add_custom_scalars({
                'Train_all_probs': {
                    'action_probs': ['Multiline',
                                     ['train_action_probs',
                                      'train_action_probs_truncated',
                                      'train_action_probs_lm']]
                }
            })
        for _, metric in self.metrics["train"].items():
            metric.post_treatment(num_episodes=num_episodes)
        logger.info("total training time: {:7.2f}".format(time.time() - start_time))
        logger.info("--------------------------------------------END OF TRAINING ----------------------------------------------------")

    def compute_write_all_metrics(self, output_path, logger):
        # write test scalar metrics to csv
        logger.info("------------------------------------- test metrics statistics -----------------------------------------")
        all_metrics = {trunc: {} for trunc in self.eval_trunc.keys()}
        for key in self.test_metrics_names:
            stats_dict = {trunc: {} for trunc in self.eval_trunc.keys()}
            stats_dict_div = {trunc: {} for trunc in self.eval_trunc.keys()}
            instances_of_metric = [self.metrics[key_mode][key]
                                   for key_mode in self.metrics.keys()
                                   if key_mode != "train"]
            # per-metric stats
            for metric in instances_of_metric:
                if metric.stats:
                    for key_stat, stat in metric.stats.items():
                        stats_dict[metric.trunc]["_".join([metric.env_mode, metric.sampling, key_stat])] = stat[0]
                        if str(stat[0]) != 'nan':
                            all_metrics[metric.trunc].setdefault(key_stat, []).append(stat[0])
                if metric.stats_div:
                    # diversity stats go into the _div table
                    for key_stat, stat in metric.stats_div.items():
                        stats_dict_div[metric.trunc]["_".join([metric.env_mode, metric.sampling, key_stat])] = stat[0]
                        # all_metrics[metric.trunc].setdefault(key_stat, []).append(stat[0])
            stats_path = os.path.join(self.out_path, "stats", "{}.csv".format(key))
            div_path = os.path.join(self.out_path, "stats", "{}_div.csv".format(key))
            pd.DataFrame(data=stats_dict).to_csv(stats_path)
            pd.DataFrame(data=stats_dict_div).to_csv(div_path)
        # average each stat across modes and sampling schemes
        for trunc in all_metrics.keys():
            for key_s in all_metrics[trunc].keys():
                if len(all_metrics[trunc][key_s]) > 0:
                    all_metrics[trunc][key_s] = np.round(np.mean(all_metrics[trunc][key_s]), decimals=3)
        stats_path = os.path.join(self.out_path, "all_metrics.csv")
        pd.DataFrame(data=all_metrics).to_csv(stats_path)

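The Memory() this Agent instantiates is defined elsewhere. A minimal sketch consistent with the calls made above (add_step receives twelve positional values per step; clear_memory empties the buffer after each policy update); the field names are assumptions, only the interface is taken from the snippet:

class Memory(object):
    """Hypothetical sketch of the on-policy rollout buffer used by Agent."""

    FIELDS = ["actions", "states_text", "states_img", "logprobs",
              "logprobs_truncated", "rewards", "is_terminals", "values",
              "answers", "ht", "ct", "logprobs_lm"]

    def __init__(self):
        self.clear_memory()

    def add_step(self, *step):
        # one positional value per field, in FIELDS order
        for field, value in zip(self.FIELDS, step):
            getattr(self, field).append(value)

    def clear_memory(self):
        for field in self.FIELDS:
            setattr(self, field, [])
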
def train():
    # parse the arguments
    args = get_args()
    log_dir = args.log_dir
    model_path = args.model_path
    n_episodes = args.n_episodes
    n_steps = args.n_steps
    os.makedirs(args.log_dir, exist_ok=True)

    # main process
    # --- PRE-PROCESS ---
    # start the session
    sess = tf.Session()
    # create the instances
    env = RubiksCubeEnv()
    st_shape, act_list = env.get_state_shape(), env.get_action_list()
    agent = ActorCriticAgent(st_shape, act_list)
    memory = Memory()
    logger = HistoryLogger(log_dir)
    # initialize the network variables
    _init_g = tf.global_variables_initializer()
    sess.run(_init_g)
    # restore the network variables from a pretrained model
    if model_path:
        agent.restore_graph(sess, model_path)
    # header definition for history logging
    _header = ['episode', 'avg_reward', 'avg_loss', 'avg_vloss', 'avg_aloss']
    logger.set_history_header(_header)

    # --- TRAIN MAIN ---
    # variables for monitoring metrics
    min_metric = 0.0
    list_losses, list_rewards = [], []
    start_time = time.time()
    # episode loop
    for i_episode in range(n_episodes):
        # reset the Cube environment
        env.reset()
        # random scramble of the Cube
        _, state = env.apply_scramble_w_weight()
        # step loop
        for i_step in range(n_steps):
            # action estimate from the agent (policy network)
            action = agent.get_action(sess, state)
            # get the reward etc. from the environment for the chosen action
            next_state, reward, done, _ = env.step(action)
            # store the experience in memory
            memory.push(state, action, reward, next_state, done)
            state = next_state
            if done[0]:
                break

        # --- POST-PROCESS (EPISODE) ---
        # fetch the experience data from memory
        memory_data = memory.get_memory_data()
        # update the agent with the experience data
        _args = zip(*memory_data)
        losses = agent.update_model(sess, *_args)
        loss, vloss, aloss = losses
        _, _, _rwd, _, _ = zip(*memory_data)
        reward = _rwd
        list_losses.append([loss, vloss, aloss])
        list_rewards.append(np.mean(reward))
        # reset the memory for the next episode
        memory.reset()

        i_episode += 1
        if not i_episode % 100:
            # compute the monitoring metrics
            duration = time.time() - start_time
            avg_loss, avg_vloss, avg_aloss = np.mean(list_losses, axis=0)
            avg_reward = np.mean(list_rewards)
            # reset the monitoring buffers
            list_losses, list_rewards = [], []
            start_time = time.time()
            # print
            log_str = 'Episode: {0:6d}/{1:6d}'.format(i_episode, n_episodes)
            log_str += ' - Time: {0:3.2f}'.format(duration)
            log_str += ' - Avg_Reward: {0:3.3f}'.format(avg_reward)
            log_str += ' - Avg_Loss: {0:3.5f}'.format(avg_loss)
            log_str += ' - Avg_VLoss: {0:3.5f}'.format(avg_vloss)
            log_str += ' - Avg_ALoss: {0:3.5f}'.format(avg_aloss)
            print(log_str)
            # model logging
            if not min_metric:
                min_metric = avg_reward
            min_metric = max(min_metric, avg_reward)
            if min_metric == avg_reward:  # value comparison, not identity
                args = [i_episode, avg_reward, avg_loss]
                agent.save_graph(sess, log_dir, args)
            # log the monitoring metrics
            log_list = [i_episode, avg_reward, avg_loss, avg_vloss, avg_aloss]
            logger.history_save(log_list)

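The Memory used by this training loop exposes push/get_memory_data/reset, none of which are shown. A minimal sketch of an episodic buffer matching that interface; the implementation is an assumption, only the method names and the 5-tuple transition layout come from the snippet:

class Memory(object):
    """Hypothetical sketch: collects one episode of transitions for an on-policy update."""

    def __init__(self):
        self._data = []

    def push(self, state, action, reward, next_state, done):
        # store one (s, a, r, s', done) transition
        self._data.append((state, action, reward, next_state, done))

    def get_memory_data(self):
        # returned as a list of 5-tuples, so zip(*memory_data) yields the columns
        return list(self._data)

    def reset(self):
        self._data = []
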
def train(self):
    tf.reset_default_graph()
    main_net = Q_net(self.height, self.width, self.depth,
                     self.number_of_possible_actions)
    target_net = Q_net(self.height, self.width, self.depth,
                       self.number_of_possible_actions)
    init = tf.global_variables_initializer()
    e = start_e
    steps = 0
    r_all = 0
    with tf.Session() as sess:
        sess.run(init)
        if load_model:
            self.load(sess, model_name)
        sess.run(self.get_copy_var_ops(dest_scope_name="target_net",
                                       src_scope_name="main_net"))
        for i in range(episodes):
            mem = Memory(1000, self.height, self.width, self.depth)
            self.env.init_env()
            j = 0
            s = self.env.get_state()
            while j < max_episode_length:
                j += 1
                # epsilon-greedy action selection
                if np.random.rand(1) < e:
                    act = np.random.randint(0, self.number_of_possible_actions)
                else:
                    act = sess.run(main_net.selected_action,
                                   feed_dict={main_net.input_data_set: [s]})[0]
                st, a, r, end = self.env.do_action(act)
                mem.save(s, a, r, st, end)
                steps += 1
                if e > min_e:
                    e *= de
                elif steps % update_freq == 0:
                    bs = min(batch_size, mem.max_index)
                    state_batch, action_batch, reward_batch, state_new_batch, end_batch = mem.load(bs)
                    # Double DQN target: action from the main net, value from the target net
                    Q1 = sess.run(main_net.selected_action,
                                  feed_dict={main_net.input_data_set: state_new_batch})
                    Q2 = sess.run(target_net.Q,
                                  feed_dict={target_net.input_data_set: state_new_batch})
                    dQ = Q2[range(bs), Q1]
                    # mask out bootstrapped values for terminal transitions
                    em = []
                    for k in range(0, bs):
                        if not end_batch[k]:
                            em.append(1)
                        else:
                            em.append(0)
                    # y is the discount factor defined at module level
                    tQ = reward_batch + (y * dQ * em)
                    _ = sess.run(main_net.updateModel,
                                 feed_dict={main_net.input_data_set: state_batch,
                                            main_net.targetQ: tQ,
                                            main_net.actions: action_batch})
                if e <= min_e and steps % update_freq_target == 0:
                    sess.run(self.get_copy_var_ops(dest_scope_name="target_net",
                                                   src_scope_name="main_net"))
                r_all += r
                s = st
                if end:
                    break
            jList.append(j)
            rList.append(r_all)
            if (i % (episodes // 10) == 0):
                self.save(sess, model_name)
            if len(rList) % 10 == 0:
                print(steps, np.mean(rList[-10:]), e)
        self.save(sess, model_name)
        print("Done: " + str(sum(rList) / episodes))

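get_copy_var_ops is not shown above. A sketch of the usual TF1 pattern it likely follows, building assign ops that copy trainable variables from the main network's scope into the target network's scope (assumed, not the repo's actual code; requires TF 1.x with import tensorflow as tf):

def get_copy_var_ops(self, dest_scope_name, src_scope_name):
    # hypothetical sketch of the helper used in train() above
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder
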
#!/usr/bin/env python
# encoding: utf-8
"""
@author: Young
@license: (C) Copyright 2013-2017
@contact: [email protected]
@file: test_memory.py
@time: 2018/1/16 21:37
"""
import numpy as np

from agent.memory import Memory

M = Memory()
state = np.random.normal(size=24)
action = np.random.normal(size=4)
reward = np.random.normal()
done = bool(np.random.randint(0, 2))  # plain bool; np.bool is deprecated
next_state = state

for _ in range(int(1e6)):
    M(state, action, reward, done, next_state)

states, actions, rewards, next_states = M.sample(128)
print(states.shape)
print(actions.shape)
print(rewards.shape)
print(next_states.shape)

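The Memory class under test lives in agent.memory and is not shown. A minimal sketch consistent with the usage above, where the instance is callable to append a transition and sample returns four batched arrays, dropping the done flags; this is an assumption about the real class, not its actual code:

import random
from collections import deque

import numpy as np

class Memory(object):
    """Hypothetical sketch: bounded replay buffer, callable to append."""

    def __init__(self, maxlen=int(1e6)):
        self.buffer = deque(maxlen=maxlen)

    def __call__(self, state, action, reward, done, next_state):
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, dones, next_states = map(np.array, zip(*batch))
        # the test script unpacks four values, so dones are dropped here
        return states, actions, rewards, next_states
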
            estimate_loss = (t % test_interval == 0)
            training_TD_error = agent.train(estimate_loss=estimate_loss)
            # end if
            # end of the episode?
            if done:
                break
            # end if
        # end while

        if t % test_interval == 0:
            # run a test
            steps = []
            total_rewards = []
            test_memory = Memory(test_repeat * max_steps)
            agent.set_policy("Greedy")
            # the test is repeated several times
            for _ in range(test_repeat):
                state = environment.reset()
                total_reward = 0
                for step in range(1, max_steps + 1):
                    action = agent.action(state)
                    (state_dash, reward, done, info) = environment.step(action)
                    total_reward += reward
                    test_memory.append(state, action, reward, state_dash, done)

class Agent(object):
    def __init__(self, state_size, action_size, action_limits=1.):
        self.state_size = state_size
        self.action_size = action_size
        self.action_limits = action_limits
        self.memory = Memory(MEMORY_SIZE)
        self.noise = Noise(action_size)
        self.actor = ActorNet(state_size, action_size)
        self.target_actor = deepcopy(self.actor)
        self.actor_optimizer = Adam(self.actor.parameters(), LEARNING_RATE)
        self.critic = CriticNet(state_size, action_size)
        self.target_critic = deepcopy(self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), LEARNING_RATE)

    def append(self, *args):
        self.memory.append(*args)

    def sample(self, *args):
        return self.memory.sample(*args)

    def get_exploitation_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state)))
        action = self.target_actor(state).detach()
        return action.data.numpy()

    def get_exploration_policy(self, state):
        state = Variable(torch.from_numpy(np.float32(state)))
        action = self.actor(state).detach()
        return action.data.numpy() + (self.noise() * self.action_limits)

    def optimize(self, batch_size=BATCH_SIZE):
        batch = self.sample(batch_size)
        state, action, reward, next_state = \
            [Variable(torch.from_numpy(i)) for i in batch]
        # critic update: one-step TD target built from the target networks
        next_action = self.target_actor.forward(next_state).detach()
        next_value = torch.squeeze(self.target_critic(next_state, next_action).detach())
        target_value = reward + GAMMA * next_value
        value = torch.squeeze(self.critic(state, action))
        loss_critic = nf.smooth_l1_loss(value, target_value)
        self.critic_optimizer.zero_grad()
        loss_critic.backward()
        self.critic_optimizer.step()
        # actor update: ascend the critic's value of the actor's actions
        policy_action = self.actor(state)
        loss_actor = -1 * torch.sum(self.critic(state, policy_action))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()
        # Polyak-average the target networks toward the online networks
        soft_update(self.target_actor, self.actor, TAU)
        soft_update(self.target_critic, self.critic, TAU)

    def restore_models(self, num_episode):
        self.actor.load_state_dict(
            torch.load("./Models/{}_actor.pkl".format(num_episode)))
        self.critic.load_state_dict(
            torch.load("./Models/{}_critic.pkl".format(num_episode)))
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def save_models(self, num_episode):
        torch.save(self.target_actor.state_dict(),
                   "actor_{}.pkl".format(num_episode))
        torch.save(self.target_critic.state_dict(),
                   "critic_{}.pkl".format(num_episode))
        print('Models saved successfully')

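soft_update and hard_update are imported from elsewhere in this DDPG agent's repo. A sketch of the standard PyTorch helpers it presumably uses; the bodies are assumptions, though the Polyak form is the conventional one for DDPG target networks:

import torch

def soft_update(target, source, tau):
    # target <- tau * source + (1 - tau) * target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

def hard_update(target, source):
    # target <- source (exact copy, used once after restoring weights)
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)
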