            ep_reward += reward  # accumulate the raw episode return
            one_ep_transition.append((state, action, reward))  # record the trajectory for the MC update
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        # exponential moving average of episode rewards for a smoother learning curve
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        agent.update(one_ep_transition)  # first-visit MC update over the completed episode
        if (i_episode + 1) % 10 == 0:
            print("Episode:{}/{}, Reward:{}".format(i_episode + 1, mc_cfg.n_episodes, ep_reward))
    return rewards, ma_rewards


if __name__ == "__main__":
    mc_cfg = MCConfig()
    env = RacetrackEnv()
    n_actions = 9
    agent = FisrtVisitMC(n_actions, mc_cfg)
    rewards, ma_rewards = mc_train(mc_cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train",
                 algo="On-Policy First-Visit MC Control", path=RESULT_PATH)
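# The loop above hands the whole trajectory to agent.update. A minimal sketch of
# what an on-policy first-visit MC control update can look like, assuming a
# tabular Q defaultdict and a discount factor gamma -- illustrative only; the
# class name and attributes below are NOT taken from the repo's FisrtVisitMC:
from collections import defaultdict

import numpy as np


class FirstVisitMCSketch:
    def __init__(self, n_actions, gamma=0.9):
        self.gamma = gamma
        self.Q = defaultdict(lambda: np.zeros(n_actions))  # state -> action values
        self.visit_count = defaultdict(int)  # (state, action) -> number of first visits

    def update(self, one_ep_transition):
        # Index of the first occurrence of each (state, action) pair.
        first_visit = {}
        for t, (state, action, _) in enumerate(one_ep_transition):
            first_visit.setdefault((state, action), t)
        # Walk the episode backwards, accumulating the discounted return G.
        G = 0.0
        for t in reversed(range(len(one_ep_transition))):
            state, action, reward = one_ep_transition[t]
            G = self.gamma * G + reward
            if first_visit[(state, action)] == t:  # update only on the first visit
                self.visit_count[(state, action)] += 1
                n = self.visit_count[(state, action)]
                self.Q[state][action] += (G - self.Q[state][action]) / n  # incremental mean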
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = HierarchicalDQNConfig()
    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)  # create result and model folders
    agent.save(path=cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=cfg.result_path)
    # eval
    env, agent = env_agent_config(cfg, seed=10)  # fresh env with a different seed
    agent.load(path=cfg.model_path)
    rewards, ma_rewards = eval(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="eval", env=cfg.env, algo=cfg.algo,
                 path=cfg.result_path)
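# The moving-average idiom above recurs in every training and evaluation script
# in this section; it could be factored into one helper. A minimal sketch --
# the name smooth and the alpha parameter are illustrative, not from the repo:
def smooth(ep_reward, ma_rewards, alpha=0.1):
    """Append an exponentially weighted moving average of episode rewards.

    The previous average keeps weight (1 - alpha), so the curve lags the raw
    rewards but is far less noisy when plotted.
    """
    if ma_rewards:
        ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * ep_reward)
    else:
        ma_rewards.append(ep_reward)
    return ma_rewards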
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
    print('Finished testing!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = DQNConfig()
    plot_cfg = PlotConfig()
    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for results and model
    agent.save(path=plot_cfg.model_path)  # save the model
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)  # save results
    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train")  # plot results
    # test
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=plot_cfg.model_path)  # load the trained model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save results
    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test")  # plot results
        ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # Set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(state_dim, action_dim, max_action, cfg)
    cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
    td3.load(cfg.model_path)
    td3_rewards, td3_ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(td3_rewards, td3_ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards({
        'td3_rewards': td3_rewards,
        'td3_ma_rewards': td3_ma_rewards,
    }, tag="eval", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env, agent, cfg.seed)
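# Note that eval(...) above shadows Python's built-in eval; a name such as
# evaluate_policy avoids the clash. The function body is not shown in this
# section, but a typical TD3 evaluation loop looks roughly like the sketch
# below (choose_action and the +100 seed offset are assumptions, not the
# repo's actual API; gym is assumed imported as in the script above):
def evaluate_policy(env_name, agent, seed, eval_episodes=10):
    """Run the trained policy without exploration noise; return mean reward."""
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)  # offset so evaluation episodes differ from training
    total_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = agent.choose_action(state)  # deterministic actor output (assumed method name)
            state, reward, done, _ = eval_env.step(action)
            total_reward += reward
    return total_reward / eval_episodes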
        if i_ep + 1 >= cfg.epsilon_start:  # update only once i_ep+1 reaches cfg.epsilon_start
            agent.update()
        if (i_ep + 1) % 10 == 0:
            print('Episode: {}/{}, Reward: {:.2f}'.format(i_ep + 1, cfg.train_eps, ep_reward))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Finished training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    plot_cfg = PlotConfig()
    env = gym.make(cfg.env_name)
    env.seed(1)  # random seed
    torch.manual_seed(1)
    np.random.seed(1)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    agent = TD3(state_dim, action_dim, max_action, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)
    agent.save(path=plot_cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
                # this replay buffer stores the low-level data for every step
                agent.memory.push(goal_state, action, intrinsic_reward,
                                  np.concatenate([next_state, onehot_goal]), done)
                state = next_state
                agent.update()
            # this buffer stores one transition each time the high-level goal is reached
            agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
        print('Episode: {}/{}, Reward: {}, Loss: {:.2f}, Meta_Loss: {:.2f}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, agent.loss_numpy, agent.meta_loss_numpy))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.seed(1)
    cfg = HierarchicalDQNConfig()
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_rewards(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_losses(agent.losses, cfg.algo, RESULT_PATH)
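# The two buffers reflect h-DQN's two time scales: the low-level controller
# learns from per-step intrinsic rewards conditioned on a goal, while the
# meta-controller learns from the extrinsic reward accumulated per goal.
# A minimal sketch of the goal encoding -- to_onehot and the goal_reached
# predicate are illustrative names, not taken from the repo:
import numpy as np


def to_onehot(goal, n_goals):
    """One-hot encode the meta-controller's goal so it can be concatenated
    onto the raw state fed to the low-level controller."""
    onehot = np.zeros(n_goals, dtype=np.float32)
    onehot[goal] = 1.0
    return onehot


def intrinsic_reward(goal_reached):
    """The controller is paid only for achieving the meta-controller's goal."""
    return 1.0 if goal_reached else 0.0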
        print('------------ DRY RUN ------------')
        continue
    run_model_training(trn_datagen_flow, vld_datagen_flow, mdl_config[MODEL_ARCH],
                       results=results[BS_KEY],
                       epochs=EP,
                       learning_rate=LR,
                       batch_size=BATCH_SIZE,
                       training_batches=trn_bpe,
                       validation_batches=val_bpe,
                       reload=reload,
                       ckpt_file=load_ckpt)
    save_results(args.results_dir + results_filename, results)
    reload = True

##----------------------------------------------------------------------------------------------
## If in debug mode write stdout intercepted IO to output file
##----------------------------------------------------------------------------------------------
end_time = datetime.now()  ## .strftime("%m-%d-%Y @ %H:%M:%S")
# if args.sysout in ['ALL']:
#     print(' --> Execution ended at:', end_time)
#     sys.stdout.flush()
#     f_obj.close()
#     sys.stdout = sys.__stdout__
#     print(' Run information written to ', sysout_name)
print('\n Execution time :', end_time - start_time)
print('\n --> Execution ended at:', end_time)
exit(' Execution terminated ')