Example #1
            ep_reward += reward
            one_ep_transition.append((state, action, reward))
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        agent.update(one_ep_transition)
        if (i_episode + 1) % 10 == 0:
            print("Episode:{}/{}: Reward:{}".format(i_episode + 1,
                                                    mc_cfg.n_episodes,
                                                    ep_reward))
    return rewards, ma_rewards


if __name__ == "__main__":
    mc_cfg = MCConfig()
    env = RacetrackEnv()
    n_actions = 9
    agent = FisrtVisitMC(n_actions, mc_cfg)
    rewards, ma_rewards = mc_train(mc_cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 algo="On-Policy First-Visit MC Control",
                 path=RESULT_PATH)
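
All of these examples track two reward series: the raw per-episode return and a smoothed curve built with the same exponential moving average (new value = 0.9 * previous + 0.1 * episode reward). A minimal stand-alone sketch of that smoothing step, using the hypothetical helper name smooth_rewards (not part of the original code):

def smooth_rewards(rewards, alpha=0.1):
    # Exponentially smooth a list of episode rewards, mirroring the inline
    # ma_rewards bookkeeping used in the training loops above and below.
    ma_rewards = []
    for ep_reward in rewards:
        if ma_rewards:
            ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return ma_rewards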
Example #2
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = HierarchicalDQNConfig()

    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)
    agent.save(path=cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 algo=cfg.algo,
                 path=cfg.result_path)
    # eval
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=cfg.model_path)
    rewards, ma_rewards = eval(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="eval",
                 env=cfg.env,
                 algo=cfg.algo,
                 path=cfg.result_path)
Example #3
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
    print('完成测试!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = DQNConfig()
    plot_cfg = PlotConfig()
    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for results and models
    agent.save(path=plot_cfg.model_path)  # save the model
    save_results(rewards, ma_rewards, tag='train',
                 path=plot_cfg.result_path)  # save results
    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train")  # plot results
    # test
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=plot_cfg.model_path)  # load the model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test',
                 path=plot_cfg.result_path)  # save results
    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test")  # plot results
Example #4
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # Set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(state_dim, action_dim, max_action, cfg)
    cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
    td3.load(cfg.model_path)
    td3_rewards, td3_ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(td3_rewards, td3_ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(
        {
            'td3_rewards': td3_rewards,
            'td3_ma_rewards': td3_ma_rewards,
        },
        tag="eval",
        env=cfg.env,
        algo=cfg.algo,
        path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env,agent, cfg.seed)
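
The eval function used in Example #4 is not part of the listing. A plausible evaluation loop for the loaded TD3 agent, following the reward bookkeeping seen in the other examples (the episode count and the choose_action method name are assumptions):

import gym

def eval(env_name, agent, seed, eval_eps=10):
    # Assumed evaluation loop: run the deterministic policy without exploration noise.
    env = gym.make(env_name)
    env.seed(seed)
    rewards, ma_rewards = [], []
    for i_ep in range(eval_eps):
        state, done, ep_reward = env.reset(), False, 0.0
        while not done:
            action = agent.choose_action(state)      # assumed method name
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards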
Example #5
            if i_ep + 1 >= cfg.epsilon_start:
                agent.update()
        if (i_ep + 1) % 10 == 0:
            print('Episode: {}/{}, Reward: {:.2f}'.format(i_ep + 1, cfg.train_eps,
                                                          ep_reward))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Finished training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    plot_cfg = PlotConfig()
    env = gym.make(cfg.env_name)
    env.seed(1)  # random seed
    torch.manual_seed(1)
    np.random.seed(1)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    agent = TD3(state_dim, action_dim, max_action, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)
    agent.save(path=plot_cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
Example #6
                # this replay buffer stores one transition for every low-level step
                agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, onehot_goal]), done)
                state = next_state
                agent.update()
            # the meta replay buffer stores one transition each time the high-level goal is reached
            agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
        print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward, agent.loss_numpy, agent.meta_loss_numpy))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.seed(1)
    cfg = HierarchicalDQNConfig()
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_rewards(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_losses(agent.losses, cfg.algo, RESULT_PATH)
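
Example #6 gives the low-level controller a goal-conditioned input: the raw state concatenated with a one-hot encoding of the goal chosen by the meta controller (the onehot_goal used above). A minimal sketch of that encoding step, with the hypothetical helper name to_onehot:

import numpy as np

def to_onehot(goal, n_goals):
    # One-hot encode a discrete goal so it can be concatenated onto the raw state.
    onehot = np.zeros(n_goals, dtype=np.float32)
    onehot[goal] = 1.0
    return onehot

# goal-conditioned input for the low-level controller, as used in the loop above:
# goal_state = np.concatenate([state, to_onehot(goal, n_goals)])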
Example #7
            print('------------ DRY RUN ------------')
            continue

        run_model_training( trn_datagen_flow,
                            vld_datagen_flow, 
                            mdl_config[MODEL_ARCH],
                            results = results[BS_KEY],
                            epochs = EP, 
                            learning_rate = LR, 
                            batch_size = BATCH_SIZE, 
                            training_batches = trn_bpe, 
                            validation_batches = val_bpe, 
                            reload = reload,
                            ckpt_file = load_ckpt)

        save_results(args.results_dir + results_filename, results )
        reload = True
        
##----------------------------------------------------------------------------------------------
## If in debug mode write stdout intercepted IO to output file
##----------------------------------------------------------------------------------------------            
end_time = datetime.now()     ## .strftime("%m-%d-%Y @ %H:%M:%S")
# if args.sysout in  ['ALL']:
    # print(' --> Execution ended at:', end_time)
    # sys.stdout.flush()
    # f_obj.close()    
    # sys.stdout = sys.__stdout__
    # print(' Run information written to ', sysout_name)    
print('\n Execution time :', end_time - start_time)
print('\n --> Execution ended at:',end_time)
exit(' Execution terminated ' )