Example #1
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = HierarchicalDQNConfig()

    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)
    agent.save(path=cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 algo=cfg.algo,
                 path=cfg.result_path)
    # eval
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=cfg.model_path)
    rewards, ma_rewards = eval(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="eval",
                 env=cfg.env,
                 algo=cfg.algo,
                 path=cfg.result_path)
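The driver above relies on utility helpers such as make_dir and save_results that are not shown in the snippet. A minimal sketch of what they might look like (the .npy file layout here is an assumption, not the project's actual implementation):

import os
import numpy as np

def make_dir(*paths):
    # create every requested output directory if it does not exist yet
    for path in paths:
        os.makedirs(path, exist_ok=True)

def save_results(rewards, ma_rewards, tag='train', path='./results/'):
    # store the raw and moving-average episode rewards as .npy files (assumed layout)
    np.save(os.path.join(path, f'{tag}_rewards.npy'), rewards)
    np.save(os.path.join(path, f'{tag}_ma_rewards.npy'), ma_rewards)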
Example #2
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, i_step, done))
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the moving average of the episode reward
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = DQNConfig()
    env = gym.make('CartPole-v0').unwrapped  # you can google why gym envs get unwrapped; it is usually not needed here
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states, n_actions, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 algo=cfg.algo,
                 path=RESULT_PATH)
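The reward smoothing in the loop above is an exponential moving average: each new entry mixes 90% of the previous average with 10% of the latest episode reward. The same logic, factored into a standalone helper for clarity (illustrative only; alpha=0.1 mirrors the hard-coded weights above):

def update_moving_average(ma_rewards, ep_reward, alpha=0.1):
    # exponentially smoothed reward: new = (1 - alpha) * previous + alpha * current
    if ma_rewards:
        ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * ep_reward)
    else:
        ma_rewards.append(ep_reward)
    return ma_rewards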
Example #3
            ep_reward += reward
            one_ep_transition.append((state, action, reward))
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
        agent.update(one_ep_transition)
        if (i_episode + 1) % 10 == 0:
            print("Episode:{}/{}: Reward:{}".format(i_episode + 1,
                                                    mc_cfg.n_episodes,
                                                    ep_reward))
    return rewards, ma_rewards


if __name__ == "__main__":
    mc_cfg = MCConfig()
    env = RacetrackEnv()
    n_actions = 9
    agent = FisrtVisitMC(n_actions, mc_cfg)
    rewards, ma_rewards = mc_train(mc_cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 algo="On-Policy First-Visit MC Control",
                 path=RESULT_PATH)
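agent.update(one_ep_transition) is defined elsewhere in the project; for on-policy first-visit MC control it typically walks the episode backwards, accumulates discounted returns, and folds the return of each (state, action) pair's first visit into a Q table. A rough sketch under those assumptions (the defaultdict tables, hashable states, and gamma=0.9 are hypothetical choices, not the project's code):

from collections import defaultdict

Q = defaultdict(float)            # action-value estimates, keyed by (state, action)
returns_count = defaultdict(int)  # number of sampled returns per (state, action)

def first_visit_mc_update(one_ep_transition, gamma=0.9):
    # one_ep_transition: list of (state, action, reward) tuples for one episode
    G = 0.0
    first_visit_returns = {}
    for state, action, reward in reversed(one_ep_transition):
        G = gamma * G + reward
        # overwriting while walking backwards leaves the return of the FIRST visit
        first_visit_returns[(state, action)] = G
    for key, ret in first_visit_returns.items():
        returns_count[key] += 1
        # incremental mean of the sampled returns
        Q[key] += (ret - Q[key]) / returns_count[key]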
Example #4
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # Set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(state_dim, action_dim, max_action, cfg)
    cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
    td3.load(cfg.model_path)
    td3_rewards, td3_ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(td3_rewards, td3_ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(
        {
            'td3_rewards': td3_rewards,
            'td3_ma_rewards': td3_ma_rewards,
        },
        tag="eval",
        env=cfg.env,
        algo=cfg.algo,
        path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env,agent, cfg.seed)
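The eval function called above is not part of the snippet; a minimal evaluation loop that matches the eval(cfg.env, td3, cfg.seed) call might look like the following (the episode count, the seed offset, and the choose_action method name are assumptions about the agent interface):

import gym

def eval(env_name, agent, seed, eval_eps=10):
    # roll out the trained policy for a few episodes and record the rewards
    env = gym.make(env_name)
    env.seed(seed + 100)  # offset so evaluation episodes differ from training
    rewards, ma_rewards = [], []
    for _ in range(eval_eps):
        state, done, ep_reward = env.reset(), False, 0.0
        while not done:
            action = agent.choose_action(state)  # assumed deterministic action selection
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        rewards.append(ep_reward)
        # same exponential smoothing as in the training loops above
        ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward if ma_rewards else ep_reward)
    return rewards, ma_rewards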
Example #5
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")  # check for GPU
            self.result_path = curr_path+"/outputs/" + self.env_name + \
                '/'+curr_time+'/results/'  # path for saving results
            self.model_path = curr_path+"/outputs/" + self.env_name + \
                '/'+curr_time+'/models/'  # path for saving models
            self.save = True  # whether to save figures

    def env_agent_config(cfg, seed=1):
        env = gym.make(cfg.env_name)
        env.seed(seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        agent = PPO(state_dim, action_dim, cfg)
        return env, agent

    cfg = PPOConfig()
    plot_cfg = PlotConfig()
    # train
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for results and models
    agent.save(path=plot_cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
    # evaluate
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=plot_cfg.model_path)
    rewards, ma_rewards = eval(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='eval', path=plot_cfg.result_path)
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="eval")
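Unlike the earlier examples, plot_rewards here receives the whole PlotConfig object instead of individual keyword arguments. A matplotlib sketch consistent with that call, using only attributes visible in the config above (the figure title and file name are assumptions):

import matplotlib.pyplot as plt

def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'):
    # plot the raw episode rewards together with the smoothed curve
    plt.figure()
    plt.title(f'{tag} rewards of PPO on {plot_cfg.env_name}')
    plt.xlabel('episodes')
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='ma rewards')
    plt.legend()
    if plot_cfg.save:
        plt.savefig(plot_cfg.result_path + f'{tag}_rewards_curve.png')
    plt.show()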
Example #6
            # reward_pool.append(reward)
            state = next_state
            if done:
                print('Episode:', i_episode, 'Reward:', ep_reward)
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print("Complete Evaluating!")
    return rewards, ma_rewards


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.seed(1)  # seed() fixes the starting integer of the random number generator; the same seed always yields the same random sequence, and it only needs to be set once
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    cfg = PGConfig()
    agent = PolicyGradient(state_dim, cfg, device)
    # rewards, ma_rewards = eval(cfg, env, agent)
    rewards, ma_rewards = train(cfg, env, agent)
    agent.save_model(SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards,
                 ma_rewards,
                 tag='train',
                 algo="Policy Gradient",
                 path=RESULT_PATH)
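The commented-out reward_pool and the PolicyGradient agent's update step are not shown. REINFORCE-style updates usually turn the collected rewards into discounted, normalized returns before scaling the log-probability loss; a sketch of that return computation (gamma=0.99 and the normalization are common defaults, not necessarily the project's):

import torch

def compute_returns(reward_pool, gamma=0.99, eps=1e-8):
    # discounted return for every step, accumulated from the end of the episode
    returns, G = [], 0.0
    for r in reversed(reward_pool):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns, dtype=torch.float32)
    # normalize to zero mean / unit variance to reduce gradient variance
    return (returns - returns.mean()) / (returns.std() + eps)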
Example #7
File: main.py  Project: YiPeng98/DRL-L
                # this replay buffer stores the low-level transition for every step
                agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, onehot_goal]), done)
                state = next_state
                agent.update()
            # this buffer stores one transition each time the high-level goal is reached
            agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
        print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward, agent.loss_numpy, agent.meta_loss_numpy))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Complete training!')
    return rewards, ma_rewards

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.seed(1)
    cfg = HierarchicalDQNConfig()
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_rewards(rewards, ma_rewards, 'train', RESULT_PATH)
    plot_losses(agent.losses, cfg.algo, RESULT_PATH)
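onehot_goal in the low-level transition above is the high-level goal encoded as a one-hot vector, so the controller's input is the concatenation of the raw state and the current goal. A minimal sketch of that encoding (the goal dimension matching state_dim is an assumption):

import numpy as np

def to_onehot(goal, goal_dim):
    # encode the discrete high-level goal index as a one-hot vector
    onehot = np.zeros(goal_dim, dtype=np.float32)
    onehot[goal] = 1.0
    return onehot

# e.g. goal_state = np.concatenate([state, to_onehot(goal, state_dim)])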




Example #8
File: main.py  Project: YiPeng98/DRL-L
            ep_reward += reward
            ep_steps += 1
            if done:
                break
        rewards.append(ep_reward)
        steps.append(ep_steps)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 +
                              ep_reward * 0.1)  # moving-average reward: weighted mix of the previous average and this episode's reward
        else:
            ma_rewards.append(ep_reward)
        print("Episode:{}/{}; reward:{}".format(i_episode + 1, cfg.train_eps,
                                                ep_reward))
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = QlearningConfig()
    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    action_dim = env.env.action_space.n  # env is wrapped by CliffWalkingWapper, so the inner env's attributes must be accessed via env.env
    agent = QLearning(action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent, False)
    eval(cfg, env, agent, True)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards,
                 ma_rewards,
                 tag='train',
                 algo='Off-Policy Q-Learning',
                 path=RESULT_PATH)
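The QLearning agent's update rule is defined elsewhere; the standard tabular update it presumably implements is Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a)). A self-contained sketch of that core (the class name and hyperparameter defaults are illustrative, not the project's):

from collections import defaultdict
import numpy as np

class TabularQ:
    # minimal off-policy tabular Q-learning core
    def __init__(self, action_dim, lr=0.1, gamma=0.9):
        self.lr, self.gamma = lr, gamma
        self.Q = defaultdict(lambda: np.zeros(action_dim))

    def update(self, state, action, reward, next_state, done):
        # the TD target uses the greedy value of the next state (off-policy)
        target = reward if done else reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] += self.lr * (target - self.Q[state][action])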