Example #1
def main():
    # load the data
    df = pd.read_csv('TD3gupiao/DATA/AAPL.csv')
    df = df.sort_values('Date')
    # create the environment
    env = StockTradingEnv(df)
    env.reset()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(model,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
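Example 1 calls run_train_episode and run_evaluate_episode, which are not part of the snippet. Below is a minimal sketch of what such helpers usually look like in this kind of script; the constants WARMUP_SIZE and BATCH_SIZE and the ReplayMemory methods used here (size, append, sample_batch) are assumptions, not taken from the snippet above.

def run_train_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        if rpm.size() < WARMUP_SIZE:
            action = env.action_space.sample()   # random actions to warm up the buffer
        else:
            action = agent.sample(obs)           # policy action with exploration noise
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        if rpm.size() >= WARMUP_SIZE:
            batch = rpm.sample_batch(BATCH_SIZE)
            agent.learn(*batch)                  # one gradient step per environment step
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps

def run_evaluate_episode(env, agent):
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.predict(obs)              # deterministic action for evaluation
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward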
Example #2
def run_episode(envs, agent, rpm, episode):
    total_reward = 0
    Actions = [int(action_dim / 2) for _ in range(process_num)]
    Obses, _, _ = envs.next(Actions)
    step = 0
    for _ in range(1000):
        step += 1
        Actions = []
        for obs in Obses:
            action = agent.sample(obs)  # sample an action; every action has some probability of being tried
            Actions.append(action)
        Next_obses, Rewards, Dones = envs.next(Actions)  # could this be split off from training into a separate process?
        for i in range(len(Actions)):
            rpm.append(
                (Obses[i], Actions[i], Rewards[i], Next_obses[i], Dones[i]))

        # train model
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs,
                                     batch_done)  # s,a,r,s',done

        total_reward += np.sum(Rewards)
        Obses = Next_obses

        if not step % 20:
            logger.info('step:{} e_greed:{} reward:{}'.format(
                step, agent.e_greed, np.sum(Rewards)))
        if not step % 500:
            image = pygame.surfarray.array3d(
                pygame.display.get_surface()).copy()
            image = np.flip(image[:, :, [2, 1, 0]], 0)
            image = np.rot90(image, 3)
            img_pt = os.path.join('outputs',
                                  'snapshoot_{}_{}.jpg'.format(episode, step))
            cv2.imwrite(img_pt, image)
    return total_reward
Example #3
def run_train_step(agent, rpm):
    for step in range(args.train_total_steps):
        # use the first 80% data to train
        batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
            args.batch_size * gpu_num)
        batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
        batch_next_obs = batch_all_obs[:, 1:, :, :]
        cost = agent.learn(batch_obs, batch_action, batch_reward,
                           batch_next_obs, batch_isOver)

        if step % 100 == 0:
            # use the last 20% data to evaluate
            batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch(
                args.batch_size)
            batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
            batch_next_obs = batch_all_obs[:, 1:, :, :]
            eval_cost = agent.supervised_eval(batch_obs, batch_action,
                                              batch_reward, batch_next_obs,
                                              batch_isOver)
            logger.info(
                "train step {}, train costs are {}, eval cost is {}.".format(
                    step, cost, eval_cost))
Example #4
def main():
    env = JumpGame()
    np.random.seed(0)

    action_dim = 2
    obs_shape = 13

    model = Model(act_dim=action_dim)
    algorithm = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_shape, act_dim=action_dim)

    # load the model if a checkpoint exists
    if os.path.exists('./model.ckpt'):
        save_path = './model.ckpt'
        agent.restore(save_path)
        print("Model loaded successfully")

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(
                env, agent,
                render=False)  # render=True shows the rendering; must be run locally, AI Studio cannot display it
            logger.info('Test reward: {}'.format(total_reward))
            save_path = './model/dqn_model_{}_{}.ckpt'.format(i, total_reward)
            agent.save(save_path)

    # save the model to ./model.ckpt
    agent.save('./model.ckpt')
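Several of the policy-gradient examples (this one, and Examples 15, 19, 23 and 28-30) rely on calc_reward_to_go to turn the episode's reward list into discounted returns, but never show it. A minimal sketch, assuming numpy is imported as np; the default gamma and the mean/std normalization are assumptions modeled on common PARL tutorials.

def calc_reward_to_go(reward_list, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over the episode
    returns = np.zeros(len(reward_list), dtype='float32')
    running = 0.0
    for i in reversed(range(len(reward_list))):
        running = reward_list[i] + gamma * running
        returns[i] = running
    # normalize to zero mean / unit variance to stabilize the policy-gradient update
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns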
Example #5
def main():
    env = gym.make(args.env)
    env.seed(ENV_SEED)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    actor = ActorModel(act_dim)
    critic = CriticModel()
    algorithm = parl.algorithms.SAC(
        actor,
        critic,
        max_action=max_action,
        gamma=GAMMA,
        tau=TAU,
        actor_lr=ACTOR_LR,
        critic_lr=CRITIC_LR)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
Example #6
def main():
    # create the quadrotor environment
    env = make_env("Quadrotor_hovering_control", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)
    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)
    # if os.path.exists('model_dir/steps_140848.ckpt'):
    #     agent.restore('model_dir/steps_140848.ckpt')
    #     print("Restore succeed")
    # PARL also ships a built-in ReplayMemory for DDPG; it can be imported directly from parl.utils
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim + 1)
    # start training
    test_flag = 0
    total_steps = 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Train reward: {}'.format(total_steps, train_reward)) # print the training reward

        if total_steps // TEST_EVERY_STEPS >= test_flag:  # evaluate the model every TEST_EVERY_STEPS steps
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1

            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}'.format(
                total_steps, evaluate_reward))  # print the evaluation reward

            # save the model after every evaluation, named by the number of training steps
            ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
Example #7
    def _parse_memory(self, client, ident, last_obs):
        mem = client.memory
        n = len(mem)

        # debug info
        if ident == 1:
            for i, exp in enumerate(mem):
                logger.info(
                    "[step:{}] obs:{} action:{} reward:{} shaping_reward:{}".
                    format(i, np.sum(mem[i].obs), np.sum(mem[i].action),
                           mem[i].reward, mem[i].info['shaping_reward']))

        episode_rpm = []
        for i in range(n - 1):
            if not mem[i].info['target_changed']:
                episode_rpm.append([
                    mem[i].obs, mem[i].action, mem[i].info['shaping_reward'],
                    mem[i + 1].obs, False, mem[i].info['target_change_times']
                ])
        if not mem[-1].info['target_changed']:
            episode_rpm.append([
                mem[-1].obs, mem[-1].action, mem[-1].info['shaping_reward'],
                last_obs, not mem[-1].info['timeout'],
                mem[-1].info['target_change_times']
            ])

        indicators_dict = calc_indicators(mem)
        indicators_dict['free_client_num'] = self.ready_client_queue.qsize()
        indicators_dict['noiselevel'] = self.noiselevel

        with self.MEMORY_LOCK:
            self.add_episode_rpm(episode_rpm)
            self.scalars_manager.record(indicators_dict, self.global_step)
            self.global_step += 1
            if self.global_step >= 50:
                self.noiselevel = self.noiselevel * NOISE_DECAY

        client.reset()
Example #8
def main():
    env = Paddle()
    np.random.seed(0)

    action_dim = 3
    obs_shape = 5

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_shape,
        act_dim=action_dim,
        e_greed=0.1,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    save_path = './dqn_model.ckpt'
    agent.restore(save_path)
    print("模型加载成功")
    eval_reward = evaluate(agent, env)
    logger.info('test_reward:{}'.format(eval_reward))
Example #9
def run_train_episode(env, agent, rpm):
    total_reward = 0
    all_cost = []
    obs = env.reset()
    #obs = rgb2gray(obs)
    #print(obs.shape)
    steps = 0
    while True:
        steps += 1
        context = rpm.recent_obs()

        context.append(obs)
        #print(obs.shape)
        context = np.stack(context, axis=0)
        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))
        # start training
        if rpm.size() > MEMORY_WARMUP_SIZE:
            if steps % UPDATE_FREQ == 0:
                batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
                    args.batch_size)
                batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
                batch_next_obs = batch_all_obs[:, 1:, :, :]
                cost = agent.learn(batch_obs, batch_action, batch_reward,
                                   batch_next_obs, batch_isOver)
                all_cost.append(float(cost))
        total_reward += reward
        #obs = rgb2gray(next_obs)
        obs = next_obs
        #print(obs.shape)
        if isOver:
            break
    if all_cost:
        logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
            total_reward, np.mean(all_cost)))
    return total_reward, steps, np.mean(all_cost)
Example #10
def main():
    # create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=True, force_fps=False)
    p.reset_game()
    # build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    print("act_dim:", act_dim)

    obs_dim = 200 * 200
    # build the agent with PARL: PongModel, DDPG and PongAgent nested together
    model = PongModel(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = PongAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    max_episode = 20000
    # start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1
        # test part
        eval_reward = evaluate(p, agent, render=True)  # render=True to view the display
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/ddpg_pong_{}.ckpt'.format(episode))
        logger.info('episode:{}   test_reward:{}'.format(episode, eval_reward))
Example #11
def main():
    env = gym.make('CartPole-v0')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)

    model = CartpoleModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = CartpoleAgent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 2000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))
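run_episode and evaluate are not shown for this CartPole script either. A minimal sketch, assuming a tuple-based ReplayMemory (append takes an (obs, action, reward, next_obs, done) tuple, sample returns batched arrays, len() gives its size), the constants MEMORY_WARMUP_SIZE, LEARN_FREQ and BATCH_SIZE, and numpy imported as np; none of these come from the snippet itself.

def run_episode(agent, env, rpm):
    obs = env.reset()
    total_reward, step = 0, 0
    while True:
        step += 1
        action = agent.sample(obs)                       # epsilon-greedy exploration
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)
        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward

def evaluate(agent, env, eval_episodes=5):
    rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        episode_reward, done = 0, False
        while not done:
            action = agent.predict(obs)                  # greedy action, no exploration
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
        rewards.append(episode_reward)
    return np.mean(rewards)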
Example #12
def main():
    env = get_player(args.rom,
                     image_size=IMAGE_SIZE,
                     train=True,
                     frame_skip=FRAME_SKIP)
    file_path = "memory.npz"
    rpm = ReplayMemory(
        MEMORY_SIZE,
        IMAGE_SIZE,
        CONTEXT_LEN,
        load_file=True,  # load replay memory data from file
        file_path=file_path)
    act_dim = env.action_space.n

    model = AtariModel(act_dim)
    algorithm = DQN(model,
                    act_dim=act_dim,
                    gamma=GAMMA,
                    lr=LEARNING_RATE * gpu_num)
    agent = AtariAgent(algorithm,
                       act_dim=act_dim,
                       total_step=args.train_total_steps)
    if os.path.isfile('./model_dir'):
        logger.info("load model from file")
        agent.restore('./model_dir')

    if args.train:
        logger.info("train with memory data")
        run_train_step(agent, rpm)
        logger.info("finish training. Save the model.")
        agent.save('./model_dir')
    else:
        logger.info("collect experience")
        collect_exp(env, rpm, agent)
        rpm.save_memory()
        logger.info("finish collecting, save successfully")
Example #13
    def __init__(self,
                 max_size,
                 obs_shape,
                 context_len,
                 load_file=False,
                 file_path=None):
        self.max_size = int(max_size)
        self.obs_shape = obs_shape
        self.context_len = int(context_len)

        self.file_path = file_path
        if load_file and os.path.isfile(file_path):
            logger.info("load memory from file" + self.file_path)
            self.load_memory()
            logger.info("memory size is {}".format(self._curr_size))
        else:
            self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
            self.action = np.zeros((self.max_size, ), dtype='int32')
            self.reward = np.zeros((self.max_size, ), dtype='float32')
            self.isOver = np.zeros((self.max_size, ), dtype='bool')

            self._curr_size = 0
            self._curr_pos = 0
            self._context = deque(maxlen=context_len - 1)
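Example 12 calls rpm.save_memory() and this constructor calls self.load_memory(), neither of which is shown. Below is a minimal sketch of how the two could (de)serialize the buffers with numpy, reusing the attribute names above; the .npz key names are assumptions, and deque is assumed imported from collections as in the constructor.

    def save_memory(self):
        np.savez(
            self.file_path,
            obs=self.obs,
            action=self.action,
            reward=self.reward,
            isOver=self.isOver,
            curr_size=self._curr_size,
            curr_pos=self._curr_pos)

    def load_memory(self):
        data = np.load(self.file_path)
        self.obs = data['obs']
        self.action = data['action']
        self.reward = data['reward']
        self.isOver = data['isOver']
        self._curr_size = int(data['curr_size'])
        self._curr_pos = int(data['curr_pos'])
        # the recent-frame context is rebuilt empty after loading
        self._context = deque(maxlen=self.context_len - 1)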
Example #14
    def test_reset_actor(self):
        logger.info("running: test_reset_actor")
        # start the master
        master = Master(port=8237)
        th = threading.Thread(target=master.run)
        th.start()
        time.sleep(3)

        worker1 = Worker('localhost:8237', 4)
        parl.connect('localhost:8237')
        for _ in range(10):
            actor = Actor()
            ret = actor.add_one(1)
            self.assertEqual(ret, 2)
        del actor

        for _ in range(10):
            if master.cpu_num == 4:
                break
            time.sleep(10)

        self.assertEqual(master.cpu_num, 4)
        worker1.exit()
        master.exit()
Example #15
def main():
    env = gym.make('CartPole-v0')
    model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg, OBS_DIM, ACT_DIM)

    with fluid.dygraph.guard():
        for i in range(1000):  # 1000 episodes
            obs_list, action_list, reward_list = run_episode(env, agent)
            if i % 10 == 0:
                logger.info("Episode {}, Reward Sum {}.".format(
                    i, sum(reward_list)))

            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)

            agent.learn(batch_obs, batch_action, batch_reward)
            if (i + 1) % 100 == 0:
                _, _, reward_list = run_episode(env,
                                                agent,
                                                train_or_test='test')
                total_reward = np.sum(reward_list)
                logger.info('Test reward: {}'.format(total_reward))
Example #16
def main():
    env = gym.make(args.env)
    env.seed(seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = float(env.action_space.high[0])

    algorithm = SAC(ActorModel(act_dim),
                    CriticModel(),
                    max_action=act_limit,
                    gamma=gamma,
                    tau=TAU,
                    actor_lr=ACTOR_LR,
                    critic_lr=CRITIC_LR)
    agent = BipedalWalkerAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(replay_size, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
        summary.add_scalar('train/episode_reward', train_reward, total_steps)

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward = run_evaluate_episode(env, agent)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, evaluate_reward))
            summary.add_scalar('eval/episode_reward', evaluate_reward,
                               total_steps)
            ckpt = 'star2_model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
Example #17
def watch():

    import torch

    game = FlappyBird()
    env = PLE(game,
              fps=30,
              frame_skip=4,
              display_screen=True,
              force_fps=False,
              reward_values={"tick": 0.00},
              state_preprocessor=None)
    env.init()

    model = Model(obs_dim=OBS_DIM, act_dim=ACT_DIM)
    if torch.cuda.is_available():
        model = model.cuda()

    model.load_state_dict(torch.load('checkpoint.pt'))

    from parl.algorithms.torch import PolicyGradient
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = Agent(alg)

    for i in range(10000):  # 10000 episodes
        obs_list, action_list, reward_list = run_episode(env, agent)

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        # agent.learn(batch_obs, batch_action, batch_reward)

        _, _, reward_list = run_episode(env, agent, train_or_test='test')
        total_reward = np.sum(reward_list)
        logger.info('Test reward: {}'.format(total_reward))
Example #18
    def _create_client_monitor(self, client_heartbeat_address):
        """When a new client connects to the master, a socket is created to
        send heartbeat signals to the client.
        """

        client_heartbeat_socket = self.ctx.socket(zmq.REQ)
        client_heartbeat_socket.linger = 0
        client_heartbeat_socket.setsockopt(
            zmq.RCVTIMEO, remote_constants.HEARTBEAT_TIMEOUT_S * 1000)
        client_heartbeat_socket.connect("tcp://" + client_heartbeat_address)

        client_is_alive = True
        while client_is_alive and self.master_is_alive:
            try:
                client_heartbeat_socket.send_multipart(
                    [remote_constants.HEARTBEAT_TAG])
                client_status = client_heartbeat_socket.recv_multipart()

                self.cluster_monitor.update_client_status(
                    client_status, client_heartbeat_address,
                    self.client_hostname[client_heartbeat_address])

            except zmq.error.Again as e:
                client_is_alive = False
                self.cluster_monitor.drop_client_status(
                    client_heartbeat_address)
                logger.warning("[Master] cannot connect to the client " +
                               "{}. ".format(client_heartbeat_address) +
                               "Please check if it is still alive.")
            time.sleep(remote_constants.HEARTBEAT_INTERVAL_S)
        logger.warning("Master exits client monitor for {}.\n".format(
            client_heartbeat_address))
        logger.info(
            "Master connects to {} workers and have {} vacant CPUs.\n".format(
                self.worker_num, self.cpu_num))
        client_heartbeat_socket.close(0)
Example #19
def main():
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped # Cancel the minimum score limit
    # env = VideoOffloadEnv()
    env = TrainEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # load the model if it exists
    if os.path.exists('./policygradient_model'):
        agent.restore('./policygradient_model')
        print("Model loaded successfully, running evaluation:")
        evaluate(env, agent)

    max_episode = 20000

    log_list = []
    fo = open(
        "log/" + str(math.floor(time.time() * 1000.0)) + "policygradient.txt",
        "w")
    train_episode = 0
    test_episode = 0
    while train_episode < max_episode:
        for i in range(0, 10):
            obs_list, action_list, reward_list = run_episode(env, agent)
            log_list.append("Train " + str(train_episode) + " " +
                            str(sum(reward_list)) + "\n")
            logger.info("train_episode:{}    train_reward:{}.".format(
                train_episode, sum(reward_list)))

            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)

            agent.learn(batch_obs, batch_action, batch_reward)
            train_episode += 1

        total_reward = evaluate(env, agent)
        log_list.append("Test " + str(test_episode) + " " + str(total_reward) +
                        "\n")
        logger.info('test_episode:{}    test_reward:{}'.format(
            test_episode, total_reward))
        test_episode += 1

    fo.writelines(log_list)
    fo.close()

    # save the parameters to ./policygradient_model
    agent.save('./policygradient_model')
    print("模型保存成功")
Example #20
def test():
    # create the environment
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # create the experience replay buffer
    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay memory for DQN

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # load the model
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if (steps == 1):
            continue
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # predict the action; always pick the greedy one
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48, (255, 0, 0),
                 (255, 255, 255))
    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
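Example 20 overlays the running score on the screen through a drawText helper that is not included in the snippet. A minimal pygame-based sketch with the same argument order as the call above (surface, text, x, y, font size, text color, background color); the font choice and the display update are assumptions.

import pygame

def drawText(screen, text, x, y, size, fg_color, bg_color):
    pygame.font.init()                                    # no-op if already initialized
    font = pygame.font.SysFont('arial', size)
    text_surface = font.render(text, True, fg_color, bg_color)
    screen.blit(text_surface, (x, y))
    pygame.display.update()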
Example #21
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    #obs_dim += 1  # add 1 to obs dim for time step feature

    logger.info('observation_dim {}, action_dim {}'.format(obs_dim, act_dim))

    scaler = Scaler(obs_dim)

    model = Model(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model,
        act_dim=act_dim,
        policy_lr=model.policy_lr,
        value_lr=model.value_lr)
    agent = Agent(alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps, total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler, render=True)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))
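The PPO script above builds a Scaler(obs_dim) that collect_trajectories presumably uses to normalize observations before feeding them to the agent. A minimal running mean/variance sketch with an update()/get() interface, modeled loosely on the scaler used in PARL's PPO example; the method names, the 0.1 damping term in get(), and numpy imported as np are all assumptions.

class Scaler(object):
    """Keeps running estimates of per-dimension observation mean and variance."""

    def __init__(self, obs_dim):
        self.means = np.zeros(obs_dim)
        self.vars = np.zeros(obs_dim)
        self.cnt = 0

    def update(self, x):
        # x: a batch of observations with shape (n, obs_dim)
        n = x.shape[0]
        if self.cnt == 0:
            self.means = x.mean(axis=0)
            self.vars = x.var(axis=0)
        else:
            new_means = (self.means * self.cnt + x.sum(axis=0)) / (self.cnt + n)
            # combine old and new second moments, then subtract the new mean
            self.vars = (self.cnt * (self.vars + np.square(self.means)) +
                         np.sum(np.square(x), axis=0)) / (self.cnt + n) - np.square(new_means)
            self.vars = np.maximum(self.vars, 0.0)
            self.means = new_means
        self.cnt += n

    def get(self):
        # returns (scale, offset) used to normalize raw observations
        return 1.0 / (np.sqrt(self.vars) + 0.1), self.means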
Example #22
def main():
    env = gym.make(args.env)
    env.seed(args.seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    model = MujocoModel(obs_dim, act_dim, max_action)
    algorithm = ADER(model,
                     max_action=max_action,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR,
                     kappa=args.kappa,
                     epoch=args.epoch,
                     alpha=args.alpha)
    agent = MujocoAgent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        train_reward, steps = run_train_episode(env, agent, rpm)
        total_steps += steps
        logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            evaluate_reward, evaluate_fall_rate, total_steps_list = run_evaluate_episode(
                env, agent)
            mean_steps = np.mean(total_steps_list)
            logger.info('Steps {}, Evaluate reward: {}, Fall rate: {}'.format(
                total_steps, evaluate_reward, evaluate_fall_rate))
            logger.info(
                'Steps {}, Mean episode steps: {}, Steps list: {}'.format(
                    total_steps, mean_steps, total_steps_list))
            res = {
                'eval_step': mean_steps,
                'fall_rate': evaluate_fall_rate,
                'Step': total_steps,
                'Value': evaluate_reward
            }
            csv_logger.log_dict(res)
Example #23
def main():
    # create environment
    dist1 = Distribution(id=0, vals=[2], probs=[1])
    dist2 = Distribution(id=1, vals=[5], probs=[1])
    dist3 = Distribution(id=2, vals=[2,8], probs=[0.5,0.5])

    env = Environment(total_bandwidth = 10,\
        distribution_list=[dist1,dist2,dist3], \
        mu_list=[1,2,3], lambda_list=[3,2,1],\
        num_of_each_type_distribution_list=[300,300,300])
    # env = gym.make('CartPole-v0')
    # env = env.unwrapped # Cancel the minimum score limit
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    obs_dim = 6
    act_dim = 2
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # load the model
    if os.path.exists('./policy_grad_model.ckpt'):
        agent.restore('./policy_grad_model.ckpt')
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list, gamma=0.9)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./policy_grad_model.ckpt
    agent.save('./policy_grad_model.ckpt')
Example #24
def play_multi_episode(submit_model, episode_num=2, vis=False, seed=0):
    np.random.seed(seed)
    env = ProstheticsEnv(visualize=vis)
    env.change_model(model='3D', difficulty=1, prosthetic=True, seed=seed)
    env = ForwardReward(env)
    env = FrameSkip(env, 4)
    env = ActionScale(env)
    env = PelvisBasedObs(env)
    all_reward = []
    all_shaping_reward = 0
    last_frames_count = 0

    for e in range(episode_num):
        t = time.time()
        episode_reward = 0.0
        episode_shaping_reward = 0.0
        observation = env.reset(project=False)
        target_change_times = 0
        step = 0
        loss = []
        while True:
            step += 1
            action = submit_model.pred_batch(observation, target_change_times)
            observation, reward, done, info = env.step(action, project=False)
            step_frames = info['frame_count'] - last_frames_count
            last_frames_count = info['frame_count']
            episode_reward += reward
            # we place it here to drop the first step after the target change
            if target_change_times >= 1:
                loss.append(10 * step_frames - reward)
            if info['target_changed']:
                target_change_times = min(target_change_times + 1, 3)
            logger.info("[step/{}]reward:{}  info:{}".format(
                step, reward, info))
            episode_shaping_reward += info['shaping_reward']
            if done:
                break
        all_reward.append(episode_reward)
        all_shaping_reward += episode_shaping_reward
        t = time.time() - t
        logger.info(
            "[episode/{}] time: {} episode_reward:{} change_loss:{} after_change_loss:{} mean_reward:{}"
            .format(e, t, episode_reward, np.sum(loss[:15]), np.sum(loss[15:]),
                    np.mean(all_reward)))
    logger.info("Mean reward:{}".format(np.mean(all_reward)))
Example #25
    def load_params(self, dirname, from_one_head):
        if from_one_head:
            logger.info('[From one head, extend to multi head:]')
            # load model 0
            fluid.io.load_params(
                executor=self.fluid_executor,
                dirname=dirname,
                main_program=self.learn_programs[0])

            # sync identity params of model/target_model 0 to other models/target_models
            for i in range(1, self.ensemble_num):
                params = list(
                    filter(
                        lambda x: 'identity' in x.name and '@GRAD' not in x.name,
                        self.learn_programs[i].list_vars()))
                for param in params:
                    param_var = _fetch_var(param.name, return_numpy=False)

                    model0_name = re.sub(r"identity_\d+", "identity_0",
                                         param.name)
                    model0_value = _fetch_var(model0_name, return_numpy=True)
                    logger.info('{} -> {}'.format(model0_name, param.name))
                    param_var.set(model0_value, self.place)

            # sync shared params of target_model 0 to other target models
            # After deepcopy, the shared params between target models are different
            for i in range(1, self.ensemble_num):
                params = list(
                    filter(
                        lambda x: 'shared' in x.name and 'PARL_target' in x.name and '@GRAD' not in x.name,
                        self.learn_programs[i].list_vars()))
                for param in params:
                    param_var = _fetch_var(param.name, return_numpy=False)

                    model0_name = re.sub(r"_\d+$", "_0", param.name)
                    model0_value = _fetch_var(model0_name, return_numpy=True)
                    logger.info('{} -> {}'.format(model0_name, param.name))
                    param_var.set(model0_value, self.place)

        else:
            for i in range(self.ensemble_num):
                fluid.io.load_params(
                    executor=self.fluid_executor,
                    dirname=dirname,
                    main_program=self.learn_programs[i])
Example #26
    def keep_training(self):
        episode_count = 1000000
        for T in range(episode_count):
            if self.rpm.size() > BATCH_SIZE * args.warm_start_batchs:
                self.learn()
                logger.info(
                    "[keep_training/{}] trying to acq a new env".format(T))

            # Keep training and predicting balance
            # After training, wait for a ready actor, and make the actor start new episode
            ready_actor_event = self.ready_actor_queue.get()
            ready_actor_event.set()

            if np.mod(T, 100) == 0:
                logger.info("saving models")
                self.save(T)
            if np.mod(T, 10000) == 0:
                logger.info("saving rpm")
                self.save_rpm()
Example #27
    def create_actors(self):
        parl.connect(self.config['master_address'])

        logger.info('Waiting for {} remote actors to connect.'.format(
            self.config['actor_num']))

        for i in six.moves.range(self.config['actor_num']):
            params_queue = queue.Queue()
            self.params_queues.append(params_queue)

            self.remote_count += 1
            logger.info('Remote actor count: {}'.format(self.remote_count))

            remote_thread = threading.Thread(
                target=self.run_remote_sample, args=(params_queue, ))
            remote_thread.setDaemon(True)
            remote_thread.start()

        logger.info('All remote actors are ready, begin to learn.')
        self.start_time = time.time()
Example #28
def main():
    # create the environment
    game = Snake(width=256, height=256, init_length=10)
    p = PLE(game, fps=30, display_screen=True, force_fps=True)
    p.reset_game()
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    obs_dim = 256 * 256

    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # # load the model
    # if os.path.exists('model_dir/pg_pong_episode_19.ckpt'):
    #     agent.restore('model_dir/pg_pong_episode_19.ckpt')

    best_total_reward = -float('inf')
    for i in range(50000):
        obs_list, action_list, reward_list = run_episode(p, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 50 == 0:
            total_reward = evaluate(p, agent, render=True)
            if total_reward > best_total_reward:
                best_total_reward = total_reward
                agent.save(
                    'model_dir/pg_pong_episode_{}_reward_{}.ckpt'.format(
                        i, total_reward))
            logger.info('Test reward: {}'.format(total_reward))
Example #29
def main():
    # create the environment
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # load the model
    if os.path.exists('model.ckpt'):
        agent.restore('model.ckpt')
        print("restore_succeed")

    best_reward = -float('inf')
    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            if total_reward > best_reward:
                best_reward = total_reward
                agent.save('model_dir/pg_pong_episode_{}_reward_{}'.format(
                    i, best_reward))
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
Example #30
def main():
    env = gym.make('CarRacing-v0')
    obs_dim = 28 * 32
    act_dim = 3  # simply straight, left and right
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # load the model
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))
            ckpt = './model_episode_{}.ckpt'.format(i + 1)
            agent.save(ckpt)

    # save the parameters to ckpt
    agent.save('./final.ckpt')

    env.close()