def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        env.reset_game()
        obs = get_obs(env)
        total_reward, steps = 0, 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            pred_action = agent.predict(batch_obs.astype('float32'))  # predict action probabilities
            pred_action = np.squeeze(pred_action)
            action_set = env.getActionSet()
            action_index = np.random.choice(range(3), p=pred_action)
            action = action_set[action_index]
            reward = env.act(action)
            next_obs = get_obs(env)
            done = env.game_over()

            obs = next_obs
            total_reward += reward
            steps += 1

            if render:
                env.getScreenRGB()
            if done:
                break

        eval_reward.append(total_reward)
    return np.mean(eval_reward)
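
The PLE-based examples in this listing all call a get_obs helper that is defined elsewhere in their source files (its signature even differs between examples). A minimal sketch for the PLE case, assuming the observation is simply the grayscale screen flattened into a float vector; the original preprocessing may resize or normalize differently:

import numpy as np

def get_obs(env):
    # Hypothetical helper: grab the current PLE screen and flatten it.
    screen = env.getScreenGrayscale()              # 2-D uint8 array
    return (screen.astype('float32') / 255.0).ravel()
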
def run_episode(env, agent, rpm):
    total_reward = 0
    env.reset_game()
    obs = get_obs(env)
    step = 0
    while True:
        step += 1
        action_index = agent.sample(obs)  # sample an action; every action has some probability of being tried
        action = env.getActionSet()[action_index]
        # take the action in the environment
        reward = env.act(action)
        next_obs = get_obs(env)
        done = env.game_over()
        rpm.append((obs, action_index, reward, next_obs, done))

        # train model
        if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            train_loss = agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs,
                                     batch_done)  # s,a,r,s',done

        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward
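
This DQN-style run_episode assumes a replay memory that supports append, sample, and len(). A minimal sketch of such a buffer, matching the (obs, action, reward, next_obs, done) tuple layout used above (the original ReplayMemory class may differ):

import random
import collections
import numpy as np

class ReplayMemory(object):
    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        self.buffer.append(exp)

    def sample(self, batch_size):
        mini_batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*mini_batch)
        return (np.array(obs).astype('float32'), np.array(action),
                np.array(reward).astype('float32'),
                np.array(next_obs).astype('float32'),
                np.array(done).astype('float32'))

    def __len__(self):
        return len(self.buffer)
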
def run_episode(env, agent, rpm):
    total_reward = 0
    env.reset_game()
    obs = get_obs(env)
    steps = 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        pred_action = agent.predict(batch_obs.astype('float32'))  # predict action probabilities
        pred_action = np.squeeze(pred_action)
        action_set = env.getActionSet()
        action_index = np.random.choice(range(3), p=pred_action)
        action = action_set[action_index]
        reward = env.act(action)
        next_obs = get_obs(env)
        done = env.game_over()
        rpm.append(obs, action_index, REWARD_SCALE * reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
            batch_terminal = rpm.sample_batch(BATCH_SIZE)
            critic_cost = agent.learn(batch_obs, batch_action, batch_reward,
                                      batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward
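
Both run_episode variants reference module-level constants that are defined elsewhere in their original scripts. The values below are representative placeholders, not the authors' exact settings:

LEARN_FREQ = 5            # placeholder: train every 5 environment steps
MEMORY_WARMUP_SIZE = 200  # placeholder: transitions collected before learning starts
BATCH_SIZE = 32           # placeholder: minibatch size drawn from the replay memory
REWARD_SCALE = 0.1        # placeholder: scales rewards before they enter the buffer
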
Example #4
def step(state_dic):
    """
    Parameters
    ----------
    state_dic : dictionary
        current state of agent coming from environment

    Returns
    -------
    Action: angle

    """
    global net, a, action

    if a == 10:
        # Re-select an action every 10 calls (a simple action-repeat scheme).
        state, done = get_obs(state_dic)

        state = sum(state, [])  # flatten the nested observation into a single list

        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
        action = np.argmax(q_vals)
        a = 0

    a += 1
    return action
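
Example #4 re-computes the greedy action only once every 10 calls and repeats it in between, a cheap form of action repeat. A self-contained illustration of that counter pattern, with the network replaced by a stub (fake_q_values and repeated_action are made up for this sketch):

import numpy as np

a = 10        # counter starts at 10 so an action is chosen on the very first call
action = 0

def fake_q_values(state):
    return np.array([0.1, 0.7, 0.2])   # stand-in for net(state_v)

def repeated_action(state):
    global a, action
    if a == 10:
        action = int(np.argmax(fake_q_values(state)))
        a = 0
    a += 1
    return action

print([repeated_action(None) for _ in range(25)])  # the action is refreshed every 10 calls
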
Example #5
    def __init__(self, actor_id):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.action_size = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.actor_id = actor_id
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.sequence = []
        self.recurrent_state = []
        self.priority = []
        self.td_loss = deque(maxlen=self.learning_length)
        self.memory_sequence_size = 1000
        self.memory = ReplayMemory(
            memory_sequence_size=self.memory_sequence_size)
        self.memory_save_interval = 3

        self.gamma = 0.997
        self.actor_parameter_update_interval = 500
        self.model_path = './model_data/'
        self.actor = ActorNet(self.obs_size,
                              self.action_size,
                              cuda_id=self.actor_id % 2 +
                              1).cuda(self.actor_id % 2 + 1).eval()
        self.target_actor = deepcopy(self.actor)
        self.critic = CriticNet(self.obs_size,
                                self.action_size,
                                cuda_id=self.actor_id % 2 +
                                1).cuda(self.actor_id % 2 + 1).eval()
        self.target_critic = deepcopy(self.critic)
        self.load_model()
        self.epsilon = 1
        self.last_obs = None
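
Each sequence this actor stores is sequence_length = 60 transitions long and, R2D2-style, is conceptually split into a burn-in part that only rebuilds the recurrent state and a learning part that contributes to the loss. A small sketch of that split (indices are illustrative, not taken from the original code):

burn_in_length, learning_length = 20, 40
sequence = list(range(burn_in_length + learning_length))   # 60 stored transitions

burn_in = sequence[:burn_in_length]      # replayed only to warm up the LSTM state
train_part = sequence[burn_in_length:]   # used to compute TD errors and the loss
assert len(train_part) == learning_length
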
def evaluate(env, agent, render=False):
    eval_reward = []
    for i in range(5):
        env.reset_game()
        obs = get_obs(env)
        episode_reward = 0
        while True:
            action_index = agent.predict(obs)  # choose the greedy (best) action
            action = env.getActionSet()[action_index]
            reward = env.act(action)
            obs = get_obs(env)
            episode_reward += reward
            if render:
                env.getScreenRGB()
            if env.game_over():
                break
        eval_reward.append(episode_reward)
    return np.mean(eval_reward)
def run_episode(env, agent):
    obs_list, action_list, reward_list = [], [], []
    env.reset_game()
    obs = get_obs(env)
    while True:
        obs_list.append(obs)
        action_index = agent.sample(obs)  # sample an action; every action has some probability of being tried
        action = env.getActionSet()[action_index]
        action_list.append(action_index)

        # take the action in the environment
        reward = env.act(action)
        next_obs = get_obs(env)
        done = env.game_over()
        obs = next_obs
        reward_list.append(reward)

        if done:
            break
    return obs_list, action_list, reward_list
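
This policy-gradient run_episode only records the trajectory; the training loop (not shown) typically converts reward_list into discounted returns before calling agent.learn. A minimal sketch of that conversion, assuming a standard REINFORCE-style return with discount gamma (the original code may also normalize the returns):

import numpy as np

def calc_discounted_returns(reward_list, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, accumulated backwards over the episode
    returns = np.zeros(len(reward_list), dtype='float32')
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        returns[t] = running
    return returns
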
Example #8
def main():
    # create the environment
    game = Pong(width=200, height=200, MAX_SCORE=11)
    p = PLE(game, fps=30, display_screen=False, force_fps=False)
    p.reset_game()
    # build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())

    obs = get_obs(p)
    obs_dim = 200*200

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN的经验回放池

    model = Model(act_dim=act_dim)
    alg = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim, e_greed_decrement=1e-6, e_greed=0.1)  # e_greed: probability of taking a random action, for exploration

    # # load a saved model
    # if os.path.exists('./water_world_dqn.ckpt'):
    #     agent.restore('./water_world_dqn.ckpt')

    # warm up the replay memory first so early training has enough diverse samples
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000
    # start training
    episode = 0
    best_reward = -float('inf')
    while episode < max_episode:  # train for max_episode episodes; evaluation episodes are not counted
        # train part
        for i in range(0, 50):
            total_reward = run_episode(p, agent, rpm)
            episode += 1
        # test part
        eval_reward = evaluate(p, agent, render=False)  # render=True to watch the game
        if eval_reward > best_reward:
            best_reward = eval_reward
            agent.save('model_dir/dqn_pong_{}_reward_{}.ckpt'.format(episode, best_reward))
        logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
            episode, agent.e_greed, eval_reward))
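
main also relies on constants defined elsewhere and on the epsilon-greedy schedule set through e_greed and e_greed_decrement. The values below are placeholders; the comment spells out the decay arithmetic they imply, assuming the usual linear per-step decay with a lower bound:

MEMORY_SIZE = 20000     # placeholder: capacity of the replay memory
GAMMA = 0.99            # placeholder: discount factor passed to DQN
LEARNING_RATE = 0.001   # placeholder: optimizer learning rate

# With e_greed=0.1 and e_greed_decrement=1e-6, epsilon shrinks by 1e-6 per step,
# so it takes roughly 0.1 / 1e-6 = 100,000 steps to decay to the floor.
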
Example #9
    def __init__(self, n_actors):
        self.env = suite.load(domain_name="walker", task_name="run")
        self.n_actions = self.env.action_spec().shape[0]
        self.obs_size = get_obs(self.env.reset().observation).shape[1]

        self.n_actors = n_actors
        self.burn_in_length = 20  # 40-80
        self.learning_length = 40
        self.sequence_length = self.burn_in_length + self.learning_length
        self.n_step = 5
        self.memory_sequence_size = 5000000
        self.batch_size = 32
        self.memory = LearnerReplayMemory(
            memory_sequence_size=self.memory_sequence_size,
            batch_size=self.batch_size)

        self.model_path = './model_data/'
        self.memory_path = './memory_data/'
        self.actor = ActorNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_actor = deepcopy(self.actor).eval()
        self.critic = CriticNet(self.obs_size, self.n_actions, 0).cuda()
        self.target_critic = deepcopy(self.critic).eval()
        self.model_save_interval = 50  # 50
        self.memory_update_interval = 50  # 50
        self.target_update_inverval = 500  # 100

        self.gamma = 0.997
        self.actor_lr = 1e-4
        self.critic_lr = 1e-3
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.actor_lr)
        self.actor_criterion = nn.MSELoss()
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.critic_lr)
        self.critic_criterion = nn.MSELoss()
        self.save_model()
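
The learner keeps target copies of the actor and critic together with a target_update_inverval, which suggests a periodic hard update (copying the online weights into the targets). A minimal PyTorch sketch of such an update; the original learner might instead use a soft/Polyak update:

def hard_update(target_net, online_net):
    # Copy the online network's parameters into the target network.
    target_net.load_state_dict(online_net.state_dict())

# hypothetical use inside the training loop:
# if update_step % self.target_update_inverval == 0:
#     hard_update(self.target_actor, self.actor)
#     hard_update(self.target_critic, self.critic)
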
Example #10
def get_obs_states(
    env_id,
    system='cartpole_obs',
):
    if system == 'cartpole_obs':

        def line_line_cc(x1, y1, x2, y2, x3, y3, x4, y4):
            # Standard segment-segment intersection test: uA and uB are the
            # parametric coordinates of the intersection point on each segment.
            uA = ((x4 - x3) * (y1 - y3) - (y4 - y3) *
                  (x1 - x3)) / ((y4 - y3) * (x2 - x1) - (x4 - x3) * (y2 - y1))
            uB = ((x2 - x1) * (y1 - y3) - (y2 - y1) *
                  (x1 - x3)) / ((y4 - y3) * (x2 - x1) - (x4 - x3) * (y2 - y1))
            if uA >= 0. and uA <= 1. and uB >= 0. and uB <= 1.:
                # intersection
                return True
            # collision free
            return False

        def IsInCollision(x, obc, obc_width=4.):
            I = 10
            L = 2.5
            M = 10
            m = 5
            g = 9.8
            H = 0.5

            STATE_X = 0
            STATE_V = 1
            STATE_THETA = 2
            STATE_W = 3
            CONTROL_A = 0

            MIN_X = -30
            MAX_X = 30
            MIN_V = -40
            MAX_V = 40
            MIN_W = -2
            MAX_W = 2

            if x[0] < MIN_X or x[0] > MAX_X:
                return True
            H = 0.5
            pole_x1 = x[0]
            pole_y1 = H
            pole_x2 = x[0] + L * np.sin(x[2])
            pole_y2 = H + L * np.cos(x[2])

            width = 4
            for i in range(len(obc)):
                for j in range(0, 8, 2):
                    x1 = obc[i][j]
                    y1 = obc[i][j + 1]
                    x2 = obc[i][(j + 2) % 8]
                    y2 = obc[i][(j + 3) % 8]
                    if line_line_cc(pole_x1, pole_y1, pole_x2, pole_y2, x1, y1,
                                    x2, y2):
                        return True
            return False

        _obs_list = get_obs(system, env_id)[env_id]  #.reshape(-1, 2)
        obs_list = []
        width = 4
        for i in range(len(_obs_list)):
            x = _obs_list[i][0]
            y = _obs_list[i][1]
            obs = np.zeros(8)
            obs[0] = x - width / 2
            obs[1] = y + width / 2
            obs[2] = x + width / 2
            obs[3] = y + width / 2
            obs[4] = x + width / 2
            obs[5] = y - width / 2
            obs[6] = x - width / 2
            obs[7] = y - width / 2
            obs_list.append(obs)
        obs_i = np.array(obs_list)
        dx = 5
        dtheta = 0.5
        # feasible_points = []
        infeasible_points = []
        imin = 0
        imax = int(2 * 30. / dx)
        jmin = 0
        jmax = int(2 * np.pi / dtheta)

        for i in range(imin, imax):
            for j in range(jmin, jmax):
                x = np.array([dx * i - 30, 0., dtheta * j - np.pi, 0.])
                if IsInCollision(x, obs_i):
                    infeasible_points.append(x)
                    # pass
                # else:
                # feasible_points.append(x)
        # feasible_points = np.array(feasible_points)
        infeasible_points = np.array(infeasible_points)
        # print('feasible points')
        # print(feasible_points)
        # print('infeasible points')
        # print(infeasible_points)

    elif system == 'acrobot_obs':
        return None
    else:
        raise NotImplementedError("unknown dynamics")

    return infeasible_points
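
get_obs_states enumerates a coarse (x, theta) grid and flags the states whose pole segment intersects an obstacle rectangle. A hypothetical way to visualize the returned infeasible region (env_id=0 is an arbitrary choice; the matplotlib calls are standard):

import matplotlib.pyplot as plt

pts = get_obs_states(env_id=0, system='cartpole_obs')
if pts is not None and len(pts) > 0:
    # Each row is [x, v, theta, w]; plot the sampled x-theta cells that are in collision.
    plt.scatter(pts[:, 0], pts[:, 2], s=5)
    plt.xlabel('cart position x')
    plt.ylabel('pole angle theta')
    plt.show()
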
Example #11
    def run(self):
        episode = 0
        step = 0
        reward_sum = 0

        while True:
            time_step = self.env.reset()
            obs = get_obs(time_step.observation)
            self.actor.reset_state()
            self.critic.reset_state()
            self.target_actor.reset_state()
            self.target_critic.reset_state()
            self.sequence = []
            self.recurrent_state = []
            self.priority = []
            self.td_loss.clear()
            last_obs = None
            episode_step = 0
            done = False
            if self.actor_id == 0 and episode != 0:
                print('episode:', episode, 'step:', step, 'reward:',
                      reward_sum)
            episode += 1
            reward_sum = 0

            while not time_step.last():

                # get recurrent state
                actor_hx, actor_cx = self.actor.get_state()
                target_actor_hx, target_actor_cx = self.target_actor.get_state()
                critic_hx, critic_cx = self.critic.get_state()
                target_critic_hx, target_critic_cx = self.target_critic.get_state()

                action = self.actor(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                target_action = self.target_actor(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1))
                _ = self.critic(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1), action)
                _ = self.target_critic(
                    torch.from_numpy(obs).cuda(self.actor_id % 2 + 1),
                    target_action)

                action = action.detach().cpu().numpy()[0]
                action += np.random.normal(0, 0.3, (self.action_size))
                action = np.clip(action, -1, 1)

                reward = 0.
                sleep(0.01)
                for i in range(4):
                    time_step = self.env.step(action)
                    next_obs = get_obs(time_step.observation)
                    reward += time_step.reward
                    if time_step.last():
                        break

                reward_sum += reward
                step += 1
                episode_step += 1
                terminal = 1. if time_step.last() else 0.
                self.sequence.append((obs[0], action, [reward], [terminal]))
                obs = next_obs.copy()

                self.recurrent_state.append(
                    [[actor_hx[0], actor_cx[0]],
                     [target_actor_hx[0], target_actor_cx[0]],
                     [critic_hx[0], critic_cx[0]],
                     [target_critic_hx[0], target_critic_cx[0]]])

                if step % self.actor_parameter_update_interval == 0:
                    self.load_model()

            if len(self.sequence) >= self.sequence_length:
                self.sequence.extend([(np.zeros((self.obs_size),
                                                dtype=np.float32),
                                       np.zeros((self.action_size),
                                                dtype=np.float32), [0.], [1.])
                                      for i in range(self.n_step)])
                self.calc_nstep_reward()
                self.calc_priorities()
                self.memory.add(self.sequence, self.recurrent_state,
                                self.priority)

            if len(self.memory.memory) > self.memory_save_interval:
                self.memory.save(self.actor_id)
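
The actor pads each finished sequence with n_step dummy transitions and then calls calc_nstep_reward() and calc_priorities(), neither of which is shown here. A minimal sketch of an n-step return over the stored (obs, action, [reward], [terminal]) tuples, assuming the same gamma = 0.997 (the original method may handle terminals and priorities differently):

def calc_nstep_reward_sketch(sequence, n_step=5, gamma=0.997):
    # Replace each one-step reward with the discounted sum of the next n rewards.
    nstep_sequence = []
    for t in range(len(sequence) - n_step):
        obs, action, _, terminal = sequence[t]
        g = sum((gamma ** k) * sequence[t + k][2][0] for k in range(n_step))
        nstep_sequence.append((obs, action, [g], terminal))
    return nstep_sequence
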