def test_log_probs_from_logits_and_actions(self, batch_size):
  """Tests log_probs_from_logits_and_actions."""
  seq_len = 7
  num_actions = 3

  policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
  actions = np.random.randint(
      0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32)

  action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
      policy_logits, actions)

  # Ground Truth
  # Using broadcasting to create a mask that indexes action logits
  action_index_mask = actions[..., None] == np.arange(num_actions)

  def index_with_mask(array, mask):
    return array[mask].reshape(*array.shape[:-1])

  # Note: Normally log(softmax) is not a good idea because it's not
  # numerically stable. However, in this test we have well-behaved values.
  ground_truth_v = index_with_mask(
      np.log(_softmax(policy_logits)), action_index_mask)

  with self.test_session() as session:
    self.assertAllClose(ground_truth_v, session.run(action_log_probs_tensor))
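# The tests in this section rely on two small helpers, `_shaped_arange` and
# `_softmax`, that are not defined here. A minimal sketch of plausible NumPy
# implementations (assumptions, not necessarily the originals):
import numpy as np


def _shaped_arange(*shape):
    """Sketch: consecutive float values 0..N-1 reshaped to `shape`."""
    return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)


def _softmax(logits):
    """Sketch: softmax over the last axis, shifted for numerical stability."""
    exps = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)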
def test_log_probs_from_logits_and_actions():
    """Tests getting log-probabilities from logits and actions."""
    seq_len = 7
    num_actions = 3
    batch_size = 2

    policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
    np.random.seed(0)
    actions = np.random.randint(0, num_actions, size=(seq_len, batch_size),
                                dtype=np.int32)
    action_log_probs = vtrace.log_probs_from_logits_and_actions(
        policy_logits, actions)

    # Ground Truth
    # Using broadcasting to create a mask that indexes action logits
    action_index_mask = actions[..., None] == np.arange(num_actions)

    def index_with_mask(array, mask):
        return array[mask].reshape(*array.shape[:-1])

    # Note: Normally log(softmax) is not a good idea because it's not
    # numerically stable. However, in this test we have well-behaved values.
    ground_truth = index_with_mask(np.log(_softmax(policy_logits)),
                                   action_index_mask)

    for g, o in zip(ground_truth, action_log_probs):
        assert np.allclose(g, o.data.tolist())
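# For reference, the quantity under test is the log-softmax of the policy
# logits gathered at the taken actions. A hypothetical PyTorch sketch of such
# a function (the library's actual implementation may differ):
import torch.nn.functional as F


def log_probs_from_logits_and_actions_sketch(policy_logits, actions):
    """Sketch: gather log pi(a|s) for `actions` from [T, B, A] logits."""
    log_probs = F.log_softmax(policy_logits, dim=-1)                # [T, B, A]
    return log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)  # [T, B]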
def test_vtrace_from_logits(self, batch_size):
  """Tests V-trace calculated from logits."""
  seq_len = 5
  num_actions = 3
  clip_rho_threshold = None  # No clipping.
  clip_pg_rho_threshold = None  # No clipping.

  # Intentionally leaving shapes unspecified to test if V-trace can
  # deal with that.
  placeholders = {
      # T, B, NUM_ACTIONS
      'behaviour_policy_logits':
          tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
      # T, B, NUM_ACTIONS
      'target_policy_logits':
          tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
      'actions':
          tf.placeholder(dtype=tf.int32, shape=[None, None]),
      'discounts':
          tf.placeholder(dtype=tf.float32, shape=[None, None]),
      'rewards':
          tf.placeholder(dtype=tf.float32, shape=[None, None]),
      'values':
          tf.placeholder(dtype=tf.float32, shape=[None, None]),
      'bootstrap_value':
          tf.placeholder(dtype=tf.float32, shape=[None]),
  }

  from_logits_output = vtrace.from_logits(
      clip_rho_threshold=clip_rho_threshold,
      clip_pg_rho_threshold=clip_pg_rho_threshold,
      **placeholders)

  target_log_probs = vtrace.log_probs_from_logits_and_actions(
      placeholders['target_policy_logits'], placeholders['actions'])
  behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
      placeholders['behaviour_policy_logits'], placeholders['actions'])
  log_rhos = target_log_probs - behaviour_log_probs
  ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

  values = {
      'behaviour_policy_logits':
          _shaped_arange(seq_len, batch_size, num_actions),
      'target_policy_logits':
          _shaped_arange(seq_len, batch_size, num_actions),
      'actions':
          np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
      'discounts':
          np.array(  # T, B where B_i: [0.9 / (i+1)] * T
              [[0.9 / (b + 1) for b in range(batch_size)]
               for _ in range(seq_len)]),
      'rewards':
          _shaped_arange(seq_len, batch_size),
      'values':
          _shaped_arange(seq_len, batch_size) / batch_size,
      'bootstrap_value':
          _shaped_arange(batch_size) + 1.0,  # B
  }
  feed_dict = {placeholders[k]: v for k, v in values.items()}

  with self.test_session() as session:
    from_logits_output_v = session.run(
        from_logits_output, feed_dict=feed_dict)
    (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
     ground_truth_target_action_log_probs) = session.run(
         ground_truth, feed_dict=feed_dict)

  # Calculate V-trace using the ground truth logits.
  from_iw = vtrace.from_importance_weights(
      log_rhos=ground_truth_log_rhos,
      discounts=values['discounts'],
      rewards=values['rewards'],
      values=values['values'],
      bootstrap_value=values['bootstrap_value'],
      clip_rho_threshold=clip_rho_threshold,
      clip_pg_rho_threshold=clip_pg_rho_threshold)

  with self.test_session() as session:
    from_iw_v = session.run(from_iw)

  self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
  self.assertAllClose(from_iw_v.pg_advantages,
                      from_logits_output_v.pg_advantages)
  self.assertAllClose(ground_truth_behaviour_action_log_probs,
                      from_logits_output_v.behaviour_action_log_probs)
  self.assertAllClose(ground_truth_target_action_log_probs,
                      from_logits_output_v.target_action_log_probs)
  self.assertAllClose(ground_truth_log_rhos, from_logits_output_v.log_rhos)
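# For reference, the V-trace targets computed by `from_importance_weights`
# follow the backward recursion from the IMPALA paper:
#   v_s = V(x_s) + delta_s + discount_s * c_s * (v_{s+1} - V(x_{s+1})),
#   delta_s = rho_s * (r_s + discount_s * V(x_{s+1}) - V(x_s)),
# with rho_s and c_s the (optionally clipped) importance weights. A minimal
# NumPy sketch of this recursion (a hypothetical helper, not the library's
# implementation):
import numpy as np


def vtrace_reference(log_rhos, discounts, rewards, values, bootstrap_value,
                     clip_rho_threshold=1.0, clip_c_threshold=1.0):
    """Sketch: V-trace targets for [T, B] inputs and [B] bootstrap_value."""
    rhos = np.exp(log_rhos)
    clipped_rhos = (np.minimum(clip_rho_threshold, rhos)
                    if clip_rho_threshold is not None else rhos)
    cs = (np.minimum(clip_c_threshold, rhos)
          if clip_c_threshold is not None else rhos)
    # V(x_{s+1}) for every step, bootstrapping the final one.
    values_t_plus_1 = np.concatenate(
        [values[1:], bootstrap_value[None, :]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
    # Accumulate v_s - V(x_s) backwards in time.
    acc = np.zeros_like(bootstrap_value)
    vs_minus_v = np.zeros_like(values)
    for t in reversed(range(len(values))):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    return vs_minus_v + values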
def main():
    """Main function: the IMPALA learner loop."""
    # Create the environment.
    env = make_env(ENV_NAME)
    set_random_seed()
    device = get_device()
    net = A2C(env.observation_space.shape, env.action_space.n).to(device)
    net.apply(weights_init)
    writer = SummaryWriter(comment="-" + ENV_NAME)
    log(net)

    # Initialize ZMQ.
    context, act_sock, buf_sock = init_zmq()
    # Wait for input before starting.
    log("Press Enter when the actors are ready: ")
    input()

    # Publish the initial model to start the actors.
    log("sending parameters to actors…")
    publish_model(net, act_sock)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    # optimizer = optim.RMSprop(net.parameters(),
    #                           lr=RMS_LR,
    #                           eps=RMS_EPS,
    #                           momentum=RMS_MOMENTUM)
    fps = 0.0
    p_time = None
    step_idx = 1
    max_reward = -1000

    # Discount factors: gamma^i per unroll step, repeated for each batch.
    discounts = np.array([pow(GAMMA, i) for i in range(NUM_UNROLL)])
    discounts = np.repeat(discounts, NUM_BATCH).reshape(NUM_UNROLL, NUM_BATCH)
    discounts_v = torch.Tensor(discounts).to(device)

    while True:
        # Request a training batch from the buffer.
        log("request new batch {}.".format(step_idx))
        st = time.time()
        buf_sock.send(b'')
        payload = buf_sock.recv()
        log("receive batch elapse {:.2f}".format(time.time() - st))

        if payload == b'not enough':
            # Not enough data buffered yet.
            log("not enough data to batch.")
            time.sleep(1)
        else:
            # Train on the batch.
            st = time.time()
            step_idx += 1
            optimizer.zero_grad()
            batch, ainfos, binfo = pickle.loads(payload)
            states, actor_logits_in, actions, rewards, last_states = batch
            states_v = torch.Tensor(states).to(device)

            # Run the learner's model over each batch element.
            logits = []
            values = []
            bsvalues = []
            last_state_idx = []
            for bi in range(NUM_BATCH):
                # Predict with the learner's model.
                logit, value = net(states_v[bi])
                logits.append(logit)
                values.append(value.squeeze(1))
                if last_states[bi] is not None:
                    # Collect the last state for bootstrapping.
                    _, bsvalue = net(
                        torch.Tensor([last_states[bi]]).to(device))
                    bsvalues.append(bsvalue.squeeze(1))
                    last_state_idx.append(bi)

            # Get log-probabilities from the learner/actor logits and the
            # actions, then compute the importance-sampling weights.
            learner_logits = torch.stack(logits).permute(1, 0, 2)
            learner_values = torch.stack(values).permute(1, 0)
            # The behaviour (actor) logits come from the batch; the learner's
            # logits were recomputed above.
            actor_logits = torch.Tensor(np.array(actor_logits_in)).\
                to(device).permute(1, 0, 2)
            actor_actions = torch.LongTensor(actions).to(device).permute(1, 0)
            actor_rewards = torch.Tensor(rewards).to(device).permute(1, 0)
            # Concatenate the collected (1,)-shaped bootstrap values.
            bootstrap_value = (torch.cat(bsvalues) if bsvalues
                               else torch.Tensor([])).to(device)
            learner_log_probs = \
                log_probs_from_logits_and_actions(learner_logits,
                                                  actor_actions)
            actor_log_probs = \
                log_probs_from_logits_and_actions(actor_logits, actor_actions)
            log_rhos = learner_log_probs - actor_log_probs

            # Get the V-trace result from the importance-sampling weights.
            vtrace_ret = from_importance_weights(
                log_rhos=log_rhos,
                discounts=discounts_v,
                rewards=actor_rewards,
                values=learner_values,
                bootstrap_value=bootstrap_value,
                last_state_idx=last_state_idx)

            # Compute the losses and backpropagate.
            pg_loss, entropy_loss, baseline_loss, total_loss = \
                calc_loss_and_backprop(learner_logits, learner_values,
                                       actor_actions, vtrace_ret)
            grads = np.concatenate([
                p.grad.data.cpu().numpy().flatten()
                for p in net.parameters() if p.grad is not None
            ])

            # Clip the gradients.
            nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
            optimizer.step()

            if step_idx % SHOW_FREQ == 0:
                # Post to TensorBoard (per frame).
                # frame_idx = step_idx * NUM_BATCH * NUM_UNROLL
                write_tb(writer, step_idx, vtrace_ret, learner_values,
                         entropy_loss, pg_loss, baseline_loss, total_loss,
                         grads, ainfos, binfo)

            # Save the model with the best reward.
            _max_reward = np.max([ainfo.reward for ainfo in ainfos.values()])
            if _max_reward > max_reward and step_idx % SAVE_FREQ == 0:
                log("save best model - reward {:.2f}".format(_max_reward))
                torch.save(net, ENV_NAME + "-best.dat")
                max_reward = _max_reward

            # Publish the model.
            if step_idx % PUBLISH_FREQ == 0:
                publish_model(net, act_sock)

            if p_time is not None:
                elapsed = time.time() - p_time
                fps = 1.0 / elapsed
                log("train elapsed {:.2f} speed {:.2f} f/s".format(
                    elapsed, fps))

            p_time = time.time()

    writer.close()
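# `calc_loss_and_backprop` is referenced above but not defined in this
# section. A plausible sketch assuming the standard IMPALA losses (policy
# gradient weighted by V-trace advantages, a baseline value loss, and an
# entropy bonus); the coefficient names and the `vtrace_ret` fields (`vs`,
# `pg_advantages`) are assumptions based on how they are used above:
import torch.nn.functional as F

BASELINE_COEF = 0.5   # hypothetical baseline loss weight
ENTROPY_COEF = 0.01   # hypothetical entropy bonus weight


def calc_loss_and_backprop(learner_logits, learner_values, actions,
                           vtrace_ret):
    """Sketch: IMPALA-style losses from V-trace outputs, then backward()."""
    log_probs = F.log_softmax(learner_logits, dim=-1)
    probs = F.softmax(learner_logits, dim=-1)
    action_log_probs = log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)

    # Policy gradient loss weighted by the (detached) V-trace advantages.
    pg_loss = -(action_log_probs * vtrace_ret.pg_advantages.detach()).sum()
    # Baseline regression toward the (detached) V-trace value targets.
    baseline_loss = BASELINE_COEF * 0.5 * \
        (vtrace_ret.vs.detach() - learner_values).pow(2).sum()
    # Negative entropy: minimizing this term keeps the policy stochastic.
    entropy_loss = ENTROPY_COEF * (probs * log_probs).sum()

    total_loss = pg_loss + baseline_loss + entropy_loss
    total_loss.backward()
    return pg_loss, entropy_loss, baseline_loss, total_loss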
def test_vtrace_from_logit():
    """Tests V-trace calculated from logits."""
    seq_len = 5  # n-step
    num_actions = 3
    batch_size = 2
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    np.random.seed(0)
    values = {
        'behavior_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'target_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'actions':
            np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
        'discounts':
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)]),
        'rewards':
            _shaped_arange(seq_len, batch_size),
        'values':
            _shaped_arange(seq_len, batch_size) / batch_size,
        'bootstrap_value':
            _shaped_arange(batch_size) + 1.0,  # B
    }

    from_logit_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **values)

    ground_truth_target_log_probs = \
        vtrace.log_probs_from_logits_and_actions(
            values['target_policy_logits'], values['actions'])
    ground_truth_behavior_log_probs = \
        vtrace.log_probs_from_logits_and_actions(
            values['behavior_policy_logits'], values['actions'])
    ground_truth_log_rhos = ground_truth_target_log_probs - \
        ground_truth_behavior_log_probs

    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values['discounts'],
        rewards=values['rewards'],
        values=values['values'],
        bootstrap_value=values['bootstrap_value'],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    # Importance-weight results == logit results == ground truth.
    for g, o in zip(from_iw.vs, from_logit_output.vs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(from_iw.pg_advantages, from_logit_output.pg_advantages):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_behavior_log_probs,
                    from_logit_output.behavior_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_target_log_probs,
                    from_logit_output.target_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_log_rhos, from_logit_output.log_rhos):
        assert np.allclose(g, o.data.tolist())

    # Inputs that a subsequent loss calculation would use.
    logits = torch.Tensor(values['behavior_policy_logits'])
    actions = torch.LongTensor(values['actions'])
    advantages = from_iw.pg_advantages
    # loss = calc_loss(logits, actions, advantages)