def test_log_probs_from_logits_and_actions(self, batch_size):
  """Tests log_probs_from_logits_and_actions."""
  seq_len = 7
  num_actions = 3

  policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
  actions = np.random.randint(
      0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32)

  action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
      policy_logits, actions)

  # Ground Truth
  # Using broadcasting to create a mask that indexes action logits
  action_index_mask = actions[..., None] == np.arange(num_actions)

  def index_with_mask(array, mask):
    return array[mask].reshape(*array.shape[:-1])

  # Note: Normally log(softmax) is not a good idea because it's not
  # numerically stable. However, in this test we have well-behaved values.
  ground_truth_v = index_with_mask(
      np.log(_softmax(policy_logits)), action_index_mask)

  with self.test_session() as session:
    self.assertAllClose(ground_truth_v, session.run(action_log_probs_tensor))
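# The tests in this section rely on two small helpers, `_shaped_arange` and
# `_softmax`, that are not defined here. A minimal sketch of plausible NumPy
# implementations (assumptions, not necessarily the originals):
import numpy as np


def _shaped_arange(*shape):
    """Sketch: consecutive float values 0..N-1 reshaped to `shape`."""
    return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)


def _softmax(logits):
    """Sketch: softmax over the last axis, shifted for numerical stability."""
    exps = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)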
def test_log_probs_from_logits_and_actions():
    """Tests getting log-probabilities from logits and actions."""
    seq_len = 7
    num_actions = 3
    batch_size = 2

    policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
    np.random.seed(0)
    actions = np.random.randint(0, num_actions, size=(seq_len, batch_size),
                                dtype=np.int32)
    action_log_probs = vtrace.log_probs_from_logits_and_actions(
        policy_logits, actions)

    # Ground Truth
    # Using broadcasting to create a mask that indexes action logits
    action_index_mask = actions[..., None] == np.arange(num_actions)

    def index_with_mask(array, mask):
        return array[mask].reshape(*array.shape[:-1])

    # Note: Normally log(softmax) is not a good idea because it's not
    # numerically stable. However, in this test we have well-behaved values.
    ground_truth = index_with_mask(np.log(_softmax(policy_logits)),
                                   action_index_mask)

    for g, o in zip(ground_truth, action_log_probs):
        assert np.allclose(g, o.data.tolist())
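# For reference, the quantity under test is the log-softmax of the policy
# logits gathered at the taken actions. A hypothetical PyTorch sketch of such
# a function (the library's actual implementation may differ):
import torch.nn.functional as F


def log_probs_from_logits_and_actions_sketch(policy_logits, actions):
    """Sketch: gather log pi(a|s) for `actions` from [T, B, A] logits."""
    log_probs = F.log_softmax(policy_logits, dim=-1)                # [T, B, A]
    return log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)  # [T, B]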
def test_vtrace_from_logits(self, batch_size):
  """Tests V-trace calculated from logits."""
  seq_len = 5
  num_actions = 3
  clip_rho_threshold = None  # No clipping.
  clip_pg_rho_threshold = None  # No clipping.

  # Intentionally leaving shapes unspecified to test if V-trace can
  # deal with that.
  placeholders = {
      # T, B, NUM_ACTIONS
      'behaviour_policy_logits':
          tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
      # T, B, NUM_ACTIONS
      'target_policy_logits':
          tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
      'actions':
          tf.placeholder(dtype=tf.int32, shape=[None, None]),
      'discounts':
          tf.placeholder(dtype=tf.float32, shape=[None, None]),
      'rewards':
          tf.placeholder(dtype=tf.float32, shape=[None, None]),
      'values':
          tf.placeholder(dtype=tf.float32, shape=[None, None]),
      'bootstrap_value':
          tf.placeholder(dtype=tf.float32, shape=[None]),
  }

  from_logits_output = vtrace.from_logits(
      clip_rho_threshold=clip_rho_threshold,
      clip_pg_rho_threshold=clip_pg_rho_threshold,
      **placeholders)

  target_log_probs = vtrace.log_probs_from_logits_and_actions(
      placeholders['target_policy_logits'], placeholders['actions'])
  behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
      placeholders['behaviour_policy_logits'], placeholders['actions'])
  log_rhos = target_log_probs - behaviour_log_probs
  ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

  values = {
      'behaviour_policy_logits':
          _shaped_arange(seq_len, batch_size, num_actions),
      'target_policy_logits':
          _shaped_arange(seq_len, batch_size, num_actions),
      'actions':
          np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
      'discounts':
          np.array(  # T, B where B_i: [0.9 / (i+1)] * T
              [[0.9 / (b + 1) for b in range(batch_size)]
               for _ in range(seq_len)]),
      'rewards':
          _shaped_arange(seq_len, batch_size),
      'values':
          _shaped_arange(seq_len, batch_size) / batch_size,
      'bootstrap_value':
          _shaped_arange(batch_size) + 1.0,  # B
  }
  feed_dict = {placeholders[k]: v for k, v in values.items()}

  with self.test_session() as session:
    from_logits_output_v = session.run(
        from_logits_output, feed_dict=feed_dict)
    (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
     ground_truth_target_action_log_probs) = session.run(
         ground_truth, feed_dict=feed_dict)

  # Calculate V-trace using the ground truth logits.
  from_iw = vtrace.from_importance_weights(
      log_rhos=ground_truth_log_rhos,
      discounts=values['discounts'],
      rewards=values['rewards'],
      values=values['values'],
      bootstrap_value=values['bootstrap_value'],
      clip_rho_threshold=clip_rho_threshold,
      clip_pg_rho_threshold=clip_pg_rho_threshold)

  with self.test_session() as session:
    from_iw_v = session.run(from_iw)

  self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
  self.assertAllClose(from_iw_v.pg_advantages,
                      from_logits_output_v.pg_advantages)
  self.assertAllClose(ground_truth_behaviour_action_log_probs,
                      from_logits_output_v.behaviour_action_log_probs)
  self.assertAllClose(ground_truth_target_action_log_probs,
                      from_logits_output_v.target_action_log_probs)
  self.assertAllClose(ground_truth_log_rhos, from_logits_output_v.log_rhos)
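# For reference, the V-trace targets computed by `from_importance_weights`
# follow the backward recursion from the IMPALA paper:
#   v_s = V(x_s) + delta_s + discount_s * c_s * (v_{s+1} - V(x_{s+1})),
#   delta_s = rho_s * (r_s + discount_s * V(x_{s+1}) - V(x_s)),
# with rho_s and c_s the (optionally clipped) importance weights. A minimal
# NumPy sketch of this recursion (a hypothetical helper, not the library's
# implementation):
import numpy as np


def vtrace_reference(log_rhos, discounts, rewards, values, bootstrap_value,
                     clip_rho_threshold=1.0, clip_c_threshold=1.0):
    """Sketch: V-trace targets for [T, B] inputs and [B] bootstrap_value."""
    rhos = np.exp(log_rhos)
    clipped_rhos = (np.minimum(clip_rho_threshold, rhos)
                    if clip_rho_threshold is not None else rhos)
    cs = (np.minimum(clip_c_threshold, rhos)
          if clip_c_threshold is not None else rhos)
    # V(x_{s+1}) for every step, bootstrapping the final one.
    values_t_plus_1 = np.concatenate(
        [values[1:], bootstrap_value[None, :]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
    # Accumulate v_s - V(x_s) backwards in time.
    acc = np.zeros_like(bootstrap_value)
    vs_minus_v = np.zeros_like(values)
    for t in reversed(range(len(values))):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    return vs_minus_v + values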
def main():
    """Main function: the IMPALA learner loop."""
    # Create the environment.
    env = make_env(ENV_NAME)
    set_random_seed()
    device = get_device()
    net = A2C(env.observation_space.shape, env.action_space.n).to(device)
    net.apply(weights_init)
    writer = SummaryWriter(comment="-" + ENV_NAME)
    log(net)

    # Initialize ZMQ.
    context, act_sock, buf_sock = init_zmq()
    # Wait for input before starting.
    log("Press Enter when the actors are ready: ")
    input()

    # Publish the initial model to start the actors.
    log("sending parameters to actors…")
    publish_model(net, act_sock)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    # optimizer = optim.RMSprop(net.parameters(),
    #                           lr=RMS_LR,
    #                           eps=RMS_EPS,
    #                           momentum=RMS_MOMENTUM)
    fps = 0.0
    p_time = None
    step_idx = 1
    max_reward = -1000

    # Discount factors: gamma^i per unroll step, repeated for each batch.
    discounts = np.array([pow(GAMMA, i) for i in range(NUM_UNROLL)])
    discounts = np.repeat(discounts, NUM_BATCH).reshape(NUM_UNROLL, NUM_BATCH)
    discounts_v = torch.Tensor(discounts).to(device)

    while True:
        # Request a training batch from the buffer.
        log("request new batch {}.".format(step_idx))
        st = time.time()
        buf_sock.send(b'')
        payload = buf_sock.recv()
        log("receive batch elapse {:.2f}".format(time.time() - st))

        if payload == b'not enough':
            # Not enough data buffered yet.
            log("not enough data to batch.")
            time.sleep(1)
        else:
            # Train on the batch.
            st = time.time()
            step_idx += 1
            optimizer.zero_grad()
            batch, ainfos, binfo = pickle.loads(payload)
            states, actor_logits_in, actions, rewards, last_states = batch
            states_v = torch.Tensor(states).to(device)

            # Run the learner's model over each batch element.
            logits = []
            values = []
            bsvalues = []
            last_state_idx = []
            for bi in range(NUM_BATCH):
                # Predict with the learner's model.
                logit, value = net(states_v[bi])
                logits.append(logit)
                values.append(value.squeeze(1))
                if last_states[bi] is not None:
                    # Collect the last state for bootstrapping.
                    _, bsvalue = net(
                        torch.Tensor([last_states[bi]]).to(device))
                    bsvalues.append(bsvalue.squeeze(1))
                    last_state_idx.append(bi)

            # Get log-probabilities from the learner/actor logits and the
            # actions, then compute the importance-sampling weights.
            learner_logits = torch.stack(logits).permute(1, 0, 2)
            learner_values = torch.stack(values).permute(1, 0)
            # The behaviour (actor) logits come from the batch; the learner's
            # logits were recomputed above.
            actor_logits = torch.Tensor(np.array(actor_logits_in)).\
                to(device).permute(1, 0, 2)
            actor_actions = torch.LongTensor(actions).to(device).permute(1, 0)
            actor_rewards = torch.Tensor(rewards).to(device).permute(1, 0)
            # Concatenate the collected (1,)-shaped bootstrap values.
            bootstrap_value = (torch.cat(bsvalues) if bsvalues
                               else torch.Tensor([])).to(device)
            learner_log_probs = \
                log_probs_from_logits_and_actions(learner_logits,
                                                  actor_actions)
            actor_log_probs = \
                log_probs_from_logits_and_actions(actor_logits, actor_actions)
            log_rhos = learner_log_probs - actor_log_probs

            # Get the V-trace result from the importance-sampling weights.
            vtrace_ret = from_importance_weights(
                log_rhos=log_rhos,
                discounts=discounts_v,
                rewards=actor_rewards,
                values=learner_values,
                bootstrap_value=bootstrap_value,
                last_state_idx=last_state_idx)

            # Compute the losses and backpropagate.
            pg_loss, entropy_loss, baseline_loss, total_loss = \
                calc_loss_and_backprop(learner_logits, learner_values,
                                       actor_actions, vtrace_ret)
            grads = np.concatenate([
                p.grad.data.cpu().numpy().flatten()
                for p in net.parameters() if p.grad is not None
            ])

            # Clip the gradients.
            nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
            optimizer.step()

            if step_idx % SHOW_FREQ == 0:
                # Post to TensorBoard (per frame).
                # frame_idx = step_idx * NUM_BATCH * NUM_UNROLL
                write_tb(writer, step_idx, vtrace_ret, learner_values,
                         entropy_loss, pg_loss, baseline_loss, total_loss,
                         grads, ainfos, binfo)

            # Save the model with the best reward.
            _max_reward = np.max([ainfo.reward for ainfo in ainfos.values()])
            if _max_reward > max_reward and step_idx % SAVE_FREQ == 0:
                log("save best model - reward {:.2f}".format(_max_reward))
                torch.save(net, ENV_NAME + "-best.dat")
                max_reward = _max_reward

            # Publish the model.
            if step_idx % PUBLISH_FREQ == 0:
                publish_model(net, act_sock)

            if p_time is not None:
                elapsed = time.time() - p_time
                fps = 1.0 / elapsed
                log("train elapsed {:.2f} speed {:.2f} f/s".format(
                    elapsed, fps))

            p_time = time.time()

    writer.close()
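# `calc_loss_and_backprop` is referenced above but not defined in this
# section. A plausible sketch assuming the standard IMPALA losses (policy
# gradient weighted by V-trace advantages, a baseline value loss, and an
# entropy bonus); the coefficient names and the `vtrace_ret` fields (`vs`,
# `pg_advantages`) are assumptions based on how they are used above:
import torch.nn.functional as F

BASELINE_COEF = 0.5   # hypothetical baseline loss weight
ENTROPY_COEF = 0.01   # hypothetical entropy bonus weight


def calc_loss_and_backprop(learner_logits, learner_values, actions,
                           vtrace_ret):
    """Sketch: IMPALA-style losses from V-trace outputs, then backward()."""
    log_probs = F.log_softmax(learner_logits, dim=-1)
    probs = F.softmax(learner_logits, dim=-1)
    action_log_probs = log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)

    # Policy gradient loss weighted by the (detached) V-trace advantages.
    pg_loss = -(action_log_probs * vtrace_ret.pg_advantages.detach()).sum()
    # Baseline regression toward the (detached) V-trace value targets.
    baseline_loss = BASELINE_COEF * 0.5 * \
        (vtrace_ret.vs.detach() - learner_values).pow(2).sum()
    # Negative entropy: minimizing this term keeps the policy stochastic.
    entropy_loss = ENTROPY_COEF * (probs * log_probs).sum()

    total_loss = pg_loss + baseline_loss + entropy_loss
    total_loss.backward()
    return pg_loss, entropy_loss, baseline_loss, total_loss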
def test_vtrace_from_logit():
    """Tests V-trace calculated from logits."""
    seq_len = 5  # n-step
    num_actions = 3
    batch_size = 2
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    np.random.seed(0)
    values = {
        'behavior_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'target_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'actions':
            np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
        'discounts':
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)]),
        'rewards':
            _shaped_arange(seq_len, batch_size),
        'values':
            _shaped_arange(seq_len, batch_size) / batch_size,
        'bootstrap_value':
            _shaped_arange(batch_size) + 1.0,  # B
    }

    from_logit_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **values)

    ground_truth_target_log_probs = \
        vtrace.log_probs_from_logits_and_actions(
            values['target_policy_logits'], values['actions'])
    ground_truth_behavior_log_probs = \
        vtrace.log_probs_from_logits_and_actions(
            values['behavior_policy_logits'], values['actions'])
    ground_truth_log_rhos = ground_truth_target_log_probs - \
        ground_truth_behavior_log_probs

    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values['discounts'],
        rewards=values['rewards'],
        values=values['values'],
        bootstrap_value=values['bootstrap_value'],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    # Importance-weight results == logit results == ground truth.
    for g, o in zip(from_iw.vs, from_logit_output.vs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(from_iw.pg_advantages, from_logit_output.pg_advantages):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_behavior_log_probs,
                    from_logit_output.behavior_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_target_log_probs,
                    from_logit_output.target_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_log_rhos, from_logit_output.log_rhos):
        assert np.allclose(g, o.data.tolist())

    # Inputs that a subsequent loss calculation would use.
    logits = torch.Tensor(values['behavior_policy_logits'])
    actions = torch.LongTensor(values['actions'])
    advantages = from_iw.pg_advantages
    # loss = calc_loss(logits, actions, advantages)