        maddpg.memory.push(obs_list[:10], action_list[:10], next_obs_list[:10],
                           red_fighter_reward)
        maddpg.memory.push(obs_list[10:], action_list[10:], next_obs_list[10:],
                           blue_fighter_reward)
        # if done, perform a learn step and finish the episode
        if env.get_done():
            if maddpg.episode_done > maddpg.episodes_before_train:
                logger.info('done and training now begins...')
                c_loss, a_loss = maddpg.update_policy()
                agent0_c_loss = float(c_loss[0].data.cpu().numpy())
                agent0_a_loss = float(a_loss[0].data.cpu().numpy())
                # save loss
                if not IS_TEST:
                    loss_w(agent0_c_loss,
                           'train/{}/pics/agent0_c_loss.txt'.format(PICS_PATH))
                    loss_w(agent0_a_loss,
                           'train/{}/pics/agent0_a_loss.txt'.format(PICS_PATH))
            break
        # if not done, learn every LEARN_INTERVAL steps
        if maddpg.episode_done > maddpg.episodes_before_train and (
                step_cnt % LEARN_INTERVAL == 0):
            logger.info('training now begins...')
            c_loss, a_loss = maddpg.update_policy()
            agent0_c_loss = float(c_loss[0].data.cpu().numpy())
            agent0_a_loss = float(a_loss[0].data.cpu().numpy())
            # save loss
            if not IS_TEST:
            # build the model observation (normalize)
            course = tmp_course / 359.
            pos = tmp_pos / size_x
            r_visible_pos = tmp_r_visible_pos.reshape(1, -1)[0] / size_x  # (20,)
            j_visible_fp = tmp_j_visible_fp.reshape(1, -1)[0] / 359.      # (10,)
            g_visible_pos = tmp_g_visible_pos.reshape(1, -1)[0] / size_x  # (20,)
            # TODO: normalization
            obs = np.concatenate((course, pos, r_visible_pos, j_visible_fp,
                                  g_visible_pos), axis=0)
            # store the transition
            dqn.memory.push(obs_list[y], action_list[y], copy.deepcopy(obs),
                            fighter_reward[y])
        # if done, perform a learn step and finish the episode
        if env.get_done():
            # detector_model.learn()
            dqn.learn()
            logger.info('episode: %d, reward = %f' % (i_episode, total_reward))
            logger.info('e_greedy: %f' % dqn.epsilon)
            # store total_reward
            reward_w(total_reward, 'train/dqn/pics/reward.txt')
            break
        # if not done, learn every LEARN_INTERVAL steps
        if (step_cnt > 0) and (step_cnt % LEARN_INTERVAL == 0):
            dqn.learn()
        step_cnt += 1
# store loss history
loss_w(dqn.cost_his, 'train/dqn/pics/loss.txt')
logger.info('**********train finish!**************')
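# Note: loss_w and reward_w are not defined in this excerpt. Judging from the
# calls above (a scalar value or a loss list plus a .txt path under
# train/.../pics/), they appear to append values to a text file for later
# plotting. The sketch below is an assumption about that behaviour, not the
# repo's implementation; the name value_w and its details are hypothetical.
import os


def value_w(value, path):
    """Hypothetical append-to-file logger standing in for loss_w/reward_w."""
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'a') as f:
        if isinstance(value, (list, tuple)):
            # e.g. dqn.cost_his: one loss value per line
            f.write('\n'.join(str(v) for v in value) + '\n')
        else:
            f.write('{}\n'.format(value))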
                                  axis=0)
            next_obs_list.append(obs)
        # store the joint transition
        maddpg.memory.push(obs_list, action_list, next_obs_list, fighter_reward)
        # if done, perform a learn step and finish the episode
        if env.get_done():
            if maddpg.episode_done > maddpg.episodes_before_train:
                logger.info('done and training now begins...')
                c_loss, a_loss = maddpg.update_policy()
                agent0_c_loss = float(c_loss[0].data.cpu().numpy())
                agent0_a_loss = float(a_loss[0].data.cpu().numpy())
                # save loss
                loss_w(agent0_c_loss, 'train/maddpg3/pics/agent0_c_loss.txt')
                loss_w(agent0_a_loss, 'train/maddpg3/pics/agent0_a_loss.txt')
            break
        # if not done, learn every LEARN_INTERVAL steps
        if maddpg.episode_done > maddpg.episodes_before_train and (
                step_cnt % LEARN_INTERVAL == 0):
            logger.info('training now begins...')
            c_loss, a_loss = maddpg.update_policy()
            agent0_c_loss = float(c_loss[0].data.cpu().numpy())
            agent0_a_loss = float(a_loss[0].data.cpu().numpy())
            # save loss
            loss_w(agent0_c_loss, 'train/maddpg3/pics/agent0_c_loss.txt')
            loss_w(agent0_a_loss, 'train/maddpg3/pics/agent0_a_loss.txt')
        step_cnt += 1
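# Note: maddpg.memory is not shown in this excerpt. Its push() is called once
# per joint transition (observations, actions, next observations, rewards for
# all agents). A minimal sketch of a buffer with that interface, assuming the
# common namedtuple-based ring-buffer design used in PyTorch MADDPG examples
# (the class and field names here are illustrative, not the repo's own):
import random
from collections import namedtuple

Experience = namedtuple('Experience',
                        ('states', 'actions', 'next_states', 'rewards'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # overwrite the oldest entry once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Experience(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)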
            tmp_img_obs = tmp_img_obs.transpose(2, 0, 1)
            tmp_info_obs = red_obs_dict['fighter'][y]['info']
            fighter_model.memory.push(obs_list[y]['screen'], obs_list[y]['info'],
                                      action_list[y], copy.deepcopy(tmp_img_obs),
                                      copy.deepcopy(tmp_info_obs),
                                      fighter_reward[y])
        # if done, perform a learn step and finish the episode
        if env.get_done():
            # detector_model.learn()
            fighter_model.learn()
            logger.info('episode: %d, reward = %f' % (i_episode, total_reward))
            logger.info('e_greedy: %f' % fighter_model.epsilon)
            # store total_reward
            reward_w(total_reward, 'train/simple/pics/reward.txt')
            break
        # if not done, learn every LEARN_INTERVAL steps
        if (step_cnt > 0) and (step_cnt % LEARN_INTERVAL == 0):
            # detector_model.learn()
            fighter_model.learn()
        step_cnt += 1
# store loss history
loss_w(fighter_model.cost_his, 'train/simple/pics/loss.txt')
logger.info('**********train finish!**************')
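# Note: the transpose(2, 0, 1) above reorders the screen observation from
# HWC (height, width, channels), the usual layout for image-like observations,
# to the CHW layout that PyTorch convolution layers expect. A standalone
# illustration with a made-up observation shape:
import numpy as np

screen = np.zeros((100, 100, 5), dtype=np.float32)  # hypothetical HWC screen
chw = screen.transpose(2, 0, 1)                     # axes -> (C, H, W)
print(chw.shape)                                    # (5, 100, 100)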