Example #1
                # store transitions for both teams: the first 10 list entries
                # are the red fighters, the remaining 10 are the blue fighters
                maddpg.memory.push(obs_list[:10], action_list[:10],
                                   next_obs_list[:10], red_fighter_reward)
                maddpg.memory.push(obs_list[10:], action_list[10:],
                                   next_obs_list[10:], blue_fighter_reward)

            # if done, perform a learning update
            if env.get_done():
                if maddpg.episode_done > maddpg.episodes_before_train:
                    logger.info('done and training now begins...')
                    c_loss, a_loss = maddpg.update_policy()
                    agent0_c_loss = float(c_loss[0].data.cpu().numpy())
                    agent0_a_loss = float(a_loss[0].data.cpu().numpy())
                    # save loss
                    if not IS_TEST:
                        loss_w(
                            agent0_c_loss,
                            'train/{}/pics/agent0_c_loss.txt'.format(
                                PICS_PATH))
                        loss_w(
                            agent0_a_loss,
                            'train/{}/pics/agent0_a_loss.txt'.format(
                                PICS_PATH))
                break
            # if not done, learn every LEARN_INTERVAL steps
            if maddpg.episode_done > maddpg.episodes_before_train and (
                    step_cnt % LEARN_INTERVAL == 0):
                logger.info('training now begins...')
                c_loss, a_loss = maddpg.update_policy()
                agent0_c_loss = float(c_loss[0].data.cpu().numpy())
                agent0_a_loss = float(a_loss[0].data.cpu().numpy())
                # save loss
                if not IS_TEST:
                    loss_w(
                        agent0_c_loss,
                        'train/{}/pics/agent0_c_loss.txt'.format(
                            PICS_PATH))
                    loss_w(
                        agent0_a_loss,
                        'train/{}/pics/agent0_a_loss.txt'.format(
                            PICS_PATH))
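The loss_w and reward_w helpers used throughout these examples are not shown in the snippets. A minimal sketch of what such an append-to-file logger could look like, assuming each call simply appends one value (or a list of values, such as dqn.cost_his) to the given text file:

import os

def loss_w(value, path):
    # Hypothetical helper (not shown in the snippets above): append one loss
    # value, or a list of values, to a text file, creating the directory first.
    d = os.path.dirname(path)
    if d:
        os.makedirs(d, exist_ok=True)
    values = value if isinstance(value, (list, tuple)) else [value]
    with open(path, 'a') as f:
        for v in values:
            f.write('%f\n' % float(v))

# reward_w is assumed to work the same way, just applied to episode rewards
reward_w = loss_w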
Example #2
                    # model obs change: normalize each component
                    course = tmp_course / 359.
                    pos = tmp_pos / size_x
                    r_visible_pos = tmp_r_visible_pos.reshape(1, -1)[0] / size_x  # (20,)
                    j_visible_fp = tmp_j_visible_fp.reshape(1, -1)[0] / 359.  # (10,)
                    g_visible_pos = tmp_g_visible_pos.reshape(1, -1)[0] / size_x  # (20,)
                    # TODO: normalize
                    obs = np.concatenate((course, pos, r_visible_pos, j_visible_fp, g_visible_pos), axis=0)

                    # store
                    dqn.memory.push(obs_list[y], action_list[y], copy.deepcopy(obs), fighter_reward[y])

            # if done, perform a learning update
            if env.get_done():
                # detector_model.learn()
                dqn.learn()
                logger.info('episode: %d, reward = %f' % (i_episode, total_reward))
                logger.info('e_greedy: %f' % dqn.epsilon)
                # store total_reward
                reward_w(total_reward, 'train/dqn/pics/reward.txt')
                break

            # if not done, learn every LEARN_INTERVAL steps
            if (step_cnt > 0) and (step_cnt % LEARN_INTERVAL == 0):
                dqn.learn()

            step_cnt += 1
    # store loss
    loss_w(dqn.cost_his, 'train/dqn/pics/loss.txt')
    logger.info('**********train finish!**************')
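The normalization step in this example scales headings by 359 (the maximum course in degrees) and positions by the map width size_x, flattens the visibility arrays, and concatenates everything into a single flat observation vector. A self-contained sketch of the same idea as a helper function (the name build_obs and the use of np.ravel are illustrative, not part of the original code):

import numpy as np

def build_obs(tmp_course, tmp_pos, tmp_r_visible_pos, tmp_j_visible_fp,
              tmp_g_visible_pos, size_x):
    # Illustrative helper mirroring the normalization above: courses are
    # scaled by 359., positions by the map width size_x, and the flattened
    # pieces are concatenated into one observation vector.
    course = np.ravel(tmp_course).astype(np.float32) / 359.
    pos = np.ravel(tmp_pos).astype(np.float32) / size_x
    r_visible_pos = np.ravel(tmp_r_visible_pos).astype(np.float32) / size_x  # (20,)
    j_visible_fp = np.ravel(tmp_j_visible_fp).astype(np.float32) / 359.      # (10,)
    g_visible_pos = np.ravel(tmp_g_visible_pos).astype(np.float32) / size_x  # (20,)
    return np.concatenate((course, pos, r_visible_pos,
                           j_visible_fp, g_visible_pos), axis=0)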
Example #3
                    axis=0)
                next_obs_list.append(obs)

            # store
            maddpg.memory.push(obs_list, action_list, next_obs_list,
                               fighter_reward)

            # if done, perform a learning update
            if env.get_done():
                if maddpg.episode_done > maddpg.episodes_before_train:
                    logger.info('done and training now begins...')
                    c_loss, a_loss = maddpg.update_policy()
                    agent0_c_loss = float(c_loss[0].data.cpu().numpy())
                    agent0_a_loss = float(a_loss[0].data.cpu().numpy())
                    # save loss
                    loss_w(agent0_c_loss,
                           'train/maddpg3/pics/agent0_c_loss.txt')
                    loss_w(agent0_a_loss,
                           'train/maddpg3/pics/agent0_a_loss.txt')
                break
            # if not done, learn every LEARN_INTERVAL steps
            if maddpg.episode_done > maddpg.episodes_before_train and (
                    step_cnt % LEARN_INTERVAL == 0):
                logger.info('training now begins...')
                c_loss, a_loss = maddpg.update_policy()
                agent0_c_loss = float(c_loss[0].data.cpu().numpy())
                agent0_a_loss = float(a_loss[0].data.cpu().numpy())
                # save loss
                loss_w(agent0_c_loss, 'train/maddpg3/pics/agent0_c_loss.txt')
                loss_w(agent0_a_loss, 'train/maddpg3/pics/agent0_a_loss.txt')

            step_cnt += 1
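Examples 1 and 3 gate every update on two conditions: a warm-up of episodes_before_train episodes must have elapsed, and mid-episode updates only fire every LEARN_INTERVAL steps, with one extra update when the episode ends. A stripped-down sketch of that schedule, assuming an agent object with the same counters (names here are placeholders for the MADDPG wrapper used above):

from collections import namedtuple

DummyAgent = namedtuple('DummyAgent', ('episode_done', 'episodes_before_train'))

def should_update(agent, step_cnt, done, learn_interval=100):
    # True when the schedule above would call update_policy().
    if agent.episode_done <= agent.episodes_before_train:
        return False          # still in the warm-up phase, only collect data
    if done:
        return True           # one final update at the end of the episode
    return step_cnt % learn_interval == 0   # periodic update mid-episode

# e.g. should_update(DummyAgent(episode_done=150, episodes_before_train=100),
#                    step_cnt=200, done=False)  ->  True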
Example #4
                    tmp_img_obs = tmp_img_obs.transpose(2, 0, 1)  # (H, W, C) -> (C, H, W)
                    tmp_info_obs = red_obs_dict['fighter'][y]['info']
                    fighter_model.memory.push(obs_list[y]['screen'],
                                              obs_list[y]['info'],
                                              action_list[y],
                                              copy.deepcopy(tmp_img_obs),
                                              copy.deepcopy(tmp_info_obs),
                                              fighter_reward[y])

            # if done, perform a learning update
            if env.get_done():
                # detector_model.learn()
                fighter_model.learn()
                logger.info('episode: %d, reward = %f' %
                            (i_episode, total_reward))
                logger.info('e_greedy: %f' % fighter_model.epsilon)
                # store total_reward
                reward_w(total_reward, 'train/simple/pics/reward.txt')
                break

            # if not done, learn every LEARN_INTERVAL steps
            if (step_cnt > 0) and (step_cnt % LEARN_INTERVAL == 0):
                # detector_model.learn()
                fighter_model.learn()

            step_cnt += 1

    # store loss
    loss_w(fighter_model.cost_his, 'train/simple/pics/loss.txt')
    logger.info('**********train finish!**************')
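All four examples push transitions into a memory object via push(...). The snippets do not include that class; a common minimal implementation, assumed here rather than taken from the original code, is a fixed-capacity ring buffer of named tuples with uniform random sampling (Example 4 would store the screen and info observations as separate fields):

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('obs', 'action', 'next_obs', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # overwrite the oldest transition once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)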