def config_log(FLAGS):
    logdir = "tensorboard/%s/hrl_a2c_svib/%s_lr%s_%s/%s_%s_%s" % (
        FLAGS.env,FLAGS.num_timesteps, '0.0007',FLAGS.policy, start_time, FLAGS.train_option, str(FLAGS.beta))
    if FLAGS.log == "tensorboard":
        Logger.DEFAULT = Logger.CURRENT = Logger(dir=logdir, output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT = Logger.CURRENT = Logger(dir=logdir, output_formats=[HumanOutputFormat(sys.stdout)])
Example #2
def main():
    FLAGS(sys.argv)
    logdir = "tensorboard"
    if FLAGS.algorithm == "deepq":
        logdir = "tensorboard/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])
    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])
    print("env : %s" % FLAGS.env)
    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("lr : %s" % FLAGS.lr)
    # Choose which RL algorithm to train.
    if FLAGS.algorithm == "deepq":  # Use DQN
        train_dqn(env_id=FLAGS.env, num_timesteps=FLAGS.timesteps)
Example #3
def main():
  FLAGS(sys.argv)

  logdir = "tensorboard"
  if(FLAGS.algorithm == "deepq"):
    logdir = "./tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.exploration_fraction,
      FLAGS.prioritized,
      FLAGS.dueling,
      FLAGS.lr,
      start_time
    )

  if(FLAGS.log == "tensorboard"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir='log.txt',
               output_formats=[TensorBoardOutputFormat(logdir)])

  elif(FLAGS.log == "stdout"):
    os.makedirs(logdir, exist_ok=True)
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[HumanOutputFormat(logdir+"/log.txt")])

  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",
      minimap_size_px=(FLAGS.minimap_size_px, FLAGS.minimap_size_px),
      step_mul=FLAGS.step_mul,
      visualize=FLAGS.visualize,
      game_steps_per_episode=FLAGS.episode_steps) as env:

    model = deepq.models.cnn_to_mlp(
      convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1), (64, 3, 1), (32, 3, 1)],
      hiddens=[256],
      dueling=True
    )

    act = dqfd.learn(
      env,
      q_func=model,
      num_actions=FLAGS.num_actions,
      lr=FLAGS.lr,
      print_freq=FLAGS.print_freq,
      max_timesteps=FLAGS.timesteps,
      buffer_size=FLAGS.buffer_size,
      exploration_fraction=FLAGS.exploration_fraction,
      exploration_final_eps=FLAGS.exploration_final_eps,
      train_freq=FLAGS.train_freq,
      learning_starts=FLAGS.learning_starts,
      target_network_update_freq=FLAGS.target_network_update_freq,
      gamma=FLAGS.gamma,
      prioritized_replay=FLAGS.prioritized,
      callback=deepq_callback
    )
    act.save("defeat_zerglings.pkl")
Example #4
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:
        obs = env.reset()
        #print(obs[0].observation)
        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[256],
                                        dueling=True)
        demo_replay = []
        act = dqfd.learn(env,
                         q_func=model,
                         num_actions=3,
                         lr=1e-4,
                         max_timesteps=10000000,
                         buffer_size=100000,
                         exploration_fraction=0.5,
                         exploration_final_eps=0.01,
                         train_freq=2,
                         learning_starts=100000,
                         target_network_update_freq=1000,
                         gamma=0.99,
                         prioritized_replay=True,
                         callback=deepq_callback)
        act.save("defeat_zerglings.pkl")
Example #5
def main():
    if osp.exists(LOGDIR):
        shutil.rmtree(LOGDIR)
    os.makedirs(LOGDIR)
    if not osp.exists(CKPTDIR):
        os.makedirs(CKPTDIR)
    Logger.DEFAULT = Logger.CURRENT = Logger(
        dir=None,
        output_formats=[
            HumanOutputFormat(sys.stdout),
            CSVOutputFormat(osp.join(LOGDIR, 'log.csv'))
        ])
    train()
Example #6
def main():
    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",
            step_mul=step_mul,
            visualize=True,
            agent_interface_format=sc2_env.AgentInterfaceFormat(
                feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32)),
            game_steps_per_episode=steps * step_mul) as env:

        print(env.observation_spec())
        screen_dim = env.observation_spec()[0]['feature_screen'][1:3]
        print(screen_dim)
Example #7
def configure(dir, format_strs=None, custom_output_formats=None):
    if not dir:
        return

    assert isinstance(dir, str)
    os.makedirs(dir, exist_ok=True)

    if format_strs is None:
        strs = os.getenv('OPENAI_LOG_FORMAT')
        format_strs = strs.split(',') if strs else LOG_OUTPUT_FORMATS
    output_formats = [make_output_format(f, dir) for f in format_strs]

    if custom_output_formats is not None:
        assert isinstance(custom_output_formats, list)
        for custom_output_format in custom_output_formats:
            assert isinstance(custom_output_format, KVWriter)
        output_formats.extend(custom_output_formats)

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
    log('Logging to %s' % dir)
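
# A minimal, illustrative call of the configure() helper above. The directory
# and metric names are made up; record_tabular/dump_tabular are assumed to be
# the Baselines-style module-level helpers used in the other examples, and the
# format strings follow the Baselines make_output_format convention.
configure(dir="logs/example_run", format_strs=["stdout", "csv"])
record_tabular("episode_reward", 12.5)
dump_tabular()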
Example #8
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")

    env = StarCraft2Env(map_name="8m",
                        reward_only_positive=False,
                        reward_scale_rate=200,
                        state_last_action=True,
                        obs_last_action=True,
                        obs_timestep_number=True,
                        state_timestep_number=True)  #reward_defeat=-200
    env_info = env.get_env_info()

    n_episodes = 2500  #4000    #2000
    timesteps = 500000
    n_agents = env_info["n_agents"]
    n_actions = env_info["n_actions"]
    output_len = n_actions
    lr = 0.002
    buffer_size = 70000  # int(timesteps * 0.1)  # 80000  # reduce this a bit; ideally about 1/10 of the training steps.  70000  test 200  80000 20000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    local_obs_len = 179  # local obs:80 ; global state:168;
    global_state_len = 348  # 179+169

    hidden_vector_len = 256  # 128  # 1  256
    tau = 0.001
    num_exploring = buffer_size  # buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    critic_output_len = 1

    logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr,
                                            start_time)

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, local_obs_len,
                         output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, global_state_len, critic_output_len,
                           hidden_vector_len, n_actions)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)

    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    step_train = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        local_obs = env.get_obs()
        local_obs = np.array(local_obs)
        global_state = env.get_state()
        global_state_expand = np.zeros(
            [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]])
        reward_hl_own_old = []
        reward_hl_en_old = []
        episode_reward_agent = [0 for n in range(n_agents)]
        for i in range(local_obs.shape[0]):
            global_state_expand[i] = np.append(local_obs[i],
                                               global_state.flatten())
            reward_hl_own_old.append(env.get_agent_health(i))
            reward_hl_en_old.append(env.get_enemy_health(i))

        while not terminated:
            t = t + 1
            critic_input = np.expand_dims(global_state_expand, axis=0)
            actor_input = np.expand_dims(local_obs, axis=0)
            action = actor.predict(actor_input)[0]
            act_with_noise = action  #np.clip(action + action_noise.get_noise(step_train), action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

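            # For each agent, renormalize the actor's output over the currently
            # available actions and sample one action from that distribution;
            # agents whose only available action is no-op (index 0) are treated as dead.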
            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]

                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act

                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)

                if (len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    dead_unit.append(agent_id)

            reward_base, terminated, info = env.step(actions)

            new_local_obs = env.get_obs()
            new_local_obs = np.array(new_local_obs)
            new_global_state = env.get_state()
            new_global_state_expand = np.zeros([
                new_local_obs.shape[0],
                new_local_obs.shape[1] + new_global_state.shape[0]
            ])
            reward_hl_own_new = []
            reward_hl_en_new = []
            for i in range(new_local_obs.shape[0]):
                new_global_state_expand[i] = np.append(
                    new_local_obs[i], new_global_state.flatten())
                reward_hl_own_new.append(env.get_agent_health(i))
                reward_hl_en_new.append(env.get_enemy_health(i))

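            # Per-agent reward shaping: dead agents get 0; living agents pay a small
            # step cost, earn a bonus for damaging their attack target (or a smaller
            # one for attacking without dealing damage), are penalized in proportion
            # to their own health loss, and receive +/-10 at episode end depending on
            # whether the battle was won.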
            for i in range(n_agents):
                if (i in dead_unit):
                    rew_expand[i] = 0
                else:
                    rew_expand[i] = -0.05
                    if (actions[i] > 5):
                        target_id = actions[i] - 6
                        health_reduce_en = reward_hl_en_old[
                            target_id] - reward_hl_en_new[target_id]
                        if (health_reduce_en > 0):
                            rew_expand[i] += 2 + health_reduce_en * 5
                            # if (reward_base > 50):
                            #     rew_expand[i] += 20
                        else:
                            rew_expand[i] += 1
                    else:
                        rew_expand[i] += (reward_hl_own_new[i] -
                                          reward_hl_own_old[i]) * 5
                #
                if (terminated):
                    if (info["battle_won"] is False):
                        rew_expand[i] += -10
                    else:
                        rew_expand[i] += 10

                episode_reward_agent[i] += rew_expand[i]

            replay_buffer.add(local_obs, global_state_expand, act_with_noise,
                              rew_expand, terminated, new_local_obs,
                              new_global_state_expand)

            episode_reward += reward_base
            local_obs = new_local_obs
            global_state_expand = new_global_state_expand
            if (t == num_exploring):
                print("training starts")
            if (t >= num_exploring):
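                # DDPG-style update: bootstrap a TD target from the target actor/critic,
                # fit the critic to it, then update the actor along the critic's action
                # gradients and soft-update the target networks.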
                local_s_batch, global_s_batch, a_batch, r_batch, done_batch, local_s2_batch, global_s2_batch = replay_buffer.sample_batch(
                    batch_size
                )  # [group0:[batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(
                    global_s2_batch, actor.predict_target(local_s2_batch))
                predicted_q_value, _ = critic.train(
                    global_s_batch, a_batch,
                    np.reshape(target_q,
                               (batch_size, num_agents, critic_output_len)))
                a_outs = actor.predict(local_s_batch)  # a_outs and a_batch are identical
                grads = critic.action_gradients(global_s_batch,
                                                a_outs)  # gradient of Q with respect to a
                actor.train(local_s_batch, grads)
                step_train = step_train + 1

                actor.update_target_network()
                critic.update_target_network()

                if (t % save_freq == 0):
                    model_file_save = os.path.join(
                        "model/" + str(step_train) + "_" +
                        "training_steps_model/", "8m")
                    U.save_state(model_file_save)
                    print("Model have been trained for %s times" %
                          (step_train))
                    # replay_buffer.save()

        print("steps until now : %s, episode: %s, episode reward: %s" %
              (t, e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward_episode", episode_reward)
        for i in range(n_agents):
            logger.record_tabular("reward_agent_" + str(i),
                                  episode_reward_agent[i])

        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)

    env.close()
Example #9
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)
    
    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Example #10
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "acktr"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        # def make_env(rank):
        #   # env = sc2_env.SC2Env(
        #   #   "CollectMineralShards",
        #   #   step_mul=step_mul)
        #   # return env
        #   #env.seed(seed + rank)
        #   def _thunk():
        #     env = sc2_env.SC2Env(
        #         map_name=FLAGS.map,
        #         step_mul=step_mul,
        #         visualize=True)
        #     #env.seed(seed + rank)
        #     if logger.get_dir():
        #      env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
        #     return env
        #   return _thunk

        # agents = [Agent()
        #           for _ in range(num_cpu)]
        #
        # for agent in agents:
        #   time.sleep(1)
        #   agent.daemon = True
        #   agent.start()

        # agent_controller = AgentController(agents)

        #set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)

        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
Example #11
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario,
                            reward_only_positive=False,
                            obs_last_action=True,
                            obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"], )
                       for i in range(num_adversaries)]
        action_space_n = [
            env_info["n_actions"] for i in range(num_adversaries)
        ]
        buffer_size = arglist.buffer_size

        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        logdir = "./tensorboard/"

        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None,
                     output_formats=[TensorBoardOutputFormat(logdir)])

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6

        env.reset()

        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        for agent_id in range(num_agents):  # first loop: gather initial observations and health info
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))

        episode_step = 0
        step = 0

        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif (avail_actions[0] == 1):
                    action_set_execute.append(
                        0)  # if the chosen action cannot be executed and the agent is dead, substitute NO_OP
                else:
                    action_set_execute.append(1)  # if the chosen action cannot be executed, substitute STOP

                if (len(avail_actions_ind) == 1
                        and avail_actions_ind[0] == 0):  # check whether this agent is already dead
                    dead_unit.append(agent_id)

            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base
            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []

            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))

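            # Per-agent reward shaping: dead agents get 0, a substituted (unavailable)
            # action is penalized, successful attacks are rewarded (plus the team reward
            # when it is positive), and otherwise the reward tracks the agent's own
            # health change.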
            for agent_id in range(num_agents):
                if (agent_id in dead_unit):
                    reward = 0
                elif (action_set_execute[agent_id] !=
                      action_set_actual[agent_id]
                      ):  # the chosen action could not be executed; the substitute ran instead, but the chosen action is stored and given a negative reward
                    reward = -2

                elif (action_set_execute[agent_id] > 5):
                    target_id = action_set_execute[
                        agent_id] - n_actions_no_attack
                    health_reduce_en = reward_hl_en_old[
                        target_id] - reward_hl_en_new[target_id]
                    if (health_reduce_en > 0):
                        if (rew_base > 0):
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] -
                              reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)

            episode_step += 1

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done)

            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new

            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew

            if done:
                print("steps until now : %s, episode: %s, episode reward: %s" %
                      (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()

                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                for agent_id in range(num_agents):  # gather fresh initial observations and health info
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)

            # increment global step counter
            step += 1
            if (step == arglist.buffer_size):
                print("Training starts.")

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = arglist.save_dir + "/model_" + str(
                    step) + "steps/" + arglist.exp_name
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".
                          format(step, len(episode_rewards),
                                 np.mean(
                                     episode_rewards[-arglist.save_rate:])))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}"
                        .format(step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards) - 1))
                break
def main():

  # tf.reset_default_graph()
  # config = tf.ConfigProto()
  # config.gpu_options.allow_growth = True

  FLAGS(sys.argv)
  # steps_left = FLAGS.timesteps

  logdir = "tensorboard"
  if(FLAGS.algorithm == "deepq"):
    logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.exploration_fraction,
      FLAGS.prioritized,
      FLAGS.dueling,
      FLAGS.lr,
      start_time
    )
  elif(FLAGS.algorithm == "acktr"):
    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.num_cpu,
      FLAGS.lr,
      start_time
    )
  elif(FLAGS.algorithm == "BicNet"):
    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
      FLAGS.algorithm,
      FLAGS.timesteps,
      FLAGS.num_cpu,
      FLAGS.lr,
      start_time
    )

  if(FLAGS.log == "tensorboard"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[TensorBoardOutputFormat(logdir)])

  elif(FLAGS.log == "stdout"):
    Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[HumanOutputFormat(sys.stdout)])

  AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
    feature_dimensions=sc2_env.Dimensions(screen=64, minimap=64),  # alternative: sc2_env.Dimensions(screen=84, minimap=64); both layers are processed into 32*32 matrices
    use_feature_units=True
  )

  lr = FLAGS.lr
  batch_size = 32  # 32
  gamma = 0.99
  num_agents = 9
  vector_obs_len = 33   #4096  # 32*32  1024
  output_len = 3
  hidden_vector_len = 128   #1
  tau = 0.001
  # stddev = 0.1


  sess = U.make_session()
  sess.__enter__()
  actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
  sess.run(tf.global_variables_initializer())

  # while(steps_left > 0):
  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",  #DefeatZerglingsAndBanelings
      step_mul=step_mul,
      save_replay_episodes=1,
      replay_dir="D:/StarCraft II/StarCraft II/Replays/video/0722",
      agent_interface_format=AGENT_INTERFACE_FORMAT,
      visualize=False, #True
      game_steps_per_episode=steps * step_mul) as env:

    learn(
      env,
      sess=sess,
      max_timesteps=FLAGS.timesteps,
      # callback=BicNet_callback,
      actor=actor,
      num_agents=num_agents
    )
_ATTACK_SCREEN = actions.FUNCTIONS.Attack_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_SELECT_UNIT = actions.FUNCTIONS.select_unit.id
_SELECT_POINT = actions.FUNCTIONS.select_point.id

_NOT_QUEUED = [0]
_SELECT_ALL = [0]

UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

#to record the output
start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
logdir = "./tensorboard/enjoy/%s" % start_time
Logger.DEFAULT \
      = Logger.CURRENT \
      = Logger(dir=None,
               output_formats=[TensorBoardOutputFormat(logdir)])

FLAGS = flags.FLAGS
flags.DEFINE_string("map_name", "DefeatZerglingsAndBanelings", "the map you want to see.")
flags.DEFINE_string("trained_model", "/home/tld/PycharmProjects/DeepQ_StarCraft2/models/deepq/zergling_45.6.pkl",
                    "the model you has trained.")
flags.DEFINE_bool("visualize", True, "if you want to see the game")
flags.DEFINE_integer("num_actions", 4, "numbers of your action")
flags.DEFINE_integer("step_mul", 5, "the time of every step spends")
flags.DEFINE_integer("episode_steps", 2800, "the steps of every episode spends")


def main():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name=FLAGS.map_name,
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=64, minimap=64
        )  # alternative: sc2_env.Dimensions(screen=84, minimap=64); both layers are processed into 32*32 matrices
    )
    with sc2_env.SC2Env(
            map_name="DefeatZerglingsAndBanelings",  #DefeatZerglingsAndBanelings
            step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=True,  #True
            game_steps_per_episode=steps * step_mul) as env:

        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[256],
                                        dueling=True)
        # model: should be changed to an LSTM-based form.
        demo_replay = []
        # act = dqfd.learn(
        #   env,
        #   q_func=model,
        #   num_actions=3,
        #   lr=1e-4,
        #   max_timesteps=10000000,
        #   buffer_size=100000,
        #   exploration_fraction=0.5,
        #   exploration_final_eps=0.01,
        #   train_freq=2,
        #   learning_starts=100000,
        #   target_network_update_freq=1000,
        #   gamma=0.99,
        #   prioritized_replay=True,
        #   callback=deepq_callback
        # )
        # act.save("defeat_zerglings.pkl")
        BicNet_findAndDefeatZergling.learn(
            env,
            lr=FLAGS.lr,
            max_timesteps=FLAGS.timesteps,
            buffer_size=100000,
            train_freq=1,
            learning_starts=1000,  #100000,
            target_network_update_freq=1000,
            gamma=0.99,
            callback=BicNet_callback)
def main():
    FLAGS(sys.argv)

    steps = 0  #Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too restricting
        # We need this change because sc2 now requires specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                #players=[sc2_env.Agent(sc2_env.Race.terran),sc2_env.Agent(sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)

            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)

            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="Simple64",
                            players=[
                                sc2_env.Agent(race=sc2_env.Race.terran),
                                sc2_env.Agent(race=sc2_env.Race.terran)
                            ],
                            step_mul=step_mul,
                            agent_interface_format=AGENT_INTERFACE_FORMAT,
                            visualize=True) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(  # sets interface.feature_layer.resolution and interface.feature_layer.minimap_resolution
            feature_dimensions=sc2_env.Dimensions(screen=32,
                                                  minimap=32)  # 16 16
            # feature_dimensions = sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # game speed: roughly the effective actions per second of a human player
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = deepq.models.cnn_to_mlp(  # this model takes an observation as input and returns values of all actions; note how it is used in deepq_mineral_shards.learn
                convs=[(16, 8, 4), (32, 4, 2)],  # (number of filters, kernel size, stride)
                hiddens=[256],
                dueling=True)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)  # (number of filters, kernel size, stride)
            act = deepq_mineral_shards.learn(  # train the model and save it
                # act = deepq_ActSeparate.learn(  # train the model and save it
                # act=deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  #default 16  num_actions=256   3  4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback
            )  #deepq_callback; deepq_ActSeperate_callback  ;   deepq_actSeparateWith4Directions_callback  deep_DiffActInSameTime_callback
            act.save(
                "mineral_shards.pkl"
            )  # after all training steps, save the trained model to mineral_shards.pkl for use by enjoy_mineral_shards.py

    elif (FLAGS.algorithm == "deepq-4way"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(  #
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            # model = deepq.models.mlp(hiddens=[256,128,4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    lr = 0.002
    buffer_size = 80000  # 80000  # reduce this a bit; ideally about 1/10 of the training steps.  70000  test 200  80000 20000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    vector_obs_len = 248  # local obs:80 ; global state:168;
    output_len = 14
    hidden_vector_len = 256 #128  # 1  256
    tau = 0.001
    num_exploring = buffer_size #buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    # min_life = 45

    env = StarCraft2Env(map_name="8m",reward_only_positive=False, reward_scale_rate=200)  #8m  DefeatZerglingsAndBanelings  reward_scale_rate=200
    env_info = env.get_env_info()

    n_episodes = 4000 #4000    #2000
    # n_actions = env_info["n_actions"]
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]

    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet",
        timesteps,
        16,
        lr,
        start_time
    )

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()
    # state_dim = (n_agents, vector_obs_len)
    # action_dim = (n_agents, output_len)

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
    # actor = ActorNetwork(sess, state_dim, action_dim, lr, tau, batch_size)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents, vector_obs_len,
                              output_len, hidden_vector_len)
    # critic = CriticNetwork(sess, state_dim, action_dim, lr, tau, gamma, actor.get_num_trainable_vars())
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)

    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()


    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        obs = env.get_obs()
        obs = np.array(obs)
        # state, target_attack = env.get_state()
        state, min = env.get_state()
        screen_expand = np.zeros([obs.shape[0],obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i],state.flatten())
        # screen_expand = state_transform(obs)
        # screen_expand = state_expand(state, n_agents)
        while not terminated:
            t = t+1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = np.clip(action + action_noise.get_noise(t - num_exploring), action_low, action_high)
            act_mat_norm = (act_with_noise+1)/2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))
            # punish = []
            # health_agent = []
            # health_enemy = []

            agent_group = []
            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                # print('act_unit_norm',act_unit_norm)
                # act_prob = act_unit_norm / np.sum(act_unit_norm, axis=0)
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act

                # index = np.random.choice(np.arange(0,14), p=act_prob.ravel())
                # print("act_prob",act_prob)
                index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel())
                # if (index in avail_actions_ind):
                #     punish.append(False)
                # else:
                #     punish.append(True)
                #     if (0 in avail_actions_ind):
                #         actions.append(0)
                #     else:
                #         actions.append(1)
                actions.append(index)
                # health_agent.append(state[4*agent_id])
                # health_enemy.append(state[4*n_agents + 3*agent_id])

                # if(index > 5):
                #     target_id = index - 6

                if(len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    dead_unit.append(agent_id)
            # health_agent = np.array(health_agent)
            # for i in range(len(health_enemy)):
            #     if (health_enemy[i] < min_life):
            #         min_life = health_enemy[i]
            # health_enemy = np.array(health_enemy)
            reward, terminated, _ = env.step(actions)
            # rew_expand = np.ones((n_agents, 1))*reward
            # health_enemy_new = []


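            # Reward shaping: living agents share the team reward, get a small bonus
            # for attacking, and an extra bonus when another agent is focusing the
            # same lowest-health enemy.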
            for i in range(n_agents):
                if (i not in dead_unit):
                    rew_expand[i] += reward
                    if (actions[i] > 5):
                        enemy_id = actions[i] - 6
                        rew_expand[i] += 1
                        # if(actions[i]-6 == target_attack):
                        for j in range(n_agents):
                            if (actions[j] == actions[i] and i!=j):
                                if (state[4 * n_agents + 3 * enemy_id] == min):
                                    rew_expand[i] += 1
            new_obs = env.get_obs()
            new_obs = np.array(new_obs)
            # new_state, target_attack = env.get_state()
            new_state, min = env.get_state()
            new_screen_expand = np.zeros([new_obs.shape[0], new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i], new_state.flatten())
            # health_agent_new = []
            # for i in range(n_agents):
            #     health_agent_new.append(new_state[4 * i])
            #     # health_enemy_new.append(new_state[4 * n_agents + 3 * i])
            # health_agent_new = np.array(health_agent_new)
            # health_enemy_new = np.array(health_enemy_new)
            # life_reduce_agent = health_agent - health_agent_new
            # life_reduce_agent_all = life_reduce_agent.sum(axis=0)
            # life_reduce_enemy = health_enemy - health_enemy_new
            # life_reduce_enemy_all = life_reduce_enemy.sum(axis=0)
            # reward_base = life_reduce_enemy_all - life_reduce_agent_all
            # for i in range(n_agents):
            #     rew_expand[i] += reward_base+life_reduce_agent[i]

            # for i in range(n_agents):
            #     if (punish[i]):
            #         rew_expand[i] += -2
            #     elif (i in dead_unit):
            #         rew_expand[i] += 0
            #     elif (actions[i] > 5):
            #         rew_expand[i] = 1
            #         if(health_enemy[actions[i] - 6] == min_life):
            #             rew_expand[i] = 1
            #     rew_expand[i] += life_reduce_agent[i]

            replay_buffer.add(screen_expand, act_with_noise, rew_expand, terminated, new_screen_expand)

            episode_reward += reward
            screen_expand = new_screen_expand
            # state = new_state
            # target_attack = target_attack_new

            if(t>=num_exploring):
                print("training starts")
                s_batch, a_batch, r_batch, done_batch, s2_batch = replay_buffer.sample_batch(batch_size)  # [group0:[batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(s2_batch, actor.predict_target(s2_batch))
                predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(target_q, (batch_size, num_agents, output_len)))
                a_outs = actor.predict(s_batch)  # a_outs and a_batch are exactly the same
                grads = critic.action_gradients(s_batch, a_outs)  # gradient of Q with respect to the actions
                actor.train(s_batch, grads)

                actor.update_target_network()
                critic.update_target_network()

                # if(t % save_freq == 0):
                    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
                    # U.save_state(model_file_save)
                    # replay_buffer.save()

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)

        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)

    env.close()
Exemple #18
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")
    lr = 0.002
    batch_size = 32
    num_agents = 8
    vector_obs_len = 248  # local obs: 80 + global state: 168
    output_len = 14
    hidden_vector_len = 256  # alternatives tried: 128, 1
    tau = 0.001



    env = StarCraft2Env(map_name="8m",reward_only_positive=False, reward_scale_rate=200)  #8m  DefeatZerglingsAndBanelings  reward_scale_rate=200
    env_info = env.get_env_info()

    n_episodes = 4000  # alternatives tried: 2000
    n_agents = env_info["n_agents"]
    episode_len = env_info["episode_limit"]

    timesteps = n_episodes * episode_len

    logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
        "BicNet",
        timesteps,
        16,
        lr,
        start_time
    )

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, vector_obs_len, output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents, vector_obs_len,
                              output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    model_file_load = os.path.join(str(300000) + "_" + "model_segment_training/", "defeat_zerglings")
    U.load_state(model_file_load, sess)
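    # Restores a checkpoint saved after 300000 training steps; this main() only evaluates the
    # loaded policy (no exploration noise, no replay buffer and no gradient updates below).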

    t = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        obs = env.get_obs()
        obs = np.array(obs)
        state, min = env.get_state()
        screen_expand = np.zeros([obs.shape[0],obs.shape[1] + state.shape[0]])
        for i in range(obs.shape[0]):
            screen_expand[i] = np.append(obs[i],state.flatten())

        while not terminated:
            t = t+1
            screen_input = np.expand_dims(screen_expand, axis=0)
            action = actor.predict(screen_input)[0]
            act_with_noise = action  # evaluation run: no exploration noise is added
            act_mat_norm = (act_with_noise+1)/2
            actions = []

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]
                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if sum_avail_act == 0:
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act
                index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel())

                actions.append(index)
            reward, terminated, _ = env.step(actions)

            new_obs = env.get_obs()
            new_obs = np.array(new_obs)
            new_state, min = env.get_state()
            new_screen_expand = np.zeros([new_obs.shape[0], new_obs.shape[1] + new_state.shape[0]])
            for i in range(new_obs.shape[0]):
                new_screen_expand[i] = np.append(new_obs[i], new_state.flatten())

            episode_reward += reward
            screen_expand = new_screen_expand

        print("Total reward in episode {} = {}".format(e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward", episode_reward)

        logger.dump_tabular()

    env.close()
Exemple #19
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)
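        # lr == 0 acts as a sentinel: a random learning rate is drawn for a quick hyperparameter sweep.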

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "deepq-4way"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4
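        # The 40M-step budget is divided by 4, presumably to match the frame-skip convention used
        # by the baselines A2C scripts this code appears to follow.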

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Exemple #20
def main():

    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True

    FLAGS(sys.argv)
    # steps_left = FLAGS.timesteps

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/zergling/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)
    elif (FLAGS.algorithm == "BicNet"):
        logdir = "tensorboard/zergling/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
        feature_dimensions=sc2_env.Dimensions(
            screen=32, minimap=32
        ),  # alternative: sc2_env.Dimensions(screen=84, minimap=64); both are processed into 32x32 matrices
        use_feature_units=True)

    lr = FLAGS.lr
    buffer_size = 60000  # reduce a bit; ideally about 1/10 of the training steps (alternatives tried: 50000, 70000; 200 for testing)
    batch_size = 32
    gamma = 0.99
    num_agents = 2  # alternatives tried: 9
    vector_obs_len = 736  # alternatives tried: 33, 1024 (32*32), 4096
    output_len = 4  # alternatives tried: 3

    hidden_vector_len = 128  # alternatives tried: 1
    tau = 0.001
    # stddev = 0.1

    sess = U.make_session()
    sess.__enter__()
    actor = tb.ActorNetwork(sess, lr, tau, batch_size, num_agents,
                            vector_obs_len, output_len, hidden_vector_len)
    critic = tb.CriticNetwork(sess, lr, tau, gamma,
                              actor.get_num_trainable_vars(), num_agents,
                              vector_obs_len, output_len, hidden_vector_len)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    # action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1), sigma=float(stddev) * np.ones(1))
    action_noise = noise_OU.OU_noise(decay_period=FLAGS.timesteps -
                                     buffer_size)
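    # The OU noise is set to decay over the steps remaining after the warm-up phase
    # (num_exploring = buffer_size below), during which actions are taken but no updates run.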

    # while(steps_left > 0):
    with sc2_env.SC2Env(
            map_name="CollectMineralShards",  #DefeatZerglingsAndBanelings
            # step_mul=step_mul,
            agent_interface_format=AGENT_INTERFACE_FORMAT,
            visualize=False,  #True
            game_steps_per_episode=steps * step_mul) as env:
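        # Off-policy BiCNet/DDPG training with the learn() defined earlier in this file: the first
        # `buffer_size` steps only fill the replay buffer; gradient updates start after that.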

        learn(
            env,
            sess=sess,
            max_timesteps=FLAGS.timesteps,
            train_freq=1,
            save_freq=10000,
            target_network_update_freq=1,  #1000
            gamma=gamma,
            # callback=BicNet_callback,
            actor=actor,
            critic=critic,
            replay_buffer=replay_buffer,
            num_agents=num_agents,
            action_noise=action_noise,
            output_len=output_len,
            num_exploring=buffer_size
        )