Example #1
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # the parameters below were the best found in a simple random search;
    # they are good enough to make the humanoid walk, but they are not
    # guaranteed to be optimal
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, 
            optim_stepsize=3e-4, 
            optim_batchsize=64, 
            gamma=0.99, 
            lam=0.95,
            schedule='linear',
        )
    env.close()
    if model_path:
        U.save_state(model_path)
        
    return pi
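
For context, a minimal driver for the train() function above might look like the sketch below; the timestep budget, seed, and model_path are illustrative assumptions, not values from the original script.

# Hypothetical driver for train() above; arguments are illustrative only.
def main():
    model_path = '/tmp/humanoid_ppo'   # hypothetical checkpoint prefix
    train(num_timesteps=1_000_000, seed=0, model_path=model_path)
    # To reuse the weights later, rebuild the same graph (e.g. via
    # train(num_timesteps=1, seed=0)) and call U.load_state(model_path).

if __name__ == '__main__':
    main()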
Example #2
 def save(self, path):
     """Save model to a pickle located at `path`"""
     with tempfile.TemporaryDirectory() as td:
         U.save_state(os.path.join(td, "model"))
         arc_name = os.path.join(td, "packed.zip")
         with zipfile.ZipFile(arc_name, 'w') as zipf:
             for root, dirs, files in os.walk(td):
                 for fname in files:
                     file_path = os.path.join(root, fname)
                     if file_path != arc_name:
                         zipf.write(file_path, os.path.relpath(file_path, td))
         with open(arc_name, "rb") as f:
             model_data = f.read()
     with open(path, "wb") as f:
         dill.dump((model_data, self._act_params), f)
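
A hedged sketch of the matching load step is shown below. It mirrors the structure of ActWrapper.load in baselines, but treat the helper name and exact behaviour as an assumption rather than the library's verbatim API.

# Sketch: unpack the pickle written by save() above and restore the TF state.
import os
import tempfile
import zipfile

import dill
import baselines.common.tf_util as U

def load_model(path):
    """Inverse of save(): returns the act_params stored alongside the weights."""
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    with tempfile.TemporaryDirectory() as td:
        arc_path = os.path.join(td, "packed.zip")
        with open(arc_path, "wb") as f:
            f.write(model_data)
        with zipfile.ZipFile(arc_path, 'r') as zipf:
            zipf.extractall(td)
        U.load_state(os.path.join(td, "model"))
    return act_params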
Example #3
def maybe_save_model(savedir, container, state):
    """This function checkpoints the model and state of the training algorithm."""
    if savedir is None:
        return
    start_time = time.time()
    model_dir = "model-{}".format(state["num_iters"])
    U.save_state(os.path.join(savedir, model_dir, "saved"))
    if container is not None:
        container.put(os.path.join(savedir, model_dir), model_dir)
    relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
    if container is not None:
        container.put(os.path.join(savedir, 'training_state.pkl.zip'), 'training_state.pkl.zip')
    relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl'))
    if container is not None:
        container.put(os.path.join(savedir, 'monitor_state.pkl'), 'monitor_state.pkl')
    logger.log("Saved model in {} seconds\n".format(time.time() - start_time))
Example #4
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None,
          verbose=False):

    val_per_iter = int(max_iters/10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac-pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
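
The behavior-cloning learn above returns the checkpoint prefix it passed to U.save_state. A minimal restore sketch, assuming the same policy_func used for training and that U.load_state restores the full default graph, could be:

import baselines.common.tf_util as U

def restore_bc_policy(env, policy_func, savedir_fname):
    """Rebuild the "pi" policy graph and reload the weights learn() saved."""
    pi = policy_func("pi", env.observation_space, env.action_space)
    U.initialize()
    U.load_state(savedir_fname)
    return pi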
Example #5
 def save(self, save_path):
     tf_util.save_state(save_path, sess=self.sess)
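
A matching load method would plausibly be the one-liner below, assuming tf_util.load_state accepts the same sess keyword that save_state does here.

 def load(self, load_path):
     """Restore model variables into this object's session."""
     tf_util.load_state(load_path, sess=self.sess)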
Example #6
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=1000,
          learning_starts=50,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          job_id=None,
          outdir="/tmp/rosrl/experiments/discrete/deepq/"):
    """Train a deepqn model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    action_no: int
        number of actions available in action space
    actions_discr: Box space
        Discretized actions
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """

    # Create all the functions necessary to train the model
    # sess = tf.Session()
    # sess.__enter__()
    if job_id is not None:
        #Directory for log and Tensorboard data
        outdir = '/tmp/rosrl/' + str(
            env.__class__.__name__) + '/deepq/' + 'sim_' + job_id
    else:
        outdir = '/tmp/rosrl/' + str(env.__class__.__name__) + '/deepq/'

    #TODO This should not go here. Instead pass both action_no and actions as arguments to learn function
    #Discrete actions
    goal_average_steps = 2
    max_number_of_steps = 20
    last_time_steps = np.ndarray(0)
    n_bins = 10
    epsilon_decay_rate = 0.99
    it = 1

    # The state space is huge, so to simplify the problem we typically
    # discretize it into n_bins ** number_of_features buckets
    joint1_bins = pandas.cut([-np.pi / 2, np.pi / 2],
                             bins=n_bins,
                             retbins=True)[1][1:-1]
    joint2_bins = pandas.cut([-np.pi / 2, np.pi / 2],
                             bins=n_bins,
                             retbins=True)[1][1:-1]
    joint3_bins = pandas.cut([-np.pi / 2, np.pi / 2],
                             bins=n_bins,
                             retbins=True)[1][1:-1]
    action_bins = pandas.cut([-np.pi / 2, np.pi / 2],
                             bins=n_bins,
                             retbins=True)[1][1:-1]

    difference_bins = abs(joint1_bins[0] - joint1_bins[1])
    actions_discr = [(difference_bins, 0.0, 0.0), (-difference_bins, 0.0, 0.0),
                     (0.0, difference_bins, 0.0), (0.0, -difference_bins, 0.0),
                     (0.0, 0.0, difference_bins), (0.0, 0.0, -difference_bins),
                     (0.0, 0.0, 0.0)]
    action_no = 7
    actions = [0, 1, 2, 3, 4, 5, 6]

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    # with tf.Session(config=tf.ConfigProto()) as session:
    def make_obs_ph(name):
        return U.BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=action_no,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        #'num_actions': env.action_space.n,
        'num_actions': action_no,
    }

    act = ActWrapper(act, act_params)

    # TODO: include also de Prioritized buffer
    # # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
        # Create the schedule for exploration starting from 1.
        exploration = LinearSchedule(schedule_timesteps=int(
            exploration_fraction * max_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        saved_mean_reward = None
        obs = env.reset()
        reset = True
        with tempfile.TemporaryDirectory() as td:
            # Log training stuff using tf primitives
            summary_writer = tf.summary.FileWriter(
                outdir, graph=tf.get_default_graph())
            # render the environment to visualize the progress
            env.render()
            sim_r = 0
            sim_t = 0
            done_quant = 0
            model_saved = False
            model_file = os.path.join(td, "model")
            for e in range(150):  # run 150 episodes
                print("Episode: ", e)
                # reset the environment
                obs = env.reset()
                print("observation: ", obs[:3])
                episode_rewards = [0.0]

                for t in range(max_timesteps):
                    if callback is not None:
                        if callback(locals(), globals()):
                            break
                    # Take action and update exploration to the newest value
                    kwargs = {}

                    ## TODO: review in more detail
                    if not param_noise:
                        update_eps = exploration.value(t)
                        update_param_noise_threshold = 0.
                    else:
                        update_eps = 0.
                        # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                        # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                        # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                        # for detailed explanation.
                        update_param_noise_threshold = -np.log(
                            1. - exploration.value(t) +
                            exploration.value(t) / float(env.action_space.n))
                        kwargs['reset'] = reset
                        kwargs[
                            'update_param_noise_threshold'] = update_param_noise_threshold
                        kwargs['update_param_noise_scale'] = True
                    action = act(np.array(obs)[None],
                                 update_eps=update_eps,
                                 **kwargs)[0]
                    if isinstance(env.action_space, gym.spaces.MultiBinary):
                        env_action = np.zeros(env.action_space.n)
                        env_action[action] = 1
                    else:
                        env_action = action

                    reset = False
                    new_obs, rew, done, _ = step(env, actions_discr[action],
                                                 obs[:3])
                    # Store transition in the replay buffer.
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs
                    episode_rewards[-1] += rew

                    # RK: removed this, too many prints
                    # print("reward: ", rew)
                    # Log the episode reward
                    #summary = tf.Summary(value=[tf.Summary.Value(tag="Episode reward", simple_value = episode_rewards[-1]/(t + 1))])
                    #summary_writer.add_summary(summary, t+ e*max_timesteps)
                    # print("average episode reward: ", episode_rewards[-1]/(t + 1))
                    sim_r += rew
                    sim_t += 1

                    if done:
                        # summary = tf.Summary(value=[tf.Summary.Value(tag="Mean episode reward", simple_value = episode_rewards[-1]/(t + 1))])
                        # summary_writer.add_summary(summary, t)
                        done_quant += 1
                        print("Done!")
                        obs = env.reset()
                        episode_rewards.append(0.0)
                        reset = True

                    if t + e * max_timesteps > learning_starts and t % train_freq == 0:
                        # TODO review if prioritized_replay is needed
                        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                        if prioritized_replay:
                            experience = replay_buffer.sample(
                                batch_size, beta=beta_schedule.value(t))
                            (obses_t, actions, rewards, obses_tp1, dones,
                             weights, batch_idxes) = experience
                        else:
                            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                                batch_size)
                            weights, batch_idxes = np.ones_like(rewards), None

                        #td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                        #[td_errors, weighted_error] = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                        [
                            td_error, weighted_error, q_t_selected_target,
                            rew_t_ph
                        ] = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)

                        #logger.log("Evaluating losses...")
                        #logger.log("q_t_selected_target", q_t_selected_target)
                        #logger.log("Episode reward", episode_rewards[-1])

                        # TODO review if prioritized_replay is needed
                        if prioritized_replay:
                            new_priorities = np.abs(
                                td_error) + prioritized_replay_eps
                            replay_buffer.update_priorities(
                                batch_idxes, new_priorities)

                    if t + e * max_timesteps > learning_starts and t % target_network_update_freq == 0:
                        # Update target network periodically.
                        update_target()

                    mean_100ep_reward = round(np.mean(episode_rewards[-6:-1]),
                                              1)
                    #print("SIMPLE ROBOTICS -> Episode rewards",episode_rewards)
                    #print("SIMPLE ROBOTICS -> np.mean(Episode rewards)", len(episode_rewards))
                    #print("SIMPLE ROBOTICS -> mean_100ep_reward", mean_100ep_reward)
                    #print("line 383 -> SIMULATION_REWARD", sim_r / 5 * max_timesteps)

                    num_episodes = len(episode_rewards)

                    if done and print_freq is not None and len(
                            episode_rewards) % print_freq == 0:
                        logger.record_tabular("steps", t)
                        logger.record_tabular("episodes", num_episodes)
                        logger.record_tabular("mean 100 episode reward",
                                              mean_100ep_reward)
                        logger.record_tabular("% time spent exploring",
                                              int(100 * exploration.value(t)))
                        logger.dump_tabular()

                        print("steps", t)
                        print("episodes", num_episodes)
                        print("mean 100 episode reward", mean_100ep_reward)
                        print("% time spent exploring",
                              int(100 * exploration.value(t)))

                    if (checkpoint_freq is not None
                            and t + e * max_timesteps > learning_starts
                            and num_episodes > 100
                            and t % checkpoint_freq == 0):
                        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                            if print_freq is not None:
                                logger.log(
                                    "Saving model due to mean reward increase: {} -> {}"
                                    .format(saved_mean_reward,
                                            mean_100ep_reward))
                            U.save_state(model_file)
                            model_saved = True
                            saved_mean_reward = mean_100ep_reward
            if model_saved:
                if print_freq is not None:
                    logger.log("Restored model with mean reward: {}".format(
                        saved_mean_reward))
                U.load_state(model_file)

    opt_r = 1 - (sim_r / sim_t)
    # Log training stuff using tf primitives
    summary_writer = tf.summary.FileWriter(outdir + '/error/',
                                           graph=tf.get_default_graph())
    summary = tf.Summary(
        value=[tf.Summary.Value(tag="Simulation error", simple_value=opt_r)])
    summary_writer.add_summary(summary, job_id)
    summary_writer.flush()
    summary_writer_done = tf.summary.FileWriter(outdir + '/done/',
                                                graph=tf.get_default_graph())

    summary_done = tf.Summary(
        value=[tf.Summary.Value(tag="No. dones", simple_value=done_quant)])
    summary_writer_done.add_summary(summary_done, job_id)
    summary_writer_done.flush()
    print("OPT_r", opt_r)
    print("No. of times it converges: ", done_quant)
    # act_tmp = act
    # session.close()
    # tf.reset_default_graph()
    return act, opt_r
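
As a standalone illustration of the discretization comment above ("discretize it into n_bins ** number_of_features buckets"): pandas.cut with retbins=True returns the bin edges, and slicing off the outermost two leaves the interior cut points used to bucket each joint angle. The snippet below is self-contained and only reproduces that step.

import numpy as np
import pandas

n_bins = 10
# 11 edges come back for 10 bins; [1:-1] keeps the 9 interior cut points.
joint_bins = pandas.cut([-np.pi / 2, np.pi / 2], bins=n_bins, retbins=True)[1][1:-1]
print(len(joint_bins))  # 9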
Example #7
lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
ext_rewbuffer = deque(maxlen=100) # rolling buffer for extrinsic episode rewards
int_rewbuffer = deque(maxlen=100) # rolling buffer for intrinsic episode rewards

distbuffer = deque(maxlen=100)
tstart = time.time()
writer = U.FileWriter(tensorboard_dir)
loss_stats = stats(["pol_surr", "pol_entpen", "vf_ext_loss", "vf_int_loss", "kl", "ent", "aux_loss"])
ep_stats = stats(["Reward_Ext", "Reward_Int", "Episode_Length", "Episode_This_Iter", "Distance"])

while timesteps_so_far < args.max_timesteps:
    # Save model
    if iters_so_far % args.save_per_iter == 0 and iters_so_far > 0 and ckpt_dir is not None:
        U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

    logger.log2("********** Iteration %i ************"%iters_so_far)

    seg = seg_gen.next()
    losses = policy.train(seg, args.optim_batchsize, args.optim_epochs)

    lrlocal = (seg["ep_lens"], seg["ep_rets_ext"], seg["ep_rets_int"], seg["ep_dists"]) # local values
    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
    lens, rews_ext, rews_int, dists = map(flatten_lists, zip(*listoflrpairs))

    lenbuffer.extend(lens)
    ext_rewbuffer.extend(rews_ext)
    int_rewbuffer.extend(rews_int)
    #rewbuffer.extend(list(np.array(rews_ext) + np.array(rews_int)))
    distbuffer.extend(dists)
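
The loop above relies on a flatten_lists helper to merge the per-worker lists gathered over MPI; a minimal stand-in, matching the helper of that name in baselines.common.misc_util, is:

def flatten_lists(listoflists):
    """Concatenate a list of lists into one flat list."""
    return [el for list_ in listoflists for el in list_]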
Example #8
def main():

    start_time = datetime.datetime.now().strftime("%Y%m%d%H%M")

    env = StarCraft2Env(map_name="8m",
                        reward_only_positive=False,
                        reward_scale_rate=200,
                        state_last_action=True,
                        obs_last_action=True,
                        obs_timestep_number=True,
                        state_timestep_number=True)  #reward_defeat=-200
    env_info = env.get_env_info()

    n_episodes = 2500  #4000    #2000
    timesteps = 500000
    n_agents = env_info["n_agents"]
    n_actions = env_info["n_actions"]
    output_len = n_actions
    lr = 0.002
    buffer_size = 70000  # int(timesteps * 0.1)  # keep it around 1/10 of the training steps; tried 200 (test), 20000, 70000, 80000
    batch_size = 32  # 32
    gamma = 0.99
    num_agents = 8
    local_obs_len = 179  # local obs:80 ; global state:168;
    global_state_len = 348  # 179+169

    hidden_vector_len = 256  # 128  # 1  256
    tau = 0.001
    num_exploring = buffer_size  # buffer_size
    action_low = -1
    action_high = 1
    save_freq = 10000
    critic_output_len = 1

    logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr,
                                            start_time)

    Logger.DEFAULT \
        = Logger.CURRENT \
        = Logger(dir=None,
                 output_formats=[TensorBoardOutputFormat(logdir)])

    sess = U.make_session()
    sess.__enter__()

    actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, local_obs_len,
                         output_len, hidden_vector_len)
    critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(),
                           num_agents, global_state_len, critic_output_len,
                           hidden_vector_len, n_actions)
    sess.run(tf.global_variables_initializer())
    replay_buffer = ReplayBuffer(buffer_size)
    action_noise = OU_noise(decay_period=timesteps - buffer_size)

    action_noise.reset()
    # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()

    t = 0
    step_train = 0
    for e in range(n_episodes):
        env.reset()
        terminated = False
        episode_reward = 0
        local_obs = env.get_obs()
        local_obs = np.array(local_obs)
        global_state = env.get_state()
        global_state_expand = np.zeros(
            [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]])
        reward_hl_own_old = []
        reward_hl_en_old = []
        episode_reward_agent = [0 for n in range(n_agents)]
        for i in range(local_obs.shape[0]):
            global_state_expand[i] = np.append(local_obs[i],
                                               global_state.flatten())
            reward_hl_own_old.append(env.get_agent_health(i))
            reward_hl_en_old.append(env.get_enemy_health(i))

        while not terminated:
            t = t + 1
            critic_input = np.expand_dims(global_state_expand, axis=0)
            actor_input = np.expand_dims(local_obs, axis=0)
            action = actor.predict(actor_input)[0]
            act_with_noise = action  #np.clip(action + action_noise.get_noise(step_train), action_low, action_high)
            act_mat_norm = (act_with_noise + 1) / 2
            actions = []
            dead_unit = []
            rew_expand = np.zeros((n_agents, 1))

            for agent_id in range(n_agents):
                sum_avail_act = 0
                act_prob = []
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                act_unit_norm = act_mat_norm[agent_id]

                for i in avail_actions_ind:
                    act_prob.append(act_unit_norm[i])
                    sum_avail_act = sum_avail_act + act_unit_norm[i]

                if (sum_avail_act == 0):
                    act_prob = (np.array(act_prob) + 1) / len(act_prob)
                else:
                    act_prob = np.array(act_prob) / sum_avail_act

                index = np.random.choice(np.array(avail_actions_ind),
                                         p=act_prob.ravel())
                actions.append(index)

                if (len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0):
                    dead_unit.append(agent_id)

            reward_base, terminated, info = env.step(actions)

            new_local_obs = env.get_obs()
            new_local_obs = np.array(new_local_obs)
            new_global_state = env.get_state()
            new_global_state_expand = np.zeros([
                new_local_obs.shape[0],
                new_local_obs.shape[1] + new_global_state.shape[0]
            ])
            reward_hl_own_new = []
            reward_hl_en_new = []
            for i in range(new_local_obs.shape[0]):
                new_global_state_expand[i] = np.append(
                    new_local_obs[i], new_global_state.flatten())
                reward_hl_own_new.append(env.get_agent_health(i))
                reward_hl_en_new.append(env.get_enemy_health(i))

            for i in range(n_agents):
                if (i in dead_unit):
                    rew_expand[i] = 0
                else:
                    rew_expand[i] = -0.05
                    if (actions[i] > 5):
                        target_id = actions[i] - 6
                        health_reduce_en = reward_hl_en_old[
                            target_id] - reward_hl_en_new[target_id]
                        if (health_reduce_en > 0):
                            rew_expand[i] += 2 + health_reduce_en * 5
                            # if (reward_base > 50):
                            #     rew_expand[i] += 20
                        else:
                            rew_expand[i] += 1
                    else:
                        rew_expand[i] += (reward_hl_own_new[i] -
                                          reward_hl_own_old[i]) * 5
                #
                if (terminated):
                    if (info["battle_won"] is False):
                        rew_expand[i] += -10
                    else:
                        rew_expand[i] += 10

                episode_reward_agent[i] += rew_expand[i]

            replay_buffer.add(local_obs, global_state_expand, act_with_noise,
                              rew_expand, terminated, new_local_obs,
                              new_global_state_expand)

            episode_reward += reward_base
            local_obs = new_local_obs
            global_state_expand = new_global_state_expand
            if (t == num_exploring):
                print("training starts")
            if (t >= num_exploring):
                local_s_batch, global_s_batch, a_batch, r_batch, done_batch, local_s2_batch, global_s2_batch = replay_buffer.sample_batch(
                    batch_size
                )  # [group0:[batch_size, trace.dimension], group1, ... group8]
                target_q = r_batch + gamma * critic.predict_target(
                    global_s2_batch, actor.predict_target(local_s2_batch))
                predicted_q_value, _ = critic.train(
                    global_s_batch, a_batch,
                    np.reshape(target_q,
                               (batch_size, num_agents, critic_output_len)))
                a_outs = actor.predict(local_s_batch)  # a_outs and a_batch are identical
                grads = critic.action_gradients(global_s_batch,
                                                a_outs)  # gradient of Q with respect to the actions
                actor.train(local_s_batch, grads)
                step_train = step_train + 1

                actor.update_target_network()
                critic.update_target_network()

                if (t % save_freq == 0):
                    model_file_save = os.path.join(
                        "model/" + str(step_train) + "_" +
                        "training_steps_model/", "8m")
                    U.save_state(model_file_save)
                    print("Model have been trained for %s times" %
                          (step_train))
                    # replay_buffer.save()

        print("steps until now : %s, episode: %s, episode reward: %s" %
              (t, e, episode_reward))
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", e)
        logger.record_tabular("reward_episode", episode_reward)
        for i in range(n_agents):
            logger.record_tabular("reward_agent_" + str(i),
                                  episode_reward_agent[i])

        logger.dump_tabular()

    # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.save_state(model_file_save)

    env.close()
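
Restoring one of the periodic checkpoints for evaluation would mirror the commented-out load near the top of main; the sketch below assumes U here is baselines.common.tf_util and that its load_state accepts the session argument used in that commented line.

import os
import baselines.common.tf_util as U

def restore_checkpoint(sess, step_train):
    # Point U.load_state at the same prefix that U.save_state wrote above.
    model_file_load = os.path.join(
        "model/" + str(step_train) + "_" + "training_steps_model/", "8m")
    U.load_state(model_file_load, sess)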
Example #9
def learn(
    env,
    var_func,
    cvar_func,
    nb_atoms,
    run_alpha=None,
    lr=5e-4,
    max_timesteps=100000,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    train_freq=1,
    batch_size=32,
    print_freq=1,
    checkpoint_freq=10000,
    learning_starts=1000,
    gamma=0.95,
    target_network_update_freq=500,
    num_cpu=4,
    callback=None,
    periodic_save_freq=1000000,
    periodic_save_path=None,
    grad_norm_clip=None,
):
    """Train a CVaR DQN model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    var_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    cvar_func: function
        same as var_func
    nb_atoms: int
        number of atoms used in CVaR discretization
    run_alpha: float
        optimize CVaR_alpha while running. None if you want random alpha each episode.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the best model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    periodic_save_freq: int
        how often to save a periodic checkpoint of the model
    periodic_save_path: str
        path prefix under which periodic checkpoints are saved
    grad_norm_clip: float
        Clip gradient to this value. No clipping if None
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/distdeepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = make_session(num_cpu=num_cpu)
    sess.__enter__()

    obs_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return U.BatchInput(obs_space_shape, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        var_func=var_func,
        cvar_func=cvar_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        nb_atoms=nb_atoms,
        grad_norm_clipping=grad_norm_clip)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'cvar_func': cvar_func,
        'var_func': var_func,
        'num_actions': env.action_space.n,
        'nb_atoms': nb_atoms
    }

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    episode = 0
    alpha = 1.

    # --------------------------------- RUN ---------------------------------
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    print('Target reached')
                    model_saved = False
                    break
            # Take action and update exploration to the newest value
            update_eps = exploration.value(t)

            update_param_noise_threshold = 0.

            action = act(np.array(obs)[None], alpha, update_eps=update_eps)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)

            # ===== DEBUG =====

            # s = np.ones_like(np.array(obs)[None])
            # a = np.ones_like(act(np.array(obs)[None], run_alpha, update_eps=update_eps))
            # r = np.array([0])
            # s_ = np.ones_like(np.array(obs)[None])
            # d = np.array([False])
            # s = obs[None]
            # a = np.array([action])
            # r = np.array([rew])
            # s_ = new_obs[None]
            # d = np.array([done])
            # if t % 100 == 0:
            #     for f in debug:
            #         print(f(s, a, r, s_, d))
            #     print('-------------')
            #
            #     # print([sess.run(v) for v in tf.global_variables('cvar_dqn/cvar_func')])
            #     # print([sess.run(v) for v in tf.global_variables('cvar_dqn/var_func')])

            # =================

            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True
                if run_alpha is None:
                    alpha = np.random.random()

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.

                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

                errors = train(obses_t, actions, rewards, obses_tp1, dones,
                               weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Log results and periodically save the model
            mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])),
                                      1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.record_tabular("(current alpha)", "%.2f" % alpha)
                logger.dump_tabular()

            # save and report best model
            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

            # save periodically
            if periodic_save_freq is not None and periodic_save_path is not None and t > learning_starts:
                if t % periodic_save_freq == 0:
                    ActWrapper(act, act_params).save("{}-{}.pkl".format(
                        periodic_save_path, int(t / periodic_save_freq)))

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
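
Typical usage would pair this learn with the pickling save from Example #2 via the returned ActWrapper. The call below is a hypothetical sketch: my_var_mlp, my_cvar_mlp, and the CartPole environment are placeholders, not names from this repository, and it assumes the ActWrapper here exposes the save() shown in Example #2.

import gym

def run_cvar_dqn(my_var_mlp, my_cvar_mlp):
    env = gym.make("CartPole-v1")          # stand-in environment
    act = learn(env, var_func=my_var_mlp, cvar_func=my_cvar_mlp,
                nb_atoms=10, max_timesteps=20000)
    act.save("cvar_dqn_cartpole.pkl")      # uses ActWrapper.save (cf. Example #2)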
Example #10
def learn(
        env,
        p_dist_func,
        lr=2.5e-4,
        eps=0.0003125,
        max_timesteps=100000,
        buffer_size=50000,
        exp_t1=1e6,
        exp_p1=0.1,
        exp_t2=25e6,
        exp_p2=0.01,
        # exploration_fraction=0.1,
        # exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=1,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=0.95,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        num_cpu=16,
        param_noise=False,
        callback=None,
        dist_params=None):
    """Train a distdeepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    p_dist_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exp_t1: int
        timestep at which the exploration rate is annealed down to `exp_p1`
    exp_p1: float
        exploration rate reached at timestep `exp_t1`
    exp_t2: int
        timestep at which the exploration rate reaches its final value `exp_p2`
    exp_p2: float
        final value of random action probability (held constant after `exp_t2`)
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/distdeepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = make_session(num_cpu=num_cpu)
    sess.__enter__()

    #logger.configure()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    if dist_params is None:
        raise ValueError('dist_params is required')

    # z, dz = build_z(**dist_params)

    act, train, update_target, debug = distdeepq.build_train(
        make_obs_ph=make_obs_ph,
        p_dist_func=p_dist_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr, epsilon=eps),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise,
        dist_params=dist_params)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'p_dist_func': p_dist_func,
        'num_actions': env.action_space.n,
        'dist_params': dist_params
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    #exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
    #                             initial_p=1.0,
    #                             final_p=exploration_final_eps)
    #exploration = PiecewiseSchedule([(0, 1.0),(max_timesteps/25, 0.1),
    #                                  (max_timesteps, 0.01)], outside_value=0.01)
    exploration = PiecewiseSchedule([(0, 1.0), (exp_t1, exp_p1),
                                     (exp_t2, exp_p2)],
                                    outside_value=exp_p2)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)

            # rew = rew-1 for proposed loss with new metric
            # rew = rew-1
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, np.sign(rew), new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                errors = train(obses_t, actions, rewards, obses_tp1, dones,
                               weights)

                if prioritized_replay:
                    new_priorities = np.abs(errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                # debug['pi'] = tf.Print(debug['pi'], [debug['pi'], "target pi"])
                # tf.Print(debug['mu'], [debug['mu'], "target mu"])
                # tf.Print(debug['sigma'], [debug['sigma'], "target sigma"])
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
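
The exploration schedule in this example is piecewise linear between (0, 1.0), (exp_t1, exp_p1), and (exp_t2, exp_p2). A quick standalone check with the default values, assuming the PiecewiseSchedule imported here is the one from baselines.common.schedules:

from baselines.common.schedules import PiecewiseSchedule

exploration = PiecewiseSchedule([(0, 1.0), (1e6, 0.1), (25e6, 0.01)],
                                outside_value=0.01)
print(exploration.value(0))        # 1.0
print(exploration.value(500000))   # 0.55: halfway between 1.0 and 0.1
print(exploration.value(30e6))     # 0.01: past the last endpoint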
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_model,  # 'True' means load the model, 'False' means build a new model
        model_path):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    # Load neural net variables from file or Initialize
    if load_model:
        print("Loading model...")
        model_file = tf.train.get_checkpoint_state(model_path)
        U.load_state(model_file.model_checkpoint_path)
    else:
        U.initialize()

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], seg["adv"]
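        # subsample every 5th transition so the Fisher-vector products used by CG are cheaper to evaluate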
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
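            # scale the conjugate-gradient direction so the quadratic KL estimate 0.5 * step^T F step equals max_kl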
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
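            # backtracking line search: halve the step (at most 10 times) until the surrogate improves and KL stays below 1.5 * max_kl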
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

        # Save the model
        if iters_so_far % 5 == 0:
            U.save_state(model_path + '/model-' + str(episodes_so_far) +
                         '.cptk')
            print("Model saved")
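The save/restore pattern used above can be isolated into two small helpers. This is a minimal sketch, assuming an older baselines release where baselines.common.tf_util still exposes save_state/load_state (as in the examples here) and a writable model_dir of your choosing:

import os
import tensorflow as tf
from baselines.common import tf_util as U

def maybe_restore(model_dir):
    # mirror of the load branch above: restore the newest checkpoint in model_dir, if any
    ckpt = tf.train.get_checkpoint_state(model_dir)
    if ckpt is not None:
        U.load_state(ckpt.model_checkpoint_path)
        return True
    return False

def checkpoint(model_dir, tag):
    # mirror of the save branch above: write the current session variables to disk
    U.save_state(os.path.join(model_dir, 'model-' + str(tag) + '.cptk'))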
Example #12
def train(model_file, game="CartPole-v1"):
    """Train at a game."""
    with tf_util.make_session(8):
        env = gym.make(game)

        def make_placeholder(name):
            """Make a placeholder input."""
            return tf_util.BatchInput(env.observation_space.shape, name=name)

        act_params = {
            'make_obs_ph': make_placeholder,
            'q_func': model,
            'num_actions': env.action_space.n
        }
        act, train, update_target, debug = deepq.build_train(
            **act_params,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
        )
        act = ActWrapper(act, act_params)

        replay_buffer = ReplayBuffer(50000)

        exploration = LinearSchedule(
            schedule_timesteps=100000,
            initial_p=1.0,
            final_p=0.02
        )

        tf_util.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)
            if not len(episode_rewards) % 100:
                env.render()

            if t > 1000:
                obses_t, actions, rewards, obses_tp1, dones = (
                    replay_buffer.sample(32)
                )
                train(
                    obses_t, actions, rewards, obses_tp1, dones,
                    np.ones_like(rewards)
                )
            if not t % 1000:
                update_target()
            if not t % 3000:
                if model_file:
                    tf_util.save_state(model_file)
                yield act

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular(
                    "mean episode reward",
                    round(np.mean(episode_rewards[-101:-1]), 1)
                )
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * exploration.value(t))
                )
                logger.dump_tabular()
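Because train() above is a generator (it yields the current act function every 3000 steps, right after checkpointing), a caller must iterate over it to drive training. A minimal sketch, where the model-file path is only illustrative and the loop body stands in for whatever evaluation you want to run on each snapshot:

for snapshot_act in train("/tmp/cartpole_model", game="CartPole-v1"):
    # snapshot_act is the latest policy; evaluate or deploy it here
    pass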
Example #13
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it equals total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            U.load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            U.load_state(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
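                # e.g. with exploration.value(t) = 1.0 and env.action_space.n = 2 this gives
                # -log(1 - 1.0 + 1.0 / 2) = -log(0.5) ~= 0.693 as the perturbation threshold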
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return act
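A minimal sketch of calling the learn() above on a Gym task; the environment id, timestep budget, and print frequency are only illustrative, and 'mlp' is one of the registered network names mentioned in the docstring:

import gym

env = gym.make("CartPole-v1")
act = learn(env, network='mlp', lr=5e-4, total_timesteps=100000, print_freq=10)
env.close()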
Example #14
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=1000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from the replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None, it equals max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If callback returns true training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    num_episodes = 0
    saved_mean_reward = None

    path_memory = np.zeros((64, 64))
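    # path_memory accumulates -1 marks over screen cells the marines have already swept (see the action handling below) and is added onto the observed screen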

    obs = env.reset()

    # Select all marines first
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative + path_memory

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    obs = env.step(
        actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    for i in range(len(player_x)):
        xy = [player_x[i], player_y[i]]
        obs = env.step(
            actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])])

        group_id = 0
        group_list = []
        unit_xy_list = []
        for i in range(len(player_x)):
            if i % 4 != 0:
                continue

            if group_id > 2:
                break

            xy = [player_x[i], player_y[i]]
            unit_xy_list.append(xy)

            if (len(unit_xy_list) >= 1):
                for idx, xy in enumerate(unit_xy_list):
                    if (idx == 0):
                        obs = env.step(actions=[
                            sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])
                        ])
                    else:
                        obs = env.step(actions=[
                            sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy])
                        ])

                obs = env.step(actions=[
                    sc2_actions.FunctionCall(
                        _SELECT_CONTROL_GROUP,
                        [[_CONTROL_GROUP_SET], [group_id]])
                ])
                unit_xy_list = []

                group_list.append(group_id)
                group_id += 1

        if (len(unit_xy_list) >= 1):
            for idx, xy in enumerate(unit_xy_list):
                if (idx == 0):
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])
                    ])
                else:
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(_SELECT_POINT, [[1], xy])
                    ])

            obs = env.step(actions=[
                sc2_actions.FunctionCall(_SELECT_CONTROL_GROUP,
                                         [[_CONTROL_GROUP_SET], [group_id]])
            ])

            group_list.append(group_id)
            group_id += 1

            return obs

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if t % 1000 == 0:
                ActWrapper(act).save("mineral_shards.pkl")
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(screen)[None],
                         update_eps=update_eps,
                         **kwargs)[0]
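            # action indexes one of four 16-pixel moves on the 64x64 screen: 0 = UP, 1 = DOWN, 2 = LEFT, 3 = RIGHT (handled below)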
            reset = False
            rew = 0

            #select marines
            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            screen = player_relative + path_memory
            player = []

            while (len(group_list) > 0):
                group_id = np.random.choice(group_list)
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(
                        _SELECT_CONTROL_GROUP,
                        [[_CONTROL_GROUP_RECALL], [group_id]])
                ])

                selected = obs[0].observation["screen"][_SELECTED]
                player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
                if (len(player_y) > 0):
                    player = [int(player_x.mean()), int(player_y.mean())]
                    break
                else:
                    group_list.remove(group_id)

            if (len(player) == 2):

                if (player[0] > 32):
                    screen = shift(LEFT, player[0] - 32, screen)
                elif (player[0] < 32):
                    screen = shift(RIGHT, 32 - player[0], screen)

                if (player[1] > 32):
                    screen = shift(UP, player[1] - 32, screen)
                elif (player[1] < 32):
                    screen = shift(DOWN, 32 - player[1], screen)

            coord = [player[0], player[1]]

            path_memory_ = np.array(path_memory, copy=True)

            if (action == 0):  #UP

                if (player[1] >= 16):
                    coord = [player[0], player[1] - 16]
                    path_memory_[player[1] - 16:player[1], player[0]] = -1
                elif (player[1] > 0):
                    coord = [player[0], 0]
                    path_memory_[0:player[1], player[0]] = -1
                    #else:
                    #  rew -= 1

            elif (action == 1):  #DOWN

                if (player[1] <= 47):
                    coord = [player[0], player[1] + 16]
                    path_memory_[player[1]:player[1] + 16, player[0]] = -1
                elif (player[1] > 47):
                    coord = [player[0], 63]
                    path_memory_[player[1]:63, player[0]] = -1
                    #else:
                    #  rew -= 1

            elif (action == 2):  #LEFT

                if (player[0] >= 16):
                    coord = [player[0] - 16, player[1]]
                    path_memory_[player[1], player[0] - 16:player[0]] = -1
                elif (player[0] < 16):
                    coord = [0, player[1]]
                    path_memory_[player[1], 0:player[0]] = -1
                    #else:
                    #  rew -= 1

            elif (action == 3):  #RIGHT

                if (player[0] <= 47):
                    coord = [player[0] + 16, player[1]]
                    path_memory_[player[1], player[0]:player[0] + 16] = -1
                elif (player[0] > 47):
                    coord = [63, player[1]]
                    path_memory_[player[1], player[0]:63] = -1

            path_memory = np.array(path_memory_)

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                for i in range(len(player_x)):
                    xy = [player_x[i], player_y[i]]
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])
                    ])
                    #obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

            new_action = [
                sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
            ]

            # else:
            #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative + path_memory

            selected = obs[0].observation["screen"][_SELECTED]
            player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

            rew = obs[0].reward

            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            screen = new_screen

            episode_rewards[-1] += rew
            #episode_minerals[-1] += obs[0].reward

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]

                screen = player_relative + path_memory

                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                if (player[0] > 32):
                    screen = shift(LEFT, player[0] - 32, screen)
                elif (player[0] < 32):
                    screen = shift(RIGHT, 32 - player[0], screen)

                if (player[1] > 32):
                    screen = shift(UP, player[1] - 32, screen)
                elif (player[1] < 32):
                    screen = shift(DOWN, 32 - player[1], screen)

                # Select all marines first
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])

                for i in range(len(player_x)):
                    xy = [player_x[i], player_y[i]]
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(_SELECT_POINT, [[0], xy])
                    ])

                    group_id = 0
                    group_list = []
                    unit_xy_list = []
                    for i in range(len(player_x)):
                        if i % 4 != 0:
                            continue

                        if group_id > 2:
                            break

                        xy = [player_x[i], player_y[i]]
                        unit_xy_list.append(xy)

                        if (len(unit_xy_list) >= 1):
                            for idx, xy in enumerate(unit_xy_list):
                                if (idx == 0):
                                    obs = env.step(actions=[
                                        sc2_actions.FunctionCall(
                                            _SELECT_POINT, [[0], xy])
                                    ])
                                else:
                                    obs = env.step(actions=[
                                        sc2_actions.FunctionCall(
                                            _SELECT_POINT, [[1], xy])
                                    ])

                            obs = env.step(actions=[
                                sc2_actions.FunctionCall(
                                    _SELECT_CONTROL_GROUP,
                                    [[_CONTROL_GROUP_SET], [group_id]])
                            ])
                            unit_xy_list = []

                            group_list.append(group_id)
                            group_id += 1

                    if (len(unit_xy_list) >= 1):
                        for idx, xy in enumerate(unit_xy_list):
                            if (idx == 0):
                                obs = env.step(actions=[
                                    sc2_actions.FunctionCall(
                                        _SELECT_POINT, [[0], xy])
                                ])
                            else:
                                obs = env.step(actions=[
                                    sc2_actions.FunctionCall(
                                        _SELECT_POINT, [[1], xy])
                                ])

                        obs = env.step(actions=[
                            sc2_actions.FunctionCall(
                                _SELECT_CONTROL_GROUP,
                                [[_CONTROL_GROUP_SET], [group_id]])
                        ])

                        group_list.append(group_id)
                        group_id += 1

                    episode_rewards.append(0.0)
                    reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            #mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                #logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)
Example #15
def learn(
    env,
    policy_fn,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    restore_model_from_file=None,
    save_model_with_prefix,  # prefix for the names of the saved model files. Usually this encodes the target task,
    # for example 3dof_ppo1_H, so that only the networks we want to execute on the real robot
    # need to be selected and sent, rather than all files or the whole folder.
    # The model file name should be self-explanatory.
    job_id=None,  # identifies the Spearmint iteration number; usually set by the Spearmint iterator
    outdir="/tmp/rosrl/experiments/continuous/ppo1/"):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()
    """
    Here we add the possibility of resuming from a previously saved model if a model file is provided
    """
    if restore_model_from_file:
        # saver = tf.train.Saver(tf.all_variables())
        saver = tf.train.import_meta_graph(restore_model_from_file)
        saver.restore(
            tf.get_default_session(),
            tf.train.latest_checkpoint('./'))  #restore_model_from_file)
        logger.log("Loaded model from {}".format(restore_model_from_file))

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if save_model_with_prefix:
        if job_id is not None:
            basePath = '/tmp/rosrl/' + str(
                env.__class__.__name__) + '/ppo1/' + job_id
        else:
            basePath = '/tmp/rosrl/' + str(env.__class__.__name__) + '/ppo1/'

    # Create the writer for TensorBoard logs
    summary_writer = tf.summary.FileWriter(outdir,
                                           graph=tf.get_default_graph())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpRewSEM", np.std(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        """
        Save the model periodically (every 10 iterations)
        """

        if save_model_with_prefix:
            #if np.mean(rewbuffer) > -50.0:
            if iters_so_far % 10 == 0:
                basePath = outdir + "/models/"

                if not os.path.exists(basePath):
                    os.makedirs(basePath)
                modelF = basePath + save_model_with_prefix + "_afterIter_" + str(
                    iters_so_far) + ".model"
                U.save_state(modelF)
                logger.log("Saved model to file :{}".format(modelF))

        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="EpRewMean", simple_value=np.mean(rewbuffer))
        ])
        summary_writer.add_summary(summary, timesteps_so_far)
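        # the EpRewMean scalar written above can be inspected by pointing TensorBoard at outdir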
Example #16
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from the replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None, it equals max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If callback returns true training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def agent():
        """Run the agent, connecting to a (remote) host started independently."""
        agent_module, agent_name = FLAGS.agent.rsplit(".", 1)
        agent_cls = getattr(importlib.import_module(agent_module), agent_name)

        with lan_sc2_env.LanSC2Env(
                host=FLAGS.host,
                config_port=FLAGS.config_port,
                race=sc2_env.Race[FLAGS.agent_race],
                step_mul=FLAGS.step_mul,
                realtime=FLAGS.realtime,
                agent_interface_format=sc2_env.parse_agent_interface_format(
                    feature_screen=FLAGS.feature_screen_size,
                    feature_minimap=FLAGS.feature_minimap_size,
                    rgb_screen=FLAGS.rgb_screen_size,
                    rgb_minimap=FLAGS.rgb_minimap_size,
                    action_space=FLAGS.action_space,
                    use_unit_counts=True,
                    use_camera_position=True,
                    show_cloaked=True,
                    show_burrowed_shadows=True,
                    show_placeholders=True,
                    send_observation_proto=True,
                    crop_to_playable_area=True,
                    raw_crop_to_playable_area=True,
                    allow_cheating_layers=True,
                    add_cargo_to_units=True,
                    use_feature_units=FLAGS.use_feature_units),
                visualize=FLAGS.render) as env:
            agents = [agent_cls()]
            logging.info("Connected, starting run_loop.")
            try:
                run_loop.run_loop(agents, env)
            except lan_sc2_env.RestartError:
                pass
        logging.info("Done.")

    def make_obs_ph(name):
        return BatchInput((1, 16, 16), name=name)

    act_x, train_x, update_target_x, debug_x = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq_x")

    act_y, train_y, update_target_y, debug_y = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq_y")
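    # two independent Q-functions (variable scopes deepq_x and deepq_y) choose the x and y screen coordinates separately, each with its own replay buffer below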

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_x = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        replay_buffer_y = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)

        beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)
    else:
        replay_buffer_x = ReplayBuffer(buffer_size)
        replay_buffer_y = ReplayBuffer(buffer_size)

        beta_schedule_x = None
        beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target_x()
    update_target_y()

    #time.sleep(30)  # Stagger startups, otherwise they seem to conflict somehow

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()

    action_blacklist = ['0']

    #function_id = numpy.random.choice(obs[0].observation.available_actions)

    #step forward a noop so units and prob appear
    obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])])

    player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]

    screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [0, 0]

    reset = True

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "nexus_wars")
        print(model_file)

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action_x = act_x(np.expand_dims(np.array(screen)[None], axis=0),
                             update_eps=update_eps,
                             **kwargs)[0]
            action_y = act_y(np.expand_dims(np.array(screen)[None], axis=0),
                             update_eps=update_eps,
                             **kwargs)[0]

            reset = False

            coord = [player[0], player[1]]
            rew = 0

            coord = [action_x, action_y]

            observation_spec = env.observation_spec()
            action_spec = env.action_spec()

            #get available actions
            avail_actions_now = obs[0].observation.available_actions

            #ready for actions yet? only a few available actions means nothing to do yet
            if len(avail_actions_now) > 5:
                #game state is ready for random action commands, get them and args
                function_id = numpy.random.choice(
                    obs[0].observation.available_actions)
                args = [[numpy.random.randint(0, size) for size in arg.sizes]
                        for arg in action_spec[0].functions[function_id].args]

                #issue random command and arg
                obs = env.step(
                    actions=[sc2_actions.FunctionCall(function_id, args)])

                #obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])])
            else:
                #step no matter what
                obs = env.step(actions=[sc2_actions.FunctionCall(_NO_OP, [])])

            player_relative = obs[0].observation["feature_screen"][
                _PLAYER_RELATIVE]
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

            player_y, player_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()
            # resolve the cannot convert float NaN to integer issue
            if len(player_x) == 0:
                player_x = np.array([0])
            if len(player_y) == 0:
                player_y = np.array([0])
            player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["feature_screen"][
                    _PLAYER_RELATIVE]
                screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                #episode_minerals.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:

                    experience_x = replay_buffer_x.sample(
                        batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x,
                     weights_x, batch_idxes_x) = experience_x

                    experience_y = replay_buffer_y.sample(
                        batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                     weights_y, batch_idxes_y) = experience_y
                else:

                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(
                        batch_size)
                    weights_x, batch_idxes_x = np.ones_like(rewards_x), None

                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(
                        batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(np.expand_dims(obses_t_x, axis=1),
                                      actions_x, rewards_x,
                                      np.expand_dims(obses_tp1_x, axis=1),
                                      dones_x, weights_x)

                td_errors_y = train_y(np.expand_dims(obses_t_y, axis=1),
                                      actions_y, rewards_y,
                                      np.expand_dims(obses_tp1_y, axis=1),
                                      dones_y, weights_y)

                if prioritized_replay:
                    new_priorities_x = np.abs(
                        td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(
                        td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x,
                                                      new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y,
                                                      new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_x()
                update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act_x), ActWrapper(act_y)
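The example above returns one wrapper per screen coordinate. A minimal, hedged sketch of querying them at evaluation time; the PySC2 constants and screen preprocessing mirror the training loop, while `greedy_coordinates` and its arguments are illustrative and not part of the original code:

import numpy as np

def greedy_coordinates(act_x, act_y, obs):
    """Pick an (x, y) target greedily from the two coordinate Q-networks."""
    # _PLAYER_RELATIVE and _PLAYER_NEUTRAL are the module-level constants used above.
    player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
    screen = (player_relative == _PLAYER_NEUTRAL).astype(int)
    batch = np.expand_dims(np.array(screen)[None], axis=0)
    # update_eps=0.0 disables epsilon-greedy exploration for evaluation.
    x = act_x(batch, update_eps=0.0)[0]
    y = act_y(batch, update_eps=0.0)[0]
    return int(x), int(y)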
Example #17
0
def learn(
          env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
        (async: should this instead be triggered once the replay_buffer reaches a given size?)
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for the exploration rate, starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        # If checkpoint_path is specified, the current network state is saved there during training
        # via TensorFlow's Saver(); after an interruption, training resumes from the saved state.
        # The RL-specific state (schedules, counters) is not saved and would need changes;
        # note that the replay_buffer should also be saved.
        model_file = os.path.join(td, "model_tn")  # join the directory and the file name
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        # Train for at most max_timesteps steps
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            # Select the action here
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            # The raw observation is stored in the buffer directly; the DQN paper uses a stacked frame sequence as the state, presumably already handled in atari_wrappers
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            env.render()
            # Add the immediate reward to the reward sequence; with multiple actors, how should this change? Track an overall mean_reward, or compute it per actor (better)?
            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            # Start training the network after learning_starts steps (fill the buffer with some data first)
            # Perform one gradient step every train_freq steps
            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))  # note how beta is used
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    #  np.ones_like() : Return an array of ones with the same shape and type as a given array.
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                # print(td_errors)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Mean reward over recent episodes (an episode is a round, not a step); here only the last 20 are averaged
            # mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            mean_100ep_reward = round(np.mean(episode_rewards[-21:-1]), 2)
            num_episodes = len(episode_rewards)
            # Below: log training progress and save the network parameters
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 20 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            # Save the model only every checkpoint_freq steps and when the mean reward has improved
            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        # Training is finished here
        # After training, restore the best saved model
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)
        # Return an ActWrapper, which can be used for act.save("cartpole_model.pkl") and similar operations
    return act
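A hedged usage sketch for the `learn` above, in the style of the older baselines DQN examples; the environment id, the `deepq.models.mlp` network, and the hyperparameters are illustrative assumptions, not values from the original code:

import gym
from baselines import deepq

def main():
    env = gym.make("CartPole-v0")      # any discrete-action env with render() should work
    model = deepq.models.mlp([64])     # simple MLP q_func from the older baselines API
    act = learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
    )
    act.save("cartpole_model.pkl")     # the returned ActWrapper supports save()
    env.close()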
Example #18
0
def learn(
          env,
          actor_deque,
          action_pipes,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    actor_deque: structure is --> (ac_num, obs, action, new_obs, rew, done)
    action_pipes: structure is --> pipes_conn1 = [pipes[i][1] for i in range(0, 2)]
        use --> action_pipes[actor_num].send(s)  default is str
        As for why one end is a deque and the other a pipe: each actor blocks until it receives the action for its next step,
        while the trainer is reactive and must process data from whichever actor provides it, so deque.empty() is convenient.
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
        (async: should this instead be triggered once the replay_buffer reaches a given size?)
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for the exploration rate, starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    # obs = env.reset()
    reset = True
    done = None
    end = 100  # an out-of-range action sent to the actors to signal that training has ended

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td
        model_file = os.path.join(td, "model_tn")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        # Train for at most max_timesteps steps
        t = 0
        while t <= max_timesteps:
            if callback is not None:
                if callback(locals(), globals()):
                    break
            if actor_deque.empty():
                pass
                # time.sleep()
            else:
                actor_information = actor_deque.get()
                if actor_information[2] is None:  # None in the action slot marks the start of an episode
                    ac_num = actor_information[0]
                    new_obs = actor_information[3]
                    done = False  # important
                    # print("ac_num "+str(ac_num)+" start")
                else:
                    ac_num = actor_information[0]
                    obs = actor_information[1]
                    action = actor_information[2]
                    new_obs = actor_information[3]
                    rew = actor_information[4]
                    done = actor_information[5]
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                if done:  # done and episode-start never occur in the same message
                    # obs = env.reset()
                    # episode_rewards.append(0.0)
                    reset = True
                else:
                    # Take action and update exploration to the newest value
                    kwargs = {}
                    if not param_noise:
                        update_eps = exploration.value(t)
                        update_param_noise_threshold = 0.
                    else:
                        update_eps = 0.
                        update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                        kwargs['reset'] = reset
                        kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                        kwargs['update_param_noise_scale'] = True
                    action = act(np.array(new_obs)[None], update_eps=update_eps, **kwargs)[0]
                    env_action = action
                    reset = False
                    action_pipes[ac_num-1].send(env_action)  # note: ac_num is not aligned with the pipe index here
                    # Start training the network after learning_starts steps (fill the buffer with some data first)
                    # Perform one gradient step every train_freq steps
                    if t > learning_starts and t % train_freq == 0:
                        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                        if prioritized_replay:
                            experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))  # note how beta is used
                            (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                        else:
                            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                            #  np.ones_like() : Return an array of ones with the same shape and type as a given array.
                            weights, batch_idxes = np.ones_like(rewards), None
                        td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                        if prioritized_replay:
                            new_priorities = np.abs(td_errors) + prioritized_replay_eps
                            replay_buffer.update_priorities(batch_idxes, new_priorities)

                    if t > learning_starts and t % target_network_update_freq == 0:
                        # Update target network periodically.
                        update_target()

                # Below: log training progress and save the network parameters
                if print_freq is not None and t % print_freq == 0:
                    logger.record_tabular("total_steps", t)
                    # logger.record_tabular("episodes", num_episodes)
                    # logger.record_tabular("mean 20 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                    logger.dump_tabular()

                # Save the model every checkpoint_freq steps
                if checkpoint_freq is not None and t > learning_starts and t % checkpoint_freq == 0:
                    save_state(model_file)
                    model_saved = True
                t += 1
        # Training is finished here
        # end = True
        for i in range(0, len(action_pipes)):
            action_pipes[i].send(end)  # end = 100
        # After training, restore the best saved model (disabled below)
        # if model_saved:
        #     if print_freq is not None:
        #         logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
        #     load_state(model_file)
        # Return an ActWrapper, which can be used for act.save("cartpole_model.pkl") and similar operations
    return act
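The deque/pipe protocol described in the docstring implies an actor loop roughly like the following hedged sketch; the message layout, the 1-based ac_num/pipe offset, and the end = 100 sentinel come from the trainer code above, while the function itself and its argument names are assumptions:

def actor_worker(ac_num, env, actor_deque, action_pipe, end_sentinel=100):
    """One environment worker: pushes transitions to the trainer and waits for its action."""
    obs = env.reset()
    # Episode-start message: the action slot (index 2) is None, so the trainer only acts.
    actor_deque.put((ac_num, None, None, obs, None, False))
    while True:
        action = action_pipe.recv()        # blocks until the trainer replies
        if action == end_sentinel:         # trainer broadcasts `end` when training stops
            break
        new_obs, rew, done, _ = env.step(action)
        # Regular transition message: (ac_num, obs, action, new_obs, rew, done).
        actor_deque.put((ac_num, obs, action, new_obs, rew, done))
        obs = new_obs
        if done:
            # The trainer does not reply to a `done` message, so announce a fresh episode.
            obs = env.reset()
            actor_deque.put((ac_num, None, None, obs, None, False))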
Example #19
0
def train_DISCARL(env_id,
                  num_timesteps,
                  seed,
                  render,
                  max_steps_episode,
                  clip_action=False,
                  ckpt_dir=None,
                  restore_dir=None,
                  n=1.0):
    def policy_pro(name, ob_space, ac_space):
        # return MlpPolicy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #                            hid_size=64, num_hid_layers=2)
        return MlpPolicy_Pro.MlpPolicy(name=name,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       tau=3e-4,
                                       hid_size=64,
                                       num_hid_layers=4)  # double inverted pendulum: 4 layers

    def policy_adv(name, ob_space, ac_space):
        # return MlpPolicy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #                            hid_size=64, num_hid_layers=2)
        return MlpPolicy_Adv.MlpPolicy(name=name,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       tau=3e-4,
                                       hid_size=64,
                                       num_hid_layers=4)  # double inverted pendulum: 4 layers

    env = gym.make(env_id)

    env.update_adversary(n)

    set_global_seeds(seed)
    env.seed(seed)

    save_timestep_period = num_timesteps
    if ckpt_dir:
        print('logging to ' + ckpt_dir)
    pro_pi, rew, timesteps_so_far, len_mean = PPO_RARL_DISCARL2_v5.learn(
        env,
        policy_pro,
        policy_adv,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.02,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=5e-4,
        optim_batchsize=64,
        max_steps_episode=max_steps_episode,
        gamma=0.99,
        lam=0.95,
        lr_l=5e-4,
        lr_a=5e-4,
        schedule='linear',
        clip_action=clip_action,
        restore_dir=restore_dir,
        ckpt_dir=None,
        save_timestep_period=save_timestep_period,
    )
    if ckpt_dir:
        # print(model_path)
        U.save_state(ckpt_dir)

    env.close()
    return pro_pi, len_mean, timesteps_so_far
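A hedged sketch of invoking the wrapper above; the environment id is only a placeholder (it must expose update_adversary()), and the remaining argument values are illustrative:

if __name__ == "__main__":
    # 'SomeAdversarialEnv-v1' is a placeholder for a registered env implementing update_adversary().
    pro_pi, len_mean, steps = train_DISCARL(
        env_id="SomeAdversarialEnv-v1",
        num_timesteps=1000000,
        seed=0,
        render=False,
        max_steps_episode=1000,
        clip_action=True,
        ckpt_dir="./checkpoints",
        restore_dir=None,
        n=1.0)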
Example #20
0
def learn(
    env,
    q_func,
    lr=5e-4,
    max_timesteps=100000,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    train_freq=1,
    batch_size=32,
    print_freq=100,
    checkpoint_freq=10000,
    checkpoint_path=None,
    learning_starts=1000,
    gamma=1.0,
    target_network_update_freq=500,
    prioritized_replay=False,
    prioritized_replay_alpha=0.6,
    prioritized_replay_beta0=0.4,
    prioritized_replay_beta_iters=None,
    prioritized_replay_eps=1e-6,
    param_noise=False,
    callback=None,
    epoch_steps=20000,
    gpu_memory=1.0,
    double_q=False,
    scope="deepq",
    directory='.',
    nb_test_steps=10000,
):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.per_process_gpu_memory_fraction = gpu_memory
    config.gpu_options.polling_inactive_delay_msecs = 25
    sess = tf.Session(config=config)
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, act_greedy, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise,
        double_q=bool(double_q),
        scope=scope)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    #recording
    records = {'loss': [], 'online_reward': [], 'test_reward': []}

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        ep_losses, ep_means, losses = [], [], []
        print("===== LEARNING STARTS =====")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            timelimit_env = env
            while (not hasattr(timelimit_env, '_elapsed_steps')):
                timelimit_env = timelimit_env.env

            if timelimit_env._elapsed_steps < timelimit_env._max_episode_steps:
                # Store transition in the replay buffer.
                replay_buffer.add(obs, action, rew, new_obs, float(done))
            else:
                replay_buffer.add(obs, action, rew, new_obs, float(not done))

            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True
                if losses:
                    ep_losses.append(np.mean(losses))
                    losses = []

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                losses.append(td_errors)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)

            if (t + 1) % epoch_steps == 0 and (t + 1) > learning_starts:
                test_reward = test(env,
                                   act_greedy,
                                   nb_test_steps=nb_test_steps)
                records['test_reward'].append(test_reward)
                records['loss'].append(np.mean(ep_losses))
                records['online_reward'].append(
                    round(np.mean(episode_rewards[-101:-1]), 1))
                pickle.dump(records,
                            open(os.path.join(directory, "records.pkl"), "wb"))
                print("==== EPOCH %d ===" % ((t + 1) / epoch_steps))
                print(tabulate([[k, v[-1]] for (k, v) in records.items()]))

            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and (t + 1) > learning_starts
                    and num_episodes > 100 and (t + 1) % checkpoint_freq == 0):
                print("Saving model to model_%d.pkl" % (t + 1))
                act.save(
                    os.path.join(directory, "model_" + str(t + 1) + ".pkl"))
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_state(model_file)
    return act, records
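Since the loop above pickles a `records` dict to records.pkl every epoch, the saved metrics can be inspected offline with a short hedged sketch like this; `load_records` is illustrative, and the keys are exactly the ones written above:

import os
import pickle

def load_records(directory="."):
    """Load and print the per-epoch metrics dumped by the training loop above."""
    with open(os.path.join(directory, "records.pkl"), "rb") as f:
        records = pickle.load(f)
    # Keys written above: 'loss', 'online_reward', 'test_reward' (one entry per epoch).
    for epoch, values in enumerate(zip(records["loss"],
                                       records["online_reward"],
                                       records["test_reward"]), start=1):
        print("epoch %d: loss=%s online_reward=%s test_reward=%s" % ((epoch,) + values))
    return records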
Example #21
0
def save_model(dict_state):
    save_state("saved_model/model.ckpt")
    relatively_safe_pickle_dump(dict_state, "saved_model/model_state.pkl.zip", compression=True)
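A hedged counterpart for restoring what save_model writes; the import paths assume the older baselines layout (baselines.common.tf_util.load_state, baselines.common.misc_util.pickle_load) and should be adjusted if they differ:

from baselines.common.misc_util import pickle_load
from baselines.common.tf_util import load_state

def load_model():
    """Restore the TF variables and the pickled training state written by save_model."""
    load_state("saved_model/model.ckpt")
    dict_state = pickle_load("saved_model/model_state.pkl.zip", compression=True)
    return dict_state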
Example #22
0
 def save(self, path):
     save_state(path)
Example #23
0
def learn(
        update_flag,
        end_train_flag,
        total_step,
        net_list,
        net_list_lock,
        mem_queue,
        env,
        q_func,
        lr=5e-4,
        max_timesteps=1000000,
        buffer_size=100000,
        batch_size=32,
        checkpoint_freq=10000,
        checkpoint_path=None,
        learning_starts=5000,
        gamma=1.0,
        target_network_update_freq=500,  # in the async setup the trainer iterates faster than usual, so these values are still open to tuning
        actor_network_update_freq=500,  # ideally a bit smaller than the actor side (not strictly necessary, since the trainer runs much faster than the actors)
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    batch_size: int
        size of a batched sampled from replay buffer for training
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
        (async: should this instead be triggered once the replay_buffer reaches a given size?)
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    # sess = tf.Session()
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.2  # use 20% of the GPU memory
    sess = tf.Session(config=config)
    # sess = U.single_threaded_session()  # restrict to a single core
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, init_actor_qfunc, update_actor_qfunc, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    replay_buffer = MemBufferThread(
        mem_queue,
        max_timesteps=max_timesteps,
        buffer_size=buffer_size,
        batch_size=batch_size,
        prioritized_replay=prioritized_replay,
        prioritized_replay_alpha=prioritized_replay_alpha,
        prioritized_replay_beta0=prioritized_replay_beta0,
        prioritized_replay_beta_iters=prioritized_replay_beta_iters,
        prioritized_replay_eps=prioritized_replay_eps)

    replay_buffer.setDaemon(True)  # make the worker thread exit with the main thread; must be set before start()
    replay_buffer.start()

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    init_actor_qfunc(sess=sess, net_list=net_list)  # after initialization, push the network to the actors once
    # update_actor_qfunc(sess=sess, net_list=net_list, net_list_lock=net_list_lock)
    update_flag.value += 1  # set the flag so each actor can copy the initial network

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td
        model_file = os.path.join(td, "model_tn")  # 将两端路径名/文件名 合在一起
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        t = 0
        # Train indefinitely rather than for max_timesteps
        # for t in range(max_timesteps):
        while True:
            if callback is not None:
                if callback(locals(), globals()):
                    break

            # Wait until the replay_buffer holds enough data before training the network
            while replay_buffer.__len__() < learning_starts:
                # print(replay_buffer.__len__())
                time.sleep(1)

            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones, weights = replay_buffer.sample(
                total_step.value)

            td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                              weights)
            # print(td_errors)
            if prioritized_replay:
                replay_buffer.update_priorities(td_errors)

            if t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Update the actor network
            if t % actor_network_update_freq == 0:
                update_actor_qfunc(sess=sess,
                                   net_list=net_list,
                                   net_list_lock=net_list_lock)

            # time.sleep(0.05)  # should not be needed
            # Save the model every checkpoint_freq steps
            if (checkpoint_freq is not None and t % checkpoint_freq == 0):
                logger.log("Saving model")
                save_state(model_file)  # TensorFlow-style save, so training can be resumed
                model_saved = True
                act.save("n_robot_model.pkl")  # 这里只保存了act相关内容,可以用来检查运行结果
                # act.save("cartpole_model.pkl")  # 这里只保存了act相关内容,可以用来检查运行结果
                # act.save("MountainCar_model.pkl")
            t += 1
            # # 4 is the number of actors and max_timesteps the per-actor step limit; when the actors finish, training ends too (does not work well)
            # if (total_step.value+4)/4 + 1000 >= max_timesteps:
            #     break
            if end_train_flag.value == 4:  # 4 is the number of actors
                break
        # Training is finished here
        # Return an ActWrapper, which can be used for act.save("cartpole_model.pkl") and similar operations
        print("end training")
        if model_saved:
            # logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            logger.log("Restored model")
            load_state(model_file)
    # replay_buffer.join()
    return act
Example #24
0
 def save(self, save_path):
     tf_util.save_state(save_path, sess=self.sess)
Example #25
0
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        double_q=True,
        grad_norm_clipping=10
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
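Both the exploration epsilon and (when prioritized replay is enabled) the importance-sampling exponent beta in the loop above are annealed with LinearSchedule. The snippet below is a self-contained sketch of the same linear annealing rule for reference; the helper name and the printed values are illustrative only, not part of the example.

def linear_eps(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    # Linear interpolation from initial_p to final_p over schedule_timesteps steps,
    # then held at final_p, mirroring how LinearSchedule.value(t) is used above.
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

max_timesteps = 100000
for t in (0, 5000, 10000, 50000):
    # with exploration_fraction=0.1 this prints 1.0, 0.51, 0.02, 0.02
    print(t, round(linear_eps(t, int(0.1 * max_timesteps)), 2))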
Example #26
0
def learn(env, policy_func, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        save_name=None,
        save_per_acts=3,
        reload_name=None
        ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.sensor_space
    ac_space = env.action_space

    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    if reload_name:
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), reload_name)
        print("Loaded model successfully.")


    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult) 
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)            
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

        #print(iters_so_far, save_per_acts)

        if save_name and (iters_so_far % save_per_acts == 0):
            base_path = os.path.dirname(os.path.abspath(__file__))
            print(base_path)
            out_name = os.path.join(base_path, 'models', save_name + '_' + str(iters_so_far) + ".model")
            U.save_state(out_name)
            print ("Saved model successfully.")
Example #27
0
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          save_model_with_prefix=None,
          restore_model_from_file=None,
          outdir="/tmp/rosrl/experiments/continuous/acktr/"):

    obfilter = ZFilter(env.observation_space.shape)
    # Risto change
    max_pathlength = env.max_episode_steps
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()
    """
    Here we add a possibility to resume from a previously saved model if a model file is provided
    """
    if restore_model_from_file:
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), restore_model_from_file)
        logger.log("Loaded model from {}".format(restore_model_from_file))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0

    if save_model_with_prefix:
        # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
        summary_writer = tf.summary.FileWriter(outdir,
                                               graph=tf.get_default_graph())

    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)

        if callback:
            callback()
        logger.dump_tabular()
        """
        Save the model at every iteration
        """
        if save_model_with_prefix:
            if np.mean([path["reward"].sum() for path in paths]) > -50.0:
                # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="EpRewMean",
                                     simple_value=np.mean([
                                         path["reward"].sum() for path in paths
                                     ]))
                ])
                summary_writer.add_summary(summary, i)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str(
                    i) + ".model"
                U.save_state(modelF)
                logger.log("Saved model to file :{}".format(modelF))

        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
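The advantage computation in Example #27 (the delta_t line followed by common.discount with gamma * lam) is generalized advantage estimation (GAE); the PPO example's add_vtarg_and_adv call does the same thing. Below is a self-contained sketch of that estimator with the discounting loop written out explicitly; the names are illustrative and the snippet does not depend on the example's helpers.

import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.97):
    # `values` holds one extra bootstrap entry for the state after the last
    # reward (0.0 if the path terminated), mirroring np.append(vpred_t, ...) above.
    deltas = rewards + gamma * values[1:] - values[:-1]   # one-step TD residuals
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running        # discounted sum of residuals
        adv[t] = running
    return adv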
def learn(env,
          q_func,
          beta1=0.9,
          beta2=0.999,
          epsilon=1e-8,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          exploration_schedule=None,
          start_lr=5e-4,
          end_lr=5e-4,
          start_step=0,
          end_step=1,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          model_directory=None,
          lamda=0.1):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    start_lr: float
        initial learning rate for the adam optimizer
    end_lr: float
        final learning rate for the adam optimizer
    beta1: float
        beta1 parameter for adam
    beta2: float
        beta2 parameter for adam
    epsilon: float
        epsilon parameter for adam
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    exploration_schedule: Schedule
        a schedule for exploration chance
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return U.BatchInput(observation_space_shape, name=name)

    global_step = tf.Variable(0, trainable=False)
    lr = interpolated_decay(start_lr, end_lr, global_step, start_step,
                            end_step)
    act, train, update_target, debug = multiheaded_build_graph.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr,
                                         beta1=beta1,
                                         beta2=beta2,
                                         epsilon=epsilon),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise,
        global_step=global_step,
        lamda=lamda,
    )
    tf.summary.FileWriter(logger.get_dir(), graph_def=sess.graph_def)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    if exploration_schedule is None:
        exploration = LinearSchedule(schedule_timesteps=int(
            exploration_fraction * max_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)
    else:
        exploration = exploration_schedule

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        if model_directory is None:
            model_directory = pathlib.Path(td)
        model_file = str(model_directory / "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            if isinstance(env.action_space, gym.spaces.MultiBinary):
                env_action = np.zeros(env.action_space.n)
                env_action[action] = 1
            else:
                env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    act.save(str(model_directory / "act_model.pkl"))
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return act
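When param_noise is enabled, the loop above converts the current eps-greedy exploration rate into a KL-divergence threshold for the perturbed policy, following Appendix C.1 of Plappert et al., 2017. A small self-contained sketch of that conversion follows; the function name and the printed values are illustrative.

import numpy as np

def param_noise_kl_threshold(eps, num_actions):
    # Same formula as the comment above: delta = -log(1 - eps + eps / |A|)
    return -np.log(1. - eps + eps / float(num_actions))

# For a 4-action environment the allowed KL shrinks as eps anneals towards 0.02:
for eps in (1.0, 0.5, 0.1, 0.02):
    print(eps, round(param_noise_kl_threshold(eps, 4), 4))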
class ActWrapper(object):
    def __init__(self, act):
        self._act = act
        #self._act_params = act_params
    
    @staticmethod
    def load(path, act_params, num_cpu=16):
        with open(path, "rb") as f:
            model_data = dill.load(f)
        act = deepq.build_act(**act_params)
        sess = U.make_session(num_cpu=num_cpu)
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)
            
            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            U.load_state(os.path.join(td, "model"))
        
        return ActWrapper(act)
    
    def __call__(self, *args, **kwargs):
        return self._act(*args, **kwargs)
    
    def save(self, path):
        """Save model to a pickle located at `path`"""
        with tempfile.TemporaryDirectory() as td:
            U.save_state(os.path.join(td, "model"))
            arc_name = os.path.join(td, "packed.zip")
            with zipfile.ZipFile(arc_name, 'w') as zipf:
                for root, dirs, files in os.walk(td):
                    for fname in files:
                        file_path = os.path.join(root, fname)
                        if file_path != arc_name:
                            zipf.write(file_path, os.path.relpath(file_path, td))
            with open(arc_name, "rb") as f:
                model_data = f.read()
        with open(path, "wb") as f:
            dill.dump(model_data, f)
    
    def load(path, act_params, num_cpu=16):
        """ Load act function that was returned by learn function.

        Parameters
        ----------
        path: str
            path to the act function pickle
        num_cpu: int
            number of cpus to use for executing the policy
        
        Returns
        -------
        act: ActWrapper
            function that takes a batch of observations
            and returns actions.
        """
        return ActWrapper.load(path, num_cpu=num_cpu, act_params=act_params)
    
    def learn(
        env,
        q_func,
        num_actions=4,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=1,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        num_cpu=16,
        param_noise=False,
        param_noise_threshold=0.05,
        callback=None
    ):
    
    # Train a deepq model.
    # Parameters
    # -------
    # env: pysc2.env.SC2Env
    #   environment to train on
    # q_func: (tf.Variable, int, str, bool) -> tf.Variable
    #   the model that takes the following inputs:
    #       observation_in: object
    #           the output of observation placeholder
    #       num_actions: int
    #           number of actions
    #       scope: str
    #       reuse: bool
    #           should be passed to outer variable scope
    #   and returns a tensor of shape (batch_size, num_actions) with values of every action.
    # lr: float
    #   learning rate for adam optimizer
    # max_timesteps: int
    #   number of env steps to optimize for
    # buffer_size: int
    #   size of the replay buffer
    # exploration_fraction: float
    #   fraction of entire training period over which the exploration rate is annealed
    # exploration_final_eps: float
    #   final value of random action probability
    # train_freq: int
    #   update the model every `train_freq` steps.
    # batch_size: int
    #   size of a batch sampled from the replay buffer for training
    # print_freq: int
    #   how often to print out training progress
    #   set to None to disable printing
    # checkpoint_freq: int
    #   how often to save the model. This is so that the best version is restored
    #   at the end of the training. If you do not wish to restore the best version at
    #   the end of the training set this variable to None.
    # learning_starts: int
    #   how many steps of the model to collect transitions for before learning starts
    # gamma: float
    #   discount factor
    # target_network_update_freq: int
    #   update the target network every `target_network_update_freq` steps.
    # prioritized_replay: bool
    #   if True prioritized replay buffer will be used.
    # prioritized_replay_alpha: float
    #   alpha parameter for prioritized replay buffer
    # prioritized_replay_beta0: float
    #   initial value of beta for prioritized replay buffer
    # prioritized_replay_beta_iters: int
    #   number of iterations over which beta will be annealed from initial value
    #   to 1.0. If set to None, it defaults to max_timesteps.
    # prioritized_replay_eps: float
    #   epsilon to add to the TD errors when updating priorities.
    # num_cpu: int
    #   number of cpus to use for training
    # callback: (locals, globals) -> None
    #   function called at every step with state of the algorithm.
    #   If callback returns true, training stops.
    # Returns
    # -------
    # act: ActWrapper
    #   Wrapper over act function. Adds ability to save it and load it.
    #   See header of baselines/deepq/categorical.py for details on the act function.

    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)
    
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
        # Create the schedule for exploration starting from 1.
        exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards=[0.0]
        # episode_minerals = [0.0]
        saved_mean_reward = None

        path_memory = np.zeros((64,64))

        obs = env.reset()
        # Select all marines first
        obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

        screen = player_relative + path_memory
        
        player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
        player = [int(player_x.mean()), int(player_y.mean())]

        if(player[0]>32):
            screen = shift(LEFT, player[0]-32, screen)
        elif(player[0]<32):
            screen = shift(RIGHT, 32 - player[0], screen)

        if(player[1]>32):
            screen = shift(UP, player[1]-32, screen)
        elif(player[1]<32):
            screen = shift(DOWN, 32 - player[1], screen)
        
        reset = True
        with tempfile.TemporaryDirectory() as td:
            model_saved = False
            model_file = os.path.join(td, "model")

            for t in range(max_timesteps):
                if callback is not None:
                    if callback(locals(), globals()):
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not param_noise:
                    update_eps = exploration.value(t)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    if param_noise_threshold >= 0.:
                        update_param_noise_threshold = param_noise_threshold
                    else:
                        # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                        # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                        # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                        # for detailed explanation.
                        update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(num_actions))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
                reset = False

                coord = [player[0], player[1]]
                rew = 0

                path_memory_ = np.array(path_memory, copy=True)
                if(action == 0): #UP

                    if(player[1] >= 16):
                        coord = [player[0], player[1] - 16]
                        path_memory_[player[1] - 16 : player[1], player[0]] = -1
                    elif(player[1] > 0):
                        coord = [player[0], 0]
                        path_memory_[0 : player[1], player[0]] = -1
                    #else:
                    #   rew -= 1
                elif(action == 1): #DOWN

                    if(player[1] <= 47):
                        coord = [player[0], player[1] + 16]
                        path_memory_[player[1] : player[1] + 16, player[0]] = -1
                    elif(player[1] > 47):
                        coord = [player[0], 63]
                        path_memory_[player[1] : 63, player[0]] = -1
                    #else:
                    #   rew -=1

                elif(action == 2): # LEFT
                    if(player[0] >= 16):
                        coord = [player[0] - 16, player[1]]
                        path_memory_[player[1], player[0] - 16 : player[0]] = -1
                    elif(player[0] < 16):
                        coord = [0, player[1]]
                        path_memory_[player[1], 0 : player[0]] = -1
                    #else:
                    #  rew -= 1

                elif(action == 3): #RIGHT

                    if(player[0] <= 47):
                        coord = [player[0] + 16, player[1]]
                        path_memory_[player[1], player[0] : player[0] + 16] = -1
                    elif(player[0] > 47):
                        coord = [63, player[1]]
                        path_memory_[player[1], player[0] : 63] = -1
                    #else:
                    #  rew -= 1

                #else:
                    #Cannot move, give minus reward
                #

                # if(path_memory[coord[1],coord[0]] !=0):
                #   rew -= 0.5

                path_memory = np.array(path_memory_)
                #print("action : %s Coord : %s" % (action, coord))

                if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                    obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
                
                new_action = [sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])]

                # else:
                #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

                obs = env.step(actions=new_action)

                player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
                new_screen = player_relative + path_memory

                player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                if(player[0]>32):
                    new_screen = shift(LEFT, player[0]-32, new_screen)
                elif(player[0]<32):
                    new_screen = shift(RIGHT, 32 - player[0], new_screen)

                if(player[1]>32):
                    new_screen = shift(UP, player[1]-32, new_screen)
                elif(player[1]<32):
                    new_screen = shift(DOWN, 32 - player[1], new_screen)

                rew = obs[0].reward

                done = obs[0].step_type == environment.StepType.LAST

                # Store transition in the replay buffer.
                replay_buffer.add(screen, action, rew, new_screen, float(done))
                screen = new_screen

                episode_rewards[-1] += rew
                #episode_minerals[-1] += obs[0].reward

                if done:
                    obs = env.reset()
                    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

                    screen = player_relative + path_memory

                    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                    player = [int(player_x.mean()), int(player_y.mean())]

                    if(player[0]>32):
                        screen = shift(LEFT, player[0]-32, screen)
                    elif(player[0]<32):
                        screen = shift(RIGHT, 32 - player[0], screen)
                    
                    if(player[1]>32):
                        screen = shift(UP, player[1]-32, screen)
                    elif(player[1]<32):
                        screen = shift(DOWN, 32 - player[1], screen)

                    # Select all marines first
                    env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
                    episode_rewards.append(0.0)
                    #episode_minerals.append(0.0)

                    path_memory = np.zeros((64,64))

                    reset = True
                if t > learning_starts and t % train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if prioritized_replay:
                        experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                    if prioritized_replay:
                        new_priorities = np.abs(td_errors) + prioritized_replay_eps
                        replay_buffer.update_priorities(batch_idxes, new_priorities)
                if t > learning_starts and t % target_network_update_freq == 0:
                    # Update target network periodically
                    update_target()
                
                mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
                #mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1)
                num_episodes = len(episode_rewards)
                if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    #logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral)
                    logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                    logger.dump_tabular()
                
                if (checkpoint_freq is not None and t > learning_starts and
                        num_episodes > 100 and t % checkpoint_freq == 0):
                    if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                        if print_freq is not None:
                            logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                        U.save_state(model_file)
                        model_saved = True
                        saved_mean_reward = mean_100ep_reward
            if model_saved:
                if print_freq is not None:
                    logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
                U.load_state(model_file)
        
        return ActWrapper(act)
    
    def intToCoordinate(num, size=64):
        if size !=64:
            num = num * size * size // 4096
        y = num // size
        x = num - size * y
        return [x, y]
    
    UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

    def shift(direction, number, matrix):
        ''' Shift the given 2D matrix the given number of rows or columns
            in the specified (UP, DOWN, LEFT, RIGHT) direction and return it;
            cells that wrap around during the roll are overwritten with -2.
        '''
        if direction == UP:
            matrix = np.roll(matrix, -number, axis=0)
            matrix[-number:, :] = -2
            return matrix
        elif direction == DOWN:
            matrix = np.roll(matrix, number, axis=0)
            matrix[:number, :] = -2
            return matrix
        elif direction == LEFT:
            matrix = np.roll(matrix, -number, axis=1)
            matrix[:, -number:] = -2
            return matrix
        elif direction == RIGHT:
            matrix = np.roll(matrix, number, axis=1)
            matrix[:, :number] = -2
            return matrix
        else:
            return matrix
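The shift helper and the if/elif blocks around it recenter the observation so the selected marine always sits at the middle of the 64x64 screen, with newly exposed cells filled with -2. The sketch below performs the same recentering for both axes in one call using np.roll directly; the function and its fill value are illustrative, not part of the example.

import numpy as np

def recenter(screen, player, fill=-2):
    # Move the cell at (player[0], player[1]) to (32, 32); cells that wrap
    # around during the roll are overwritten with `fill`, like shift() above.
    dx, dy = 32 - player[0], 32 - player[1]
    screen = np.roll(screen, dx, axis=1)
    if dx > 0:
        screen[:, :dx] = fill
    elif dx < 0:
        screen[:, dx:] = fill
    screen = np.roll(screen, dy, axis=0)
    if dy > 0:
        screen[:dy, :] = fill
    elif dy < 0:
        screen[dy:, :] = fill
    return screen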

Example #30
0
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from the replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None, it defaults to max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with state of the algorithm.
      If callback returns true, training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act_x, train_x, update_target_x, debug_x = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq_x")

    act_y, train_y, update_target_y, debug_y = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq_y")

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_x = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        replay_buffer_y = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)

        beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)
    else:
        replay_buffer_x = ReplayBuffer(buffer_size)
        replay_buffer_y = ReplayBuffer(buffer_size)

        beta_schedule_x = None
        beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target_x()
    update_target_y()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select all marines first
    obs = env.step(
        actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative  #+ path_memory

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [int(player_x.mean()), int(player_y.mean())]

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "mineral_shards")
        print(model_file)

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            action_x = act_x(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]

            action_y = act_y(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]

            reset = False

            coord = [player[0], player[1]]
            rew = 0

            coord = [action_x, action_y]

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])

            new_action = [
                sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
            ]

            # else:
            #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative  #+ path_memory

            player_y, player_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward

            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]

                screen = player_relative  #+ path_memory

                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                #episode_minerals.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:

                    experience_x = replay_buffer_x.sample(
                        batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x,
                     weights_x, batch_idxes_x) = experience_x

                    experience_y = replay_buffer_y.sample(
                        batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                     weights_y, batch_idxes_y) = experience_y
                else:

                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(
                        batch_size)
                    weights_x, batch_idxes_x = np.ones_like(rewards_x), None

                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(
                        batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(obses_t_x, actions_x, rewards_x,
                                      obses_tp1_x, dones_x, weights_x)

                td_errors_y = train_y(obses_t_y, actions_y, rewards_y,
                                      obses_tp1_y, dones_y, weights_y)

                if prioritized_replay:
                    new_priorities_x = np.abs(
                        td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(
                        td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x,
                                                      new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y,
                                                      new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_x()
                update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act_x), ActWrapper(act_y)
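The function above returns one ActWrapper per screen axis. As a hedged sketch only (assuming, as the paired replay buffers suggest, that each head's greedy action directly indexes its x or y screen coordinate, which is not shown in the snippet), the two wrappers could be queried together at inference time roughly like this:

# Hypothetical inference sketch for the two-headed (x, y) DQN above.
# act_x / act_y are the ActWrappers returned by learn(); the assumption that
# each greedy action is already a screen coordinate index is mine, not the code's.
import numpy as np
from pysc2.lib import actions as sc2_actions

_MOVE_SCREEN = sc2_actions.FUNCTIONS.Move_screen.id
_NOT_QUEUED = [0]

def move_with_xy_heads(env, act_x, act_y, screen):
    """Pick an (x, y) target with the two heads and issue a Move_screen call."""
    x = int(act_x(np.array(screen)[None])[0])
    y = int(act_y(np.array(screen)[None])[0])
    coord = [x, y]
    return env.step(actions=[
        sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
    ])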
Example #31
0
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
  """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from the replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of the training. If you do not wish to restore the best version at
    the end of the training set this variable to None.
learning_starts: int
    how many steps of the model to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, it defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If callback returns true, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
  # Create all the functions necessary to train the model

  sess = U.make_session(num_cpu=num_cpu)
  sess.__enter__()

  def make_obs_ph(name):
    return U.BatchInput((32, 32), name=name)

  act, train, update_target, debug = deepq.build_train(
    make_obs_ph=make_obs_ph,
    q_func=q_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    gamma=gamma,
    grad_norm_clipping=10,
    scope="deepq")
  #
  # act_y, train_y, update_target_y, debug_y = deepq.build_train(
  #   make_obs_ph=make_obs_ph,
  #   q_func=q_func,
  #   num_actions=num_actions,
  #   optimizer=tf.train.AdamOptimizer(learning_rate=lr),
  #   gamma=gamma,
  #   grad_norm_clipping=10,
  #   scope="deepq_y"
  # )

  act_params = {
    'make_obs_ph': make_obs_ph,
    'q_func': q_func,
    'num_actions': num_actions,
  }

  # Create the replay buffer
  if prioritized_replay:
    replay_buffer = PrioritizedReplayBuffer(
      buffer_size, alpha=prioritized_replay_alpha)
    # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

    if prioritized_replay_beta_iters is None:
      prioritized_replay_beta_iters = max_timesteps
    beta_schedule = LinearSchedule(
      prioritized_replay_beta_iters,
      initial_p=prioritized_replay_beta0,
      final_p=1.0)

    # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
    #                                  initial_p=prioritized_replay_beta0,
    #                                  final_p=1.0)
  else:
    replay_buffer = ReplayBuffer(buffer_size)
    # replay_buffer_y = ReplayBuffer(buffer_size)

    beta_schedule = None
    # beta_schedule_y = None
  # Create the schedule for exploration starting from 1.
  exploration = LinearSchedule(
    schedule_timesteps=int(exploration_fraction * max_timesteps),
    initial_p=1.0,
    final_p=exploration_final_eps)

  # Initialize the parameters and copy them to the target network.
  U.initialize()
  update_target()
  # update_target_y()

  episode_rewards = [0.0]
  saved_mean_reward = None

  obs = env.reset()
  # Select all marines first
  obs = env.step(
    actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

  player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

  screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

  player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
  player = [int(player_x.mean()), int(player_y.mean())]

  if (player[0] > 16):
    screen = shift(LEFT, player[0] - 16, screen)
  elif (player[0] < 16):
    screen = shift(RIGHT, 16 - player[0], screen)

  if (player[1] > 16):
    screen = shift(UP, player[1] - 16, screen)
  elif (player[1] < 16):
    screen = shift(DOWN, 16 - player[1], screen)

  reset = True
  with tempfile.TemporaryDirectory() as td:
    model_saved = False
    model_file = os.path.join("model/", "mineral_shards")
    print(model_file)

    for t in range(max_timesteps):
      if callback is not None:
        if callback(locals(), globals()):
          break
      # Take action and update exploration to the newest value
      kwargs = {}
      if not param_noise:
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.
      else:
        update_eps = 0.
        if param_noise_threshold >= 0.:
          update_param_noise_threshold = param_noise_threshold
        else:
          # Compute the threshold such that the KL divergence between perturbed and non-perturbed
          # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
          # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
          # for detailed explanation.
          update_param_noise_threshold = -np.log(
            1. - exploration.value(t) +
            exploration.value(t) / float(num_actions))
        kwargs['reset'] = reset
        kwargs[
          'update_param_noise_threshold'] = update_param_noise_threshold
        kwargs['update_param_noise_scale'] = True

      action = act(
        np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      reset = False

      coord = [player[0], player[1]]
      rew = 0

      if (action == 0):  #UP

        if (player[1] >= 8):
          coord = [player[0], player[1] - 8]
          #path_memory_[player[1] - 16 : player[1], player[0]] = -1
        elif (player[1] > 0):
          coord = [player[0], 0]
          #path_memory_[0 : player[1], player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 1):  #DOWN

        if (player[1] <= 23):
          coord = [player[0], player[1] + 8]
          #path_memory_[player[1] : player[1] + 16, player[0]] = -1
        elif (player[1] > 23):
          coord = [player[0], 31]
          #path_memory_[player[1] : 63, player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 2):  #LEFT

        if (player[0] >= 8):
          coord = [player[0] - 8, player[1]]
          #path_memory_[player[1], player[0] - 16 : player[0]] = -1
        elif (player[0] < 8):
          coord = [0, player[1]]
          #path_memory_[player[1], 0 : player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 3):  #RIGHT

        if (player[0] <= 23):
          coord = [player[0] + 8, player[1]]
          #path_memory_[player[1], player[0] : player[0] + 16] = -1
        elif (player[0] > 23):
          coord = [31, player[1]]
          #path_memory_[player[1], player[0] : 63] = -1

      if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
        obs = env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])

      new_action = [
        sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
      ]

      # else:
      #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

      obs = env.step(actions=new_action)

      player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
      new_screen = (player_relative == _PLAYER_NEUTRAL).astype(
        int)  #+ path_memory

      player_y, player_x = (
        player_relative == _PLAYER_FRIENDLY).nonzero()
      player = [int(player_x.mean()), int(player_y.mean())]

      if (player[0] > 16):
        new_screen = shift(LEFT, player[0] - 16, new_screen)
      elif (player[0] < 16):
        new_screen = shift(RIGHT, 16 - player[0], new_screen)

      if (player[1] > 16):
        new_screen = shift(UP, player[1] - 16, new_screen)
      elif (player[1] < 16):
        new_screen = shift(DOWN, 16 - player[1], new_screen)

      rew = obs[0].reward

      done = obs[0].step_type == environment.StepType.LAST

      # Store transition in the replay buffer.
      replay_buffer.add(screen, action, rew, new_screen, float(done))
      # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

      screen = new_screen

      episode_rewards[-1] += rew
      reward = episode_rewards[-1]

      if done:
        obs = env.reset()
        player_relative = obs[0].observation["screen"][
          _PLAYER_RELATIVE]

        screen = (player_relative == _PLAYER_NEUTRAL).astype(
          int)  #+ path_memory

        player_y, player_x = (
          player_relative == _PLAYER_FRIENDLY).nonzero()
        player = [int(player_x.mean()), int(player_y.mean())]

        # Select all marines first
        env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])
        episode_rewards.append(0.0)
        #episode_minerals.append(0.0)

        reset = True

      if t > learning_starts and t % train_freq == 0:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:

          experience = replay_buffer.sample(
            batch_size, beta=beta_schedule.value(t))
          (obses_t, actions, rewards, obses_tp1, dones, weights,
           batch_idxes) = experience

          # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
          # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y
        else:

          obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
            batch_size)
          weights, batch_idxes = np.ones_like(rewards), None

          # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
          # weights_y, batch_idxes_y = np.ones_like(rewards_y), None

        td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                          weights)

        # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y)

        if prioritized_replay:
          new_priorities = np.abs(td_errors) + prioritized_replay_eps
          # new_priorities = np.abs(td_errors) + prioritized_replay_eps
          replay_buffer.update_priorities(batch_idxes,
                                          new_priorities)
          # replay_buffer.update_priorities(batch_idxes, new_priorities)

      if t > learning_starts and t % target_network_update_freq == 0:
        # Update target network periodically.
        update_target()
        # update_target_y()

      mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
      num_episodes = len(episode_rewards)
      if done and print_freq is not None and len(
          episode_rewards) % print_freq == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("reward", reward)
        logger.record_tabular("mean 100 episode reward",
                              mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * exploration.value(t)))
        logger.dump_tabular()

      if (checkpoint_freq is not None and t > learning_starts
          and num_episodes > 100 and t % checkpoint_freq == 0):
        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
          if print_freq is not None:
            logger.log(
              "Saving model due to mean reward increase: {} -> {}".
                format(saved_mean_reward, mean_100ep_reward))
          U.save_state(model_file)
          model_saved = True
          saved_mean_reward = mean_100ep_reward
    if model_saved:
      if print_freq is not None:
        logger.log("Restored model with mean reward: {}".format(
          saved_mean_reward))
      U.load_state(model_file)

  return ActWrapper(act)
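The example above repeatedly calls shift(LEFT, n, screen) and friends to keep the marine centred at (16, 16) on the 32x32 screen, but the helper itself is not shown here. A minimal sketch of the behaviour those calls imply, assuming UP/DOWN/LEFT/RIGHT are simple direction tags and vacated rows/columns are zero-filled (the original helper may differ):

import numpy as np

# Hypothetical direction tags; the original module defines its own constants.
UP, DOWN, LEFT, RIGHT = 'up', 'down', 'left', 'right'

def shift(direction, number, matrix):
    """Shift a 2-D screen by `number` cells, zero-filling the vacated edge.

    Sketch of what the calls above appear to need (recentring the screen on
    the marine); not the original implementation.
    """
    matrix = np.asarray(matrix)
    if number <= 0:
        return matrix
    shifted = np.zeros_like(matrix)
    if direction == UP:        # content moves up, bottom rows become zero
        shifted[:-number, :] = matrix[number:, :]
    elif direction == DOWN:    # content moves down, top rows become zero
        shifted[number:, :] = matrix[:-number, :]
    elif direction == LEFT:    # content moves left, right columns become zero
        shifted[:, :-number] = matrix[:, number:]
    elif direction == RIGHT:   # content moves right, left columns become zero
        shifted[:, number:] = matrix[:, :-number]
    return shifted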
Example #32
0
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
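Example #32 is the plain-gym variant of the trainer. A hedged usage sketch, assuming this learn() is importable from a local module (the module name below is a placeholder) and that q_func is built with baselines' deepq.models.mlp:

# Hypothetical driver for the gym-style learn() above (Example #32).
import gym
from baselines import deepq
from my_deepq import learn  # assumption: the module holding the learn() above

def main():
    env = gym.make("CartPole-v0")
    q_func = deepq.models.mlp([64])   # small MLP Q-network
    act = learn(
        env,
        q_func=q_func,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
    )
    act.save("cartpole_model.pkl")    # ActWrapper.save, as used elsewhere above

if __name__ == "__main__":
    main()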
Example #33
0
    def task_train(self):
        self.episode_rewards = [0.0]
        self.episode_steps = [0.0]
        self.saved_mean_reward = None
        obs = self.env.reset()
        reset = True
        with tempfile.TemporaryDirectory() as td:
            model_saved = False
            model_file = os.path.join(td, "model")
            for t in range(self.max_timesteps):
                if self.callback is not None:
                    if self.callback(locals(), globals()):
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(t)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(1. - self.exploration.value(t) + self.exploration.value(t) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                self.episode_rewards[-1] += rew
                self.episode_steps[-1] += 1
                if done:
                    obs = self.env.reset()
                    self.episode_rewards.append(0.0)
                    self.episode_steps.append(0.0)
                    reset = True

                if t > self.learning_starts and t % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(t))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones, weights)
                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                if t > self.learning_starts and t % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target()

                mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 1)
                num_episodes = len(self.episode_rewards)
                if done and self.print_freq is not None and len(self.episode_rewards) % self.print_freq == 0:
                    logger.record_tabular("steps", t)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(t)))
                    logger.dump_tabular()

                if (self.checkpoint_freq is not None and t > self.learning_starts and
                        num_episodes > 100 and t % self.checkpoint_freq == 0):
                    if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward:
                        if self.print_freq is not None:
                            logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                       self.saved_mean_reward, mean_100ep_reward))
                        U.save_state(model_file)
                        model_saved = True
                        self.saved_mean_reward = mean_100ep_reward

                if num_episodes >= self.max_episodes:
                    break

            if model_saved:
                if self.print_freq is not None:
                    logger.log("Restored model with mean reward: {}".format(self.saved_mean_reward))
                U.load_state(model_file)
        return self.act, self.episode_rewards, self.episode_steps
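Several of these examples repeat the same inline expression for the parameter-noise threshold. Pulled out for clarity, it is just the KL threshold that matches eps-greedy exploration; this helper is only a restatement of the expression used above, not part of the original code:

import numpy as np

def adaptive_param_noise_threshold(eps, num_actions):
    """Threshold such that the KL divergence between the perturbed and
    non-perturbed policy matches eps-greedy exploration with the given eps
    (see Plappert et al., 2017, Appendix C.1)."""
    return -np.log(1. - eps + eps / float(num_actions))

# e.g. with eps = 0.5 and 4 actions the threshold is about 0.47
print(adaptive_param_noise_threshold(0.5, 4))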
Example #34
0
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
  """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from the replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of the training. If you do not wish to restore the best version at
    the end of the training set this variable to None.
learning_starts: int
    how many steps of the model to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, it defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If callback returns true, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
  # Create all the functions necessary to train the model

  sess = U.make_session(num_cpu=num_cpu)
  sess.__enter__()

  #def make_obs_ph(name):
  #  return U.BatchInput((64, 64), name=name)
  
  obs_spec = env.observation_spec()[0]
  screen_dim = obs_spec['feature_screen'][1:3]

  def make_obs_ph(name):
    return ObservationInput(Box(low=0.0, high=screen_dim[0], shape=(screen_dim[0],screen_dim[1],1)), name=name)

  act, train, update_target, debug = deepq.build_train(
    make_obs_ph=make_obs_ph,
    q_func=q_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    gamma=gamma,
    grad_norm_clipping=10)
  act_params = {
    'make_obs_ph': make_obs_ph,
    'q_func': q_func,
    'num_actions': num_actions,
  }

  # Create the replay buffer
  if prioritized_replay:
    replay_buffer = PrioritizedReplayBuffer(
      buffer_size, alpha=prioritized_replay_alpha)
    if prioritized_replay_beta_iters is None:
      prioritized_replay_beta_iters = max_timesteps
    beta_schedule = LinearSchedule(
      prioritized_replay_beta_iters,
      initial_p=prioritized_replay_beta0,
      final_p=1.0)
  else:
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
  # Create the schedule for exploration starting from 1.
  exploration = LinearSchedule(
    schedule_timesteps=int(exploration_fraction * max_timesteps),
    initial_p=1.0,
    final_p=exploration_final_eps)

  # Initialize the parameters and copy them to the target network.
  U.initialize()
  update_target()

  episode_rewards = [0.0]
  saved_mean_reward = None

  obs = env.reset()
  # Select all marines first

  player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]

  screen = player_relative

  obs, xy_per_marine = common.init(env, obs)

  group_id = 0
  reset = True
  with tempfile.TemporaryDirectory() as td:
    model_saved = False
    model_file = os.path.join(td, "model")

    for t in range(max_timesteps):
      if callback is not None:
        if callback(locals(), globals()):
          break
      # Take action and update exploration to the newest value
      kwargs = {}
      if not param_noise:
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.
      else:
        update_eps = 0.
        if param_noise_threshold >= 0.:
          update_param_noise_threshold = param_noise_threshold
        else:
          # Compute the threshold such that the KL divergence between perturbed and non-perturbed
          # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
          # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
          # for detailed explanation.
          update_param_noise_threshold = -np.log(
            1. - exploration.value(t) +
            exploration.value(t) / float(num_actions))
        kwargs['reset'] = reset
        kwargs[
          'update_param_noise_threshold'] = update_param_noise_threshold
        kwargs['update_param_noise_scale'] = True

      # custom process for DefeatZerglingsAndBanelings

      obs, screen, player = common.select_marine(env, obs)

      action = act(
        np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
      reset = False
      rew = 0

      new_action = None

      obs, new_action = common.marine_action(env, obs, player, action)
      army_count = env._obs[0].observation.player_common.army_count

      try:
        if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
          obs = env.step(actions=new_action)
        else:
          new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
          obs = env.step(actions=new_action)
      except Exception as e:
        # Swallow failed env actions (e.g. the selected unit no longer exists).
        # print(e)
        pass

      player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
      new_screen = player_relative

      rew += obs[0].reward

      done = obs[0].step_type == environment.StepType.LAST

      selected = obs[0].observation["feature_screen"][_SELECTED]
      player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

      if (len(player_y) > 0):
        player = [int(player_x.mean()), int(player_y.mean())]

      if (len(player) == 2):

        if (player[0] > 32):
          new_screen = common.shift(LEFT, player[0] - 32, new_screen)
        elif (player[0] < 32):
          new_screen = common.shift(RIGHT, 32 - player[0],
                                    new_screen)

        if (player[1] > 32):
          new_screen = common.shift(UP, player[1] - 32, new_screen)
        elif (player[1] < 32):
          new_screen = common.shift(DOWN, 32 - player[1], new_screen)

      # Store transition in the replay buffer.
      replay_buffer.add(screen, action, rew, new_screen, float(done))
      screen = new_screen

      episode_rewards[-1] += rew
      reward = episode_rewards[-1]

      if done:
        print("Episode Reward : %s" % episode_rewards[-1])
        obs = env.reset()
        player_relative = obs[0].observation["feature_screen"][
          _PLAYER_RELATIVE]

        screen = player_relative

        group_list = common.init(env, obs)

        # Select all marines first
        #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
        episode_rewards.append(0.0)

        reset = True

      if t > learning_starts and t % train_freq == 0:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:
          experience = replay_buffer.sample(
            batch_size, beta=beta_schedule.value(t))
          (obses_t, actions, rewards, obses_tp1, dones, weights,
           batch_idxes) = experience
        else:
          obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
            batch_size)
          weights, batch_idxes = np.ones_like(rewards), None
        td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                          weights)
        if prioritized_replay:
          new_priorities = np.abs(td_errors) + prioritized_replay_eps
          replay_buffer.update_priorities(batch_idxes,
                                          new_priorities)

      if t > learning_starts and t % target_network_update_freq == 0:
        # Update target network periodically.
        update_target()

      mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
      num_episodes = len(episode_rewards)
      if done and print_freq is not None and len(
          episode_rewards) % print_freq == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("reward", reward)
        logger.record_tabular("mean 100 episode reward",
                              mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * exploration.value(t)))
        logger.dump_tabular()

      if (checkpoint_freq is not None and t > learning_starts
          and num_episodes > 100 and t % checkpoint_freq == 0):
        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
          if print_freq is not None:
            logger.log(
              "Saving model due to mean reward increase: {} -> {}".
                format(saved_mean_reward, mean_100ep_reward))
          U.save_state(model_file)
          model_saved = True
          saved_mean_reward = mean_100ep_reward
    if model_saved:
      if print_freq is not None:
        logger.log("Restored model with mean reward: {}".format(
          saved_mean_reward))
      U.load_state(model_file)

  return ActWrapper(act)
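Every example above repeats the same "save on mean-reward improvement, restore the best at the end" pattern around U.save_state / U.load_state. A small sketch of that pattern factored into a helper, as an illustration rather than part of any of the original snippets:

import baselines.common.tf_util as U
from baselines import logger

class BestModelCheckpoint(object):
    """Track the best mean reward and keep the matching checkpoint on disk."""

    def __init__(self, model_file):
        self.model_file = model_file
        self.best_mean_reward = None
        self.saved = False

    def update(self, mean_reward):
        """Save the current session if `mean_reward` beats the best so far."""
        if self.best_mean_reward is None or mean_reward > self.best_mean_reward:
            logger.log("Saving model due to mean reward increase: {} -> {}".format(
                self.best_mean_reward, mean_reward))
            U.save_state(self.model_file)
            self.best_mean_reward = mean_reward
            self.saved = True

    def restore_best(self):
        """Restore the best checkpoint, mirroring the end-of-training blocks above."""
        if self.saved:
            logger.log("Restored model with mean reward: {}".format(self.best_mean_reward))
            U.load_state(self.model_file)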
Example #35
0
def train(env,
        eval_env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None,
        my_skill_set=None,
        log_dir = None,
        num_eval_episodes=10,
        render=False,
        render_eval = False,
        commit_for = 1
        ):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model


    if my_skill_set:
        assert commit_for >= 1, "commit_for must be >= 1"

    save_idx = 0
    with U.single_threaded_session() as sess:
    

        ## restore
        if my_skill_set:
            action_shape = my_skill_set.len
        else:
            action_shape = env.action_space.n
            
        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        observation_space_shape = env.observation_space.shape
        def make_obs_ph(name):
            return U.BatchInput(observation_space_shape, name=name)

        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=action_shape,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            gamma=gamma,
            grad_norm_clipping=10,
            param_noise=param_noise
        )

        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': action_shape,
        }

        act = ActWrapper(act, act_params)

        # Create the replay buffer
        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = max_timesteps
            beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                           initial_p=prioritized_replay_beta0,
                                           final_p=1.0)
        else:
            replay_buffer = ReplayBuffer(buffer_size)
            beta_schedule = None
        # Create the schedule for exploration starting from 1.
        exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        # sess.run(tf.variables_initializer(new_variables))
        # sess.run(tf.global_variables_initializer())
        update_target()

        if my_skill_set:
            ## restore skills
            my_skill_set.restore_skillset(sess=sess)
            

        episode_rewards = [0.0]
        saved_mean_reward = None
        obs = env.reset()
        reset = True
        
        model_saved = False
        
        model_file = os.path.join(log_dir, "model", "deepq")

        # save the initial act model 
        print("Saving the starting model")
        os.makedirs(os.path.dirname(model_file), exist_ok=True)
        act.save(model_file + '.pkl')

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            paction = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            
            if(my_skill_set):
                skill_obs = obs.copy()
                primitive_id = paction
                rew = 0.
                for _ in range(commit_for):
                
                    ## break actions into primitives and their params    
                    action = my_skill_set.pi(primitive_id=primitive_id, obs = skill_obs.copy(), primitive_params=None)
                    new_obs, skill_rew, done, _ = env.step(action)
                    if render:
                        # print(action)
                        env.render()
                        sleep(0.1)
                    rew += skill_rew
                    skill_obs = new_obs
                    terminate_skill = my_skill_set.termination(new_obs)
                    if done or terminate_skill:
                        break
                    
            else:
                action= paction

                env_action = action
                reset = False
                new_obs, rew, done, _ = env.step(env_action)
                if render:
                    env.render()
                    sleep(0.1)
              


            # Store transition in the replay buffer for the outer env
            replay_buffer.add(obs, paction, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True
                print("Time:%d, episodes:%d"%(t,len(episode_rewards)))

                # add hindsight experience
            

            if t > learning_starts and t % train_freq == 0:
                # print('Training!')
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # print(len(episode_rewards), episode_rewards[-11:-1])
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
        
            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 50 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    act.save(model_file + '%d.pkl'%save_idx)
                    save_idx += 1
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
                # else:
                #     print(saved_mean_reward, mean_100ep_reward)

            if (eval_env is not None) and t > learning_starts and t % target_network_update_freq == 0:
                
                # dumping other stats
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("%d time spent exploring", int(100 * exploration.value(t)))

                print("Testing!")
                eval_episode_rewards = []
                eval_episode_successes = []

                for i in range(num_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while(not eval_done):
                        eval_paction = act(np.array(eval_obs)[None])[0]
                        
                        if(my_skill_set):
                            eval_skill_obs = eval_obs.copy()
                            eval_primitive_id = eval_paction
                            eval_r = 0.
                            for _ in range(commit_for):
                            
                                ## break actions into primitives and their params    
                                eval_action, _ = my_skill_set.pi(primitive_id=eval_primitive_id, obs = eval_skill_obs.copy(), primitive_params=None)
                                eval_new_obs, eval_skill_rew, eval_done, eval_info = eval_env.step(eval_action)
                                # print('env reward:%f'%eval_skill_rew)
                                if render_eval:
                                    print("Render!")
                                    
                                    eval_env.render()
                                    print("rendered!")

                                eval_r += eval_skill_rew
                                eval_skill_obs = eval_new_obs
                                
                                eval_terminate_skill = my_skill_set.termination(eval_new_obs)

                                if eval_done or eval_terminate_skill:
                                    break
                                
                        else:
                            eval_action= eval_paction

                            env_action = eval_action
                            reset = False
                            eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(env_action)
                            if render_eval:
                                # print("Render!")
                                
                                eval_env.render()
                                # print("rendered!")


                        
                        eval_episode_reward += eval_r
                        # print("eval_r:%f, eval_episode_reward:%f"%(eval_r, eval_episode_reward))
                        eval_obs = eval_new_obs
                        
                    eval_episode_success = (eval_info["done"]=="goal reached")
                    if(eval_episode_success):
                        logger.info("success, training epoch:%d,starting config:"%t)


                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_successes.append(eval_episode_success)

                combined_stats = {}

                # print(eval_episode_successes, np.mean(eval_episode_successes))
                combined_stats['eval/return'] = normal_mean(eval_episode_rewards)
                combined_stats['eval/success'] = normal_mean(eval_episode_successes)
                combined_stats['eval/episodes'] = (len(eval_episode_rewards))

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                
                print("dumping the stats!")
                logger.dump_tabular()

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
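Example #35 aggregates its evaluation stats with normal_mean, which is not defined in the snippet. Presumably it is just a mean that tolerates empty lists; a sketch of that assumed behaviour, shown only so the snippet reads as self-contained:

import numpy as np

def normal_mean(values):
    """Plain mean that returns 0.0 for an empty sequence (assumed behaviour of
    the undefined `normal_mean` used in Example #35)."""
    return float(np.mean(values)) if len(values) > 0 else 0.0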
Example #36
0
def learn(
        env,
        q_func,  # takes obs, num of actions, scope, etc. and returns a Q-value for each action
        num_actions=16,  # available actions: up down left right
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,  # size of the replay buffer
        exploration_fraction=0.1,  # during the first 10% training period, exploration rate is decreased from 1 to 0.02
        exploration_final_eps=0.02,  # final value of random action probability
        train_freq=1,  # update the model every `train_freq` steps.
        batch_size=32,  # size of a batched sampled from replay buffer for training
        print_freq=1,
        checkpoint_freq=10000,
        learning_starts=1000,  # time for the model to collect transitions before learning starts
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,  # if None, beta is annealed over max_timesteps
        prioritized_replay_eps=1e-6,
        num_cpu=16,  # number of cpus to use for training
        param_noise=False,  # whether or not to use parameter space noise
        param_noise_threshold=0.05,
        callback=None):
    """Train a deepq model.

  Parameters
  ----------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from the replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of training. If you do not wish to restore the best version at
      the end of training, set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None equals to max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If the callback returns true, training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(
        name
    ):  # Creates a placeholder for a batch of tensors of a given shape and dtype
        return U_b.BatchInput((16, 16), name=name)

    act_x, train_x, update_target_x, debug_x = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,  #   clip gradient norms to this value
        scope="deepq_x")

    act_y, train_y, update_target_y, debug_y = deepq.build_train(  # second Q-network: the (x, y) move target is decomposed into two independent choices
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq_y")

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_x = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        replay_buffer_y = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule_x = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,  # 0.4->1
            final_p=1.0)

        beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)
    else:
        replay_buffer_x = ReplayBuffer(buffer_size)
        replay_buffer_y = ReplayBuffer(buffer_size)

        beta_schedule_x = None
        beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
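    # With the defaults (max_timesteps=100000, exploration_fraction=0.1), epsilon is
    # annealed linearly from 1.0 down to exploration_final_eps over the first 10,000
    # steps and then held constant.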

    # Initialize the parameters and copy them to the target networks.
    U.initialize()
    update_target_x()
    update_target_y()
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()  # start a new episode

    # Select all marines first (select every unit and get a fresh observation).
    obs = env.step(actions=[
        sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
    ])  # Apply actions, step the world forward, and return observations.

    # The player_relative screen layer marks friendly units with 1 and minerals
    # (neutral units) with 3; this is the matrix printed to the terminal.
    player_relative = obs[0].observation["feature_screen"][
        _PLAYER_RELATIVE]  # obs[0] is a TimeStep: ('step_type', 'reward', 'discount', 'observation')
    # Mineral locations as a 0/1 matrix.
    screen = (player_relative == _PLAYER_NEUTRAL).astype(
        int)  # + path_memory; 1 where a mineral shard sits, 0 elsewhere
    # Friendly unit locations as row/column indices (row <-> y, col <-> x).
    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()

    # print(player_relative)
    # print('*************')
    # print(screen)
    # print(_PLAYER_FRIENDLY)
    #
    # print(player_x)
    # print(player_y)
    # print('ssss)

    # if (len(player_x) == 0):
    #   player_x = np.array([0])
    #   # print('player_x from null to 0')
    #   # print(player_x)
    # if (len(player_y) == 0):
    #   player_y = np.array([0])
    #   # print('player_y from null to 0')
    #   # print(player_y)

    player = [int(player_x.mean()), int(player_y.mean())]
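    # player is the centroid of the friendly marines; in this script it is only used
    # by the commented-out alternative below, since the move target comes from the
    # two Q-heads rather than from the unit position.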

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "mineral_shards")  # path where the model checkpoint is saved
        print(model_file)

        for t in range(max_timesteps):
            # print('timestep=',t)
            if callback is not None:
                if callback(locals(), globals()):
                    break

            # Take an action and update exploration to its newest value.
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)  # annealed from 1.0 down to exploration_final_eps (0.02 by default)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # actions obtained after exploration
            action_x = act_x(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            # print('action_x is ',action_x)

            action_y = act_y(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            # print('action_y is ',action_y)
            reset = False

            # coord = [player[0], player[1]]
            rew = 0  #reward

            coord = [action_x, action_y]
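            # The two independent per-axis choices (each an index in [0, num_actions))
            # together define a single screen coordinate for the Move_screen call below.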

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
            # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

            new_action = [
                sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
            ]

            # else:
            #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["feature_screen"][
                _PLAYER_RELATIVE]
            # print(player_relative)
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

            # print(_PLAYER_FRIENDLY)

            # print(player_x)
            # print(player_y)
            # print('ssssss2')

            # if (len(player_x) == 0):
            #   player_x = np.array([0])
            #   # print('player_x from null to 0')
            #   # print(player_x)
            # if (len(player_y) == 0):
            #   player_y = np.array([0])
            #   # print('player_y from null to 0')
            #   # print(player_y)

            # player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward

            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
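            # Both buffers store the same (screen, reward, next screen, done) transition;
            # they differ only in which action component (x vs. y) they record.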
            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                # player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
                # screent = (player_relative == _PLAYER_NEUTRAL).astype(int)
                #
                # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                # player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                # print("episode_rewards is ", episode_rewards)
                print('num_episodes is', len(episode_rewards))

                #episode_minerals.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:  #train_freq=1: update the model every `train_freq` steps
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:

                    experience_x = replay_buffer_x.sample(
                        batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x,
                     weights_x, batch_idxes_x) = experience_x

                    experience_y = replay_buffer_y.sample(
                        batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                     weights_y, batch_idxes_y) = experience_y
                else:

                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(
                        batch_size)
                    weights_x, batch_idxes_x = np.ones_like(
                        rewards_x
                    ), None  # weights_x is an array of ones with the same shape as rewards_x (uniform weights)

                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(
                        batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(obses_t_x, actions_x, rewards_x,
                                      obses_tp1_x, dones_x, weights_x)

                td_errors_y = train_y(obses_t_y, actions_y, rewards_y,
                                      obses_tp1_y, dones_y, weights_y)

                if prioritized_replay:
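                    # Proportional prioritization: priority = |TD error| + prioritized_replay_eps,
                    # so every transition keeps a non-zero chance of being sampled again.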
                    new_priorities_x = np.abs(
                        td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(
                        td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x,
                                                      new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y,
                                                      new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_x()
                update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]),
                                      1)  # rounded to one decimal place
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act_x), ActWrapper(act_y)
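A minimal sketch of how the two returned wrappers might be used greedily at evaluation time. It assumes, as in baselines' deepq, that ActWrapper forwards calls to the wrapped act function and accepts the same update_eps keyword; environment setup and the rollout loop are omitted:

import numpy as np

def greedy_coord(act_x, act_y, screen):
    # Disable epsilon-greedy exploration and take the argmax action on each axis;
    # the pair of indices is the (x, y) target for Move_screen.
    x = act_x(np.array(screen)[None], update_eps=0.0)[0]
    y = act_y(np.array(screen)[None], update_eps=0.0)[0]
    return [int(x), int(y)]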
Example #37
0
def learn(
        env,
        sess,
        actor,
        critic,
        replay_buffer,
        action_noise,
        num_exploring,
        max_timesteps=100000,
        train_freq=1,  #1
        batch_size=32,  #32
        print_freq=1,
        save_freq=10000,  #10000
        gamma=1.0,
        target_network_update_freq=1,  #500,
        num_agents=9,
        output_len=4,
        # num_baneling=4,
        # num_zergling=6,
        # unit_flag_friend=0.4, #48
        # unit_flag_baneling=0.7,  #9
        # unit_flag_zergling=1,  #105
        action_low=-1,
        action_high=1):
    #   """Train a deepq model.
    #
    # Parameters
    # -------
    # env: pysc2.env.SC2Env
    #     environment to train on
    # q_func: (tf.Variable, int, str, bool) -> tf.Variable
    #     the model that takes the following inputs:
    #         observation_in: object
    #             the output of observation placeholder
    #         num_actions: int
    #             number of actions
    #         scope: str
    #         reuse: bool
    #             should be passed to outer variable scope
    #     and returns a tensor of shape (batch_size, num_actions) with values of every action.
    # lr: float
    #     learning rate for adam optimizer
    # max_timesteps: int
    #     number of env steps to optimizer for
    # buffer_size: int
    #     size of the replay buffer
    # train_freq: int
    #     update the model every `train_freq` steps.
    #     set to None to disable printing
    # batch_size: int
    #     size of a batched sampled from replay buffer for training
    # print_freq: int
    #     how often to print out training progress
    #     set to None to disable printing
    # checkpoint_freq: int
    #     how often to save the model. This is so that the best version is restored
    #     at the end of the training. If you do not wish to restore the best version at
    #     the end of the training set this variable to None.
    # learning_starts: int
    #     how many steps of the model to collect transitions for before learning starts
    # gamma: float
    #     discount factor
    # target_network_update_freq: int
    #     update the target network every `target_network_update_freq` steps.
    # num_cpu: int
    #     number of cpus to use for training
    # callback: (locals, globals) -> None
    #     function called at every steps with state of the algorithm.
    #     If callback returns true training stops.
    #
    # Returns
    # -------
    # act: ActWrapper
    #     Wrapper over act function. Adds ability to save it and load it.
    #     See
    #
    #
    #     of baselines/deepq/categorical.py for details on the act function.
    # """
    #   # Create all the functions necessary to train the model
    #
    # tf.reset_default_graph()
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # sess = tf.Session(config = config)
    # sess.__enter__()

    obs = env.reset()
    action_noise.reset()
    episode_rewards = [0.0]
    obs, _ = common_group.init(env, obs)
    # model_file_load = os.path.join(str(40000) + "_" + "model_segment_training/", "defeat_zerglings")
    # U.load_state(model_file_load, sess)
    U.initialize()
    min = 5  # squared-distance threshold below which a mineral counts as collected (shadows the builtin `min`)
    punish = -0.01  # per-step time penalty coefficient
    eps_time = 1  # step counter within the current episode

    # Build screen_expand: the player_relative layer with the last 9 rows dropped,
    # values divided by 3, then replicated once per agent.
    player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
    screen = np.zeros((player_relative.shape[0] - 9, player_relative.shape[1]))
    for i in range(player_relative.shape[0] - 9):
        for j in range(player_relative.shape[1]):
            screen[i, j] = round(player_relative[i, j] / 3, 1)
    screen_expand = screenConcat(screen, num_agents)
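    # screenConcat is not shown in this snippet; presumably it replicates the normalized
    # screen once per agent so screen_expand carries a leading num_agents dimension
    # matching the actor/critic inputs.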

    # Select the army so that MOVE_SCREEN becomes available.
    obs = env.step(
        actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()

    # Make sure player_x / player_y contain two entries, one position per agent.
    # The "no agents" case should not normally occur, since the map has no opposing forces.
    if (len(player_x) == 0):
        player_x = np.array([0])
        player_y = np.array([0])

    # Case where both agents overlap at the same point.
    if (len(player_x) == 1):
        player_x = np.append(player_x, player_x[0])
        player_y = np.append(player_y, player_y[0])

    pos_agent1_target = [player_x[0], player_y[0]]
    pos_agent2_target = [player_x[1], player_y[1]]

    with tempfile.TemporaryDirectory() as td:

        for t in range(max_timesteps):
            startTime = datetime.datetime.now()
            # Feed the observation to the actor to get per-agent actions.
            screen_input = np.expand_dims(screen_expand, axis=0)

            action = actor.predict(screen_input)[0]  # (2, 4)
            rnn_out = actor.rnn_out_pre(screen_input)
            # action[0] = MaxMinNormalization(action[0], getMax(action[0]), getMin(action[0]))
            # action[1] = MaxMinNormalization(action[1], getMax(action[1]), getMin(action[1]))
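            # Exploration: add time-indexed action noise, then clip the result back into
            # [action_low, action_high].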
            act_with_noise = np.clip(
                action + action_noise.get_noise(t - num_exploring), action_low,
                action_high)
            act_prob = (act_with_noise + 1) / 2  # map the clipped actions from [-1, 1] to [0, 1]
            # act_prob_sum = act_prob.sum(axis=1)
            act_index = [0, 1, 2, 3]
            # if(act_prob_sum[0] == 0):
            #   prob = (np.array(act_prob[0]) + 1) / len(act_prob[0])
            # else:
            #   prob = act_prob[0]/act_prob_sum[0]
            #
            # a1 = np.random.choice(np.array(act_index), p=prob.ravel())
            #
            # if (act_prob_sum[1] == 0):
            #   prob = (np.array(act_prob[1]) + 1) / len(act_prob[1])
            # else:
            #   prob = act_prob[1] / act_prob_sum[1]
            #
            # a2 = np.random.choice(np.array(act_index), p=prob.ravel())
            # a1 = act_with_noise[0]
            # a2 = act_with_noise[1]
            # Pick the highest-probability action for each agent (greedy argmax).
            a1 = int(np.argmax(act_prob[0]))
            a2 = int(np.argmax(act_prob[1]))

            # Execute the chosen actions in the environment.
            pos_agent1 = [player_x[0], player_y[0]]
            pos_agent2 = [player_x[1], player_y[1]]

            diff_1toTarget1 = (pos_agent1_target[0] - pos_agent1[0]) * (
                pos_agent1_target[0] -
                pos_agent1[0]) + (pos_agent1_target[1] - pos_agent1[1]) * (
                    pos_agent1_target[1] - pos_agent1[1])
            diff_2toTarget1 = (pos_agent1_target[0] - pos_agent2[0]) * (
                pos_agent1_target[0] -
                pos_agent2[0]) + (pos_agent1_target[1] - pos_agent2[1]) * (
                    pos_agent1_target[1] - pos_agent2[1])

            diff_1toTarget2 = (pos_agent2_target[0] - pos_agent1[0]) * (
                pos_agent2_target[0] -
                pos_agent1[0]) + (pos_agent2_target[1] - pos_agent1[1]) * (
                    pos_agent2_target[1] - pos_agent1[1])
            diff_2toTarget2 = (pos_agent2_target[0] - pos_agent2[0]) * (
                pos_agent2_target[0] -
                pos_agent2[0]) + (pos_agent2_target[1] - pos_agent2[1]) * (
                    pos_agent2_target[1] - pos_agent2[1])
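            # If the first detected unit is now closer to target 2 and the second one is
            # closer to target 1, the ordering returned by nonzero() has flipped, so swap
            # the detected positions to keep the agent/target pairing consistent.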

            if ((diff_1toTarget1 > diff_2toTarget1)
                    and (diff_1toTarget2 < diff_2toTarget2)):
                pos_agent1 = [player_x[1], player_y[1]]
                pos_agent2 = [player_x[0], player_y[0]]

            # Penalize an agent that is already at the map edge and keeps moving toward it.
            pos_agent1_target, punish_1 = obtainTargetPos(a1, pos_agent1)
            pos_agent2_target, punish_2 = obtainTargetPos(a2, pos_agent2)

            player_relative_old = obs[0].observation["feature_screen"][
                _PLAYER_RELATIVE]
            mineral_y_old, mineral_x_old = (
                player_relative_old == _PLAYER_NEUTRAL).nonzero()
            if (len(mineral_x_old) == 0):
                mineral_x_old = np.array([0])
                mineral_y_old = np.array([0])

            obs = env.step_rewrite(actions=[
                sc2_actions.FunctionCall(_SELECT_POINT,
                                         [_SELECT_POINT_ACT, pos_agent1])
            ])
            obs = env.step_rewrite(actions=[
                sc2_actions.FunctionCall(_MOVE_SCREEN,
                                         [_NOT_QUEUED, pos_agent1_target])
            ])
            obs = env.step_rewrite(actions=[
                sc2_actions.FunctionCall(_SELECT_POINT,
                                         [_SELECT_POINT_ACT, pos_agent2])
            ])
            obs = env.step_rewrite(actions=[
                sc2_actions.FunctionCall(_MOVE_SCREEN,
                                         [_NOT_QUEUED, pos_agent2_target])
            ])
            obs = env._step()
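            # env.step_rewrite / env._step appear to be custom extensions of the pysc2 env:
            # each step_rewrite presumably queues one action (select an agent, then order
            # its move) without advancing the simulation, and _step() finally advances the
            # game with all queued actions applied.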

            flag_end = obs[0].step_type == environment.StepType.LAST
            rew = obs[0].reward
            # Get the new observation.
            player_relative = obs[0].observation["feature_screen"][
                _PLAYER_RELATIVE]
            new_screen = np.zeros(
                (player_relative.shape[0] - 9, player_relative.shape[1]))
            for i in range(player_relative.shape[0] - 9):
                for j in range(player_relative.shape[1]):
                    new_screen[i, j] = round(player_relative[i, j] / 3, 1)
            new_screen_expand = screenConcat(new_screen, num_agents)

            # Locate the agents in the new observation (used for the mineral-distance check below).
            player_y, player_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()

            # Make sure player_x / player_y contain two entries, one position per agent.
            # The "no agents" case should not normally occur, since the map has no opposing forces.
            if (len(player_x) == 0):
                player_x = np.array([0])
                player_y = np.array([0])

            # Case where both agents overlap at the same point.
            if (len(player_x) == 1):
                player_x = np.append(player_x, player_x[0])
                player_y = np.append(player_y, player_y[0])

            # Squared distance from each agent to the minerals seen before the step; if it
            # drops below `min`, treat that agent as having collected the mineral.
            reward_dist_a1 = False
            reward_dist_a2 = False
            for i in range(len(mineral_x_old)):
                dist_agent1 = (mineral_x_old[i] - player_x[0]) * (
                    mineral_x_old[i] - player_x[0]) + (
                        mineral_y_old[i] - player_y[0]) * (mineral_y_old[i] -
                                                           player_y[0])
                dist_agent2 = (mineral_x_old[i] - player_x[1]) * (
                    mineral_x_old[i] - player_x[1]) + (
                        mineral_y_old[i] - player_y[1]) * (mineral_y_old[i] -
                                                           player_y[1])
                if (dist_agent1 < min and rew > 0):
                    reward_dist_a1 = True
                    break
                if (dist_agent2 < min and rew > 0):
                    reward_dist_a2 = True
                    break

            # Assign per-agent rewards based on the agent-to-mineral distances before and after the step.
            rew_expand = np.zeros((num_agents, 1))

            #collect mineral reward
            if (reward_dist_a1 and rew > 0):
                rew_expand[0] = rew
            if (reward_dist_a2 and rew > 0):
                rew_expand[1] = rew

            # if(reward_dist_a1 or reward_dist_a2 or rew==1):
            #   rew_expand[0] += rew*10
            #   rew_expand[1] += rew*10

            # Time penalty that grows with episode length, plus a larger penalty for hitting the map edge.
            if punish_1:
                rew_expand[0] += -10  # penalty for running into the map edge
            rew_expand[0] += punish * eps_time

            if punish_2:
                rew_expand[1] += -10
            rew_expand[1] += punish * eps_time

            # if (punish_1 or punish_2):
            #   rew_expand[0] += -10
            #   rew_expand[1] += -10

            replay_buffer.add(screen_expand, act_with_noise, rew_expand,
                              flag_end, new_screen_expand)

            episode_rewards[-1] += rew  # rew.sum(axis=0)

            # The new observation becomes the current one.
            screen_expand = new_screen_expand
            eps_time += 1

            if (flag_end):
                eps_time = 1
                reward = episode_rewards[-1]
                print("Episode Reward : %s" % reward)
                obs = env.reset()
                action_noise.reset()
                print('num_episodes is', len(episode_rewards))
                episode_rewards.append(0.0)

                # Get the initial observation for the new episode.
                player_relative = obs[0].observation["feature_screen"][
                    _PLAYER_RELATIVE]
                screen = np.zeros(
                    (player_relative.shape[0] - 9, player_relative.shape[1]))
                for i in range(player_relative.shape[0] - 9):
                    for j in range(player_relative.shape[1]):
                        screen[i, j] = round(player_relative[i, j] / 3, 1)
                screen_expand = screenConcat(screen, num_agents)

                # Select all units again.
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])

                # Re-derive the agents' initial positions; they serve as the first movement targets.
                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()

                # Make sure player_x / player_y contain two entries, one per agent.
                if (len(player_x) == 0):
                    player_x = np.array([0])
                    player_y = np.array([0])

                if (len(player_x) == 1):
                    player_x = np.append(player_x, player_x[0])
                    player_y = np.append(player_y, player_y[0])

                pos_agent1_target = [player_x[0], player_y[0]]
                pos_agent2_target = [player_x[1], player_y[1]]

            if (t > num_exploring) and (t % train_freq
                                        == 0):  #t % train_freq == 0:
                # trainStartTime = datetime.datetime.now()
                print("training starts")
                s_batch, a_batch, r_batch, done_batch, s2_batch = replay_buffer.sample_batch(
                    batch_size
                )  #[group0:[batch_size, trace.dimension], group1, ... group8]
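                # DDPG-style target: y = r + gamma * Q'(s', mu'(s')); note that the sampled
                # done flags (done_batch) are not used below to mask the bootstrap term.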
                target_q = r_batch + gamma * critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                rnn_c_out = critic.predict_target_rnn(
                    s2_batch, actor.predict_target(s2_batch))
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(target_q, (batch_size, num_agents, output_len)))
                a_outs = actor.predict(s_batch)  # actions the current actor would pick for s_batch (without the stored exploration noise)
                grads = critic.action_gradients(s_batch,
                                                a_outs)  # dQ/da evaluated at the actor's actions
                actor.train(s_batch, grads)  # chains through da/dtheta to apply the policy gradient

            if (t > num_exploring) and (
                    t % target_network_update_freq
                    == 0):  #t % target_network_update_freq == 0:
                actor.update_target_network()
                critic.update_target_network()

            if (t > num_exploring) and ((t - num_exploring) % save_freq == 0):
                # saveStartTime = datetime.datetime.now()
                model_file_save = os.path.join(
                    str(t) + "_" + "model_segment_training2/",
                    "defeat_zerglings")
                U.save_state(model_file_save)
                replay_buffer.save()

            elif (t == max_timesteps - 1):
                model_file_save = os.path.join(
                    str(t) + "_" + "model_segment_training2/",
                    "defeat_zerglings")
                U.save_state(model_file_save)

            # mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if flag_end and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                # logger.record_tabular("mean 100 episode reward",
                #                       mean_100ep_reward)
                logger.dump_tabular()

            endTime = datetime.datetime.now()
            time_used = str(endTime - startTime)
            print("t = %d, time used = %s" % (t, time_used))