Example #1
    def init_demo_buffer(self, demoDataFile, update_stats=True): #function that initializes the demo buffer

        demoData = np.load(demoDataFile) #load the demonstration data from data file
        info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')]
        info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys]

        demo_data_obs = demoData['obs']
        demo_data_acs = demoData['acs']
        demo_data_info = demoData['info']

        for epsd in range(self.num_demo): # we initialize the whole demo buffer at the start of the training
            obs, acts, goals, achieved_goals = [], [], [], []
            i = 0
            for transition in range(self.T - 1):
                obs.append([demo_data_obs[epsd][transition].get('observation')])
                acts.append([demo_data_acs[epsd][transition]])
                goals.append([demo_data_obs[epsd][transition].get('desired_goal')])
                achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')])
                for idx, key in enumerate(info_keys):
                    info_values[idx][transition, i] = demo_data_info[epsd][transition][key]


            obs.append([demo_data_obs[epsd][self.T - 1].get('observation')])
            achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')])

            episode = dict(o=obs,
                           u=acts,
                           g=goals,
                           ag=achieved_goals)
            for key, value in zip(info_keys, info_values):
                episode['info_{}'.format(key)] = value

            episode = convert_episode_to_batch_major(episode)
            global DEMO_BUFFER
            DEMO_BUFFER.store_episode(episode) # create the observation dict and append them into the demonstration buffer
            logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size

            if update_stats:
                # add transitions to normalizer to normalize the demo data as well
                episode['o_2'] = episode['o'][:, 1:, :]
                episode['ag_2'] = episode['ag'][:, 1:, :]
                num_normalizing_transitions = transitions_in_episode_batch(episode)
                transitions = self.sample_transitions(episode, num_normalizing_transitions)

                o, g, ag = transitions['o'], transitions['g'], transitions['ag']
                transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
                # No need to preprocess the o_2 and g_2 since this is only used for stats

                self.o_stats.update(transitions['o'])
                self.g_stats.update(transitions['g'])

                self.o_stats.recompute_stats()
                self.g_stats.recompute_stats()
            episode.clear()

        logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size
Example #2
def display_var_info(vars):
    from baselines import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue
        count_params += np.prod(v.shape.as_list())
        if "/b:" in name: continue    # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info("    %s%s%s" % (name, " "*(55-len(name)), str(v.shape)))
    logger.info("Total model parameters: %0.1f million" % (count_params*1e-6))
Example #3
def get_target_updates(vars, target_vars, tau):
    logger.info('setting up target updates ...')
    soft_updates = []
    init_updates = []
    assert len(vars) == len(target_vars)
    for var, target_var in zip(vars, target_vars):
        logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    assert len(init_updates) == len(vars)
    assert len(soft_updates) == len(vars)
    return tf.group(*init_updates), tf.group(*soft_updates)
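
A sketch of how the returned ops are typically driven; actor_vars, target_actor_vars and sess are assumed to exist in the calling code.

init_update, soft_update = get_target_updates(actor_vars, target_actor_vars, tau=0.001)
sess.run(init_update)  # hard copy: target <- main, once at initialization
sess.run(soft_update)  # soft update: target <- (1 - tau) * target + tau * main, after each train step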
Example #4
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev):
    assert len(actor.vars) == len(perturbed_actor.vars)
    assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    updates = []
    for var, perturbed_var in zip(actor.vars, perturbed_actor.vars):
        if var in actor.perturbable_vars:
            logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
        else:
            logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(actor.vars)
    return tf.group(*updates)
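
A hedged usage sketch: the ops are rebuilt whenever the adaptive noise scale changes and then run to refresh the perturbed copy; actor, perturbed_actor and sess are assumed to exist.

perturb_ops = get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev=0.2)
sess.run(perturb_ops)  # perturbable weights receive Gaussian noise, the rest are copied verbatim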
Example #5
    def step_wait(self):
        obs, rews, dones, infos = self.venv.step_wait()

        self.step_id += 1
        if self.recording:
            self.video_recorder.capture_frame()
            self.recorded_frames += 1
            if self.recorded_frames > self.video_length:
                logger.info("Saving video to ", self.video_recorder.path)
                self.close_video_recorder()
        elif self._video_enabled():
            self.start_video_recorder()

        return obs, rews, dones, infos
Example #6
    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure a separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
Example #7
    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)
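
The optimizer setup above depends on imports that sit outside the excerpt; a plausible header, assuming the usual baselines DDPG module layout (the exact import list is an assumption).

from functools import reduce

import tensorflow as tf
import tensorflow.contrib as tc

from baselines import logger
from baselines.common.mpi_adam import MpiAdam
import baselines.common.tf_util as U
# normalize(x, stats) is assumed to be a module-local helper, as in baselines' DDPG.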
Example #8
def train(policy, rollout_worker, evaluator,
          n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_policies, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    logger.info("Training...")
    best_success_rate = -1
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]
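
The loop above averages logged values across workers with an mpi_average helper that is not shown; a minimal sketch, assuming mpi4py (the real baselines helper may differ in detail).

import numpy as np
from mpi4py import MPI

def mpi_average(value):
    # Average a scalar or a list of scalars over all MPI workers.
    if not isinstance(value, list):
        value = [value]
    if not value:
        value = [0.]
    local = np.array(value, dtype=np.float64)
    total = np.zeros_like(local)
    MPI.COMM_WORLD.Allreduce(local, total, op=MPI.SUM)
    return float((total / MPI.COMM_WORLD.Get_size()).mean())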
Example #9
    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)
Example #10
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target', **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
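
A sketch of how the target-network ops built above are usually exposed on the agent; the helper names are illustrative, assuming the class keeps a reference to the session.

    def _init_target_net(self):
        # One-time hard copy of the main networks into the target networks.
        self.sess.run(self.init_target_net_op)

    def update_target_net(self):
        # Polyak-averaged update: target <- polyak * target + (1 - polyak) * main.
        self.sess.run(self.update_target_net_op)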
Example #11
def run_baselines(env, seed, log_dir):
    """
    Create baselines model and training.

    Replace the ddpg and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trail.
    :param log_dir: Log dir path.
    :return
    """
    rank = MPI.COMM_WORLD.Get_rank()
    seed = seed + 1000000 * rank
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir)
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        rank, seed, baselines_logger.get_dir()))

    # Set up params for baselines ddpg
    nb_actions = env.action_space.shape[-1]
    layer_norm = False

    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(params["sigma"]) * np.ones(nb_actions))
    memory = Memory(
        limit=params["replay_buffer_size"],
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    training.train(
        env=env,
        eval_env=None,
        param_noise=None,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        nb_epochs=params["n_epochs"],
        nb_epoch_cycles=params["n_epoch_cycles"],
        render_eval=False,
        reward_scale=1.,
        render=False,
        normalize_returns=False,
        normalize_observations=False,
        critic_l2_reg=0,
        actor_lr=params["policy_lr"],
        critic_lr=params["qf_lr"],
        popart=False,
        gamma=params["discount"],
        clip_norm=None,
        nb_train_steps=params["n_train_steps"],
        nb_rollout_steps=params["n_rollout_steps"],
        nb_eval_steps=100,
        batch_size=64)

    return osp.join(log_dir, "progress.csv")
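
run_baselines reads its hyperparameters from a module-level params dict that is not shown; an illustrative set of entries (all numbers are assumptions, not values from the original code).

params = {
    "sigma": 0.2,                    # OU action-noise scale
    "replay_buffer_size": int(1e6),
    "n_epochs": 500,
    "n_epoch_cycles": 20,
    "policy_lr": 1e-4,
    "qf_lr": 1e-3,
    "discount": 0.99,
    "n_train_steps": 50,
    "n_rollout_steps": 100,
}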
Example #13
def learn(env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # with default settings, perform 1M steps total
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          LOG_STD_MIN=0.1,
          LOG_STD_MAX=0.9,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          normalize_returns=False,
          normalize_observations=True,
          actor_lr=1e-4,
          critic_lr=1e-3,
          gamma=0.99,
          nb_train_steps=50, # per epoch cycle and MPI worker,
          nb_eval_steps=100,
          nb_save_epochs=None,
          batch_size=64, # per MPI worker
          tau=0.01,
          action_range=(-250.0, 250.0),
          observation_range=(-5.0, 5.0),
          eval_env=None,
          load_path=None,
          save_dir=None,
          param_noise_adaption_interval=50,
          **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0
    
    memory = Memory(limit=int(1e6))

    network_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]
    
    vnetwork_spec = [
            {
                'layer_type': 'concat',
                'nodes_in': ['action_movement', 'observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]
    
    vnetwork_spec2 = [
            {
                'layer_type': 'concat',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]

    ob_space = env.observation_space
    ac_space = env.action_space

    network = SacPolicy(scope="sac", ob_space=ob_space, ac_space=ac_space, network_spec=network_spec,
                 v_network_spec=vnetwork_spec, v_network_spec2=vnetwork_spec2,
                 stochastic=False, reuse=False, build_act=True,
                 trainable_vars=None, not_trainable_vars=None,
                 gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999,
                 normalize_observations=normalize_observations, normalize_returns=normalize_returns,
                 observation_range=observation_range,
                 LOG_STD_MIN=LOG_STD_MIN, LOG_STD_MAX=LOG_STD_MAX)

    target_network = SacPolicy(scope="sac_target", ob_space=ob_space, ac_space=ac_space, network_spec=network_spec,
                 v_network_spec=vnetwork_spec, v_network_spec2=vnetwork_spec2,
                 stochastic=False, reuse=False, build_act=True,
                 trainable_vars=None, not_trainable_vars=None,
                 gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999,
                 normalize_observations=normalize_observations, normalize_returns=normalize_returns,
                 observation_range=observation_range,
                 LOG_STD_MIN=LOG_STD_MIN, LOG_STD_MAX=LOG_STD_MAX)

    # obs0
    obs0 = dict()
    for k, v in ob_space.spaces.items():
        obs0[k] = network.phs[k]

    # obs1
    obs1 = dict()
    for k, v in ob_space.spaces.items():
        obs1[k] = target_network.phs[k]

    # actions
    acts = dict()
    for k, v in ac_space.spaces.items():
        acts[k] = network.phs[k]

    # Placeholders for the update graph (local names; learn() is a function, not a method).
    dones = tf.placeholder(tf.float32, shape=(None, 1), name='dones')
    rewards = tf.placeholder(tf.float32, shape=(None, None, 1), name='rewards')

    # min double-q
    min_q_pi = tf.minimum(network.q1_pi, network.q2_pi)


    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path is not None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0 #scalar
    t = 0 # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                for k, v in action.items():
                    action[k] *= max_action
                
                nenvs_actions = []
                for i in range(nenvs):
                    nenv_action = {'action_movement' : action['action_movement'][i*n_agents:(i + 1)*n_agents]}
                    nenvs_actions.append(nenv_action)
                new_obs, r, done, info = env.step(nenvs_actions)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv
                
                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)
        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s'%x)

        combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
        
        if nb_save_epochs is not None and (epoch + 1) % nb_save_epochs == 0:
            if save_dir is None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i'%epoch)
            print('Saving to', savepath)
            saver(savepath)

    return agent
Example #14
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)
    
    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only measure total runtime on the master worker.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
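
For reference, noise_type is a comma-separated spec parsed by the loop above; a few strings it accepts (the stddev values are illustrative):

noise_specs = [
    'none',                       # no exploration noise
    'adaptive-param_0.2',         # parameter-space noise with desired action stddev 0.2
    'normal_0.1',                 # Gaussian action noise, stddev 0.1
    'ou_0.2',                     # Ornstein-Uhlenbeck action noise, stddev 0.2
    'adaptive-param_0.2,ou_0.1',  # parameter noise combined with OU action noise
]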
Example #15
def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts,
          n_cycles, n_batches, policy_save_interval, save_policies, num_cpu,
          dump_buffer, w_potential, w_linear, w_rotational, rank_method,
          clip_energy, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    logger.info("Training...")
    best_success_rate = -1
    t = 1
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for cycle in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode, dump_buffer, w_potential, w_linear,
                                 w_rotational, rank_method, clip_energy)
            for batch in range(n_batches):
                t = ((epoch * n_cycles * n_batches) +
                     (cycle * n_batches) + batch) * num_cpu
                policy.train(t, dump_buffer)

            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

            if dump_buffer:
                policy.dump_buffer(epoch)

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info(
                'New best success rate: {}. Saving policy to {} ...'.format(
                    best_success_rate, best_policy_path))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]
Example #16
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
    normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
    popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
    tau=0.01, eval_env=None, param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)
            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s'%x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #17
    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        # Dynamics loss with random features.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c1r",
                        nf=convfeat * 1,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c2r",
                        nf=convfeat * 2 * 1,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c3r",
                        nf=convfeat * 2 * 1,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None,
            None,
            self.ac_space.n,
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c1rp_pred",
                        nf=convfeat,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c2rp_pred",
                        nf=convfeat * 2,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c3rp_pred",
                        nf=convfeat * 2,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbrp = to2d(xrp)

                X_r_hat = tf.nn.relu(
                    fc(
                        cond(rgbrp),
                        "fc1r_hat1_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = tf.nn.relu(
                    fc(
                        cond(X_r_hat),
                        "fc1r_hat2_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = fc(cond(X_r_hat),
                             "fc1r_hat3_pred",
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.0,
                                 maxval=1.0,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)
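
The RND heads above call small graph helpers (conv, fc, to2d) from the surrounding codebase; a sketch of the flattening helper, assuming it follows the usual baselines-style utility.

import tensorflow as tf

def to2d(x):
    # Flatten every dimension except the batch one: (B, H, W, C) -> (B, H*W*C).
    size = 1
    for dim in x.get_shape()[1:]:
        size *= dim.value
    return tf.reshape(x, (-1, size))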
Example #18
    def apply_policy(
        ph_ob,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
        additional_inputs=None,
    ):
        if additional_inputs is None:
            additional_inputs = {}
        meta_rl = False
        data_format = "NHWC"
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info(
            f"CnnPolicy: using '{ph.name}' shape {ph.shape} as image input")
        X = tf.cast(ph, tf.float32) / 255.0
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
            X = activ(
                conv(
                    X,
                    "c1",
                    nf=32,
                    rf=8,
                    stride=4,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = activ(
                conv(
                    X,
                    "c2",
                    nf=64,
                    rf=4,
                    stride=2,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = activ(
                conv(
                    X,
                    "c3",
                    nf=64,
                    rf=4,
                    stride=1,
                    init_scale=np.sqrt(2),
                    data_format=data_format,
                ))
            X = to2d(X)
            mix_other_observations = [X]

            if 'prev_acs' in additional_inputs and 'prev_rew' in additional_inputs:
                # Cast numpy arrays to tf tensors
                prev_acs = tf.cast(additional_inputs['prev_acs'], tf.float32)
                prev_rew = tf.cast(additional_inputs['prev_rew'], tf.float32)

                # Flatten out time dimension
                prev_acs = tf.reshape(prev_acs,
                                      (-1, *prev_acs.shape.as_list()[2:]))
                prev_rew = tf.reshape(prev_rew,
                                      (-1, *prev_rew.shape.as_list()[2:]))

                # Add to 2D features going to FC layers
                mix_other_observations.extend([prev_acs, prev_rew])

            X = tf.concat(mix_other_observations, axis=1)
            X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
            additional_size = 448
            X = activ(
                fc(X,
                   "fc_additional",
                   nh=additional_size,
                   init_scale=np.sqrt(2)))
            snext = tf.zeros((sy_nenvs, memsize))
            mix_timeout = [X]

            Xtout = tf.concat(mix_timeout, axis=1)
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, "fc2val", nh=additional_size, init_scale=0.1))
                X = X + activ(
                    fc(X, "fc2act", nh=additional_size, init_scale=0.1))
            pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
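
A minimal NumPy sketch (illustrative only, not part of the snippet above) of the random masking applied to self.aux_loss at the top of this example: a Bernoulli mask keeps roughly a fraction proportion_of_exp_used_for_predictor_update of the per-sample predictor losses, and the sum of the kept losses is divided by the mask count floored at 1 so an empty mask cannot cause a division by zero.

import numpy as np

def masked_aux_loss(aux_loss, proportion, rng=np.random):
    # aux_loss: per-sample predictor losses, e.g. shape (nenvs, nsteps)
    # proportion: fraction of samples used for the predictor update
    mask = (rng.uniform(size=aux_loss.shape) < proportion).astype(np.float32)
    return np.sum(mask * aux_loss) / max(np.sum(mask), 1.0)

# e.g. only ~25% of transitions contribute to the predictor loss
loss = masked_aux_loss(np.random.rand(8, 128), proportion=0.25)
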
Ejemplo n.º 19
0
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          callback=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See baselines.common/policies.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
    #         fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        model.load(load_path)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):

        if callback is not None:
            if callback(locals(), globals()):
                break

        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates

        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  # pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblrvals = []
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for iteration in range(noptepochs):
                np.random.shuffle(inds)
                lrnow = lr(frac, iteration)
                logger.info('lrnow={} (frac={},iteration={})'.format(
                    lrnow, frac, iteration))
                mblrvals.append(lrnow)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for iteration in range(noptepochs):
                np.random.shuffle(envinds)
                lrnow = lr(frac, iteration)
                mblrvals.append(lrnow)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        lrvals = np.mean(mblrvals)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('lr_mean', lrvals)
            logger.logkv('epscoremean',
                         safemean([epinfo['s'] for epinfo in epinfobuf]))
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1
                              or update == nupdates) and logger.get_dir(
                              ) and MPI.COMM_WORLD.Get_rank() == 0:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
    return model
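
A hedged usage sketch for the learn function above; the environment setup is illustrative and not from the snippet. Note that this variant calls lr(frac, iteration), so the learning rate should be passed as a two-argument callable rather than a float, and the logging block expects Monitor-style episode infos that also carry a custom 's' (score) field.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v1')] * 4)  # 4 parallel env copies
model = learn(network='mlp',
              env=venv,
              total_timesteps=100000,
              nsteps=128,
              lr=lambda frac, iteration: 3e-4 * frac,  # annealed linearly; iteration unused here
              cliprange=0.2,
              log_interval=10)
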
Ejemplo n.º 20
0
    def call(self, perform, save_networks, use_expert, expert, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        expert_buffer = self.expert_buffer

        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()

            runner.myrun()
            # if self.flag>0:
            # 	print(self.flag,'=================================')
            # 	print(enc_obs, obs, actions, rewards, mus, dones, masks)
            # 	self.flag = self.flag -1
            self.episode_stats.feed(rewards, dones)
            if buffer is not None and not perform:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()
            #enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()

        if not perform:
            # reshape stuff correctly
            obs = obs.reshape(runner.batch_ob_shape)
            actions = actions.reshape([runner.nbatch])
            rewards = rewards.reshape([runner.nbatch])
            mus = mus.reshape([runner.nbatch, runner.nact])
            dones = dones.reshape([runner.nbatch])
            masks = masks.reshape([runner.batch_ob_shape[0]])

            if not use_expert:
                names_ops, values_ops = model.train(obs, actions, rewards,
                                                    dones, mus,
                                                    model.initial_state, masks,
                                                    steps)
            else:
                expert_obs, expert_actions, expert_rewards, expert_mus, expert_dones, expert_masks = expert.get(
                )
                expert_obs = expert_obs.reshape(runner.batch_ob_shape)
                expert_actions = expert_actions.reshape([runner.nbatch])
                expert_rewards = expert_rewards.reshape([runner.nbatch])
                expert_mus = expert_mus.reshape([runner.nbatch, runner.nact])
                expert_dones = expert_dones.reshape([runner.nbatch])
                expert_masks = expert_masks.reshape([runner.batch_ob_shape[0]])
                names_ops, values_ops = model.expert_train(
                    obs, actions, rewards, dones, mus, model.initial_state,
                    masks, steps, expert_obs, expert_actions, expert_rewards,
                    expert_mus, expert_dones, expert_masks)

            if on_policy and (int(steps / runner.nbatch) % self.log_interval
                              == 0):
                logger.record_tabular("total_timesteps", steps)
                logger.record_tabular("fps",
                                      int(steps / (time.time() - self.tstart)))
                # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
                # Thus, this is mean until end of life, not end of episode.
                # For true episode rewards, see the monitor files in the log folder.
                logger.record_tabular("mean_episode_length",
                                      self.episode_stats.mean_length())
                logger.record_tabular("mean_episode_reward",
                                      self.episode_stats.mean_reward())
                for name, val in zip(names_ops, values_ops):
                    logger.record_tabular(name, float(val))
                logger.dump_tabular()

                if save_networks and (int(steps / runner.nbatch) %
                                      self.log_interval * 10 == 0):
                    model.save(int(steps))

        else:  #if perform
            expert_buffer.append(
                [enc_obs, actions, rewards, mus, dones, masks])

            if len(expert_buffer) > 0 and len(expert_buffer) % 100 == 0:
                expert_dir = os.path.join('./expert') + '/'
                if not os.path.exists(expert_dir):
                    os.makedirs(expert_dir)
                pwritefile = open(os.path.join(expert_dir, 'expert_test.pkl'),
                                  'wb')
                pickle.dump(expert_buffer, pwritefile, -1)
                pwritefile.close()
                logger.info('Successfully Saved the Expert Data')

            obs = obs.reshape(runner.batch_ob_shape)
            actions = actions.reshape([runner.nbatch])
            rewards = rewards.reshape([runner.nbatch])
            mus = mus.reshape([runner.nbatch, runner.nact])
            dones = dones.reshape([runner.nbatch])
            masks = masks.reshape([runner.batch_ob_shape[0]])
            if on_policy and (int(steps / runner.nbatch) % self.log_interval
                              == 0):
                logger.record_tabular("total_timesteps", steps)
                logger.record_tabular("fps",
                                      int(steps / (time.time() - self.tstart)))
                # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
                # Thus, this is mean until end of life, not end of episode.
                # For true episode rewards, see the monitor files in the log folder.
                logger.record_tabular("mean_episode_length",
                                      self.episode_stats.mean_length())
                logger.record_tabular("mean_episode_reward",
                                      self.episode_stats.mean_reward())

                logger.dump_tabular()
Ejemplo n.º 21
0
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        #choose only the demo buffer samples
        mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0)

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target', **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

        if self.bc_loss == 1 and self.q_filter == 1: # train with demonstrations and use bc_loss and q_filter both
            maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than the actor action according to the critic? choose those samples only
            #define the cloning loss on the actor's actions only on the samples which adhere to the above masks
            self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by its respective weight prm_loss_weight
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxiliary loss scaled by its weight aux_loss_weight

        elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter
            self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

        else: # if not training with demonstrations
            self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
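
A NumPy sketch (illustrative names, not the snippet's tensors) of the Q-filtered behaviour-cloning term built with tf.boolean_mask above: among the demo-buffer samples selected by mask, only those where the critic rates the stored demonstrator action above the actor's own action contribute a squared-error cloning loss.

import numpy as np

def q_filtered_bc_loss(pi_actions, batch_actions, q_batch, q_pi, demo_mask):
    # pi_actions:    (B, dimu) actions proposed by the current actor
    # batch_actions: (B, dimu) actions stored in the sampled batch
    # q_batch:       (B,) critic value of the stored action,  Q(s, u)
    # q_pi:          (B,) critic value of the actor's action, Q(s, pi(s))
    # demo_mask:     (B,) 1.0 for samples drawn from the demo buffer, else 0.0
    demo = demo_mask.astype(bool)
    better = q_batch[demo] > q_pi[demo]            # Q-filter: demonstrator beats the actor
    diff = pi_actions[demo][better] - batch_actions[demo][better]
    return np.sum(np.square(diff))
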
Ejemplo n.º 22
0
def learn(network, env,
          seed=None,
          total_timesteps=None,
          nb_epochs=None, # with default settings, perform 1M steps total
          nb_epoch_cycles=20,
          nb_rollout_steps=100,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50, # per epoch cycle and MPI worker,
          nb_eval_steps=100,
          batch_size=64, # per MPI worker
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50,
          **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype = np.float32) #vector
    episode_step = np.zeros(nenvs, dtype = int) # vector
    episodes = 0 #scalar
    t = 0 # scalar

    epoch = 0



    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()



            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)
        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s'%x)

        combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                    pickle.dump(eval_env.get_state(), f)


    return agent
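
A hedged usage sketch for the DDPG learn above, assuming a continuous-control task with a symmetric action space wrapped in a vectorized env; the helper setup is illustrative and not from the snippet.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('Pendulum-v0')])  # actions in [-2, 2], symmetric
agent = learn(network='mlp',
              env=venv,
              total_timesteps=50000,
              noise_type='normal_0.1',   # parsed into NormalActionNoise with sigma=0.1
              nb_rollout_steps=100,
              batch_size=64)
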
Ejemplo n.º 23
0
def learn(*,
          network,
          env,
          env_name,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          data_aug='normal',
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)
    policy = build_policy(env, network, data_aug=data_aug, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    data_aug=data_aug)

    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam,
                             data_aug=data_aug,
                             eval_flag=True)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    # random filter
    sess = tf.get_default_session()
    rand_processes = [v for v in tf.global_variables() if 'randcnn' in v.name]
    if len(rand_processes) > 0:
        init_process = tf.variables_initializer(rand_processes)
    else:
        init_process = None

    nupdates = total_timesteps // nbatch

    traj_dict = {'obs': [], 'acts': [], 'rews': [], 'dones': [], 'infos': []}
    traj_count = 0

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # randomize parameters
        if len(rand_processes) > 0:
            sess.run(init_process)
            logger.info('Randomizing parameters...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632

        traj_dict['obs'].append(obs)
        traj_dict['acts'].append(actions)
        traj_dict['rews'].append(returns)
        traj_dict['dones'].append(masks)
        traj_dict['infos'].append(epinfos)

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632
        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # Here what we're going to do is for each minibatch calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    if load_path is not None:
                        mblossvals.append(
                            model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    if load_path is not None:
                        mblossvals.append(
                            model.train(lrnow, cliprangenow, *slices,
                                        mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            if load_path is not None:
                video_out_path = osp.join(
                    logger.get_dir(),
                    f'video_{env_name}_traj_{traj_count}.mp4')
                video_writer = TensorFrameWriter(video_out_path,
                                                 make_grid=False,
                                                 adjust_axis=False)
                for image, done in zip(obs, masks):
                    if not done:
                        video_writer.add_tensor(image)
                    else:
                        video_writer.close()
                        traj_count += 1
                        video_out_path = osp.join(
                            logger.get_dir(),
                            f'video_{env_name}_traj_{traj_count}.mp4')
                        video_writer = TensorFrameWriter(video_out_path,
                                                         make_grid=False,
                                                         adjust_axis=False)

            else:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % update)
                print('Saving to', savepath)
                model.save(savepath)
        break

    all_dones = traj_dict['dones'].copy()
    all_dones = np.concatenate(all_dones, axis=0)
    traj_ends, = np.nonzero(all_dones)
    real_n_traj = len(traj_ends)
    trajectories = []
    traj_starts = np.concatenate(([0], traj_ends[:-1]), axis=0)
    traj_start_end = np.stack((traj_starts, traj_ends), axis=1)
    for start_idx, end_idx in traj_start_end:
        trajectories.append(
            {k: v[start_idx:end_idx]
             for k, v in traj_dict.items()})
    n_traj = len(trajectories)

    import pickle
    traj_filename = osp.join(logger.get_dir(), f'demo_{env_name}.pickle')
    with open(traj_filename, 'wb') as handle:
        pickle.dump(traj_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    video_writer.close()
    return model
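
A small NumPy sketch of how the done flags above are turned into trajectory boundaries: np.nonzero gives the indices of terminal steps, and each trajectory is taken to start where the previous one ended. With this scheme the slice v[start_idx:end_idx] excludes the terminal step itself, and any steps recorded after the last done are dropped.

import numpy as np

dones = np.array([0, 0, 1, 0, 0, 0, 1, 0, 1], dtype=bool)
traj_ends, = np.nonzero(dones)                        # array([2, 6, 8])
traj_starts = np.concatenate(([0], traj_ends[:-1]))   # array([0, 2, 6])
for start, end in zip(traj_starts, traj_ends):
    print(start, end)                                  # (0, 2), (2, 6), (6, 8)
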
Ejemplo n.º 24
0
    def __init__(
        self,
        *,
        scope,
        ob_space,
        ac_space,
        stochpol_fn,
        nsteps,
        nepochs=4,
        nminibatches=1,
        gamma=0.99,
        gamma_ext=0.99,
        lam=0.95,
        ent_coef=0,
        cliprange=0.2,
        max_grad_norm=1.0,
        vf_coef=1.0,
        lr=30e-5,
        adam_hps=None,
        testing=False,
        comm=None,
        comm_train=None,
        use_news=False,
        update_ob_stats_every_step=True,
        int_coeff=None,
        ext_coeff=None,
    ):
        self.lr = lr
        self.ext_coeff = ext_coeff
        self.int_coeff = int_coeff
        self.use_news = use_news
        self.update_ob_stats_every_step = update_ob_stats_every_step
        self.abs_scope = (tf.get_variable_scope().name + '/' +
                          scope).lstrip('/')
        self.testing = testing
        self.comm_log = MPI.COMM_SELF
        if comm is not None and comm.Get_size() > 1:
            self.comm_log = comm
            assert not testing or comm.Get_rank(
            ) != 0, "Worker number zero can't be testing"
        if comm_train is not None:
            self.comm_train, self.comm_train_size = comm_train, comm_train.Get_size(
            )
        else:
            self.comm_train, self.comm_train_size = self.comm_log, self.comm_log.Get_size(
            )
        self.is_log_leader = self.comm_log.Get_rank() == 0
        self.is_train_leader = self.comm_train.Get_rank() == 0
        with tf.variable_scope(scope):
            self.best_ret = -np.inf
            self.local_best_ret = -np.inf
            self.rooms = []
            self.local_rooms = []
            self.scores = []
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.stochpol = stochpol_fn()
            self.nepochs = nepochs
            self.cliprange = cliprange
            self.nsteps = nsteps
            self.nminibatches = nminibatches
            self.gamma = gamma
            self.gamma_ext = gamma_ext
            self.lam = lam
            self.adam_hps = adam_hps or dict()
            self.ph_adv = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_int = tf.placeholder(tf.float32, [None, None])
            self.ph_ret_ext = tf.placeholder(tf.float32, [None, None])
            self.ph_oldnlp = tf.placeholder(tf.float32, [None, None])
            self.ph_oldvpred = tf.placeholder(tf.float32, [None, None])
            self.ph_lr = tf.placeholder(tf.float32, [])
            self.ph_lr_pred = tf.placeholder(tf.float32, [])
            self.ph_cliprange = tf.placeholder(tf.float32, [])

            #Define loss.
            neglogpac = self.stochpol.pd_opt.neglogp(self.stochpol.ph_ac)
            entropy = tf.reduce_mean(self.stochpol.pd_opt.entropy())
            vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(
                tf.square(self.stochpol.vpred_int_opt - self.ph_ret_int))
            vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(
                tf.square(self.stochpol.vpred_ext_opt - self.ph_ret_ext))
            vf_loss = vf_loss_int + vf_loss_ext
            ratio = tf.exp(self.ph_oldnlp - neglogpac)  # p_new / p_old
            negadv = -self.ph_adv
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * tf.clip_by_value(
                ratio, 1.0 - self.ph_cliprange, 1.0 + self.ph_cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
            ent_loss = (-ent_coef) * entropy
            approxkl = .5 * tf.reduce_mean(
                tf.square(neglogpac - self.ph_oldnlp))
            maxkl = .5 * tf.reduce_max(tf.square(neglogpac - self.ph_oldnlp))
            clipfrac = tf.reduce_mean(
                tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                       self.ph_cliprange)))
            loss = pg_loss + ent_loss + vf_loss + self.stochpol.aux_loss

            #Create optimizer.
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=self.abs_scope)
            logger.info("PPO: using MpiAdamOptimizer connected to %i peers" %
                        self.comm_train_size)
            trainer = MpiAdamOptimizer(self.comm_train,
                                       learning_rate=self.ph_lr,
                                       **self.adam_hps)
            grads_and_vars = trainer.compute_gradients(loss, params)
            grads, vars = zip(*grads_and_vars)
            if max_grad_norm:
                # note: only the clipped-gradient norm is computed here;
                # the clipped gradients themselves are not applied below
                _, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            global_grad_norm = tf.global_norm(grads)
            grads_and_vars = list(zip(grads, vars))
            self._train = trainer.apply_gradients(grads_and_vars)

        #Quantities for reporting.
        self._losses = [
            loss, pg_loss, vf_loss, entropy, clipfrac, approxkl, maxkl,
            self.stochpol.aux_loss, self.stochpol.feat_var,
            self.stochpol.max_feat, global_grad_norm
        ]
        self.loss_names = [
            'tot', 'pg', 'vf', 'ent', 'clipfrac', 'approxkl', 'maxkl',
            "auxloss", "featvar", "maxfeat", "gradnorm"
        ]
        self.I = None
        self.disable_policy_update = None
        allvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope=self.abs_scope)
        if self.is_log_leader:
            tf_util.display_var_info(allvars)
        tf.get_default_session().run(tf.variables_initializer(allvars))
        sync_from_root(tf.get_default_session(),
                       allvars)  #Syncs initialization across mpi workers.
        self.t0 = time.time()
        self.global_tcount = 0
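
A NumPy sketch of the clipped surrogate objective built from pg_losses1/pg_losses2 above (function and argument names are illustrative): the probability ratio is recovered from the negative log-probabilities, clipped to [1 - cliprange, 1 + cliprange], and the element-wise maximum of the two negated-advantage terms is averaged, giving the pessimistic PPO policy loss.

import numpy as np

def clipped_pg_loss(adv, neglogp_new, neglogp_old, cliprange):
    ratio = np.exp(neglogp_old - neglogp_new)               # pi_new / pi_old
    unclipped = -adv * ratio
    clipped = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return np.mean(np.maximum(unclipped, clipped))

# a positive advantage with a ratio far above 1 + cliprange gets clipped to -adv * 1.2
print(clipped_pg_loss(np.array([1.0]), np.array([0.1]), np.array([1.5]), 0.2))
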
Ejemplo n.º 25
0
    def save(training_step):
        logger.info(
            'Saved network with {} training steps'.format(training_step))
        self.saver.save(self.sess,
                        self.cpk_dir + 'demo',
                        global_step=training_step)
Ejemplo n.º 26
0
    elif 'normal' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    else:
        raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

# Configure components of DDPG
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
critic = Critic(layer_norm=args.layer_norm)
actor = Actor(nb_actions, layer_norm=args.layer_norm)
# Seed everything to make things reproducible.
seed = args.seed + 1000000 * rank
logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
# tf.reset_default_graph()
set_global_seeds(seed)
env.seed(seed)
if eval_env is not None:
    eval_env.seed(seed)

# Disable logging for rank != 0 to avoid noise.
if rank == 0:
    start_time = time.time()

# Derive the different numbers for the training process
num_timesteps = args.num_timesteps
nb_rollout_steps = args.nb_rollout_steps
nb_epoch_cycles = args.nb_epoch_cycles
nb_epochs = num_timesteps//(nb_rollout_steps*nb_epoch_cycles)
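
For example, with num_timesteps = 1000000, nb_rollout_steps = 100 and nb_epoch_cycles = 20, this works out to nb_epochs = 1000000 // (100 * 20) = 500 epochs of training.
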
Ejemplo n.º 27
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            epoch_train_duration = time.time() - epoch_start_time
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = {}
            for key in sorted(stats.keys()):
                combined_stats[key] = mpi_mean(stats[key])

            # Rollout statistics.
            combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = mpi_mean(
                np.mean(episode_rewards_history))
            combined_stats['rollout/episode_steps'] = mpi_mean(
                epoch_episode_steps)
            combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
            combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
            combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
            combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)

            # Train statistics.
            combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = mpi_mean(
                epoch_adaptive_distances)

            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
                combined_stats['eval/return_history'] = mpi_mean(
                    np.mean(eval_episode_rewards_history))
                combined_stats['eval/Q'] = mpi_mean(eval_qs)
                combined_stats['eval/episodes'] = mpi_mean(
                    len(eval_episode_rewards))

            # Total statistics.
            combined_stats['total/duration'] = mpi_mean(duration)
            combined_stats['total/steps_per_second'] = mpi_mean(
                float(t) / float(duration))
            combined_stats['total/episodes'] = mpi_mean(episodes)
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
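The epoch summary above funnels every locally collected statistic through mpi_mean / mpi_std / mpi_sum so that all MPI workers report one consistent set of numbers. As a rough, hedged sketch of what such a mean helper does (illustrative mpi4py code, not the baselines implementation):

import numpy as np
from mpi4py import MPI

def mpi_mean_sketch(value, comm=None):
    # Average a scalar or an array of locally collected values across all MPI ranks.
    comm = comm or MPI.COMM_WORLD
    local = np.asarray(value, dtype=np.float64)
    local_mean = float(np.mean(local)) if local.size else 0.0
    return comm.allreduce(local_mean, op=MPI.SUM) / comm.Get_size()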
Ejemplo n.º 28
0
def log_params(params, logger=logger):
    for key in sorted(params.keys()):
        logger.info('{}: {}'.format(key, params[key]))
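A quick usage example of the log_params helper above (parameter values are made up):

params = {'gamma': 0.98, 'batch_size': 256, 'env_name': 'FetchReach-v1'}
log_params(params)
# logs one "key: value" line per parameter via logger.info, sorted by key:
# batch_size: 256
# env_name: FetchReach-v1
# gamma: 0.98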
Ejemplo n.º 29
0
def run(seed, noise_type, layer_norm, **kwargs):
    """Configure things."""
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0: logger.set_level(logger.DISABLED)
    """Create Simulation envs."""
    env = PegintoHoles()
    """Create True envs"""
    # env = Env_robot_control()
    """Parse noise_type"""
    action_noise = None
    param_noise = None
    nb_actions = env.action_dim

    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(0.2) * np.ones(nb_actions))
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    """Configure components."""
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_dim,
                    observation_shape=env.state_dim)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    """Seed everything to make things reproducible."""
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    """Disable logging to avoid noise."""
    start_time = time.time()
    """Train the model"""
    training.train(env=env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    """Eval the result"""
    logger.info('total runtime: {}s'.format(time.time() - start_time))
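The noise_type string parsed above is a comma-separated list such as 'adaptive-param_0.2' or 'ou_0.3,normal_0.1'; each entry is split on '_' into a noise kind and a standard deviation. A small illustration of that parsing:

noise_type = 'ou_0.3,adaptive-param_0.2'
for current in noise_type.split(','):
    kind, stddev = current.strip().split('_')
    print(kind, float(stddev))
# -> ou 0.3
# -> adaptive-param 0.2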
Ejemplo n.º 30
0
def learn(*,
          network,
          env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        rank, num_cpu = 0, 1  # fall back to single-process values when MPI is unavailable

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params.update(
        **override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            +
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            +
            'are looking to reproduce those results, be aware of this. Please also refer to '
            +
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    logger.info("n_epochs:", n_epochs)
    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file)
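The n_epochs value above is plain integer division: the total environment steps divided by the steps consumed per epoch (cycles per epoch, times episode length T, times parallel rollout envs). A worked example with illustrative numbers:

total_timesteps = 2_500_000
n_cycles, T, rollout_batch_size = 50, 50, 2          # illustrative values only
n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
print(n_epochs)   # -> 500; each epoch covers n_cycles * T * rollout_batch_size = 5000 env steps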
Ejemplo n.º 31
0
    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        logger.info(
            "Using RND BONUS ****************************************************"
        )

        # RND bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c1r",
                        nf=convfeat * 1,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c2r",
                        nf=convfeat * 2 * 1,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    conv(
                        xr,
                        "c3r",
                        nf=convfeat * 2 * 1,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc1r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info(
                    f"CnnTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c1rp_pred",
                        nf=convfeat,
                        rf=8,
                        stride=4,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c2rp_pred",
                        nf=convfeat * 2,
                        rf=4,
                        stride=2,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    conv(
                        xrp,
                        "c3rp_pred",
                        nf=convfeat * 2,
                        rf=3,
                        stride=1,
                        init_scale=np.sqrt(2),
                    ))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(
                        rgbrp,
                        "fc1r_hat1_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = tf.nn.relu(
                    fc(
                        X_r_hat,
                        "fc1r_hat2_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = fc(X_r_hat,
                             "fc1r_hat3_pred",
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.0,
                                 maxval=1.0,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)
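The intrinsic reward defined above is the mean squared error between the frozen random target embedding X_r (held fixed via stop_gradient) and the trained predictor embedding X_r_hat. A minimal NumPy sketch of that bonus, with illustrative shapes:

import numpy as np

X_r = np.random.randn(128, 512)       # target features, treated as constants (stop_gradient)
X_r_hat = np.random.randn(128, 512)   # predictor features, trained to match X_r
int_rew = np.mean(np.square(X_r - X_r_hat), axis=-1, keepdims=True)   # shape (128, 1)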
Ejemplo n.º 32
0
    def __init__(self, *, sess, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm):
        # sess = tf.get_default_session()
        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps)
        norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        params = tf.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            # mpi_print('param', p, num_params)
            total_num_params += num_params

        logger.info('total num params:', total_num_params)

        l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * L2_WEIGHT

        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        grads_and_var = trainer.compute_gradients(loss, params)

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values

            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load

        initialize()
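The policy loss constructed above is PPO's clipped surrogate: for each sample, take the more pessimistic of the unclipped and clipped ratio terms. A hedged NumPy sketch with made-up numbers:

import numpy as np

cliprange = 0.2
adv = np.array([1.0, -0.5, 2.0])                        # advantages
ratio = np.array([1.3, 0.7, 1.05])                      # pi_new(a|s) / pi_old(a|s)
pg_losses = -adv * ratio
pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))    # pessimistic (clipped) objective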
Ejemplo n.º 33
0
    def save_model(self, num_steps):
        logger.info("Saving model at step: {}".format(num_steps))
        tf_util.save_state(self.model_path)
Ejemplo n.º 34
0
def learn(*, network, sess, env, nsteps, total_timesteps, ent_coef, lr,
            vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
            log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
            save_interval=0, save_path=None, load_path=None, **network_kwargs):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_size = comm.Get_size()
    #sess = tf.get_default_session()
    # tb_writer = TB_Writer(sess)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    
    nbatch_train = nbatch // nminibatches
    policy = RandCropCnnPolicy
    model = Model(policy=policy, sess=sess, ob_space=ob_space, ac_space=ac_space, 
        nbatch_act=nenvs, nbatch_train=nbatch_train,
        nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm)

    # utils.load_all_params(sess)
    if load_path is not None:
        model.load(load_path)
        logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    logger.info("Initilizing runner")
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    tfirststart = time.time()
    active_ep_buf = epinfobuf100

    nupdates = total_timesteps//nbatch
    logger.info("Running {} updates, each needs {} batches".format(nupdates, nbatch))
    mean_rewards = []
    datapoints = []

    run_t_total = 0
    train_t_total = 0

    can_save = True
    checkpoints = list(range(0,2049,10))
    saved_key_checkpoints = [False] * len(checkpoints)
    #init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name])

    # if Config.SYNC_FROM_ROOT and rank != 0:
    #     can_save = False

    # def save_model(base_name=None):
    #     base_dict = {'datapoints': datapoints}
    #     utils.save_params_in_scopes(
    #         sess, ['model'], Config.get_save_file(base_name=base_name), base_dict)

    for update in range(1, nupdates+1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)

        logger.info('collecting rollouts...')
        run_tstart = time.time()
        
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        run_elapsed = time.time() - run_tstart
        run_t_total += run_elapsed
        logger.info('rollouts complete')

        mblossvals = []

        logger.info('updating parameters...')
        train_tstart = time.time()
        
        if states is None: # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
                        
        else: # recurrent version
            assert nenvs % nminibatches == 0
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # update the dropout mask
        sess.run([model.train_model.dropout_assign_ops])

        train_elapsed = time.time() - train_tstart
        train_t_total += train_elapsed
        logger.info('update complete')

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            step = update*nbatch
            #rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step)

            rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
            rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
            ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
            ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])
            
            logger.info('\n----', update)

            mean_rewards.append(rew_mean_10)
            datapoints.append([step, rew_mean_10])
            logger.logkv('eprew10', rew_mean_10)
            logger.logkv('eprew100', rew_mean_100)
            logger.logkv('eplenmean10', ep_len_mean_10)
            logger.logkv('eplenmean100', ep_len_mean_100)

            
            #logger.info('time_elapsed', tnow - tfirststart, run_t_total, train_t_total)
            logger.logkv('misc/total_time_elapsed', tnow - tfirststart)
            logger.logkv('misc/run_t_total', run_t_total)
            logger.logkv('misc/train_t_total', train_t_total)
            logger.logkv('nupdate', update)

            #logger.info('timesteps', update*nsteps, total_timesteps)
            logger.logkv("misc/total_timesteps", update*nbatch)
            logger.logkv("misc/serial_timesteps", update*nsteps)
            
            #logger.info('fps', fps)
            logger.logkv("fps", fps)

            if len(mblossvals):
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    logger.info(lossname, lossval)
                    #tb_writer.log_scalar(lossval, lossname)
                    logger.logkv('loss/' + lossname, lossval)
            logger.info('----\n')
            logger.dumpkvs()
 

        #if can_save:
        if 0: ## not doing checkpoint saving yet
            if save_interval and (update % save_interval == 0):
                save_model()

            for j, checkpoint in enumerate(checkpoints):
                if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)):
                    saved_key_checkpoints[j] = True
                    save_model(str(checkpoint) + 'M')

    # save_model()
    if save_path:
        model.save(save_path)

    env.close()
    return model
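The rew_mean_10 / rew_mean_100 values logged above come from a safemean helper; a plausible, hedged implementation simply returns NaN while the episode-info buffer is still empty, so early updates do not raise warnings:

import numpy as np

def safemean(xs):
    # NaN means "no completed episodes yet" rather than crashing or warning on an empty list.
    return np.nan if len(xs) == 0 else np.mean(xs)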
Ejemplo n.º 35
0
    def update(self):

        #Some logic gathering best ret, rooms etc using MPI.
        temp = sum(MPI.COMM_WORLD.allgather(self.local_rooms), [])
        temp = sorted(list(set(temp)))
        self.rooms = temp

        temp = sum(MPI.COMM_WORLD.allgather(self.scores), [])
        temp = sorted(list(set(temp)))
        self.scores = temp

        temp = sum(MPI.COMM_WORLD.allgather([self.local_best_ret]), [])
        self.best_ret = max(temp)

        eprews = MPI.COMM_WORLD.allgather(
            np.mean(list(self.I.statlists["eprew"])))
        local_best_rets = MPI.COMM_WORLD.allgather(self.local_best_ret)
        n_rooms = sum(MPI.COMM_WORLD.allgather([len(self.local_rooms)]), [])

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.info(f"Rooms visited {self.rooms}")
            logger.info(f"Best return {self.best_ret}")
            logger.info(f"Best local return {sorted(local_best_rets)}")
            logger.info(f"eprews {sorted(eprews)}")
            logger.info(f"n_rooms {sorted(n_rooms)}")
            logger.info(f"Extrinsic coefficient {self.ext_coeff}")
            logger.info(f"Gamma {self.gamma}")
            logger.info(f"Gamma ext {self.gamma_ext}")
            logger.info(f"All scores {sorted(self.scores)}")

        #Normalize intrinsic rewards.
        rffs_int = np.array(
            [self.I.rff_int.update(rew) for rew in self.I.buf_rews_int.T])
        self.I.rff_rms_int.update(rffs_int.ravel())
        rews_int = self.I.buf_rews_int / np.sqrt(self.I.rff_rms_int.var)
        self.mean_int_rew = np.mean(rews_int)
        self.max_int_rew = np.max(rews_int)

        #Don't normalize extrinsic rewards.
        rews_ext = self.I.buf_rews_ext

        rewmean = self.I.buf_rews_int.mean()
        rewstd = self.I.buf_rews_int.std()
        rewmax = np.max(self.I.buf_rews_int)

        #Calculate intrinsic returns and advantages.
        lastgaelam = 0
        for t in range(self.nsteps - 1, -1, -1):  # nsteps-1 ... 0
            if self.use_news:
                nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
            else:
                nextnew = 0.0  #No dones for intrinsic reward.
            nextvals = self.I.buf_vpreds_int[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_int_last
            nextnotnew = 1 - nextnew
            delta = rews_int[:, t] + self.gamma * nextvals * nextnotnew - self.I.buf_vpreds_int[:, t]
            self.I.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
        rets_int = self.I.buf_advs_int + self.I.buf_vpreds_int

        #Calculate extrinsic returns and advantages.
        lastgaelam = 0
        for t in range(self.nsteps - 1, -1, -1):  # nsteps-1 ... 0
            nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
            #Use dones for extrinsic reward.
            nextvals = self.I.buf_vpreds_ext[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_ext_last
            nextnotnew = 1 - nextnew
            delta = rews_ext[:, t] + self.gamma_ext * nextvals * nextnotnew - self.I.buf_vpreds_ext[:, t]
            self.I.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
        rets_ext = self.I.buf_advs_ext + self.I.buf_vpreds_ext

        #Combine the extrinsic and intrinsic advantages.
        self.I.buf_advs = self.int_coeff * self.I.buf_advs_int + self.ext_coeff * self.I.buf_advs_ext

        #Collects info for reporting.
        info = dict(
            advmean=self.I.buf_advs.mean(),
            advstd=self.I.buf_advs.std(),
            retintmean=rets_int.mean(),  # previously retmean
            retintstd=rets_int.std(),  # previously retstd
            retextmean=rets_ext.mean(),  # previously not there
            retextstd=rets_ext.std(),  # previously not there
            rewintmean_unnorm=rewmean,  # previously rewmean
            rewintmax_unnorm=rewmax,  # previously not there
            rewintmean_norm=self.mean_int_rew,  # previously rewintmean
            rewintmax_norm=self.max_int_rew,  # previously rewintmax
            rewintstd_unnorm=rewstd,  # previously rewstd
            vpredintmean=self.I.buf_vpreds_int.mean(),  # previously vpredmean
            vpredintstd=self.I.buf_vpreds_int.std(),  # previously vrpedstd
            vpredextmean=self.I.buf_vpreds_ext.mean(),  # previously not there
            vpredextstd=self.I.buf_vpreds_ext.std(),  # previously not there
            ev_int=np.clip(
                explained_variance(self.I.buf_vpreds_int.ravel(),
                                   rets_int.ravel()), -1, None),
            ev_ext=np.clip(
                explained_variance(self.I.buf_vpreds_ext.ravel(),
                                   rets_ext.ravel()), -1, None),
            rooms=SemicolonList(self.rooms),
            n_rooms=len(self.rooms),
            best_ret=self.best_ret,
            reset_counter=self.I.reset_counter)

        info['mem_available'] = psutil.virtual_memory().available

        to_record = {
            'acs': self.I.buf_acs,
            'rews_int': self.I.buf_rews_int,
            'rews_int_norm': rews_int,
            'rews_ext': self.I.buf_rews_ext,
            'vpred_int': self.I.buf_vpreds_int,
            'vpred_ext': self.I.buf_vpreds_ext,
            'adv_int': self.I.buf_advs_int,
            'adv_ext': self.I.buf_advs_ext,
            'ent': self.I.buf_ent,
            'ret_int': rets_int,
            'ret_ext': rets_ext,
        }
        if self.I.venvs[0].record_obs:
            to_record['obs'] = self.I.buf_obs[None]
        self.recorder.record(bufs=to_record, infos=self.I.buf_epinfos)

        #Create feeddict for optimization.
        envsperbatch = self.I.nenvs // self.nminibatches
        ph_buf = [
            (self.stochpol.ph_ac, self.I.buf_acs),
            (self.ph_ret_int, rets_int),
            (self.ph_ret_ext, rets_ext),
            (self.ph_oldnlp, self.I.buf_nlps),
            (self.ph_adv, self.I.buf_advs),
        ]
        if self.I.mem_state is not NO_STATES:
            ph_buf.extend([
                (self.stochpol.ph_istate, self.I.seg_init_mem_state),
                (self.stochpol.ph_new, self.I.buf_news),
            ])

        verbose = True
        if verbose and self.is_log_leader:
            samples = np.prod(self.I.buf_advs.shape)
            logger.info(
                "buffer shape %s, samples_per_mpi=%i, mini_per_mpi=%i, samples=%i, mini=%i "
                % (str(self.I.buf_advs.shape), samples, samples //
                   self.nminibatches, samples * self.comm_train_size,
                   samples * self.comm_train_size // self.nminibatches))
            logger.info(" " * 6 + fmt_row(13, self.loss_names))

        epoch = 0
        start = 0
        #Optimizes on current data for several epochs.
        while epoch < self.nepochs:
            end = start + envsperbatch
            mbenvinds = slice(start, end, None)

            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
            fd[self.stochpol.ph_ob[None]] = np.concatenate([
                self.I.buf_obs[None][mbenvinds],
                self.I.buf_ob_last[None][mbenvinds, None]
            ], 1)
            assert list(fd[self.stochpol.ph_ob[None]].shape) == [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape), \
                [fd[self.stochpol.ph_ob[None]].shape, [self.I.nenvs//self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape)]
            fd.update({
                self.stochpol.ph_mean: self.stochpol.ob_rms.mean,
                self.stochpol.ph_std: self.stochpol.ob_rms.var**0.5
            })

            ret = tf.get_default_session().run(self._losses + [self._train],
                                               feed_dict=fd)[:-1]
            if not self.testing:
                lossdict = dict(zip([n for n in self.loss_names], ret))
            else:
                lossdict = {}
            #Synchronize the lossdict across mpi processes, otherwise weights may be rolled back on one process but not another.
            _maxkl = lossdict.pop('maxkl')
            lossdict = dict_gather(self.comm_train, lossdict, op='mean')
            maxmaxkl = dict_gather(self.comm_train, {"maxkl": _maxkl},
                                   op='max')
            lossdict["maxkl"] = maxmaxkl["maxkl"]
            if verbose and self.is_log_leader:
                logger.info(
                    "%i:%03i %s" %
                    (epoch, start,
                     fmt_row(13, [lossdict[n] for n in self.loss_names])))
            start += envsperbatch
            if start == self.I.nenvs:
                epoch += 1
                start = 0

        if self.is_train_leader:
            self.I.stats["n_updates"] += 1
            info.update([('opt_' + n, lossdict[n]) for n in self.loss_names])
            tnow = time.time()
            info['tps'] = self.nsteps * self.I.nenvs / (tnow -
                                                        self.I.t_last_update)
            info['time_elapsed'] = time.time() - self.t0
            self.I.t_last_update = tnow
        self.stochpol.update_normalization(  # Necessary for continuous control tasks with odd obs ranges, only implemented in mlp policy,
            ob=self.I.buf_obs  # NOTE: not shared via MPI
        )
        return info
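The intrinsic rewards above are normalized by the running standard deviation of a discounted "forward filter" of those rewards rather than by per-episode statistics. A hedged sketch of that scheme (class and variable names here are illustrative, not the exact implementation):

import numpy as np

class RewardForwardFilterSketch:
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        # running discounted sum of intrinsic rewards, one entry per environment
        self.rewems = rews if self.rewems is None else self.rewems * self.gamma + rews
        return self.rewems

rff = RewardForwardFilterSketch(gamma=0.99)
buf_rews_int = np.abs(np.random.randn(8, 128))                 # (nenvs, nsteps) raw bonuses
rffs = np.array([rff.update(rew) for rew in buf_rews_int.T])   # filter along the time axis
rews_int = buf_rews_int / (np.sqrt(rffs.var()) + 1e-8)         # normalized intrinsic reward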
Ejemplo n.º 36
0
    def __init__(self,
                 sess,
                 prefix,
                 env,
                 nb_epochs,
                 nb_epoch_cycles,
                 render_eval,
                 reward_scale,
                 render,
                 param_noise,
                 actor,
                 critic,
                 normalize_returns,
                 normalize_observations,
                 critic_l2_reg,
                 actor_lr,
                 actor_dis_lr,
                 critic_lr,
                 exp_scale,
                 action_noise,
                 popart,
                 gamma,
                 clip_norm,
                 nb_train_steps,
                 nb_dis_train_steps,
                 nb_rollout_steps,
                 nb_eval_steps,
                 batch_size,
                 dis_batch_size,
                 memory,
                 tau=0.01,
                 eval_env=None,
                 param_noise_adaption_interval=50):

        self.sess = sess
        self.prefix = prefix
        self.env = env
        self.nb_epochs = nb_epochs
        self.nb_epoch_cycles = nb_epoch_cycles
        self.render_eval = render_eval
        self.reward_scale = reward_scale
        self.render = render
        self.param_noise = param_noise
        self.actor = actor
        self.critic = critic
        self.normalize_returns = normalize_returns
        self.normalize_observations = normalize_observations
        self.critic_l2_reg = critic_l2_reg
        self.actor_lr = actor_lr
        self.actor_dis_lr = actor_dis_lr
        self.critic_lr = critic_lr
        self.exp_scale = exp_scale
        self.action_noise = action_noise
        self.popart = popart
        self.gamma = gamma
        self.clip_norm = clip_norm
        self.nb_train_steps = nb_train_steps
        self.nb_dis_train_steps = nb_dis_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.nb_eval_steps = nb_eval_steps
        self.batch_size = batch_size
        self.dis_batch_size = dis_batch_size
        self.memory = memory
        self.tau = tau
        self.eval_env = eval_env
        self.param_noise_adaption_interval = param_noise_adaption_interval

        self.rank = MPI.COMM_WORLD.Get_rank()

        assert (np.abs(self.env.action_space.low) == self.env.action_space.high
                ).all()  # we assume symmetric actions.
        self.max_action = self.env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(
            self.max_action))

        self.agent = DDPG(self.prefix,
                          self.actor,
                          self.critic,
                          self.memory,
                          self.env.observation_space.shape,
                          self.env.action_space.shape,
                          gamma=self.gamma,
                          tau=self.tau,
                          normalize_returns=self.normalize_returns,
                          normalize_observations=self.normalize_observations,
                          batch_size=self.batch_size,
                          dis_batch_size=self.dis_batch_size,
                          action_noise=self.action_noise,
                          param_noise=self.param_noise,
                          critic_l2_reg=self.critic_l2_reg,
                          actor_lr=self.actor_lr,
                          actor_dis_lr=self.actor_dis_lr,
                          critic_lr=self.critic_lr,
                          exp_scale=self.exp_scale,
                          enable_popart=self.popart,
                          clip_norm=self.clip_norm,
                          reward_scale=self.reward_scale)
        logger.info('Using agent with the following configuration:')
        logger.info(str(self.agent.__dict__.items()))

        # Set up logging stuff only for a single worker.
        if self.rank == 0:
            self.saver = tf.train.Saver()
        else:
            self.saver = None

        self.step = 0
        self.episode = 0
        self.eval_episode_rewards_history = deque(maxlen=100)
        self.episode_rewards_history = deque(maxlen=100)
        #with U.single_threaded_session() as sess:
        #self.sess = U.single_threaded_session()
        # Prepare everything.
        self.agent.initialize(self.sess)
        #self.sess.graph.finalize()

        self.agent.reset()
        self.obs = self.env.reset()
        if self.eval_env is not None:
            self.eval_obs = eval_env.reset()
        self.done = False
        self.episode_reward = 0.
        self.episode_step = 0
        self.episodes = 0
        self.t = 0

        self.epoch = 0
        self.start_time = time.time()

        self.epoch_episode_rewards = []
        self.epoch_episode_steps = []
        self.epoch_episode_eval_rewards = []
        self.epoch_episode_eval_steps = []
        self.epoch_start_time = time.time()
        self.epoch_actions = []
        self.epoch_qs = []
        self.epoch_episodes = 0

        self.epoch_actor_losses = []
        self.epoch_critic_losses = []
        self.epoch_adaptive_distances = []
        self.eval_episode_rewards = []
        self.eval_qs = []
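As the log line in the constructor notes, the policy's actions are assumed to lie in [-1, 1] and are rescaled by the symmetric action-space bound before being sent to the environment. A small illustration with made-up bounds:

import numpy as np

max_action = np.array([2.0, 2.0])          # env.action_space.high, assumed symmetric
policy_action = np.array([0.5, -1.0])      # raw DDPG output in [-1, 1]
env_action = max_action * policy_action    # -> [ 1., -2.], what env.step() receives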
Ejemplo n.º 37
0
    parser.add_argument('--nb-train-steps', type=int, default=50)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-eval-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--nb-rollout-steps', type=int, default=100)  # per epoch cycle and MPI worker
    parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2')  # choices are adaptive-param_xx, ou_xx, normal_xx, none
    parser.add_argument('--logdir', type=str, default=None)
    boolean_flag(parser, 'gym-monitor', default=False)
    boolean_flag(parser, 'evaluation', default=True)
    boolean_flag(parser, 'bind-to-core', default=False)

    return vars(parser.parse_args())


if __name__ == '__main__':
    args = parse_args()

    # Figure out what logdir to use.
    if args['logdir'] is None:
        args['logdir'] = os.getenv('OPENAI_LOGDIR')
    
    # Print and save arguments.
    logger.info('Arguments:')
    for key in sorted(args.keys()):
        logger.info('{}: {}'.format(key, args[key]))
    logger.info('')
    if args['logdir']:
        with open(os.path.join(args['logdir'], 'args.json'), 'w') as f:
            json.dump(args, f)

    # Run actual script.
    run(**args)
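The boolean_flag helper used when building the parser above registers a paired --name / --no-name switch. A hedged sketch of such a helper (assumed implementation, shown only for clarity):

import argparse

def boolean_flag(parser, name, default=False, help=None):
    dest = name.replace('-', '_')
    parser.add_argument('--' + name, action='store_true', default=default, dest=dest, help=help)
    parser.add_argument('--no-' + name, action='store_false', dest=dest)

parser = argparse.ArgumentParser()
boolean_flag(parser, 'evaluation', default=True)
print(vars(parser.parse_args(['--no-evaluation'])))   # -> {'evaluation': False}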
Ejemplo n.º 38
0
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('uo_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.uo_stats = Normalizer(self.dimo + self.dimg,
                                       self.norm_eps,
                                       self.norm_clip,
                                       sess=self.sess)
        with tf.variable_scope('lo_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.lo_stats = Normalizer(2 * self.dimo,
                                       self.norm_eps,
                                       self.norm_clip,
                                       sess=self.sess)
        '''
        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        #choose only the demo buffer samples
        mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0)

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target', **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

        if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both
            maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? choose those samples only
            #define the cloning loss on the actor's actions only on the samples which adhere to the above masks
            self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight

        elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter
            self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

        else: #If  not training with demonstrations
            self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)
        '''
        # polyak averaging
        #self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        #self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        #self.init_target_net_op = list(
        #    map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        #self.update_target_net_op = list(
        #    map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
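The commented-out target-network ops above implement Polyak (soft) updates: target <- polyak * target + (1 - polyak) * main. A tiny NumPy illustration of one such update:

import numpy as np

polyak = 0.95
main_w = np.array([1.0, 2.0, 3.0])
target_w = np.zeros(3)
target_w = polyak * target_w + (1.0 - polyak) * main_w   # -> [0.05, 0.1 , 0.15]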
Ejemplo n.º 39
0
def learn(*,
          network,
          env,
          total_timesteps,
          eval_env=None,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          lr=3e-4,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          load_path=None,
          model_fn=None,
          update_fn=None,
          use_demo=False,
          demos=None,
          render_env=False,
          init_fn=None,
          mpi_rank_weight=1,
          comm=None,
          base_path=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                      specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                      tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                      neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                      See common/models.py/lstm for more details on using recurrent nets in policies

    env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                      The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


    nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                      nenv is number of environment copies simulated in parallel)

    total_timesteps: int              number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float                   policy entropy coefficient in the optimization objective

    lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                      training and 0 is the end of the training.

    vf_coef: float                    value function loss coefficient in the optimization objective

    max_grad_norm: float or None      gradient norm clipping coefficient

    gamma: float                      discounting factor

    lam: float                        advantage estimation discounting factor (lambda in the paper)

    log_interval: int                 number of updates between logging events

    nminibatches: int                 number of training minibatches per update. For recurrent policies,
                                      should be smaller than or equal to the number of environments run in parallel.

    noptepochs: int                   number of training epochs per update

    cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                      and 0 is the end of the training

    save_interval: int                number of updates between saving events

    load_path: str                    path to load the model from

    **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                                      For instance, 'mlp' network architecture has arguments num_hidden and num_layers.



    '''

    set_global_seeds(seed)

    if isinstance(lr, float): lr = constfn(lr)
    else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     nbatch_act=nenvs,
                     nbatch_train=nbatch_train,
                     nsteps=nsteps,
                     ent_coef=ent_coef,
                     vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm,
                     comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if save_interval and logger.get_dir():
        if base_path is None:
            base_path = os.path.dirname(os.path.abspath(__file__))
        if not os.path.isdir(osp.join(base_path, "models")):
            os.mkdir(osp.join(base_path, "models"))
        # with open(osp.join(base_path, "models", 'model.pkl'), 'wb') as fh:
        #     fh.write(cloudpickle.dumps(model))

    if load_path is not None:
        model.load(load_path)
    # Instantiate the runner object
    runner = Runner(env=env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    use_demo=use_demo,
                    demos=demos,
                    render_env=render_env)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env,
                             model=model,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch

    # initialize dictionary for data saving
    performance = {"reward": []}
    for name in model.loss_names:
        performance[name] = []
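    # performance collects per-update training curves: env rewards from each rollout plus
    # one series per loss in model.loss_names; it is pickled to base_path/performance.p
    # every time the model is saved below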

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
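        # frac anneals linearly from 1.0 on the first update toward 0.0 on the last and is
        # the argument passed to the lr and cliprange schedules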
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  #pylint: disable=E0632
        rewards = runner.mb_rewards
        # save reward data
        performance["reward"].extend(rewards)

        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run(
            )  #pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, compute the losses and append them.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # Step through the batch from 0 to nbatch in chunks of nbatch_train
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow,
                                                  *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
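            # flatinds[i] holds the flat batch indices of env i's nsteps transitions, so
            # selecting whole rows keeps each environment's trajectory contiguous and in
            # time order, which the recurrent policy needs alongside its hidden states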
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds]
                              for arr in (obs, returns, masks, actions, values,
                                          neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(
                        model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
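            # (explained_variance in baselines computes 1 - Var[returns - values] / Var[returns])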
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv(
                    'eval_eprewmean',
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)

            logger.dumpkvs()

        # save loss data
        for loss_name, lossval in zip(model.loss_names, lossvals):
            performance[loss_name].append(lossval)

        if save_interval and (update % save_interval == 0 or update
                              == 1) and logger.get_dir() and is_mpi_root:
            model_dir = osp.join(base_path, "models")
            os.makedirs(model_dir, exist_ok=True)
            savepath = osp.join(model_dir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            print("Saved model successfully.")
            performance_fname = os.path.join(base_path, "performance.p")
            with open(performance_fname, "wb") as f:
                pickle.dump(performance, f)

    model_dir = osp.join(base_path, "models")
    os.makedirs(model_dir, exist_ok=True)
    savepath = osp.join(model_dir, '%.5i' % update)
    print('Saving to', savepath)
    model.save(savepath)
    print("Saved model successfully.")
    performance_fname = os.path.join(base_path, "performance.p")
    with open(performance_fname, "wb") as f:
        pickle.dump(performance, f)

    return performance
Ejemplo n.º 40
0
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    if noise_type is not None:
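        # noise_type is a comma-separated spec; each entry is 'none', 'adaptive-param_<stddev>',
        # 'normal_<stddev>' or 'ou_<stddev>', e.g. 'normal_0.1' or the default 'adaptive-param_0.2'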
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  # scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                new_obs, r, done, info = env.step(
                    max_action * action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
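                # every param_noise_adaption_interval train steps, measure the action-space
                # distance between the perturbed and unperturbed policies and rescale the
                # parameter-noise stddev toward desired_action_stddev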
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/return_history_std'] = np.std(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
Ejemplo n.º 41
0
def view(env,
          seed=None,
          total_timesteps=None,
          reward_scale=1.0,
          render=False,
          render_eval=False,
          noise_type='adaptive-param_0.2',
          normalize_returns=False,
          normalize_observations=True,
          critic_l2_reg=1e-2,
          actor_lr=1e-4,
          critic_lr=1e-3,
          popart=False,
          gamma=0.99,
          clip_norm=None,
          nb_train_steps=50, # per epoch cycle and MPI worker
          nb_eval_steps=100,
          nb_save_epochs=None,
          batch_size=64, # per MPI worker
          tau=0.01,
          action_range=(-250.0, 250.0),
          observation_range=(-5.0, 5.0),
          eval_env=None,
          load_path=None,
          save_dir=None,
          param_noise_adaption_interval=50,
          **network_kwargs):

    set_global_seeds(seed)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))
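    # The specs below describe the actor and critic networks layer by layer; 'nodes_in'/'nodes_out'
    # name the graph nodes each layer reads from and writes to (everything is routed through 'main',
    # and the critic first concatenates the actions with the agent's own observation).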
    
    network_spec = [
        {
            'layer_type': 'dense',
            'units': 256,
            'activation': 'relu',
            'nodes_in': ['main'],
            'nodes_out': ['main']
        },
        {
            'layer_type': 'dense',
            'units': 128,
            'activation': 'relu',
            'nodes_in': ['main'],
            'nodes_out': ['main']
        },
        {
            'layer_type': 'dense',
            'units': 1,
            'activation': 'tanh',
            'nodes_in': ['main'],
            'nodes_out': ['main']
        }
    ]
    
    vnetwork_spec = [
        {
            'layer_type': 'concat',
            'nodes_in': ['action_movement', 'observation_self'],
            'nodes_out': ['main']
        },
        {
            'layer_type': 'dense',
            'units': 256,
            'activation': 'relu',
            'nodes_in': ['main'],
            'nodes_out': ['main']
        },
        {
            'layer_type': 'dense',
            'units': 128,
            'activation': 'relu',
            'nodes_in': ['main'],
            'nodes_out': ['main']
        },
        {
            'layer_type': 'dense',
            'units': 1,
            'activation': '',
            'nodes_in': ['main'],
            'nodes_out': ['main']
        }
    ]

    network = DdpgPolicy(scope="ddpg", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                 stochastic=False, reuse=False, build_act=True,
                 trainable_vars=None, not_trainable_vars=None,
                 gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999,
                 normalize_observations=normalize_observations, normalize_returns=normalize_returns,
                 observation_range=observation_range)
    
    target_network = DdpgPolicy(scope="target", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                 stochastic=False, reuse=False, build_act=True,
                 trainable_vars=None, not_trainable_vars=None,
                 gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999,
                 normalize_observations=normalize_observations, normalize_returns=normalize_returns,
                 observation_range=observation_range)

    max_action = action_range[1]
    logger.info('scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(network, target_network, memory, env.observation_space, env.action_space,
        gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations,
        batch_size=batch_size, action_noise=None, param_noise=None, critic_l2_reg=critic_l2_reg,
        actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
        reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    sess = U.get_session()

    loader = functools.partial(load_variables, sess=sess)
    if load_path is not None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    nenvs = env.num_envs
    obs = env.reset()
    n_agents = obs['observation_self'].shape[0]

    for epoch in range(total_timesteps):
        agent.reset()
        obs = env.reset()
        while True:
            action, q, _, _ = agent.step(obs, apply_noise=False, compute_Q=False)
            # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
            for k, v in action.items():
                action[k] *= max_action
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {'action_movement' : action['action_movement'][i*n_agents:(i + 1)*n_agents]}
                nenvs_actions.append(nenv_action)
            print(nenvs_actions)
            obs, r, done, info = env.step(nenvs_actions)
            env.render()
            if any(done):
                break
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Ejemplo n.º 43
0
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  ## now this counts steps in testing runs
    use_vf_clipping = True

    ## From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(
        description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    ## default start_level set to 1000 so testing runs on levels unseen during training
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)

    args = parser.parse_args()
    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
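    # comm.Split(color=0, key=0) puts every rank in a single group ordered by original rank,
    # so only the global rank-0 worker gets log_comm rank 0 and emits csv/stdout/log output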
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    os.makedirs(logpath, exist_ok=True)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")

    venv = VecMonitor(
        venv=venv,
        filename=None,
        keep_buf=100,
    )
    venv = VecNormalize(venv=venv, ob=False)
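    # Wrapper stack: VecExtractDictObs pulls the 'rgb' frames out of the dict observations,
    # VecMonitor tracks episode returns and lengths, and VecNormalize(ob=False) normalizes
    # returns only, leaving observations untouched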

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    ## Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model pramas loaded from save")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time() ## Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(
        )  ## different from random_ppo!
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)

        logger.info('----\n')
        logger.dumpkvs()
    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
Ejemplo n.º 44
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critics,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.005,
          eval_env=None,
          param_noise_adaption_interval=50,
          initial_random_steps=1e4,
          policy_and_target_update_period=1,
          use_mpi_adam=False,
          stop_actor_steps=None,
          stop_critic_steps=None,
          **kwargs):

    if kwargs:
        logger.info("Warning: redundant hyper-parameters: " + str(kwargs))

    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = EnsembleDDPG(actor,
                         critics,
                         memory,
                         env.observation_space.shape,
                         env.action_space.shape,
                         gamma=gamma,
                         tau=tau,
                         normalize_returns=normalize_returns,
                         normalize_observations=normalize_observations,
                         batch_size=batch_size,
                         action_noise=action_noise,
                         param_noise=param_noise,
                         critic_l2_reg=critic_l2_reg,
                         actor_lr=actor_lr,
                         critic_lr=critic_lr,
                         enable_popart=popart,
                         clip_norm=clip_norm,
                         reward_scale=reward_scale,
                         use_mpi_adam=use_mpi_adam)
    logger.info('Using agent with the following configuration:')

    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
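                    # warm-up: for the first initial_random_steps environment steps, act
                    # uniformly at random in [-1, 1] to seed the replay memory before
                    # switching to the (noisy) policy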
                    if t < initial_random_steps:
                        action = np.random.uniform(low=-1,
                                                   high=1,
                                                   size=env.action_space.shape)
                    else:
                        action, _ = agent.pi(obs,
                                             apply_noise=True,
                                             compute_Q=False)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    # use take_update flag to control actor training
                    take_update = (t_train %
                                   policy_and_target_update_period == 0)
                    stop_actor_training = False
                    stop_critic_training = False
                    if stop_actor_steps is not None and t > stop_actor_steps:
                        stop_actor_training = True
                    if stop_critic_steps is not None and t > stop_critic_steps:
                        stop_critic_training = True

                    cl, al = agent.train(take_update, stop_critic_training,
                                         stop_actor_training)
                    epoch_critic_losses.append(cl)

                    # use take_update flag to control target updating
                    if take_update:
                        epoch_actor_losses.append(al)
                        agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/return_latest'] = np.mean(
                epoch_episode_rewards[-1])
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Ejemplo n.º 45
0
def log_params(params, logger=logger):
    for key in sorted(params.keys()):
        logger.info('{}: {}'.format(key, params[key]))
Ejemplo n.º 46
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank==0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))