Code example #1
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/pam/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())
    pam_arg = args.copy()

    if 'margin' not in pam_arg.keys():
        best_margin = pool.map(find_best_margin, make_arg_list(pam_arg))
        best_margin = np.mean(best_margin, 0)
        if 'verbose' in pam_arg.keys() and pam_arg['verbose']:
            for i in range(len(best_margin)):
                logger.record_tabular(f'[PAM] margin = {MARGINS[i]}',
                                      best_margin[i])
            logger.dump_tabular()
        best_margin = MARGINS[best_margin.argmax()]
        logger.record_tabular('[PAM] best margin', best_margin)
        pam_arg['margin'] = best_margin

    results_pam = pool.map(run_pam, make_arg_list(pam_arg))

    logger.record_tabular('[PAM] accuracy mean', np.mean(results_pam))
    logger.record_tabular('[PAM] accuracy max', np.max(results_pam))
    logger.record_tabular('[PAM] accuracy min', np.min(results_pam))
    logger.record_tabular('[PAM] accuracy std', np.std(results_pam))
    logger.dump_tabular()
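
All of the examples on this page share the same tabular logging pattern: buffer key/value pairs with record_tabular and flush them as one row with dump_tabular. The minimal sketch below shows that flow in isolation; it assumes the OpenAI baselines logger (example #8 imports it as L), while most other examples use a project-local logger exposing the same interface, and the metric values are made up for illustration.

import numpy as np
from baselines import logger

def log_accuracy_summary(results, prefix='[DEMO]'):
    # Buffer key/value pairs for the current row ...
    logger.record_tabular(f'{prefix} accuracy mean', np.mean(results))
    logger.record_tabular(f'{prefix} accuracy std', np.std(results))
    # ... then write them out as a single table row.
    logger.dump_tabular()

if __name__ == '__main__':
    logger.configure('logs/demo', format_strs=['stdout', 'csv'])
    log_accuracy_summary([0.91, 0.88, 0.93])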
Code example #2
File: run_svm.py  Project: apricotxingya/peer_loss
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/svm/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())
    svm_arg = args.copy()

    if 'C1' not in svm_arg.keys():
        best_c1 = pool.map(find_best_c1, make_arg_list(svm_arg))
        best_c1 = np.mean(best_c1, 0)
        if 'verbose' in svm_arg.keys() and svm_arg['verbose']:
            for i in range(len(best_c1)):
                logger.record_tabular(f'[C-SVM] C1 = {CLASS_WEIGHTS[i]}',
                                      best_c1[i])
            logger.dump_tabular()
        best_c1 = CLASS_WEIGHTS[best_c1.argmax()]
        logger.record_tabular('[C-SVM] best C1', best_c1)
        svm_arg['C1'] = best_c1

    results_svm = pool.map(run_c_svm, make_arg_list(svm_arg))

    logger.record_tabular('[C-SVM] accuracy mean', np.mean(results_svm))
    logger.record_tabular('[C-SVM] accuracy max', np.max(results_svm))
    logger.record_tabular('[C-SVM] accuracy min', np.min(results_svm))
    logger.record_tabular('[C-SVM] accuracy std', np.std(results_svm))
    logger.dump_tabular()
Code example #3
    def call(self, on_policy):
        env_runner, model, buffer, steps = self.env_runner, self.model, \
            self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = env_runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(env_runner.batch_ob_shape)
        actions = actions.reshape([env_runner.nbatch])
        rewards = rewards.reshape([env_runner.nbatch])
        mus = mus.reshape([env_runner.nbatch, env_runner.nact])
        dones = dones.reshape([env_runner.nbatch])
        masks = masks.reshape([env_runner.batch_ob_shape[0]])

        names_ops, values_ops = model.predict(
            obs,
            actions,
            rewards,
            dones,
            mus,
            model.initial_state,
            masks,
            steps
        )

        if on_policy and (int(steps/env_runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get
            # done=True at each loss of life, not just at the terminal
            # state.  Thus, this is mean until end of life, not end of
            # episode.  For true episode rewards, see the monitor
            # files in the log folder.
            logger.record_tabular(
                "mean_episode_length",
                self.episode_stats.mean_length()
            )
            logger.record_tabular(
                "mean_episode_reward",
                self.episode_stats.mean_reward()
            )
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
Code example #4
File: run_nn.py  Project: apricotxingya/peer_loss
def find_best_alpha_val(kargs):
    if len(kargs['alpha']) == 1:
        return {'alpha': kargs['alpha'][0]}
    args = kargs.copy()
    pool = mp.Pool(mp.cpu_count())
    results = []
    for alpha in kargs['alpha']:
        args['alpha'] = alpha
        res = [
            res['val_acc']
            for res in pool.map(run_nn_peer_val, make_arg_list(args))
        ]
        res = np.mean(res, axis=0)[-1]
        if 'verbose' in args.keys() and args['verbose']:
            logger.record_tabular(f'[PEER] alpha = {alpha}', res)
        results.append(res)
    pool.close()
    pool.join()
    logger.dump_tabular()
    best_alpha = kargs['alpha'][np.argmax(results)]
    return {'alpha': best_alpha}
Code example #5
File: eac.py  Project: yimingpeng/sac-master
    def train(self):
        """
        CG: the function that conducts ensemble training.
        :return: 
        """
        # Set up parameters for the training process.
        self._n_epochs = self._base_ac_params['n_epochs']
        self._epoch_length = self._base_ac_params['epoch_length']
        self._n_train_repeat = self._base_ac_params['n_train_repeat']
        self._n_initial_exploration_steps = self._base_ac_params[
            'n_initial_exploration_steps']
        self._eval_render = self._base_ac_params['eval_render']
        self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
        self._eval_deterministic = self._base_ac_params['eval_deterministic']

        # Set up the evaluation environment.
        if self._eval_n_episodes > 0:
            with tf.variable_scope("low_level_policy", reuse=True):
                self._eval_env = deep_clone(self._env)

        # Import required libraries for training.
        import random
        import math
        import operator
        import numpy as np

        # Initialize the sampler.
        alg_ins = random.choice(self._alg_instances)
        self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

        # Perform the training/evaluation process.
        num_episode = 0.
        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.log('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    isEpisodeEnd = self._sampler.sample()

                    # If an episode has ended, update the performance statistics of the
                    # current AC instance and pick another AC instance for the next
                    # episode of exploration.
                    if isEpisodeEnd:
                        num_episode = num_episode + 1.
                        alg_ins[1] = 0.9 * alg_ins[
                            1] + 0.1 * self._sampler._last_path_return
                        alg_ins[2] = alg_ins[2] + 1.

                        if self._use_ucb:
                            # Select an algorithm instance based on UCB
                            # (see the standalone sketch after this example).
                            selected = False
                            for ains in self._alg_instances:
                                if ains[2] < 1.:
                                    alg_ins = ains
                                    selected = True
                                    break
                                else:
                                    ains[3] = ains[1] + math.sqrt(
                                        2.0 * math.log(num_episode) / ains[2])

                            if not selected:
                                alg_ins = max(self._alg_instances,
                                              key=operator.itemgetter(3))

                        else:
                            # Select an algorithm instance uniformly at random.
                            alg_ins = random.choice(self._alg_instances)
                            self._sampler.set_policy(alg_ins[0].policy)

                    if not self._sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    # ================
                    # Perform training.
                    # ================
                    for i in range(self._n_train_repeat):
                        batch = self._sampler.random_batch()

                        # ====================================
                        # Perform training over all AC instances.
                        # ====================================
                        for ains in self._alg_instances:
                            ains[0]._do_training(iteration=t +
                                                 epoch * self._epoch_length,
                                                 batch=batch)

                        # =================================================
                        # Perform training of the action-selection Q-function.
                        # =================================================
                        # Set up the feed dictionary.
                        feed_dict = {
                            self._observations_ens_ph:
                            batch['observations'],
                            self._obv_act_ph:
                            batch['actions'],
                            self._observations_ens_next_ph:
                            batch['next_observations'],
                            self._rewards_ph:
                            batch['rewards'],
                            self._terminals_ph:
                            batch['terminals'],
                        }
                        for i, ains in enumerate(self._alg_instances):
                            with ains[0].policy.deterministic(
                                    self._eval_deterministic):
                                feed_dict[self._acts_next_phs[i]] = ains[
                                    0].policy.get_actions(
                                        batch['next_observations'])

                        # Perform training on the action-selection Q-function.
                        self._sess.run(self._q_ens_train_operator, feed_dict)

                    gt.stamp('train')

                # ============================================================
                # Perform evaluation after one full epoch of training is completed.
                # ============================================================
                if self._eval_n_episodes < 1:
                    continue

                if self._evaluation_strategy == 'ensemble':
                    # Use a whole ensemble of AC instances for evaluation.
                    paths = rollouts(self._eval_env, self,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)

                elif self._evaluation_strategy == 'best-policy':
                    # Choose the AC instance with the highest observed performance so far for evaluation.
                    eval_alg_ins = max(self._alg_instances,
                                       key=operator.itemgetter(1))
                    with eval_alg_ins[0].policy.deterministic(
                            self._eval_deterministic):
                        paths = rollouts(self._eval_env,
                                         eval_alg_ins[0].policy,
                                         self._sampler._max_path_length,
                                         self._eval_n_episodes)

                else:
                    paths = None

                if paths is not None:
                    total_returns = [path['rewards'].sum() for path in paths]
                    episode_lengths = [len(p['rewards']) for p in paths]
                    logger.record_tabular('return-average',
                                          np.mean(total_returns))
                    logger.record_tabular('return-min', np.min(total_returns))
                    logger.record_tabular('return-max', np.max(total_returns))
                    logger.record_tabular('return-std', np.std(total_returns))
                    logger.record_tabular('episode-length-avg',
                                          np.mean(episode_lengths))
                    logger.record_tabular('episode-length-min',
                                          np.min(episode_lengths))
                    logger.record_tabular('episode-length-max',
                                          np.max(episode_lengths))
                    logger.record_tabular('episode-length-std',
                                          np.std(episode_lengths))

                    self._eval_env.log_diagnostics(paths)
                    if self._eval_render:
                        self._eval_env.render(paths)

                # Produce log info after each epoch of training and evaluation.
                times_itrs = gt.get_times().stamps.itrs
                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self._sampler.log_diagnostics()

                logger.dump_tabular()
                # logger.pop_prefix()

                gt.stamp('eval')

            # Terminate the sampler after the training process is completed.
            self._sampler.terminate()
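
The UCB branch in the example above keeps, for every AC instance, a running return estimate (index 1), an episode count (index 2), and a UCB score (index 3). The standalone sketch below restates that selection rule outside the training loop; the list layout and the ucb_select name are assumptions made for illustration, not part of the original code.

import math

def ucb_select(instances, num_episode):
    # Each instance is [algorithm, running_return, episode_count, ucb_score],
    # mirroring the four-element lists used above (layout assumed).
    for ins in instances:
        if ins[2] < 1.0:
            # Try every instance at least once before trusting the scores.
            return ins
    for ins in instances:
        # UCB1: empirical value plus an exploration bonus that shrinks
        # as the instance accumulates episodes.
        ins[3] = ins[1] + math.sqrt(2.0 * math.log(num_episode) / ins[2])
    return max(instances, key=lambda ins: ins[3])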
Code example #6
File: learner.py  Project: AltmanD/rl-framework
def main():
    # Parse input parameters
    args, unknown_args = parser.parse_known_args()
    args.num_steps = int(args.num_steps)
    unknown_args = parse_cmdline_kwargs(unknown_args)

    # Load config file
    load_yaml_config(args, 'learner')

    # Expose socket to actor(s)
    context = zmq.Context()
    weights_socket = context.socket(zmq.PUB)
    weights_socket.bind(f'tcp://*:{args.param_port}')

    _, agent = init_components(args, unknown_args)

    # Configure experiment directory
    create_experiment_dir(args, 'LEARNER-')
    save_yaml_config(args.exp_path / 'config.yaml', args, 'learner', agent)
    args.log_path = args.exp_path / 'log'
    args.ckpt_path = args.exp_path / 'ckpt'
    args.ckpt_path.mkdir()
    args.log_path.mkdir()

    logger.configure(str(args.log_path))

    # Record commit hash
    with open(args.exp_path / 'hash', 'w') as f:
        f.write(
            str(
                subprocess.run('git rev-parse HEAD'.split(),
                               stdout=subprocess.PIPE).stdout.decode('utf-8')))

    # Variables to control the frequency of training
    receiving_condition = multiprocessing.Condition()
    num_receptions = multiprocessing.Value('i', 0)

    # Start memory pool in another process
    manager = MemPoolManager()
    manager.start()
    mem_pool = manager.MemPool(capacity=args.pool_size)
    Process(target=recv_data,
            args=(args.data_port, mem_pool, receiving_condition,
                  num_receptions, args.keep_training)).start()

    # Print throughput statistics
    Process(target=MultiprocessingMemPool.record_throughput,
            args=(mem_pool, args.record_throughput_interval)).start()

    freq = 0
    learn_flag = 0
    while True:
        if learn_flag == 0:
            weights_socket.send(pickle.dumps(agent.get_weights()))

        if len(mem_pool) >= args.batch_size:

            # Sync weights to actor
            weights = agent.get_weights()
            if hvd.rank() == 0:
                weights_socket.send(pickle.dumps(weights))

            if freq % args.ckpt_save_freq == 0:
                if args.ckpt_save_type == 'checkpoint':
                    agent.save(args.ckpt_path / 'ckpt')
                elif args.ckpt_save_type == 'weight':
                    with open(args.ckpt_path / 'weight.ckpt', 'wb') as f:
                        pickle.dump(weights, f)

            if args.keep_training:
                agent.learn(mem_pool.sample(size=args.batch_size))
            else:
                with receiving_condition:
                    while num_receptions.value < args.training_freq:
                        receiving_condition.wait()
                    data = mem_pool.sample(size=args.batch_size)
                    num_receptions.value -= args.training_freq
                # Training
                stat = agent.learn(data)
                learn_flag = 1
                if stat is not None:
                    for k, v in stat.items():
                        logger.record_tabular(k, v)
                logger.dump_tabular()

            freq += 1
Code example #7
def train(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval_freq,
          hid1_mult, init_policy_logvar, seed):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches between evaluations
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        init_policy_logvar: natural log of initial policy variance
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed) # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create scaler
    scaler = Scaler(obs_dim)

    # create policy
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, init_policy_logvar).to(device)

    # create value_function
    value_function = ValueFunction(obs_dim, hid1_mult).to(device)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)

    # train & test models
    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # rollout
            trajectories, steps = run_policy(env, policy, scaler, episodes=batch_size)
            # process data
            current_episodes += len(trajectories)
            current_steps += steps
            add_value(trajectories, value_function)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculate discounted sums of rewards
            add_gae(trajectories, gamma, lam)  # calculate advantages (see the GAE sketch after this example)
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            train_returns = [np.sum(t["rewards"]) for t in trajectories]
            logger.info('[train] average return:{0}, std return: {1}'.format(np.mean(train_returns), np.std(train_returns)))
            # add various stats to training log:
            #log_batch_stats(observes, actions, advantages, disc_sum_rew)
            # update policy
            policy.update(observes, actions, advantages)  # update policy
            # update value function
            value_function.update(observes, disc_sum_rew)  # update value function

        # test models
        num_test_episodes = 10
        trajectories, _ = run_policy(env, policy, scaler, episodes=num_test_episodes)
        avg_return = np.mean([np.sum(t["rewards"]) for t in trajectories])
        std_return = np.std([np.sum(t["rewards"]) for t in trajectories])
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
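
The rollout-processing helpers above (add_value, add_disc_sum_rew, add_gae) are not shown in this excerpt. As a reference for what a GAE step typically computes, the sketch below derives advantages for a single trajectory; it assumes the trajectory ends in a terminal state and is not a copy of this project's implementation.

import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    # rewards: shape (T,); values: shape (T,) with the value estimate V(s_t).
    # Appending 0 treats the final state as terminal (an assumption here).
    values = np.append(values, 0.0)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        # A_t = delta_t + gamma * lam * A_{t+1}
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages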
Code example #8
def main():
    L.configure('/home/metalabadmin/exp/freeway',
                format_strs=['stdout', 'csv', 'tensorboard'])
    env = gym.make('Freeway-v0')
    env = wrapper.wrap_deepmind(env, frame_stack=True, scale=True)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    network = Q_network(env.observation_space,
                        env.action_space.n,
                        optimizer,
                        gamma=0.99,
                        scope='freeway')
    m_controller = MetaController(network, env.action_space.n)
    # Create the schedule for exploration, annealed linearly from 1.0
    # (see the standalone sketch after this example).
    exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7),
                                 initial_p=1.0,
                                 final_p=0.02)
    replay = ReplayBuffer(50000)
    # get default tf_session
    sess = U.get_session()
    U.initialize()
    sess.run(m_controller.network.update_target_op)
    step = 0
    episodes = 0
    rewards = 0
    mean_100ep_reward = 0
    total_reward = []
    saved_mean_reward = None
    ob = env.reset()

    while step <= 1e7:
        ep = exploration.value(step)
        ob_reshaped = np.reshape(ob, (1, ) + env.observation_space.shape)
        act = m_controller.sample_act(sess, ob_reshaped, update_eps=ep)[0]
        ob_tp1, reward_t, done_t, info = env.step(act)
        env.render()
        rewards += reward_t
        replay.add(ob, act, reward_t, ob_tp1, float(done_t))
        ob = ob_tp1

        # train every 4 steps
        if step >= 1000 and step % 4 == 0:
            obs, acts, rewards_t, obs_tp1, dones_t = replay.sample(64)
            weights, batch_idxes = np.ones_like(rewards_t), None
            # get q estimate for tp1 as 'supervised'
            obs_tp1_reshaped = np.reshape(obs_tp1,
                                          (64, ) + env.observation_space.shape)
            q_tp1 = m_controller.get_q(sess, obs_tp1_reshaped)[0]
            td_error = m_controller.train(sess, obs, acts, rewards_t, obs_tp1,
                                          dones_t, weights, q_tp1)

        step += 1

        if step >= 1000 and step % 1000 == 0:
            sess.run(m_controller.network.update_target_op)

        if done_t:
            ob = env.reset()
            total_reward.append(rewards)
            episodes += 1
            rewards = 0
            print('step %d done %s, ep %.2f' % (step, str(done_t), ep))
            mean_100ep_reward = round(np.mean(total_reward[-101:-1]), 1)
            if episodes % 10 == 0 and episodes != 0:
                print('date time %s' % str(datetime.now()))
                L.record_tabular("steps", step)
                L.record_tabular("episodes", episodes)
                L.record_tabular("mean 100 episode reward", mean_100ep_reward)
                L.dump_tabular()

        if step % 1000 == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".
                      format(saved_mean_reward, mean_100ep_reward))
                U.save_variables('./freewaymodel.ckpt')
                model_saved = True
                saved_mean_reward = mean_100ep_reward
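
LinearSchedule above anneals the exploration rate linearly from initial_p to final_p over schedule_timesteps steps and then holds it constant. The sketch below is an equivalent standalone implementation, shown only for reference; the real class is imported from the project's schedules module (in OpenAI baselines it is baselines.common.schedules.LinearSchedule).

class LinearScheduleSketch:
    """Minimal re-implementation of the linear annealing schedule used above."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, capped at 1.0 so the value
        # stays at final_p after schedule_timesteps.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# Epsilon is 1.0 at step 0 and stays at 0.02 from step 1e6 onward.
exploration = LinearScheduleSketch(int(0.1 * 1e7), initial_p=1.0, final_p=0.02)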
Code example #9
        # Initialize environment and reward type
        env = gym.make(args['gym_env'],
                       reward_type=args['reward_type'],
                       set_additional_goal=args['set_additional_goal'])

        # Set random seeds in the hope of reproducibility
        env.seed(args['seed'])
        np.random.seed(args['seed'])
        tf.set_random_seed(args['seed'])

        logger.record_tabular("algo", args['algo'])
        logger.record_tabular("env", args['gym_env'])
        logger.record_tabular("env.set_additional_goal",
                              env.set_additional_goal)
        logger.record_tabular("env.reward_type", env.reward_type)
        logger.dump_tabular()

        if args['algo'] == "ppo":
            # Make necessary directories
            maybe_mkdir(args['RUN_DIR'])
            maybe_mkdir(args['MODEL_DIR'])
            maybe_mkdir(args['FIGURE_DIR'])
            maybe_mkdir(args['RESULT_DIR'])
            ppo_params_json = os.environ[
                'PROJ_HOME_3'] + '/ppo1/ppo_params.json'

            # Start to train the policy from scratch
            # trained_policy = run(env=env, algorithm=ppo, params=ppo_params_json, args=args)
            # trained_policy.save_model(args['MODEL_DIR'])

            # Load model and continue training
Code example #10
File: pposgd_simple.py  Project: SFU-MARS/SL_optCtrl
def learn(
    env,
    policy_fn,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,  # clipping parameter epsilon
    entcoeff,  # entropy coefficient
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP); see the NumPy sketch after this example
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
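
The clipped surrogate built above from ratio, surr1, and surr2 is PPO's L^CLIP objective. The NumPy sketch below computes the same quantity for a batch outside of TensorFlow, purely to make the formula explicit; the function name and arguments are illustrative.

import numpy as np

def ppo_clip_loss(logp_new, logp_old, advantages, clip_param):
    # ratio = pi_new(a|s) / pi_old(a|s), recovered from log-probabilities.
    ratio = np.exp(logp_new - logp_old)
    surr1 = ratio * advantages
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # PPO maximizes the pessimistic (element-wise minimum) surrogate,
    # so the loss to minimize is its negated mean.
    return -np.mean(np.minimum(surr1, surr2))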
Code example #11
File: run_deepq.py  Project: kirk86/baselines
def fit(
        env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        checkpoint_path=None,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None
):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration
        rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version
        is restored at the end of the training. If you do not wish to
        restore the best version at the end of the training set this
        variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from
        initial value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load
        it.  See header of baselines/deepq/categorical.py for details
        on the act function.
    """
    # Create all the functions necessary to train the model

    model = DeepDQN()
    sess = model.init_session().__enter__()

    # capture the shape outside the closure so that the env object is
    # not serialized by cloudpickle when serializing make_obs_ph

    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = model.build_train(
        make_obs_ph,
        q_func,
        env.action_space.n,
        tf.train.AdamOptimizer(learning_rate=lr),
        10,
        gamma,
        param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    model.init_vars()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        if tf.train.latest_checkpoint(td) is not None:
            model.load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence
                # between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with
                # eps = exploration.value(t).  See Appendix C.1 in
                # Parameter Space Noise for Exploration, Plappert et
                # al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = \
                    update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(
                np.array(obs)[None], update_eps=update_eps, **kwargs
            )[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t)
                    )
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(
                    obses_t,
                    actions,
                    rewards,
                    obses_tp1,
                    dones,
                    weights
                )
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(
                        batch_idxes,
                        new_priorities
                    )

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}".
                            format(saved_mean_reward, mean_100ep_reward)
                        )
                    model.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward)
                )
            model.load_state(model_file)

    return act
Code example #12
File: run_nn.py  Project: apricotxingya/peer_loss
def run(args):
    prefix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    logger.configure(f'logs/{args["dataset"]}/nn/{prefix}')
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())

    nn_arg = args.copy()
    nn_arg.update(find_best_params(nn_arg))
    nn_arg.update(find_best_alpha_val(nn_arg))
    logger.record_tabular('[PEER] batchsize', nn_arg['batchsize'])
    logger.record_tabular('[PEER] learning rate', nn_arg['lr'])
    logger.record_tabular('[PEER] hidsize', nn_arg['hidsize'])
    logger.record_tabular('[PEER] alpha', nn_arg['alpha'])
    logger.dump_tabular()

    nn_arg['seed'] = 1
    run_nn_dmi(nn_arg)
    results_dmi = pool.map(run_nn_dmi, make_arg_list(nn_arg))
    results_surr = pool.map(run_nn_surr, make_arg_list(nn_arg))
    results_nn = pool.map(run_nn, make_arg_list(nn_arg))
    results_peer = pool.map(run_nn_peer, make_arg_list(nn_arg))
    results_symm = pool.map(run_nn_symm, make_arg_list(nn_arg))
    pool.close()
    pool.join()

    test_acc_bce = [res['val_acc'] for res in results_nn]
    test_acc_peer = [res['val_acc'] for res in results_peer]
    test_acc_surr = [res['val_acc'] for res in results_surr]
    test_acc_symm = [res['val_acc'] for res in results_symm]
    test_acc_dmi = [res['val_acc'] for res in results_dmi]

    plot([
        test_acc_bce, test_acc_peer, test_acc_surr, test_acc_symm, test_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss',
        'dmi loss'
    ],
         title='Accuracy During Testing',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    train_acc_bce = [res['train_acc'] for res in results_nn]
    train_acc_peer = [res['train_acc'] for res in results_peer]
    train_acc_surr = [res['train_acc'] for res in results_surr]
    train_acc_symm = [res['train_acc'] for res in results_symm]
    train_acc_dmi = [res['train_acc'] for res in results_dmi]

    plot([
        train_acc_bce, train_acc_peer, train_acc_surr, train_acc_symm,
        train_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss',
        'dmi loss'
    ],
         title='Accuracy During Training',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    loss_acc_surr = [res['loss'] for res in results_surr]
    loss_acc_bce = [res['loss'] for res in results_nn]
    loss_acc_peer = [res['loss'] for res in results_peer]
    loss_acc_symm = [res['loss'] for res in results_symm]
    loss_acc_dmi = [res['loss'] for res in results_dmi]

    plot([
        loss_acc_bce, loss_acc_peer, loss_acc_surr, loss_acc_symm, loss_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss',
        'dmi loss'
    ],
         title='Loss',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    logger.record_tabular('[NN] with peer loss', np.mean(test_acc_peer, 0)[-1])
    logger.record_tabular('[NN] with surrogate loss',
                          np.mean(test_acc_surr, 0)[-1])
    logger.record_tabular('[NN] with symmetric loss',
                          np.mean(test_acc_symm, 0)[-1])
    logger.record_tabular('[NN] with dmi loss', np.mean(test_acc_dmi, 0)[-1])
    logger.record_tabular(f'[NN] with {args["loss"]} loss',
                          np.mean(test_acc_bce, 0)[-1])
    logger.dump_tabular()
Code example #13
File: actor.py  Project: AltmanD/rl-framework
def run_one_actor(index, args, unknown_args, actor_status):
    import tensorflow.compat.v1 as tf
    from tensorflow.keras.backend import set_session

    # Set 'allow_growth'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and model instance
    env = get_env(args.env, args.num_envs, **unknown_args)
    model = get_model(env, args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Initialize values
    model_id = -1
    episode_infos = deque(maxlen=100)
    num_episode = 0
    state = env.reset()

    nupdates = args.num_steps // args.max_steps_per_update

    model_init_flag = 0
    for update in range(1, nupdates + 1):
        # Update weights
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            model.set_weights(new_weights)
            model_init_flag = 1
        elif model_init_flag == 0:
            continue

        # Collect data
        mb_states, mb_actions, mb_rewards, mb_dones, mb_extras = [], [], [], [], []
        start_time = time.time()
        for _ in range(args.max_steps_per_update):

            mb_states.append(state)

            # Sample action
            action, value, neglogp = model.forward(state)
            extra_data = {'value': value, 'neglogp': neglogp}
            state, reward, done, info = env.step(action)

            mb_actions.append(action)
            mb_rewards.append(reward)
            mb_dones.append(done)
            mb_extras.append(extra_data)

            for info_i in info:
                maybeepinfo = info_i.get('episode')
                if maybeepinfo:
                    episode_infos.append(maybeepinfo)
                    num_episode += 1

        mb_states = np.asarray(mb_states, dtype=state.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_dones = np.asarray(mb_dones, dtype=bool)

        # Adjust the data format and send it to the learner over the REQ socket
        # (the request/reply handshake is sketched after this example)
        data = prepare_training_data(
            model,
            [mb_states, mb_actions, mb_rewards, mb_dones, state, mb_extras])
        socket.send(serialize(data).to_buffer())
        socket.recv()

        send_data_interval = time.time() - start_time
        # Log information
        logger.record_tabular("steps", update * args.max_steps_per_update)
        logger.record_tabular("episodes", num_episode)
        logger.record_tabular(
            "mean 100 episode reward",
            round(np.mean([epinfo['reward'] for epinfo in episode_infos]), 2))
        logger.record_tabular(
            "mean 100 episode length",
            round(np.mean([epinfo['length'] for epinfo in episode_infos]), 2))
        logger.record_tabular("send data interval", send_data_interval)
        logger.record_tabular("send data fps",
                              args.max_steps_per_update // send_data_interval)
        logger.record_tabular("total steps",
                              nupdates * args.max_steps_per_update)
        logger.dump_tabular()

    actor_status[index] = 1
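
The actor above ships each rollout to the learner over a ZeroMQ REQ socket and waits for an (empty) reply before continuing. The sketch below shows that request/reply handshake in isolation; the recv_rollouts name and plain pickle serialization are stand-ins, since the project uses its own serialize helper and a separate recv_data process on the learner side.

import pickle
import zmq

def send_rollout(socket, data):
    # REQ sockets must strictly alternate send and recv, hence the empty reply.
    socket.send(pickle.dumps(data))
    socket.recv()  # acknowledgement from the learner

def recv_rollouts(port):
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(f'tcp://*:{port}')
    while True:
        data = pickle.loads(socket.recv())
        socket.send(b'')  # unblock the actor immediately
        yield data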
Code example #14
File: run_trpo.py  Project: kirk86/baselines
def fit(
        model,
        env,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None):
    # Setup losses and stuff
    # ----------------------------------------
    # nworkers = MPI.COMM_WORLD.Get_size()
    # rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    th_init = model.get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    model.set_from_flat(th_init)
    model.vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = model.traj_segment_generator(model.pi,
                                           env,
                                           timesteps_per_batch,
                                           stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with model.timed("sampling"):
            seg = seg_gen.__next__()

        model.add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(model.pi, "ret_rms"):
            model.pi.ret_rms.update(tdlamret)
        if hasattr(model.pi, "ob_rms"):
            model.pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return model.allmean(model.compute_fvp(p, *
                                                   fvpargs)) + cg_damping * p

        model.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        with model.timed("computegrad"):
            *lossbefore, g = model.compute_lossandgrad(*args)
        lossbefore = model.allmean(np.array(lossbefore))
        g = model.allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with model.timed("cg"):
                stepdir = conjugate_gradient(fisher_vector_product,
                                             g,
                                             cg_iters=cg_iters,
                                             verbose=model.rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = model.get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                model.set_from_flat(thnew)
                meanlosses = surr, kl, *_ = model.allmean(
                    np.array(model.compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                model.set_from_flat(thbefore)
            if model.nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(),
                     model.vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(model.loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with model.timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = model.allmean(model.compute_vflossandgrad(mbob, mbret))
                    model.vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(model.flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if model.rank == 0:
            logger.dump_tabular()
Code example #15
File: run_acktr.py  Project: kirk86/baselines
def fit(policy,
        env,
        seed,
        total_timesteps=int(40e6),
        gamma=0.99,
        log_interval=1,
        nprocs=32,
        nsteps=20,
        ent_coef=0.01,
        vf_coef=0.5,
        vf_fisher_coef=1.0,
        lr=0.25,
        max_grad_norm=0.5,
        kfac_clip=0.001,
        save_interval=None,
        lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = AcktrDiscrete(policy,
                          ob_space,
                          ac_space,
                          nenvs,
                          total_timesteps,
                          nsteps=nsteps,
                          ent_coef=ent_coef,
                          vf_coef=vf_coef,
                          lr=lr,
                          max_grad_norm=max_grad_norm,
                          kfac_clip=kfac_clip,
                          lrschedule=lrschedule)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     with open(os.path.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
    #         fh.write(cloudpickle.dumps(make_model))
    # model = make_model()

    runner = Environment(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess,
                                                    coord=coord,
                                                    start=True)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) \
           and logger.get_dir():
            savepath = os.path.join(logger.get_dir(),
                                    'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    env.close()
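
The lrschedule='linear' default above usually means the step size decays linearly from lr to 0 over total_timesteps; a self-contained sketch of that schedule (the helper name is ours, not this project's) is:

def linear_lr(lr, steps_done, total_timesteps):
    # fraction of the training budget remaining, clipped at 0 once it is spent
    frac = max(1.0 - float(steps_done) / total_timesteps, 0.0)
    return lr * frac

# e.g. with lr=0.25 and total_timesteps=40e6, the effective learning rate is
# 0.125 halfway through training and approaches 0 at the end.
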
コード例 #16
0
ファイル: base.py プロジェクト: yimingpeng/sac-master
    def _train(self, env, policy, initial_exploration_policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            initial_exploration_policy (`Policy`): Policy used for exploration.
                If None, all exploration is done using `policy`.
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy, pool)
            initial_exploration_done = False

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                # logger.push_prefix()
                logger.log('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    if not initial_exploration_done:
                        if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                            self.sampler.set_policy(policy)
                            initial_exploration_done = True
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                # logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                # logger.dump_tabular(with_prefix=False)
                logger.dump_tabular()
                # logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
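
For rough accounting of the loop above: the outer loop runs self._n_epochs + 1 epochs of self._epoch_length environment steps, and each step performs at most self._n_train_repeat gradient updates (none until the sampler reports batch_ready()). A tiny sketch of that budget, using the attribute names from the code above:

def planned_budget(n_epochs, epoch_length, n_train_repeat):
    env_steps = (n_epochs + 1) * epoch_length
    max_grad_updates = env_steps * n_train_repeat  # upper bound; early steps are skipped
    return env_steps, max_grad_updates
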
コード例 #17
0
    def rollouts(self):
        # Prepare for rollouts
        # ----------------------------------------
        seg_gen = self.traj_segment_generator(self.pi,
                                              self.env,
                                              self.timesteps_per_actorbatch,
                                              stochastic=True)

        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

        assert sum([
            self.max_iters > 0, self.max_timesteps > 0, self.max_episodes > 0,
            self.max_seconds > 0
        ]) == 1, "Only one time constraint permitted"

        while True:
            if self.callback:
                self.callback(locals(), globals())
            if self.max_timesteps and timesteps_so_far >= self.max_timesteps:
                break
            elif self.max_episodes and episodes_so_far >= self.max_episodes:
                break
            elif self.max_iters and iters_so_far >= self.max_iters:
                break
            elif self.max_seconds and time.time() - tstart >= self.max_seconds:
                break

            if self.schedule == 'constant':
                cur_lrmult = 1.0
            elif self.schedule == 'linear':
                cur_lrmult = max(
                    1.0 - float(timesteps_so_far) / self.max_timesteps, 0)
            else:
                raise NotImplementedError

            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            self.add_vtarg_and_adv(seg, self.gamma, self.lam)

            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], \
                seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not self.pi.recurrent)
            optim_batchsize = self.optim_batchsize or ob.shape[0]

            if hasattr(self.pi, "ob_rms"):
                self.pi.ob_rms.update(ob)  # update running mean/std for policy

            self.assign_old_eq_new()  # set old parameter values to new parameter values
            logger.log("Optimizing...")
            logger.log(fmt_row(13, self.loss_names))
            # Here we do a bunch of optimization epochs over the data
            for _ in range(self.optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                     batch["atarg"],
                                                     batch["vtarg"],
                                                     cur_lrmult)
                    self.adam.update(g, self.optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            logger.log("Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = self.loss(batch["ob"], batch["ac"], batch["atarg"],
                                      batch["vtarg"], cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0)
            logger.log(fmt_row(13, meanlosses))
            for (lossval, name) in zipsame(meanlosses, self.loss_names):
                logger.record_tabular("loss_" + name, lossval)
            logger.record_tabular("ev_tdlam_before",
                                  explained_variance(vpredbefore, tdlamret))
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dump_tabular()

        return self.pi
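
rollouts() relies on self.add_vtarg_and_adv (called near the top of the loop) to attach GAE(lambda) advantages and TD(lambda) returns to the segment. That helper is not shown here; a sketch in the usual baselines style, assuming seg carries "new", "rew", "vpred" and "nextvpred" arrays, is:

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # GAE(lambda): delta_t = r_t + gamma * V(s_{t+1}) * nonterminal - V(s_t)
    #              adv_t   = delta_t + gamma * lam * nonterminal * adv_{t+1}
    new = np.append(seg["new"], 0)                     # episode-start flags, shifted by one
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
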
コード例 #18
0
def main(_):
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)

    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                 initial_p=1.0,
                                 final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    
    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32) 
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load ckpt if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # loop until MAX_EPISODE episodes have been played
        # init environment game play variables
        
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given predicted goal, we encode this goal bounding mask to the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: the code below verifies the goal mask was added correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t)
                
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1)
                # joint training: once warmed up, also sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # it's fine; we aren't really training the meta DQN until after WARMUP_STEPS.
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done)

            # if we get here without being done, the desired goal was reached
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]

                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal))

                # start again
                reached_goal = False
        
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1

        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0:
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    
    # restore the best saved model, if any
    if model_saved:
        L.log('restored model with mean reward: %d' % saved_mean_reward)
        U.load_variables(model_file)
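
The intrinsic reward in the inner loop comes from critic.criticize(), whose implementation is not shown. Under the usual h-DQN convention the critic simply pays +1 when the controller reaches the meta-controller's goal; a minimal sketch (the signature mirrors the call above, the body is an assumption) is:

def criticize(desired_goal, reached_goal, primitive_action, done):
    # +1 intrinsic reward only when the sampled goal is actually reached
    return 1.0 if reached_goal else 0.0
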
コード例 #19
0
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))

    ratio = newprob / oldprob

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)

    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold

    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    running_scores = []

    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data
        for num_epoch in count():
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            agg_mean_kl = get_mean_kl(ob, ac)

            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))

        rewbuffer.extend(rews)

        mean_score = None

        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)

            logger.dump_tabular()

    return running_scores
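
The policy objective built above is not the usual PPO/TRPO surrogate: each sample contributes (kl - ratio * adv / sepg_lam), and samples whose per-state KL already exceeds args.kl_threshold are masked out of the mean entirely. A plain-numpy restatement of that per-batch loss:

import numpy as np

def masked_kl_surrogate(kl, ratio, adv, sepg_lam, kl_threshold):
    # kl, ratio, adv are per-sample arrays matching the placeholders above
    mask = (kl < kl_threshold).astype(np.float32)
    return np.mean((kl - ratio * adv / sepg_lam) * mask)
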
コード例 #20
0
def fit(policy,
        env,
        seed,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100):

    set_global_seeds(seed)

    model = A2C(policy=policy,
                observation_space=env.observation_space,
                action_space=env.action_space,
                nenvs=env.num_envs,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                lr=lr,
                alpha=alpha,
                epsilon=epsilon,
                total_timesteps=total_timesteps,
                lrschedule=lrschedule)
    session = model.init_session()
    tf.global_variables_initializer().run(session=session)
    env_runner = Environment(env, model, nsteps=nsteps, gamma=gamma)

    nbatch = env.num_envs * nsteps
    tstart = time.time()
    writer = tf.summary.FileWriter('output', session.graph)
    for update in range(1, total_timesteps // nbatch + 1):
        tf.reset_default_graph()
        obs, states, rewards, masks, actions, values = env_runner.run(session)
        policy_loss, value_loss, policy_entropy = model.predict(
            observations=obs,
            states=states,
            rewards=rewards,
            masks=masks,
            actions=actions,
            values=values,
            session=session)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
    writer.close()
    session.close()
コード例 #21
0
def run_one_agent(index, args, unknown_args, actor_status):
    from tensorflow.keras.backend import set_session
    import tensorflow.compat.v1 as tf

    # Set 'allow_growth'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and agent instance
    env, agent = init_components(args, unknown_args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
        save_yaml_config(args.exp_path / 'config.yaml', args, 'actor', agent)
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Create local queues for collecting data
    transitions = []  # A list to store raw transitions within an episode
    mem_pool = MemPool()  # A pool to store prepared training data

    # Initialize values
    model_id = -1
    episode_rewards = [0.0]
    episode_lengths = [0]
    num_episodes = 0
    mean_10ep_reward = 0
    mean_10ep_length = 0
    send_time_start = time.time()

    state = env.reset()
    for step in range(args.num_steps):
        # Do some updates
        agent.update_sampling(step, args.num_steps)

        # Sample action
        action, extra_data = agent.sample(state)
        next_state, reward, done, info = env.step(action)

        # Record current transition
        transitions.append(
            (state, action, reward, next_state, done, extra_data))
        episode_rewards[-1] += reward
        episode_lengths[-1] += 1

        state = next_state

        is_terminal = done or episode_lengths[-1] >= args.max_episode_length > 0
        if is_terminal or len(mem_pool) + len(
                transitions) >= args.max_steps_per_update:
            # Current episode is terminated or a trajectory of enough training data is collected
            data = agent.prepare_training_data(transitions)
            transitions.clear()
            mem_pool.push(data)

            if is_terminal:
                # Log information at the end of episode
                num_episodes = len(episode_rewards)
                mean_10ep_reward = round(np.mean(episode_rewards[-10:]), 2)
                mean_10ep_length = round(np.mean(episode_lengths[-10:]), 2)
                episode_rewards.append(0.0)
                episode_lengths.append(0)

                # Reset environment
                state = env.reset()

        if len(mem_pool) >= args.max_steps_per_update:
            # Send training data once enough (>= 'args.max_steps_per_update') has been collected
            post_processed_data = agent.post_process_training_data(
                mem_pool.sample())
            socket.send(serialize(post_processed_data).to_buffer())
            socket.recv()
            mem_pool.clear()

            send_data_interval = time.time() - send_time_start
            send_time_start = time.time()

            if num_episodes > 0:
                # Log information
                logger.record_tabular("iteration",
                                      (step + 1) // args.max_steps_per_update)
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 10 episode reward",
                                      mean_10ep_reward)
                logger.record_tabular("mean 10 episode length",
                                      mean_10ep_length)
                logger.record_tabular(
                    "send data fps",
                    args.max_steps_per_update // send_data_interval)
                logger.record_tabular("send data interval", send_data_interval)
                logger.dump_tabular()

        # Update weights
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            agent.set_weights(new_weights)

    actor_status[index] = 1
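
run_one_agent pushes batches to a learner over a ZeroMQ REQ socket and blocks on socket.recv() until the learner acknowledges. The learner side is not shown; a minimal matching REP endpoint (handle_batch is a placeholder for whatever the real learner does with the serialized payload) could look like:

import zmq

def serve_training_data(data_port, handle_batch):
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(f'tcp://*:{data_port}')
    while True:
        payload = socket.recv()   # serialized training data from one actor
        handle_batch(payload)
        socket.send(b'ok')        # reply unblocks that actor's REQ socket
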
コード例 #22
0
ファイル: ddpg.py プロジェクト: onursahin93/Pytorch-RL
def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network (soft) update rate
        noise_std: standard deviation of the exploration noise added to actions
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batch before test
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim,
                         float(env.action_space.high[0])).to(device)

    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent from the actor/critic networks
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std, gamma,
                 tau)
    agent.align_target()

    # create replay_buffer
    replay_buffer = ReplayBuffer()
    # run a few episodes of untrained policy to initialize scaler and fill in replay buffer
    run_policy(env,
               agent,
               replay_buffer,
               mode="random",
               episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env,
                                                    agent,
                                                    replay_buffer,
                                                    mode="train",
                                                    episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # train
            num_epoch = total_steps // batch_size
            for e in range(num_epoch):
                observation, action, reward, next_obs, done = replay_buffer.sample(
                )
                agent.update(observation, action, reward, next_obs, done)
        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env,
                                agent,
                                replay_buffer,
                                mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
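
The tau argument above controls the soft (Polyak) target update inside the DDPG agent: each target parameter is moved as target <- tau * online + (1 - tau) * target. A PyTorch sketch of that rule (the agent's own method name may differ):

import torch

def soft_update(target_net, online_net, tau):
    # Polyak averaging of parameters, done without tracking gradients
    with torch.no_grad():
        for tgt, src in zip(target_net.parameters(), online_net.parameters()):
            tgt.data.mul_(1.0 - tau).add_(tau * src.data)
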
コード例 #23
0
ファイル: acktr_cont.py プロジェクト: kirk86/baselines
def fit(env,
        policy,
        vf,
        gamma,
        lam,
        timesteps_per_batch,
        num_timesteps,
        animate=False,
        callback=None,
        desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = KfacOptimizer(learning_rate=stepsize,
                          cold_lr=stepsize * (1 - 0.9),
                          momentum=0.9,
                          kfac_update=2,
                          epsilon=1e-2,
                          stats_decay=0.99,
                          async=1,  # NOTE: 'async' is a reserved keyword from Python 3.7 on; rename to match your KfacOptimizer version if this raises a SyntaxError
                          cold_iter=1,
                          weight_decay_dict=policy.wd_dict,
                          max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = Model().function(inputs, update_op)
    Model().init_vars()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr is not None)
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
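
The advantage estimation above uses a discount() helper for discounted cumulative sums (returns and GAE deltas). The standard lfilter-based implementation, which this fork presumably inherits from baselines, is:

import scipy.signal

def discount(x, gamma):
    # y[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]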