Example #1
    def valid(self, name, sess, valid_feed):
        elbo_losses = []
        rc_losses = []
        rc_ppls = []
        bow_losses = []
        kl_losses = []

        while True:
            batch = valid_feed.next_batch()
            if batch is None:
                break
            feed_dict = self.batch_2_feed(batch, None, use_prior=False, repeat=1)

            elbo_loss, bow_loss, rc_loss, rc_ppl, kl_loss = sess.run(
                [self.elbo, self.avg_bow_loss, self.avg_rc_loss,
                 self.rc_ppl, self.avg_kld], feed_dict)
            elbo_losses.append(elbo_loss)
            rc_losses.append(rc_loss)
            rc_ppls.append(rc_ppl)
            bow_losses.append(bow_loss)
            kl_losses.append(kl_loss)

        avg_losses = self.print_loss(name, ["elbo_loss", "bow_loss", "rc_loss", "rc_perplexity", "kl_loss"],
                                     [elbo_losses, bow_losses, rc_losses, rc_ppls, kl_losses], "")
        logger.record_tabular("elbo_loss", avg_losses[0])
        logger.record_tabular("bow_loss", avg_losses[1])
        logger.record_tabular("rc_loss", avg_losses[2])
        logger.record_tabular("rc_perplexity", avg_losses[3])
        logger.record_tabular("kl_loss", avg_losses[4])
        logger.dump_tabular()

        return avg_losses[0]
Example #2
    def train(self):
        self.sess.run(tf.global_variables_initializer())
        self.start_worker()
        start_time = time.time()
        total_samples = 0
        for itr in range(0, self.n_itr):
            itr_start_time = time.time()
            logger.info('\n itr #%d' % itr)
            logger.info("Obtaining samples...")
            paths, n_samples = self.obtain_samples(itr)
            total_samples += n_samples

            logger.info("Processing samples...")
            samples_data = self.process_samples(itr, paths)

            logger.info("Optimizing policy...")
            self.optimize_policy(itr, samples_data)

            logger.info("Update stats...")
            self.update_stats(paths)

            logger.info("Fitting baseline...")
            self.fit_baseline(paths)

            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular()

        self.shutdown_worker()
Example #3
def log_tabular_results(returns, itr, train_collection):
    logger.clear_tabular()
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('episode_mean', np.mean(returns))
    logger.record_tabular('episode_min', np.min(returns))
    logger.record_tabular('episode_max', np.max(returns))
    logger.record_tabular('TotalSamples', train_collection.get_total_samples())

    logger.dump_tabular()
Example #4
def log_tabular_results(returns, itr, train_collection):
    logger.clear_tabular()
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('MinimumReturn', np.min(returns))
    logger.record_tabular('MaximumReturn', np.max(returns))
    logger.record_tabular('TotalSamples', train_collection.get_total_samples())

    logger.dump_tabular()
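Every snippet on this page follows the same pattern: record scalar key/value pairs for the current iteration with record_tabular, then flush them as a single tabular row with dump_tabular. The sketch below distills that pattern on its own. It assumes an OpenAI-baselines-style logger module (each project above ships its own compatible logger), and log_returns is a hypothetical helper rather than code taken from any of the examples.

# Minimal sketch of the record/dump pattern shared by the examples on this page.
# Assumption: a baselines-style logger providing configure/record_tabular/dump_tabular.
import numpy as np
from baselines import logger

def log_returns(returns, itr):
    # Buffer key/value pairs for this iteration ...
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('MinimumReturn', np.min(returns))
    logger.record_tabular('MaximumReturn', np.max(returns))
    # ... then write them out as one row (stdout/CSV, depending on configuration).
    logger.dump_tabular()

if __name__ == '__main__':
    logger.configure('logs/demo')  # hypothetical log directory
    for itr in range(3):
        log_returns(np.random.randn(10), itr)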
Example #5
def learn(env,
          sess,
          seed,
          nsteps=5,
          total_timesteps=int(80e4),
          discount=0.5,
          entropy_coeff=0.01,
          lr=7e-4,
          lr_decay=0.99,
          fuzz_factor=0.00001,
          max_grad_norm=0.5,
          log_interval=100):
    env.init()
    action_set = env.getActionSet()
    n_actions = len(action_set)
    state_dim = env.getGameState().size  # dimensionality of the game state

    total_returns = []

    # Init actorCritic
    actor_critic = ActorCritic(state_dim, n_actions, nsteps, discount,
                               entropy_coeff, lr, lr_decay, fuzz_factor,
                               total_timesteps, max_grad_norm)
    sim = Simulation(env, actor_critic, nsteps=nsteps, discount=discount)
    sim.start_episode()
    e_cnt = 0
    for nupdate in range(int(total_timesteps / nsteps)):
        if env.game_over():
            # done = True
            total_returns.append(sim.total_return)
            sim.start_episode()
            e_cnt = e_cnt + 1

        # Collect n-step trajectories
        obs, rewards, actions, values, dones, states = sim.run_nsteps()

        # Update train_model
        policy_loss, value_loss, policy_entropy, a_dist = \
            actor_critic.train(obs, actions, rewards, values, dones, states)
        # print('action probs:')
        # print(ap[0], a)

        if nupdate % log_interval == 0 or nupdate == 1:
            # ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", nupdate)
            logger.record_tabular("nepisode", e_cnt)
            # logger.record_tabular("total_timesteps", nupdate * nsteps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular(
                "avg. total return",
                np.mean(total_returns[-(min(len(total_returns), 100)):]))
            # logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    return actor_critic
Example #6
def validate(val_loader, model, criterion, epoch):
    batch_time = 0  #AverageMeter()
    data_time = 0  #AverageMeter()
    losses = 0  #AverageMeter()
    all_accs = 0  #AverageMeter()
    cls_accs = 0  #AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        target = target.cuda()  #(async=True)
        #target = target.cuda(async=True)
        input_var = torch.autograd.Variable(input, volatile=True)
        target_var = torch.autograd.Variable(target, volatile=True)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        all_acc, cls_acc = pascal_accuracy(output.data, target)
        # prec1, prec5 = pascal_accuracy(output.data, target, topk=(1, 5))
        losses += loss
        all_accs += all_acc
        cls_accs += cls_acc

        # measure elapsed time
        batch_time += time.time() - end  # plain float accumulation (AverageMeter is commented out above)
        end = time.time()
        if i % args.print_freq == 0:
            abs_batch_time = batch_time / (i + 1)
            abs_data_time = data_time / (i + 1)
            abs_losses = losses.item() / (i + 1)
            abs_all_accs = all_accs.item() / (i + 1)
            logger.log(
                'Epoch: [{}][{}/{}]\t Time {}\t Data {}\t Loss {}\t All accs {} '
                .format(epoch, i, len(val_loader), abs_batch_time,
                        abs_data_time, abs_losses, abs_all_accs))

            logger.log((cls_accs / (i + 1)))

            logger.record_tabular('val/loss', loss.item())
            logger.record_tabular('val/accum_loss', abs_losses)
            logger.record_tabular('val/accum_all_acces', abs_all_accs)
            for j in range(cls_accs.shape[0]):  # use j so the batch index i is not shadowed
                logger.record_tabular('val/accum_cls_accs_{}'.format(j),
                                      cls_accs[j].item() / (i + 1))
                logger.record_tabular('val/cls_accs_{}'.format(j),
                                      cls_acc[j].item())

            logger.dump_tabular()

    return all_accs.item() / (i + 1)
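Example #6 comments out the usual AverageMeter helper and keeps plain running sums instead. For readers who want to restore it, here is a minimal sketch of such a meter in the spirit of the standard PyTorch examples; this exact class is an assumption and is not part of the snippet above.

# Minimal running-average helper, a stand-in for the AverageMeter that
# Example #6 comments out. Sketch only; not the project's own class.
class AverageMeter:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        # Accumulate a value observed over n samples.
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        # Running mean over everything seen so far.
        return self.sum / max(self.count, 1)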
Example #7
def main():
    logger.configure('logs/simulate')
    global T, n_bills, n_taxis, occupied
    results = []
    for n_lanes in range(2, 10):
        bills, n_taxis_left, n_passengers_left = [], [], []
        for seed in range(N_RUNS):
            np.random.seed(seed)
            occupied = [False for _ in range(n_lanes + 1)]
            T, n_bills, n_taxis, sta = 0, 0, 0, 0
            lanes = [
                Lane(i, n_lanes + 1, lam=0.1 / n_lanes) for i in range(n_lanes)
            ]
            enter = np.random.poisson(0.1, size=10000)
            while T < 10000:
                if sta == 0:
                    if n_taxis < M:
                        n_taxis += enter[T]
                    else:
                        sta = 1
                elif n_taxis < N:
                    sta = 0
                for lane in lanes:
                    lane.step()
                T += 1
            bills.append(n_bills)
            n_taxis_left.append(n_taxis)
            n_passengers_left.append(
                np.sum([lane.n_passengers for lane in lanes]))

        results.append(bills)

        logger.record_tabular('lanes', n_lanes)
        logger.record_tabular('bills mean', np.mean(bills))
        logger.record_tabular('bills std', np.std(bills))
        logger.record_tabular('taxis mean', np.mean(n_taxis_left))
        logger.record_tabular('passengers mean', np.mean(n_passengers_left))
        logger.dump_tabular()

    df = pd.DataFrame(np.reshape(results, -1)).rename(columns={0: '# bills'})
    df.insert(0, '# lanes', [i for i in range(2, 10) for _ in range(N_RUNS)],
              True)
    sns.boxplot(x='# lanes',
                y='# bills',
                data=df,
                showmeans=True,
                meanline=True)
    plt.grid(linestyle='--')
    plt.savefig('logs/simulate/boxplot.jpg')
    plt.show()
Example #8
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name]
                      )  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger,
                              **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #9
def evaluate(env,
             bc_agent_wrapper,
             num_trajs,
             render,
             exact_model_path=None,
             model_ckpt_dir=None):
    """Evaluate a trained SAM agent"""

    # Only one of the two arguments can be provided
    assert sum([exact_model_path is None, model_ckpt_dir is None]) == 1

    # Rebuild the computational graph
    pol = bc_agent_wrapper('pol')
    # Create episode generator
    traj_gen = traj_ep_generator(env, pol, render)
    # Initialize and load the previously learned weights into the freshly re-built graph
    U.initialize()
    if exact_model_path is not None:
        U.load_model(exact_model_path)
        logger.info(
            "model loaded from exact path:\n  {}".format(exact_model_path))
    else:  # `exact_model_path` is None -> `model_ckpt_dir` is not None
        U.load_latest_checkpoint(model_ckpt_dir)
        logger.info("model loaded from ckpt dir:\n  {}".format(model_ckpt_dir))
    # Initialize the history data structures
    ep_lens = []
    ep_env_rets = []
    # Collect trajectories
    for i in range(num_trajs):
        logger.info("evaluating [{}/{}]".format(i + 1, num_trajs))
        traj = traj_gen.__next__()
        ep_len, ep_env_ret = traj['ep_len'], traj['ep_env_ret']
        # Aggregate to the history data structures
        ep_lens.append(ep_len)
        ep_env_rets.append(ep_env_ret)
    # Log some statistics of the collected trajectories
    ep_len_mean = np.mean(ep_lens)
    ep_env_ret_mean = np.mean(ep_env_rets)
    logger.record_tabular("ep_len_mean", ep_len_mean)
    logger.record_tabular("ep_env_ret_mean", ep_env_ret_mean)
    logger.dump_tabular()
Example #10
    def cartpole_train_3_5(self, rank, args):
        torch.manual_seed(args.seed + rank)

        self.agent.local_brain.train()

        step = 0
        sum_rewards = 0
        max_sum_rewards = 0
        vs = []
        entropies = []
        cnt = 0

        while self.g_ep.value < args.epoch:
            #tmp = 0
            o = self.env.reset()
            #o = torch.from_numpy(state)
            #print('cnt:',cnt)
            # self.agent.local_brain.sync(self.global_brain) # copy the global policy into the local policy
            observations, actions, values, rewards, probs = [], [], [], [], []
            #R = 0
            #done = True
            ep_r = 0.
            while True:
                step += 1
                # Get the action via the agent's act()
                p, v = self.agent.local_brain(Variable(torch.from_numpy(o).float()).unsqueeze(0))
                a = self.agent.act(o)
                if len(a.data.squeeze().size()) == 0:
                    o, r, done, _ = self.env.step(a.data.squeeze().item())
                else:
                    o, r, done, _ = self.env.step(a.data.squeeze()[0])
                if done: r = -1
                if rank == 0:
                    sum_rewards += r
                    if args.render:
                        self.env.render()
                ep_r += r
                observations.append(o)
                actions.append(a)
                values.append(v)
                rewards.append(r)
                probs.append(p)

                if step % args.local_t_max == 0 or done:
                    if done:
                        R = 0
                    else:
                        _, v = self.agent.local_brain(torch.from_numpy(observations[-1]).unsqueeze(0).float())
                        R = v.data.squeeze().item()

                    returns = []
                    for r in rewards[::-1]: # discounted sum of rewards
                        R = r + 0.99 * R
                        returns.insert(0, R)
                    returns = torch.Tensor(returns)


                    loss, v_loss, entropy, _ = self.agent._loss_function(actions, values, probs, returns, args)
                    vs.append(v_loss.data.numpy())
                    entropies.append(entropy.data.numpy())

                    ## Logging
                    if rank == 0 and done:
                        logger.record_tabular_misc_stat('Entropy', entropies)
                        logger.record_tabular_misc_stat('V', vs)
                        logger.record_tabular('reward', sum_rewards)
                        logger.record_tabular('step', self.g_ep.value)
                        logger.dump_tabular()
                        del vs[:]
                        del entropies[:]
                    self.optimizer.zero_grad()
                    loss.backward(retain_graph=True)
                    for lp, gp in zip(self.agent.local_brain.parameters(), self.global_brain.parameters()):
                        gp._grad = lp.grad

                    self.optimizer.step()
                    self.agent.local_brain.sync(self.global_brain) # copy the global policy into the local policy

                    observations, actions, values, rewards, probs = [], [], [], [], []

                if done:
                    with self.g_ep.get_lock():
                        self.g_ep.value += 1
                    with self.g_ep_r.get_lock():
                        if self.g_ep_r.value == 0.:
                            self.g_ep_r.value = ep_r
                        else:
                            self.g_ep_r.value = self.g_ep_r.value * 0.99 + ep_r * 0.01
                    self.res_queue.put(self.g_ep_r.value)

                    o = self.env.reset()
                    #self.global_history_reward.append([tmp, self.total_reward])
                    self.total_reward = 0
                    if rank == 0:
                        print('----------------------------------')
                        print('total reward of the episode:', sum_rewards)
                        print('----------------------------------')
                        if args.save_mode == 'all':
                            torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+"_{}.pkl".format(self.g_ep.value)))
                        elif args.save_mode == 'last':
                            torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                        elif args.save_mode == 'max':
                            if max_sum_rewards < sum_rewards:
                                torch.save(self.agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                                max_sum_rewards = sum_rewards
                        #step = 0
                        sum_rewards = 0
                    break

            #raise
            # Update the learning rate
            # new_lr = np.true_divide(args.epoch - global_t[0] , args.epoch * args.lr)
            # self.optimizer.step(new_lr)

            cnt += 1

        #send_rev.send(self.global_history_reward)
        self.res_queue.put(None)
Example #11
def train(policy, planner, rollout_worker, evaluator, n_epochs,
          n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_path, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    if save_path:
        latest_mdl_path = save_path + '_latest'
        best_mdl_path = save_path
        periodic_policy_path = save_path + '_{}'

    best_success_rate = -1

    logger.info('Training......')
    # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers
    for epoch in range(n_epochs):
        logger.info('========== epoch {} ========='.format(epoch))
        logger.record_tabular('epoch', epoch)

        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            # logger.info('collect rollouts...')
            episode_for_act, episode_for_pln = rollout_worker.generate_rollouts(
                cur_progress=(epoch / n_epochs))
            # logger.info('store rollouts for policy')
            policy.store_episode(episode_for_act)
            # logger.info('store rollouts for planner, episodes_for_pln shape:', episode_for_pln.shape)
            planner.store_episode(episode_for_pln)
            # logger.info('training policy')
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()
            # logger.info('training planner')
            for _ in range(n_batches):
                planner.train(use_buffer=True)

        # test
        # logger.info("evaluate...")
        evaluator.clear_history()
        for ro in range(n_test_rollouts):
            evaluator.generate_rollouts()

        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))
        for key, val in planner.logs():
            logger.record_tabular(key, mpi_average(val))
        if rank == 0:
            logger.dump_tabular()

        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_path:
            best_success_rate = success_rate
            # logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            # evaluator.save_policy(latest_mdl_path)
            logger.info(
                'Saving best policy+planner to {} ...'.format(best_mdl_path))
            evaluator.save_policy(best_mdl_path)
            evaluator.save_planner(best_mdl_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_path:
            # policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving latest policy+planner to {} ...'.format(
                latest_mdl_path))
            evaluator.save_policy(latest_mdl_path)
            evaluator.save_planner(latest_mdl_path)
        elif rank == 0 and policy_save_interval < 0 and epoch % (
                -policy_save_interval) == 0 and save_path:
            periodic_mdl_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy+planner to {} ...'.format(
                periodic_mdl_path))
            evaluator.save_policy(periodic_mdl_path)
            evaluator.save_planner(periodic_mdl_path)

        local_uniform = np.random.uniform(size=(1, ))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]

    return policy, planner
Example #12
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #13
def learn(env, model_path, data_path, policy_fn, *,
          rolloutSize, num_options=4, horizon=80,
          clip_param=0.025, ent_coeff=0.01,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7, optim_batchsize=100,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=20,  # time constraint
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False,
          ):
    """
        Core learning function
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space, num_options=num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, num_options=num_options)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])
    op_adv = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # Option termination probabilities

    ac = pi.pdtype.sample_placeholder([None])

    # Setup losses and stuff
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)
    pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims(
        tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims(
        tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)])  # Since we will use a different step size.
    opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options],
                        [U.flatgrad(op_loss, var_list)])  # Since we will use a different step size.
    intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options],
                         [U.flatgrad(int_loss, var_list)])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    datas = [0 for _ in range(num_options)]

    if retrain:
        print("Retraining to New Task !! ")
        time.sleep(2)
        U.load_state(model_path+'/')

    p = []
    max_timesteps = int(horizon * rolloutSize * max_iters)
    while True:
        if max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        render = False

        rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize, render=render)
        # Save rollouts
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        pickle.dump(p, open(data_file_name, "wb"))

        add_vtarg_and_adv(rollouts, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(rollouts['opt_dur'][i]) if len(rollouts['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        ob, ac, opts, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts["opts"], rollouts["adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Optimizing the policy
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                continue

            datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            optim_epochs_corrected = np.clip(np.int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data

            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                                                    cur_lrmult, [opt])
                    adam.update(grads, mainlr * cur_lrmult)
                    losses.append(newlosses)

            # Optimize termination functions
            termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0]
            adam.update(termg, termlr)

            # Optimize interest functions
            intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["op_probs"], rollouts["activated_options"])[0]
            adam.update(intgrads, intlr)

        # Optimize policy over options
        opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["intfc"], rollouts["activated_options"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
Example #14
def eval(data, model, meta_optimizer):

    model.eval()
    criterion = nn.NLLLoss().cuda()
    num_sents = 0
    num_words = 0
    total_nll_autoreg = 0.
    total_nll_vae = 0.
    total_kl_vae = 0.
    total_nll_svi = 0.
    total_kl_svi = 0.
    best_svi_loss = 0.
    for i in range(len(data)):
        sents, length, batch_size = data[i]
        num_words += batch_size * length
        num_sents += batch_size
        if args.gpu >= 0:
            sents = sents.cuda()
        if args.model == 'autoreg':
            preds = model._dec_forward(sents, None, True)
            nll_autoreg = sum([
                criterion(preds[:, l], sents[:, l + 1]) for l in range(length)
            ])
            total_nll_autoreg += nll_autoreg.data[0] * batch_size
        elif args.model == 'svi':
            mean_svi = Variable(
                0.1 * torch.randn(batch_size, args.latent_dim).cuda(),
                requires_grad=True)
            logvar_svi = Variable(
                0.1 * torch.randn(batch_size, args.latent_dim).cuda(),
                requires_grad=True)
            var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi],
                                                    sents)
            mean_svi_final, logvar_svi_final = var_params_svi
            z_samples = model._reparameterize(mean_svi_final.detach(),
                                              logvar_svi_final.detach())
            preds = model._dec_forward(sents, z_samples)
            nll_svi = sum([
                criterion(preds[:, l], sents[:, l + 1]) for l in range(length)
            ])
            total_nll_svi += nll_svi.data[0] * batch_size
            kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
            total_kl_svi += kl_svi.data[0] * batch_size
            mean, logvar = mean_svi_final, logvar_svi_final
        else:
            mean, logvar = model._enc_forward(sents)
            z_samples = model._reparameterize(mean, logvar)
            preds = model._dec_forward(sents, z_samples)
            nll_vae = sum([
                criterion(preds[:, l], sents[:, l + 1]) for l in range(length)
            ])
            total_nll_vae += nll_vae.data[0] * batch_size
            kl_vae = utils.kl_loss_diag(mean, logvar)
            total_kl_vae += kl_vae.data[0] * batch_size
            if args.model == 'savae':
                mean_svi = Variable(mean.data, requires_grad=True)
                logvar_svi = Variable(logvar.data, requires_grad=True)
                var_params_svi = meta_optimizer.forward([mean_svi, logvar_svi],
                                                        sents)
                mean_svi_final, logvar_svi_final = var_params_svi
                z_samples = model._reparameterize(mean_svi_final,
                                                  logvar_svi_final)
                preds = model._dec_forward(sents, z_samples)
                nll_svi = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                total_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                total_kl_svi += kl_svi.data[0] * batch_size
                mean, logvar = mean_svi_final, logvar_svi_final

    nll_autoreg = total_nll_autoreg / num_sents
    ppl_autoreg = np.exp(total_nll_autoreg / num_words)
    nll_vae = (total_nll_vae + total_kl_vae) / num_sents
    rec_vae = total_nll_vae / num_sents
    kl_vae = total_kl_vae / num_sents
    ppl_bound_vae = np.exp((total_nll_vae + total_kl_vae) / num_words)
    nll_svi = (total_nll_svi + total_kl_svi) / num_sents
    rec_svi = total_nll_svi / num_sents
    kl_svi = total_kl_svi / num_sents
    ppl_bound_svi = np.exp((total_nll_svi + total_kl_svi) / num_words)

    logger.record_tabular('AR NLL', nll_autoreg)
    logger.record_tabular('AR PPL', ppl_autoreg)
    logger.record_tabular('VAE NLL', nll_vae)
    logger.record_tabular('VAE REC', rec_vae)
    logger.record_tabular('VAE KL', kl_vae)
    logger.record_tabular('VAE PPL', ppl_bound_vae)
    logger.record_tabular('SVI NLL', nll_svi)
    logger.record_tabular('SVI REC', rec_svi)
    logger.record_tabular('SVI KL', kl_svi)
    logger.record_tabular('SVI PPL', ppl_bound_svi)
    logger.dump_tabular()
    logger.info(
        'AR NLL: %.4f, AR PPL: %.4f, VAE NLL: %.4f, VAE REC: %.4f, VAE KL: %.4f, VAE PPL: %.4f, SVI NLL: %.4f, SVI REC: %.4f, SVI KL: %.4f, SVI PPL: %.4f'
        % (nll_autoreg, ppl_autoreg, nll_vae, rec_vae, kl_vae, ppl_bound_vae,
           nll_svi, rec_svi, kl_svi, ppl_bound_svi))
    model.train()
    if args.model == 'autoreg':
        return ppl_autoreg
    elif args.model == 'vae':
        return ppl_bound_vae
    elif args.model == 'savae' or args.model == 'svi':
        return ppl_bound_svi
Example #15
def learn(env, model_path, data_path, policy_fn, model_learning_params, svm_grid_params, svm_params_interest,
          svm_params_guard, *, modes, rolloutSize, num_options=2,
          horizon,  # timesteps per actor per update
          clip_param, ent_coeff=0.02,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=160,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=0,  # time constraint
          adam_epsilon=1.2e-4,
          schedule='linear',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False
          ):
    """
        Core learning function
    """

    ob_space = env.observation_space
    ac_space = env.action_space
    if retrain:
        model = pickle.load(open(model_path + '/hybrid_model.pkl', 'rb'))
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
    else:
        model = partialHybridModel(env, model_learning_params, svm_grid_params, svm_params_interest, svm_params_guard, horizon, modes, num_options, rolloutSize)
    pi = policy_fn("pi", ob_space, ac_space, model, num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, model, num_options)  # Network for old policy
    atarg = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Empirical return

    lrmult = tf1.placeholder(name='lrmult', dtype=tf1.float32,
                             shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # Define placeholders for computing the advantage
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    ac = pi.pdtype.sample_placeholder([None])

    # Defining losses for optimization
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf1.reduce_mean(kloldnew)
    meanent = tf1.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf1.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf1.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - tf1.reduce_mean(tf1.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP), negative to convert from a maximization to minimization problem
    vf_loss = tf1.reduce_mean(tf1.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf1.assign(oldv, newv) for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain:
        print("Retraining to New Task !!")
        time.sleep(2)
        U.load_state(model_path+'/')
        print(pi.eps)
    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("************* Iteration %i *************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        render = False

        rollouts = sample_trajectory(pi, model, env, horizon=horizon, rolloutSize=rolloutSize, render=render)
        # Save rollouts
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + '/rollout_data.pkl'
        pickle.dump(p, open(data_file_name, "wb"))

        # Model update
        print("Updating model !!\n")
        model.updateModel(rollouts, pi)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
        edges = list(model.transitionGraph.edges)
        for i in range(0, len(edges)):
            print(edges[i][0], " -> ", edges[i][1], " : ", model.transitionGraph[edges[i][0]][edges[i][1]]['weight'])

        datas = [0 for _ in range(num_options)]
        add_vtarg_and_adv(rollouts, pi, gamma, lam, num_options)

        ob, ac, opts, atarg, tdlamret = rollouts["seg_obs"], rollouts["seg_acs"], rollouts["des_opts"], rollouts["adv"], rollouts["tdlamret"]
        old_opts = rollouts["seg_opts"]
        similarity = 0
        for i in range(0, len(old_opts)):
            if old_opts[i] == opts[i]:
                similarity += 1

        print("Percentage similarity of options: ", similarity/len(old_opts) * 100)

        vpredbefore = rollouts["vpreds"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()

        pi.eps = pi.eps * gamma #reduce exploration

        # Optimizing the policy
        print("\nOptimizing policy !! \n")
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            if not indices.size:
                continue

            datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            optim_epochs_corrected = np.clip(np.int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt])
                    if np.isnan(newlosses).any():
                        continue
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
        if len(losses) > 0:
            meanlosses, _, _ = mpi_moments(losses, axis=0)
            print("Mean loss ", meanlosses)
            for (lossval, name) in zipsame(meanlosses, loss_names):
                logger.record_tabular("loss_" + name, lossval)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        '''
        if model_path and not retrain:
            U.save_state(model_path + '/')
            model_file_name = model_path + '/hybrid_model.pkl'
            pickle.dump(model, open(model_file_name, "wb"), pickle.HIGHEST_PROTOCOL)
            print("Policy and Model saved in - ", model_path)
        '''
    return pi, model
Example #16
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    U.load_state("save/Humanoid-v1")

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        #if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
        U.save_state("save/Humanoid-v1")
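
A quick NumPy sketch of the clipped surrogate built above (the ratio / surr1 / surr2 / pol_surr lines), kept framework-free so the math is easy to check; the log-probabilities and advantages below are made-up toy values, not data from the run.

import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    """Pessimistic (clipped) PPO surrogate: -mean(min(r*A, clip(r)*A))."""
    ratio = np.exp(logp_new - logp_old)        # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv                        # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))  # negated so it can be minimized

logp_new = np.log(np.array([0.9, 0.2, 0.5]))
logp_old = np.log(np.array([0.3, 0.4, 0.5]))
adv = np.array([1.0, -0.5, 0.2])
print(ppo_clip_loss(logp_new, logp_old, adv))  # large ratios with positive advantage get clipped
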
Esempio n. 17
0
def train(rank, global_policy, local_policy, optimizer, env, global_t, args):
    o = env.reset()
    step = 0
    sum_rewards = 0
    max_sum_rewards = 0
    vs = []
    entropies = []
    sum_rewards = 0
    while global_t[0] < args.epoch:
        local_policy.sync(global_policy)
        observations = []
        actions = []
        values = []
        rewards = []
        probs = []
        R = 0
        for i in range(args.local_t_max):
            global_t += 1
            step += 1
            p, v = local_policy(
                Variable(torch.from_numpy(o).float()).unsqueeze(0))
            a = p.multinomial()
            o, r, done, _ = env.step(a.data.squeeze()[0])
            if rank == 0:
                sum_rewards += r
                if args.render:
                    env.render()
            observations.append(o)
            actions.append(a)
            values.append(v)
            rewards.append(r)
            probs.append(p)
            if done:
                o = env.reset()
                if rank == 0:
                    print('----------------------------------')
                    print('total reward of the episode:', sum_rewards)
                    print('----------------------------------')
                    if args.save_mode == 'all':
                        torch.save(
                            local_policy,
                            os.path.join(
                                args.log_dir, args.save_name +
                                "_{}.pkl".format(global_t[0])))
                    elif args.save_mode == 'last':
                        torch.save(
                            local_policy,
                            os.path.join(args.log_dir,
                                         args.save_name + '.pkl'))
                    elif args.save_mode == 'max':
                        if max_sum_rewards < sum_rewards:
                            torch.save(
                                local_policy,
                                os.path.join(args.log_dir,
                                             args.save_name + '.pkl'))
                            max_sum_rewards = sum_rewards
                    step = 0
                break
        else:
            _, v = local_policy(
                Variable(torch.from_numpy(o).unsqueeze(0).float()))
            R += v.data.squeeze()[0]

        returns = []
        for r in rewards[::-1]:
            R = r + 0.99 * R
            returns.insert(0, R)
        returns = torch.Tensor(returns)
        #if len(returns) > 1:
        #    returns = (returns-returns.mean()) / (returns.std()+args.eps)
        v_loss = 0
        entropy = 0
        for a, v, p, r in zip(actions, values, probs, returns):
            a.reinforce(r - v.data.squeeze())
            _v_loss = nn.MSELoss()(v, Variable(torch.Tensor([r])))
            v_loss += _v_loss
            entropy += -(p * (p + args.eps).log()).sum()
        v_loss = v_loss * 0.5 * args.v_loss_coeff
        entropy = entropy * args.entropy_beta
        loss = v_loss - entropy
        vs.append(v_loss.data.numpy())
        entropies.append(entropy.data.numpy())
        if rank == 0 and done:
            logger.record_tabular_misc_stat('Entropy', entropies)
            logger.record_tabular_misc_stat('V', vs)
            logger.record_tabular('reward', sum_rewards)
            logger.record_tabular('step', global_t[0])
            logger.dump_tabular()
            del vs[:]
            del entropies[:]
            sum_rewards = 0
        optimizer.zero_grad()
        final_node = [loss] + actions
        gradients = [torch.ones(1)] + [None] * len(actions)
        autograd.backward(final_node, gradients)
        new_lr = (args.epoch - global_t[0]) / args.epoch * args.lr
        optimizer.step(new_lr)
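
The backward pass over rewards in the block above is an n-step discounted return seeded with the critic's bootstrap value; a minimal standalone sketch (the reward list and bootstrap value are illustrative only):

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    """R_t = r_t + gamma * R_{t+1}, seeded with V(s_T) when the rollout is truncated."""
    R = bootstrap_value
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return returns

print(discounted_returns([1.0, 0.0, 1.0], bootstrap_value=0.5))
# approximately [2.465, 1.480, 1.495]
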
Esempio n. 18
0
def main():
    MAX_EPISODES = 2000
    LR_A = 0.0005  # learning rate for actor
    LR_C = 0.0005  # learning rate for critic
    GAMMA = 0.999  # reward discount
    REPLACE_ITER_A = 1700
    REPLACE_ITER_C = 1500
    MEMORY_CAPACITY = 200000
    BATCH_SIZE = 32
    DISPLAY_THRESHOLD = 400  # display once the running reward exceeds this value
    DATA_PATH = './data'
    LOAD_MODEL = False
    SAVE_MODEL_ITER = 100000
    RENDER = False
    OUTPUT_GRAPH = False

    GLOBAL_STEP = tf.Variable(0, trainable=False)
    INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
    LR_A = tf.train.exponential_decay(LR_A,
                                      GLOBAL_STEP,
                                      10000,
                                      .97,
                                      staircase=True)
    LR_C = tf.train.exponential_decay(LR_C,
                                      GLOBAL_STEP,
                                      10000,
                                      .97,
                                      staircase=True)
    END_POINT = (200 - 10) * (14 / 30)  # from game

    ENV_NAME = 'BipedalWalker-v2'
    env = gym.make(ENV_NAME)
    env.seed(1)

    STATE_DIM = env.observation_space.shape[0]  # 24
    ACTION_DIM = env.action_space.shape[0]  # 4
    ACTION_BOUND = env.action_space.high  # [1, 1, 1, 1]

    # all placeholder for tf
    with tf.name_scope('S'):
        S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
    with tf.name_scope('R'):
        R = tf.placeholder(tf.float32, [None, 1], name='r')
    with tf.name_scope('S_'):
        S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')

    sess = tf.Session()

    # Create actor and critic.
    actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A, S, S_)
    critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C,
                    actor.a, actor.a_, S, S_, R, GLOBAL_STEP)
    actor.add_grad_to_graph(critic.a_grads, GLOBAL_STEP)

    M = Memory(MEMORY_CAPACITY)

    saver = tf.train.Saver(max_to_keep=100)

    if LOAD_MODEL:
        all_ckpt = tf.train.get_checkpoint_state(
            './data', 'checkpoint').all_model_checkpoint_paths
        saver.restore(sess, all_ckpt[-1])
    else:
        if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH)
        os.mkdir(DATA_PATH)
        sess.run(tf.global_variables_initializer())

    if OUTPUT_GRAPH:
        tf.summary.FileWriter('logs', graph=sess.graph)

    var = 3  # control exploration
    var_min = 0.01

    logger.configure()
    print(logger.get_dir())

    logger.info('start training')
    for i_episode in range(MAX_EPISODES):
        # s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.)
        s = env.reset()
        ep_r = 0
        while True:
            if RENDER:
                env.render()
            a = actor.choose_action(s)
            a = np.clip(
                np.random.normal(a, var), -1,
                1)  # add randomness to action selection for exploration
            s_, r, done, _ = env.step(
                a
            )  # r = total 300+ points up to the far end. If the robot falls, it gets -100.

            if r == -100: r = -2
            ep_r += r

            transition = np.hstack((s, a, [r], s_))
            max_p = np.max(M.tree.tree[-M.tree.capacity:])
            M.store(max_p, transition)

            if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY / 20:
                var = max([var * 0.9999,
                           var_min])  # decay the action randomness
                tree_idx, b_M, ISWeights = M.prio_sample(
                    BATCH_SIZE)  # for critic update
                b_s = b_M[:, :STATE_DIM]
                b_a = b_M[:, STATE_DIM:STATE_DIM + ACTION_DIM]
                b_r = b_M[:, -STATE_DIM - 1:-STATE_DIM]
                b_s_ = b_M[:, -STATE_DIM:]

                abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
                actor.learn(b_s)
                for i in range(len(tree_idx)):  # update priority
                    idx = tree_idx[i]
                    M.update(idx, abs_td[i])
            if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0:
                ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt')
                save_path = saver.save(sess,
                                       ckpt_path,
                                       global_step=GLOBAL_STEP,
                                       write_meta_graph=False)
                print("Save Model %s\n" % save_path)

            if done:
                if "running_r" not in globals():
                    running_r = ep_r
                else:
                    running_r = 0.95 * running_r + 0.05 * ep_r
                # if running_r > DISPLAY_THRESHOLD: RENDER = True
                # else: RENDER = False

                # done = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----'
                # print('Episode:', i_episode,
                #       done,
                #       '| Running_r: %i' % int(running_r),
                #       '| Epi_r: %.2f' % ep_r,
                #       '| Exploration: %.3f' % var,
                #       '| Pos: %.i' % int(env.unwrapped.hull.position[0]),
                #       '| LR_A: %.6f' % sess.run(LR_A),
                #       '| LR_C: %.6f' % sess.run(LR_C),
                #       )
                logger.record_tabular('episode', i_episode)
                logger.record_tabular('ep_reward', ep_r)
                logger.record_tabular('pos',
                                      int(env.unwrapped.hull.position[0]))
                logger.dump_tabular()
                logger.info('')
                break

            s = s_
            sess.run(INCREASE_GS)
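
Two small schedules drive the loop above: a Gaussian exploration scale that decays toward a floor at each learning step, and an exponential moving average of episode reward used only for display. A self-contained sketch of both (the episode returns below are invented):

import numpy as np

def decay_noise_scale(var, decay=0.9999, var_min=0.01):
    """Shrink the action-noise std a little every training step, with a floor."""
    return max(var * decay, var_min)

def update_running_reward(running_r, ep_r, momentum=0.95):
    """Exponential moving average of episode return; None means first episode."""
    return ep_r if running_r is None else momentum * running_r + (1 - momentum) * ep_r

var, running_r = 3.0, None
for ep_r in [-120.0, -80.0, -30.0, 10.0]:   # invented episode returns
    for _ in range(1000):                   # pretend 1000 training steps per episode
        var = decay_noise_scale(var)
    running_r = update_running_reward(running_r, ep_r)
    print(round(var, 3), round(running_r, 2))
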
Esempio n. 19
0
    def cartpole_train(self, rank, env, global_brain, agent, optimizer, global_t, send_rev, args):
        #global_total_loss = []
        o = env.reset()
        step = 0
        sum_rewards = 0
        max_sum_rewards = 0
        vs = []
        entropies = []
        sum_rewards = 0
        done = True
        #cnt = 0
        while global_t[0] < args.epoch:
            tmp = global_t.clone().item() + 1
            #print('cnt:',cnt)
            agent.local_brain.sync(global_brain)  # copy the global weights into the local policy
            observations = []
            actions = []
            values = []
            rewards = []
            probs = []
            R = 0
            for _ in range(args.local_t_max):
                global_t += 1
                step += 1
                # get an action via the agent's act()
                p, v = agent.local_brain(Variable(torch.from_numpy(o).float()).unsqueeze(0))
                a = agent.act(o)
                if len(a.data.squeeze().size()) == 0:
                    o, r, done, _ = env.step(a.data.squeeze().item())
                else:
                    o, r, done, _ = env.step(a.data.squeeze()[0])
                if r != 1:
                    print('-----------------------------------------------------------------------------------------------')
                if rank == 0:
                    sum_rewards += r
                    if args.render:
                        env.render()

                observations.append(o)
                actions.append(a)
                values.append(v)
                rewards.append(r)
                probs.append(p)
                if done:
                    o = env.reset()
                    #self.total_reward = 0
                    if rank == 0:
                        print('----------------------------------')
                        print('total reward of the episode:', sum_rewards)
                        print('----------------------------------')
                        if args.save_mode == 'all':
                            torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+"_{}.pkl".format(global_t[0])))
                        elif args.save_mode == 'last':
                            torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                        elif args.save_mode == 'max':
                            if max_sum_rewards < sum_rewards:
                                torch.save(agent.local_brain, os.path.join(args.log_dir, args.save_name+'.pkl'))
                                max_sum_rewards = sum_rewards
                        step = 0
                    break
                else:
                    #self.total_reward += r
                    _, v = agent.local_brain(torch.from_numpy(o).unsqueeze(0).float())
                    R += v.data.squeeze().item()

            # -- Agent advantage_push_agent.local_brain() --- compute the discounted return
            
            returns = []
            for r in rewards[::-1]:  # discounted sum of rewards
                R = r + 0.99 * R
                returns.insert(0, R)
            returns = torch.Tensor(returns)


            #if len(returns) > 1:
            #    returns = (returns-returns.mean()) / (returns.std()+args.eps)

            # -- LocalBrain _build_graph() --- compute the loss

            loss, v_loss, entropy, p_loss_list = agent._loss_function(actions, values, probs, returns, args)

            vs.append(v_loss.data.numpy())
            entropies.append(entropy.data.numpy())

            self.global_history_reward.append([tmp, sum_rewards])

            ## logging
            if rank == 0 and done:
                logger.record_tabular_misc_stat('Entropy', entropies)
                logger.record_tabular_misc_stat('V', vs)
                logger.record_tabular('reward', sum_rewards)
                logger.record_tabular('step', global_t[0])
                logger.dump_tabular()
                del vs[:]
                del entropies[:]
                sum_rewards = 0

            # update the weights (backpropagate through the whole graph)
            optimizer.zero_grad()
            final_node = [loss] + p_loss_list
            #print('final_node',final_node)
            gradients = [torch.ones(1)] + [None] * len(p_loss_list)
            #print('gradients',gradients)
            autograd.backward(final_node, gradients)
            #print('after_final_node',final_node)
            #print('after_gradients',gradients)

            #raise
            # update the learning rate (linear decay, matching the earlier train() example)
            new_lr = np.true_divide(args.epoch - global_t[0], args.epoch) * args.lr
            optimizer.step(new_lr)

            # cnt += 1

        send_rev.send(self.global_history_reward)
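
The entropy term accumulated in these actor-critic loops (written out explicitly in the previous example, hidden inside agent._loss_function here) is the Shannon entropy of the categorical action distribution; a NumPy sketch of just that term (eps and the example probabilities are placeholders):

import numpy as np

def categorical_entropy(probs, eps=1e-8):
    """H(p) = -sum_a p(a) * log(p(a)); higher entropy means a more exploratory policy."""
    probs = np.asarray(probs, dtype=np.float64)
    return float(-(probs * np.log(probs + eps)).sum())

print(categorical_entropy([0.5, 0.5]))    # ~0.693, the maximum for two actions
print(categorical_entropy([0.99, 0.01]))  # ~0.056, nearly deterministic
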
Esempio n. 20
0
def main():

    logger.configure('{}{}_logs'.format(filePath, envName))
    for k, v in C.items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    logger.log('MsPacman')

    #Start the session
    sess = tf.InteractiveSession()

    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])

    #Initialize variables to record outputs
    train_track = [0.0]
    eval_track = []
    best_reward = 0

    train_reward = tf.placeholder(tf.float32)
    eval_reward = tf.placeholder(tf.float32)
    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])
    agent = Agent(train_env, C)

    train_fs = reset_fs()
    train_s = train_env.reset()
    best_reward = 0
    train_mean = []
    eval_mean = []

    train_summary = tf.summary.scalar('train_reward', train_reward)
    eval_summary = tf.summary.scalar('eval_reward', eval_reward)
    writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName),
                                   sess.graph)
    sess.run(tf.global_variables_initializer())

    agent.net.update_target_network()

    for it in range(C['iterations']):

        train_fs.append(train_s)

        train_a = agent.act(np.transpose(train_fs, (1, 2, 0)))
        ns, train_r, train_d, _ = train_env.step(train_a)
        #print('Iteration ',it, ' Reward ', train_r)
        train_track[-1] += train_r
        agent.record(train_s, train_a, train_r, float(train_d), it)
        train_s = ns

        if train_d:
            if train_env.env.env.was_real_done:  # one env for MsPacman, Freeway (No Fire action)
                if len(train_track) % 100 == 0:
                    mean = np.mean(train_track[-100:])
                    train_mean.append(mean)
                    summary = sess.run(train_summary,
                                       feed_dict={train_reward: mean})
                    writer.add_summary(summary, it)
                    logger.record_tabular('steps', it)
                    logger.record_tabular('episode', len(train_track))
                    logger.record_tabular('epsilon', 100 * agent.epsilon)
                    logger.record_tabular('learning rate', agent.lr)
                    logger.record_tabular('Mean Reward 100 episodes', mean)
                    logger.dump_tabular()
                    with open(resultPath + 'reward_atari_base.pk1', 'wb') as f:
                        pickle.dump(train_track,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

                train_track.append(0.0)

            train_fs = reset_fs()
            train_s = train_env.reset()

        if (it + 1) % C['eval_freq'] == 0:

            for i in range(C['eval_episodes']):
                temp_video = []
                eval_track.append(0.0)
                eval_fs = reset_fs()
                eval_s = eval_env.reset()
                while True:
                    temp_video.append(eval_s)
                    eval_fs.append(eval_s)
                    eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0)))
                    eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
                    eval_track[-1] += eval_r

                    if eval_env.env.env.was_real_done:
                        break
                    if eval_d:
                        eval_fs = reset_fs()
                        eval_s = eval_env.reset()

                if eval_track[-1] > best_reward:
                    best_reward = eval_track[-1]
                    best_video = temp_video
                    with open(resultPath + 'video_atari_base.pk1', 'wb') as f:
                        pickle.dump(best_video,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

            eval_mean.append(np.mean(eval_track[-C['eval_episodes']:]))
            summary = sess.run(eval_summary,
                               feed_dict={
                                   eval_reward:
                                   np.mean(eval_track[-C['eval_episodes']:])
                               })
            writer.add_summary(summary, it)

        if it == 1000000:
            outputs = agent.net.get_outputs(np.transpose(train_fs, (1, 2, 0)))
            with open(resultPath + 'outputs.pk1', 'wb') as f:
                pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL)
            with open(resultPath + 'outputs_screen.pk1', 'wb') as f:
                pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open(resultPath + 'reward_atari_base.pk1', 'wb') as f:
        pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(resultPath + 'trainMean_atari_base.pk1', 'wb') as f:
        pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(resultPath + 'evalMean_atari_base.pk1', 'wb') as f:
        pickle.dump(eval_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
    agent.net.save(filePath + '{}_model2'.format(C['env_id']))
    sess.close()
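
reset_fs() is used above but not shown; a plausible implementation, assuming the usual DQN preprocessing of four stacked 84x84 grayscale frames (the stack depth and frame shape are assumptions, not values taken from the snippet):

from collections import deque
import numpy as np

def reset_fs(stack=4, frame_shape=(84, 84)):
    """Hypothetical frame stack: a fixed-length deque primed with blank frames."""
    fs = deque(maxlen=stack)
    for _ in range(stack):
        fs.append(np.zeros(frame_shape, dtype=np.uint8))
    return fs

fs = reset_fs()
fs.append(np.ones((84, 84), dtype=np.uint8))   # newest observation pushes out the oldest
state = np.transpose(np.array(fs), (1, 2, 0))  # (84, 84, 4), matching the act() calls above
print(state.shape)
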
Esempio n. 21
0
def testing(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=50,
        nb_rollout_steps=3,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        # no noise for test
        #   noise_type='adaptive-param_0.2',
        #   noise_type='normal_0.9',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    # nb_actions = 2*env.grid_size
    nb_actions = env.grid_size
    action_shape = np.array(nb_actions * [0]).shape
    nb_features = (4 + 1) * env.grid_size
    observation_shape = np.array(nb_features * [0]).shape
    grid_x = env.grid_x
    grid_y = env.grid_y
    x = []
    y = []
    for i in range(grid_x):
        x.append(i + 1)
    for i in range(grid_y):
        y.append(i + 1)
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    '''no noise for test'''
    # if noise_type is not None:
    #     for current_noise_type in noise_type.split(','):
    #         current_noise_type = current_noise_type.strip()
    #         if current_noise_type == 'none':
    #             pass
    #         elif 'adaptive-param' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
    #         elif 'normal' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         elif 'ou' in current_noise_type:
    #             _, stddev = current_noise_type.split('_')
    #             action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    #         else:
    #             raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # max_action = env.action_space.high
    # logger.info('scaling actions by {} before executing in env'.format(max_action))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    # agent.initialize(sess)
    # sess.graph.finalize()
    agent.load(sess, save_path)

    agent.reset()

    obs, env_state = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    average_reward = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_state = []
    epoch_episodes = 0
    #record the car numbers in each step
    car_num_set = {}
    t_set = [i for i in range(total_timesteps)]
    for xx in x:
        for yy in y:
            lab = str(xx) + str(yy)
            car_num_set[lab] = [[0 for i in range(total_timesteps)]
                                for j in range(4)]

    for epoch in range(nb_epochs):
        obs, env_state = env.reset()
        epoch_actions = []
        epoch_state = []
        average_car_num_set = []
        last_action = 1
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            action, q, _, _ = agent.step(obs,
                                         apply_noise=False,
                                         compute_Q=True)
            '''random action'''
            # if np.random.rand()>0.5:
            #     action=[1]
            # else:
            #     action=[0]
            '''cycle light state'''
            # action=[0]
            '''cycle action (should cycle state instead of action)'''
            # if last_action==1:
            #     action=[0]
            # else:
            #     action=[1]
            # last_action=action[0]

            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                new_obs, r, env_state, done = env.step(action, env_state)
                epoch_state.append(env_state['11'].light_state)
                for xx in x:
                    for yy in y:
                        lab = str(xx) + str(yy)
                        for i in range(4):
                            car_num_set[lab][i][t] = (
                                env_state['11'].car_nums[i])
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.
                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        print('done')
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            epoch_episode_rewards.append(episode_reward)
            average_reward.append(episode_reward / nb_rollout_steps)

            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            # for t_train in range(nb_train_steps):
            #     # Adapt param noise, if necessary.
            #     if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
            #         distance = agent.adapt_param_noise()
            #         epoch_adaptive_distances.append(distance)
            #     # print('Train!')
            #     cl, al = agent.train()
            #     epoch_critic_losses.append(cl)
            #     epoch_actor_losses.append(al)
            #     agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0
            step_set.append(t)

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        # plt.figure(figsize=(8,5))
        '''plot rewards-steps'''
        ax1 = plt.subplot(2, 1, 1)
        plt.sca(ax1)
        plt.plot(step_set, average_reward, color='b')
        # plt.xlabel('Steps')
        plt.ylabel('Mean Reward', fontsize=12)
        # plt.ylim(-15000,0)
        '''plot queueing car numbers-steps'''
        ax2 = plt.subplot(2, 1, 2)
        plt.sca(ax2)
        print(np.shape(t_set), np.shape(car_num_set['11'][0]))
        for i in range(4):
            if i == 0:
                plt.plot(t_set, car_num_set['11'][i], '--', label=i, color='b')
            elif i == 1:
                plt.plot(t_set,
                         car_num_set['11'][i],
                         '--',
                         label=i,
                         color='orange')
            elif i == 2:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='g')
            else:
                plt.plot(t_set, car_num_set['11'][i], label=i, color='r')
        plt.ylim(0, 100)
        #sum among roads
        sum_car_num = np.sum(car_num_set['11'], axis=0)
        #average among time steps
        average_car_num = np.average(sum_car_num)
        average_car_num_set.append(average_car_num)

        plt.xlabel('Steps', fontsize=12)
        plt.ylabel('Cars Numbers', fontsize=12)
        # set legend
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = OrderedDict(zip(labels, handles))
        leg = plt.legend(by_label.values(), by_label.keys(), loc=1)
        # leg = plt.legend(loc=4)
        legfm = leg.get_frame()
        legfm.set_edgecolor('black')  # set legend frame color
        legfm.set_linewidth(0.5)  # set legend frame linewidth
        plt.savefig('ddpg_mean_test.pdf')
        plt.show()
        print(epoch_state)

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('average queueing car numbers: ', np.average(average_car_num_set))

    return agent
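
A standalone sketch of the cross-worker averaging done near the end of each epoch above: every rank contributes a vector of scalar stats, allreduce sums them elementwise, and dividing by the world size yields the mean. Run it under mpirun; the stat values are invented placeholders.

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_stats = {'rollout/return': 1.0 * comm.Get_rank(), 'rollout/Q_mean': 0.5}

keys = sorted(local_stats.keys())
local_vec = np.array([float(local_stats[k]) for k in keys])
summed = comm.allreduce(local_vec)  # elementwise sum over all ranks (default op is SUM)
averaged = {k: v / comm.Get_size() for k, v in zip(keys, summed)}

if comm.Get_rank() == 0:
    print(averaged)
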
Esempio n. 22
0
def main():

  #Adding configuration file details into logger
  logger.configure('{}{}_logs'.format(filePath, envName))
  for k, v in C.items():
    logger.record_tabular(k, v)
  logger.dump_tabular()        

  logger.log('Practice DQN with Dense 512') 

  sess = tf.InteractiveSession()
  train_env = make_env(C['env_id'], C['noop_max'])
  eval_env = make_env(C['env_id'], C['noop_max'])
  train_s = train_env.reset()
  agent = Agent(train_env, C)

  train_reward = tf.placeholder(tf.float32)
  eval_reward = tf.placeholder(tf.float32)
  train_summary = tf.summary.scalar('train_reward', train_reward)
  eval_summary = tf.summary.scalar('eval_reward', eval_reward)
  writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName), sess.graph)

  sess.run(tf.global_variables_initializer())

  #Practice
  for it in range(C['pre_iterations']):
    train_a = agent.act_pre()
    ns, train_r, train_d, _ = train_env.step(train_a)
    agent.record(train_s, train_a, train_r, float(train_d), it, True)
    train_s = ns
    if train_d:
      train_s = train_env.reset()
 
  logger.log('Pre-training completed')

  #Initializing Online RL training network 
  agent.net.initialize_online_network()
  train_track = [0.0]
  eval_track = []
  best_reward = 0
  
  train_fs = reset_fs()
  train_s = train_env.reset()
  best_reward = 0
  train_mean = []
  eval_mean = []
  
  
  agent.net.update_target_network()
  
  #RL training
  for it in range(C['iterations']):
    
    train_fs.append(train_s)
    
    train_a = agent.act(np.transpose(train_fs, (1,2,0)))
    ns, train_r, train_d, _ = train_env.step(train_a)
    train_track[-1]+= train_r
    agent.record(train_s, train_a, train_r, float(train_d), it, False)
    train_s = ns

    if train_d:
      if train_env.env.env.was_real_done:
        if len(train_track) % 100 == 0:

          #records statistics to logger and tensorboard
          train_mean.append(np.mean(train_track[-100:]))
          summary = sess.run(train_summary, feed_dict={train_reward:np.mean(train_track[-100:])})
          writer.add_summary(summary, it)
          logger.record_tabular('steps', it)
          logger.record_tabular('episode', len(train_track))
          logger.record_tabular('epsilon', 100*agent.epsilon)
          logger.record_tabular('learning rate', agent.lr)
          logger.record_tabular('Mean Reward 100 episodes', np.mean(train_track[-100:]))
          logger.dump_tabular()
          with open(resultPath + 'reward_atari_practice.pk1', 'wb') as f:
              pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
        train_track.append(0.0)
      
      train_fs = reset_fs()
      train_s = train_env.reset()
        
    #Evaluation
    if (it+1)%C['eval_freq'] == 0:
        
      for i in range(C['eval_episodes']):
        temp_video = []
        eval_track.append(0.0)
        eval_fs = reset_fs()
        eval_s = eval_env.reset()
        while True:
          temp_video.append(eval_s)
          eval_fs.append(eval_s)
          eval_a = agent.greedy_act(np.transpose(eval_fs, (1,2,0)))
          eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
          eval_track[-1] += eval_r
          if eval_env.env.env.was_real_done:
            break
          if eval_d:
            eval_fs = reset_fs()
            eval_s = eval_env.reset()
                
        if eval_track[-1] > best_reward:
          best_reward = eval_track[-1]
          best_video = temp_video
          with open(resultPath + 'video_atari_practice.pk1', 'wb') as f:
              pickle.dump(best_video, f, protocol=pickle.HIGHEST_PROTOCOL)
              
      eval_mean.append(np.mean(eval_track[-C['eval_episodes']:]))
      logger.log('Evaluate mean reward: {:.2f}, max reward: {:.2f}, std: {:.2f}'.format(np.mean(eval_track[-C['eval_episodes']:]), np.max(eval_track[-C['eval_episodes']:]), np.std(eval_track[-C['eval_episodes']:])))
      summary = sess.run(eval_summary, feed_dict={eval_reward:np.mean(eval_track[-C['eval_episodes']:])})
      writer.add_summary(summary, it)
      with open(resultPath + 'eval_reward_atari_practice.pk1', 'wb') as f:
          pickle.dump(eval_track, f, protocol=pickle.HIGHEST_PROTOCOL)      

    #Storing current state and outputs from Convolution layers
    if it%1000000 == 0:
      outputs = agent.net.get_outputs(np.transpose(train_fs, (1,2,0)))
      with open(resultPath+str(it)+'outputs.pk1', 'wb') as f:
        pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL)
      with open(resultPath+str(it)+'outputs_screen.pk1', 'wb') as f:
        pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL)

  #Storing required outputs as pickle files        
  with open(resultPath + 'reward_atari_practice.pk1', 'wb') as f:
    pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
  with open(resultPath + 'trainMean_atari_practice.pk1', 'wb') as f:
    pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
  with open(resultPath+ 'evalMean_atari_practice.pk1', 'wb') as f:
    pickle.dump(eval_mean, f, protocol=pickle.HIGHEST_PROTOCOL)        
  agent.net.save(filePath + '{}_model2'.format(C['env_id']))
  sess.close()
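
agent.net.update_target_network() is called above but its body is not shown; a common TF1 pattern is to build assign ops that copy every online-network variable into the matching target-network variable. The 'online' and 'target' scope names below are assumptions for illustration, not names from the snippet.

import tensorflow as tf

def build_target_update_op(online_scope='online', target_scope='target'):
    """Hard update: assign each online variable to its target counterpart."""
    online_vars = sorted(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=online_scope),
                         key=lambda v: v.name)
    target_vars = sorted(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope),
                         key=lambda v: v.name)
    return tf.group(*[t.assign(o) for o, t in zip(online_vars, target_vars)])

with tf.variable_scope('online'):
    q_online = tf.layers.dense(tf.placeholder(tf.float32, [None, 4]), 2)
with tf.variable_scope('target'):
    q_target = tf.layers.dense(tf.placeholder(tf.float32, [None, 4]), 2)

update_op = build_target_update_op()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_op)  # target weights now equal online weights
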
Esempio n. 23
0
def train(params):
    sess = get_session(interactive=True)
    env = get_env(params['env_name'], params.get('video_dir'))
    inner_env = get_inner_env(env)

    # get offline traj
    train_collection, val_collection, normalization, path_collection, rollout_sampler = \
        get_data_from_offline_batch(params, env, split_ratio=0.85)
    behavior_policy_train_collection, behavior_policy_val_collection, behavior_policy_normalization, \
    behavior_policy_path_collection, behavior_policy_rollout_sampler =\
        get_data_from_offline_batch(params, env, model='behavior_policy', split_ratio=1.0, normalization_scope='behavior_policy')

    # ############################################################
    # ############### create computational graph #################
    # ############################################################
    policy = create_policy_from_params(params, env, sess)
    controller = create_controller_from_policy(policy)
    rollout_sampler.update_controller(controller)

    # (approximated) behavior policy
    behavior_policy = create_behavior_policy_from_params(params, env, sess)
    behavior_policy.running_stats.update_stats(
        behavior_policy_train_collection.data["observations"])
    behavior_policy_controller = create_controller_from_policy(behavior_policy)
    behavior_policy_rollout_sampler.update_controller(
        behavior_policy_controller)

    dyn_model = create_dynamics_model(params, env, normalization, sess)

    if params['algo'] not in ('trpo', 'vime'):
        raise NotImplementedError

    algo = create_trpo_algo(
        params,
        env,
        inner_env,
        policy,
        dyn_model,
        sess,
        behavior_policy=behavior_policy,
        offline_dataset=behavior_policy_train_collection.data["observations"])

    # ############################################################
    # ######################### learning #########################
    # ############################################################

    # init global variables
    all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope=None)
    policy_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="policy")
    behavior_policy_variables = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope="behavior_policy")
    if params['param_value']:
        value_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope="baseline")
        all_var_except_policy = [
            v for v in all_variables
            if v not in (policy_variables + behavior_policy_variables +
                         value_variables)
        ]
    else:
        all_var_except_policy = [
            v for v in all_variables
            if v not in (policy_variables + behavior_policy_variables)
        ]

    train_dyn_with_intrinsic_reward_only = params["dynamics"].get(
        "intrinsic_reward_only", False)
    logger.log("Train dynamics model with intrinsic reward only? {}".format(
        train_dyn_with_intrinsic_reward_only))

    dynamics_saver = tf.train.Saver(var_list=all_var_except_policy)
    behavior_policy_saver = tf.train.Saver(var_list=behavior_policy_variables)
    policy_saver = tf.train.Saver(var_list=policy_variables)
    tf.global_variables_initializer().run()

    start_itr = params.get("start_onpol_iter", 0)
    end_itr = params['onpol_iters']

    if train_dyn_with_intrinsic_reward_only:
        # Note: not supported
        dyn_model.use_intrinsic_rewards_only()
    else:
        dyn_model.use_external_rewards_only()

    # training
    if confirm_restoring_dynamics_model(params):
        restore_model(params, dynamics_saver, sess)
    else:
        # Fit the dynamics.
        logger.info("Fitting dynamics.")
        dyn_model.fit(train_collection, val_collection)
        logger.info("Done fitting dynamics.")
        save_cur_iter_dynamics_model(params, dynamics_saver, sess, itr=0)
    rollout_sampler.update_dynamics(dyn_model)

    # train behavior policy
    # Note: restoring policy is not supported yet.
    if confirm_restoring_behavior_policy(params):
        restore_behavior_policy(params, behavior_policy_saver, sess)
    else:
        behavior_policy.fit_as_bc(behavior_policy_train_collection,
                                  behavior_policy_val_collection,
                                  behavior_policy_rollout_sampler)
        save_cur_iter_behavior_policy(params,
                                      behavior_policy_saver,
                                      sess,
                                      itr=0)

    # re-initialize TRPO policy with BC policy
    if params['bc_init']:
        logger.info("Initialize TRPO policy with BC.")
        update_weights = [
            tf.assign(new, old)
            for (new, old) in zip(tf.trainable_variables('policy'),
                                  tf.trainable_variables('behavior_policy'))
        ]
        sess.run(update_weights)
        algo.reinit_with_source_policy(behavior_policy)
        if rollout_sampler:
            rl_paths = rollout_sampler.sample(
                num_paths=params['num_path_onpol'],
                horizon=params['env_horizon'],
                evaluation=True)
            returns = np.mean(
                np.array([sum(path["rewards"]) for path in rl_paths]))
            logger.info("TRPO policy initialized with BC average return: " +
                        str(returns))

    if params['pretrain_value']:
        logger.info("Fitting value function.")
        behavior_policy_train_collection.set_batch_size(
            params['max_path_length'])
        for obses, _, _, rewards in behavior_policy_train_collection:
            algo.pre_train_baseline(obses, rewards, params['trpo']['gamma'],
                                    params['trpo']['gae'])
        logger.info("Done fitting value function.")

    # restore TRPO policy
    if confirm_restoring_policy(params):
        restore_policy(params, policy_saver, sess, start_itr)
        logger.info("TRPO policy resumed from iter {}".format(str(start_itr)))

    # Main training loop
    for itr in range(start_itr, end_itr):
        logger.info('itr #%d | ' % itr)

        # Update randomness
        logger.info("Updating randomness.")
        dyn_model.update_randomness()
        logger.info("Done updating randomness.")

        # Policy training
        logger.info("Training policy using TRPO.")

        if params['policy'].get('reinitialize_every_itr', False):
            logger.info("Re-initialize policy variables.")
            policy.initialize_variables()

        train_policy_trpo(params, algo, dyn_model,
                          params['trpo']['iterations'], sess)

        logger.info("Done training policy.")

        # Generate on-policy rollouts.
        # only for evaluation, not for updating data
        logger.info("Generating on-policy rollouts.")
        if params['eval_model']:
            rl_paths, rollouts, residuals = rollout_sampler.sample(
                num_paths=params['num_path_onpol'],
                horizon=params['env_horizon'],
                evaluation=True,
                eval_model=params['eval_model'])
        else:
            rl_paths = rollout_sampler.sample(
                num_paths=params['num_path_onpol'],
                horizon=params['env_horizon'],
                evaluation=True)
        logger.info("Done generating on-policy rollouts.")
        returns = np.array([sum(path["rewards"]) for path in rl_paths])
        log_tabular_results(returns, itr, train_collection)
        if params['eval_model']:
            n_transitions = sum([len(path["rewards"]) for path in rl_paths])
            # step_wise_analysis
            step_wise_mse = np.mean(
                [sum(np.array(path["observations"])**2) for path in residuals])
            step_wise_mse /= n_transitions
            logger.record_tabular('step_wise_mse', step_wise_mse)
            step_wise_episode_mean = np.mean(
                [sum(path["rewards"]) for path in residuals])
            logger.record_tabular('step_wise_episode_mean',
                                  step_wise_episode_mean)
            # trajectory_wise_analysis
            min_path = min([len(path["observations"]) for path in rl_paths])
            min_rollout = min(
                [len(rollout["observations"]) for rollout in rollouts])
            traj_len = min(min_path, min_rollout)
            traj_wise_mse = np.mean([
                sum((np.array(path["observations"])[:traj_len] -
                     np.array(rollout["observations"])[:traj_len])**2)
                for (path, rollout) in zip(rl_paths, rollouts)
            ])
            traj_wise_mse /= traj_len * params['num_path_onpol']
            logger.record_tabular('traj_wise_mse', traj_wise_mse)
            traj_wise_episode_mean = np.mean(
                [sum(path["rewards"][:traj_len]) for path in rollouts])
            logger.record_tabular('traj_wise_episode_mean',
                                  traj_wise_episode_mean)
            # Energy distance between \tau_{sim} and \tau_{real}
            combination_sim_real = list(itertools.product(rl_paths, rollouts))
            A = np.mean([
                sum(
                    np.sqrt((np.array(v[0]["observations"][:traj_len]) -
                             np.array(v[1]["observations"][:traj_len]))**2))
                for v in combination_sim_real
            ])
            combination_sim = list(itertools.product(rollouts, rollouts))
            B = np.mean([
                sum(
                    np.sqrt((np.array(v[0]["observations"][:traj_len]) -
                             np.array(v[1]["observations"][:traj_len]))**2))
                for v in combination_sim
            ])
            combination_real = list(itertools.product(rl_paths, rl_paths))
            C = np.mean([
                sum(
                    np.sqrt((np.array(v[0]["observations"][:traj_len]) -
                             np.array(v[1]["observations"][:traj_len]))**2))
                for v in combination_real
            ])
            energy_dist = np.sqrt(2 * A - B - C)
            logger.record_tabular('energy_distance', energy_dist)
            logger.dump_tabular()
        if itr % 100 == 0 or itr == 250 or itr == end_itr - 1:
            save_cur_iter_policy(params, policy_saver, sess, itr)
            if params['save_variables']:
                algo.baseline.save_value_function(params['exp_dir'], itr)
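
The A / B / C terms above implement an energy-distance-style score between the real and simulated trajectory sets; a compact sketch of the same computation with tiny invented trajectories:

import itertools
import numpy as np

def energy_distance(real_trajs, sim_trajs):
    """Mirror the A (real-sim), B (sim-sim) and C (real-real) terms in the loop above."""
    def mean_pairwise(set_a, set_b):
        pairs = itertools.product(set_a, set_b)
        return np.mean([np.sqrt((a - b) ** 2).sum() for a, b in pairs])

    A = mean_pairwise(real_trajs, sim_trajs)
    B = mean_pairwise(sim_trajs, sim_trajs)
    C = mean_pairwise(real_trajs, real_trajs)
    return np.sqrt(2 * A - B - C)

# two invented 1-D observation trajectories per set, already truncated to equal length
real = [np.array([0.0, 0.1, 0.2]), np.array([0.0, 0.2, 0.4])]
sim = [np.array([0.1, 0.1, 0.1]), np.array([0.0, 0.0, 0.0])]
print(energy_distance(real, sim))
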
Esempio n. 24
0
    def process_samples(self, itr, paths):

        start = time.time()
        all_path_baselines = [self.algo.baseline.predict(path) for path in paths]
        print("baseline predicting time: {}".format(time.time()-start) + "[sec]")
        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                self.algo.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["behavior_policy_kl"] = misc_utils.gauss_KL(
                path["agent_infos"]["mean"],
                path["agent_infos"]["logstd"],
                path["behavior_policy_agent_infos"]["mean"],
                path["behavior_policy_agent_infos"]["logstd"], axis=1)
            deltas = deltas - self.algo.alpha * path["behavior_policy_kl"]
            # GAE calculation
            path["advantages"] = misc_utils.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)

            # a trick to reduce variance but gives biased gradient
            path["value_targets"] = path["advantages"] + np.array(path_baselines[:-1])

        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]

        adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            paths=paths,
        )

        advantages = [path["advantages"] for path in paths]
        advantages = tensor_utils.pad_tensor_n(advantages, max_path_length)
        behavior_policy_kls = [path["behavior_policy_kl"] for path in paths]
        behavior_policy_kls = tensor_utils.pad_tensor_n(behavior_policy_kls, max_path_length)

        logger.record_tabular('advantages_mean', np.mean(advantages))
        logger.record_tabular('advantages_max', np.max(advantages))
        logger.record_tabular('advantages_min', np.min(advantages))
        logger.record_tabular('behavior_policy_kl_mean', np.mean(behavior_policy_kls))
        logger.record_tabular('behavior_policy_kl_max', np.max(behavior_policy_kls))
        logger.record_tabular('behavior_policy_kl_min', np.min(behavior_policy_kls))
        logger.dump_tabular()

        return samples_data
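
The deltas-to-advantages step above is the standard GAE recursion; a NumPy sketch of discount_cumsum and the full advantage computation (the reward and value numbers are illustrative):

import numpy as np

def discount_cumsum(x, discount):
    """y_t = x_t + discount * y_{t+1}, computed right to left."""
    y = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """values has one extra entry: the bootstrap value appended as 0 above."""
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    deltas = rewards + gamma * values[1:] - values[:-1]  # TD residuals
    return discount_cumsum(deltas, gamma * lam)

print(gae_advantages([1.0, 1.0, 1.0], [0.5, 0.4, 0.3, 0.0]))
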
Esempio n. 25
0
def main():
    logger.configure('logs/long_short')

    global T, PRIORITY, DICHOTOMY, ENTROPY

    for PRIORITY, DICHOTOMY, ENTROPY in [
        (True, False, True),
        (True, False, False),
        (False, True, False),
        (False, False, False),
    ]:
        income_means, income_stds = [], []
        short_ratios, long_ratios = [], []
        short_passengers, long_passengers = [], []
        for seed in range(N_RUNS):
            np.random.seed(seed)
            T = 0
            g_lanes.clear()
            g_lanes.update({'short': Lane(), 'long': Lane()})
            # short_passengers, long_passengers = [], []
            enter_passengers = np.random.poisson(0.1, size=LENGTH)

            g_taxis.clear()
            for i in range(N_TAXIS // 2):
                g_taxis.append(Taxi(i))
                enter(g_taxis[-1], g_lanes['short'])
            for i in range(N_TAXIS // 2):
                g_taxis.append(Taxi(i + N_TAXIS // 2))
                enter(g_taxis[-1], g_lanes['long'])

            while T < LENGTH:
                if enter_passengers[T]:
                    dist = max(
                        2, np.random.choice(range(len(DISTANCES)),
                                            p=DISTANCES))
                    p = Passenger(dist)
                    if not DICHOTOMY:
                        lane = RANDOM_LANE()
                    elif p.distance <= THRESHOLD:
                        lane = g_lanes['short']
                    else:
                        lane = g_lanes['long']
                    lane.passengers.append(p)

                g_lanes['short'].step()
                g_lanes['long'].step()
                for taxi in g_taxis:
                    taxi.step()

                short_passengers.append(len(g_lanes['short'].passengers))
                long_passengers.append(len(g_lanes['long'].passengers))

                T += 1

            incomes = [np.sum(t.incomes) for t in g_taxis]

            income_means.append(np.mean(incomes))
            income_stds.append(np.std(incomes))
            short_ratios.append(
                np.mean([r for t in g_taxis for r in t.income_ratio['short']]))
            long_ratios.append(
                np.mean([r for t in g_taxis for r in t.income_ratio['long']]))

        # logger.info(income_means)
        # logger.info(income_stds)
        logger.record_tabular('*priority*', PRIORITY)
        logger.record_tabular('*dichotomy*', DICHOTOMY)
        logger.record_tabular('*entropy*', ENTROPY)
        logger.record_tabular('income mean', np.mean(income_means))
        logger.record_tabular('income std', np.mean(income_stds))
        logger.record_tabular('queuing time mean',
                              np.mean([t.queue_time for t in g_taxis]))
        logger.record_tabular('short income ratio mean',
                              np.mean(short_ratios) * 3600)
        logger.record_tabular('short income ratio std',
                              np.std(short_ratios) * 3600)
        logger.record_tabular('long income ratio mean',
                              np.mean(long_ratios) * 3600)
        logger.record_tabular('long income ratio std',
                              np.std(long_ratios) * 3600)
        logger.record_tabular('# short lane passengers',
                              np.mean(short_passengers))
        logger.record_tabular('# long lane passengers',
                              np.mean(long_passengers))
        logger.dump_tabular()
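
The passenger-routing rule above either picks a lane at random or splits trips at THRESHOLD. A small standalone sketch of that assignment logic, with simplified stand-ins for the snippet's globals THRESHOLD, g_lanes and RANDOM_LANE:

import random

THRESHOLD = 5                          # hypothetical distance cutoff for "short" trips
g_lanes = {'short': [], 'long': []}    # simplified: lanes as plain passenger queues

def random_lane():
    # stand-in for RANDOM_LANE(): pick either lane uniformly at random
    return g_lanes[random.choice(['short', 'long'])]

def assign_lane(distance, dichotomy=True):
    """Return the queue a passenger with the given trip distance joins."""
    if not dichotomy:
        return random_lane()
    return g_lanes['short'] if distance <= THRESHOLD else g_lanes['long']

assign_lane(3).append('passenger going 3 km')    # joins the short lane
assign_lane(12).append('passenger going 12 km')  # joins the long lane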
Esempio n. 26
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
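
The stat-logging block above sums every scalar over MPI workers and divides by the world size. A minimal sketch of that reduction, assuming mpi4py is available and all stats are scalars:

import numpy as np
from mpi4py import MPI

def mpi_average_stats(stats):
    """Average a dict of scalar stats across all MPI workers."""
    comm = MPI.COMM_WORLD
    keys = sorted(stats.keys())                  # fixed ordering on every rank
    local = np.array([float(stats[k]) for k in keys])
    total = comm.allreduce(local)                # element-wise sum over workers
    return dict(zip(keys, total / comm.Get_size()))

# usage (run under mpirun): each worker passes its own epoch statistics
averaged = mpi_average_stats({'rollout/return': 1.5, 'train/loss_actor': 0.2})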
Esempio n. 27
0
def learn(env,
          policy_func,
          reward_giver,
          reward_guidance,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          task_name,
          gamma,
          lam,
          algo,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=1e-4,
          vf_iters=3,
          max_timesteps=0,
          max_episodes=0,
          max_iters=0,
          loss_percent=0.0,
          callback=None):

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = build_policy(env, 'mlp', value_network='copy')

    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope('oldpi'):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables('pi')
    # var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")]
    # vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    guidance_adam = MpiAdam(reward_guidance.get_trainable_variables())

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables('oldpi'), get_variables('pi'))
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    guidance_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     reward_guidance,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     algo=algo,
                                     loss_percent=loss_percent)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # global flag_render
        # if iters_so_far > 0 and iters_so_far % 10 ==0:
        #     flag_render = True
        # else:
        #     flag_render = False

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            print('rewards', seg['rew'])
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before udpate
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(
            batch_size=len(ob))
        batch_size = 128
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Discriminator"):
            for (ob_batch, ac_batch) in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)
                # update running mean/std for reward_giver
                if hasattr(reward_giver, "obs_rms"):
                    reward_giver.obs_rms.update(
                        np.concatenate((ob_batch, ob_expert), 0))
                *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert)
                d_adam.update(allmean(g), d_stepsize)
                d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ------------------ Update Guidance ------------
        logger.log("Optimizing Guidance...")

        logger.log(fmt_row(13, reward_guidance.loss_name))
        batch_size = 128
        guidance_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        with timed("Guidance"):
            for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)

                idx_condition = process_expert(ob_expert, ac_expert)
                pick_idx = (idx_condition >= loss_percent)
                # pick_idx = idx_condition

                ob_expert_p = ob_expert[pick_idx]
                ac_expert_p = ac_expert[pick_idx]

                ac_batch_p = []
                for each_ob in ob_expert_p:
                    tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True)
                    ac_batch_p.append(tmp_ac)

                # update running mean/std for reward_giver
                if hasattr(reward_guidance, "obs_rms"):
                    reward_guidance.obs_rms.update(ob_expert_p)
                # reward_guidance.train(expert_s=ob_batch_p, agent_a=ac_batch_p, expert_a=ac_expert_p)
                *newlosses, g = reward_guidance.lossandgrad(
                    ob_expert_p, ac_batch_p, ac_expert_p)
                guidance_adam.update(allmean(g), d_stepsize)
                guidance_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(guidance_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens) * g_step
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
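
The natural-gradient update above solves F x = g with conjugate gradient and then rescales the direction so the quadratic KL estimate equals max_kl. A compact sketch of both pieces, independent of the baselines cg helper:

import numpy as np

def conjugate_gradient(fvp, g, cg_iters=10, residual_tol=1e-10):
    """Solve F x = g, where fvp(v) returns the Fisher-vector product F v."""
    x = np.zeros_like(g)
    r = g.copy()
    p = g.copy()
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

def trust_region_step(fvp, g, max_kl):
    """Scale the CG direction so that 0.5 * step^T F step == max_kl."""
    stepdir = conjugate_gradient(fvp, g)
    shs = 0.5 * stepdir.dot(fvp(stepdir))
    lm = np.sqrt(shs / max_kl)
    return stepdir / lm

# toy usage with a diagonal Fisher matrix
F = np.diag([2.0, 0.5, 1.0])
step = trust_region_step(lambda v: F @ v, g=np.array([1.0, -2.0, 0.5]), max_kl=0.01)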
Esempio n. 28
0
def retraining(
        save_path,
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=4,  #50
        nb_rollout_steps=3,  #100
        reward_scale=1.0,
        render=False,
        render_eval=False,
        #   noise_type='adaptive-param_0.2',
        noise_type='normal_0.2',
        #   noise_type='ou_0.9',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-4,
        #   actor_lr=1e-6,
        #   critic_lr=1e-5,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=3,  # per epoch cycle and MPI worker,  50
        nb_eval_steps=1,  #100
        batch_size=640,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=3,  #50
        **network_kwargs):

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    # nb_actions = env.action_space.shape[-1]
    nb_actions = env.num_actions

    # nb_actions=3
    # print(nb_actions)
    action_shape = np.array(nb_actions * [0]).shape

    #4 pairs pos + 3 link length
    # nb_features = 2*(env.num_actions+1)+env.num_actions

    #4 pairs pos + 1 pair target pos
    nb_features = 2 * (env.num_actions + 2)
    observation_shape = np.array(nb_features * [0]).shape
    # assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.

    # memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    memory = Memory(limit=int(1e6),
                    action_shape=action_shape,
                    observation_shape=observation_shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    action_noise = None
    param_noise = None
    # nb_actions = env.action_space.shape[-1]
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
    agent = DDPG(actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    # sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar
    step_set = []
    reward_set = []

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    mean_epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    #load the initialization policy
    agent.load_ini(sess, save_path)
    # agent.memory.clear(limit=int(1e6), action_shape=action_shape, observation_shape=observation_shape)
    for epoch in range(nb_epochs):
        print(nb_epochs)
        # obs, env_state = env.reset()
        obs = env.reset()
        agent.save(save_path)
        epoch_episode_rewards = []
        '''check whether the actor initialization policy has been loaded correctly,
        i.e. whether it matches the values output directly from the checkpoint files'''
        # loaded_weights=tf.get_default_graph().get_tensor_by_name('target_actor/mlp_fc0/w:0')
        # print('loaded_weights:', sess.run(loaded_weights))
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.

            for t_rollout in range(nb_rollout_steps):
                # Predict next action
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)
                print('action:', action)

                new_obs, r, done = env.step(action)
                # time.sleep(0.2)
                t += 1

                episode_reward += r
                episode_step += 1
                # print('episode_re: ', episode_reward) #[1.]

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                b = 1.
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)
                # print('Train!')
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        mean_epoch_episode_rewards.append(np.mean(epoch_episode_rewards))
        # print(step_set,mean_epoch_episode_rewards)
        step_set.append(t)
        plt.plot(step_set,
                 mean_epoch_episode_rewards,
                 color='r',
                 label='Initialization')
        plt.xlabel('Steps')
        plt.ylabel('Mean Episode Reward')
        plt.savefig('ddpg_mean_retrain.png')
        # plt.show()

        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)
    print('stepset: ', step_set)
    print('rewards: ', mean_epoch_episode_rewards)

    return agent
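
retraining parses noise_type strings such as 'normal_0.2' into parameter or action noise objects. A minimal sketch of that parsing as a standalone function, assuming the noise classes are importable from baselines.ddpg.noise:

import numpy as np
from baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
                                  OrnsteinUhlenbeckActionNoise)

def parse_noise(noise_type, nb_actions):
    """Turn a spec like 'normal_0.2' or 'adaptive-param_0.2,ou_0.3' into noise objects."""
    action_noise, param_noise = None, None
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            continue
        name, stddev = spec.split('_')
        stddev = float(stddev)
        if name == 'adaptive-param':
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=stddev,
                                                 desired_action_stddev=stddev)
        elif name == 'normal':
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=stddev * np.ones(nb_actions))
        elif name == 'ou':
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                        sigma=stddev * np.ones(nb_actions))
        else:
            raise ValueError('unknown noise type "{}"'.format(spec))
    return action_noise, param_noise

action_noise, param_noise = parse_noise('normal_0.2', nb_actions=4)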
Esempio n. 29
0
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          normalize_observations,
          actor_lr,
          critic_lr,
          action_noise,
          gamma,
          nb_train_steps,
          nb_rollout_steps,
          batch_size,
          memory,
          tau=0.01):

    max_action = env.action_space.high
    agent = DDPG(
        memory,
        env.observation_space.shape[0],
        env.action_space.shape[0],
        gamma=gamma,
        tau=tau,
        normalize_observations=normalize_observations,
        batch_size=batch_size,
        action_noise=action_noise,
        actor_lr=actor_lr,
        critic_lr=critic_lr,
    )
    if USE_CUDA:
        agent.cuda()
    # Set up logging stuff only for a single worker.
    step = 0
    episode = 0
    episode_rewards_history = deque(maxlen=100)
    # Prepare everything.

    agent.reset()
    obs = env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_start_time = time.time()
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(
                    obs, apply_noise=True,
                    compute_Q=True)  # select an action from the policy
                assert action.shape == env.action_space.shape

                # Execute next action.
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)  # environment step
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
        logger.info('')
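
agent.update_target_net() in this loop performs the usual Polyak (soft) target update governed by tau. A minimal sketch of that update over plain NumPy parameter dictionaries:

import numpy as np

def soft_update(target_params, source_params, tau=0.01):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for name in target_params:
        target_params[name] = (1.0 - tau) * target_params[name] + tau * source_params[name]
    return target_params

# toy usage with one weight matrix per network
target = {'w': np.zeros((2, 2))}
source = {'w': np.ones((2, 2))}
soft_update(target, source, tau=0.01)   # target['w'] is now 0.01 everywhere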
Esempio n. 30
0
def train(env, nb_epochs, nb_epoch_cycles, normalize_observations, actor_lr, critic_lr, action_noise,
          gamma, nb_train_steps, nb_rollout_steps, batch_size, memory, tau=0.01):

    max_action = env.action_space.high
    agent = DDPG(memory, env.observation_space.shape[0], env.action_space.shape[0],
                 gamma=gamma, tau=tau,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 )
    if USE_CUDA:
        agent.cuda()
    # Set up logging stuff only for a single worker.
    step = 0
    episode = 0
    episode_rewards_history = deque(maxlen=100)
    # Prepare everything.

    agent.reset()
    obs = env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_start_time = time.time()
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
        logger.info('')
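
Both DDPG loops call env.step(max_action * action), mapping the policy's squashed output in [-1, 1] onto the symmetric action bounds. In isolation that rescaling (with clipping added here for safety) is simply:

import numpy as np

def scale_action(action, action_space_high):
    """Map an action in [-1, 1]^n onto a symmetric box [-high, high]^n."""
    action = np.clip(action, -1.0, 1.0)
    return action_space_high * action

scaled = scale_action(np.array([0.5, -1.0]), action_space_high=np.array([2.0, 0.1]))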
Esempio n. 31
0
def learn(
        env,
        model_path,
        data_path,
        policy_fn,
        *,
        horizon=150,  # timesteps per actor per update
        rolloutSize=50,
        clip_param=0.2,
        entcoeff=0.02,  # clipping parameter epsilon, entropy coeff
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=32,  # optimization hypers
        gamma=0.99,
        lam=0.95,  # advantage estimation
        max_iters=0,  # time constraint
        adam_epsilon=1e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        retrain=False):

    # Setup losses and policy
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain == True:
        print("Retraining the policy from saved path")
        time.sleep(2)
        U.load_state(model_path)
    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        if iters_so_far > 70:
            render = True
        else:
            render = False
        rollouts = sample_trajectory(pi,
                                     env,
                                     horizon=horizon,
                                     rolloutSize=rolloutSize,
                                     stochastic=True,
                                     render=render)
        # Save rollouts
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        pickle.dump(p, open(data_file_name, "wb"))

        add_vtarg_and_adv(rollouts, gamma, lam)

        ob, ac, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts[
            "adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
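
The clipped objective above is PPO's pessimistic surrogate L^CLIP. A NumPy sketch of the same loss, detached from the TensorFlow graph:

import numpy as np

def ppo_clip_loss(logp_new, logp_old, advantages, clip_param=0.2):
    """PPO's pessimistic clipped surrogate, to be minimized."""
    ratio = np.exp(logp_new - logp_old)            # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * advantages
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -np.mean(np.minimum(surr1, surr2))

loss = ppo_clip_loss(logp_new=np.array([-1.0, -0.5]),
                     logp_old=np.array([-1.1, -0.4]),
                     advantages=np.array([0.8, -0.3]))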
Esempio n. 32
0
            weights, batch_indxes = np.ones_like(rewards), None
            obses_t, obses_tp1 = tf.constant(obses_t), tf.constant(obses_tp1)
            actions, rewards, dones = tf.constant(
                actions,
                dtype=tf.int64), tf.constant(rewards), tf.constant(dones)
            weights = tf.constant(weights)

            td_errors = agent.train(obses_t, actions, rewards, obses_tp1,
                                    dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            agent.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        # TODO: log once per episode
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()

    plt.figure()
    plt.plot(range(len(duration)), duration)
    plt.figure()
    plt.plot(range(len(episode_rewards)), episode_rewards)
    plt.show()
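
mean_100ep_reward above averages the last 100 completed episodes while excluding the one still in progress. The same bookkeeping in isolation:

import numpy as np

def mean_100ep_reward(episode_rewards):
    """Average of the last 100 finished episodes, ignoring the episode in progress."""
    finished = episode_rewards[-101:-1]
    return round(float(np.mean(finished)), 1) if len(finished) > 0 else float('nan')

episode_rewards = [0.0] * 120          # one running total per episode, current episode last
print(mean_100ep_reward(episode_rewards))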