Example #1
def evaluate(net, save_domains=False, baseline=None):
    test_env = SubprocVecEnv([
        lambda: gym.make('SysAdmin-v0', save_domain=save_domains)
        for i in range(config.eval_batch)
    ],
                             in_series=(config.eval_batch // config.cpus),
                             context='fork')
    tqdm_val = tqdm(desc='Validating',
                    total=config.eval_problems,
                    unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_finished = 0.
        rewards = []
        steps = 0

        s = test_env.reset()

        while problems_finished < config.eval_problems:
            steps += 1

            if not baseline:
                a, v, pi, pi_full = net(s)
            else:
                a = random_action(s, baseline, config.multi)

            s, r, d, i = test_env.step(a)

            r_tot += np.sum(r)
            problems_finished += np.sum(d)
            rewards += [x['reward_total'] for x in itertools.compress(i, d)]

            tqdm_val.update(np.sum(d))

        r_avg_ps = r_tot / (steps * config.eval_batch)  # average reward per step
        r_avg_pp = r_tot / problems_finished  # average reward per problem

        net.train()

    if args.print_raw:
        rew_mean = np.mean(rewards)
        rew_ci95 = 1.96 * scipy.stats.sem(rewards)
        print(f"{rew_mean:.2f} ± {rew_ci95:.2f}")

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'rewards': rewards,
        'problems_finished': problems_finished,
    }

    return eval_log
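
All of the snippets in this listing share the same core pattern: build a SubprocVecEnv from a list of zero-argument environment factories, reset it once, then step it with a batch of actions and consume the batched (obs, reward, done, info) results. Below is a minimal, self-contained sketch of that pattern; it assumes the stock OpenAI baselines SubprocVecEnv (the examples here use a variant that also accepts in_series and context) and uses 'CartPole-v1' as a stand-in for the custom environments.

import gym
import numpy as np
from baselines.common.vec_env import SubprocVecEnv

def make_env(seed):
    # Zero-argument factory: each worker process builds its own env instance.
    def _thunk():
        env = gym.make('CartPole-v1')
        env.seed(seed)
        return env
    return _thunk

if __name__ == '__main__':
    num_envs = 4
    vec_env = SubprocVecEnv([make_env(i) for i in range(num_envs)])

    obs = vec_env.reset()  # stacked observations, shape [num_envs, *obs_shape]
    for _ in range(100):
        actions = np.array([vec_env.action_space.sample()
                            for _ in range(num_envs)])
        # step() returns batched arrays; finished sub-envs auto-reset.
        obs, rewards, dones, infos = vec_env.step(actions)
    vec_env.close()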
Example #2
def evaluate(net, split='valid', subset=None):
	test_env = SubprocVecEnv(
		[lambda: gym.make('Sokograph-v0', split=split, subset=subset) for i in range(config.eval_batch)],
		in_series=(config.eval_batch // config.cpus),
		context='fork')
	tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' steps')

	with torch.no_grad():
		net.eval()

		r_tot = 0.
		problems_solved = 0
		problems_finished = 0
		steps = 0

		s = test_env.reset()

		while problems_finished < config.eval_problems:
			steps += 1

			a, n, v, pi = net(s)
			actions = to_action(a, n, s, size=config.soko_size)

			s, r, d, i = test_env.step(actions)

			# print(r)
			r_tot += np.sum(r)
			problems_solved   += sum('all_boxes_on_target' in x and x['all_boxes_on_target'] == True for x in i)
			problems_finished += np.sum(d)

			tqdm_val.update()

		r_avg = r_tot / (steps * config.eval_batch) # average reward per step
		problems_solved_ps  = problems_solved / (steps * config.eval_batch)
		problems_solved_avg = problems_solved / problems_finished

		net.train()

	tqdm_val.close()
	test_env.close()

	return r_avg, problems_solved_ps, problems_solved_avg, problems_finished
Example #3
def evaluate(net, planner):
    test_env = SubprocVecEnv([
        lambda: gym.make('Boxworld-v0', plan=planner)
        for i in range(config.eval_batch)
    ],
                             in_series=(config.eval_batch // config.cpus),
                             context='fork')
    tqdm_val = tqdm(desc='Validating',
                    total=config.eval_problems,
                    unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_solved = 0.
        problems_finished = 0.
        problems_timeout = 0.
        steps = 0

        opt_all = []
        opt_solved = []

        s = test_env.reset()

        while problems_finished < config.eval_problems:
            steps += 1
            # for step in range(1e9):
            a, v, pi = net(s)
            s, r, d, i = test_env.step(a)

            # print(r)
            r_tot += np.sum(r)
            problems_solved += np.array(
                sum(x['d_true'] for x in i)
            )  # conversion to numpy for easier ZeroDivision handling (-> nan)
            problems_finished += np.sum(d)

            if planner is not None:
                # print([x['path_len'] / x['steps'] if x['d_true'] else 0. for x in i if x['done']])
                opt_all += [
                    x['path_len'] / x['steps'] if x['d_true'] else 0.
                    for x in i if x['done']
                ]
                opt_solved += [
                    x['path_len'] / x['steps'] for x in i if x['d_true']
                ]

            tqdm_val.update(np.sum(d))

        problems_solved_ps = problems_solved / (steps * config.eval_batch)
        problems_solved_avg = problems_solved / problems_finished

        r_avg_ps = r_tot / (steps * config.eval_batch)  # average reward per step
        r_avg_pp = r_tot / problems_finished  # average reward per problem

        opt_all_avg = np.mean(opt_all)
        opt_all_sem = scipy.stats.sem(opt_all)

        opt_solved_avg = np.mean(opt_solved)
        opt_solved_sem = scipy.stats.sem(opt_solved)

        avg_steps_to_solve = (steps * config.eval_batch) / problems_finished

        net.train()

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'problems_solved': problems_solved_avg,
        'problems_finished': problems_finished,
        'solved_per_step': problems_solved_ps,
        'steps_per_problem': avg_steps_to_solve,
        'optimality_all': opt_all_avg,
        'optimality_all_sem': opt_all_sem,
        'optimality_solved': opt_solved_avg,
        'optimality_solved_sem': opt_solved_sem,
    }

    return eval_log
Example #4
        in_series=(config.batch // config.cpus),
        context='fork')

    # job_name = f"{config.soko_size[0]}x{config.soko_size[1]}-{config.soko_boxes} mp-{config.mp_iterations} nn-{config.emb_size} b-{config.batch}"
    job_name = None
    wandb.init(project="rrl-boxworld", name=job_name, config=config)
    wandb.save("*.pt")

    wandb.watch(net, log='all')
    # print(net)

    tot_env_steps = 0
    tot_el_env_steps = 0

    tqdm_main = tqdm(desc='Training', unit=' steps')
    s = env.reset()

    for step in itertools.count(start=1):
        a, v, pi = net(s)
        s, r, d, i = env.step(a)
        # print(r, d)
        # print(s)

        s_true = [x['s_true'] for x in i]
        d_true = [x['d_true'] for x in i]

        n_stacks = [len(x['raw_state']) for x in i]  # for the entropy regularization

        # update network
        loss, loss_pi, loss_v, loss_h, entropy, norm = net.update(
Example #5
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    actor_critic = Policy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            elif current_obs.dim() == 3:
                current_obs *= masks.unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[:-1].view(-1,
                                                   actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    (observations_batch, states_batch, actions_batch,
                     return_batch, masks_batch, old_action_log_probs_batch,
                     adv_targ) = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            win = visdom_plot(total_num_steps, final_rewards.mean())
Example #6
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Maxime: commented this out because it very much changes the behavior
    # of the code for seemingly arbitrary reasons
    #if len(envs.observation_space.shape) == 1:
    #    envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    elif args.recurrent_policy:
        actor_critic = RecMLPPolicy(obs_numel, envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    # call function PPO.modelsize() for this to happen
    '''
	modelSize = 0
	for p in actor_critic.parameters():
		pSize = reduce(operator.mul, p.size(), 1)
		modelSize += pSize
	'''

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)

    if args.algo == 'a2c':
        Agent = A2C(actor_critic, rollouts, args.lr, args.eps,
                    args.num_processes, obs_shape, args.use_gae, args.gamma,
                    args.tau, args.recurrent_policy, args.num_mini_batch,
                    args.cuda, args.log_interval, args.vis, args.env_name,
                    args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param,
                    args.max_grad_norm, args.alpha, args.save_dir,
                    args.vis_interval, args.save_interval, num_updates,
                    action_shape, args.value_loss_coef)

    elif args.algo == 'ppo':
        Agent = PPO(actor_critic, rollouts, args.lr, args.eps,
                    args.num_processes, obs_shape, args.use_gae, args.gamma,
                    args.tau, args.recurrent_policy, args.num_mini_batch,
                    args.cuda, args.log_interval, args.vis, args.env_name,
                    args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param,
                    args.max_grad_norm, args.save_dir, args.vis_interval,
                    args.save_interval, num_updates, action_shape,
                    args.value_loss_coef)

    elif args.algo == 'acktr':
        Agent = ACKTR(actor_critic, rollouts, args.lr, args.eps,
                      args.num_processes, obs_shape, args.use_gae, args.gamma,
                      args.tau, args.recurrent_policy, args.num_mini_batch,
                      args.cuda, args.log_interval, args.vis, args.env_name,
                      args.log_dir, args.entropy_coef, args.num_stack,
                      args.num_steps, args.ppo_epoch, args.clip_param,
                      args.max_grad_norm, args.alpha, args.save_dir,
                      args.vis_interval, args.save_interval, num_updates,
                      action_shape, args.value_loss_coef)
    print(str(actor_critic))
    print('Total model size: %d' % Agent.modelsize())

    obs = envs.reset()
    Agent.update_current_obs(obs, envs)
    Agent.rollouts.observations[0].copy_(Agent.current_obs)

    # These variables are used to compute average rewards for all processes.
    Agent.train(envs)
Example #7
def train(params, model_name, save_interval=1000, eval_interval=200,
          record_episodes=True, restart=False):
    try:
        # Create test env
        print("[INFO] Creating test environment")
        test_env = gym.make(env_name)

        # Training parameters
        initial_lr = params["initial_lr"]
        discount_factor = params["discount_factor"]
        gae_lambda = params["gae_lambda"]
        ppo_epsilon = params["ppo_epsilon"]
        value_scale = params["value_scale"]
        entropy_scale = params["entropy_scale"]
        horizon = params["horizon"]
        num_epochs = params["num_epochs"]
        batch_size = params["batch_size"]
        num_envs = params["num_envs"]

        # Learning-rate schedule: decay by 0.85 every 10,000 steps
        def lr_scheduler(step_idx):
            return initial_lr * 0.85 ** (step_idx // 10000)

        # Environment constants
        frame_stack_size = 4
        input_shape = (84, 84, frame_stack_size)
        num_actions = test_env.action_space.shape[0]
        action_min = test_env.action_space.low
        action_max = test_env.action_space.high

        # Create model
        print("[INFO] Creating model")
        model = PPO(input_shape, num_actions, action_min, action_max,
                    epsilon=ppo_epsilon,
                    value_scale=value_scale, entropy_scale=entropy_scale,
                    model_name=model_name)

        print("[INFO] Creating environments")
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])

        initial_frames = envs.reset()
        envs.get_images()
        frame_stacks = [FrameStack(initial_frames[i], stack_size=frame_stack_size,
                                   preprocess_fn=preprocess_frame) for i in range(num_envs)]

        print("[INFO] Training loop")
        while True:
            # While there are running environments
            states, taken_actions, values, rewards, dones = [], [], [], [], []

            # Simulate game for some number of steps
            for _ in range(horizon):
                # Predict action and value given the current state
                # π(a_t | s_t; θ_old)
                states_t = [frame_stacks[i].get_state()
                            for i in range(num_envs)]
                actions_t, values_t = model.predict(states_t)

                # Sample action from a Gaussian distribution
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, _ = envs.step_wait()
                envs.get_images()  # render

                # Store state, action and reward
                # [T, N, 84, 84, 4]
                states.append(states_t)
                taken_actions.append(actions_t)              # [T, N, 3]
                values.append(np.squeeze(values_t, axis=-1))  # [T, N]
                rewards.append(rewards_t)                    # [T, N]
                dones.append(dones_t)                        # [T, N]

                # Get new state
                for i in range(num_envs):
                    # Reset environment's frame stack if done
                    if dones_t[i]:
                        for _ in range(frame_stack_size):
                            frame_stacks[i].add_frame(frames[i])
                    else:
                        frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [frame_stacks[i].get_state()
                           for i in range(num_envs)]
            last_values = np.squeeze(model.predict(
                states_last)[1], axis=-1)  # [N]

            advantages = compute_gae(
                rewards, values, last_values, dones, discount_factor, gae_lambda)
            advantages = (advantages - advantages.mean()) / \
                (advantages.std() + 1e-8)  # Move down one line?
            returns = advantages + values
            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))  # [T x N, 84, 84, 4]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))  # [T x N, 3]
            returns = returns.flatten()  # [T x N]
            advantages = advantages.flatten()  # [T x N]

            T = len(rewards)
            N = num_envs
            assert states.shape == (
                T * N, input_shape[0], input_shape[1], frame_stack_size)
            assert taken_actions.shape == (T * N, num_actions)
            assert returns.shape == (T * N,)
            assert advantages.shape == (T * N,)

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Evaluate model
                    if model.step_idx % eval_interval == 0:
                        print("[INFO] Running evaluation...")
                        avg_reward, value_error = evaluate(
                            model, test_env, discount_factor, frame_stack_size, make_video=True)
                        model.write_to_summary("eval_avg_reward", avg_reward)
                        model.write_to_summary("eval_value_error", value_error)

                    # Save model
                    if model.step_idx % save_interval == 0:
                        model.save()

                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])
    except KeyboardInterrupt:
        model.save()
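
Example #7 (and Example #8 below) relies on a FrameStack helper that is not included in the snippet. A plausible minimal implementation is sketched here, assuming preprocess_fn maps a raw frame to an (84, 84) array and that frames are stacked along the last axis to form the (84, 84, stack_size) states used above; the real helper may differ.

from collections import deque

import numpy as np

class FrameStack:
    """Keep the last `stack_size` preprocessed frames as a single state."""

    def __init__(self, initial_frame, stack_size=4, preprocess_fn=lambda f: f):
        self.preprocess_fn = preprocess_fn
        self.frames = deque(maxlen=stack_size)
        # Fill the stack with copies of the first frame.
        for _ in range(stack_size):
            self.frames.append(self.preprocess_fn(initial_frame))

    def add_frame(self, frame):
        self.frames.append(self.preprocess_fn(frame))

    def get_state(self):
        # e.g. four (84, 84) frames -> one (84, 84, 4) state
        return np.stack(self.frames, axis=-1)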
Example #8
def main():
    # Create test env
    print("Creating test environment")
    test_env = gym.make(env_name)

    # Training parameters
    lr_scheduler = Scheduler(initial_value=3e-4, interval=1000,
                             decay_factor=1)  #0.75)
    std_scheduler = Scheduler(initial_value=2.0,
                              interval=1000,
                              decay_factor=0.75)
    discount_factor = 0.99
    gae_lambda = 0.95
    ppo_epsilon = 0.2
    t_max = 10  #180
    num_epochs = 10
    batch_size = 40  #64
    save_interval = 500
    eval_interval = 100
    training = True

    # Environment constants
    frame_stack_size = 4
    input_shape = (84, 84, frame_stack_size)
    num_actions = 1  #envs.action_space.shape[0]
    action_min = np.array([-1.0])  #np.array([-1.0, 0.0, 0.0])
    action_max = np.array([1.0])  #np.array([ 1.0, 1.0, 1.0])

    # Create model
    print("Creating model")
    model_checkpoint = None  #"./models/CarRacing-v0/run2/episode0_step455000.ckpt"
    model = PPO(num_actions,
                input_shape,
                action_min,
                action_max,
                ppo_epsilon,
                value_scale=0.5,
                entropy_scale=0.0001,
                model_checkpoint=model_checkpoint,
                model_name="CarRacing-v0")

    if training:
        print("Creating environments")
        num_envs = 4
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])

        initial_frames = envs.reset()
        initial_frames = envs.get_images()
        frame_stacks = [
            FrameStack(initial_frames[i], preprocess_fn=preprocess_frame)
            for i in range(num_envs)
        ]

        print("Main loop")
        step = 0
        while training:
            # While there are running environments
            print("Training...")
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            learning_rate = np.maximum(lr_scheduler.get_value(), 1e-6)
            std = np.maximum(std_scheduler.get_value(), 0.2)

            # Simulate game for some number of steps
            for _ in range(t_max):
                # Predict action and value given the current state
                # π(a_t | s_t; θ_old)
                states_t = [
                    frame_stacks[i].get_state() for i in range(num_envs)
                ]
                actions_t, values_t = model.predict(states_t,
                                                    use_old_policy=True,
                                                    std=std)
                for i in range(num_envs):
                    actions_t[i] = 0 if actions_t[i] < 0 else 1
                actions_t = np.squeeze(actions_t.astype(np.int32), axis=-1)

                # Sample action from a Gaussian distribution
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, infos = envs.step_wait()
                frames = envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)  # [T, N, 84, 84, 1]
                taken_actions.append(actions_t)  # [T, N, 3]
                values.append(np.squeeze(values_t, axis=-1))  # [T, N]
                rewards.append(rewards_t)  # [T, N]
                dones.append(dones_t)  # [T, N]

                # Get new state
                for i in range(num_envs):
                    frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [
                frame_stacks[i].get_state() for i in range(num_envs)
            ]
            last_values = np.squeeze(model.predict(states_last)[-1],
                                     axis=-1)  # [N]

            # Compute returns
            returns = compute_returns(rewards, last_values, dones,
                                      discount_factor)

            # Compute advantages
            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)

            # Normalize advantages
            advantages = (advantages -
                          np.mean(advantages)) / np.std(advantages)

            # Flatten arrays
            states = np.array(states).reshape(
                (-1, *input_shape))  # [T x N, 84, 84, 1]
            taken_actions = np.array(taken_actions).reshape(
                (-1, num_actions))  # [T x N, 3]
            returns = returns.flatten()  # [T x N]
            advantages = advantages.flatten()  # [T X N]

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                # Sample mini-batch randomly and train
                mb_idx = np.random.choice(len(states),
                                          batch_size,
                                          replace=False)

                # Optimize network
                model.train(states[mb_idx],
                            taken_actions[mb_idx],
                            returns[mb_idx],
                            advantages[mb_idx],
                            learning_rate=learning_rate,
                            std=std)

            # Reset environment's frame stack if done
            for i, done in enumerate(dones_t):
                if done:
                    frame_stacks[i].add_frame(frames[i])

            # Save model
            step += 1
            if step % save_interval == 0:
                model.save()
            if step % eval_interval == 0:
                avg_reward = evaluate(model, test_env, 10)
                model.write_to_summary("eval_avg_reward", avg_reward)

    # Training complete, evaluate model
    avg_reward = evaluate(model, test_env, 10)
    print("Model achieved a final reward of:", avg_reward)
Example #9
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    test_envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        test_envs = SubprocVecEnv(test_envs)
    else:
        envs = DummyVecEnv(envs)
        test_envs = DummyVecEnv(test_envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.saved_encoder_model:
        obs_shape = (args.num_stack, args.latent_space_size)

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.resume_experiment:
        print("\n############## Loading saved model ##############\n")
        actor_critic, ob_rms = torch.load(
            os.path.join(save_path, args.env_name + args.save_tag + ".pt"))
        tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p"))

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    print(obs_shape)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes,
                                   obs_shape, envs.action_space,
                                   actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    current_obs_test = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, test=False):
        shape_dim0 = envs.observation_space.shape[0]
        if args.saved_encoder_model:
            shape_dim0 = 1
            obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs)))
            obs = obs.data.cpu().numpy()
        obs = torch.from_numpy(obs).float()
        if not test:
            if args.num_stack > 1:
                current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
            current_obs[:, -shape_dim0:] = obs
        else:
            if args.num_stack > 1:
                current_obs_test[:, :-shape_dim0] = current_obs_test[:, shape_dim0:]
            current_obs_test[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        current_obs_test = current_obs_test.cuda()
        rollouts.cuda()
        rollouts_test.cuda()

    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward within [0,1] for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            tr.episodes_done += args.num_processes - masks.sum()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        tr.iterations_done += 1

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    (observations_batch, states_batch, actions_batch,
                     return_batch, masks_batch, old_action_log_probs_batch,
                     adv_targ) = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + args.save_tag + ".pt"))

            total_test_reward_list = []
            step_test_list = []

            for _ in range(args.num_tests):
                test_obs = test_envs.reset()
                update_current_obs(test_obs, test=True)
                rollouts_test.observations[0].copy_(current_obs_test)
                step_test = 0
                total_test_reward = 0

                while step_test < args.num_steps_test:
                    value_test, action_test, action_log_prob_test, states_test = actor_critic.act(
                        Variable(rollouts_test.observations[step_test],
                                 volatile=True),
                        Variable(rollouts_test.states[step_test],
                                 volatile=True),
                        Variable(rollouts_test.masks[step_test],
                                 volatile=True))
                    cpu_actions_test = action_test.data.squeeze(
                        1).cpu().numpy()

                    # Observation, reward and next obs
                    obs_test, reward_test, done_test, info_test = test_envs.step(
                        cpu_actions_test)

                    # masks here don't really matter, but still
                    masks_test = torch.FloatTensor(
                        [[0.0] if done_test_ else [1.0]
                         for done_test_ in done_test])

                    # Maxime: clip the reward within [0,1] for more reliable training
                    # This code deals poorly with large reward values
                    reward_test = np.clip(reward_test, a_min=0,
                                          a_max=None) / 400

                    total_test_reward += reward_test[0]
                    reward_test = torch.from_numpy(
                        np.expand_dims(np.stack(reward_test), 1)).float()

                    update_current_obs(obs_test, test=True)
                    rollouts_test.insert(step_test, current_obs_test,
                                         states_test.data, action_test.data,
                                         action_log_prob_test.data,
                                         value_test.data, reward_test,
                                         masks_test)

                    step_test += 1

                    if done_test:
                        break

                #rollouts_test.reset() # Need to reinitialise with .cuda(); don't forget
                total_test_reward_list.append(total_test_reward)
                step_test_list.append(step_test)

            append_to(tr.test_reward, tr,
                      sum(total_test_reward_list) / args.num_tests)
            append_to(tr.test_episode_len, tr,
                      sum(step_test_list) / args.num_tests)

            logger.log_scalar_rl(
                "test_reward", tr.test_reward[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "test_episode_len", tr.test_episode_len[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])

            # Saving all the MyContainer variables
            tr.save(
                os.path.join(log_path, args.env_name + args.save_tag + ".p"))

        if j % args.log_interval == 0:
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            tr.global_steps_done = (j +
                                    1) * args.num_processes * args.num_steps

            print(
                "Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, tr.global_steps_done,
                        int(tr.global_steps_done / (end - start)), reward_avg,
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            append_to(tr.pg_loss, tr, action_loss.data[0])
            append_to(tr.val_loss, tr, value_loss.data[0])
            append_to(tr.entropy_loss, tr, dist_entropy.data[0])
            append_to(tr.train_reward_avg, tr, reward_avg)

            logger.log_scalar_rl(
                "train_pg_loss", tr.pg_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_val_loss", tr.val_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0])
                )
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass