Example #1
def train_policy():
    ppo_config = {
        "gamma": 0.9988,
        "n_steps": 200,
        "ent_coef": 0,
        "learning_rate": 0.001,
        "vf_coef": 0.99,
        "max_grad_norm": 0.1,
        "lam": 0.95,
        "nminibatches": 5,
        "noptepochs": 100,
        "cliprange": 0.2,
        "tensorboard_log": log_relative_path
    }
    os.makedirs(log_relative_path)
    policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[256, 128])
    env = SubprocVecEnv([_make_env(rank=i) for i in range(5)])
    model = PPO2(MlpPolicy,
                 env,
                 _init_setup_model=True,
                 policy_kwargs=policy_kwargs,
                 verbose=1,
                 **ppo_config)
    model.learn(total_timesteps=1000,
                tb_log_name="ppo2",
                reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'model'))
    env.env_method("save_world", log_relative_path)
    env.close()
    return
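The snippet relies on a module-level `_make_env` factory, a `log_relative_path`, and several imports that are not shown. A minimal sketch of what they might look like in a stable-baselines 2.x / TensorFlow 1.x setup; the environment id and log path are placeholders, not taken from the original project:

import os

import gym
import tensorflow as tf
from stable_baselines import PPO2
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv

log_relative_path = "./logs/ppo2_run"  # placeholder

def _make_env(rank, seed=0):
    # Return a thunk so SubprocVecEnv constructs the env inside a worker process.
    def _init():
        env = gym.make("CartPole-v1")  # placeholder environment id
        env.seed(seed + rank)          # distinct seed per worker
        return env
    set_global_seeds(seed)
    return _init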
Example #2
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
  test_env = SubprocVecEnv([make_env(env_id)])
  sharpe_ratios = []
  for episode in range(num_eps):
    # Padding zeros to the test env to match the shape of the training env.
    zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
    zero_completed_obs[0, :] = test_env.reset()
    state = None
    for _ in range(L):
      action, state = model.predict(zero_completed_obs, state=state, deterministic=True)
      zero_completed_obs[0, :], reward, done, _ = test_env.env_method('step', action[0], indices=0)[0]
    sharpe_ratios.append(test_env.env_method('get_sharpe_ratio', indices=0)[0])
    if plot: test_env.env_method('render', indices=0)
  test_env.close()
  
  # Return the average sharpe ratio
  return sum(sharpe_ratios) / len(sharpe_ratios)
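Module-level names this snippet assumes: `NUM_CPU` (the number of parallel envs the recurrent model was trained with), `L` (the evaluation episode length), and a `make_env` thunk factory. The zero-padded observation batch is needed because a recurrent stable-baselines policy expects `predict()` to receive one row per training env; only row 0 carries the real test observation. A sketch with placeholder values:

import gym
import numpy as np

NUM_CPU = 4  # placeholder: n_envs used during training
L = 252      # placeholder: steps per evaluation episode

def make_env(env_id, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed)
        return env
    return _init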
Example #3
if __name__ == "__main__":
    batch_size = 8
    num_envs = 8
    num_gpus = torch.cuda.device_count()

    def make_env(index):
        return lambda: gym.make(
            'MetaEnv-v0', device=torch.device('cuda', index=index % num_gpus))

    env = SubprocVecEnv([make_env(x) for x in range(num_envs)],
                        start_method='forkserver')

    # env.get_valid_actions = lambda: np.array([e.get_valid_actions() for e in env.envs])
    env.get_valid_actions = lambda: np.array(
        env.env_method('get_valid_actions'))

    model = algo.MaskedPPO(CustomLSTMPolicy,
                           env,
                           verbose=1,
                           n_steps=20,
                           nminibatches=batch_size,
                           tensorboard_log="../out/meta_opt/")

    model.learn(total_timesteps=100000, log_interval=10)
    model.save('meta_optimizer')

    obs = env.reset()
    state = None
    total_rewards = 0
    done = [False for _ in range(env.num_envs)]
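The excerpt stops right after preparing `obs`, `state`, and `done` for evaluation. A possible continuation, not part of the original snippet, assuming `algo.MaskedPPO` keeps the standard stable-baselines `predict(obs, state=..., mask=..., deterministic=...)` signature for its recurrent policy:

    # Roll the trained recurrent policy out for a fixed number of steps,
    # feeding the LSTM state and the done mask back into predict().
    for _ in range(1000):  # placeholder evaluation horizon
        action, state = model.predict(obs, state=state, mask=done, deterministic=True)
        obs, rewards, done, _ = env.step(action)
        total_rewards += rewards.sum()
    print("mean reward per env:", total_rewards / env.num_envs)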
Example #4
def evaluate_model_on_set(
    set_path,
    model,
    config_path=None,
    config_kw=None,
    metrics=("success", "control_variation", "rise_time", "overshoot",
             "settling_time"),
    norm_data_path=None,
    num_envs=1,
    turbulence_intensity="none",
    use_pid=False,
    writer=None,
    timestep=None,
):
    """
    :param set_path: (str) path to test set file
    :param model: (PPO2 object or [PIDController]) the controller to be evaluated
    :param config_path: (str) path to gym environment configuration file
    :param config_kw: (dict) dictionary of key value pairs to override settings in the configuration file of the gym environment
    :param metrics: ([str]) list of metrics to be computed and recorded
    :param norm_data_path: (str) path to folder containing normalization statistics
    :param num_envs: (int) number of gym environments to run in parallell using multiprocessing
    :param turbulence_intensity: (str) the intensity setting of the wind turbulence
    :param use_pid: (bool) Whether the evaluated controller is a PID controller or not
    :param writer: (tensorboard writer) If supplied, evaluation results will be written to tensorboard log, if not, results are printed to standard output
    :param timestep: (int) What timestep results are written to when using tensorboard logging
    :return: (dict) the metrics computed for the evaluated controller on the test set
    """
    scenarios = list(np.load(set_path, allow_pickle=True))
    scenario_count = len(scenarios)

    if config_kw is None:
        config_kw = {}

    config_kw.update({
        "steps_max": 1500,
        "target": {
            "on_success": "done",
            "success_streak_fraction": 1,
            "success_streak_req": 100,
            "states": {
                0: {
                    "bound": 5
                },
                1: {
                    "bound": 5
                },
                2: {
                    "bound": 2
                }
            },
        },
    })

    if use_pid:
        config_kw["action"] = {"scale_space": False}

    sim_config_kw = {
        "turbulence": turbulence_intensity != "None",
        "turbulence_intensity": turbulence_intensity,
    }

    test_env = SubprocVecEnv([
        make_env(config_path,
                 i,
                 config_kw=config_kw,
                 sim_config_kw=sim_config_kw) for i in range(num_envs)
    ])
    if use_pid:
        dt = test_env.get_attr("simulator")[0].dt
        for pid in model:
            pid.dt = dt
        env_cfg = test_env.get_attr("cfg")[0]
        obs_states = [var["name"] for var in env_cfg["observation"]["states"]]
        try:
            phi_i, theta_i, Va_i = (
                obs_states.index("roll"),
                obs_states.index("pitch"),
                obs_states.index("Va"),
            )
            omega_i = [
                obs_states.index("omega_p"),
                obs_states.index("omega_q"),
                obs_states.index("omega_r"),
            ]
        except ValueError:
            print(
                "When using PID roll, pitch, Va, omega_p, omega_q, omega_r must be part of the observation vector."
            )
    else:
        test_env = VecNormalize(test_env)
        if model.env is not None:
            test_env.obs_rms = model.env.obs_rms
            test_env.ret_rms = model.env.ret_rms
        else:
            assert norm_data_path is not None
            test_env.load_running_average(norm_data_path)
        test_env.training = False

    res = {metric: {} for metric in metrics}
    res["rewards"] = [[] for i in range(scenario_count)]
    active_envs = [i < scenario_count for i in range(num_envs)]
    env_scen_i = [i for i in range(num_envs)]
    test_done = False
    obs = np.array(
        [np.zeros(test_env.observation_space.shape) for i in range(num_envs)])
    done = [True for i in range(num_envs)]
    info = None

    while not test_done:
        for i, env_done in enumerate(done):
            if env_done:
                if len(scenarios) > 0 or active_envs[i]:
                    if len(scenarios) > 0:
                        print("{}/{} scenarios left".format(
                            len(scenarios), scenario_count))
                        scenario = scenarios.pop(0)
                        env_scen_i[i] = (scenario_count - 1) - len(scenarios)
                        obs[i] = test_env.env_method("reset",
                                                     indices=i,
                                                     **scenario)[0]
                        if use_pid:
                            model[i].reset()
                            model[i].set_reference(
                                scenario["target"]["roll"],
                                scenario["target"]["pitch"],
                                scenario["target"]["Va"],
                            )
                    else:
                        active_envs[i] = False
                    if info is not None:
                        for metric in metrics:
                            if isinstance(info[i][metric], dict):
                                for state, value in info[i][metric].items():
                                    if state not in res[metric]:
                                        res[metric][state] = []
                                    res[metric][state].append(value)
                            else:
                                if "all" not in res[metric]:
                                    res[metric]["all"] = []
                                res[metric]["all"].append(info[i][metric])

        if len(scenarios) == 0:
            test_done = not any(active_envs)
        if use_pid:
            actions = []
            for i, pid in enumerate(model):
                roll, pitch, Va = obs[i, phi_i], obs[i, theta_i], obs[i, Va_i]
                omega = obs[i, omega_i]
                if info is not None and "target" in info[i]:
                    pid.set_reference(
                        phi=info[i]["target"]["roll"],
                        theta=info[i]["target"]["pitch"],
                        va=info[i]["target"]["Va"],
                    )
                actions.append(pid.get_action(roll, pitch, Va, omega))
            actions = np.array(actions)
        else:
            actions, _ = model.predict(obs, deterministic=True)
        obs, rew, done, info = test_env.step(actions)
        for i, env_rew in enumerate(rew):
            res["rewards"][env_scen_i[i]].append(env_rew)

    if writer is not None:
        summaries = []
        for metric, metric_v in res.items():
            if isinstance(res[metric], dict):
                for state, v in res[metric].items():
                    summaries.append(
                        tf.Summary.Value(
                            tag="test_set/{}_{}".format(metric, state),
                            simple_value=np.nanmean(v),
                        ))
        writer.add_summary(tf.Summary(value=summaries), timestep)
    else:
        print_results(res)

    return res
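The `make_env` helper that builds each worker env above is not shown in the excerpt. A rough sketch of its likely shape, inferred only from the call site; the gym id is a placeholder and the real factory lives elsewhere in the project:

import gym

def make_env(config_path, rank, config_kw=None, sim_config_kw=None, seed=0):
    def _init():
        # "FixedWing-v0" is a placeholder id standing in for the project's env.
        env = gym.make("FixedWing-v0",
                       config_path=config_path,
                       config_kw=config_kw,
                       sim_config_kw=sim_config_kw)
        env.seed(seed + rank)
        return env
    return _init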
Example #5
def main():

    args = get_configuration()
    args.state_dim = util.get_state_dim(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)

    if args.graph_embedding:

        class MyPolicy(EmbeddingPolicy):
            def __init__(self,
                         sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         reuse=True,
                         **_kwargs):
                super().__init__(sess,
                                 ob_space,
                                 ac_space,
                                 n_env,
                                 n_steps,
                                 n_batch,
                                 args,
                                 reuse=reuse,
                                 **_kwargs)
    else:

        class MyPolicy(EnigmaPolicy):
            def __init__(self,
                         sess,
                         ob_space,
                         ac_space,
                         n_env,
                         n_steps,
                         n_batch,
                         reuse=True,
                         **_kwargs):
                super().__init__(sess,
                                 ob_space,
                                 ac_space,
                                 n_env,
                                 n_steps,
                                 n_batch,
                                 args,
                                 reuse=reuse,
                                 **_kwargs)

    t0 = time.time()

    from mpi4py import MPI as mpi
    comm = mpi.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_count = len(gpus)
    gpu = gpus[rank % gpu_count]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    print("My rank is {} out of {}, using GPU {}".format(rank, all, gpu))

    if args.model_type == "ppo2":
        from stable_baselines import PPO2 as PPO
        env = SubprocVecEnv([(lambda: ProofEnv.ProofEnv(args))
                             for _ in range(args.parallel_envs)
                             ])  #, start_method="spawn")
    elif args.model_type == "ppo1":
        args.parallel_envs = 1
        env = DummyVecEnv([lambda: ProofEnv.ProofEnv(args)])
        # from stable_baselines import PPO1 as PPO
        from ppo import PPO1 as PPO

    if args.saved_model is None:
        myPolicy = MyPolicy
        if args.model_type == "ppo2":
            model = PPO(
                policy=myPolicy,
                env=env,
                n_steps=args.actorbatch,
                # nminibatches=args.optim_stepsize,
                lam=0.95,
                gamma=args.gamma,
                noptepochs=4,
                ent_coef=args.entcoeff,
                learning_rate=lambda f: f * 2.5e-4,
                cliprange=lambda f: f * 0.1,
                verbose=1)
        elif args.model_type == "ppo1":
            model = PPO(myPolicy,
                        env,
                        verbose=2,
                        timesteps_per_actorbatch=args.actorbatch,
                        schedule=args.lr_schedule,
                        optim_stepsize=args.optim_stepsize,
                        entcoeff=args.entcoeff,
                        optim_batchsize=args.optim_batchsize,
                        gamma=args.gamma)
    else:
        print("Loading model from {}".format(args.saved_model))
        model = PPO.load(args.saved_model)
        model.set_env(env)

    counter = 0

    for ind in range(args.parallel_envs):
        env.env_method("set_model",
                       model,
                       indices=list(range(args.parallel_envs)))

    modelfiles = []
    for train_timestep, train_dir in zip(args.train_timesteps,
                                         args.train_dirs):
        problem_files = sorted(util.list_problems(train_dir))
        problem_files = util.split_list(problem_files, world_size)[rank]
        problem_files_splitted = util.split_list(problem_files,
                                                 args.parallel_envs,
                                                 extensible=False)

        if args.add_repeating_pretraining:
            for ind in range(args.parallel_envs):
                env.env_method("set_source",
                               problem_files_splitted[ind],
                               indices=[ind],
                               generator_type="repeating")
            # all_thread_timestep = train_timestep * all
            print("PRETRAINING")
            model.learn(total_timesteps=train_timestep)
            print("Pretraining on {} finished in {}".format(
                train_dir, util.format_time(time.time() - t0)))

        for ind in range(args.parallel_envs):
            env.env_method("set_source",
                           problem_files_splitted[ind],
                           indices=[ind])
        # all_thread_timestep = train_timestep * all
        model.learn(total_timesteps=train_timestep)

        modelfile = "{}/ppo1_fcop_train_{}".format(args.outdir, counter)
        modelfiles.append(modelfile)
        if rank == 0:
            model.save(modelfile)
            # logger.logkv("finished_train_problems", counter)
        counter += 1

        print("Training on {} finished in {}".format(
            train_dir, util.format_time(time.time() - t0)))
        statistics_list = env.get_attr("statistics",
                                       indices=list(range(args.parallel_envs)))
        blacklist_list = env.get_attr("blacklist",
                                      indices=list(range(args.parallel_envs)))
        for i, statistics in enumerate(statistics_list):
            print("ENV {} - {} - blacklist: {}\n".format(
                rank, i, blacklist_list[i])),
            util.print_problemdict(statistics, rank)

            # for f in statistics:
            #     statistics[f]["mcts"].display_tree([0])

        # util.print_problemdict(env.envs[0].statistics)

    if len(args.train_dirs) > 0 and len(
            args.train_timesteps) > 0:  # we did training
        print("We have finished training, rank {}".format(rank))

        # for p in problem_files:
        #     vis_policy.vis_policy(env.envs[0], model, p)

        env.close()
        del env
        del model

    # here we wait for everyone
    comm.Barrier()
    print("We have started evaluation, rank {}".format(rank))

    # evaluation without training
    if (args.saved_model is not None) and (len(
            args.train_dirs) == 0):  # no training, just evaluation
        modelfiles = [args.saved_model]

    for evaldir in args.evaldirs:
        for model_index, modelfile in enumerate(modelfiles):
            eval.eval_mpi(args, evaldir, modelfile, model_index)

            # here we wait for everyone
            comm.Barrier()
Example #6
                    logging_level=logging_level)

    return create_yumi


logging_level = logging.DEBUG if args.debug else logging.INFO
yumis = [make_env(args.render, i, seed=i) for i in range(n_cpu)]
env = SubprocVecEnv(yumis)

model = PPO2.load(args.model_path, env=env, policy=MlpPolicy)

n_episodes = 100 if real else 5000

states, actions, next_states, parameters, steps = [], [], [], [], []

horizon = env.env_method('get_horizon')[0]
n_steps = (horizon * n_episodes) // n_cpu

deterministic = False

obs = env.reset()
for ep in range(n_steps):
    states.extend(obs)
    action, _states = model.predict(obs, deterministic=deterministic)
    obs, rewards, done, info = env.step(action)
    actions.extend(action)
    next_states.extend(obs)
    dynamics = env.env_method('get_dynamics')
    parameters.extend(dynamics)
    steps.append(env.env_method('get_step'))
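The lists built in this rollout loop are never written out in the excerpt. One minimal way to persist them, with an assumed file name that is not from the original script:

# Not part of the original code: dump the collected transitions to disk.
import pickle

with open("yumi_rollouts.pkl", "wb") as f:
    pickle.dump({"states": states, "actions": actions, "next_states": next_states,
                 "parameters": parameters, "steps": steps}, f)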
Example #7
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object,
                                args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env,
                                rank=rank,
                                log_dir=log_dir,
                                flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env,
                        rank=0,
                        flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)
    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo",
                                        args,
                                        parsed_action_noise=None,
                                        eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy,
                     env,
                     verbose=1,
                     nminibatches=32,
                     lam=0.95,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=3e-4,
                     cliprange=0.2,
                     policy_kwargs=policy_kwargs,
                     **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir,
                                          'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps),
                    callback=callback,
                    seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))

    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)
        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61
                       and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([
                obs[0][key]
                for key in ['observation', 'achieved_goal', 'desired_goal']
            ])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim], 'goal',
              obs[0][-goal_dim:])
        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx) + ', goal idx ' +
                             str(np.argmax(env.get_attr('goal')[0][3:])))
                if 'FetchStack' in args.env:
                    tasks = ['pick and place', 'stack']
                    ax.set_title('episode ' + str(num_episode) + ', frame ' +
                                 str(frame_idx) + ', task: ' +
                                 tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 *
                                                        goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break
        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
Example #8
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    log_path = os.path.join(*__file__.split('/')[:-2], 'log',
                            'run_' + datetime.now().strftime('%m%d%H%M'))

    env_vec = SubprocVecEnv([make_env(i) for i in range(4)],
                            start_method='spawn')

    net_arch = [dict(pi=[512, 256], vf=[512, 256])]
    obs_norm_init = env_vec.env_method('obs_norm_params', indices=0)[0]
    act_norm_init = env_vec.env_method('act_norm_params', indices=0)[0]
    policy_kwargs = dict(act_fun=tf.nn.relu,
                         net_arch=net_arch,
                         obs_norm_init=obs_norm_init,
                         act_norm_init=act_norm_init)

    n_time_step = 10**5

    model = PPO2(NormalMlpPolicy,
                 env_vec,
                 gamma=0.95,
                 n_steps=8192,
                 nminibatches=4,
                 noptepochs=4,
                 learning_rate=5.0 * 10**(-4),
Example #9
    while True:

        # env.set_attr("keyboard_u", keyboard_u)

        env.render()
        action, _states = model.predict(obs, deterministic=True)
        action[0] = 0
        obs, rewards, dones, info = env.step(action)
        episode_reward += rewards[0]
        if dones[0]:
            performance[cnt, 0] = episode_reward
            episode_reward = 0

            performance[cnt, 1] = env.get_attr("record_count")[0]
            # print(env.get_attr("record_count"))
            performance[cnt, 2] = env.env_method("why_done")[0]
            # print(env.env_method("why_done"))
            if int(performance[cnt, 2]) != 0:
                performance[cnt, 1] = np.inf
            cnt += 1

            break
    print(performance)
    print(np.mean(performance[:, 0]),
          np.min(performance[:, 1]) * 0.1,
          np.max(performance[:, 1]) * 0.1,
          np.mean(performance[:, 1]) * 0.1,
          len(performance[performance[:, 2] == 0]),
          len(performance[performance[:, 2] == 1]),
          len(performance[performance[:, 2] == 2]))
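The excerpt assumes `performance`, `cnt`, `episode_reward`, and `obs` were initialized before the loop. A sketch of that setup, with placeholder sizes; the three `performance` columns hold the episode reward, the `record_count` attribute, and the `why_done` code, matching how they are filled above:

import numpy as np

n_eval_episodes = 100                         # placeholder episode count
performance = np.zeros((n_eval_episodes, 3))  # columns: reward, record_count, why_done
cnt = 0
episode_reward = 0
obs = env.reset()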
Example #10
def test(model_name, env_name, num_cpu, log_dir):
    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([
        make_env(env_id, i, log_dir, useMonitor=False) for i in range(num_cpu)
    ])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    model = get_model(model_name, env, log_dir)

    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    from matplotlib import pyplot as plt
    show_num = 1
    while True:
        action, _states = model.predict(obs)
        # obs, rewards, done, info = env.step([int(input('action:'))]*num_cpu)
        obs, rewards, done, info = env.step(action)
        img = obs[show_num, :, :, :]
        fig = plt.figure(0)
        plt.clf()
        plt.imshow(img / 255)
        fig.canvas.draw()

        if 'SelfAttention' in model_name and 'Box' in env_name and 'World' in env_name:
            agent_position = env.env_method(
                'get_current_agent_position')[show_num]
            print('agent_position', agent_position)
            attention = model.get_attention(obs, _states, done)[0]
            # head_0
            attention0 = attention[show_num][0][agent_position]

            left = [
                attention0[agent_position -
                           14] if agent_position - 14 >= 0 else 0
            ]
            right = [
                attention0[agent_position +
                           14] if agent_position + 14 < 196 else 0
            ]
            attention0_udlr = [
                left, right, attention0[agent_position - 1],
                attention0[agent_position + 1]
            ]
            # print('top :{} left:{}'.format(attention0[agent_position-14],attention0[agent_position-1]))
            attention0 = np.reshape(attention0, [14, 14])
            fig = plt.figure(2)
            plt.clf()
            plt.imshow(attention0, cmap='gray')
            fig.canvas.draw()

            # head_1
            attention1 = attention[show_num][1][agent_position]

            left = [
                attention1[agent_position -
                           14] if agent_position - 14 >= 0 else 0
            ]
            right = [
                attention1[agent_position +
                           14] if agent_position + 14 < 196 else 0
            ]
            attention1_udlr = [
                left, right, attention1[agent_position - 1],
                attention1[agent_position + 1]
            ]

            attention1 = np.reshape(attention1, [14, 14])
            fig = plt.figure(3)
            plt.clf()
            plt.imshow(attention1, cmap='gray')
            fig.canvas.draw()

            print(action[show_num], np.argmax(attention0_udlr),
                  np.argmax(attention1_udlr), "max attention:",
                  np.max(attention0_udlr), np.max(attention1_udlr))
        # env.render()
        plt.pause(0.000001)
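Context for the neighbour indexing above, not in the original code: each attention row has 14 × 14 = 196 entries, so on the flattened index ±14 moves one grid row (the `left`/`right` variables above actually hold the up/down neighbours) and ±1 moves one column. For example:

agent_position = 30                    # hypothetical flat index
row, col = divmod(agent_position, 14)  # -> row 2, column 2
up_idx, down_idx = agent_position - 14, agent_position + 14
left_idx, right_idx = agent_position - 1, agent_position + 1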
Example #11
def run_experiment(
    not_save=False,
    folder='experiments',
    weights_location=None,
    tag=None,
    env='Base',
    env_num=4,
    n=0,
    save_interval=10000,
    train_steps=int(1e6),
    description=None,
    weights=None,
    n_steps=200,
    gamma=0.99,
    max_steps=None,
):

    if weights is not None and not os.path.isfile(weights):
        raise ValueError("Weights do not exist")

    # Saving args
    args = deepcopy(locals())

    # Get env
    env = getattr(environments, env)

    # Generate environments
    if max_steps is not None:
        env = SubprocVecEnv(
            [lambda: env(max_steps=max_steps) for i in range(env_num)])
    else:
        env = SubprocVecEnv([lambda: env() for i in range(env_num)])

    args['env_config'] = str(env.env_method("get_org_config")[0])

    # Check if folder exists and if is a valid name
    if not not_save:
        id,logger,logs_folder,experiment_csv,experiment_folder = \
                create_experiment_folder(folder=folder,tag=tag,args=args)
    else:
        id = -1
        logs_folder = None
        logger = None
        experiment_folder = None

    if weights is not None:
        model = PPO2.load(
            weights,
            verbose=0,
            tensorboard_log=logs_folder,
            max_grad_norm=100,
            n_steps=n_steps,
            gamma=gamma,
            #policy_kwargs={'data_format':'NCHW'},
        )
        model.set_env(env)
    else:
        model = PPO2(
            CnnPolicy,
            env,
            verbose=0,
            tensorboard_log=logs_folder,
            max_grad_norm=100,
            n_steps=n_steps,
            #policy_kwargs={'data_format':'NCHW'},
        )

    # set bar
    callback = Callback(
        not_save=not_save,
        logger=logger,
        train_steps=train_steps,
        n=n,
        experiment_folder=experiment_folder,
        save_interval=save_interval,
        id=id,
    )

    # Start running experiment
    # Creating nice table
    _width = 40
    del args['env_config']
    max_k_width = max([len(k) for k in args])
    print("\n{}".format("#" * _width))
    print("# {1:^{0}} #".format(_width - 4, "RUNNING EXPERIMENT"))
    print("# {1:^{0}} #".format(_width - 4, ""))
    print("# {1:<{0}} #".format(
        _width - 4, "{0:{2}s}: {1:03d}".format("ID", id, max_k_width)))
    for k, v in args.items():
        if type(v) is int:
            print("# {1:<{0}} #".format(
                _width - 4, "{0:{2}s}: {1:0d}".format(k, v, max_k_width)))
        elif type(v) is float:
            print("# {1:<{0}} #".format(
                _width - 4, "{0:{2}s}: {1:0.3f}".format(k, v, max_k_width)))
        else:
            print("# {1:<{0}} #".format(
                _width - 4, "{0:{2}s}: {1:s}".format(k, str(v), max_k_width)))
    print("{}".format("#" * _width))
    del args

    print("\n############ STARTING TRAINING ###########\n")
    try:
        with tqdm.tqdm(total=train_steps, leave=True) as bar:
            callback.set_bars(bar)
            model.learn(
                total_timesteps=train_steps,
                callback=callback,
            )

        if not not_save:
            model.save(experiment_folder + "/weights_final")

    except KeyboardInterrupt:
        if not not_save and input(
                "Do you want to DELETE this experiment? (Yes/n) ") == "Yes":
            remove_experiment(experiment_folder, folder, experiment_csv, id)
        else:
            if not not_save:
                model.save(experiment_folder + "/weights_final")
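A hypothetical invocation of `run_experiment`, with argument values chosen for illustration only (they mirror the defaults in the signature above):

if __name__ == "__main__":
    run_experiment(
        env="Base",
        env_num=4,
        train_steps=int(1e6),
        n_steps=200,
        gamma=0.99,
        tag="baseline",  # hypothetical experiment tag
    )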