Example #1
def learn(env_name, save_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()
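None of the examples on this page include their imports. A minimal sketch of what Example #1 assumes (the environment id and save-file name are illustrative, not from the original):
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.ppo import CnnPolicy

# Train a PPO agent with an image-based policy and save it to disk.
learn("PongNoFrameskip-v4", "ppo_pong", total_timesteps=100_000)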
Example #2
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # after done the env resets, so only record distance while the episode is running
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
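Example #2 expects a model file and a VecNormalize statistics file produced during training. A minimal sketch of that training side, assuming the original project saved both artifacts (environment id and file names are illustrative):
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
model = PPO("MlpPolicy", train_env, verbose=0)
model.learn(total_timesteps=10_000)
model.save("model.zip")             # later loaded with PPO.load(model_filename)
train_env.save("vecnormalize.pkl")  # later loaded with VecNormalize.load(vec_filename, eval_env)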
Example #3
def main(config, agent):
    with open(config) as fp:
        json_data = json.load(fp)

    video_path = os.path.join("./videos", agent, "pong")
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "best_model.zip"
    # config.agents_config[args.agent]["save_path"] = "my_models/pong/pong_ppo/best_model.zip"
    print(config.agents_config[args.agent]["save_path"])
    # env = retro.make(config.game_name)
    env = gym.make("PongNoFrameskip-v4")

    agent = AgentLoader.get_agent(agent,
                                  config.agents_config,
                                  env,
                                  load=True)
    env.close()
    env = gym.make("PongNoFrameskip-v4")
    env = DummyVecEnv([lambda: env])
    # env = retro.make(config.game_name, record=video_path)
    env = VecVideoRecorder(
        env,
        video_path,
        record_video_trigger=lambda x: x == 0,
    )

    obs = env.reset()
    done = False
    while not done:
        actions, _ = agent.agent.predict(obs)
        obs, rew, done, info = env.step(actions)

    env.close()
Example #4
def main_vs_5(config: str):

    with open(config) as fp:
        json_data = json.load(fp)

    config = GameConfig.deserialize(json_data)
    config.agents_config["A2C"]["save_path"] += "_vs_5"
    config.agents_config["A2C"]["tensorboard"] += "_vs_5"

    env = DummyVecEnv(
        [lambda: retro.make(config.game_name, state=config.train_states[0])])
    agent = AgentLoader.get_agent("A2C", config.agents_config, env)
    env.close()

    start_time = time.time()
    for st in tqdm(config.train_states, desc='Main Loop'):
        print(st)
        env = DummyVecEnv([
            lambda: retro.make(config.game_name, state=st, scenario='scenario')
        ])
        agent.agent.set_env(env)
        agent.agent.learn(total_timesteps=10000)
        agent.save()
        env.close()
    end_time = time.time() - start_time
    print(f'\n The Training Took {end_time} seconds')
Example #5
def record_video(env_id,
                 model,
                 video_length=500,
                 prefix='',
                 video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
        """
    eval_env = DummyVecEnv(
        [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                 eval_env)

    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
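When training-time normalization statistics are reused for evaluation, other examples on this page also freeze them and disable reward normalization; a short sketch of those two flags applied to the eval_env above:
eval_env.training = False     # do not update the running mean/std during evaluation
eval_env.norm_reward = False  # report raw (unnormalized) rewards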
Example #6
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # A plain (non-vectorized) environment, used for rendering and evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #7
def record_video_example():
    # Record a Video.

    env_id = "CartPole-v1"
    video_folder = "logs/videos/"
    video_length = 100

    env = DummyVecEnv([lambda: gym.make(env_id)])

    obs = env.reset()

    # Record the video starting at the first step.
    env = VecVideoRecorder(env,
                           video_folder,
                           record_video_trigger=lambda x: x == 0,
                           video_length=video_length,
                           name_prefix=f"random-agent-{env_id}")

    env.reset()
    for _ in range(video_length + 1):
        action = [env.action_space.sample()]
        obs, _, _, _ = env.step(action)

    # Save the video.
    env.close()
Example #8
def record_video(env_id,
                 model,
                 video_length=500,
                 prefix='',
                 video_folder='videos/'):
    """
  :param env_id: (str)
  :param model: (RL model)
  :param video_length: (int)
  :param prefix: (str)
  :param video_folder: (str)
  """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
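A hypothetical call to the helper above, assuming a PPO model already trained on the same environment id (the file name is illustrative):
model = PPO.load("ppo_cartpole.zip")
record_video("CartPole-v1", model, video_length=500, prefix="ppo-cartpole")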
Example #9
def eval_100_trials(args):
    with open(args.config) as fp:
        json_data = json.load(fp)

    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "_vs_time_pt.zip"
    env = DummyVecEnv(
        [lambda: retro.make(config.game_name, state=config.eval_state[1])])
    agent = AgentLoader.get_agent(args.agent,
                                  config.agents_config,
                                  env,
                                  load=True)

    rew_list = []
    trials = 100
    for i in tqdm(range(trials)):
        obs = env.reset()
        done = False
        reward = 0
        while not done:
            actions, _ = agent.agent.predict(obs)
            obs, rew, done, info = env.step(actions)
            reward += rew

        rew_list.append(reward)

    env.close()
    count = sum(i > 0 for i in rew_list)

    print("win percentage = {}%".format(count / trials * 100))
Example #10
def eval_time(args):
    with open(args.config) as fp:
        json_data = json.load(fp)

    video_path = os.path.join("./videos", args.agent)
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "_vs_time_pt_check.zip"
    env = DummyVecEnv(
        [lambda: retro.make(config.game_name, state=config.eval_state[1])])
    agent = AgentLoader.get_agent(args.agent,
                                  config.agents_config,
                                  env,
                                  load=True)
    env.close()
    env = DummyVecEnv([
        lambda: retro.make(
            config.game_name, state=config.eval_state[1], record=video_path)
    ])
    obs = env.reset()
    done = False

    while not done:
        actions, _ = agent.agent.predict(obs)
        obs, rew, done, info = env.step(actions)
    #   env.render()

    env.close()
Example #11
def run_experiment(args):
    # Again could have used the SB3 tools here, buuuut...
    vecEnv = []
    for i in range(args.n_envs):
        # Bit of trickery here so each lambda captures its own "i"
        # rather than the late-bound loop variable
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    vecEnv = DummyVecEnv(vecEnv)

    constraint = AVAILABLE_CONSTRAINTS[args.constraint]
    agent = None
    if constraint == "ClipPPO":
        # Create a vanilla PPO
        agent = PPO("MlpPolicy",
                    vecEnv,
                    verbose=2,
                    device="cpu",
                    n_steps=args.n_steps,
                    clip_range=args.clip_range,
                    learning_rate=args.learning_rate,
                    gamma=args.gamma,
                    ent_coef=args.ent_coef,
                    gae_lambda=1.0,
                    n_epochs=args.n_epochs)
    else:
        constraint = constraint(args)

        agent = SmallStepPPO("MlpPolicy",
                             vecEnv,
                             verbose=2,
                             device="cpu",
                             n_steps=args.n_steps,
                             step_constraint=constraint,
                             learning_rate=args.learning_rate,
                             step_constraint_max_updates=args.max_updates,
                             gamma=args.gamma,
                             ent_coef=args.ent_coef,
                             gae_lambda=1.0)

    output_log_file = None
    if args.output_log:
        output_log_file = open(args.output_log, "w")
        logger.Logger.CURRENT = logger.Logger(
            folder=None,
            output_formats=[logger.HumanOutputFormat(output_log_file)])

    agent.learn(total_timesteps=args.total_timesteps)

    if args.output is not None:
        agent.save(os.path.join(args.output, AGENT_FILE))

    vecEnv.close()
    if output_log_file:
        output_log_file.close()
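The nested-lambda construction above works around Python's late binding of loop variables; an equivalent sketch using a small helper (the helper name is hypothetical, not part of the example):
def make_env_fn(idx):
    # Each call returns a closure bound to its own idx value.
    return lambda: create_env(args, idx)

vecEnv = DummyVecEnv([make_env_fn(i) for i in range(args.n_envs)])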
Example #12
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO.load(load_file, verbose=1)
    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render() # dummy
        if done:
            print(info[0]['episode'])
    del model
    env.close()
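The info[0]['episode'] entry above is only populated when the underlying environment is wrapped in Monitor; a minimal sketch of that wrapping for the same play() setup:
from stable_baselines3.common.monitor import Monitor

# Monitor records episode return/length and exposes them via info["episode"] when an episode ends.
env = DummyVecEnv([lambda: Monitor(gym.make(env_name))])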
Example #13
def test(test_n, seed, model_filename, vec_filename, train, test, test_as_class=0, render=False, save_file="default.yml"):

    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f" Seed {seed+i}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test, body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed+i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:  # after done the env resets, so only record distance while the episode is running
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, total_reward {total_reward}, distance_x {distance_x}")

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # cast numpy float64 to plain float so yaml does not serialize numpy objects
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
Example #14
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a Vec Video Recorder 
    local_eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)
    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                              record_video_trigger=lambda step: step == 0, video_length=videoLength,
                              name_prefix=prefix)
    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()
Example #15
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # A plain (non-vectorized) environment, used for rendering and evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])
    #
    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #16
def main_vs_time(config: str):
    with open(config) as fp:
        json_data = json.load(fp)

    config = GameConfig.deserialize(json_data)
    config.agents_config["A2C"]["save_path"] += "_vs_time_pt"
    config.agents_config["A2C"]["tensorboard"] += "_vs_time"

    env = DummyVecEnv(
        [lambda: (retro.make(config.game_name, state=config.eval_state[0]))])
    agent = AgentLoader.get_agent("A2C", config.agents_config, env)
    start_time = time.time()
    with ProgressBarManager_new(40000) as callback:
        agent.agent.learn(total_timesteps=40000, callback=callback)
        agent.save()
        env.close()

    end_time = time.time() - start_time
    print(f'\n The Training Took {end_time} seconds')
Example #17
def main_vs_time(config: str):
    with open(config) as fp:
        json_data = json.load(fp)

    config = GameConfig.deserialize(json_data)
    config.agents_config["PPO"]["save_path"] += "_vs_time_pt_check"
    config.agents_config["PPO"]["tensorboard"] += "_vs_time_check"

    env = DummyVecEnv(
        [lambda: (retro.make(config.game_name, state=config.eval_state[0]))])
    agent = AgentLoader.get_agent("PPO", config.agents_config, env)
    env.close()
    env = DummyVecEnv(
        [lambda: (retro.make(config.game_name, state=config.eval_state[0]))])
    agent.agent.set_env(env)

    with ProgressBarManager_new(1000) as callback:
        agent.agent.learn(1000, callback=callback)
        agent.save()
        env.close()
Example #18
def test(seed,
         model,
         train,
         test,
         normalize_kwargs,
         body_info=0,
         render=False):
    print("Testing:")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = DummyVecEnv([
        utils.make_env(rank=0,
                       seed=utils.seed + 1,
                       render=False,
                       robot_body=test,
                       body_info=0)
    ])
    eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
    eval_env.seed(seed)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # after done the env resets, so only record distance while the episode is running
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(
        f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}"
    )
    return total_reward, distance_x
Example #19
def test_current_exp(args):
    if args.save_img:
        all_folders = glob.glob(os.path.join(img_path,"*"))
        all_folders = [os.path.basename(x) for x in all_folders]
        all_folders = [int(x) if x.isnumeric() else -1 for x in all_folders] + [0]
        current_folder = max(all_folders) + 1
        current_folder = os.path.join(img_path, str(current_folder))
        os.makedirs(current_folder, exist_ok=True)
        print(f"Writing into {current_folder}")
        input("Press Enter...")

    env = DummyVecEnv([make_env(env_id=args.env_id, rank=0, seed=0, render=True)])
    env = VecNormalize.load(args.vnorm_filename, env)
    model = CustomizedPPO.load(args.model_filename, env=env)
    callback = AdjustCameraCallback()
    
    obs = env.reset()
    callback.reset_lights(env.envs[0].env._p) # once window is opened, change the lighting

    if args.save_img:
        time.sleep(1) # please use this time to maximize the window, so that the image recorded will be full size

    with model.policy.features_extractor.start_testing():
        while True:
            for i in range(1000):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                callback.camera_simpy_follow_robot(target_env=env.envs[0])
                if args.save_img:
                    callback.write_a_image(current_folder=current_folder, step=i, target_env=env.envs[0])
                    if obs.shape[1]>100: # With Vision I guess
                        image = np.rollaxis(obs[:, -3*8*8:].reshape([3,8,8]), 0, start=3) * 255.0
                        print(image.shape)
                        # image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                        cv2.imwrite(f"{current_folder}/vision_{i:05}.png", image)
                if done:
                    break
                time.sleep(0.01)
            break
        time.sleep(0.1)
    env.close()
Example #20
def collect_rollouts(env, experiment_path):
    """
    Collect rollouts for given experiment.

    Based on the code here
    https://github.com/HumanCompatibleAI/imitation/blob/master/src/imitation/scripts/expert_demos.py
    """
    rollout_file = os.path.join(experiment_path, ROLLOUTS_FILE)
    if os.path.isfile(rollout_file):
        return

    sample_until = rollout.make_sample_until(MAX_ROLLOUT_TIMESTEPS,
                                             MAX_ROLLOUT_EPISODES)

    venv = DummyVecEnv([lambda: create_env(env) for i in range(NUM_ENVS)])

    agent_path = os.path.join(experiment_path, AGENT_FILE)
    agent = PPO.load(agent_path)

    rollout.rollout_and_save(rollout_file, agent, venv, sample_until)

    venv.close()
Example #21
        g_obs_data = np.zeros(shape=[args.test_steps, obs.shape[1]],
                              dtype=np.float32)

        distance_x = 0
        total_reward = 0
        step = 0
        for step in tqdm(range(args.test_steps)):
            g_obs_data[step, :] = obs[0]
            # for i in obs[0]:
            #     print(f"{i:.02f}", end=", ")
            # print("")
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_venv.step(action)
            if args.render:
                # eval_venv.envs[0].camera_adjust()
                time.sleep(0.015)
            if done:
                # it should not matter if the env reset. I guess...
                break
                # pass
            else:  # after done the env resets, so only record distance while the episode is running
                distance_x = eval_venv.envs[0].robot.body_xyz[0]
            total_reward += reward[0]

        eval_venv.close()
        print(f"model filename: {args.model_filename}")
        print(f"test on {test_body}")
        print(
            f"Results: last step {step}, total_reward {total_reward}, distance_x {distance_x}"
        )
        print("\n" * 4)
Example #22
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
            optimizer.step()

        if args.kle_stop:
            if approx_kl > args.target_kl:
                break
        if args.kle_rollback:
            if (b_logprobs[minibatch_ind] -
                    agent.get_action(b_obs[minibatch_ind],
                                     b_actions.long()[minibatch_ind])[1]
                ).mean() > args.target_kl:
                agent.load_state_dict(target_agent.state_dict())
                break

    # TRY NOT TO MODIFY: record rewards for plotting purposes
    writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]['lr'],
                      global_step)
    writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
    writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
    writer.add_scalar("losses/entropy", entropy.mean().item(), global_step)
    writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
    if args.kle_stop or args.kle_rollback:
        writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step)

    print("SPS:", int(global_step / (time.time() - start_time)))
envs.close()
writer.close()
Example #23
                lambda game_path=args.game_path, b=args.base_port+(i*2), c="gamelog-"+str(i)+".txt", d=args.level_path, e=args.image_based, f=args.env_p: 
                        TankEnv(game_path,
                                game_port=b,
                                game_log_path=c,
                                level_path=d,
                                image_based=e,
                                p=f
                        )
            )
        env_stack = DummyVecEnv(envs)
    else:
        env_stack = TankEnv(args.game_path, game_port=args.base_port, game_log_path="gamelog.txt", level_path=args.level_path, image_based=args.image_based, p=args.env_p)

    try:
        population = []
        for i in range(args.start):
            agent_name, agent = gen_agent(env_stack, args.num_envs, args.model_dir, args.noun_file_path, args.adj_file_path, 
                batch_size=args.batch_size, image_based=args.image_based, image_pretrain=args.image_pretrain, env_p=args.env_p)
            population.append(agent_name)
            if args.nem:
                population.append(gen_nemesis(agent_name, agent, env_stack, args.num_envs, args.model_dir, image_based=args.image_based, image_pretrain=args.image_pretrain, env_p=args.env_p))
            if args.surv:
                population.append(gen_survivor(agent_name, agent, env_stack, args.num_envs, args.model_dir, image_based=args.image_based, image_pretrain=args.image_pretrain, env_p=args.env_p))
        if args.start:
            with open(args.model_dir + "/population.txt", 'w') as pop_file:
                for p in population:
                    pop_file.write(p + '\n')
    finally:
        env_stack.close()
    
    print("PBT Preamble complete", flush=True)
Example #24
def test_framestack_vecenv():
    """Test that framestack environment stacks on desired axis"""

    image_space_shape = [12, 8, 3]
    zero_acts = np.zeros([N_ENVS] + image_space_shape)

    transposed_image_space_shape = image_space_shape[::-1]
    transposed_zero_acts = np.zeros([N_ENVS] + transposed_image_space_shape)

    def make_image_env():
        return CustomGymEnv(
            gym.spaces.Box(
                low=np.zeros(image_space_shape),
                high=np.ones(image_space_shape) * 255,
                dtype=np.uint8,
            ))

    def make_transposed_image_env():
        return CustomGymEnv(
            gym.spaces.Box(
                low=np.zeros(transposed_image_space_shape),
                high=np.ones(transposed_image_space_shape) * 255,
                dtype=np.uint8,
            ))

    def make_non_image_env():
        return CustomGymEnv(
            gym.spaces.Box(low=np.zeros((2, )), high=np.ones((2, ))))

    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()

    # Should be stacked on the last dimension
    assert obs.shape[-1] == (image_space_shape[-1] * 2)

    # Try automatic stacking on first dimension now
    vec_env = DummyVecEnv([make_transposed_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
    obs, _, _, _ = vec_env.step(transposed_zero_acts)
    vec_env.close()

    # Should be stacked on the first dimension (note the transposing in make_transposed_image_env)
    assert obs.shape[1] == (image_space_shape[-1] * 2)

    # Try forcing dimensions
    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="last")
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()

    # Should be stacked on the last dimension
    assert obs.shape[-1] == (image_space_shape[-1] * 2)

    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="first")
    obs, _, _, _ = vec_env.step(zero_acts)
    vec_env.close()

    # Should be stacked on the first dimension
    assert obs.shape[1] == (image_space_shape[0] * 2)

    # Test invalid channels_order
    vec_env = DummyVecEnv([make_image_env for _ in range(N_ENVS)])
    with pytest.raises(AssertionError):
        vec_env = VecFrameStack(vec_env, n_stack=2, channels_order="not_valid")

    # Test that it works with non-image envs when no channels_order is given
    vec_env = DummyVecEnv([make_non_image_env for _ in range(N_ENVS)])
    vec_env = VecFrameStack(vec_env, n_stack=2)
Example #25
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))

    os.makedirs(_log_dir, exist_ok=True)

    DoTraining = True
    StartFresh = True
    num_cpu = 8
    if (DoTraining):

        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir) # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)

        if (StartFresh):
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize.load(_stats_path, env)
            env.reset()

            model = PPO.load(
                r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))

        mean_reward, std_reward = evaluate_policy(model,
                                                  eval_env,
                                                  n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
        for _ in range(50):
            model.learn(total_timesteps=100000,
                        tb_log_name=env_id,
                        reset_num_timesteps=False)  #, callback=callback
            mean_reward, std_reward = evaluate_policy(model,
                                                      eval_env,
                                                      n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) +
                       '.mdl')
            env.save(_log_dir +
                     'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if (not DoTraining):
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()

        eval_env = DummyVecEnv(
            [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                     eval_env)

        model = PPO.load(
            r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
            tensorboard_log=tb_log)
        model.set_env(eval_env)
        # record_video(env_id, model, video_length=500, prefix='ppo_'+env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='')

        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
Example #26
def run_experiment(args):
    n_envs = None
    n_timesteps = None
    policy = "MlpPolicy"
    hyperparams = {}
    # Super-pretty manual hardcoding of the parameters
    # right here, but we only are going to run one type.
    if args.env == "BipedalWalkerHardcore-v3":
        # Adapted from
        # https://github.com/araffin/rl-baselines-zoo/blob/master/hyperparams/ppo2.yml
        n_envs = 16
        n_timesteps = int(10e7)
        policy = "MlpPolicy"
        hyperparams = {
            "n_steps": 2048,
            "gae_lambda": 0.95,
            "gamma": 0.99,
            "n_epochs": 10,
            "ent_coef": 0.001,
            "clip_range": 0.2,
            "clip_range_vf": 0.2,
            "learning_rate": 2.5e-4,
            "batch_size": (2048 * 16) // 32
        }
    else:
        # LunarLander-v2
        n_envs = 16
        n_timesteps = int(1e6)
        policy = "MlpPolicy"
        hyperparams = {
            "n_steps": 1024,
            "gae_lambda": 0.98,
            "gamma": 0.999,
            "n_epochs": 4,
            "ent_coef": 0.01,
            "clip_range": 0.2,
            "clip_range_vf": 0.2,
            "batch_size": (1024 * 16) // 32
        }

    vecEnv = []
    for i in range(n_envs):
        # Bit of trickery here so each lambda captures its own "i"
        # rather than the late-bound loop variable
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    vecEnv = DummyVecEnv(vecEnv)

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, device="cpu", **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)

    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)
    agent.save(os.path.join(args.output, AGENT_FILE))

    vecEnv.close()
Example #27
    else:
        body = args.body_id
        print(body)
        env = DummyVecEnv([utils.make_env(rank=i, seed=utils.seed, render=args.render, robot_body=body, body_info=0) for i in range(train_num_envs)])
        save_filename = f"model-ant-single-{body}"

    env = VecNormalize(env, **normalize_kwargs)

    keys_remove = ["normalize", "n_envs", "n_timesteps", "policy"]
    for key in keys_remove:
        del hyperparams[key]

    eval_env = DummyVecEnv([utils.make_env(rank=0, seed=utils.seed+1, render=False, robot_body=2, body_info=0)])
    eval_env = VecNormalize(eval_env, norm_reward=False, **normalize_kwargs)
    eval_callback = EvalCallback(
        eval_env=eval_env,
        n_eval_episodes=3,
        eval_freq=1e4,  # will implicitly be multiplied by 16 (train_num_envs)
        deterministic=True,
    )
    # eval_callback = None

    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=f"{folder}/tb/{save_filename}", seed=utils.seed, **hyperparams)

    model.learn(total_timesteps=total_timesteps, callback=eval_callback)
    model.save(f"{folder}/{save_filename}")
    # Important: save the running average, for testing the agent we need that normalization
    model.get_vec_normalize_env().save(f"{folder}/{save_filename}-vecnormalize.pkl")

    env.close()
Example #28
                verbose=1,
                tensorboard_log=str(common.output_data_folder /
                                    f"tensorboard" / saved_model_filename),
                seed=common.seed,
                **hyperparams)

    if len(args.initialize_weights_from) > 0:
        try:
            load_model = PPO.load(args.initialize_weights_from)
            load_weights = load_model.policy.state_dict()
            model.policy.load_state_dict(load_weights)
            print(f"Weights loaded from {args.initialize_weights_from}")
        except Exception:
            print("Initialize weights error.")
            raise Exception

    try:
        model.learn(total_timesteps=args.train_steps, callback=all_callbacks)
    except KeyboardInterrupt:
        pass
    model.save(str(common.output_data_folder / "models" /
                   saved_model_filename))

    if args.vec_normalize:
        # Important: save the running average, for testing the agent we need that normalization
        model.get_vec_normalize_env().save(
            str(common.output_data_folder / "models" /
                f"{saved_model_filename}.vnorm.pkl"))

    venv.close()
Example #29
def test(seed, model_filename, vec_filename, train, test, test_as_class=0, render=False, save_file="default.yml"):
    global g_step, g_obs_data
    print("Testing:")
    total_rewards = []
    distance_xs = []
    if True:
        os.makedirs(f"{folder}/obs_data_videos", exist_ok=True)
        g_step = 0

        print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        if test_as_class>=0:
            bodyinfo = test_as_class
        else:
            if args.with_bodyinfo:
                bodyinfo = test//100
            else:
                bodyinfo = 0
        # default_wrapper = wrapper.BodyinfoWrapper
        # if args.disable_wrapper:
        #     default_wrapper = None
        default_wrapper = wrapper.WalkerWrapper

        eval_env = utils.make_env(template=utils.template(test), render=render, robot_body=test, wrapper=default_wrapper, body_info=bodyinfo)
        eval_env = DummyVecEnv([eval_env])
        if args.vec_normalize:
            eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        g_obs_data = np.zeros(shape=[args.test_steps, obs.shape[1]], dtype=np.float32)

        if render:
            # eval_env.env_method("set_view")
            import common.linux
            common.linux.fullscreen()
            print("\n\nWait for a while, so I have the time to press Ctrl+F11 to enter FullScreen Mode.\n\n")
            time.sleep(2) # Wait for a while, so I have the time to press Ctrl+F11 to enter FullScreen Mode.
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in tqdm(range(args.test_steps)):
            g_obs_data[step,:] = obs[0]
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if render:
                eval_env.envs[0].camera_adjust()
                if args.disable_saving_image:
                    time.sleep(0.01)
                else:
                    (width, height, rgbPixels, _, _) = eval_env.envs[0].pybullet.getCameraImage(1920,1080, renderer=pybullet.ER_BULLET_HARDWARE_OPENGL)
                    image = rgbPixels[:,:,:3]
                    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                    cv2.imwrite(f"{folder}/obs_data_videos/getCameraImage_b{test}_s{seed}_{step:05}.png", image)
            if done:
                # it should not matter if the env reset. I guess...
                # break
                pass
            else:  # after done the env resets, so only record distance while the episode is running
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            # if render:
            #    time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, total_reward {total_reward}, distance_x {distance_x}")

        if args.save_obs_data:
            base_obs_data = None
            sorted_data = g_obs_data.copy()
            if test!=0 or seed!=0:
                # if sorted_arg_obs_data exists, use the existing one
                # because we want to compare the patterns of two experiments
                sorted_arg_obs_data = np.load(f"{folder}/sorted_arg_obs_data.npy")
                base_obs_data = np.load(f"{folder}/base_obs_data.npy")
            else:
                sorted_arg_obs_data = np.argsort(np.mean(sorted_data,axis=0))
                np.save(f"{folder}/sorted_arg_obs_data.npy", sorted_arg_obs_data)
                base_obs_data = g_obs_data.copy()
                np.save(f"{folder}/base_obs_data.npy", base_obs_data)

            # sorted_data = sorted_data[:,sorted_arg_obs_data]
            # base_obs_data = base_obs_data[:, sorted_arg_obs_data]

            for step in tqdm(range(args.test_steps)):
                plt.close()
                plt.figure(figsize=[10,4])
                if test!=0 or seed!=0:
                    x = sorted_data[step]
                    plt.bar(np.arange(len(x)), x, color=[0.1, 0.3, 0.7, 0.5])
                x = base_obs_data[step]
                plt.bar(np.arange(len(x)), x, color=[0.6, 0.6, 0.6, 0.5])
                plt.ylim(-2,2)
                plt.savefig(f"{folder}/obs_data_videos/barchart_b{test}_s{seed}_{step:05}.png")
                plt.close()

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # cast numpy float64 to plain float so yaml does not serialize numpy objects
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)