Example 1
def train(env_id,
          num_timesteps,
          seed,
          policy,
          attack=False,
          n_envs=8,
          nminibatches=4,
          n_steps=128):

    model = PPO2.load("model.pkl")
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    if attack:
        env = VecFrameStack(
            make_adversarial_atari_env(env_id, n_envs, seed, model), 4)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
    #              lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
    #              learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    # Attach the (possibly adversarial) environment to the loaded model before resuming training.
    model.set_env(env)
    model.learn(total_timesteps=num_timesteps)
    model.save("model")
    env.close()
    # Free memory
    del model
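For reference, a minimal standalone sketch of the resume-training pattern used above (load a saved PPO2 checkpoint, attach a fresh environment, continue learning), assuming stable-baselines is installed; the env id, file names and timestep count are illustrative.

from stable_baselines import PPO2
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

env = VecFrameStack(make_atari_env("PongNoFrameskip-v4", num_env=8, seed=0), n_stack=4)
model = PPO2.load("model.pkl")       # previously saved checkpoint
model.set_env(env)                   # a loaded model has no environment attached yet
model.learn(total_timesteps=100000)
model.save("model")
env.close()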
Example 2
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
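A hypothetical call to the helper above; the env id and timestep budget are illustrative, and lr_schedule must be one of the strings listed in the docstring.

# Assumes the stable-baselines imports used by the function above are in scope.
train("BreakoutNoFrameskip-v4", num_timesteps=int(1e6), seed=0,
      policy='cnn', lr_schedule='constant', num_env=16)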
Example 3
def train(env_id, num_timesteps, seed, policy,
          n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    """
    env = make_atari_env(env_id, n_envs, seed)
    env = VecFrameStack(env, 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    model.save('/serverdata/rohit/stablebaselines/atari/ppo/{}'.format(env_id), 'csv')

    env.close()
    # Free memory
    del model
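The learning_rate and cliprange arguments above accept callables: PPO2 in stable-baselines passes the remaining training progress, a fraction that starts at 1 and decays towards 0, so these lambdas implement a linear anneal. A small sketch of that behaviour:

# f is the remaining training progress (1.0 at the start, approaching 0.0 at the end),
# so both schedules decay linearly over the course of training.
learning_rate = lambda f: f * 2.5e-4   # 2.5e-4 -> 0
cliprange = lambda f: f * 0.1          # 0.1    -> 0

for f in (1.0, 0.5, 0.1):
    print(f, learning_rate(f), cliprange(f))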
Example 4
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    train an ACER model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return

    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
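A minimal standalone ACER construction matching the call above, assuming stable-baselines is installed; the env id, number of environments and timestep count are illustrative.

from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

env = VecFrameStack(make_atari_env("SeaquestNoFrameskip-v4", num_env=4, seed=0), n_stack=4)
# buffer_size bounds the off-policy replay buffer, which is what distinguishes ACER from A2C.
model = ACER(CnnPolicy, env, lr_schedule='constant', buffer_size=5000, verbose=1)
model.learn(total_timesteps=10000)
env.close()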
Example 5
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0, n_episodes=1,
                         image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1, batch_size=32,
                            sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
Example 6
def test_ppo(env_id, seed, path_to_policy_params, n_envs=1):

    """
     Evaluate a trained PPO2 policy on an Atari environment and record a video.

     env_id: type str, identifies each environment uniquely
     seed: initial random seed
     path_to_policy_params: path of the saved PPO2 policy parameters to load
     n_envs: number of envs to run in parallel
    """
    
    # Train PPO algorithm for num_timesteps
    # stack 4 frames for the vectorized environment
    # Note: PPO2 works only with vectorized environment
    env = VecFrameStack(make_atari_env(env_id=env_id, num_env=n_envs, seed=seed), 4)
    # define the policy
    # create model object for class PPO2
    # The policy is CnnPolicy from stable baselines and has been trained for 2e7 time steps on Pong
    
    model = PPO2.load(path_to_policy_params)
    vr = video_recorder.VideoRecorder(env, base_path="./videos/Pong_test_without_attack", enabled=True)

    obs = env.reset()
    ep_rew = [0.0]
    ep = 0
    for i in range(50000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        ep_rew[-1] += rewards[0]  # single environment, so track the first (only) reward
        env.render()
        vr.capture_frame()
        if dones[0]:
            obs = env.reset()
            print('Net reward for episode', ep, ':', ep_rew[-1])
            if (ep + 1) % 10 == 0:
                print('Mean reward for last 10 episodes:', np.mean(ep_rew[-10:]))
            ep_rew.append(0.0)
            ep += 1
            print('Number of timesteps completed:', i + 1)
    env.close()
    vr.close()
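A hypothetical invocation of test_ppo() above; the checkpoint path is illustrative, and the ./videos directory used by the VideoRecorder is created up front since it is assumed not to exist yet.

import os

os.makedirs("./videos", exist_ok=True)
test_ppo("PongNoFrameskip-v4", seed=0,
         path_to_policy_params="./models/ppo2_pong.pkl",  # illustrative checkpoint path
         n_envs=1)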
Example 7
def train(
    train_id,
    game,
    level,
    num_processes,
    num_timesteps,
    algo_name,
    policy_name,
    is_joint,
    model_save_path,
    logs_path,
    hyper_opt,
    load_model_path=None,
    train_counter=0,  # To be set (incrementally) when running multiple trainings
    short_life=False,
    backtracking=False,
):
    global global_logs_path, best_mean_reward, n_steps

    print("\n\nStarting training with args:\n")
    print(log_fun_args(locals()))
    print("\n")

    global_logs_path = logs_path
    best_mean_reward, n_steps = -np.inf, 0
    envs = []
    if is_joint:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            ) for i, (game, level) in enumerate(small_train_set)
        ]
    else:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            ) for i in range(num_processes)
        ]

    if num_processes == 1:
        env = VecFrameStack(DummyVecEnv(envs), 4)
    else:
        env = VecFrameStack(SubprocVecEnv(envs), 4)

    print("\n\n")

    if algo_name == "ppo2":
        algo = PPO2
    elif algo_name == "a2c":
        algo = A2C
    else:
        raise ValueError("Unknown algo_name: {}".format(algo_name))

    nminibatches = 4
    if policy_name == "cnn":
        policy = CnnPolicy
    elif policy_name == "cnnlstm":
        if is_joint:
            nminibatches = 5
        policy = CnnLstmPolicy
    else:
        raise ValueError("Unknown policy_name: {}".format(policy_name))

    model = None
    if load_model_path:
        print("Loading a model...")
        model = algo.load(load_model_path, env=env, tensorboard_log=logs_path)
    else:
        print("Creating a new model...")
        if algo_name == "ppo2":
            if hyper_opt:
                model = algo(
                    policy,
                    env,
                    verbose=1,
                    tensorboard_log=logs_path,
                    n_steps=4096,
                    nminibatches=8,
                    learning_rate=2e-4,
                    ent_coef=0.01,
                )
            else:
                model = PPO2(
                    policy,
                    env,
                    nminibatches=nminibatches,
                    verbose=1,
                    tensorboard_log=logs_path,
                )
        elif algo_name == "a2c":
            model = A2C(policy, env, verbose=1, tensorboard_log=logs_path)

    print(f"Starting training for {num_timesteps} timesteps")
    model.learn(total_timesteps=num_timesteps,
                callback=callback,
                log_interval=1)
    print("Training finished!")

    if model_save_path:
        model.save(model_save_path)
        print("Model saved in:\t", model_save_path)

    timestep_values, score_values = ts2xy(load_results(logs_path), "timesteps")
    score_values = score_values * 100

    plot_path = os.path.join(logs_path, f"{level}.png")
    print("Saving the plot in: " + plot_path)
    save_plot(timestep_values, score_values, title=level, save_path=plot_path)

    env.close()
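The make_env helper used above is not shown; DummyVecEnv and SubprocVecEnv both expect a list of zero-argument callables that each build one environment. A minimal sketch of that pattern with plain gym and the stable-baselines Monitor wrapper (env id and log directory are illustrative).

import os
import gym
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv, VecFrameStack

def make_env(env_id, rank, log_dir, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return Monitor(env, os.path.join(log_dir, str(rank)))
    return _init

log_dir = "./logs"
os.makedirs(log_dir, exist_ok=True)
env_fns = [make_env("PongNoFrameskip-v4", rank=i, log_dir=log_dir) for i in range(4)]
# DummyVecEnv(env_fns) keeps everything in one process; SubprocVecEnv may need an
# 'if __name__ == "__main__":' guard on spawn-based platforms.
env = VecFrameStack(SubprocVecEnv(env_fns), 4)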
Example 8
# env id assumed to be config['task'] (it is also used for the recorder's name_prefix below)
env = make_atari_env(config['task'],
                     num_env=config['parallel_envs'],
                     seed=config['seed'])
env = VecFrameStack(env, n_stack=config['state_frames'])
env = VecVideoRecorder(env,
                       video_folder,
                       record_video_trigger=lambda x: x == 0,
                       video_length=video_length,
                       name_prefix="random-agent-{}".format(config['task']))
obs = env.reset()
n = env.action_space.n

device = torch.device('cuda') if config['use_gpu'] else torch.device('cpu')
model = ActorCritic(n, config).to(device)
model.load_state_dict(
    torch.load(r'checkpoints\pong_noEnt\model_recent_ckpt',  # raw string keeps the backslashes literal
               map_location=device))
model.eval()

for i in tqdm(range(video_length)):
    #env.render(mode='rgb_array')
    tensor = torch.from_numpy(obs.astype(np.float32).transpose(
        (0, 3, 1, 2))) / 255
    tensor = torch.nn.functional.interpolate(tensor,
                                             scale_factor=48 /
                                             tensor.shape[-1])
    action, _, _, _ = model.forward(tensor.to(device))
    obs, _, dones, _ = env.step(action)
    if dones.sum() > 0:
        obs = env.reset()
env.close()
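The loop above converts stacked Atari frames from NHWC uint8 (as returned by VecFrameStack, typically 84x84 with a stack of 4) to the NCHW float tensor the PyTorch model expects. A tiny sketch of that conversion with a dummy batch:

import numpy as np
import torch

obs = np.zeros((1, 84, 84, 4), dtype=np.uint8)  # (n_envs, height, width, n_stack)
# NHWC -> NCHW and scale to [0, 1], as done before calling the model above.
tensor = torch.from_numpy(obs.astype(np.float32).transpose((0, 3, 1, 2))) / 255
print(tensor.shape)  # torch.Size([1, 4, 84, 84])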
Example 9
def record_():
    model_path = args.load_model
    assert os.path.isfile(model_path)

    # search skills

    m = re.search(r"\[[0-9, \[\]]*\]", model_path)
    if m is None:
        raise ValueError(
            "load_model: {} does not contain skills".format(model_path))
    skills = str_to_skills(m.group(0))

    # search env-id
    env_id_list = ENV_LIST
    env_id = None
    searched = False
    m = re.search("[A-Z][a-z]*NoFrameskip-v4", model_path)
    if m is not None:
        searched = True
        env_id = m.group(0)

    if searched is not True:
        for id_ in env_id_list:
            if id_.lower() in model_path.lower():
                searched = True
                env_id = id_ + "NoFrameskip-v4"

    if searched is not True:
        raise ValueError(
            "load_model: {} does not contain env id".format(model_path))

    save_path = args.logdir
    if save_path is None:
        save_path = os.path.dirname(model_path)

    print("ENV:{} \nskills:{} \nmodel_path:{} \nsave_path:{}\n".format(
        env_id, skills, model_path, save_path))
    time.sleep(3)

    env_creator_ = lambda env: ActionRemapWrapper(env)
    env_creator = lambda env: SkillWrapper(env_creator_(env), skills=skills)
    env = VecFrameStack(
        make_atari_env(env_id,
                       1,
                       args.seed,
                       extra_wrapper_func=env_creator,
                       logdir=save_path,
                       wrapper_kwargs={
                           "episode_life": False,
                           "clip_rewards": False
                       }), 4)

    if args.load_model is None:
        raise NotImplementedError
    assert os.path.isfile(args.load_model)

    if args.rl_model == "ppo":
        model = PPO2.load(args.load_model)
    elif args.rl_model == "a2c":
        model = A2C.load(args.load_model)
    elif args.rl_model is None:
        if "ppo" in model_path:
            model = PPO2.load(model_path)
        elif "a2c" in model_path:
            model = A2C.load(model_path)
        else:
            raise ValueError("please specify rl_model")
    else:
        raise ValueError("{} rl_model not recognize".format(args.rl_model))

    # DEBUG
    set_global_seeds(args.seed)

    obs = env.reset()
    if args.record:
        env = VecVideoRecorder(env,
                               save_path,
                               record_video_trigger=lambda x: x == 0,
                               video_length=MAX_VIDEO_LENGTH)
        obs = env.reset()  # reset through the recorder so the first frame is captured
    total_rewards = 0

    action_save_path = os.path.join(save_path, "history_action.txt")
    if args.log_action:
        try:
            os.remove(action_save_path)
        except OSError as e:
            if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
                raise  # re-raise exception if a different error occurred
    log_picture = None
    if args.log_picture:
        log_picture = os.path.join(save_path, "history_action_pic")
        log_picture = mkdirs(log_picture, mode="keep")
        action_save_path = os.path.join(log_picture,
                                        os.path.basename(action_save_path))

    print("start evaluate")
    with open(action_save_path, 'a') as f:
        for steps in range(args.eval_max_steps):
            action, _states = model.predict(obs)
            if args.log_action:
                # print("{}".format(action[0]), sep=" ", file=f)
                f.write("{} ".format(action[0]))
            if args.log_picture:
                assert log_picture is not None
                pict = env.render(mode='rgb_array')

                im = Image.fromarray(pict)
                _path = os.path.join(log_picture,
                                     "{}_{}.jpg".format(steps, action[0]))
                im.save(_path)
            obs, rewards, dones, info = env.step(action)
            total_rewards += rewards
            if dones[0]:
                break
    print("steps: {}/{}".format(steps + 1, args.eval_max_steps))
    print("total_rewards: {}".format(total_rewards))
    env.close()
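The env-id regex above only matches single-word, capitalised game names (e.g. Pong, Breakout); names such as MsPacman fall through to the ENV_LIST lookup. A quick demonstration of both searches on an illustrative path:

import re

path = "logs/ppo_PongNoFrameskip-v4_[[0, 2], [1, 1]]/model.pkl"  # illustrative
print(re.search(r"[A-Z][a-z]*NoFrameskip-v4", path).group(0))    # PongNoFrameskip-v4
print(re.search(r"\[[0-9, \[\]]*\]", path).group(0))             # [[0, 2], [1, 1]]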
Example 10
    def get_rewards(self,
                    skills=None,
                    train_total_timesteps=5000000,
                    eval_times=100,
                    eval_max_steps=int(1e6),
                    model_save_name=None,
                    add_info=None):
        """
        :param skills: (list) the available action sequences for the agent,
            e.g. [[0,2,2],[0,1,1]]
        :param train_total_timesteps: (int) total_timesteps to train
        :param eval_times: (int) the number of evaluation episodes,
            e.g. eval_times=100 evaluates the policy by averaging the reward of 100 episodes
        :param eval_max_steps: (int) maximum timesteps per episode when evaluating
        :param model_save_name: (str, deprecated) name of the saved model (should not repeat)
        :param add_info: (dict) other information to log in log.txt
        """
        # Avoid mutable default arguments.
        skills = [] if skills is None else skills
        add_info = {} if add_info is None else add_info

        if self.save_tensorboard and self.save_path is not None:
            tensorboard_log = os.path.join(self.save_path,
                                           "model_" + str(self._serial_num))
        else:
            tensorboard_log = None

        env_creator = lambda env: SkillWrapper(
            self.env_creator(env), skills=skills, gamma=self.gamma)

        if self.save_monitor is True:
            monitor_path = os.path.join(self.save_path, "monitor")
            try:
                os.makedirs(monitor_path)
            except OSError as ex:
                if ex.errno == errno.EEXIST and os.path.exists(monitor_path):
                    print("{} exists. ignore".format(monitor_path))
                    pass
                else:
                    raise
        else:
            monitor_path = None

        if "cfg" in self.env_id:

            env = make_doom_env(self.env_id,
                                self.num_cpu,
                                self.seed,
                                extra_wrapper_func=env_creator,
                                logdir=monitor_path)

        else:
            env = VecFrameStack(
                make_atari_env(self.env_id,
                               self.num_cpu,
                               self.seed,
                               extra_wrapper_func=env_creator,
                               logdir=monitor_path), 4)

        model = None
        if self.use_converge_parameter is True:
            model = self.model(self.policy,
                               env,
                               verbose=self.verbose,
                               tensorboard_log=tensorboard_log,
                               n_steps=128,
                               nminibatches=4,
                               lam=0.95,
                               gamma=0.99,
                               noptepochs=4,
                               ent_coef=.01,
                               learning_rate=lambda f: f * 2.5e-4,
                               cliprange=lambda f: f * 0.1)
        else:
            model = self.model(self.policy,
                               env,
                               verbose=self.verbose,
                               tensorboard_log=tensorboard_log)

        self.strat_time = time.time()
        print("start to train agent...")

        callback = None
        if self.evaluate_freq is not None and self.evaluate_freq > 0:
            period_eval_path = os.path.join(self.save_path, "period_eval")
            mkdirs(period_eval_path)
            if "cfg" in self.env_id:

                eval_env = make_doom_env(self.env_id,
                                         self.num_cpu,
                                         self.seed,
                                         extra_wrapper_func=env_creator,
                                         logdir=monitor_path,
                                         wrapper_kwargs={
                                             "episode_life": False,
                                             "clip_rewards": False
                                         })
            else:
                eval_env = VecFrameStack(
                    make_atari_env(self.env_id,
                                   self.num_cpu,
                                   self.seed,
                                   extra_wrapper_func=env_creator,
                                   logdir=period_eval_path,
                                   wrapper_kwargs={
                                       "episode_life": False,
                                       "clip_rewards": False
                                   }), 4)
            callback = self.eval_callback(eval_env,
                                          freq=self.evaluate_freq,
                                          eval_times=eval_times,
                                          eval_max_steps=eval_max_steps,
                                          save_path=period_eval_path)

        model.learn(total_timesteps=train_total_timesteps,
                    reset_num_timesteps=self.reset_num_timesteps,
                    callback=callback)
        print("Finish train agent")

        #evaluate once more because sometimes it is not divisible
        if callback is not None:
            callback({"self": model, "eval_now": True}, None)

        if self.save_path is not None:
            if self.preserve_model > 0:

                self.save_model(model, skills=skills)

        env.close()
        # evaluate
        env = VecFrameStack(
            make_atari_env(self.env_id,
                           self.num_cpu,
                           self.seed,
                           extra_wrapper_func=env_creator,
                           logdir=None), 4)
        info = self.evaluate(env, model, eval_times, eval_max_steps)
        try:
            env.close()
        except AttributeError as e:
            print("Ignore : {}".format(e))
        try:
            del model
        except AttributeError as e:
            print("Ignore del model : {}".format(e))

        #log result
        info.update(add_info)
        self.log(info)

        self._serial_num = self._serial_num + 1
        return info["ave_score"], info["ave_action_reward"]
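self.evaluate is not shown; as a rough illustration of what the call above could do, here is a sketch that averages episode rewards over a single vectorised environment. Only the ave_score part is covered, and the function name and return keys are assumptions.

def evaluate(env, model, eval_times, eval_max_steps):
    episode_rewards = []
    for _ in range(eval_times):
        obs = env.reset()
        total = 0.0
        for _ in range(eval_max_steps):
            action, _ = model.predict(obs)
            obs, rewards, dones, _ = env.step(action)
            total += rewards[0]       # single environment assumed
            if dones[0]:
                break
        episode_rewards.append(total)
    return {"ave_score": sum(episode_rewards) / len(episode_rewards)}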