Example no. 1
def record(model, env, num_episodes=1):
  """
  Evaluate a RL agent
  :param model: (BaseRLModel object) the RL Agent
  :param num_steps: (int) number of timesteps to evaluate it
  :return: (float) Mean reward for the last 100 episodes
  """
  env = VecVideoRecorder(env,"./vid", record_video_trigger=lambda x: x == 0, video_length=12000, name_prefix="tetris_ai_video")
  obs = env.reset()
  i=0
  steps = 0
  while i < num_episodes:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    steps+=1
    if dones[0]:
        obs = env.reset()
        print("=== EPISODE {} ===".format(i+1))
        print("Num_lines: " + str(info[0]['number_of_lines']))
        print("score: " + str(info[0]['score']))
        print("Number of episodes: ", i)
        print("=== END ===")
        i+=1
  
  return "Done"
Example no. 2
def record_video(env_id,
                 model,
                 video_length=500,
                 prefix='',
                 video_folder='videos/'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
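
Minimal usage sketch, assuming the imports already used above (gym, DummyVecEnv, VecVideoRecorder) plus a trained stable-baselines agent; the env id and checkpoint name are placeholders.

from stable_baselines import PPO2

model = PPO2.load("ppo2_cartpole")  # placeholder checkpoint name
record_video('CartPole-v1', model, video_length=500,
             prefix='ppo2-cartpole', video_folder='videos/')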
Example no. 3
def _play_n_game(model,
                 task: str,
                 n_games: int,
                 display=False,
                 record=False):
    env = model.env_dict[task]
    if record:
        env = VecVideoRecorder(env,
                               './data/videos/',
                               record_video_trigger=lambda x: x == 0,
                               video_length=10_000,
                               name_prefix="trained-agent-{}".format(task))
    timesteps = 0
    sum_reward = 0
    for i in range(n_games):
        obs = env.reset()
        done = None
        state = None
        while not done:
            action, state = model.predict(task, obs, state, done)
            obs, reward, done, info = env.step(action)
            timesteps += 1
            sum_reward += reward
            if display is True:
                env.render()
                time.sleep(0.005)
    sum_reward = int(sum_reward / n_games)
    if sum_reward == 0:  # the harmonic mean needs elements greater than zero
        sum_reward = 0.1
    timesteps = int(timesteps / n_games)
    env.close()
    return sum_reward, timesteps
Example no. 4
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7

    save_video_length = 200
    save_video_interval = 1000000
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name, np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
Example no. 5
def run_experiment(verbose, tensorboard_log, learning_rate):
    pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Example no. 6
def record_video(env_id,
                 model,
                 video_length=300,
                 prefix='',
                 video_folder='videos/',
                 lstm=False):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 300 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    state = None
    for _ in range(video_length):
        action, state = model.predict(np.tile(obs, (model.n_envs, 1)),
                                      state=state,
                                      deterministic=False)
        action = action[[0]] if lstm else action[0]
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
Example no. 7
def wrap_video_env(env, name, video_length, path='videos/'):
    """Wrap the environment to record a video"""
    env = VecVideoRecorder(env,
                           path,
                           record_video_trigger=lambda x: x == 0,
                           video_length=video_length,
                           name_prefix=name)
    return env
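
Usage sketch: wrap a vectorized environment, step it with random actions, then close it to flush the recording; the env id and video name are placeholders.

import gym
from stable_baselines.common.vec_env import DummyVecEnv

video_length = 1000
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
env = wrap_video_env(env, name='random-cartpole', video_length=video_length)

obs = env.reset()
for _ in range(video_length + 1):
    obs, _, _, _ = env.step([env.action_space.sample()])
env.close()  # closing the recorder saves the video under videos/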
Example no. 8
def _load(model_name):
    model = PPO2.load(model_name)
    env = make_vec_env('PointMassDense-%d-v1' % num_objs, 1,
                       wrapper_class=FlattenDictWrapper,
                       wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(env, osp.join(logger, "videos_3"),
                           record_video_trigger=lambda x: x % save_video_interval == 0,
                           video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=int(nIter), log_interval=100)
    # model.save(exp_name)
    model.save(model_name + "_new")
    env.close()
Example no. 9
def gen_env(env_id, seed, log_dir, record, video_dir):
    env = DummyVecEnv([lambda: create_env(env_id, seed, 0, log_dir)])
    if record:
        now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        env = VecVideoRecorder(env,
                               video_folder=video_dir,
                               record_video_trigger=lambda x: x % 1000 == 0,
                               name_prefix='{}'.format(now))
    return env
Example no. 10
def record_video(model,
                 env_id=None,
                 eval_env=None,
                 max_video_length=500,
                 video_prefix='',
                 video_folder='videos/',
                 break_early=False,
                 is_recurrent=False):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param max_video_length: (int)
    :param video_prefix: (str)
    :param video_folder: (str)
    """

    # directly passing an environment overrides passing in an env_id
    if eval_env is None:
        eval_env = DummyVecEnv([lambda: gym.make(env_id)])

    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=max_video_length,
                                name_prefix=video_prefix)

    # according to docs, recurrent policies must have "state" set like this;
    # initialise it unconditionally so the predict() call below also works
    # for non-recurrent models
    state = None

    # When using VecEnv, done is a vector
    is_single_env = (eval_env.num_envs == 1)
    doneVec = [False for _ in range(model.n_envs)]

    obs = eval_env.reset()

    for _ in range(max_video_length):

        # We need to pass the previous state and a mask for recurrent policies
        # to reset lstm state when a new episode begin
        action, state = model.predict(obs, state=state, mask=doneVec)

        # only allow recurrent models to continually update their state
        if not is_recurrent:
            state = None

        obs, _, done, _ = eval_env.step(action)

        if is_single_env:
            doneVec[0] = copy.deepcopy(done[0])
        else:
            doneVec = copy.deepcopy(done)

    # Close the video recorder
    eval_env.close()
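
Usage sketch (hypothetical checkpoints and env id): a feed-forward agent recorded by env id, then a recurrent agent. Note that stable-baselines recurrent policies expect observation batches sized to the number of training environments, so a single-env LSTM model is assumed for the second call.

from stable_baselines import PPO2

model = PPO2.load("ppo2_cartpole")            # placeholder checkpoint name
record_video(model, env_id='CartPole-v1',
             max_video_length=500, video_prefix='ppo2-cartpole')

lstm_model = PPO2.load("ppo2_lstm_cartpole")  # placeholder checkpoint (trained with n_envs=1)
record_video(lstm_model, env_id='CartPole-v1',
             max_video_length=500, video_prefix='ppo2-lstm',
             is_recurrent=True)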
Example no. 11
def gen_env(env_id, seed, log_dir, n_cpu, video_dir, now, record):
    env = DummyVecEnv(
        [lambda: create_env(env_id, seed, i, log_dir) for i in range(n_cpu)])
    if record:
        env = VecVideoRecorder(env,
                               video_folder=video_dir,
                               record_video_trigger=lambda x: x % 1000 == 0,
                               name_prefix='{}'.format(now),
                               video_length=2000)
    return env
Example no. 12
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8

    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  #, 5e-4, 1e-5
        logger = osp.join(
            expDir, name, 'logs%s_%s' % (np.format_float_scientific(nIter),
                                         np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
Example no. 13
def _setup_video_recorder(self, video_path):
    if distutils.spawn.find_executable(
            "avconv") or distutils.spawn.find_executable("ffmpeg"):
        logging.info("Using installed standard video encoder.")
        self.env = VecVideoRecorder(self.env,
                                    video_path,
                                    record_video_trigger=lambda x: x == 0,
                                    video_length=10000,
                                    name_prefix=util.get_timestamp())
    else:
        logging.warning(
            "Did not find avconv or ffmpeg - using gif as a video container replacement."
        )
        self.env = VecImageRecorder(self.env, video_path)
Example no. 14
def record(exp):
    model = SAC.load(exp)
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos_2"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model.set_env(env)
    model.learn(total_timesteps=2000, log_interval=100)
    # model.save(expDir + "/%s/%d" %(name, nIter))
    env.close()
Example no. 15
def learn():
    # expDir = '/home/shivanik/lab/pointExp/state/'
    # verbose = 1
    # num_objs = 1
    # name = 'ppo2_%d' % num_objs
    # logger = osp.join(expDir, name, 'logs')
    # video_folder = osp.join(logger, 'videos')
    # nIter = 1e7
    # save_video_interval = 5000

    env = make_vec_env('PointMassDense-%d-v1' % num_objs, 1,
                       wrapper_class=FlattenDictWrapper,
                       wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(env, video_folder,
                           record_video_trigger=lambda x: x % save_video_interval == 0,
                           video_length=save_video_length,
                           name_prefix="Video-{}")

    model = PPO2(MlpPolicy, env, verbose=verbose,
                 tensorboard_log=logger)
    model.learn(total_timesteps=int(nIter))
    model.save(expDir + "/%s/%s" % (name, np.format_float_scientific(nIter)))
Example no. 16
        def create_test_env(video_name_prefix, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env,
                                                video_folder,
                                                record_video_trigger=lambda x:
                                                (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env

            return env, active_env
Example no. 17
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(
        args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name][
        'config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = multiprocessing.cpu_count()

    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR,
        'sac': SAC,
        'trpo': TRPO
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        #envconfig_play['autocamera3d'] = False
        env = create_env(env_id,
                         envconfig_play,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot,
                         verbose=True)
        print('Created environment instance')

        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            args.video_dir,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif (args.mode == 'enjoy'):
        agent = model.load(args.agent)

        figure_folder = os.path.join(DIR_PATH, 'logs', 'enjoys', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)

        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id,
                         envconfig,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            video_folder,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        t_steps = 0
        ep_number = 1
        done = [False for _ in range(vec_env.num_envs)]
        for _ in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(
                    observation=obs,
                    state=state,
                    mask=done,
                    deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
            t_steps += 1

            if t_steps % 800 == 0 or done:
                if not done:
                    env.save_latest_episode(save_history=False)
                gym_auv.reporting.plot_trajectory(
                    env,
                    fig_dir=scenario_folder,
                    fig_prefix=(args.env +
                                '_ep{}_step{}'.format(ep_number, t_steps)))
                gym_auv.reporting.plot_trajectory(
                    env,
                    fig_dir=scenario_folder,
                    fig_prefix=(
                        args.env +
                        '_ep{}_step{}_local'.format(ep_number, t_steps)),
                    local=True)
            if done:
                ep_number += 1
        recorded_env.close()

    elif (args.mode == 'train'):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard',
                                       args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if (args.nomp or model == DDPG or model == TD3 or model == SAC
                or model == TRPO):
            num_cpu = 1
            vec_env = DummyVecEnv(
                [lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([
                make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                for i in range(num_cpu)
            ])

        if (args.agent is not None):
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if (model == PPO2):
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self,
                                     sess,
                                     ob_space,
                                     ac_space,
                                     n_env,
                                     n_steps,
                                     n_batch,
                                     n_lstm=256,
                                     reuse=False,
                                     **_kwargs):
                            super().__init__(sess,
                                             ob_space,
                                             ac_space,
                                             n_env,
                                             n_steps,
                                             n_batch,
                                             n_lstm,
                                             reuse,
                                             net_arch=[
                                                 256, 256, 'lstm',
                                                 dict(vf=[64], pi=[64])
                                             ],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    #policy_kwargs = dict(net_arch=[64, 64, 64])
                    layers = [256, 128, 64]
                    #layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams,
                                 policy_kwargs=policy_kwargs)
                    #dataset = ExpertDataset(expert_path='gail_expert.npz', traj_limitation=1, batch_size=128)
                    #print('Pretraining {} agent on "{}"'.format(args.algo.upper(), env_id))
                    #agent.pretrain(dataset, n_epochs=1000)
                    #print('Done pretraining {} agent on "{}"'.format(args.algo.upper(), env_id))
            elif (model == DDPG):
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'memory_limit': 50000,
                #     'normalize_observations': True,
                #     'normalize_returns': False,
                #     'gamma': 0.98,
                #     'actor_lr': 0.00156,
                #     'critic_lr': 0.00156,
                #     'batch_size': 256,
                #     'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
                # }
                hyperparams = {
                    'memory_limit': 1000000,
                    'normalize_observations': True,
                    'normalize_returns': False,
                    'gamma': 0.98,
                    'actor_lr': 0.00156,
                    'critic_lr': 0.00156,
                    'batch_size': 256,
                    'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.287,
                                                          desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log,
                             **hyperparams)
            elif (model == TD3):
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'batch_size': 256,
                #     'buffer_size': 50000,
                #     'learning_starts': 1000
                # }
                hyperparams = {
                    'buffer_size': 1000000,
                    'train_freq': 1000,
                    'gradient_steps': 1000,
                    'learning_starts': 10000
                }
                action_noise = NormalActionNoise(mean=np.zeros(2),
                                                 sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            action_noise=action_noise,
                            **hyperparams)
            elif model == A2C:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'n_steps': 5,
                #     'gamma': 0.995,
                #     'ent_coef': 0.00001,
                #     'learning_rate': 0.00083,
                #     'lr_schedule': 'linear'
                # }
                # layers = [256, 128, 64]
                hyperparams = {
                    'n_steps': 16,
                    'gamma': 0.99,
                    'ent_coef': 0.001,
                    'learning_rate': 2e-4,
                    'lr_schedule': 'linear'
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            **hyperparams,
                            policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'gamma': 0.99,
                #     'n_steps': 16,
                #     'ent_coef': 0.0,
                #     'learning_rate': 0.06,
                #     'lr_schedule': 'constant'
                # }
                # agent = ACKTR(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams)
                agent = ACKTR(MlpPolicy,
                              vec_env,
                              verbose=True,
                              tensorboard_log=tensorboard_log)
            elif model == SAC:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'batch_size': 256,
                #     'learning_starts': 1000
                # }
                # agent = SAC(stable_baselines.sac.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams)
                agent = SAC(stable_baselines.sac.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log)
            elif model == TRPO:
                agent = TRPO(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()

            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass

            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config[
                "n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if (episode < len(env_histories[env_idx])):
                        report_env.history.append(
                            env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1

            total_t_steps = _self.get_env().get_attr(
                'total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder,
                                          str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 10 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR or model == SAC or model == TRPO:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(
                    report_env.history) > 0 and report_criteria:
                try:
                    #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env,
                                             report_dir=figure_folder)
                    #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, args.pilot, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                else:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                subprocess.Popen(cmd)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(total_timesteps=1500000,
                    tb_log_name='log',
                    callback=callback)

    elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id,
                                 envconfig,
                                 test_mode=True,
                                 pilot=args.pilot)
                valuedict_str = '_'.join(
                    (key + '-' + str(val) for key, val in valuedict.items()))

                print('Running {} test for {}...'.format(
                    args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env,
                                                   agent,
                                                   fig_dir=figure_folder,
                                                   fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)

        else:
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env,
                                               agent,
                                               fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env,
                                                    agent,
                                                    fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env,
                                                   agent,
                                                   fig_dir=figure_folder)

        print('Output folder: ', figure_folder)

    elif args.mode == 'test':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env,
                                     EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        def create_test_env(video_name_prefix, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env,
                                                video_folder,
                                                record_video_trigger=lambda x:
                                                (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env

            return env, active_env

        failed_tests = []

        def run_test(id,
                     reset=True,
                     report_dir=figure_folder,
                     scenario=None,
                     max_t_steps=None,
                     env=None,
                     active_env=None):
            nonlocal failed_tests

            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env +
                                                  '_' + id)

            if scenario is not None:
                obs = active_env.reset()
                env.load(args.scenario)
                print('Loaded', args.scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env,
                                            fig_dir=scenario_folder,
                                            fig_postfix=id,
                                            show=args.onlyplot)
            if args.onlyplot:
                return
            cumulative_reward = 0
            t_steps = 0
            if max_t_steps is None:
                done = False
            else:
                done = t_steps > max_t_steps

            while not done:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 1000 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env,
                            fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) +
                                        '_' + str(size) + '_' + id),
                            local=True,
                            size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(env,
                                                      fig_dir=scenario_folder,
                                                      fig_prefix=(args.env +
                                                                  '_' + id))

            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            #env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'),
                          'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n '.format(
            args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions',
            'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = -np.log10(
                        valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                        episode_dict[valuedict_str] = [last_episode, colorval]
                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=scenario_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)

            else:
                run_test("ep0", reset=True, scenario=args.scenario)

        else:
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = np.log10(
                        valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict['Agent ' +
                                 str(agent_index)] = [last_episode, colorval]
                    agent_index += 1

                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=figure_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode),
                             env=env,
                             active_env=active_env)

        if args.video and active_env:
            active_env.close()
Example no. 18
def learn(model_factory: ModelFactory, multiprocess: bool = True, time_steps: int = int(1e6),
          record_video: bool = True):
    def callback(locals_, globals_):
        self_ = locals_["self"]

        mean_actions = np.mean(self_.env.get_attr("actions_per_timestep"))
        mean_actions_tf = tf.Summary(value=[tf.Summary.Value(tag='simulation/mean_actions', simple_value=mean_actions)])
        winning_ratio = np.mean(self_.env.get_attr("winning_ratio"))
        winning_ratio_tf = tf.Summary(
            value=[tf.Summary.Value(tag='simulation/winning_ratio', simple_value=winning_ratio)])
        locals_['writer'].add_summary(mean_actions_tf, self_.num_timesteps)
        locals_['writer'].add_summary(winning_ratio_tf, self_.num_timesteps)

        if isinstance(model_factory, PPO2ModelFactory):
            fps = tf.Summary(value=[tf.Summary.Value(tag='simulation/fps', simple_value=locals_['fps'])])
            mean_length = np.mean([info["l"] for info in locals_["ep_infos"]])
            mean_length_tf = tf.Summary(
                value=[tf.Summary.Value(tag='simulation/mean_episode_length', simple_value=mean_length)])
            locals_['writer'].add_summary(fps, self_.num_timesteps)
            locals_['writer'].add_summary(mean_length_tf, self_.num_timesteps)
        return True

    def video_trigger(step):
        # allow warm-up for video recording
        if not record_video or step < time_steps / 3:
            return False

        return step % (int(time_steps / 8)) == 0

    log_dir = "../logs/%s/" % datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")

    log_dir_tensorboard = "../logs/tensorboard/"
    print("Tensorboard log directory: %s" % os.path.abspath(log_dir_tensorboard))

    model_dir = os.path.join(log_dir, "model")
    os.makedirs(model_dir, exist_ok=True)

    video_dir = os.path.join(log_dir, "videos")
    os.makedirs(video_dir, exist_ok=True)

    def make_env():
        log_dir_single = "%s/%s/" % (log_dir, uuid.uuid4())
        env = gym.make('AtcEnv-v0')
        os.makedirs(log_dir_single, exist_ok=True)
        env = Monitor(env, log_dir_single, allow_early_resets=True)
        return env

    n_envs = 8
    if multiprocess:
        env = SubprocVecEnv([lambda: make_env() for i in range(n_envs)])
    else:
        env = DummyVecEnv([lambda: make_env()])

    if record_video:
        env = VecVideoRecorder(env, video_dir, video_trigger, video_length=2000)

    model = model_factory.build(env, log_dir_tensorboard)

    yaml.dump(model_factory.hyperparams, open(os.path.join(model_dir, "hyperparams.yml"), "w+"))

    # model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=time_steps, callback=callback)

    model.save("%s/PPO2_atc_gym" % model_dir)

    # render trained model actions on screen and to file
    eval_observations_file = open(os.path.join(model_dir, "evaluation.csv"), 'a+')
    new_env = gym.make('AtcEnv-v0')
    obs = new_env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = new_env.step(action)
        original_state = info["original_state"]
        eval_observations_file.write("%.2f, %.2f, %.0f, %.1f\n" %
                                     (original_state[0], original_state[1], original_state[2], original_state[3]))
        new_env.render()
        if done:
            obs = new_env.reset()
Example no. 19
TRAIN = False

num_env = 1
if TRAIN:
    num_env = 4
env = make_atari_env(ENV_NAME, num_env=num_env, seed=0)
env = VecFrameStack(env, n_stack=4)

if TRAIN:
    model = PPO2(CnnPolicy, env, verbose=1, tensorboard_log=SAVE_SUMMARY_PATH)
    model.learn(total_timesteps=20000000)
    model.save(SAVE_NETWORK_PATH)

    del model
else:
    obs = env.reset()
    env = VecVideoRecorder(env, VIDEO_FOLDER,
                           record_video_trigger=lambda x: x == 0, video_length=VIDEO_LENGTH,
                           name_prefix="PPO_"+ENV_NAME)
    env.reset()
    model = PPO2.load(SAVE_NETWORK_PATH)
    cnt = 0
    for i in range(VIDEO_LENGTH):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        cnt += int(done)
        if cnt==5:
            break
env.close()
Example no. 20
import numpy as np
import os.path as osp

from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
from stable_baselines import DDPG
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv
from gym.wrappers import FlattenDictWrapper
# make_vec_env (with wrapper_env_kwargs support) is assumed to be provided elsewhere in the project

expDir = '/home/shivanik/lab/pointExp/state/'
verbose = 1
nIter = 5e6
name = 'ddpg_1_%s' %np.format_float_scientific(nIter)
logger = osp.join(expDir, name, 'logs')
video_folder = osp.join(logger, 'videos')

env = make_vec_env('PointMassDense-1-v1', 4,
                   wrapper_class=FlattenDictWrapper,
                   wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x % 100000 == 0,
                       video_length=400,
                       name_prefix="Video-{}")

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.2) * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
model.learn(total_timesteps=int(nIter))
model.save(expDir + "/%s/model" %(name, np.format_float_scientific(nIter)))

# del model # remove to demonstrate saving and loading

# model = DDPG.load("ddpg_mountain")
Example no. 21
                      n_envs=n_envs,
                      is_atari=is_atari,
                      stats_path=stats_path,
                      seed=seed,
                      log_dir=None,
                      should_render=not args.no_render,
                      hyperparams=hyperparams)

model = ALGOS[algo].load(model_path)

obs = env.reset()

# Note: apparently it renders by default
env = VecVideoRecorder(env,
                       video_folder,
                       record_video_trigger=lambda x: x == 0,
                       video_length=video_length,
                       name_prefix="{}-{}".format(algo, env_id))

env.reset()
for _ in range(video_length + 1):
    # action = [env.action_space.sample()]
    action, _ = model.predict(obs, deterministic=deterministic)
    if isinstance(env.action_space, gym.spaces.Box):
        action = np.clip(action, env.action_space.low, env.action_space.high)
    obs, _, _, _ = env.step(action)

# Workaround for https://github.com/openai/gym/issues/893
if n_envs == 1 and 'Bullet' not in env_id and not is_atari:
    env = env.venv
    # DummyVecEnv
Example no. 22
def record_():
    model_path = args.load_model
    assert os.path.isfile(model_path)

    # search skills

    m = re.search("\[[0-9\, \[\]]*\]", model_path)
    if m is None:
        raise ValueError(
            "load_model: {} does not contain skills".format(model_path))
    skills = str_to_skills(m.group(0))

    # search env-id
    env_id_list = ENV_LIST
    env_id = None
    searched = False
    m = re.search("[A-Z][a-z]*NoFrameskip-v4", model_path)
    if m is not None:
        searched = True
        env_id = m.group(0)

    if searched is not True:
        for id_ in env_id_list:
            if id_.lower() in model_path.lower():
                searched = True
                env_id = id_ + "NoFrameskip-v4"

    if searched is not True:
        raise ValueError(
            "load_model: {} does not contain env id".format(model_path))

    save_path = args.logdir
    if save_path is None:
        save_path = os.path.dirname(model_path)

    print("ENV:{} \nskills:{} \nmodel_path:{} \nsave_path:{}\n".format(
        env_id, skills, model_path, save_path))
    time.sleep(3)

    env_creator_ = lambda env: ActionRemapWrapper(env)
    env_creator = lambda env: SkillWrapper(env_creator_(env), skills=skills)
    env = VecFrameStack(
        make_atari_env(env_id,
                       1,
                       args.seed,
                       extra_wrapper_func=env_creator,
                       logdir=save_path,
                       wrapper_kwargs={
                           "episode_life": False,
                           "clip_rewards": False
                       }), 4)

    if args.load_model is None:
        raise NotImplementedError
    assert os.path.isfile(args.load_model)

    if args.rl_model == "ppo":
        model = PPO2.load(args.load_model)
    elif args.rl_model == "a2c":
        model = A2C.load(args.load_model)
    elif args.rl_model is None:
        if "ppo" in model_path:
            model = PPO2.load(model_path)
        elif "a2c" in model_path:
            model = A2C.load(model_path)
        else:
            raise ValueError("please specify rl_model")
    else:
        raise ValueError("{} rl_model not recognize".format(args.rl_model))

    # DEBUG
    set_global_seeds(args.seed)

    obs = env.reset()
    if args.record:
        env = VecVideoRecorder(env,
                               save_path,
                               record_video_trigger=lambda x: x == 0,
                               video_length=MAX_VIDEO_LENGTH)
        env.reset()
    total_rewards = 0

    action_save_path = os.path.join(save_path, "history_action.txt")
    if args.log_action:
        try:
            os.remove(action_save_path)
        except OSError as e:
            if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
                raise  # re-raise exception if a different error occurred
    log_picture = None
    if args.log_picture:
        log_picture = os.path.join(save_path, "history_action_pic")
        log_picture = mkdirs(log_picture, mode="keep")
        action_save_path = os.path.join(log_picture,
                                        os.path.basename(action_save_path))
        # try:
        #     # shutil.rmtree()
        # except:

    print("start evaluate")
    with open(action_save_path, 'a') as f:
        for steps in range(args.eval_max_steps):
            action, _states = model.predict(obs)
            if args.log_action:
                # print("{}".format(action[0]), sep=" ", file=f)
                f.write("{} ".format(action[0]))
            if args.log_picture:
                assert log_picture is not None
                pict = env.render(mode='rgb_array')

                im = Image.fromarray(pict)
                _path = os.path.join(log_picture,
                                     "{}_{}.jpg".format(steps, action[0]))
                im.save(_path)
            obs, rewards, dones, info = env.step(action)
            total_rewards += rewards
            if bool(dones[0]) is True:
                break
    print("steps: {}/{}".format(steps + 1, args.eval_max_steps))
    print("total_rewards: {}".format(total_rewards))
    env.close()
Example no. 23
from stable_baselines.common import set_global_seeds
set_global_seeds(75)
env = gym.make('Hopper-v2')
env.seed(75)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
n_actions = env.action_space.shape[-1]
action_noise =OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.2) * np.ones(n_actions))
model = DDPG(LnMlpPolicy, env, param_noise=None, batch_size=64, buffer_size=1000000,
             enable_popart=False, action_noise=action_noise, verbose=4, seed=75, n_cpu_tf_sess=1)
# load is a classmethod: it returns a new model built from the checkpoint
model = DDPG.load(r"/home/mohit/Downloads/stable-baselines/results_mohit/ddpg/Hopper-v2/None/75/best_model.pkl")
env_id = 'Hopper-v2'
video_folder = 'home/'
video_length = 1000

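# A fresh vec env is built here purely for recording the rollout video.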
env = DummyVecEnv([lambda: gym.make(env_id)])

# Record the video starting at the first step
env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x == 0, video_length=video_length,
                       name_prefix="random-agent-{}".format(env_id))

obs = env.reset()
for _ in range(video_length + 1):
    action, _states = model.predict(obs)
    # here, action, rewards and dones are arrays
    # because we are using a vectorized env
    obs, rewards, dones, info = env.step(action)
# Save the video
env.close()
Example n. 24
0
def main():
    ##setup args
    parser = argparse.ArgumentParser(
        description='Reward learning from preferences')

    parser.add_argument('--env_type', type=str, default='procgen')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--log_dir', type=str, default='LOGS')
    parser.add_argument('--log_name', type=str, default='')

    parser.add_argument('--resume_training', action='store_true')

    parser.add_argument('--init_buffer_size', type=int, default=500)
    parser.add_argument('--clip_size', type=int, default=25)
    parser.add_argument('--num_iters', type=int, default=500)
    parser.add_argument('--steps_per_iter', type=int, default=2 * 10**5)
    parser.add_argument('--pairs_per_iter', type=int, default=10**5)
    parser.add_argument('--pairs_in_batch', type=int, default=16)
    parser.add_argument('--l2', type=float, default=0.0001)

    args = parser.parse_args()

    args.ppo_kwargs = dict(verbose=1,
                           n_steps=256,
                           noptepochs=3,
                           nminibatches=8)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f'\n Using {device} for training')

    run_dir, monitor_dir, video_dir = setup_logging(args)

    if args.resume_training:
        reward_model, policy, data_buffer, i_num = load_state(run_dir)
        args = load_args(args)

    #initializing objects
    if args.env_type == 'procgen':
        env_fn = lambda: Gym_procgen_continuous(env_name=args.env_name,
                                                distribution_mode=args.distribution_mode,
                                                num_levels=args.num_levels,
                                                start_level=args.start_level)
    elif args.env_type == 'atari':
        env_fn = lambda: Atari_continuous(args.env_name)

    venv_fn = lambda: make_vec_env(env_fn,
                                   monitor_dir=monitor_dir,
                                   n_envs=multiprocessing.cpu_count(),
                                   vec_env_cls=SubprocVecEnv)

    #in case this is a fresh run
    if not args.resume_training:
        i_num = 0
        policy = PPO2(ImpalaPolicy, venv_fn(), **args.ppo_kwargs)
        reward_model = RewardNet(l2=args.l2, env_type=args.env_type)
        data_buffer = AnnotationBuffer()
        store_args(args, run_dir)

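    # Main loop: top up the annotation buffer, retrain the reward model,
    # train the policy against it, then record and evaluate the result.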
    for i in range(i_num, args.num_iters + i_num):
        print(f'================== iter : {i} ====================')

        num_pairs = int(args.init_buffer_size / (i + 1))

        prev_size = data_buffer.size
        while data_buffer.size - prev_size < num_pairs:
            annotations = collect_annotations(env_fn, policy, num_pairs,
                                              args.clip_size)
            data_buffer.add(annotations)

        print(f'Buffer size = {data_buffer.size}')

        reward_model, rm_train_stats = train_reward(reward_model, data_buffer,
                                                    args.pairs_per_iter,
                                                    args.pairs_in_batch)
        policy = train_policy(venv_fn, reward_model, policy,
                              args.steps_per_iter, device)

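        # Record one evaluation episode and compare performance under the
        # true environment reward vs. the learned (proxy) reward.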
        eval_env = VecVideoRecorder(DummyVecEnv([env_fn]),
                                    video_dir,
                                    record_video_trigger=lambda x: x == 0,
                                    video_length=10000,
                                    name_prefix="on_iter_{}".format(i))

        proxy_reward_function = lambda x: reward_model.rew_fn(
            torch.from_numpy(x)[None, :].float().to(device))
        proxy_eval_env = Reward_wrapper(env_fn(), proxy_reward_function)

        true_performance, _ = evaluate_policy(policy,
                                              eval_env,
                                              n_eval_episodes=1)
        proxy_performance, _ = evaluate_policy(policy,
                                               proxy_eval_env,
                                               n_eval_episodes=1)

        print(f'True policy performance = {true_performance}')
        print(f'Proxy policy performance = {proxy_performance}')

        save_state(run_dir, i, reward_model, policy, data_buffer)
        log_iter(run_dir, i, data_buffer, true_performance, proxy_performance,
                 rm_train_stats)

        os.rename(monitor_dir, monitor_dir + '_' + str(i))
Example n. 25
0
from stable_baselines import PPO2
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv

mode = 'show'

if mode == 'video':
    video_folder = '../data/videos'
    video_length = 5000

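    # Load the pretrained agent and record a video_length-step clip of it.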
    env = DummyVecEnv([lambda: controlTableLine()])

    env = VecVideoRecorder(
        env,
        video_folder,
        record_video_trigger=lambda x: x == 0,
        video_length=video_length,
        name_prefix="PPO_controlTableLine",
    )
    obs = env.reset()

    model = PPO2.load('../data/pretrained_models/controlTableLine/PPO')

    for _ in range(video_length + 1):
        action, _ = model.predict(obs, deterministic=True)
        obs, _, _, _ = env.step(action)
    env.close()

elif mode == 'gif':
    import imageio
    from stable_baselines.common.cmd_util import make_vec_env
Example n. 26
0
    try:
        model = PPO2.load(path)
    except ValueError:
        # Problem when loading model. Probably wrong path provided.
        print(
            '\nError: Make sure to have the pre-trained models available or to provide a valid path to a custom model as an argument.\n'
        )
        print('Provided path: ' + path + '\n')
        sys.exit()

    video_id, video_folder, env_id = recording_destination_and_name(path)
    create_dir(video_folder)
    video_length = VIDEO_LENGTH
    num_videos = NUMBER_OF_RECORDINGS

    # Record the video starting at the first step
    env = VecVideoRecorder(venv=env,
                           video_folder=video_folder,
                           record_video_trigger=lambda x: x == 0,
                           video_length=video_length,
                           name_prefix=video_id)

    # Enjoy trained agent
    obs = env.reset()
    for _ in range(num_videos * (video_length + 1)):
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        #env.render()
    env.close()
Example n. 27
0
    if verbose:
        logging.basicConfig(level=logging.INFO)
    else:
        # quieter default when not running verbose
        logging.basicConfig(level=logging.WARNING)

    record = args.record
    video_dir = os.path.join(args.video_dir, env_id, name)
    os.makedirs(video_dir, exist_ok=True)

    # bind i as a default argument so each lambda builds the env with its own index
    env = DummyVecEnv([lambda i=i: create_env(env_id, seed, i, log_dir) for i in range(n_cpu)])
    if record:
        now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        env = VecVideoRecorder(env,
                               video_folder=video_dir,
                               record_video_trigger=lambda x: x % 1000 == 0,
                               name_prefix='{}'.format(now),
                               video_length=2000)
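    # Build either the TLPPO2 model with an STL monitor, or plain PPO2.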
    if args.use_stl:
        model = TLPPO2(MlpPolicy, env, verbose=verbose, n_steps=n_steps, tensorboard_log=log_dir)
        spec, signals, monitor = get_spec(args.stl_spec)
        model.monitor = STLMonitor(spec, signals, monitor, n_steps, 0.005)
        log.info('STL: {}'.format(latex(spec)))
        name = 'TLPPO'
    else:
        model = PPO2(MlpPolicy, env, verbose=verbose, n_steps=n_steps, tensorboard_log=log_dir)
        name = 'PPO'
    set_global_seeds(seed)

    model.learn(total_timesteps, reset_num_timesteps=True)
    backup_dir = os.path.join('backup', name)