def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):

    """
    Runs the test
    """

    logger.log(f"#######CMA and then PPO TRAIN: {args}")

    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(conti_ppo_save_dir):
        import shutil
        shutil.rmtree(conti_ppo_save_dir)
    os.makedirs(conti_ppo_save_dir)



    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out
    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)

    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)


    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99,
    #              noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)

    model.tell_run_info(run_info)
    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)

    model.save(f"{conti_ppo_save_dir}/ppo2")

    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
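
The helper above depends on a forked PPO2 that exposes custom methods (set_from_flat, tell_run_info). With stock stable-baselines 2.x, the underlying continue-training pattern reduces to roughly the sketch below; the environment id and paths are illustrative placeholders, not values from the example.

# Continue-training sketch for stock stable-baselines 2.x (env id and paths are placeholders).
import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
env = VecNormalize(env)

model = PPO2.load("full_space_save_dir/ppo2")  # restores weights and hyperparameters
model.set_env(env)                             # attach the new, normalized env
model.learn(total_timesteps=100000)            # resume optimization from the loaded weights
model.save("conti_ppo_save_dir/ppo2")
env.save_running_average("conti_ppo_save_dir")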
Code example #2
def train(env_id, num_timesteps, seed):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        env_out = gym.make(env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=2048,
                 nminibatches=32,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2)
    model.learn(total_timesteps=num_timesteps)

    return model, env
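
Since train() returns both the model and the VecNormalize-wrapped environment, a quick post-training rollout could look roughly like this (the env id and step count are illustrative):

# Hypothetical rollout using the objects returned by train() above.
model, env = train("Hopper-v2", num_timesteps=100000, seed=0)
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)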
Code example #3
def test_lstm_train():
    """Test that LSTM models are able to achieve >=150 (out of 500) reward on CartPoleNoVelEnv.

    This environment requires memory to perform well in."""
    def make_env(i):
        env = CartPoleNoVelEnv()
        env = TimeLimit(env, max_episode_steps=500)
        env = bench.Monitor(env, None, allow_early_resets=True)
        env.seed(i)
        return env

    env = SubprocVecEnv([lambda: make_env(i) for i in range(NUM_ENVS)])
    env = VecNormalize(env)
    model = PPO2(MlpLstmPolicy, env, n_steps=128, nminibatches=NUM_ENVS, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1)

    eprewmeans = []
    def reward_callback(local, _):
        nonlocal eprewmeans
        eprewmeans.append(safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']]))

    model.learn(total_timesteps=100000, callback=reward_callback)

    # Maximum episode reward is 500.
    # In CartPole-v1, a non-recurrent policy can easily get >= 450.
    # In CartPoleNoVelEnv, a non-recurrent policy doesn't get more than ~50.
    # LSTM policies can reach above 400, but it varies a lot between runs; consistently get >=150.
    # See PR #244 for more detailed benchmarks.

    average_reward = sum(eprewmeans[-NUM_EPISODES_FOR_SCORE:]) / NUM_EPISODES_FOR_SCORE
    assert average_reward >= 150, "Mean recent reward below 150: {}".format(average_reward)
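
The test relies on a few module-level names that are not shown in this snippet. Purely as an assumption for context, they could be defined along these lines:

import numpy as np

NUM_ENVS = 4                 # assumed: number of parallel envs (the test sets nminibatches to this)
NUM_EPISODES_FOR_SCORE = 10  # assumed: how many recent per-update means to average

def safe_mean(arr):
    # mean that returns nan instead of warning on an empty episode buffer
    return np.nan if len(arr) == 0 else np.mean(arr)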
Code example #4
def _train(env_id, agent, model_params, total_steps, is_evaluation=False):
    if is_evaluation:  # evaluate_policy() must only take one environment
        envs = SubprocVecEnv([make_env(env_id)])
    else:
        envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
    envs = VecNormalize(
        envs)  # normalize the envs during training and evaluation

    # Load pretrained model during training.
    if not is_evaluation and os.path.exists(agent + '_' + env_id):
        if agent == 'ppo2':
            model = PPO2.load(agent + '_' + env_id)
        elif agent == 'a2c':
            model = A2C.load(agent + '_' + env_id)
    else:
        if agent == 'ppo2':
            model = PPO2(MlpLstmPolicy,
                         envs,
                         nminibatches=1,
                         verbose=1,
                         **model_params)
        elif agent == 'a2c':
            model = A2C(MlpLstmPolicy, envs, verbose=1, **model_params)

    model.learn(total_timesteps=total_steps)
    return envs, model
Code example #5
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        if "num_population" in args.__dict__:
            args.num_cpu = args.num_population * 2

        assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
            "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(args.num_cpu, args.env,
                                             env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        envs = [
            makeEnv(args.env,
                    args.seed,
                    i,
                    args.log_dir,
                    allow_early_resets=True,
                    env_kwargs=env_kwargs) for i in range(args.num_cpu)
        ]
        envs = SubprocVecEnv(envs)
        envs = VecFrameStack(envs, args.num_stack)
        if args.srl_model != "raw_pixels" and args.algo_type == "v2":
            envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
            envs = loadRunningAverage(envs,
                                      load_path_normalise=load_path_normalise)
        return envs
Code example #6
File: run_mujoco_trpo.py  Project: shanlior/OAL
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(
            env_id) + './OURS-LOADED/noent_klcoeffanneal_samesgdsteps' + str(
                sgd_steps) + '_longer_wgae0.95_exp1_2_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #env = make_mujoco_env(env_id, workerseed)
        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)  #, norm_reward=False, norm_obs=False)

        #env = VecNormalize(env)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=2048,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.95,
                     vf_iters=5,
                     vf_stepsize=1e-3,
                     verbose=1,
                     seed=seed,
                     sgd_steps=sgd_steps,
                     klcoeff=klcoeff,
                     method="multistep-SGD")
        model.learn(total_timesteps=10e6)  #num_timesteps, seed=seed)
        env.close()
Code example #7
def main(_):
    def make_env():
        env_out = gym.make('CartPole-v0')
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=FLAGS.n_steps,
                 nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam,
                 gamma=FLAGS.gamma,
                 noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef,
                 learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange,
                 verbose=FLAGS.verbose)
    model.learn(total_timesteps=FLAGS.num_timesteps)
Code example #8
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(
            env_id) + './SAC-M/nips_test19/m' + str(sgd_steps) + '_c' + str(
                0.5) + '_e' + str(klcoeff) + '_' + str(seed)
        #log_path = './experiments/'+str(env_id)+'./TRPO-3x/TRPOR-oldsampling/noent_klcoeff'+str(sgd_steps)+'_sgdstep_steps5_'+str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        #env = make_mujoco_env(env_id, workerseed)
        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env, norm_reward=False, norm_obs=False)

        #env = VecNormalize(env)
        model = MDPO(MlpPolicy,
                     env,
                     gamma=0.99,
                     verbose=1,
                     seed=seed,
                     buffer_size=1000000,
                     ent_coef=1.0,
                     gradient_steps=sgd_steps,
                     lam=klcoeff,
                     train_freq=1,
                     tsallis_q=1,
                     reparameterize=True,
                     klconst=0.5)
        model.learn(
            total_timesteps=int(num_timesteps))  #num_timesteps, seed=seed)
        env.close()
Code example #9
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir,
                                                      params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    obz_tensor = model.act_model.fake_input_tensor

    some_neuron = model.act_model.policy_neurons[2][-1]

    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)

    grads = list(zip(grads, obz_tensor))

    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)

    train_op = trainer.apply_gradients(grads)
    for i in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    neuron_values_list = []

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    while 1:
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)

        # neuron_values_list.append( neuron_values )
        yield neuron_values
        obs, rew, done, infos = env.step(actions)
        env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:

            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
Code example #11
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
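
One way the warm-started wrapper above might be exercised, sketched under the assumption of stable-baselines' default clip_obs=10:

import numpy as np

venv = _make_warmstart_cartpole()
norm_obs = venv.reset()
orig_obs = venv.get_original_obs()       # un-normalized observations kept by the wrapper
assert norm_obs.shape == orig_obs.shape
assert np.all(np.abs(norm_obs) <= 10.0)  # default clip_obs is 10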
Code example #12
def load_old_ppo2(root_dir, env, env_name, index, transparent_params):
    try:
        from baselines.ppo2 import ppo2 as ppo2_old
    except ImportError as e:
        msg = "{}. HINT: you need to install (OpenAI) Baselines to use old_ppo2".format(
            e)
        raise ImportError(msg)

    denv = FakeSingleSpacesVec(env, agent_id=index)
    possible_fnames = ["model.pkl", "final_model.pkl"]
    model_path = None
    for fname in possible_fnames:
        candidate_path = os.path.join(root_dir, fname)
        if os.path.exists(candidate_path):
            model_path = candidate_path
    if model_path is None:
        raise FileNotFoundError(f"Could not find model at '{root_dir}' "
                                f"under any filename '{possible_fnames}'")

    graph = tf.Graph()
    sess = tf.Session(graph=graph)
    with sess.as_default():
        with graph.as_default():
            pylog.info(f"Loading Baselines PPO2 policy from '{model_path}'")
            policy = ppo2_old.learn(
                network="mlp",
                env=denv,
                total_timesteps=1,
                seed=0,
                nminibatches=4,
                log_interval=1,
                save_interval=1,
                load_path=model_path,
            )
    stable_policy = OpenAIToStablePolicy(policy,
                                         ob_space=denv.observation_space,
                                         ac_space=denv.action_space)
    model = PolicyToModel(stable_policy)

    try:
        normalize_path = os.path.join(root_dir, "normalize.pkl")
        with open(normalize_path, "rb") as f:
            old_vec_normalize = pickle.load(f)
        vec_normalize = VecNormalize(denv, training=False)
        vec_normalize.obs_rms = old_vec_normalize.ob_rms
        vec_normalize.ret_rms = old_vec_normalize.ret_rms
        model = NormalizeModel(model, vec_normalize)
        pylog.info(f"Loaded normalization statistics from '{normalize_path}'")
    except FileNotFoundError:
        # We did not use VecNormalize during training, skip
        pass

    return model
Code example #13
    def f(root_dir, env, env_name, index, transparent_params):
        denv = FakeSingleSpacesVec(env, agent_id=index)
        pylog.info(
            f"Loading Stable Baselines policy for '{cls}' from '{root_dir}'")
        model = load_backward_compatible_model(cls, root_dir, denv)
        try:
            vec_normalize = VecNormalize(denv, training=False)
            vec_normalize.load_running_average(root_dir)
            model = NormalizeModel(model, vec_normalize)
            pylog.info(f"Loaded normalization statistics from '{root_dir}'")
        except FileNotFoundError:
            # We did not use VecNormalize during training, skip
            pass

        return model
Code example #14
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)

        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    return safe_mean([ep_info['r'] for ep_info in ep_infos])
Code example #15
def test_vec_env():
    """Test VecNormalize Object"""
    def make_env():
        return gym.make(ENV_ID)

    env = DummyVecEnv([make_env])
    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=10.,
                       clip_reward=10.)
    _, done = env.reset(), [False]
    obs = None
    while not done[0]:
        actions = [env.action_space.sample()]
        obs, _, done, _ = env.step(actions)
    assert np.max(obs) <= 10
Code example #16
def single_wrappers(single_venv, scheduler, our_idx, normalize, rew_shape,
                    rew_shape_params, victim_index, victim_path, victim_type,
                    debug, env_name, load_policy, lookback_params,
                    transparent_params, log_callbacks, save_callbacks):
    if rew_shape:
        rew_shape_venv = apply_reward_wrapper(single_env=single_venv,
                                              scheduler=scheduler,
                                              shaping_params=rew_shape_params,
                                              agent_idx=our_idx)
        log_callbacks.append(
            lambda logger, locals, globals: rew_shape_venv.log_callback(logger))
        single_venv = rew_shape_venv

        for anneal_type in ['noise', 'rew_shape']:
            if scheduler.is_conditional(anneal_type):
                scheduler.set_annealer_get_logs(anneal_type,
                                                rew_shape_venv.get_logs)

    if lookback_params['lb_num'] > 0:
        lookback_venv = LookbackRewardVecWrapper(single_venv, env_name, debug,
                                                 victim_index, victim_path,
                                                 victim_type,
                                                 transparent_params,
                                                 **lookback_params)
        single_venv = lookback_venv

    if normalize:
        normalized_venv = VecNormalize(single_venv)

        if load_policy['path'] is not None:
            if load_policy['type'] == 'zoo':
                raise ValueError(
                    "Trying to normalize twice. Bansal et al's Zoo agents normalize "
                    "implicitly. Please set normalize=False to disable VecNormalize."
                )

            normalized_venv.load_running_average(load_policy['path'])

        save_callbacks.append(
            lambda root_dir: normalized_venv.save_running_average(root_dir))
        single_venv = normalized_venv

    return single_venv
Code example #17
def main(_):

    p_dic = getattr(conf.dic.path_dic, FLAGS.env_name)

    register(id=FLAGS.env_id,
             entry_point='env.env_ep:Env',
             kwargs={
                 'env_name': FLAGS.env_name,
                 'done_step': 8760
             })

    def make_env():
        env_out = gym.make(FLAGS.env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=FLAGS.n_steps,
                 nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam,
                 gamma=FLAGS.gamma,
                 noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef,
                 learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange,
                 verbose=FLAGS.verbose,
                 log_dir=p_dic.get('agent_log_dir'))
    model.learn(total_timesteps=FLAGS.num_timesteps)
Code example #18
def single_wrappers(single_venv, scheduler, our_idx, normalize, load_policy,
                    rew_shape, rew_shape_params, log_callbacks, save_callbacks):
    if rew_shape:
        rew_shape_venv = apply_reward_wrapper(single_env=single_venv, scheduler=scheduler,
                                              shaping_params=rew_shape_params, agent_idx=our_idx)
        log_callbacks.append(lambda logger, locals, globals: rew_shape_venv.log_callback(logger))
        single_venv = rew_shape_venv

        for anneal_type in ['noise', 'rew_shape']:
            if scheduler.is_conditional(anneal_type):
                scheduler.set_annealer_get_logs(anneal_type, rew_shape_venv.get_logs)

    if normalize:
        if load_policy['type'] == 'zoo':
            raise ValueError("Trying to normalize twice. Bansal et al's Zoo agents normalize "
                             "implicitly. Please set normalize=False to disable VecNormalize.")

        normalized_venv = VecNormalize(single_venv)
        save_callbacks.append(lambda root_dir: normalized_venv.save_running_average(root_dir))
        single_venv = normalized_venv

    return single_venv
Code example #19
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv,
                             norm_obs=True,
                             norm_reward=True,
                             clip_obs=clip_obs,
                             clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
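
The save()/load() pair used here is the newer counterpart of the save_running_average()/load_running_average() calls that appear in the older examples on this page. Reusing the saved statistics at evaluation time might look roughly like the following (the path is illustrative):

from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

eval_env = DummyVecEnv([make_env])
eval_env = VecNormalize.load("vec_normalize.pkl", venv=eval_env)
eval_env.training = False      # freeze the running averages
eval_env.norm_reward = False   # report raw episode rewards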
Code example #20
def _train(env_id, model_params, total_epochs, use_sigmoid_layer=False, is_evaluation=False):
  if is_evaluation: # evaluate_policy() must only take one environment
    envs = SubprocVecEnv([make_env(env_id)])
  else:
    envs = SubprocVecEnv([make_env(env_id) for _ in range(NUM_CPU)])
  envs = VecNormalize(envs) # normalize the envs during training and evaluation

  # activation fn: use tanh for delta hedging and relu for mean reversion
  # learning rate: use 1e-7 for delta hedging and 1e-5 for mean reversion
  if use_sigmoid_layer:
    model = PPO2(SigmoidMlpPolicy, envs, n_steps=1, nminibatches=1,
                 learning_rate=lambda f: f * 1e-5, verbose=1,
                 policy_kwargs=dict(act_fun=tf.nn.relu),
                 **model_params)
  else:
    model = PPO2(MlpLstmPolicy, envs, n_steps=1, nminibatches=1,
                 learning_rate=lambda f: f * 1e-5, verbose=1,
                 policy_kwargs=dict(act_fun=tf.nn.relu),
                 **model_params)

  model.learn(total_timesteps=total_epochs * L)
  return envs, model
Code example #21
def train(num_timesteps, model_to_load):

    try:
        env = DummyVecEnv([dsgym])
        env = VecNormalize(env)
        policy = MlpPolicy
        lr = 3e-4 * 0.75

        model = PPO2(policy=policy,
                     env=env,
                     n_steps=2048,
                     nminibatches=32,
                     lam=0.95,
                     gamma=0.99,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=linear_schedule(lr),
                     cliprange=0.2)
        if model_to_load:
            env = DummyVecEnv([dsgym])
            env = VecNormalize.load(
                model_to_load.replace(".zip", "vec_normalize.pkl"), env)
            model = model.load(model_to_load)
            model.set_env(env)
            print("Loaded model from: ", model_to_load)
            model.set_learning_rate_func(linear_schedule_start_zero(lr))
        model.learn(total_timesteps=num_timesteps)
    except KeyboardInterrupt:
        print("Saving on keyinterrupt")
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        # quit
        sys.exit()
    except BaseException as error:
        model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
        print('An exception occurred: {}'.format(error))
        traceback.print_exception(*sys.exc_info())
        sys.exit()
    model.save("D:/openAi/ppo2save/" + time.strftime("%Y_%m_%d-%H_%M_%S"))
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size,
                                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
                                additional_note, result_dir, keys_to_include, metric_param, linear_top_vars_list=None,
                                linear_correlation_neuron_list=None, visualize=False, lagrangian_inds_to_include=None,
                                neurons_inds_to_include=None, use_lagrangian=True):
    trained_model = None
    if not use_lagrangian:
        with tf.variable_scope("trained_model"):
            common_arg_parser = get_common_parser()
            trained_args, cma_unknown_args = common_arg_parser.parse_known_args()
            trained_args.env = policy_env
            trained_args.seed = policy_seed
            trained_args.num_timesteps = policy_num_timesteps
            trained_args.run_num = policy_run_num
            trained_this_run_dir = get_dir_path_for_this_run(trained_args)
            trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir)
            trained_save_dir = get_save_dir(trained_this_run_dir)

            trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final")
            trained_final_params = pd.read_csv(trained_final_file, header=None).values[0]

            trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed)
            trained_model.set_pi_from_flat(trained_final_params)

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()


    this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num,
                                                    args.seed, learning_rate=learning_rate, top_num_to_include=linear_co_threshold,
                                                    result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)


    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    linear_top_vars_list_wanted_to_print = []
    if (use_lagrangian and lagrangian_inds_to_include is None) or (not use_lagrangian and neurons_inds_to_include is None):
        # note this is only linear
        if linear_top_vars_list is None or linear_correlation_neuron_list is None:

            linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
                                               eval_run_num, additional_note, metric_param=metric_param)

        lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \
            get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list, linear_correlation_neuron_list, linear_co_threshold)



    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)
    with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp:
        json.dump(linear_top_vars_list_wanted_to_print, fp)
    with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp:
        json.dump(neurons_inds_to_include, fp)


    args.env = f'{experiment_label}_{entry_point}-v1'

    if not use_lagrangian:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": None, "trained_model": trained_model,
                    "neurons_inds_to_include": neurons_inds_to_include}
        )
    else:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include, "trained_model": None,
                    "neurons_inds_to_include": None}
        )

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = visualize
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = not visualize


    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy



    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    num_dof = walker_env.robot_skeleton.ndofs
    show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir)




    # extra run info I added for my purposes
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    policy_kwargs = {"net_arch" : [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2, optimizer='adam', policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
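
The policy_kwargs mechanism used above is plain stable-baselines; stripped of the project-specific environment registration and the forked optimizer/seed arguments, the network-size handling reduces to roughly this sketch (env id and sizes are illustrative):

from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy

network_size = 64
layers = [network_size, network_size]
policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
model = PPO2(MlpPolicy, "CartPole-v1", policy_kwargs=policy_kwargs, verbose=1)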
Code example #23
def single_wrappers(
    single_venv,
    scheduler,
    our_idx,
    normalize,
    normalize_observations,
    rew_shape,
    rew_shape_params,
    embed_index,
    embed_paths,
    embed_types,
    debug,
    env_name,
    load_policy,
    lookback_params,
    transparent_params,
    log_callbacks,
    save_callbacks,
):
    if rew_shape:
        rew_shape_venv = apply_reward_wrapper(
            single_env=single_venv,
            scheduler=scheduler,
            shaping_params=rew_shape_params,
            agent_idx=our_idx,
        )
        log_callbacks.append(LoggerOnlyLogCallback(rew_shape_venv))
        single_venv = rew_shape_venv

        for anneal_type in ["noise", "rew_shape"]:
            if scheduler.is_conditional(anneal_type):
                scheduler.set_annealer_get_logs(anneal_type,
                                                rew_shape_venv.get_logs)

    if lookback_params["lb_num"] > 0:
        if len(embed_types) > 1:
            raise ValueError(
                "Lookback is not supported with multiple embedded agents")
        embed_path = embed_paths[0]
        embed_type = embed_types[0]
        lookback_venv = LookbackRewardVecWrapper(
            single_venv,
            env_name,
            debug,
            embed_index,
            embed_path,
            embed_type,
            transparent_params,
            **lookback_params,
        )
        single_venv = lookback_venv

    if normalize:
        if normalize_observations:
            if load_policy["path"] is not None:
                if load_policy["type"] == "zoo":
                    raise ValueError(
                        "Trying to normalize twice. Bansal et al's Zoo agents normalize "
                        "implicitly. Please set normalize=False to disable VecNormalize."
                    )
            normalized_venv = VecNormalize(single_venv)
        else:
            normalized_venv = VecNormalize(single_venv, norm_obs=False)

        if load_policy["path"] is not None and load_policy["type"] != "zoo":
            normalized_venv.load_running_average(load_policy["path"])

        save_callbacks.append(lambda root_dir: normalized_venv.save(
            os.path.join(root_dir, "vec_normalize.pkl")))
        single_venv = normalized_venv

    return single_venv
Code example #24
def train(args):
    """
    Runs the test
    """
    args, argv = mujoco_arg_parser().parse_known_args(args)
    logger.log(f"#######TRAIN: {args}")
    args.alg = "ppo2"

    this_run_dir = get_dir_path_for_this_run(args)
    if os.path.exists(this_run_dir):
        import shutil
        shutil.rmtree(this_run_dir)
    os.makedirs(this_run_dir)

    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    logger.configure(log_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env.envs[0].env.env.disableViewer = True
    set_global_seeds(args.seed)
    env.envs[0].env.env.seed(args.seed)

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # extra run info I added for my purposes

    full_param_traj_dir_path = get_full_params_dir(this_run_dir)

    if os.path.exists(full_param_traj_dir_path):
        import shutil
        shutil.rmtree(full_param_traj_dir_path)
    os.makedirs(full_param_traj_dir_path)

    if os.path.exists(save_dir):
        import shutil
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path,
        "state_samples_to_collect": args.state_samples_to_collect
    }

    model = PPO2(policy=policy,
                 env=env,
                 n_steps=args.n_steps,
                 nminibatches=args.nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=3e-4,
                 cliprange=0.2,
                 optimizer=args.optimizer,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
Code example #25
def visualize_policy_and_collect_COM(seed, run_num, policy_env,
                                     policy_num_timesteps, policy_seed,
                                     policy_run_num):

    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    args.env = policy_env
    args.seed = policy_seed
    args.num_timesteps = policy_num_timesteps
    args.run_num = policy_run_num
    this_run_dir = get_dir_path_for_this_run(args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = False

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        env_out.seed(seed)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2", seed=seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)

    obs[:] = env.reset()
    plot_dir = get_plot_dir(policy_env=args.env,
                            policy_num_timesteps=policy_num_timesteps,
                            policy_run_num=policy_run_num,
                            policy_seed=policy_seed,
                            eval_seed=seed,
                            eval_run_num=run_num,
                            additional_note="")
    if os.path.exists(plot_dir):
        shutil.rmtree(plot_dir)
    os.makedirs(plot_dir)
    env = VecVideoRecorder(env,
                           plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000,
                           name_prefix="3000000agent-{}".format(args.env))

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))]
                             for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])
    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [
        np.hstack(layer_list) for layer_list in raw_layer_values_list
    ][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")

        plt.savefig(f"{plot_dir}/COM{i}.jpg")
        plt.close()
def visualize_augment_experiment(augment_num_timesteps,
                                 top_num_to_include_slice,
                                 augment_seed,
                                 augment_run_num,
                                 network_size,
                                 policy_env,
                                 policy_num_timesteps,
                                 policy_run_num,
                                 policy_seed,
                                 eval_seed,
                                 eval_run_num,
                                 learning_rate,
                                 additional_note,
                                 result_dir,
                                 lagrangian_inds_to_include=None):

    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")
    # non_linear_global_dict
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        raise NotImplementedError()

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    # note this is only linear
    if lagrangian_inds_to_include is None:
        linear_top_vars_list = read_linear_top_var(policy_env,
                                                   policy_num_timesteps,
                                                   policy_run_num, policy_seed,
                                                   eval_seed, eval_run_num,
                                                   additional_note)

        # keys_to_include = ["COM", "M", "Coriolis", "total_contact_forces_contact_bodynode",
        #                    "com_jacobian", "contact_bodynode_jacobian"]
        keys_to_include = ["COM", "M", "Coriolis", "com_jacobian"]
        # lagrangian_inds_to_include = linear_top_vars_list[top_num_to_include_slice]
        lagrangian_inds_to_include = get_wanted_lagrangians(
            keys_to_include, linear_top_vars_list, top_num_to_include_slice)

    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include})

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = True

    if args.normalize:
        env = VecNormalize(env)
    policy = MlpPolicy

    # extra run info I added for my purposes
    run_info = {
        "run_num": args.run_num,
        "env_id": args.env,
        "full_param_traj_dir_path": full_param_traj_dir_path
    }

    layers = [network_size, network_size]
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=4096,
                 nminibatches=64,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=10,
                 ent_coef=0.0,
                 learning_rate=learning_rate,
                 cliprange=0.2,
                 optimizer='adam',
                 policy_kwargs=policy_kwargs,
                 seed=args.seed)
    model.tell_run_info(run_info)

    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)

    return log_dir
def visualize_policy_and_collect_COM(
        augment_num_timesteps, top_num_to_include_slice, augment_seed,
        augment_run_num, network_size, policy_env, policy_num_timesteps,
        policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
        additional_note, metric_param):
    result_dir = get_result_dir(policy_env, policy_num_timesteps,
                                policy_run_num, policy_seed, eval_seed,
                                eval_run_num, additional_note, metric_param)
    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######VISUALIZE: {args}")
    # non_linear_global_dict
    linear_global_dict, non_linear_global_dict, lagrangian_values, input_values, layers_values, all_weights = read_all_data(
        policy_env,
        policy_num_timesteps,
        policy_run_num,
        policy_seed,
        eval_seed,
        eval_run_num,
        additional_note=additional_note)
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size,
        metric_param=metric_param)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    aug_plot_dir = get_aug_plot_dir(this_run_dir) + "_vis"

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={
                 'linear_global_dict': linear_global_dict,
                 'non_linear_global_dict': non_linear_global_dict,
                 'top_to_include_slice': top_num_to_include_slice,
                 'aug_plot_dir': aug_plot_dir,
                 "lagrangian_values": lagrangian_values,
                 "layers_values": layers_values
             })

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env

    walker_env.disableViewer = False

    if args.normalize:
        env = VecNormalize(env)

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    model = PPO2.load(f"{save_dir}/ppo2", seed=augment_seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)

    obs[:] = env.reset()

    env = VecVideoRecorder(env,
                           aug_plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000,
                           name_prefix="vis_this_policy")

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))]
                             for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])
    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [
        np.hstack(layer_list) for layer_list in raw_layer_values_list
    ][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")

        plt.savefig(f"{aug_plot_dir}/COM{i}.jpg")
        plt.close()
def make_env(env_id, rank, seed=0):
    """
    Utility function for a multiprocessed env.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == '__main__':

    log_str = sys.argv[1]

    env_id = 'ROAMHandGraspCube-v1'

    model = PPO2.load("logs/{}/trained_model".format(log_str))

    # render trained agent
    env = VecNormalize(DummyVecEnv([lambda: gym.make(env_id)]),
                       norm_reward=False)
    env.load_running_average("logs/{}".format(log_str))

    obs = env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
Code example #29
def main(_):

    p_dic = getattr(conf.dic.path_dic, FLAGS.env_name)

    register(id=FLAGS.env_id,
             entry_point='env.env_ep:Env',
             kwargs={
                 'env_name': FLAGS.env_name,
                 'done_step': 8760
             })

    def make_env():
        env_out = gym.make(FLAGS.env_id)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[FLAGS.policy]
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=FLAGS.n_steps,
                 nminibatches=FLAGS.nminibatches,
                 lam=FLAGS.lam,
                 gamma=FLAGS.gamma,
                 noptepochs=FLAGS.noptepochs,
                 ent_coef=FLAGS.ent_coef,
                 learning_rate=FLAGS.learning_rate,
                 cliprange=FLAGS.cliprange,
                 verbose=FLAGS.verbose)

    with model.graph.as_default():
        tf_util.load_state(fname=tf.train.latest_checkpoint(
            p_dic.get('agent_log_dir')),
                           sess=model.sess)

    epenv = EnergyPlusEnv(
        energyplus_file="/usr/local/EnergyPlus-8-8-0/energyplus",
        model_file=p_dic.get('idf_path'),
        weather_file=
        "/usr/local/EnergyPlus-8-8-0/WeatherData/USA_IL_Chicago-OHare.Intl.AP.725300_TMY3.epw",
        log_dir=p_dic.get('eplog_dir'))

    os.environ['ENERGYPLUS'] = "/usr/local/EnergyPlus-8-8-0/energyplus"
    os.environ['ENERGYPLUS_MODEL'] = p_dic.get('idf_path')
    os.environ[
        'ENERGYPLUS_WEATHER'] = "/usr/local/EnergyPlus-8-8-0/WeatherData/USA_IL_Chicago-OHare.Intl.AP.725300_TMY3.epw"
    os.environ['ENERGYPLUS_LOG'] = p_dic.get('eplog_dir')

    epenv.start_instance()

    def signal_handler(signal, frame):
        epenv.stop_instance()
        print('=====Energy plus terminated=====')
        print('==========Pipe closed==========')
        sys.exit()

    signal.signal(signal.SIGINT, signal_handler)

    state = epenv.reset()
    env = Env('ep', 8760)
    for i in range(10000000000000):
        # state = np.array([[state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7], state[8]]])
        state = np.array(
            [[state[0], state[1], state[2], state[3], state[4], state[5]]])
        action, _, _, _ = model.step(state)
        action = env.set_action(action)
        action = action.reshape([-1])
        state, done = epenv.step(action)
    epenv.stop_instance()
Code example #30
File: victim_train.py  Project: psuwuxian/rl_attack
    env_path = args.surrogate_model
    mimic_model_path = args.mimic_model_path

    env = gym.make(env_name)
    venv = SubprocVecEnv([
        lambda: make_adv_multi2single_env(env_name, adv_agent_path,
                                          adv_agent_norm_path, False)
        for i in range(n_cpu)
    ])
    venv = Monitor(venv, 0)

    rew_shape_venv = apply_reward_wrapper(single_env=venv,
                                          scheduler=scheduler,
                                          agent_idx=0,
                                          shaping_params=rew_shape_params)
    venv = VecNormalize(rew_shape_venv, norm_obs=False)
    # makedir output
    out_dir, logger = setup_logger(args.root_dir, args.exp_name)
    model = MyPPO2(MlpPolicy,
                   venv,
                   ent_coef=ent_coef,
                   nminibatches=nminibatches,
                   noptepochs=noptepochs,
                   learning_rate=learning_rate,
                   verbose=1,
                   n_steps=n_steps,
                   gamma=gamma,
                   tensorboard_log=out_dir,
                   model_saved_loc=out_dir,
                   env_name=env_name,
                   env_path=env_path,