Example #1
def main():

    # Parse command line args
    parser = arg_parser()
    parser.add_argument("-hw", "--use-hardware", action="store_true")
    parser.add_argument("-l", "--load", type=str, default=None)
    args = parser.parse_args()

    env = "QubeSwingupEnv"

    def make_env():
        env_out = QubeSwingupEnv(use_simulator=not args.use_hardware,
                                 frequency=250)
        return env_out

    try:
        env = DummyVecEnv([make_env])

        policy = MlpPolicy
        model = PPO2(policy=policy, env=env)
        model.load_parameters(args.load)

        print("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:], reward, done, _ = env.step(actions)
            if not args.use_hardware:
                env.render()
            if done:
                print("done")
                obs[:] = env.reset()
    finally:
        env.close()
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    neuron_values_list = []

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    while True:
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)

        # neuron_values_list.append( neuron_values )
        yield neuron_values
        obs, rew, done, infos = env.step(actions)
        env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:

            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
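
Note: these snippets are excerpts and omit their import blocks. A minimal preamble that would make the stable-baselines calls above resolve, assuming the 2.x API these examples target, is sketched below; project-specific pieces such as QubeSwingupEnv, arg_parser, safe_mean, model.step_with_neurons and model.give_neuron_values come from the original repositories and are not part of stable-baselines itself.

import gym
import numpy as np

from stable_baselines import PPO2, bench, logger
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize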
Example #3
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
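
A warm-started wrapper like this is normally consumed right away. A minimal usage sketch, assuming the same stable-baselines VecNormalize API used above:

venv = _make_warmstart_cartpole()
obs = venv.reset()                   # observations are scaled by the warmed-up running statistics
raw_obs = venv.get_original_obs()    # the unnormalized observations remain accessible
assert obs.shape == raw_obs.shape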
Example #4
def test(env_id, seed, policy):
    """
    Run a trained PPO2 model in an Atari-style environment, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """

    # if 'lstm' in policy:
    #     print('LSTM policies not supported for drawing')
    #     return 1
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # needed for LSTM policies
    # else:
    #     env = PadEnvRender()

    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0

        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
            if done:
                print('Episode reward:', episode_rew)
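
PadEnvRender is specific to the original project. For a standard Atari game, the equivalent frame-stacked setup is usually built with the stable-baselines helper instead; a sketch, assuming stable-baselines 2.x and an illustrative environment id:

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

env = make_atari_env('BreakoutNoFrameskip-v4', num_env=1, seed=0)
env = VecFrameStack(env, n_stack=4)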
Example #5
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert model.action_probability(obs).shape == (1, 10), \
        "Error: action_probability not returning correct shape"
    assert np.prod(model.action_probability(obs, actions=env.action_space.sample()).shape) == 1, \
        "Error: not scalar probability"
Example #6
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)

        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    return safe_mean([ep_info['r'] for ep_info in ep_infos])
Example #7
def run_model(save_name,
              nw_type,
              log_dir='./Logs/',
              log_name=None,
              env_name='CartPole-v2',
              runs=100,
              save_results=False):
    # Sets up an environment and a model:
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = load_model(nw_type=nw_type,
                       log_dir=log_dir,
                       env_name=env_name,
                       log_name=log_name,
                       save_name=save_name)

    # Runs environment with the loaded model "runs" times
    max_reward = 0
    max_steps = 0
    rew_vec = []

    header = 'theta1,alpha1,dtheta1,dalpha1,theta2,alpha2,dtheta2,dalpha2'

    for i in range(runs):
        # Resets the environment
        obs, done = env.reset(), False
        episode_rew = 0
        ep_steps = 0
        obs_vec = obs.reshape(-1, 1)
        # This loop runs the environment until a terminal state is reached
        while not done:
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            env.render()
            episode_rew += rewards[-1]
            ep_steps += 1
            obs_vec = np.append(obs_vec,
                                obs.reshape(-1, 1) * 180 / np.pi,
                                axis=1)

        # Saves the reached reward and checks if its a record etc.
        rew_vec.append(episode_rew)
        print("Ep reward: ", '{0:.2f}'.format(episode_rew), '\tRecord: ',
              '{0:.2f}'.format(max_reward), '\tEp steps: ', ep_steps,
              '\tSteps record: ', max_steps)
        np.savetxt('rew_vec.csv', rew_vec, delimiter=',')
        if episode_rew > max_reward:
            max_reward = episode_rew
            if save_results:
                np.savetxt('obs_vec.csv',
                           obs_vec.T,
                           delimiter=',',
                           header=header,
                           fmt='%1.3f',
                           comments='')
        if ep_steps > max_steps:
            max_steps = ep_steps
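
The CSV files written above can be read back with numpy for later plotting; a small sketch using the same filenames (obs_vec.csv needs its header row skipped because it was written with comments=''):

rew_vec = np.loadtxt('rew_vec.csv', delimiter=',')
obs_vec = np.loadtxt('obs_vec.csv', delimiter=',', skiprows=1)  # skip the header row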
Example #8
def test_vec_env():
    """Test VecNormalize Object"""
    def make_env():
        return gym.make(ENV_ID)

    env = DummyVecEnv([make_env])
    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=10.,
                       clip_reward=10.)
    _, done = env.reset(), [False]
    obs = None
    while not done[0]:
        actions = [env.action_space.sample()]
        obs, _, done, _ = env.step(actions)
    assert np.max(obs) <= 10
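
The same construction can also be used to check reward clipping. An illustrative companion check, not part of the original suite, reusing the module-level ENV_ID assumed above:

def test_reward_clipping():
    """Illustrative check: normalized rewards stay within clip_reward."""
    env = DummyVecEnv([lambda: gym.make(ENV_ID)])
    env = VecNormalize(env, norm_obs=True, norm_reward=True,
                       clip_obs=10., clip_reward=10.)
    env.reset()
    done = [False]
    rewards = []
    while not done[0]:
        _, rew, done, _ = env.step([env.action_space.sample()])
        rewards.append(rew[0])
    assert np.max(np.abs(rewards)) <= 10.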
Example #9
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

    model = model_func(env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
Example #10
def test_identity_multibinary(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    with a multibinary action space

    :param model_class: (BaseRLModel) A RL Model
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

    model = model_class("MlpPolicy", env)
    model.learn(total_timesteps=1000, seed=0)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
Example #11
def test_identity(learn_func):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)

    :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])

    model = learn_func(env)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
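
A concrete learn_func for this test can be a thin lambda around any compatible model; for instance, a sketch assuming PPO2 with the default MlpPolicy (learn() returns the model itself, and the timestep budget here is purely illustrative):

from stable_baselines import PPO2

test_identity(lambda env: PPO2(policy="MlpPolicy", env=env).learn(total_timesteps=1000, seed=0))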
Example #12
    def test(model):
        env = DummyVecEnv([make_env] * n_env)
        #env = VecNormalize.load("models/machine_snap_env.bin", venv=env)
        #env.training = False
        for trial in range(1):
            obs = env.reset()
            running_reward = 0.0
            alpha = 0.01

            for _ in range(5000):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                reward = reward[0]
                done = done[0]
                info = info[0]
                #running_reward = running_reward * (1-alpha) + alpha * reward
                running_reward += reward
                #print(obs, reward, done, info, running_reward)
                if done:
                    print("Finished after {} timesteps".format(_ + 1))
                    break
                else:
                    env.envs[0].render()
Example #13
def test_model_manipulation(model_class):
    """
    Test that the algorithm can be saved and loaded without any issues, that
    environment switching works, and that action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = gym.make(ENV_ID)
        env = DummyVecEnv([lambda: env])

        # create and train
        model = model_class(policy=MlpPolicy, env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS)

        # predict and measure the acc reward
        acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = gym.make(ENV_ID)
        env = DummyVecEnv([lambda: env])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <5% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.05, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=int(NUM_TIMESTEPS / 2))

        # validate no reset post learning
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <5% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.05, \
            "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #14
def test_model_manipulation(model_policy):
    """
    Test that the algorithm (with a given policy) can be saved and loaded without any issues,
    that environment switching works, and that action prediction works

    :param model_policy: (BaseRLModel, Object) A model, policy pair
    """
    model_class, policy = model_policy

    try:
        env = DummyVecEnv([lambda: IdentityEnv(10)])

        # check the env is deterministic
        action = [env.action_space.sample()]
        set_global_seeds(0)
        obs = env.step(action)[0]
        for _ in range(N_TRIALS):
            set_global_seeds(0)
            assert obs == env.step(action)[0], "Error: environment tested not deterministic with the same seed"

        # create and train
        model = model_class(policy=policy, env=env)
        model.learn(total_timesteps=50000)

        # predict and measure the acc reward
        acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnv(10)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "loading and saving"

        # learn post loading
        model.learn(total_timesteps=1000)

        # validate no reset post learning
        loaded_acc_reward = 0
        obs = env.reset()
        set_global_seeds(0)
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        assert abs(acc_reward - loaded_acc_reward) < 0.1, "Error: the prediction seems to have changed between " \
                                                          "pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #15
def main():
    """
    Runs the test
    """
    parser = mujoco_arg_parser()
    parser.add_argument(
        '--model-path',
        default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model")
    parser.add_argument('--images', default=False)
    args = parser.parse_args()

    logger.configure()
    if not args.play:
        model, env = train(args.env,
                           num_timesteps=args.num_timesteps,
                           seed=args.seed,
                           model_path=args.model_path,
                           images=args.images)

    if args.play:

        def make_env():
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=False,  # not needed since not using pixel obs
                    has_renderer=True,  # make sure we can render to the screen
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control should happen fast enough so that simulation looks smooth
                ))
            env_out.reward_range = None
            env_out.metadata = None
            env_out.spec = None
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            return env_out

        #env = make_env()
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        policy = MlpPolicy
        #model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
        #         optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3)
        model.load_parameters(args.model_path)  # load the saved weights in place; TRPO.load() would return a new model instead
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            env.render()
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
Example #16
def main():

    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)

        return env_out

    env = DummyVecEnv([make_env])

    # env_out = gym.make(args.env)
    # env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    if args.normalize:
        env = VecNormalize(env)
    # policy = MlpPolicy
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)

    obs[:] = env.reset()

    # env = VecVideoRecorder(env, "./",
    #                            record_video_trigger=lambda x: x == 0, video_length=3000,
    #                            name_prefix="3000000agent-{}".format(args.env))

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    layer_values_list = [[neuron_value.reshape((-1, 1))]
                         for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            layer_values_list[i].append(layer.reshape((-1, 1)))

        fill_contacts_jac_dict(infos[0]["contacts"],
                               contact_dict=contact_values,
                               neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    for contact_body_name, l in contact_values.items():
        body_contact_dict = contact_values[contact_body_name]
        for name, l in body_contact_dict.items():
            body_contact_dict[name] = np.hstack(body_contact_dict[name])

    layer_values_list = [
        np.hstack(layer_list) for layer_list in layer_values_list
    ][1:-2]  # drop the input layer and the variance outputs

    # plt.scatter(lagrangian_values["M"][15], layer_values_list[1][2])
    # plt.scatter(lagrangian_values["M"][11], layer_values_list[0][63])
    out_dir = f"/home/panda-linux/PycharmProjects/low_dim_update_dart/low_dim_update_stable/neuron_vis/plots_{args.env}_{args.num_timesteps}"
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    all_weights = model.get_all_weight_values()

    for ind, weights in enumerate(all_weights):
        fname = f"{out_dir}/weights_layer_{ind}.txt"
        np.savetxt(fname, weights)

    PLOT_CUTOFF = steps_to_first_done
    plot_everything(lagrangian_values, layer_values_list, out_dir, PLOT_CUTOFF)
    scatter_the_linear_significant_ones(lagrangian_values,
                                        layer_values_list,
                                        threshold=0.6,
                                        out_dir=out_dir)
    scatter_the_nonlinear_significant_but_not_linear_ones(
        lagrangian_values,
        layer_values_list,
        linear_threshold=0.3,
        nonlinear_threshold=0.6,
        out_dir=out_dir)
    #
    # contact_dicts = {}
    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #
    #
    #     contact_dicts[contact_body_name] = {}
    #
    #     build_dict = contact_dicts[contact_body_name]
    #
    #     build_dict["body"] = {}
    #     build_dict["layer"] = {}
    #     for name, l in body_contact_dict.items():
    #         for i in range(len(l)):
    #
    #             if name == contact_body_name:
    #                 build_dict["body"][f"{contact_body_name}_{i}"] = l[i]
    #             else:
    #                 build_dict["layer"][f"layer_{name}_neuron_{i}"] = l[i]
    #
    #     body_contact_df = pd.DataFrame.from_dict(build_dict["body"], "index")
    #     layer_contact_df = pd.DataFrame.from_dict(build_dict["layer"], "index")

    # body_contact_df.to_csv(f"{data_dir}/{contact_body_name}_contact.txt", sep='\t')
    # layer_contact_df.to_csv(f"{data_dir}/{contact_body_name}_layers.txt", sep='\t')

    # #TO CSV format
    # data_dir = f"/home/panda-linux/PycharmProjects/low_dim_update_dart/mictools/examples/neuron_vis_data{args.env}_time_steps_{args.num_timesteps}"
    # if os.path.exists(data_dir):
    #     shutil.rmtree(data_dir)
    #
    # os.makedirs(data_dir)
    #
    # for contact_body_name, d in contact_dicts.items():
    #
    #     build_dict = d
    #
    #     body_contact_df = pd.DataFrame.from_dict(build_dict["body"], "index")
    #     layer_contact_df = pd.DataFrame.from_dict(build_dict["layer"], "index")
    #
    #     body_contact_df.to_csv(f"{data_dir}/{contact_body_name}_contact.txt", sep='\t')
    #     layer_contact_df.to_csv(f"{data_dir}/{contact_body_name}_layers.txt", sep='\t')
    #
    #
    #
    # neurons_dict = {}
    # for layer_index in range(len(layer_values_list)):
    #     for neuron_index in range(len(layer_values_list[layer_index])):
    #         neurons_dict[f"layer_{layer_index}_neuron_{neuron_index}"] = layer_values_list[layer_index][neuron_index]
    #
    # for i in range(len(lagrangian_values["COM"])):
    #     neurons_dict[f"COM_index_{i}"] = lagrangian_values["COM"][i]
    #
    # neuron_df = pd.DataFrame.from_dict(neurons_dict, "index")
    #
    #
    #
    # lagrangian_dict = {}
    # for k,v in lagrangian_values.items():
    #     for i in range(len(v)):
    #         lagrangian_dict[f"{k}_index_{i}"] = v[i]
    #
    # lagrangian_df = pd.DataFrame.from_dict(lagrangian_dict, "index")
    #
    #
    # neuron_df.to_csv(f"{data_dir}/neurons.txt", sep='\t')
    # lagrangian_df.to_csv(f"{data_dir}/lagrangian.txt", sep='\t')

    # cor = {}
    # best_cor = {}
    # cor["M"] = get_correlations(lagrangian_values["M"], layer_values_list)
    # best_cor["M"] = [np.max(np.abs(cor_m)) for cor_m in cor["M"]]
    #
    #
    # cor["COM"] = get_correlations(lagrangian_values["COM"], layer_values_list)
    # best_cor["COM"] = [np.max(np.abs(cor_m)) for cor_m in cor["COM"]]
    #
    # cor["Coriolis"] = get_correlations(lagrangian_values["Coriolis"], layer_values_list)
    # best_cor["Coriolis"] = [np.max(np.abs(cor_m)) for cor_m in cor["Coriolis"]]
    # best_cor["Coriolis_argmax"] = [np.argmax(np.abs(cor_m)) for cor_m in cor["Coriolis"]]
    #
    #
    #
    #
    # ncor = {}
    # nbest_cor = {}
    # ncor["M"] = get_normalized_correlations(lagrangian_values["M"], layer_values_list)
    # nbest_cor["M"] = [np.max(np.abs(cor_m)) for cor_m in ncor["M"]]
    #
    #
    # ncor["COM"] = get_normalized_correlations(lagrangian_values["COM"], layer_values_list)
    # nbest_cor["COM"] = [np.max(np.abs(cor_m)) for cor_m in ncor["COM"]]
    #
    # ncor["Coriolis"] = get_normalized_correlations(lagrangian_values["Coriolis"], layer_values_list)
    # nbest_cor["Coriolis"] = [np.max(np.abs(cor_m)) for cor_m in ncor["Coriolis"]]
    # nbest_cor["Coriolis_argmax"] = [np.argmax(np.abs(cor_m)) for cor_m in ncor["Coriolis"]]
    #
    #
    #
    #
    #
    # lin_reg = {"perm_1":{}, "perm_2":{}}
    # best_lin_reg = {"perm_1":{}, "perm_2":{}}
    # lin_reg["perm_1"]["M"], best_lin_reg["perm_1"]["M"] = get_results("M", lagrangian_values, layer_values_list, perm_num=1)
    # lin_reg["perm_2"]["M"], best_lin_reg["perm_2"]["M"] = get_results("M", lagrangian_values, layer_values_list, perm_num=2)
    # lin_reg["perm_1"]["COM"], best_lin_reg["perm_1"]["COM"] = get_results("COM", lagrangian_values, layer_values_list, perm_num=1)
    # lin_reg["perm_2"]["COM"], best_lin_reg["perm_2"]["COM"] = get_results("COM", lagrangian_values, layer_values_list, perm_num=2)

    #
    #
    # lin_reg_1["M"] = get_linear_regressions_1_perm(lagrangian_values["M"], layer_values_list)
    # lin_reg_2["M"] = get_linear_regressions_2_perm(lagrangian_values["M"], layer_values_list)
    # best_lin_reg_2["M"] = []
    # for lin_l in lin_reg_2["M"]:
    #     if lin_l == []:
    #         best_lin_reg_2["M"].append([])
    #     else:
    #         best_lin_reg_2["M"].append(lin_l[np.argmin(lin_l[:,0])])
    #
    # best_lin_reg_1["M"] = []
    # for lin_l in lin_reg_1["M"]:
    #     if lin_l == []:
    #         best_lin_reg_1["M"].append([])
    #     else:
    #         best_lin_reg_1["M"].append(lin_l[np.argmin(lin_l[:,0])])
    # best_lin_reg_1["M"] = np.array(best_lin_reg_1["M"])
    # best_lin_reg_2["M"] = np.array(best_lin_reg_2["M"])
    #
    #
    # lin_reg_1["M"].dump("lin_reg_1_M.txt")
    # lin_reg_2["M"].dump("lin_reg_2_M.txt")
    # best_lin_reg_1["M"].dump("best_lin_reg_1_M.txt")
    # best_lin_reg_2["M"].dump("best_lin_reg_2_M.txt")
    #
    # lin_reg_1["COM"] = get_linear_regressions_1_perm(lagrangian_values["COM"], layer_values_list)
    # lin_reg_2["COM"] = get_linear_regressions_2_perm(lagrangian_values["COM"], layer_values_list)
    # best_lin_reg_2["COM"] = []
    # for lin_l in lin_reg_2["COM"]:
    #     if lin_l == []:
    #         best_lin_reg_2["COM"].append([])
    #     else:
    #         best_lin_reg_2["COM"].append(lin_l[np.argmin(lin_l[:, 0])])
    #
    # best_lin_reg_1["COM"] = []
    # for lin_l in lin_reg_1["COM"]:
    #     if lin_l == []:
    #         best_lin_reg_1["COM"].append([])
    #     else:
    #         best_lin_reg_1["COM"].append(lin_l[np.argmin(lin_l[:, 0])])
    #
    #
    # best_lin_reg_1["COM"] = np.array(best_lin_reg_1["M"])
    # best_lin_reg_2["COM"] = np.array(best_lin_reg_2["M"])
    # lin_reg_1["COM"].dump("lin_reg_1_COM.txt")
    # lin_reg_2["COM"].dump("lin_reg_2_COM.txt")
    # best_lin_reg_1["COM"].dump("best_lin_reg_1_COM.txt")
    # best_lin_reg_2["COM"].dump("best_lin_reg_2_COM.txt")

    pass
def visualize_policy_and_collect_COM(
        augment_num_timesteps, top_num_to_include_slice, augment_seed,
        augment_run_num, network_size, policy_env, policy_num_timesteps,
        policy_run_num, policy_seed, eval_seed, eval_run_num, learning_rate,
        additional_note, metric_param):
    result_dir = get_result_dir(policy_env, policy_num_timesteps,
                                policy_run_num, policy_seed, eval_seed,
                                eval_run_num, additional_note, metric_param)
    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######VISUALIZE: {args}")
    # non_linear_global_dict
    linear_global_dict, non_linear_global_dict, lagrangian_values, input_values, layers_values, all_weights = read_all_data(
        policy_env,
        policy_num_timesteps,
        policy_run_num,
        policy_seed,
        eval_seed,
        eval_run_num,
        additional_note=additional_note)
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'

    this_run_dir = get_experiment_path_for_this_run(
        entry_point,
        args.num_timesteps,
        args.run_num,
        args.seed,
        learning_rate=learning_rate,
        top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir,
        network_size=network_size,
        metric_param=metric_param)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    aug_plot_dir = get_aug_plot_dir(this_run_dir) + "_vis"

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    args.env = f'{experiment_label}_{entry_point}-v1'
    register(id=args.env,
             entry_point=entry_point,
             max_episode_steps=1000,
             kwargs={
                 'linear_global_dict': linear_global_dict,
                 'non_linear_global_dict': non_linear_global_dict,
                 'top_to_include_slice': top_num_to_include_slice,
                 'aug_plot_dir': aug_plot_dir,
                 "lagrangian_values": lagrangian_values,
                 "layers_values": layers_values
             })

    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env

    walker_env.disableViewer = False

    if args.normalize:
        env = VecNormalize(env)

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    model = PPO2.load(f"{save_dir}/ppo2", seed=augment_seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)

    obs[:] = env.reset()

    env = VecVideoRecorder(env,
                           aug_plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000,
                           name_prefix="vis_this_policy")

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))]
                             for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])
    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [
        np.hstack(layer_list) for layer_list in raw_layer_values_list
    ][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")

        plt.savefig(f"{aug_plot_dir}/COM{i}.jpg")
        plt.close()
Example #18
    plt.setp(lines, linewidth=1.0, alpha=0.8)
    plt.xlabel('update')
    plt.ylabel('rewards')
    plt.show()
    pass


if __name__ == '__main__':
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=Warning)

    log_path = os.path.join('/', 'home', 'user', 'Dropbox', 'MATLAB_dropbox',
                            'DeepMimic', 'log')
    run_id = 'run_' + '10071436'
    run_file = run_id + '_simpleHumanoid.zip'

    plot_reward_portions(os.path.join(log_path, run_id, 'reward_portions.txt'))

    envs = DummyVecEnv([make_env(1)])
    model = PPO2.load(os.path.join(log_path, run_file), envs)

    obs = envs.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = envs.step(action)
        envs.render()
        time.sleep(1 / 30)

    pass
Example #19
def eval_trained_policy_and_collect_data(eval_seed, eval_run_num, policy_env, policy_num_timesteps, policy_seed, policy_run_num, additional_note):


    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    args.env = policy_env
    args.seed = policy_seed
    args.num_timesteps = policy_num_timesteps
    args.run_num = policy_run_num
    this_run_dir = get_dir_path_for_this_run(args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)



    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]


    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        env_out.seed(eval_seed)
        return env_out
    env = DummyVecEnv([make_env])
    running_env = env.envs[0].env.env


    set_global_seeds(eval_seed)
    running_env.seed(eval_seed)

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2", seed=eval_seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    # is it necessary?
    running_env = env.venv.envs[0].env.env


    lagrangian_values = {}

    obs = np.zeros((env.num_envs,) + env.observation_space.shape)

    obs[:] = env.reset()

    # env = VecVideoRecorder(env, "./",
    #                            record_video_trigger=lambda x: x == 0, video_length=3000,
    #                            name_prefix="3000000agent-{}".format(args.env))

    #init lagrangian values
    for lagrangian_key in lagrangian_keys:
        flat_array = running_env.get_lagrangian_flat_array(lagrangian_key)
        lagrangian_values[lagrangian_key] = [flat_array]


    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1,1))] for neuron_value in neuron_values]

    # env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False
    for _ in range(30000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1


        neuron_values = model.give_neuron_values(obs)


        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1,1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        # filling lagrangian values
        for lagrangian_key in lagrangian_keys:
            flat_array = running_env.get_lagrangian_flat_array(lagrangian_key)
            lagrangian_values[lagrangian_key].append(flat_array)

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()


    #Hstack into a big matrix
    for lagrangian_key in lagrangian_keys:
        lagrangian_values[lagrangian_key] = np.hstack(lagrangian_values[lagrangian_key])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])
    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [np.hstack(layer_list) for layer_list in raw_layer_values_list][1:-2]  # drop variance and inputs


    data_dir = get_data_dir(policy_env=args.env, policy_num_timesteps=policy_num_timesteps, policy_run_num=policy_run_num
                            , policy_seed=policy_seed, eval_seed=eval_seed, eval_run_num=eval_run_num, additional_note=additional_note)
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)


    lagrangian_values_fn = f"{data_dir}/lagrangian.pickle"

    with open(lagrangian_values_fn, 'wb') as handle:
        pickle.dump(lagrangian_values, handle, protocol=pickle.HIGHEST_PROTOCOL)

    input_values_fn = f"{data_dir}/input_values.npy"
    layers_values_fn = f"{data_dir}/layer_values.npy"

    np.save(input_values_fn, input_values)
    np.save(layers_values_fn, layers_values)


    all_weights = model.get_all_weight_values()

    for ind, weights in enumerate(all_weights):
        fname = f"{data_dir}/weights_layer_{ind}.txt"
        np.savetxt(fname, weights)
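
For later analysis, the dumped artifacts can be read back with the matching loaders; a short sketch using the paths written above (layer_values.npy holds an object array of per-layer matrices, hence allow_pickle=True):

with open(f"{data_dir}/lagrangian.pickle", 'rb') as handle:
    lagrangian_values = pickle.load(handle)

input_values = np.load(f"{data_dir}/input_values.npy")
layers_values = np.load(f"{data_dir}/layer_values.npy", allow_pickle=True)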
Example #20
def visualize_policy_and_collect_COM(seed, run_num, policy_env,
                                     policy_num_timesteps, policy_seed,
                                     policy_run_num):

    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    args.env = policy_env
    args.seed = policy_seed
    args.num_timesteps = policy_num_timesteps
    args.run_num = policy_run_num
    this_run_dir = get_dir_path_for_this_run(args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    final_file = get_full_param_traj_file_path(traj_params_dir_name,
                                               "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = False

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        env_out.seed(seed)
        return env_out

    env = DummyVecEnv([make_env])

    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2", seed=seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)

    obs[:] = env.reset()
    plot_dir = get_plot_dir(policy_env=args.env,
                            policy_num_timesteps=policy_num_timesteps,
                            policy_run_num=policy_run_num,
                            policy_seed=policy_seed,
                            eval_seed=seed,
                            eval_run_num=run_num,
                            additional_note="")
    if os.path.exists(plot_dir):
        shutil.rmtree(plot_dir)
    os.makedirs(plot_dir)
    env = VecVideoRecorder(env,
                           plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000,
                           name_prefix="3000000agent-{}".format(args.env))

    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))]
                             for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])
    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [
        np.hstack(layer_list) for layer_list in raw_layer_values_list
    ][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")

        plt.savefig(f"{plot_dir}/COM{i}.jpg")
        plt.close()