import time

from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv


def main(control="gym-like"):
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    motorsIds = []
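    # "gym-like" feeds env.step() a 3-component action [dx, dy, dyaw] that the env
    # scales internally; any other mode feeds env.step2() the raw 5-component
    # action [dx, dy, dz, dyaw, fingerAngle].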
    if control == "gym-like":
        dv = 1
        motorsIds.append(env._p.addUserDebugParameter("posX", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("posY", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("yaw", -dv, dv, 0))
    else:
        dv = 0.01
        motorsIds.append(env._p.addUserDebugParameter("posX", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("posY", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("posZ", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("yaw", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("fingerAngle", 0, 0.3, 0.3))

    done = False
    i = 0
    while not done:
        time.sleep(0.01)
        action = []
        for motorId in motorsIds:
            action.append(env._p.readUserDebugParameter(motorId))
        if control == "gym-like":
            state, reward, done, info = env.step(action)
        else:
            state, reward, done, info = env.step2(action)
        obs = env.getExtendedObservation()
        print(i)
        print(f"Action: {action}")
        print(f"Observation: {obs}")
        i += 1
Code Example #2
def main():

  environment = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)

  motorsIds = []
  #motorsIds.append(environment._p.addUserDebugParameter("posX",0.4,0.75,0.537))
  #motorsIds.append(environment._p.addUserDebugParameter("posY",-.22,.3,0.0))
  #motorsIds.append(environment._p.addUserDebugParameter("posZ",0.1,1,0.2))
  #motorsIds.append(environment._p.addUserDebugParameter("yaw",-3.14,3.14,0))
  #motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))

  dv = 0.01
  motorsIds.append(environment._p.addUserDebugParameter("posX", -dv, dv, 0))
  motorsIds.append(environment._p.addUserDebugParameter("posY", -dv, dv, 0))
  motorsIds.append(environment._p.addUserDebugParameter("posZ", -dv, dv, 0))
  motorsIds.append(environment._p.addUserDebugParameter("yaw", -dv, dv, 0))
  motorsIds.append(environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))

  done = False
  while (not done):

    action = []
    for motorId in motorsIds:
      action.append(environment._p.readUserDebugParameter(motorId))

    state, reward, done, info = environment.step2(action)
    obs = environment.getExtendedObservation()
Code Example #4
def main():

    environment = KukaGymEnv(renders=True)

    motorsIds = []
    #motorsIds.append(environment._p.addUserDebugParameter("posX",0.4,0.75,0.537))
    #motorsIds.append(environment._p.addUserDebugParameter("posY",-.22,.3,0.0))
    #motorsIds.append(environment._p.addUserDebugParameter("posZ",0.1,1,0.2))
    #motorsIds.append(environment._p.addUserDebugParameter("yaw",-3.14,3.14,0))
    #motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))

    dv = 0.001
    motorsIds.append(environment._p.addUserDebugParameter("posX", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("posY", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter(
        "posZ", -dv, dv, -dv))
    motorsIds.append(environment._p.addUserDebugParameter("yaw", -dv, dv, 0))
    motorsIds.append(
        environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))

    done = False
    while (not done):

        action = []
        for motorId in motorsIds:
            action.append(environment._p.readUserDebugParameter(motorId))

        state, reward, done, info = environment.step2(action)
        obs = environment.getExtendedObservation()
Code Example #5
File: test_world.py  Project: thias15/CausalWorld
    def test_timing_profile(self):
        from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
        import time

        kuka_env = KukaGymEnv(renders=False,
                              isDiscrete=False)  # operates at 240 HZ
        task = generate_task(task_generator_id="pushing")
        causal_rl_env = CausalWorld(
            task=task,
            enable_visualization=False,
            seed=0,
            skip_frame=10,
            normalize_actions=False,
            normalize_observations=False)  # operates at 250 HZ
        start = time.time()
        kuka_env.reset()
        end = time.time()
        kuka_reset_time = end - start

        start = time.time()
        causal_rl_env.reset()
        end = time.time()
        causal_rl_reset_time = end - start

        self.assertLess(causal_rl_reset_time, kuka_reset_time * 1.25)

        start = time.time()
        kuka_env.step(kuka_env.action_space.sample())
        end = time.time()
        kuka_step_time = end - start

        start = time.time()
        causal_rl_env.step(causal_rl_env.action_space.sample())
        end = time.time()
        causal_rl_step_time = end - start
        print("time 1", causal_rl_step_time)
        print("time 2", kuka_step_time)
        self.assertLess(causal_rl_step_time, kuka_step_time * 10)

        start = time.time()
        kuka_env.render()
        end = time.time()
        kuka_render_time = end - start

        start = time.time()
        causal_rl_env.render()
        end = time.time()
        causal_rl_render_time = end - start
        self.assertLess(causal_rl_render_time, kuka_render_time * 1.25)

        causal_rl_env.close()
        kuka_env.close()
        return
Code Example #6
def main():
    
    env = KukaGymEnv(renders=True)
    act = deepq.load("kuka_model.pkl")
    print(act)
    while True:
        obs, done = env.reset(), False
        print("===================================")        
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render()
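            # act(obs[None]) adds a batch dimension; [0] takes the single returned action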
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Code Example #7
File: kuka.py  Project: vvanirudh/TigerControl
    def initialize(self, render=False):
        self.initialized = True
        self._env = KukaGymEnv(renders=render)
        self.observation_space = self._env.observation_space.shape
        self.action_space = self._env.action_space.shape
        self.state = {}
        initial_obs = self.reset()
        return initial_obs
Code Example #8
def main_usingEnvOnly():
    environment = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    randomObjs = add_random_objs_to_scene(10)
    motorsIds = []
    motorsIds.append(
        environment._p.addUserDebugParameter("posX", 0.4, 0.75, 0.537))
    motorsIds.append(
        environment._p.addUserDebugParameter("posY", -.22, .3, 0.0))
    motorsIds.append(environment._p.addUserDebugParameter("posZ", 0.1, 1, 0.2))
    motorsIds.append(
        environment._p.addUserDebugParameter("yaw", -3.14, 3.14, 0))
    motorsIds.append(
        environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))

    # dv = 0.01
    # motorsIds.append(environment._p.addUserDebugParameter("posX",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("posY",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("posZ",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("yaw",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))

    done = False
    #According to the hand-eye coordination paper, the camera is mounted over the shoulder of the arm
    camEyePos = [0.03, 0.236, 0.54]
    targetPos = [0.640000, 0.075000, -0.190000]
    cameraUp = [0, 0, 1]
    viewMat = p.computeViewMatrix(camEyePos, targetPos, cameraUp)
    camInfo = p.getDebugVisualizerCamera()
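    # index 3 of getDebugVisualizerCamera() is the GUI's current projection matrix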
    projMatrix = camInfo[3]
    # viewMat = [-0.5120397806167603, 0.7171027660369873, -0.47284144163131714, 0.0, -0.8589617609977722, -0.42747554183006287, 0.28186774253845215, 0.0, 0.0, 0.5504802465438843, 0.8348482847213745, 0.0, 0.1925382763147354, -0.24935829639434814, -0.4401884973049164, 1.0]
    # projMatrix = [0.75, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, -1.0000200271606445, -1.0, 0.0, 0.0, -0.02000020071864128, 0.0]
    img_arr = p.getCameraImage(640,
                               512,
                               viewMatrix=viewMat,
                               projectionMatrix=projMatrix)  #640*512*3
    # write_from_imgarr(img_arr, 1)

    while (not done):
        action = []
        for motorId in motorsIds:
            action.append(environment._p.readUserDebugParameter(motorId))
        state, reward, done, info = environment.step2(action)
        obs = environment.getExtendedObservation()
Code Example #9
def main():

    env = KukaGymEnv(renders=False, isDiscrete=True)
    model = deepq.models.mlp([64])
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=10000000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    print("Saving model to kuka_model.pkl")
    act.save("kuka_model.pkl")
Code Example #10
def main():
  	
    env = KukaGymEnv(renders=False, isDiscrete=True)
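    # this deepq.learn call takes a network spec via network=..., not a prebuilt q_func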
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=64, num_layers=1),
        lr=1e-3,
        max_timesteps=100000000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to kuka_model.pkl")
    act.save("kuka_model.pkl")
Code Example #11
def main():

    env = KukaGymEnv(renders=True, isDiscrete=True)
    act = deepq.load("kuka_model.pkl")
    print(act)
    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Code Example #12
def main():

    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)

    device = torch.device("cuda")
    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0], N_ATOMS, Vmin,
                               Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)
    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(env,
                                                      agent,
                                                      gamma=GAMMA,
                                                      steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(exp_source,
                                               buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                #print("populate")
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                #print(rewards_steps)
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < 100:
                    continue
                batch = buffer.sample(BATCH_SIZE)
                #print("infer")
                states_v, actions_v, rewards_v, dones_mask, last_states_v = common.unpack_batch_ddqn(
                    batch, device)
                #print("train critic")# train critic
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(
                    last_states_v, last_act_v),
                                         dim=1)
                proj_distr_v = distr_projection(last_distr_v,
                                                rewards_v,
                                                dones_mask,
                                                gamma=GAMMA**REWARD_STEPS,
                                                device=device)
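                # critic loss: cross-entropy between the predicted atom distribution
                # and the projected Bellman target distribution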
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)
                #print("train actor")
                # train actor
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)
                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" %
                          (time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" %
                                  (best_reward, rewards))
                            name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                            fname = os.path.join(save_path, name)
                            torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
Code Example #13
from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv

max_step = 1000
episode_count = 0
max_episodes = 50

env = KukaGymEnv(renders=True)

observation = env.reset()
while episode_count < max_episodes:
    env.render()
    action = env.action_space.sample()  # your agent here (this takes random actions)
    print(action)
    observation, reward, done, info = env.step(action)
    # Reset environment if done
    if done:
        observation = env.reset()
        episode_count += 1
env.close()
Code Example #14
#add parent dir to find package. Only needed for source code build, pip install doesn't need it.
import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0,parentdir)

from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
import time


environment = KukaGymEnv(renders=True)

  
motorsIds=[]
#motorsIds.append(environment._p.addUserDebugParameter("posX",0.4,0.75,0.537))
#motorsIds.append(environment._p.addUserDebugParameter("posY",-.22,.3,0.0))
#motorsIds.append(environment._p.addUserDebugParameter("posZ",0.1,1,0.2))
#motorsIds.append(environment._p.addUserDebugParameter("yaw",-3.14,3.14,0))
#motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))

dv = 0.001
motorsIds.append(environment._p.addUserDebugParameter("posX",-dv,dv,0))
motorsIds.append(environment._p.addUserDebugParameter("posY",-dv,dv,0))
motorsIds.append(environment._p.addUserDebugParameter("posZ",-dv,dv,-dv))
motorsIds.append(environment._p.addUserDebugParameter("yaw",-dv,dv,0))
motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))

done = False
while (not done):

  # read the slider values and apply them via step2
  action = []
  for motorId in motorsIds:
    action.append(environment._p.readUserDebugParameter(motorId))

  state, reward, done, info = environment.step2(action)
  obs = environment.getExtendedObservation()
Code Example #15
config['trainBatchSize'] = 32
config['gamma'] = 0.9
config['learningRate'] = 0.001
config['netGradClip'] = 1
config['logFlag'] = True
config['logFileName'] = 'StabilizerOneDLog/traj'
config['logFrequency'] = 1000
config['priorityMemoryOption'] = False
config['netUpdateOption'] = 'doubleQ'
config['netUpdateFrequency'] = 1
config['priorityMemory_absErrUpper'] = 5

import gym
from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv

env = KukaGymEnv(renders=True, isDiscrete=True)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n
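# network sizes come straight from the env: observation vector length and discrete action count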

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])
Code Example #17
File: aux_bullet_env.py  Project: contactrika/bulb
class AuxBulletEnv(gym.Env, AuxEnv):
    PTCLOUD_BOX_SIZE = 2.0  # 2m cube cut off for point cloud observations

    def __init__(self,
                 base_env_name,
                 env_v=0,
                 obs_resolution=64,
                 obs_ptcloud=False,
                 random_colors=False,
                 debug=False,
                 visualize=False):
        self._base_env_name = base_env_name
        self._random_colors = random_colors
        self._mobile = False
        if base_env_name.startswith(
            ('Hopper', 'Walker2D', 'Ant', 'Humanoid', 'HalfCheetah',
             'InvertedPendulumSwingup')):
            self._mobile = True  # update base pos when rendering
        # If we created a bullet sim env from scratch, would init sim as:
        # conn_mode = pybullet.GUI if visualize else pybullet.DIRECT
        # self._p = bullet_client.BulletClient(connection_mode=conn_mode)
        # But all bullet gym envs init bullet client in constructor or reset.
        # Kuka creates bullet client in the constructor, so need to call
        # its constructor explicitly. All other envs derive from classes in
        # pybullet_envs.env_bases, which init bullet client in reset().
        # We set the corresponding render flag for them after their creation.
        if 'Kuka' in base_env_name:
            max_episode_steps = 1000
            self._env = KukaGymEnv(renders=visualize,
                                   maxSteps=max_episode_steps)
            self._env._max_episode_steps = max_episode_steps
            # Appropriate camera vals from KukaCamGymEnv
            self._base_pos = [0.52, -0.2, -0.33]
            self._env.unwrapped._cam_dist = 1.3
            self._env.unwrapped._cam_yaw = 180
            self._env.unwrapped._cam_pitch = -41
        else:
            self._env = gym.make(base_env_name + 'BulletEnv-v' + str(env_v))
            max_episode_steps = self._env._max_episode_steps
            if 'CartPole' in base_env_name:
                self._env.render(mode='rgb_array')  # init cam dist vars
            if visualize: self._env.render(mode='human')  # turn on debug viz
        super(AuxBulletEnv, self).__init__(max_episode_steps, obs_resolution,
                                           obs_ptcloud, debug, visualize)
        # Zoom in camera.
        assert (hasattr(self._env.unwrapped, '_cam_dist'))
        if base_env_name.startswith('Reacher'):
            self._env.unwrapped._cam_dist = 0.55
            self._env.unwrapped._cam_pitch = -89  # -90 doesn't look ok on debug
        elif base_env_name.startswith('InvertedPendulum'):
            self._env.unwrapped._cam_dist = 2.2  # was: 1.8
        elif base_env_name.startswith('InvertedDoublePendulum'):
            self._env.unwrapped._cam_dist = 2.8
            self._env.unwrapped._cam_pitch = -1.0
            self._part_nms = ['cart', 'pole', 'pole2']
        elif base_env_name.startswith(('Hopper', 'Walker2D')):
            self._env.unwrapped._cam_dist = 1.5
            self._env.unwrapped._cam_pitch = -40
        elif base_env_name.startswith('Humanoid'):
            self._env.unwrapped._cam_dist = 1.8
            self._env.unwrapped._cam_pitch = -40
        self._env.reset()  # load URDF files, init _p sim pointer
        self._sim = self._env.unwrapped._p  # easy access to bullet client
        # Set reasonable colors and env defaults.
        if 'CartPole' in base_env_name:
            cartpole = self._env.unwrapped.cartpole
            self._sim.changeVisualShape(cartpole, 0, rgbaColor=(1, 1, 0, 1))
            self._sim.changeVisualShape(cartpole, 1, rgbaColor=(0, 0, 1, 1))
        # Compute camera info. This has to be done after self._env.reset()
        self._view_mat, self._proj_mat, base_pos = self.compute_cam_vals()
        # Specify camera objects for point cloud processing.
        self._cam_object_ids = None
        if self.obs_ptcloud:
            assert (self._base_env_name == 'CartPole')
            self._cam_object_ids = [self._env.unwrapped.cartpole]
            # Note: PyBullet envs with XML(MJCF)-based robots seem to have
            # issues with reporting body ids and names correctly.
            #for tmpi in range(self._sim.getNumBodies()):
            #    robot_name, scene_name = self._sim.getBodyInfo(tmpi)
            #    robot_name = robot_name.decode("utf8")
            #    body_id = self._sim.getBodyUniqueId(tmpi)
            #    print('body id:', body_id, robot_name, scene_name)
        # Specify observation and action spaces.
        self._ld_names = self.compute_low_dim_state_names()
        self._ld_names_ids_dict = \
            {k: v for v, k in enumerate(self.low_dim_state_names)}
        if obs_resolution is None:
            self.observation_space = self._env.observation_space  # low dim
        elif self.obs_ptcloud:
            state_sz = obs_resolution * 3  # 3D points in point cloud
            self.observation_space = gym.spaces.Box(
                -1.0 * AuxBulletEnv.PTCLOUD_BOX_SIZE * np.ones(state_sz),
                AuxBulletEnv.PTCLOUD_BOX_SIZE * np.ones(state_sz))
        else:  # RGB images
            self.observation_space = gym.spaces.Box(
                low=0.0,
                high=1.0,
                shape=[3, obs_resolution, obs_resolution],  # HxWxRGB
                dtype=np.float32)
        self.action_space = self._env.action_space
        # Turn on visualization if needed.
        if visualize:
            self._sim.resetDebugVisualizerCamera(
                self._env.unwrapped._cam_dist, self._env.unwrapped._cam_yaw,
                self._env.unwrapped._cam_pitch, base_pos)
            if self.obs_resolution is not None:
                pybullet.configureDebugVisualizer(pybullet.COV_ENABLE_GUI, 1)
                pybullet.configureDebugVisualizer(
                    pybullet.COV_ENABLE_RGB_BUFFER_PREVIEW, 1)
                pybullet.configureDebugVisualizer(
                    pybullet.COV_ENABLE_DEPTH_BUFFER_PREVIEW, 1)
                pybullet.configureDebugVisualizer(
                    pybullet.COV_ENABLE_SEGMENTATION_MARK_PREVIEW, 1)
        if self.debug:
            print('Created Aux', base_env_name, 'with observation_space',
                  self.observation_space, 'action_space', self.action_space,
                  'max_episode_steps', self.max_episode_steps)

    @property
    def low_dim_state_space(self):
        return self._env.observation_space

    @property
    def low_dim_state_names(self):
        return self._ld_names

    def seed(self, seed):
        np.random.seed(seed)

    def close(self):
        self._env.unwrapped.close()

    def reset(self):
        self.reset_aggregators()
        if self._random_colors: self.reset_colors()
        state = self._env.reset()
        if self.obs_resolution is None:  # low dim state
            obs = np.clip(state, self.observation_space.low,
                          self.observation_space.high)
        else:  # pixel or ptcloud state
            obs = self.render_obs()
        # For mobile envs: could look at the robot from different angles.
        #if self._mobile:
        #    self._env.unwrapped._cam_yaw = (np.random.rand()-0.5)*360
        if self.visualize and self._mobile:
            self._sim.resetDebugVisualizerCamera(
                self._env.unwrapped._cam_dist, self._env.unwrapped._cam_yaw,
                self._env.unwrapped._cam_pitch, self.get_base_pos())
        return obs

    def step(self, action):
        state, rwd, done, info = self._env.step(action)  # apply action
        if self.obs_resolution is None:
            obs = np.clip(state, self.observation_space.low,
                          self.observation_space.high)
        else:
            obs = self.render_obs()
        done, more_info = self.update_aggregators(rwd, done)
        info.update(more_info)
        if self.debug:  # print low-dim state
            msg = f'pid {os.getpid():d} step {self.stepnum:d} act '
            if isinstance(action, np.ndarray):
                msg += np.array2string(
                    action,
                    precision=2,
                    formatter={'float_kind': '{:0.2f}'.format},
                )
            else:
                msg += str(action)
            for i in range(len(self.low_dim_state_names)):
                msg += f' {self.low_dim_state_names[i]:s} {state[i]:0.4f}'
            print(msg)
        #if self.debug:
        # sanity check: aux should be same as low-dim state reported by env
        # except for envs that modify state after applying action (Hopper).
        #low_dim_state = self.compute_low_dim_state()
        #if np.abs(state - low_dim_state).sum()>1e-6:
        #    print('        state', state)
        #    print('low_dim_state', low_dim_state)
        #    assert(False)
        if self.visualize and self._mobile:
            self._sim.resetDebugVisualizerCamera(
                self._env.unwrapped._cam_dist, self._env.unwrapped._cam_yaw,
                self._env.unwrapped._cam_pitch, self.get_base_pos())
        return obs, rwd, done, info

    def render(self, mode='rgb_array'):
        pass  #  done automatically with PyBullet visualization

    def render_obs(self, override_resolution=None, debug_out_dir=None):
        if self._mobile and not self.obs_ptcloud:
            self._view_mat, self._proj_mat, _ = self.compute_cam_vals()
        obs = render_utils.render_obs(
            self._sim, self.stepnum, self.obs_resolution,
            override_resolution, self._env.unwrapped._cam_dist,
            self.get_base_pos(), self._env.unwrapped._cam_pitch,
            self._env.unwrapped._cam_yaw, self.obs_ptcloud,
            self._cam_object_ids, self._base_env_name,
            AuxBulletEnv.PTCLOUD_BOX_SIZE, debug_out_dir, self._view_mat,
            self._proj_mat)
        return obs

    def normalize_reward(self, rwd):
        # This could be a useful function for algorithms that expect episode
        # rewards to be closer to a small range, like [0,1] or [-1,1].
        #
        # Rewards in [0,1] for CartPole, InvertedPendulum
        # Rewards close to [-?,10] for InvertedDoublePendulum
        # (bonus for 'alive', penalty for dist from vertical)
        # Rewards close to [-1,1] for InvertedPendulumSwingup
        #
        # Locomotion envs have several non-trivial components contributing to
        # the reward. It still might make sense to divide by total number of
        # steps, but the resulting range is not that simple to compute.
        #
        return rwd / self.max_episode_steps

    def compute_low_dim_state_names(self):
        if 'CartPole' in self._base_env_name:
            nms = ['theta', 'theta_vel', 'x', 'x_vel']
        elif 'InvertedPendulum' in self._base_env_name:
            nms = ['x', 'x_vel', 'theta_cos', 'theta_sin', 'theta_vel']
        elif 'InvertedDoublePendulum' in self._base_env_name:
            nms = [
                'x', 'x_vel', 'elbow_x', 'theta_cos', 'theta_sin', 'theta_vel',
                'theta1_cos', 'theta1_sin', 'theta1_vel'
            ]
        elif self._base_env_name.startswith(
            ('Hopper', 'Walker', 'HalfCheetah', 'Ant', 'Humanoid')):
            # Note: _vel are scaled by 0.3 in these envs to be in ~[-1,1];
            # the reported state is clipped to [-5,5]
            nms = [
                'z_delta', 'tgt_theta_sin', 'tgt_theta_cos', 'x_vel', 'y_vel',
                'z_vel', 'body_roll', 'body_pitch'
            ]
            njoints = len(self._env.unwrapped.robot.ordered_joints)
            for j in range(njoints):
                pfx = 'j' + str(j)
                nms.append(pfx + '_pos')
                nms.append(pfx + '_vel')
            nfeet = len(self._env.unwrapped.robot.foot_list)
            for ft in range(nfeet):
                nms.append('ft' + str(ft) + '_contact')
        elif 'Reacher' in self._base_env_name:
            nms = [
                'tgt_x', 'tgt_y', 'to_tgt_vec_x', 'to_tgt_vec_y', 'theta_cos',
                'theta_sin', 'theta_vel', 'theta1', 'theta1_vel'
            ]
        elif 'Kuka' in self._base_env_name:
            nms = [
                'grp_x', 'grp_y', 'grp_z', 'grp_r', 'grp_p', 'grp_y',
                'blk_rel_x', 'blk_rel_y', 'blk_rel_eulz'
            ]
        else:
            assert (False), f'Unknown BulletEnv {self._base_env_name:s}'
        return nms

    def compute_low_dim_state(self):
        # Get low-dim state. Since there is no unified interface for bullet
        # gym envs to get the low-dim state, here handle special cases.
        # Code comes from step() method of the underlying bullet gym envs.
        # Note: some envs modify state within step() call, so this function
        # is only useful for getting initial low dim state (e.g. after reset).
        if 'CartPole' in self._base_env_name:
            # state = theta, theta_dot, x, x_dot
            cartpole = self._env.unwrapped.cartpole
            state = list(self._sim.getJointState(cartpole, 1)[0:2])
            state.extend(self._sim.getJointState(cartpole, 0)[0:2])
        elif self._base_env_name.startswith('Kuka'):
            # Kuka state = grp_pos, grp_euler, block_rel_pos
            # pos,euler are 3D; block_rel_pos is XYEulZ:
            # relative x,y position and euler angle of block in gripper space
            # Racecar state = ball_rel_x, ball_rel_y
            # Racecar env is a emulation of the MIT RC Racecar: reward is based
            # on distance to the randomly placed ball; observations are ball
            # position (x,y) in camera frame; camera follows car's body frame.
            state = self._env.unwrapped.getExtendedObservation()
        else:  # envs in pybullet_envs.gym_*_envs.py use calc_state()
            state = self._env.unwrapped.robot.calc_state()
        return np.array(state)

    def compute_cam_vals(self):
        base_pos = self.get_base_pos()
        view_matrix = self._sim.computeViewMatrixFromYawPitchRoll(
            cameraTargetPosition=base_pos,
            distance=self._env.unwrapped._cam_dist,
            yaw=self._env.unwrapped._cam_yaw,
            pitch=self._env.unwrapped._cam_pitch,
            roll=0,
            upAxisIndex=2)
        proj_matrix = self._sim.computeProjectionMatrixFOV(fov=60,
                                                           aspect=1.0,
                                                           nearVal=0.1,
                                                           farVal=100.0)
        return view_matrix, proj_matrix, base_pos

    def get_base_pos(self):
        pos = [0, 0, 0]
        if hasattr(self, '_base_pos'): return self._base_pos
        env = self._env.unwrapped
        if hasattr(env, 'robot'):
            if hasattr(env.robot,
                       'body_xyz'):  # for pybullet_envs/env_bases.py
                pos = env.robot.body_xyz
            elif hasattr(env.robot, 'slider'):  # for pendulum envs
                x, _ = env.robot.slider.current_position()
                pos = [x, 0, 0]
        return pos

    def reset_colors(self):
        # Randomly select a color scheme.
        base_clrs = [(1, 1, 0, 1), (0, 1, 1, 1), (1, 0, 1, 1), (0, 0, 1, 1)]
        pole_clrs = [(0, 0, 1, 1), (0, 0, 0, 1), (0, 1, 0, 1), (1, 0, 0, 1)]
        pole2_clrs = [(1, 0, 0, 1), (0, 0, 1, 1), (1, 1, 0, 1), (0, 0, 0, 1)]
        pole3_clrs = [(0.8, 0.8, 0.8, 1), (0.7, 0, 0.7, 1), (0.7, 0, 0, 1),
                      (0, 0.8, 0.8, 1)]
        clr_scheme_id = np.random.randint(len(base_clrs))
        base_clr = base_clrs[clr_scheme_id]
        pole_clr = pole_clrs[clr_scheme_id]
        pole2_clr = pole2_clrs[clr_scheme_id]
        pole3_clr = pole3_clrs[clr_scheme_id]
        #print('base_clrs', base_clrs, 'pole_clr', pole_clr)
        if 'CartPole' in self._base_env_name:
            cartpole = self._env.unwrapped.cartpole
            self._sim.changeVisualShape(cartpole, 0, rgbaColor=base_clr)
            self._sim.changeVisualShape(cartpole, 1, rgbaColor=pole_clr)
        elif 'Pendulum' in self._base_env_name:
            base = self._env.unwrapped.robot.parts["cart"]
            pole = self._env.unwrapped.robot.parts["pole"]
            self._sim.changeVisualShape(base.bodies[base.bodyIndex],
                                        base.bodyPartIndex,
                                        rgbaColor=base_clr)
            self._sim.changeVisualShape(pole.bodies[pole.bodyIndex],
                                        pole.bodyPartIndex,
                                        rgbaColor=pole_clr)
            if 'Double' in self._base_env_name:
                pole2 = self._env.unwrapped.robot.parts["pole2"]
                #print('pole2_clr', pole2_clr)
                self._sim.changeVisualShape(pole2.bodies[pole2.bodyIndex],
                                            pole2.bodyPartIndex,
                                            rgbaColor=pole2_clr)
        elif self._base_env_name.startswith(
            ('Walker2D', 'HalfCheetah', 'Ant')):
            if self._base_env_name.startswith('Walker2D'):
                part_nms = [
                    'torso', 'thigh', 'leg', 'foot', 'thigh_left', 'leg_left',
                    'foot_left'
                ]
            elif self._base_env_name.startswith('HalfCheetah'):
                part_nms = [
                    'torso', 'bthigh', 'bshin', 'bfoot', 'fthigh', 'fshin',
                    'ffoot'
                ]
            elif self._base_env_name.startswith('Ant'):
                part_nms = [
                    'torso', 'front_left_foot', 'front_right_foot',
                    'left_back_foot', 'right_back_foot'
                ]
            else:
                assert (False)  # clrs for env not specified
            clrs = [
                pole_clr, pole2_clr, base_clr, pole_clr, base_clr, pole2_clr,
                pole3_clr
            ]
            for i, part_nm in enumerate(part_nms):
                part = self._env.unwrapped.robot.parts[part_nm]
                self._sim.changeVisualShape(part.bodies[part.bodyIndex],
                                            part.bodyPartIndex,
                                            rgbaColor=clrs[i])
        #    pitches = [-60, -35, -40, -20]; yaws = [-180, -20, 120, 270]
        #    self._env.unwrapped._cam_pitch = pitches[clr_scheme_id]
        #    self._env.unwrapped._cam_yaw = yaws[clr_scheme_id]

    def override_state(self, low_dim_state):
        # TODO: add support for more envs in the future.
        assert ('CartPole' in self._base_env_name
                or 'Pendulum' in self._base_env_name)
        unwrp_env = self._env.unwrapped
        ids_dict = self._ld_names_ids_dict
        assert (len(ids_dict.keys()) == len(low_dim_state))
        if self._base_env_name.startswith('CartPole'):
            self._sim.resetJointState(  # set pole pos,vel
                unwrp_env.cartpole, 1, low_dim_state[ids_dict['theta']],
                low_dim_state[ids_dict['theta_vel']])
            self._sim.resetJointState(  # set cart pos,vel
                unwrp_env.cartpole, 0, low_dim_state[ids_dict['x']],
                low_dim_state[ids_dict['x_vel']])
        elif 'Pendulum' in self._base_env_name:
            unwrp_env.robot.slider.reset_current_position(
                low_dim_state[ids_dict['x']], low_dim_state[ids_dict['x_vel']])
            theta = np.arctan2(low_dim_state[ids_dict['theta_sin']],
                               low_dim_state[ids_dict['theta_cos']])
            unwrp_env.robot.j1.reset_current_position(
                theta, low_dim_state[ids_dict['theta_vel']])
            if 'Double' in self._base_env_name:
                theta1 = np.arctan2(low_dim_state[ids_dict['theta1_sin']],
                                    low_dim_state[ids_dict['theta1_cos']])
                unwrp_env.robot.j2.reset_current_position(
                    theta1, low_dim_state[ids_dict['theta1_vel']])
        unwrp_env.state = self.compute_low_dim_state()
        return None
Code Example #18
class PPO:
    def __init__(self, hyperparameters):
        """
        Args:
            hyperparameters (dict): a dictionary of hyperparameters
        Returns:
            None
        """
        # Extract hyperparameters
        self.lr = hyperparameters['learning_rate']
        self.discount = hyperparameters['discount_rate']
        self.num_batch_transitions = hyperparameters['num_batch_transitions']
        self.state_dim = hyperparameters['state_dim']
        self.action_dim = hyperparameters['action_dim']
        self.total_train_steps = hyperparameters['total_train_steps']
        self.max_episode_length = hyperparameters['max_episode_length']
        self.num_train_epochs = hyperparameters['num_train_epochs']
        self.device = hyperparameters['device']
        self.clip = hyperparameters.get('clip', 0.2)  # PPO clipping epsilon, used in train()
        # Initialize actor/critic networks
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim, self.action_dim)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.lr)
        # Initialize replay buffer and environment
        self.replay_buffer = ReplayBuffer(self.num_batch_transitions)  # holds one batch of transitions
        self.environment = KukaGymEnv(renders=True)

    def sample_policy(self, total_time_steps=None):
        """
        Uses the policy to interact with the environment and fills the replay buffer
        
        Args:
            total_time_steps (int): the minimum total number of time steps to sample
        """
        if total_time_steps is None:
            total_time_steps = self.num_batch_transitions
        # Sample until at least have total_time_steps samples
        num_samples = 0
        while num_samples < total_time_steps:
            done = False
            cur_state = self.environment.reset()
            # Run full episodes until we get enough samples
            for i in range(self.max_episode_length):
                # Sample and perform action from the policy
                action, log_prob = self.actor.sample_action()
                next_state, reward, done, info = self.environment.step(action)
                # Store transition in replay buffer
                num_samples += 1
                self.replay_buffer.append(cur_state, action, reward,
                                          next_state, done, log_prob)
                cur_state = next_state
                if done:
                    break

    def rewards_to_go(self, sampled_rewards, sampled_dones, normalize=False):
        """
        Computes discounted returns for each episode from the rewards with the option of normalizing them 

        Args:
            sampled_rewards (Iterable[Float]): an iterable containing the rewards
            sampled_dones (Iterable[Bool]): an iterable indicating whether each time step was terminal
            normalize (Bool): whether to normalize the returns
        """

        discounted_returns = []
        disc_return = 0
        # Compute discounted return for each episode
        for reward, done in zip(reversed(sampled_rewards),
                                reversed(sampled_dones)):
            if done:
                disc_return = 0
            disc_return = reward + self.discount * disc_return
            discounted_returns.insert(
                0, disc_return)  # TODO: Try to append and then reverse

        discounted_returns = torch.tensor(discounted_returns,
                                          dtype=torch.float32).to(self.device)
        # Normalize discounted_returns
        if normalize:
            discounted_returns = (discounted_returns -
                                  discounted_returns.mean()) / (
                                      discounted_returns.std() + 1e-8)
        return discounted_returns

    def train(self):
        """
        Trains the actor and critic networks using PPO
        """
        print("Training PPO for {} steps".format(self.total_train_steps))
        num_time_steps = 0
        num_train_iterations = 0

        while num_time_steps < self.total_train_steps:
            # Sample policy for transitions
            self.sample_policy()
            state, action, reward, next_state, done, action_log_prob = self.sample_transitions(
                return_type="torch_tensor")
            # Compute advantage and value
            returns = self.rewards_to_go(reward, done)
            value = self.critic(state, action)
            advantage = returns - value.detach()
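            # detach() keeps critic gradients out of the advantage estimate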
            # Normalize advantage
            advantage = (advantage - advantage.mean()) / (advantage.std() +
                                                          1e-10)
            # Update actor and critic networks
            for train_epoch in range(self.num_train_epochs):
                # Compute value and action prob ratio
                value = self.critic(
                    state, action
                )  # TODO: See if you can just reuse value outside the for loop
                action, cur_action_log_prob = self.actor.sample_action()
                action_prob_ratio = torch.exp(cur_action_log_prob -
                                              action_log_prob)
                # Compute actor loss
                loss1 = action_prob_ratio * advantage
                loss2 = torch.clamp(action_prob_ratio, 1 - self.clip,
                                    1 + self.clip) * advantage
                actor_loss = -torch.min(loss1, loss2).mean()
                # Compute critic loss
                critic_loss = F.mse_loss(returns, value)
                # Update actor network
                self.actor_optim.zero_grad()
                actor_loss.backward(
                    retain_graph=True
                )  # retain_graph=True keeps the graph alive for the critic backward pass below
                self.actor_optim.step()
                # Update critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

            self.replay_buffer.clear()
            num_time_steps += len(state)

    def test(self):
        return
Code Example #19
    parser.add_argument('--output', default='output', type=str, help='Resuming model path for testing')
    parser.add_argument('--test', action='store_true', help='test or not')
    parser.add_argument('--vis', action='store_true', help='visualize each action or not')
    parser.add_argument('--discrete', dest='discrete', action='store_true', help='the actions are discrete or not')
    parser.add_argument('--cuda', dest='cuda', action='store_true')
    parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay') # TODO

    args = parser.parse_args()
    # StrCat args.output with args.env
    if args.resume is None:
        args.output = get_output_folder(args.output, args.env)
    else:
        args.output = args.resume

    if args.env == "KukaGym":
        env = KukaGymEnv(renders=False, isDiscrete=True)
    elif args.discrete:
        env = gym.make(args.env)
        env = env.unwrapped
    else:
        env = NormalizedEnv(gym.make(args.env))

    # input random seed
    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    # input states count & actions count
    print(env.observation_space.shape, env.action_space.shape)
    nb_states = env.observation_space.shape[0]
    if args.discrete: