def run_grid_test(x_points, y_points, z_points, num_test, policy_path):
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))
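    # Seeding np.random first and then drawing the TensorFlow seed from it makes
    # both sources of randomness reproducible from the single seed (1) above; the
    # same rand_state is also handed to the environment below.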

    # set up coordination between eps per iteration and num_test
    episode_length_time = 4.0
    dt = 0.04
    timesteps_per_ep = episode_length_time / dt
    timesteps_per_batch = int(timesteps_per_ep * num_test)
    total_timesteps = timesteps_per_batch * x_points * y_points * z_points
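    # With the values above, 4.0 s / 0.04 s = 100 steps per episode, so each
    # batch covers num_test episodes and the run performs one batch per
    # (x, y, z) grid point.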


    # Create GridTest environment
    env = GridTestEnv(
            setup="UR10_6dof",
            host=None,
            dof=6,
            control_type="velocity",
            reset_type="zero",
            reward_type="precision",
            derivative_type="none",
            deriv_action_max=5,
            first_deriv_max=2,
            accel_max=1.4,
            speed_max=0.3,
            speedj_a=1.4,
            episode_length_time=episode_length_time,
            episode_length_step=None,
            actuation_sync_period=1,
            dt=dt,
            run_mode="multiprocess",
            rllab_box=False,
            movej_t=2.0,
            delay=0.0,
            random_state=rand_state,
            x_points=x_points,
            y_points=y_points,
            z_points=z_points,
            num_test=num_test
        )
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    builtins.shared_returns = shared_returns
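    # Exposing shared_returns through builtins is presumably what lets
    # grid_test_callback (passed to run_policy below without extra arguments)
    # reach this dict as a global.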

    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, timesteps_per_batch, shared_returns, plot_running))
    pp.start()

    # Run TRPO policy
    run_policy(network='mlp',
               num_layers=2,  # network_kwargs for the MLP network
               num_hidden=64,
               env=env,
               total_timesteps=total_timesteps,  # originally 200,000
               timesteps_per_batch=timesteps_per_batch,
               callback=grid_test_callback,
               load_path=policy_path)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #2
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90,
                          port='/dev/ttyUSB0',
                          obs_history=1,
                          dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
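
The create_callback(shared_returns) helper used throughout these examples is not shown here. Below is a minimal, illustrative sketch of what such a callback could look like, assuming the baselines learner invokes callback(locals(), globals()) once per iteration and keeps recent episode returns and lengths in buffers named rewbuffer and lenbuffer; those names and that calling convention are assumptions about baselines internals, not something confirmed by the code above.

def example_create_callback(shared_returns):
    """Illustrative sketch only; not the create_callback these scripts import."""
    def callback(locals_dict, globals_dict):
        # rewbuffer/lenbuffer are assumed to be the learner's buffers of recent
        # episode returns and lengths.
        rewbuffer = locals_dict.get('rewbuffer')
        lenbuffer = locals_dict.get('lenbuffer')
        if not rewbuffer:
            return
        # Cooperative "lock": the plotting process is expected to skip reading
        # while write_lock is True, since updates across keys are not atomic.
        shared_returns['write_lock'] = True
        shared_returns['episodic_returns'] = shared_returns['episodic_returns'] + list(rewbuffer)
        shared_returns['episodic_lengths'] = shared_returns['episodic_lengths'] + list(lenbuffer)
        shared_returns['write_lock'] = False
    return callback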
Example #3
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian, but the torque
    # commands the environment accepts lie in [-max_torque_mag, max_torque_mag]; the
    # NormalizedEnv wrapper rescales actions accordingly. By default, it does not
    # normalize observations or rewards.
    env = NormalizedEnv(env)
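    # Illustrative only: a linear rescaling of a policy action a in [-1, 1] to
    # the torque range would be
    #   torque = -max_torque_mag + (a + 1.0) * 0.5 * (2 * max_torque_mag) = a * max_torque_mag
    # (the wrapper's exact internals may differ).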

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
Example #4
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 reacher environment (6 DoF)
    env = ReacherEnv(setup="UR5_6dof",
                     host=None,
                     dof=6,
                     control_type="velocity",
                     target_type="position",
                     reset_type="zero",
                     reward_type="precision",
                     derivative_type="none",
                     deriv_action_max=5,
                     first_deriv_max=2,
                     accel_max=1.4,
                     speed_max=0.3,
                     speedj_a=1.4,
                     episode_length_time=4.0,
                     episode_length_step=None,
                     actuation_sync_period=1,
                     dt=0.04,
                     run_mode="multiprocess",
                     rllab_box=False,
                     movej_t=2.0,
                     delay=0.0,
                     random_state=rand_state)
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #5
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create an asynchronous simulation of the InvertedDoublePendulum-v2 MuJoCo environment.
    env = DoubleInvertedPendulumEnv(agent_dt=0.005,
                                    sensor_dt=[0.01, 0.0033333],
                                    is_render=False,
                                    random_state=rand_state)
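    # With these settings the agent acts every 5 ms (200 Hz), while the two
    # sensor streams (assuming sensor_dt lists per-sensor periods) update every
    # 10 ms (100 Hz) and roughly every 3.33 ms (~300 Hz).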
    # Start environment processes
    env.start()

    # Create baselines ppo policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_returns,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines PPO
    learn(
        env,
        policy_fn,
        max_timesteps=1e6,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.0001,
        optim_batchsize=64,
        gamma=0.995,
        lam=0.995,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
Example #6
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Sawyer reaching environment
    env = SawyerReachXYZEnv(target_goal=(0, 0, 0),
                            indicator_threshold=.05,
                            reward_type='hand_distance',
                            action_mode='torque',
                            use_safety_box=True,
                            torque_action_scale=1)

    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
def main():
    # optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)

    if len(sys.argv) > 2:  # a second CLI argument supplies a model to load
        load_trained_model = True

    save_model_path = sys.argv[1]  # e.g. saved/uniform/X/Y/Z/ (trailing slash expected)
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    save_model_basepath = save_model_path + 'run_' + str(len(run_dirs) + 1) + '/'
    os.makedirs(save_model_basepath + 'models', exist_ok=True)
    os.makedirs(save_model_basepath + 'data', exist_ok=True)

    if load_trained_model:  # load an existing policy
        load_model_path = sys.argv[2]  # e.g. saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    #rand_state = np.random.RandomState(1).get_state()
    #np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    #distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0]) #run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0]) # run2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0]) # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    #distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) 
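    # Each distro above is a 9-way probability distribution (entries sum to 1);
    # the commented-out lines record earlier attempts, and the active choice
    # puts all probability mass on the second entry.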

    env = Create2DockerEnv(30, distro,
                           port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1,
                           obs_history=1, dt=0.045)
    # random_state=rand_state  (disabled: the fixed seeding above is commented out)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))
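    # Note: MlpPolicy takes a single width, so only hidden_sizes[0] sets the
    # layer width while len(hidden_sizes) sets the depth (three hidden layers
    # of 64 units here).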

    # Create and start plotting process
    #plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })
    # Spawn plotting process
    #pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    #pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # was 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    #plot_running.value = 0  # shutdown plotting process
    #time.sleep(2)
    #pp.join()

    env.close()
Example #8
def get_env(cfg):
    # Set seed to potentially fix random state
    seed_low  = cfg['global']['seed']['low'] 
    seed_high = cfg['global']['seed']['high']
    if seed_low is not None:
        logging.debug('Using seed [{},{}) "half-open" interval'.format(seed_low, seed_high))
        rand_state = np.random.RandomState(seed_low).get_state()
        np.random.set_state(rand_state)
        tf_set_seeds(np.random.randint(seed_low, seed_high))
    else:
        logging.debug('Not using any seeds!')

    # Load the RL Environment
    env_module = importlib.import_module(cfg['environment']['codebase']['module'])
    env_class = getattr(env_module, cfg['environment']['codebase']['class'])
    logging.debug("Environment function: {}".format(env_class))

    logging.debug("Host IP: {}".format(cfg['environment']['setup']['host']))

    # Instantiate the configured environment (the parameter set mirrors the UR reacher setup)
    env = env_class(
            setup                 = cfg['environment']['setup'],
            host                  = cfg['environment']['setup']['host'],
            dof                   = cfg['environment']['parameters']['dof'],
            control_type          = cfg['environment']['parameters']['control_type'],
            target_type           = cfg['environment']['parameters']['target_type'],
            reset_type            = cfg['environment']['parameters']['reset_type'],
            reward_type           = cfg['environment']['parameters']['reward_type'],
            derivative_type       = cfg['environment']['parameters']['derivative_type'],
            deriv_action_max      = cfg['environment']['parameters']['deriv_action_max'],
            first_deriv_max       = cfg['environment']['parameters']['first_deriv_max'],
            accel_max             = cfg['environment']['parameters']['accel_max'],
            speed_max             = cfg['environment']['parameters']['speed_max'],
            speedj_a              = cfg['environment']['parameters']['speedj_a'],
            episode_length_time   = cfg['environment']['parameters']['episode_length_time'],
            episode_length_step   = cfg['environment']['parameters']['episode_length_step'],
            actuation_sync_period = cfg['environment']['parameters']['actuation_sync_period'],
            dt                    = cfg['environment']['parameters']['dt'],
            run_mode              = cfg['environment']['parameters']['run_mode'],
            rllab_box             = cfg['environment']['parameters']['rllab_box'],
            movej_t               = cfg['environment']['parameters']['movej_t'],
            delay                 = cfg['environment']['parameters']['delay'],
            random_state          = rand_state if seed_low is not None else None
        )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    policy_fn_module = importlib.import_module(cfg['model']['module'])
    policy_fn_class = getattr(policy_fn_module, cfg['model']['class'])
    logging.debug("Policy function: {}".format(policy_fn_class))

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name           = name,
                               ob_space       = ob_space,
                               ac_space       = ac_space,
                               hid_size       = cfg['algorithm']['hyperparameters']['hid_size'],
                               num_hid_layers = cfg['algorithm']['hyperparameters']['num_hid_layers'])
    
    return env, policy_fn
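
get_env above only dereferences the configuration keys shown below. As a reference, here is an illustrative cfg in plain Python dict form (a real deployment would typically load it from a config file); the values and the two module paths are placeholders chosen to mirror the UR5 reacher example above, not defaults taken from this codebase.

# Illustrative cfg shape for get_env(); all values are placeholders and the
# module paths are assumptions based on the UR5 example above.
example_cfg = {
    'global': {'seed': {'low': 1, 'high': 2**31 - 1}},
    'environment': {
        'codebase': {'module': 'senseact.envs.ur.reacher_env', 'class': 'ReacherEnv'},
        # Note: cfg['environment']['setup'] is passed whole as `setup` and is
        # also expected to expose the robot's address under 'host'.
        'setup': {'host': None},
        'parameters': {
            'dof': 6, 'control_type': 'velocity', 'target_type': 'position',
            'reset_type': 'zero', 'reward_type': 'precision',
            'derivative_type': 'none', 'deriv_action_max': 5,
            'first_deriv_max': 2, 'accel_max': 1.4, 'speed_max': 0.3,
            'speedj_a': 1.4, 'episode_length_time': 4.0,
            'episode_length_step': None, 'actuation_sync_period': 1,
            'dt': 0.04, 'run_mode': 'multiprocess', 'rllab_box': False,
            'movej_t': 2.0, 'delay': 0.0,
        },
    },
    'model': {'module': 'baselines.ppo1.mlp_policy', 'class': 'MlpPolicy'},
    'algorithm': {'hyperparameters': {'hid_size': 64, 'num_hid_layers': 2}},
}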