Example #1
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
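
This listing shows only the main() body; the page omits the module-level imports. A minimal sketch of the imports such a script would need, assuming the package layouts of SenseAct and the original OpenAI Baselines (exact module paths may differ between releases; create_callback and plot_create2_mover are helpers defined alongside the SenseAct example scripts rather than library functions):

import time
from multiprocessing import Process, Value, Manager

import baselines.common.tf_util as U
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.trpo_mpi.trpo_mpi import learn  # TRPO variant of learn()

from senseact.envs.create2.create2_mover_env import Create2MoverEnv  # assumed module path
from senseact.utils import NormalizedEnv                             # assumed module path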
Example #2
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian, while the torque commands expected by
    # the environment lie in the range [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper scales
    # actions accordingly (a stand-alone illustration follows this example). By default, it does not
    # normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
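
The action scaling described in the comment inside Example #2 can be shown in isolation. This is a minimal illustrative sketch of the rescaling behaviour, not the NormalizedEnv wrapper's actual implementation; the function name and the max_torque_mag default are assumptions for the example only.

import numpy as np

def scale_action(normalized_action, max_torque_mag=100):
    # Hypothetical helper: map a policy output in [-1, 1] onto the torque
    # range [-max_torque_mag, max_torque_mag] expected by the environment.
    a = np.clip(np.asarray(normalized_action, dtype=np.float64), -1.0, 1.0)
    return a * max_torque_mag

print(scale_action([0.5, -1.0]))  # [  50. -100.]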
Example #3
def main():
    # optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        load_model_data = pkl.load(open(load_model_path, 'rb'))
        hidden_sizes = load_model_data['hidden_sizes']

    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90,
                          port='/dev/ttyUSB0',
                          obs_history=1,
                          dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
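
Example #3 accepts an optional command-line argument pointing at a pickled checkpoint; the script itself only reads the 'hidden_sizes' entry before handing the whole dict to create_callback. A hypothetical sketch of writing a minimally compatible file (the file and script names, and any further keys create_callback might expect, are assumptions):

import pickle as pkl

# Hypothetical checkpoint: 'hidden_sizes' is the only key the script above reads
# directly; whatever parameters create_callback restores would live in the same dict.
model_data = {'hidden_sizes': (32, 32)}
with open('trpo_create2_checkpoint.pkl', 'wb') as f:
    pkl.dump(model_data, f)

# The example would then be launched as, e.g.:
#   python create2_mover_example.py trpo_create2_checkpoint.pkl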
Example #4
def main():
    # optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)

    if len(sys.argv) > 2:  # load model
        load_trained_model = True

    save_model_path = sys.argv[1] # saved/uniform/X/Y/Z/
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1), exist_ok=True)
    os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1)+'/models', exist_ok=True)
    os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1)+'/data', exist_ok=True)
    save_model_basepath = save_model_path+'run_'+str(len(run_dirs)+1)+'/'

    if load_trained_model:  # load a previously trained model
        load_model_path = sys.argv[2] # saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    #rand_state = np.random.RandomState(1).get_state()
    #np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    #distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0]) #run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0]) # run2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0]) # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    #distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) 

    env = Create2DockerEnv(30, distro,
                           port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1,
                           obs_history=1, dt=0.045)
                           #random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    #plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })
    # Spawn plotting process
    #pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    #pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    #plot_running.value = 0  # shut down plotting process
    #time.sleep(2)
    #pp.join()

    env.close()
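
The run-directory setup near the top of this example builds the same run_<N> path by string concatenation several times. A behaviour-equivalent sketch using os.path.join, with an illustrative helper name, could look like this:

import os

def make_run_dirs(base_path):
    # Create base_path/run_<N>/{models,data} for the next run index, mirroring
    # the layout the script above assembles by concatenation.
    os.makedirs(base_path, exist_ok=True)
    run_idx = len(os.listdir(base_path)) + 1
    run_dir = os.path.join(base_path, 'run_{}'.format(run_idx))
    os.makedirs(os.path.join(run_dir, 'models'), exist_ok=True)
    os.makedirs(os.path.join(run_dir, 'data'), exist_ok=True)
    return run_dir + os.sep  # e.g. 'saved/uniform/X/Y/Z/run_1/'

A call such as save_model_basepath = make_run_dirs(save_model_path) would then replace the block of os.makedirs calls at the top of the example.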