Example #1
0
def train(env_id, num_timesteps, seed):
    """Train a PPO+GAIL agent on the Jaco pick-and-place task.

    Args:
        env_id: environment id (unused here; a JacoEnv is constructed directly).
        num_timesteps: total number of environment timesteps to train for.
        seed: random seed (unused in this variant).
    """
    from baselines.ppo_pnp import mlp_policy, pposgd_simple, interactive_ppo, ppo_gail
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        # MLP policy: 3 hidden layers of 64 units each.
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space, hid_size=64,
                                    num_hid_layers=3)

    # NOTE(review): a JacoEnv is built directly rather than via
    # make_mujoco_env(env_id, seed) — env_id/seed are effectively ignored.
    env = JacoEnv(64, 64, 1, 1.0)
    dataset = Mujoco_Dset(expert_path='data/pnp_demo.npz', traj_limitation=-1)
    reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)

    ppo_hyperparams = dict(
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    ppo_gail.learn(env, policy_fn, reward_giver, dataset, **ppo_hyperparams)
    env.close()
Example #2
0
    def __init__(self, wid):
        """Create a rollout worker; only worker 0 owns a MuJoCo viewer."""
        self.wid = wid
        self.env = JacoEnv(64, 64, 100)
        self.ppo = GLOBAL_PPO
        # A single on-screen viewer: worker 0 renders, the others run headless.
        if wid == 0:
            self.viewer = mujoco_py.MjViewer(self.env.sim)
Example #3
0
    def __init__(self,
                 render=False,
                 eps_start=EPS_START,
                 eps_end=EPS_STOP,
                 eps_steps=EPS_STEPS):
        """Environment thread owning one JacoEnv and one annealed-epsilon Agent."""
        threading.Thread.__init__(self)

        # Fixed environment configuration: 64x64 views, frame skip 100,
        # rewarding distance 0.1, control magnitude 0.8, continuous reward.
        self.env = JacoEnv(64, 64, 100, 0.1, 0.8, True)
        self.agent = Agent(eps_start, eps_end, eps_steps)
        self.render = render
Example #4
0
def main(args):
    """Behaviour-cloning pretraining on expert data, then rollout evaluation.

    Args:
        args: parsed command-line namespace (seed, paths, BC/eval settings).
    """
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = JacoEnv()  # stands in for gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)

    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    # Route checkpoints and logs into per-task subdirectories.
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)

    # Behaviour-cloning phase; returns the directory of the saved policy.
    savedir_fname = learn(env, policy_fn, dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)

    # Evaluate the cloned policy over a handful of trajectories.
    avg_len, avg_ret = runner(env, policy_fn, savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)
Example #5
0
class Environment(threading.Thread):
    """Worker thread that repeatedly rolls out episodes on its own JacoEnv."""

    # Class-level flag; set via stop() to make run() exit after the
    # current episode step.
    stop_signal = False

    def __init__(self,
                 render=False,
                 eps_start=EPS_START,
                 eps_end=EPS_STOP,
                 eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)

        self.render = render
        self.env = JacoEnv(64, 64, 100, 0.1, 0.8, True)
        self.agent = Agent(eps_start, eps_end, eps_steps)

    def runEpisode(self):
        """Play a single episode, training the agent on every transition."""
        state = self.env.reset()
        total_reward = 0

        while True:
            time.sleep(THREAD_DELAY)  # yield to the other worker threads

            if self.render:
                self.env.render()

            action = self.agent.act(state)
            next_state, reward, done, info = self.env.step(action)

            if done:
                next_state = None  # terminal-state marker for the agent

            self.agent.train(state, action, reward, next_state)
            state = next_state
            total_reward += reward

            if done or self.stop_signal:
                break

        print("Total R:", total_reward)

    def run(self):
        """Thread entry point: keep rolling out episodes until stopped."""
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        """Request the thread to terminate."""
        self.stop_signal = True
Example #6
0
class JacoEnvRandomAgent():
    """Drives a JacoEnv with uniformly random actions, forever.

    Visualisation is either the native MuJoCo viewer (render=True) or a
    matplotlib window showing the second RGB camera view.
    """

    def __init__(self, width, height, frame_skip, rewarding_distance,
                 control_magnitude, reward_continuous, render):
        self.env = JacoEnv(width, height, frame_skip, rewarding_distance,
                           control_magnitude, reward_continuous)
        self.render = render

    def run(self):
        """Loop over episodes indefinitely, stepping with random actions."""
        (_, _, obs_rgb_view2) = self.env.reset()

        if self.render:
            viewer = mujoco_py.MjViewer(self.env.sim)
        else:
            figure, axis = plt.subplots()
            image = axis.imshow(obs_rgb_view2)

        while True:
            self.env.reset()

            while True:
                # One of 5 discrete commands for each of the 6 actuators.
                action = np.random.choice([0, 1, 2, 3, 4], 6)

                # Observation is (proprioception, rgb view 1, rgb view 2).
                (obs_joint, obs_rgb_view1,
                 obs_rgb_view2), reward, done = self.env.step(action)

                if done:
                    break

                if self.render:
                    viewer.render()
                else:
                    image.set_data(obs_rgb_view2)
                    plt.draw()
                    plt.pause(0.1)
Example #7
0
class Worker(object):
    """Distributed-PPO rollout worker.

    Each worker owns its own JacoEnv and pushes stacked
    (state, action, discounted-return) rows into the shared QUEUE for the
    global PPO updater.  Worker 0 additionally renders the simulation.
    Relies on module-level shared state: GLOBAL_PPO, COORD, QUEUE,
    ROLLING_EVENT, UPDATE_EVENT, and the GLOBAL_* counters.
    """

    def __init__(self, wid):
        # wid: integer worker id; only worker 0 owns a viewer.
        self.wid = wid
        #self.env = gym.make(GAME).unwrapped
        self.env = JacoEnv(64, 64, 100)
        self.ppo = GLOBAL_PPO
        if self.wid == 0:
            self.viewer = mujoco_py.MjViewer(self.env.sim)

    def work(self):
        """Collect rollouts until COORD stops; feed batches to the updater."""
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], [
                    ]  # clear history buffer, use new policy to collect data

                if self.wid == 0:
                    self.viewer.render()

                a = self.ppo.choose_action(s)
                s_, r, done = self.env.step(a)
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(
                    (r + 8) / 8)  # normalize reward; found to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    # Bootstrap from the value of the last state, then roll
                    # discounted returns backwards through the buffer.
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue

                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break
                    if done:
                        break

            # Append this episode's raw return to the log file.
            with open("reward.txt", "a") as f:
                f.write(str(ep_r) + '\n')
            # Record an exponentially-smoothed reward trace for later plotting.
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            # r_d = 200 / (sum(GLOBAL_RUNNING_R[:-10:-1])/10 + 250 + GLOBAL_EP)
            # print(r_d)
            #self.env.reduce_rewarding_distance(r_d)
            #if sum(GLOBAL_RUNNING_R[:-11:-1])/10 > 100:
            #    self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 0:
            #     self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 1:
            #     self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 2:
            #     self.env.reset_target()
            # if GLOBAL_EP > 1495 and GLOBAL_EP % 300 == 3:
            #     self.env.reset_target()
            # if sum(GLOBAL_RUNNING_R[:-11:-1])/10 > 1500:
            #     with open("state.txt", "a") as f:
            #         f.write(str(self.env.sim.model.body_pos[-1]) + '\n')
            #         f.write(str(self.env.sim.model.geom_pos[-1]) + '\n')
            #print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid,  '|Ep_r: %.2f' % ep_r,)
            print(
                GLOBAL_EP,
                '/',
                EP_MAX,
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
Example #8
0
    # NOTE(review): this fragment sits inside an enclosing scope whose header
    # is outside this view (hence the 4-space indent) — presumably a script
    # entry point; confirm against the full file.
    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0  # shared batch-step / episode counters
    GLOBAL_RUNNING_R = []  # smoothed per-episode reward trace (Global_reward)
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()  # workers putting data in this queue
    threads = []
    for worker in workers:  # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()  # training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update, ))
    threads[-1].start()
    COORD.join(threads)

    # plot reward change and test
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode')
    plt.ylabel('Moving reward')
    plt.ion()
    plt.show()

    # Demo rollouts with the trained policy in a fresh environment.
    env = JacoEnv(64, 64, 100)

    viewer = mujoco_py.MjViewer(env.sim)

    while True:
        s = env.reset()
        for t in range(200):
            viewer.render()
            # step(...) returns the next state first; keep only that.
            s = env.step(GLOBAL_PPO.choose_action(s))[0]
Example #9
0
def run():
    """Construct and start the environment.

    Builds a JacoEnv with randomised floor/cube colours, assembles a DQN
    whose features come from a frozen pretrained encoder applied to each
    frame of a WINDOW_LENGTH stack, then (with the training branch disabled
    below) loads saved weights and evaluates on both the source domain and
    a fixed target domain, saving episode statistics to .npz files.
    """

    env = JacoEnv(64,
                  64,
                  100,
                  0.1,
                  0.8,
                  True)
    nb_actions = env.real_num_actions # All possible action, where each action is a unit in this vector
    # Randomise the source-domain appearance (RGBA colours).
    new_floor_color = list((0.55 - 0.45) * np.random.random(3) + 0.45) + [1.]
    new_cube_color = list(np.random.random(3)) + [1.]
    env.change_floor_color(new_floor_color)
    env.change_cube_color(new_cube_color)

    # Pretrained encoder; all of its layers are frozen below.
    encoder = load_model(WEIGHTS_FILE)
    print("#########################")
    nb_observation_space = (64, 64, 3)
    original_input = Input(shape=(WINDOW_LENGTH,) + nb_observation_space)
    # Slice the frame stack into individual frames.
    # NOTE(review): the lambda captures loop variable `i` late-bound; the slice
    # index is fixed when each Lambda is *called* here, but reloading a full
    # saved model (not just weights) would re-evaluate `i` — confirm if
    # model-config serialisation is ever needed.
    in_layer = [Lambda(lambda x: x[:, i, :, :])(original_input) for i in range(WINDOW_LENGTH)]
    for layer in encoder.layers:
        layer.trainable = False
    print(encoder.summary())
    encoder_output = [encoder(x) for x in in_layer]

    # Q-head: concatenated encoded frames -> 2x512 ReLU -> linear Q-values.
    x = Concatenate()(encoder_output)
    x = Dense(512, activation='relu')(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(nb_actions, activation='linear')(x)
    model = Model(original_input, [x])
    print(model.summary())
    if MULTI_GPU:
        model = multi_gpu_model(model, gpus=2)
        print(model.summary())

    num_warmup = 50000
    # num_simulated_annealing = 500000 + num_warmup
    # num_warmup = 0
    num_simulated_annealing = 220000 + num_warmup

    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    # Epsilon annealed linearly 1.0 -> 0.1 over num_simulated_annealing steps.
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=num_simulated_annealing)

    dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, nb_steps_warmup=num_warmup, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.)
    dqn.compile(Adam(lr=.00025), metrics=['mae'])

    # Hard-coded train/evaluate toggle: training branch is currently disabled.
    if False:
        dqn.load_weights("stylegan_dqn_weights")
        checkpoint_callback = ModelCheckpoint("stylegan_dqn_checkpoint", monitor='episode_reward', verbose=0, save_best_only=True, save_weights_only=True, mode='max', period = 10)
        history = dqn.fit(env, nb_steps=num_simulated_annealing + 450000, visualize=False, verbose=1, callbacks=[checkpoint_callback])
        dqn.save_weights("stylegan_dqn_weights")
        np.savez_compressed("stylegan_dqn_history", episode_reward=np.asarray(history.history['episode_reward']))
    else:
        dqn.load_weights("stylegan_dqn_weights")

        print("original domain")
        source_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_source_test",
                            episode_reward=np.asarray(source_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(source_test_losses.history['nb_steps']))

        print("target domain")
        # Fixed target-domain appearance for the transfer test.
        new_floor_color = [0.4, 0.6, 0.4, 1.]
        new_cube_color = [1.0, 0.0, 0.0, 1.]
        env.change_floor_color(new_floor_color)
        env.change_cube_color(new_cube_color)
        target_test_losses = dqn.test(env, nb_episodes=100, visualize=True)
        np.savez_compressed("myvae_dqn_target_test",
                            episode_reward=np.asarray(target_test_losses.history['episode_reward']),
                            nb_steps=np.asarray(target_test_losses.history['nb_steps']))
Example #10
0
def test(rank, args, T, shared_model):
    """Evaluation worker for the A3C trainer.

    Every args.evaluation_interval global steps, copies the shared model,
    runs args.evaluation_episodes greedy episodes, logs the averages,
    plots the reward curve, and checkpoints the model under results/.

    Args:
        rank: worker index, used to de-correlate the seed.
        args: parsed hyper-parameter namespace.
        T: shared global step counter exposing .value().
        shared_model: parameters shared with the training workers.
    """
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width,
                  args.height,
                  args.frame_skip,
                  args.rewarding_distance,
                  args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)
    if args.render:
        # Show the second RGB camera view via matplotlib.
        (_, _, obs_rgb_view2) = env.reset()
        plt.ion()
        f, ax = plt.subplots()
        im = ax.imshow(obs_rgb_view2)

    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.eval()
    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    n_digits = str(
        len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        # NOTE(review): volatile=True is the legacy (pre-0.4)
                        # PyTorch way of disabling autograd during inference.
                        hx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        cx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        # NOTE(review): the action placeholder is a 6-tuple —
                        # presumably one entry per action head; confirm
                        # against JacoEnv's action space.
                        action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                                0), 0, False, 0
                        reward_sum = 0

                    # Calculate policy
                    policy, _, (hx, cx) = model(
                        Variable(
                            state[0], volatile=True),
                        Variable(
                            state[1], volatile=True),
                        (hx.detach(),
                         cx.detach()))  # Break graph for memory efficiency

                    # Choose action greedily
                    action = [p.max(1)[1].data[0, 0] for p in policy]

                    # Step
                    state, reward, done = env.step(action)
                    obs_rgb_view1 = state[1]
                    obs_rgb_view2 = state[2]
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Optionally render validation states
                    if args.render:
                        # rendering the first camera view
                        im.set_data(obs_rgb_view1)
                        plt.draw()
                        plt.pause(0.05)

                        # rendering mujoco simulation
                        # viewer = mujoco_py.MjViewer(env.sim)
                        # viewer.render()

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            print(('[{}] Step: {:<' + n_digits +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime(
                           '%Y-%m-%d %H:%M:%S,%f')[:-3], t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join('results', str(t_start) +
                                    '_model.pth'))  # Checkpoint model params
            can_test = False  # Finish testing
            if args.evaluate:
                return
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond
Example #11
0
 def __init__(self, width, height, frame_skip, rewarding_distance,
              control_magnitude, reward_continuous, render):
     """Wrap a JacoEnv built from the given parameters.

     render: truthy when the caller intends to use a MuJoCo viewer
     (mirrors JacoEnvRandomAgent.__init__ elsewhere in this file).
     """
     self.env = JacoEnv(width, height, frame_skip, rewarding_distance,
                        control_magnitude, reward_continuous)
     self.render = render
Example #12
0
from jaco_arm import JacoStackEnv as JacoEnv
import mujoco_py
import gym
import numpy as np
import glfw
from sklearn.mixture import GaussianMixture as GM
import cv2
from sklearn.decomposition import PCA

# Cluster expert stacking trajectories: fit a 3-component Gaussian mixture
# to all observations, then print each trajectory's per-step cluster labels.

# NOTE(review): env is constructed but never used below — confirm whether it
# is kept for side effects before removing.
env = JacoEnv()

traj_data = np.load('new_stack.npz', allow_pickle=True)
obs = traj_data['obs'][:30]  # first 30 trajectories of observations
acs = traj_data['acs'][:30]  # matching actions (unused below)

ret_save_list = []

#pca = PCA(n_components=3)
#nobs = pca.fit_transform(np.vstack(obs))

print(np.vstack(obs).shape)
print(obs[0].shape)
gm = GM(n_components=3, init_params='random', random_state=0)
gm.fit(np.vstack(obs))
# Idiomatic enumerate instead of the range(len(...)) anti-pattern.
for i, traj in enumerate(obs):
    print('traj [', i, '] :', gm.predict(traj))

#np.savez('new_stack.npz', obs = obs[:100], acs=acs[:100], rets=ret_save_list)
Example #13
0
def train(rank, args, T, shared_model, optimiser):
    """A3C training worker.

    Runs rollouts of up to args.t_max steps on its own JacoEnv, syncing with
    `shared_model` before each rollout and applying gradients through the
    shared `optimiser` inside `_train`.

    Args:
        rank: worker index, used to de-correlate seeds.
        args: parsed hyper-parameter namespace.
        T: shared global step counter with .value() / .increment().
        shared_model: parameters shared across all workers.
        optimiser: shared optimiser consumed by `_train`.
    """
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)

    # TODO: pass in the observation and action space
    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            # NOTE(review): the action placeholder is a 6-tuple — presumably
            # one entry per action head; confirm against JacoEnv.
            action, reward, done, episode_length = (0, 0, 0, 0, 0,
                                                    0), 0, False, 0

        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()

        # Lists of outputs for training
        policies, Vs, actions, rewards = [], [], [], []

        while not done and t - t_start < args.t_max:
            # Calculate policy and value
            policy, V, (hx, cx) = model(Variable(state[0]), Variable(state[1]),
                                        (hx, cx))

            # Sample one action per policy head. Graph is broken here as the
            # loss for the stochastic action is calculated manually later.
            action = [p.multinomial().data[0, 0] for p in policy]

            # Step
            state, reward, done = env.step(action)
            state = state_to_tensor(state)
            done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
            episode_length += 1  # Increase episode counter

            # Save outputs for online training (explicit appends instead of
            # the previous side-effect list comprehension).
            policies.append(policy)
            Vs.append(V)
            actions.append(Variable(torch.LongTensor(action)))
            rewards.append(reward)

            # Increment counters
            t += 1
            T.increment()

        # Break graph for last values calculated (used for targets, not directly as model outputs)
        if done:
            # R = 0 for terminal s
            R = Variable(torch.zeros(1, 1))

        else:
            # R = V(s_i; θ) for non-terminal s
            _, R, _ = model(Variable(state[0]), Variable(state[1]), (hx, cx))
            R = R.detach()
        Vs.append(R)

        # Train the network
        _train(args, T, model, shared_model, optimiser, policies, Vs, actions,
               rewards, R)
Example #14
0
    # NOTE(review): this fragment sits inside an enclosing scope whose header
    # is outside this view (4-space indent) — presumably a script entry point.
    args.non_rgb_state_size = 18  # 9 joints qpos and qvel TODO: don't hardcode!

    mp.set_start_method('spawn')
    torch.manual_seed(args.seed)
    T = Counter()  # Global shared counter

    # Results dir: refuse to clobber an existing run unless --overwrite.
    if not os.path.exists('results'):
        os.makedirs('results')
    elif not args.overwrite:
        raise OSError('results dir exists and overwrite flag not passed')

    # Create the environment and the shared network.
    env = JacoEnv(args.width,
                  args.height,
                  args.frame_skip,
                  args.rewarding_distance,
                  args.control_magnitude,
                  args.reward_continuous)

    shared_model = ActorCritic(None, args.non_rgb_state_size, None,
                               args.hidden_size)
    shared_model.share_memory()  # share parameters across worker processes
    if args.model and os.path.isfile(args.model):
        # Load pretrained weights
        shared_model.load_state_dict(torch.load(args.model))
    # Create optimiser for shared network parameters with shared statistics
    optimiser = SharedRMSprop(
        shared_model.parameters(), lr=args.lr, alpha=args.rmsprop_decay)
    optimiser.share_memory()

    # Start validation agent
Example #15
0
    # BLAS setup: keep linear algebra single-threaded per process.
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['MKL_NUM_THREADS'] = '1'

    # Setup
    args = parser.parse_args()
    args.env = 'jaco'
    args.non_rgb_state_size = 18  # 9 joints qpos and qvel TODO: don't hardcode!

    mp.set_start_method('spawn')
    torch.manual_seed(args.seed)
    T = Counter()  # Global shared counter

    # Create the environment.
    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)

    # Collect random-policy rollouts and save the (180-degree-rotated) second
    # RGB view of every step as a training image.
    M = cv2.getRotationMatrix2D((32, 32), 180, 1.)  # rotate 64x64 frame about its centre
    done = False
    for i in trange(1000):
        done = False
        j = 0
        while not done:
            # NOTE(review): randint(0, 4, ...) samples actions 0-3 only, while
            # other snippets here use np.random.choice([0, 1, 2, 3, 4]);
            # confirm whether action 4 should be included.
            obs, reward, done = env.step(
                np.random.randint(0, 4, env.num_actuators))
            img = cv2.warpAffine(obs[2], M, (64, 64))
            cv2.imwrite(
                "training_observations/obs" + str(i) + "_" + str(j) + ".png",
                img)
            # BUG FIX: j was never incremented, so every step of an episode
            # overwrote the same obs<i>_0.png file; advance the step index.
            j += 1