Example #1
def Simulation(proxy_agent, index, return_dict, episodes, vis=False):
    print('starting simulation')
    env = RunEnv(visualize=vis)
    observation = env.reset(difficulty=0)

    rewards = np.zeros(episodes)
    totalreward = 0
    for episode in range(0, episodes):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        observation = np.array(observation)
        Preprocess = Preprocessing(observation, delta=0.01)
        prevState = Preprocess.GetState(observation)
        for i in range(1,1000):
            observation, reward, done, info = env.step(action)
            observation = np.array(observation)
            #means it didn't go the full simulation
            if done and i < 1000:
                reward = 0  

            state = Preprocess.GetState(observation)
            s,a,r,sp = Preprocess.ConvertToTensor(prevState,action, reward, state)

            totalreward += reward
            if done:
                env.reset(difficulty=0, seed=None)  # resets the environment if done is true
                print("resetting environment " + str(episode))
                rewards[episode] = totalreward
                totalreward = 0
                break
            action = proxy_agent(Variable(s, volatile=True))
            action = action.data.numpy()
            prevState = state
    return_dict[index] = np.sum(rewards) / episodes
    return np.sum(rewards) / episodes
Example #2
def Simulation(proxy_agent, episodes, vis=False):
    env = RunEnv(visualize=vis)
    observation = env.reset(difficulty=0)
    memory = random.randint(1000, 2000)
    tau = random.uniform(0.01, .9)
    epsilon = random.uniform(.15, .9)
    target = proxy_agent.ProduceTargetActorCritic(memory, tau, epsilon)
    batches = [16, 32, 64, 128]
    batchsize = batches[random.randint(0, len(batches) - 1)]
    for episode in range(0, episodes):
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        observation = np.array(observation)
        Preprocess = Preprocessing(observation, delta=0.01)
        prevState = Preprocess.GetState(observation)
        if vis:
            target.OUprocess(0, 0.15, 0.0)
        else:
            target.OUprocess(random.random(), 0.15, 0.0)
        pelvis_y = 0

        for i in range(1,1000):
            observation, reward, done, info = env.step(action)
            observation = np.array(observation)
            #means it didn't go the full simulation
            if i > 1:
                reward += (observation[2] - pelvis_y)*0.01 #penalty for pelvis going down
            reward = env.current_state[4] * 0.01
            reward += 0.01  # small reward for still standing
            reward += min(0, env.current_state[22] - env.current_state[1]) * 0.1  # penalty for head behind pelvis
            reward -= sum([max(0.0, k - 0.1) for k in [env.current_state[7], env.current_state[10]]]) * 0.02  # penalty for straight legs


            if done and i < 1000:
                reward = 0

            state = Preprocess.GetState(observation)
            s,a,r,sp = Preprocess.ConvertToTensor(prevState,action, reward, state)
            target.addToMemory(s,a,r,sp)

                #        env.render()
            if done:
                env.reset(difficulty=0, seed=None)  # resets the environment if done is true
                if(target.primedToLearn()):

                    lock.acquire()
                    proxy_agent.PerformUpdate(batchsize, target)
                    target.UpdateTargetNetworks(agent.getCritic(), agent.getActor())
                    print("saving actor")
                    proxy_agent.saveActorCritic()
                    print("actor saved")
                    lock.release()
                print("reseting environment" + str(episode))
                break
            action = target.selectAction(s)
            action = action.numpy()
            prevState = state
Example #3
class LearnToRunEnv(gym.Env):
    """Wrapping LearnToRunEnv in OpenAI Gym"""
    def __init__(self, visualize=False, difficulty=None):
        super(LearnToRunEnv, self).__init__()
        if difficulty is None:
            self.difficulty = random.randint(0,2)
        else:
            self.difficulty = difficulty

        self.learntorun_env = RunEnv(visualize=visualize)
        self.observation_space = self.learntorun_env.observation_space
        self.action_space = self.learntorun_env.action_space

        self._spec = EnvSpec("RunEnv-diff{}-v1".format(difficulty))

    def _step(self, action):
        obs, reward, terminal, info = self.learntorun_env.step(action)
        return np.asarray(obs), reward, terminal, info

    def _reset(self):
        obs = self.learntorun_env.reset(difficulty=self.difficulty,\
                                            seed=self.learntorun_seed)
        return np.asarray(obs)

    def _render(self, mode='human', close=False):
        #raise NotImplementedError
        return None

    def _seed(self, seed=None):
        self.learntorun_seed = seed

    def _close(self):
        self.learntorun_env.close()
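A minimal driver for the wrapper above (added here for illustration, not part of the original example). It assumes an older gym release (<= 0.9.x), where the public seed()/reset()/step()/close() methods dispatch to the underscore variants defined in the class, and that osim-rl is installed.

# Illustrative rollout with the LearnToRunEnv wrapper defined above.
env = LearnToRunEnv(visualize=False, difficulty=0)
env.seed(None)            # stores learntorun_seed, which _reset() passes on to RunEnv
observation = env.reset()
total_reward = 0.0
for _ in range(200):
    observation, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
    if done:
        break
env.close()
print("total reward:", total_reward)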
Example #4
class Environment:
    def __init__(self):

        print("Setting env...")
        self.env = RunEnv(visualize=False)
        print("Env set !")

    def get_state_size(self):
        return list(self.env.observation_space.shape)

    def get_action_size(self):
        return self.env.action_space.shape[0]

    def get_bounds(self):
        return self.env.action_space.low, self.env.action_space.high

    def set_render(self, render):
        self.env = RunEnv(visualize=render)

    def reset(self):
        return self.env.reset(difficulty=0)

    def random(self):
        return self.env.action_space.sample()

    def act(self, action):
        return self.env.step(action)

    def close(self):
        self.env.close()
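A short usage sketch for the Environment wrapper above (added here, not from the original snippet); it assumes osim-rl's RunEnv is importable and simply runs one random episode through the wrapper's API.

# Illustrative random episode with the Environment wrapper above.
wrapper = Environment()
print("state size:", wrapper.get_state_size())
print("action size:", wrapper.get_action_size())
low, high = wrapper.get_bounds()

observation = wrapper.reset()
episode_reward, done = 0.0, False
while not done:
    observation, reward, done, info = wrapper.act(wrapper.random())
    episode_reward += reward
print("episode reward:", episode_reward)
wrapper.close()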
Example #5
class Environment:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        print()
        self.render = False

    def get_state_size(self):
        return list(self.env.observation_space.shape)

    def get_action_size(self):
        return self.env.action_space.shape[0]

    def get_bounds(self):
        return self.env.action_space.low, self.env.action_space.high

    def set_render(self, render):
        visu = render and DISPLAY
        if visu != self.render:
            self.render = visu
            self.env = RunEnv(visualize=visu)
            self.reset()

    def reset(self):
        return np.asarray(self.env.reset(difficulty=0))

    def random(self):
        return self.env.action_space.sample()

    def act(self, action):
        s_, r, d, i = self.env.step(action)
        return np.asarray(s_), r, d, i

    def close(self):
        self.env.close()
Example #6
class LearnToRunEnv(gym.Env):
    """Wrapping LearnToRunEnv in OpenAI Gym"""
    def __init__(self, visualize=False, difficulty=None):
        super(LearnToRunEnv, self).__init__()
        if difficulty is None:
            self.difficulty = random.randint(0, 2)
        else:
            self.difficulty = difficulty

        self.learntorun_env = RunEnv(visualize=visualize)
        self.observation_space = self.learntorun_env.observation_space
        self.action_space = self.learntorun_env.action_space

    def _step(self, action):
        return self.learntorun_env.step(action)

    def _reset(self):
        return self.learntorun_env.reset(difficulty=self.difficulty,\
                                            seed=self.learntorun_seed)

    def _render(self, mode='human', close=False):
        #raise NotImplementedError
        return None

    def _seed(self, seed=None):
        self.learntorun_seed = seed

    def _close(self):
        self.learntorun_env.close()
Example #7
def test():
    task_fn = lambda: LTR()
    task = task_fn()
    state_dim = task.env.observation_space.shape[0]
    action_dim = task.env.action_space.shape[0]
    with open('data/ddpg-model-LearningToRun.bin', 'rb') as f:
        model = pickle.load(f)
    actor = DDPGActorNet(state_dim, action_dim)
    actor.load_state_dict(model)

    logger = Logger('./log')

    env = RunEnv(visualize=False)
    state = env.reset(difficulty=0)
    print(state)
    done = False
    total_reward = 0.0
    step = 0
    while not done:
        action = actor.predict(np.stack([state]), to_numpy=True).flatten()
        state, reward, done, info = env.step(action)
        total_reward += reward
        step += 1
        logger.histo_summary('input', actor.input, step)
        logger.histo_summary('act1', actor.act1, step)
        logger.histo_summary('act2', actor.act2, step)
        logger.histo_summary('pre_act3', actor.pre_act3, step)
        logger.histo_summary('act3', actor.act3, step)
        for tag, value in actor.named_parameters():
            tag = tag.replace('.', '/')
            logger.histo_summary(tag, value.data.numpy(), step)

    print(total_reward)
    print(step)
Example #8
class OsimAdapter:
    def __init__(self):
        self.env = RunEnv(visualize=False)
        self.reset()

    def reset(self, difficulty=2):
        self.reward = 0
        self.total_reward = 0
        self.timestamp = 0.
        self.features = np.array(
            (self.env.reset(difficulty=difficulty))).reshape((1, -1))
        self.last_obs = np.zeros(shape=(1, 41))
        self.features = np.concatenate([self.features, self.last_obs], axis=1)
        self.done = False
        return self.features

    def get_action_space(self):
        space = [1] * 18
        return space

    def get_observation_space(self):
        return 41 * 2

    def step(self, actions):
        mean_possible = (np.array(self.env.action_space.low) +
                         np.array(self.env.action_space.high)) / 2.
        actions = np.array(actions) + mean_possible
        actions *= (np.array(self.env.action_space.high) -
                    np.array(self.env.action_space.low))
        actions = np.clip(actions, self.env.action_space.low,
                          self.env.action_space.high)
        obs, reward1, done, _ = self.env.step(actions)
        reward2 = 0
        if not done:
            obs, reward2, done, _ = self.env.step(actions)
        self.features = np.array(obs).reshape((1, -1))
        self.features = np.concatenate(
            [self.features, self.features - self.last_obs], axis=1)
        self.last_obs = np.array(obs).reshape((1, -1))
        self.reward = reward1 + reward2
        self.total_reward += self.reward
        self.done = done
        self.timestamp += 1

    def get_total_reward(self):
        return self.total_reward
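A hedged usage sketch for OsimAdapter (added here, not part of the original). Note that step() stores its results on the adapter rather than returning them, so the loop reads adapter.features and adapter.done; the zero-centred random actions merely illustrate the shifting, scaling and clipping performed inside step().

import numpy as np

# Illustrative random rollout with the OsimAdapter above.
adapter = OsimAdapter()
features = adapter.reset(difficulty=0)
n_actions = len(adapter.get_action_space())   # 18 muscle excitations
while not adapter.done:
    actions = np.random.uniform(-0.5, 0.5, size=n_actions)
    adapter.step(actions)                     # updates .features, .reward, .done
    features = adapter.features               # [obs, obs - last_obs], shape (1, 82)
print("total reward:", adapter.get_total_reward())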
Example #9
    def test_actions(self):
        env = RunEnv(visualize=False)
        env.reset()
        v = env.action_space.sample()
        v[0] = 1.5
        v[1] = -0.5
        observation, reward, done, info = env.step(v)
        self.assertLessEqual(env.last_action[0], 1.0)
        self.assertGreaterEqual(env.last_action[1], 0.0)
Example #10
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        from osim.env import RunEnv
        # RunEnv = runenv_with_alternative_obstacle_generation_scheme()
        e = RunEnv(visualize=False, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as err:
        print('error on start of standalone')
        traceback.print_exc()
        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        cq.put(('error', e))

    def floatify(np):
        return [float(np[i]) for i in range(len(np))]

    try:
        while True:
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')

            if msg[0] == 'reset':
                o = e.reset(difficulty=0)
                cq.put(floatify(o))
            elif msg[0] == 'step':
                o, r, d, i = e.step(msg[1])
                o = floatify(o)  # floatify the observation
                cq.put((o, r, d, i))
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process
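A parent-side sketch of the queue protocol used by the worker above (added here for illustration): pq and cq are assumed to be multiprocessing.Queue objects carrying 'reset'/'step' requests in one direction and observations/transitions in the other, and plock a multiprocessing.Lock; the short random rollout is not part of the original code.

import multiprocessing as mp
import numpy as np

if __name__ == '__main__':
    pq, cq = mp.Queue(), mp.Queue()   # parent -> worker, worker -> parent
    plock = mp.Lock()
    worker = mp.Process(target=standalone_headless_isolated, args=(pq, cq, plock))
    worker.start()

    pq.put(('reset',))
    observation = cq.get()            # list of floats
    for _ in range(100):
        action = np.random.uniform(0.0, 1.0, 18).tolist()
        pq.put(('step', action))
        observation, reward, done, info = cq.get()
        if done:
            pq.put(('reset',))
            observation = cq.get()

    pq.put(('exit',))                 # any unrecognised message makes the worker clean up and exit
    worker.join()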
Example #11
def standalone_headless_isolated(conn, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', conn)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()

        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        conn.send(('error', e))

    def floatify(np):
        return [float(np[i]) for i in range(len(np))]

    try:
        while True:
            msg = conn.recv()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')

            if msg[0] == 'reset':
                o = e.reset(difficulty=2)
                conn.send(floatify(o))
            elif msg[0] == 'step':
                ordi = list(e.step(msg[1]))
                ordi[0] = floatify(ordi[0])  # floatify the observation
                conn.send(ordi)
            else:
                conn.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process
Example #12
class OsimEnv(Env):
    def __init__(self,
                 visualize=True,
                 test=False,
                 step_size=0.01,
                 processor=None,
                 timestep_limit=1000):
        self.visualize = visualize
        self._osim_env = RunEnv(visualize=visualize)
        self._osim_env.stepsize = step_size
        self._osim_env.spec.timestep_limit = timestep_limit
        self._osim_env.horizon = timestep_limit
        # self._osim_env.integration_accuracy = 1e-1
        if test:
            self._osim_env.timestep_limit = 1000
        self.processor = processor
        print "stepsize: " + str(self._osim_env.stepsize)

    def reset(self, seed=None, difficulty=2):
        observation = self._osim_env.reset(seed=seed, difficulty=difficulty)
        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, 0.0, False, dict())

        return observation

    def step(self, action):
        if self.processor:
            action = self.processor.process_action(action)

        observation, reward, done, info = self._osim_env.step(action)

        if self.processor:
            observation, reward, done, info = self.processor.process_step(
                observation, reward, done, info)

        return observation, reward, done, info

    def get_observation_dim(self):
        return len(self.reset())

    def get_action_dim(self):
        nb_actions = self._osim_env.action_space.shape[0]
        return nb_actions

    # FOR PICKLING
    def __setstate__(self, state):
        self.__init__(visualize=state['visualize'])

    def __getstate__(self):
        state = {'visualize': self.visualize}
        return state
Example #13
class LTR(BasicTask):
    name = 'LearningToRun'
    success_threshold = 2000
    def __init__(self):
        BasicTask.__init__(self)
        self.env = RunEnv(visualize=False)

    def step(self, action):
        action = np.clip(action, 0, 1)
        next_state, reward, done, info = self.env.step(action)
        return np.asarray(next_state) / math.pi, reward, done, info

    def reset(self):
        state = self.env.reset(difficulty=0, seed=np.random.randint(0, 10000000))
        return np.asarray(state) / math.pi
Example #14
def test(args):
    print('start testing')

    ddpg = DDPG()
    ddpg.load_model(args.model, load_memory=False)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obs)

    np.random.seed(args.seed)
    for i in range(1):
        step = 0
        state = env.reset(difficulty=2)
        fg = FeatureGenerator()

        state = fg.gen(state)
        #obs = fg.traj[0]
        #print(obs.left_knee_r, obs.right_knee_r)

        ep_reward = 0
        ep_memories = []
        while True:
            action = ddpg.select_action(list(state))
            next_state, reward, done, info = env.step(action.tolist())
            next_state = fg.gen(next_state)

            #obs = fg.traj[0]
            #print(obs.left_knee_r, obs.right_knee_r)

            print('step: {0:03d}'.format(step), end=', action: ')
            for act in action:
                print('{0:.3f}'.format(act), end=', ')
            print()

            state = next_state
            ep_reward += reward
            step += 1

            print('reward:', ep_reward)

            if done:
                break

        print('\nEpisode: {} Reward: {}, n_steps: {}'.format(
            i, ep_reward, step))
Example #15
def standalone_headless_isolated(conn,
                                 visualize,
                                 n_obstacles,
                                 run_logs_dir,
                                 additional_info,
                                 higher_pelvis=0.65):
    try:
        e = RunEnv(visualize=visualize, max_obstacles=n_obstacles)
        if higher_pelvis != 0.65:
            bind_alternative_pelvis_judgement(e, higher_pelvis)
        e = MyRunEnvLogger(e,
                           log_dir=run_logs_dir,
                           additional_info=additional_info)

        while True:
            msg = conn.recv()

            # messages should be tuples,
            # msg[0] should be string

            if msg[0] == 'reset':
                o = e.reset(difficulty=msg[1], seed=msg[2])
                conn.send(o)
            elif msg[0] == 'step':
                ordi = e.step(msg[1])
                conn.send(ordi)
            elif msg[0] == 'close':
                e.close()
                conn.send(None)

                import psutil
                current_process = psutil.Process()
                children = current_process.children(recursive=True)
                for child in children:
                    child.terminate()
                return
    except Exception as e:
        import traceback
        print(traceback.format_exc())
        conn.send(e)
Example #16
class GameManager:
    def __init__(self, game_name, display):
        self.game_name = game_name
        self.display = display

        # self.env = gym.make(game_name)
        self.env = RunEnv(self.display)
        self.reset()

    def reset(self):
        observation = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        observation, reward, done, info = self.env.step(action)
        return observation, reward, done, info

    def _update_display(self):
        # if self.display:
        #     self.env.render()
        return
Example #17
def main():

    env = RunEnv(visualize=True)
    env.close()

    with open('save.p', 'rb') as f:
        population = pickle.load(f)

    nn = population[0][0]
    total_reward = 0
    observation = env.reset()

    total_reward = 0
    observation = env.reset()
    for i in range(200):
        step = nn.compute(i)
        observation, reward, done, info = env.step(step)

        total_reward += reward
        if done:
            break

    print(total_reward)
Example #18
def test(actor, critic, args, act_update_fn):
    act_fn, _, _ = act_update_fn(actor, critic, None, None, args)
    env = RunEnv(visualize=args.visualize, max_obstacles=args.max_obstacles)

    all_episode_metrics = []
    for episode in range(args.num_episodes):
        episode_metrics = {
            "reward": 0.0,
            "step": 0,
        }

        observation_handler = create_observation_handler(args)
        action_handler = create_action_handler(args)
        observation = env.reset(difficulty=2, seed=SEEDS[episode % len(SEEDS)])
        action = np.zeros(ACTION_SHAPE, dtype=np.float32)
        observation = observation_handler(observation, action)

        done = False
        while not done:
            print(episode_metrics["reward"])
            action = act_fn(observation)

            observation, reward, done, _ = env.step(action_handler(action))

            episode_metrics["reward"] += reward
            episode_metrics["step"] += 1

            if done:
                break

            observation = observation_handler(observation, action)

        all_episode_metrics.append(episode_metrics)

    df = pd.DataFrame(all_episode_metrics)
    pprint(df.describe())
Example #19
    #     vec=[0.3]*18
    #     val = 0.9
    #     vec[7] = val
    #     # vec[16] = val
    # elif ctr < 105:
    #     vec=[0.2]*18
    #     val = 0.9
    #     vec[9] = val
    # elif ctr < 115:
    #     vec=[0.2]*18 #contract upper muscles
    #     val = 0.9
    #     vec[16] = val
    # elif ctr < 125:
    #     vec=[0.0]*18
    #     val = 0.9
    #     vec[0] = val
    #     vec[9] = val
    # else:
    #     vec=[0.0]*18

    # print (observation)
    return vec


ctr = 0
for i in range(1000):
    observation, reward, done, info = env.step(my_controller(observation, ctr))
    ctr += 1
    ctr = ctr % 100
    print(ctr)
Example #20
from osim.env import RunEnv

env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)
for i in range(200):
    observation, reward, done, info = env.step(env.action_space.sample())
    print(reward)
    if done:
        break
Example #21
# pdb.set_trace()
# print max_action_steps
# for i in range(max_action_steps):
#     # print type(my_controller(observation, i)[0])
#     observation, reward, done, info = env.step(my_controller(observation, i))
total_reward = 0
# print max_action_steps
# for i in range(min(max_action_steps, 500)):
#     # print type(my_controller(observation, i)[0])
#     observation, reward, done, info = env.step(my_controller(observation, i))
#     if (observation[2] < 0.65):
#         break
#     total_reward += reward
#     print(total_reward)

i = 0
while True:
    observation, reward, done, info = env.step(
        my_controller(observation, i % max_action_steps))
    total_reward += reward
    print('Total reward', total_reward, 'Iter', i)
    i += 1
    if (observation[2] < 0.65):
        break

print("Terminating")

# print observation

# print observation
Example #22
def train(rank, args, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [0] * 41
    last_v = [0] * 10
    #last_state = numpy.zeros(48)

    env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    #running_state = ZFilter((num_inputs,), clip=5)

    start_time = time.time()

    for i_episode in range(args.start_epoch + 1, 999999):
        #print(shared_obs_stats.n[0])
        #print('hei')
        #if rank == 0:
        #    print(running_state.rs._n)

        signal_init = traffic_light.get()
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            #state = numpy.array(state)

            last_state, last_v, state = process_observation(
                last_state, last_v, state)

            state = numpy.array(state)

            #state = running_state(state)

            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()

            #print(state)
            #return

            #print(AA)

            #print(type(AA))
            #print(type(state))
            #print(AA.shape)
            #print(state.shape)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    #action = select_action_actor_critic(state,ac_net)
                    #action = action.data[0].numpy()
                    #state = state + numpy.random.rand(args.feature)*0.001

                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()
                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A
                BB = numpy.append(action, action)
                next_state, A, done, _ = env.step(BB)
                reward += A
                #print(next_state)
                #last_state = process_observation(state)
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)

                next_state = numpy.array(next_state)
                #print(next_state)
                #print(next_state.shape)
                #return
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)

                #next_state = running_state(next_state)

                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()

                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)

        counter.increment()

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):

            print(
                'TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'
                .format(
                    i_episode,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_batch))

            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass
Example #23
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...',pq,cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=True,max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()

        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        # conn.send(('error',e))
        # conn.put(('error',e))
        cq.put(('error',e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        previous_o = None
        while True:
            # msg = conn.recv()
            # msg = conn.get()
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')

            if msg[0] == 'reset': #or (previous_o==None and msg[0]=='step'):
                o = e.reset(difficulty=0)
                o = floatify(o)
                o_processed = generate_observation(o, o)
                previous_o = o
                cq.put(o_processed)

            elif msg[0] == 'step':
                actions = msg[1]
                o,r,d,i = e.step(np.array(actions))
                o = floatify(o) # floatify the observation
                o_processed = generate_observation(o, previous_o)
                previous_o = o
                cq.put((o_processed, r, d, i))
            elif msg[0] == 'action_space':
                a_s = e.action_space
                r_a_s = (a_s.low.tolist(), a_s.high.tolist(), a_s.shape)
                cq.put(r_a_s)
            elif msg[0] == 'observation_space':
                o_s = get_observation_space()
                r_o_s = (o_s['low'].tolist(), o_s['high'].tolist(),o_s['shape'])
                cq.put(r_o_s)
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return # end process
Example #24
    print("Initializing new best")

w_try = copy.deepcopy(w)
best_reward = 0.
runs = 500
unev_runs = 0

print("Baseline, run with w_best")
observation = env.reset(difficulty=0)
total_reward = 0.0
for i in range(500):
    i *= 0.01
    if i > 2:
        i -= 2
        observation, reward, done, info = env.step(input(w_best, i))
        reward -= 1.38
        T = 2
    else:
        # make a step given by the controller and record the state and the reward
        observation, reward, done, info = env.step(input(w_first, i))
    total_reward += reward
    if done:
        break
best_reward = total_reward

# Your reward is
print("Total reward %f" % total_reward)

for run in range(runs):
Example #25
class WrapperEnv():
    def __init__(self,
                 game='l2r',
                 visualize=False,
                 max_obstacles=10,
                 skip_count=1):
        self.env = RunEnv(visualize=visualize, max_obstacles=max_obstacles)
        self.step_count = 0
        self.old_observation = None
        self.skip_count = 1  # skip_count  # 4
        self.last_x = 0
        self.current_x = 0
        self.observation_space_shape = (76, )
        self.action_space = self.env.action_space
        self.difficulty = 2

    def obg(self, plain_obs):
        # observation generator
        # derivatives of observations extracted here.
        processed_observation, self.old_observation = go(plain_obs,
                                                         self.old_observation,
                                                         step=self.step_count)
        return np.array(processed_observation)

    def process_action(self, action):
        processed_action = [(v + 1.0) / 2 for v in action]
        return processed_action

    def step(self, action):
        action = [float(action[i]) for i in range(len(action))]
        action = self.process_action(action)

        import math
        for num in action:
            if math.isnan(num):
                print('NaN met', action)
                raise RuntimeError('this is bullshit')

        sr = 0
        sp = 0
        o, oo = [], []
        d, i = 0, 0
        self.last_x = self.current_x
        for j in range(self.skip_count):
            self.step_count += 1
            oo, r, d, i = self.env.step(action)
            self.current_x = oo[1]
            headx = oo[22]
            px = oo[1]
            py = oo[2]
            kneer = oo[7]
            kneel = oo[10]
            lean = min(0.3, max(0, px - headx - 0.15)) * 0.05
            joint = sum([max(0, k - 0.1)
                         for k in [kneer, kneel]]) * 0.03  # * 0.03
            penalty = lean + joint

            o = self.obg(oo)
            sr += r
            sp += penalty

            if d is True:
                break
        res = [o, sr, d, sp]
        # res = [o, sr, d, i]
        return res

    def reset(self, difficulty=2):
        self.difficulty = difficulty
        self.step_count = 0
        self.old_observation = None
        oo = self.env.reset(difficulty=difficulty)
        self.last_x = oo[1]
        self.current_x = oo[1]
        o = self.obg(oo)
        return o

    def seed(self, s):
        self.env.seed(s)
Example #26
        -0.12318915143209062, 0.8572259102524259, 0.8941775106918655,
        -0.01404221329096258, 0.2295314378679483, -0.021037075157206642,
        -0.681491323768328, 0.31352610722416563, -0.7920196539712908,
        -0.1582820172462255, 0.311412855895345, -0.10984746585998507,
        -0.02296411197489962, -0.00802550380398804, -0.0017461366413788204,
        -0.3041740263231416, 0.016811307539095512, 0.1819317051058162,
        0.7530491584560023, 0.976491955750641, 0.21107478867080567,
        -0.014844146458944022, 0.898891834890124, 1.5194984121043213,
        0.8572259102524259, 0.8941775106918655, 0.7562374616756263,
        0.9883207301533766, 0.8043896105729741, -0.02441792460823137,
        0.2730685561935682, 0.051075198992798276, 0.6779778525960489,
        0.029056095674362847, 0.2780392718810951, 0.18824186908999707, 1, 1,
        100, 0, 0
    ]

    if visualize:
        manager = opensim.Manager(self.osim_model.model)
        manager.setInitialTime(-0.00001)
        manager.setFinalTime(0.0)
        manager.integrate(state)


env = RunEnv(visualize=True)

env.__init__ = types.MethodType(modified_init, env)
observation = env.reset(difficulty=0)

for i in range(200):
    observation, reward, done, info = env.step([1.0] * 18)
    print(reward)

Example #27
    print("Initializing new best")

w_try = copy.deepcopy(w)
best_reward = 0.
runs = 500
unev_runs = 0

print("Baseline, run with w_best")
observation = env.reset(difficulty=0)
total_reward = 0.0
for i in range(500):
    i *= 0.01
    if i > 2:
        i -= 2
        observation, reward, done, info = env.step(input(w_best, i))
        T = 2
    else:
        # make a step given by the controller and record the state and the reward
        observation, reward, done, info = env.step(input(w_first, i))
    total_reward += reward
    if done:
        break
best_reward = total_reward

# Your reward is
print("Total reward %f" % total_reward)


for run in range(runs):
Example #28
# 1. find the accordingly words "env.step"
# 2. find&build my controller.
# 3. Does the origin model count the body impair into the reward? (check github)
# 4. range & iteration. How to judge the failure.
'''
 If the reward is the distance, every step has a reward ?
 Can we know each part of the reward ?
'''

from osim.env import RunEnv

env = RunEnv(visualize=True)
observation = env.reset(difficulty=0)

for j in range(3):
    print("-------")
    for i in range(2):
        eas = env.action_space.sample()
        o, r, d, i = env.step(eas)
        print("eas")
        print(eas)
        print("o")
        print(o)
        print('r')
        print(r)
        print('d')
        print(d)
        print('i')
        print(i)
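Following up on the questions in the comment block at the top of this example, here is a small added sketch that logs the scalar reward returned by every call to env.step, the running total, and the info dict, so the per-step reward can be inspected directly; it assumes the same osim-rl setup as above.

from osim.env import RunEnv

env = RunEnv(visualize=False)
observation = env.reset(difficulty=0)

total_reward = 0.0
for step in range(100):
    observation, reward, done, info = env.step(env.action_space.sample())
    total_reward += reward
    print("step {:3d}  reward {:+.4f}  total {:+.4f}  info {}".format(
        step, reward, total_reward, info))
    if done:
        break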
Example #29
def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48

    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)

            state = numpy.array(state)
            #global last_state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            #state = running_state(state)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()

                BB = numpy.append(action, action)
                #print(BB)

                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A

                next_state, A, done, _ = env.step(BB)
                reward += A
                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)

                next_state = numpy.array(next_state)
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)
                #next_state = running_state(next_state)
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        #print('backpropagate:')
        #print(time.time()-timer)

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):

            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.
                  format(i_episode, reward_sum, reward_batch))
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
Example #30
def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            #global last_state
            #last_state = state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                #timer = time.time()
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    puts('ERROR')
                    return
                #print('NN take:')
                #print(time.time()-timer)
                #print(action)
                #print("------------------------")

                #timer = time.time()
                if args.skip:
                    #env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward

                #print('env take:')
                #print(time.time()-timer)

                #timer = time.time()

                #last_state ,next_state = update_observation(last_state,next_state)
                next_state = running_state(next_state)
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                #print('update take:')
                #print(time.time()-timer)

                #timer = time.time()

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #print('memory take:')
                #print(time.time()-timer)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state

            num_steps += (t - 1)
            num_episodes += 1
            #print(num_episodes)
            reward_batch += reward_sum

        #print(num_episodes)
        reward_batch /= num_episodes
        batch = memory.sample()

        #update_params_actor_critic(batch,args,shared_model,ac_net,opt_ac)
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            File = open(PATH_TO_MODEL + '/record.txt', 'a+')
            File.write("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            File.close()
            #print('TestEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
            #    i_episode, reward_sum, reward_batch))
            print("Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch))
            #print('!!!!')

        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, 'best')

        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, epoch)
Example #31
    def test_first_obs(self):
        env = RunEnv(visualize=False)
        observation_start = env.reset()
        observation, reward, done, info = env.step(env.action_space.sample())
        self.assertAlmostEqual(observation_start[-1], observation[-1])
        self.assertAlmostEqual(observation_start[-2], observation[-2])