Example #1
    def playonce(remote_env):
        from multi import fastenv

        fenv = fastenv(remote_env, 2)
        #print(type(agent))
        agent.play(fenv)
        remote_env.rel()
        del fenv
Example #2
    def playonce(nl, remote_env):
        from multi import fastenv

        fenv = fastenv(remote_env, 2)
        #print(type(agent))
        agent.play(fenv, realtime=False, max_steps=-1, noise_level=nl)
        remote_env.rel()
        del fenv
Example #3
    def playonce(nl, env):
        from multi import fastenv

        # global noise_level
        # env = farmer.acq_env()
        fenv = fastenv(env, 4)
        agent.play(fenv, realtime=False, max_steps=-1, noise_level=nl)
        # epl.rel_env(env)
        env.rel()
        del fenv
Example #4
    def test(skip=1):
        # e = p.env
        te = RunEnv(visualize=True, max_obstacles=10)
        from multi import fastenv

        fenv = fastenv(te, skip)  # skip is the frame-skip factor
        agent.render = True
        try:
            agent.play(fenv, realtime=True, max_steps=-1, noise_level=1e-11)
        except:
            pass
        finally:
            del te
Example #5
    def get_env(self):  # obtain a new environment on demand
        global farmer
        while 1:
            remote_env = farmer.acq_env()
            if remote_env is False:  # no free environment
                time.sleep(0.1)
            else:
                if hasattr(self, 'remote_env'):
                    del self.remote_env  # release previous before allocate new

                self.remote_env = remote_env
                from multi import fastenv
                fenv = fastenv(remote_env, 2)
                # a skip of 2; also performs observation processing
                return fenv
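
The fastenv wrapper imported from multi is not shown in these examples. As the comment above notes, it applies a frame-skip (each chosen action is repeated for a fixed number of simulation steps, with rewards summed) and also performs observation processing. Below is a minimal sketch of such a wrapper, assuming a Gym-style reset()/step() interface; the FrameSkipEnv name and the process_observation hook are illustrative, not the actual multi.fastenv.

import numpy as np

class FrameSkipEnv:
    # Sketch of a frame-skip wrapper in the spirit of fastenv(env, skip).
    def __init__(self, env, skip=2):
        self.env = env
        self.skip = skip

    def process_observation(self, obs):
        # Placeholder for the observation processing mentioned above.
        return np.asarray(obs, dtype=np.float32)

    def reset(self):
        return self.process_observation(self.env.reset())

    def step(self, action):
        # Repeat the action for `skip` frames and accumulate the reward.
        total_reward, done, info = 0.0, False, {}
        for _ in range(self.skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return self.process_observation(obs), total_reward, done, info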
Example #6
    def test(skip=1):
        # e = p.env
        te = RunEnv(visualize=False)
        from multi import fastenv

        fenv = fastenv(te, skip)  # skip is the frame-skip factor
        agent.render = True
        agent.training = False
        try:
            #print('playing')
            #agent.play(fenv,realtime=True,max_steps=-1,noise_level=1e-11)
            playifavailable(0)
        except:
            pass
        finally:
            del te
Example #7
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes

    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0

    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            # print("observation shape:", observation.shape)
            action = agent.select_action(observation, noise_level=noise_level)

        # env response with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation,
                                                    args.pic)

        # print("observation = ", observation)
        # print("reward = ", reward)
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length
                     and max_episode_length)):  # end of episode
            # [optional] save
            # if args.env == "Paint":
            # writer.add_image(str(episode) + '.png', env.canvas)
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv,
                                               agent.select_action,
                                               debug=debug,
                                               visualize=False)
                    if debug:
                        prRed(
                            'Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                                step, np.mean(validate_reward),
                                np.var(validate_reward)))
                    writer.add_scalar('validate/reward',
                                      np.mean(validate_reward), step)
                    writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'.format(
                    episode, episode_reward, step, noise_level, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)

            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    if debug: prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)
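
The queue() object used as episode_memory above and its getObservation(window_length, observation, args.pic) call come from elsewhere in the repository; from the way they are used, the memory keeps recent observations and stacks the last window_length of them into the input fed to the agent (the args.pic flag presumably switches to image stacking and is omitted below). A small sketch under that assumption; the ObservationWindow name and its padding behaviour are illustrative, not the repository's actual class.

import numpy as np
from collections import deque

class ObservationWindow:
    # Sketch of an episode memory that stacks the last window_length observations.
    def __init__(self, maxlen=10000):
        self.buffer = deque(maxlen=maxlen)

    def clear(self):
        self.buffer.clear()

    def append(self, observation):
        self.buffer.append(np.asarray(observation, dtype=np.float32))

    def getObservation(self, window_length, observation):
        # Take the last window_length observations; pad with the current one
        # when the episode has just started and the window is not yet full.
        window = list(self.buffer)[-window_length:]
        while len(window) < window_length:
            window.insert(0, np.asarray(observation, dtype=np.float32))
        return np.concatenate(window, axis=-1)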
Example #8
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes

    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            # print("observation shape:", observation.shape)            
            action = agent.select_action(observation, noise_level=noise_level)
            
        # env response with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        
        # print("observation = ", observation)
        # print("reward = ", reward)
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)): # end of episode
            # [optional] save
            # if args.env == "Paint":
                # writer.add_image(str(episode) + '.png', env.canvas)       
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv, agent.select_action, debug=debug, visualize=False)
                    if debug: prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(step, np.mean(validate_reward), np.var(validate_reward)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if args.env == "Paint":
                        writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'.format(
                    episode, episode_reward, step, noise_level, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            
            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
            
    if debug: prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)
Example #9
        env = NormalizedEnv(gym.make(args.env))

    # input random seed
    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    # input states count & actions count
    print(env.observation_space.shape, env.action_space.shape)
    nb_states = env.observation_space.shape[0]
    if args.discrete:
        nb_actions = env.action_space.n
    else:
        nb_actions = env.action_space.shape[0]

    env = fastenv(env, args.action_repeat, args.vis)
    agent = DDPG(nb_states, nb_actions, args, args.discrete, args.cuda)
    evaluate = Evaluator(args.validate_episodes, max_episode_length=args.max_episode_length)

    if args.vis and args.env == 'HalfCheetahBulletEnv-v0':
        env.render()
    
    if args.test is False:
        train(args.train_iter, agent, env, evaluate, 
              args.validate_interval, args.output, args.window_length, max_episode_length=args.max_episode_length,
              debug=args.debug, visualize=args.vis, traintimes=args.traintimes, resume=args.resume)

    else:
        test(args.validate_episodes, agent, env, evaluate, args.resume, args.window_length, 
             visualize=True, debug=args.debug)
Example #10
    def playonce(self, env, T):
        from multi import fastenv
        fenv = fastenv(env, 4)
        self.play(fenv, T)
        env.rel()
        del fenv
Example #11
def train(num_iterations, agent, env):
    fenv = fastenv(env, args.action_repeat)
    window_length = args.window_length
    save_interval = args.save_interval
    debug = args.debug
    output = args.output

    time_stamp = 0.
    log = 0
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = args.noise_level * random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0

    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)

        # env response with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)

        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done:
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug:
                        prRed('[Save model] #{} in {}'.format(
                            save_num, args.output))
                    agent.save_model(output, save_num)

            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)

            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} real noise_level:{:.2f} interval_time:{:.2f} train_time:{:.2f}'.format(
                    episode, episode_reward, step, noise_level, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)

            # reset
            noise_level = args.noise_level * random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
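
In these training loops the exploration noise scale is resampled at the start of every episode (noise_level = args.noise_level * random.uniform(0, 1) / 2.) and passed to agent.select_action. The agent itself is not shown; a common reading of such an argument is that Gaussian noise of that magnitude is added to the deterministic policy output and the result is clipped to the action bounds. A hedged sketch of that pattern (the function below is hypothetical, not the repository's DDPG agent):

import numpy as np

def select_action_with_noise(policy_action, noise_level, action_low=-1.0, action_high=1.0):
    # Add Gaussian exploration noise scaled by noise_level to a deterministic action,
    # then clip to the action bounds. Illustrative only; agent.select_action is not shown here.
    noisy = policy_action + noise_level * np.random.randn(*np.shape(policy_action))
    return np.clip(noisy, action_low, action_high)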
Example #12
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    ace = args.ace
    validate_episodes = args.validate_episodes

    # [optional] Actor-Critic Ensemble https://arxiv.org/pdf/1712.08987.pdf
    if ace != 1:
        ensemble = ACE(nb_status, nb_actions, args)
    
    if resume is not None:
        print('load weight')
        if ace != 1:
            ensemble.load(output)
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        print('memory saving...')
        agent.memory.save(output)
        agent.save_model(output, 0)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)
            
        # env response with next_observation, reward, terminate_info

        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        
        # print("observation shape = ", np.shape(observation))
        # print("observation = ", observation)
        # print("reward = ", reward)
        # exit()       
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)): # end of episode
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug: prRed('[Save model] #{}'.format(save_num))
                    agent.save_model(output, save_num)
                    if ace != 1:
                        ensemble.append(output, save_num)

                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(env, agent.select_action, debug=debug, visualize=False)
                    if debug: prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(step, np.mean(validate_reward), np.var(validate_reward)))
                    if ace != 1 and save_num >= 1:
                        validate_reward2 = evaluate(env, ensemble, debug=debug, visualize=False)
                        if debug: prRed('ACE Step_{:07d}: mean_reward:{} reward_var:{}'.format(step, np.mean(validate_reward2), np.var(validate_reward2)))
#                    for i in range(validate_episodes):
#                        validate_num += 1
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if ace != 1 and save_num >= 1:
                        writer.add_scalar('validate/ACE_reward', np.mean(validate_reward2), step)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'.format(
                    episode, episode_reward, step, noise_level, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            
            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    sigint_handler(0, 0)
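
The ACE class used above follows the Actor-Critic Ensemble idea linked in the comment (https://arxiv.org/pdf/1712.08987.pdf): several trained actors each propose an action for the current observation, an ensemble of critics scores the proposals, and the highest-scoring action is executed. The class itself is not part of this example; a hedged sketch of that selection rule, with hypothetical actors and critics callables:

import numpy as np

def ace_select_action(observation, actors, critics):
    # Each actor proposes an action; each critic scores (observation, action);
    # the proposal with the highest mean critic value is returned.
    # Illustrative sketch only, not the ACE class used above.
    candidates = [actor(observation) for actor in actors]
    scores = [np.mean([critic(observation, a) for critic in critics]) for a in candidates]
    return candidates[int(np.argmax(scores))]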
Example #13
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    ace = args.ace
    validate_episodes = args.validate_episodes

    # [optional] Actor-Critic Ensemble https://arxiv.org/pdf/1712.08987.pdf
    if ace != 1:
        ensemble = ACE(nb_status, nb_actions, args)

    if resume is not None:
        print('load weight')
        if ace != 1:
            ensemble.load(output)
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        print('memory saving...')
        agent.memory.save(output)
        agent.save_model(output, 0)
        print('done')
        exit()

    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0

    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)

        # env response with next_observation, reward, terminate_info

        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation,
                                                    args.pic)

        # print("observation shape = ", np.shape(observation))
        # print("observation = ", observation)
        # print("reward = ", reward)
        # exit()
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length
                     and max_episode_length)):  # end of episode
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug: prRed('[Save model] #{}'.format(save_num))
                    agent.save_model(output, save_num)
                    if ace != 1:
                        ensemble.append(output, save_num)

                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(env,
                                               agent.select_action,
                                               debug=debug,
                                               visualize=False)
                    if debug:
                        prRed(
                            'Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                                step, np.mean(validate_reward),
                                np.var(validate_reward)))
                    if ace != 1 and save_num >= 1:
                        validate_reward2 = evaluate(env,
                                                    ensemble,
                                                    debug=debug,
                                                    visualize=False)
                        if debug:
                            prRed(
                                'ACE Step_{:07d}: mean_reward:{} reward_var:{}'
                                .format(step, np.mean(validate_reward2),
                                        np.var(validate_reward2)))


#                    for i in range(validate_episodes):
#                        validate_num += 1
                    writer.add_scalar('validate/reward',
                                      np.mean(validate_reward), step)
                    if ace != 1 and save_num >= 1:
                        writer.add_scalar('validate/ACE_reward',
                                          np.mean(validate_reward2), step)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'.format(
                    episode, episode_reward, step, noise_level, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)

            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    sigint_handler(0, 0)
Example #14
    def real_test(skip=1):
        def obg(plain_obs):
            # observation generator
            # derivatives of observations extracted here.
            #print('pg multi.py 21, plain_obs:', len(plain_obs))

            #processed_observation, self.old_observation = go(plain_obs, self.old_observation, step=self.stepcount)

            observation = plain_obs
            obs = []

            obs.extend(observation['misc']['mass_center_pos'])  # x, y, z
            obs.extend(observation['misc']['mass_center_vel'])  # x, y, z
            obs.extend(observation['misc']['mass_center_acc'])  # x, y, z

            # joint body, positions and vels relative to pelvis

            # Absolute Joint Positions
            obs.extend(observation['joint_pos']['ground_pelvis'])

            obs.extend(observation['joint_pos']['hip_r'])
            obs.extend(observation['joint_pos']['knee_r'])
            obs.extend(observation['joint_pos']['ankle_r'])

            obs.extend(observation['joint_pos']['hip_l'])
            obs.extend(observation['joint_pos']['knee_l'])
            obs.extend(observation['joint_pos']['ankle_l'])
            '''

            # Relative Joint Positions
            #print(observation['joint_pos']['ground_pelvis'])
            obs.extend(observation['joint_pos']['ground_pelvis']) # 6 elements

            #print(rel_to_A(observation['joint_pos']['hip_r'], observation['body_pos']['pelvis']))
            obs.extend(rel_to_A(observation['joint_pos']['hip_r'], observation['body_pos']['pelvis'])) # 3e
            obs.extend(rel_to_A(observation['joint_pos']['knee_r'], observation['body_pos']['pelvis'])) # 1e
            obs.extend(rel_to_A(observation['joint_pos']['ankle_r'], observation['body_pos']['pelvis'])) # 1e

            obs.extend(rel_to_A(observation['joint_pos']['hip_l'], observation['body_pos']['pelvis'])) # 3e
            obs.extend(rel_to_A(observation['joint_pos']['knee_l'], observation['body_pos']['pelvis'])) # 1e
            obs.extend(rel_to_A(observation['joint_pos']['ankle_l'], observation['body_pos']['pelvis'])) # 1e
            '''

            # Absolute Joint Vel

            obs.extend(observation['joint_vel']['ground_pelvis'])

            obs.extend(observation['joint_vel']['hip_r'])
            obs.extend(observation['joint_vel']['knee_r'])
            obs.extend(observation['joint_vel']['ankle_r'])

            obs.extend(observation['joint_vel']['hip_l'])
            obs.extend(observation['joint_vel']['knee_l'])
            obs.extend(observation['joint_vel']['ankle_l'])

            # Absolute Joint Acc

            obs.extend(observation['joint_acc']['ground_pelvis'])

            obs.extend(observation['joint_acc']['hip_r'])
            obs.extend(observation['joint_acc']['knee_r'])
            obs.extend(observation['joint_acc']['ankle_r'])

            obs.extend(observation['joint_acc']['hip_l'])
            obs.extend(observation['joint_acc']['knee_l'])
            obs.extend(observation['joint_acc']['ankle_l'])

            b = [
                'body_pos', 'body_vel', 'body_acc', 'body_pos_rot',
                'body_vel_rot', 'body_acc_rot'
            ]
            parts = [
                'pelvis', 'femur_r', 'pros_tibia_r', 'pros_foot_r', 'femur_l',
                'tibia_l', 'talus_l', 'calcn_l', 'toes_l', 'torso', 'head'
            ]

            for i in b:

                for j in parts:

                    obs.extend(observation[i][j])

            forces_subkeys = observation['forces'].keys()

            for k in forces_subkeys:

                obs.extend(observation['forces'][k])

            #print('pg multi.py 25, proc_obs:', len(processed_observation))

            return np.array(obs)

        import opensim as osim
        from osim.env import ProstheticsEnv as RunEnv

        #te = RunEnv(visualize=False)
        from multi import fastenv

        #env = fastenv(te,skip)
        remote_env = farmer.acq_env()
        env = fastenv(remote_env, skip)

        observation = env.reset()

        #print(observation)

        stepno = 0
        epino = 0
        total_reward = 0
        old_observation = None

        while True:

            proc_observation = observation

            a = [float(i) for i in list(agent.act(proc_observation)[0])]
            #print(a)

            observation, reward, done, info, real_reward = env.step(a)

            stepno += 1
            total_reward += reward
            print('step', stepno, 'total reward', total_reward)

            if done:

                print('>>>>>>>episode', epino, ' DONE after', stepno,
                      'got_reward', total_reward)
                break
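
Several of these examples acquire environments from a farmer object: farmer.acq_env() returns False when no environment is free, and env.rel() releases one back to the pool. The farmer is not shown here; a minimal sketch of such a pool, assuming it simply hands out pre-built environments and marks them busy (the Farmer and EnvHandle names are illustrative):

class EnvHandle:
    # Sketch of a pooled environment handle exposing the rel() method used above.
    def __init__(self, env, pool):
        self.env = env
        self._pool = pool

    def reset(self):
        return self.env.reset()

    def step(self, action):
        return self.env.step(action)

    def rel(self):
        # Return this environment to the pool so another worker can acquire it.
        self._pool.release(self)

class Farmer:
    # Sketch of the environment pool; acq_env() returns False when nothing is free.
    def __init__(self, envs):
        self._free = [EnvHandle(env, self) for env in envs]

    def acq_env(self):
        return self._free.pop() if self._free else False

    def release(self, handle):
        self._free.append(handle)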
Example #15
def train(num_iterations, agent, env):
    fenv = fastenv(env, args.action_repeat)
    window_length = args.window_length
    save_interval = args.save_interval
    debug = args.debug
    output = args.output

    time_stamp = 0.
    log = 0
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = args.noise_level * random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)
            
        # env response with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done:
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug: prRed('[Save model] #{} in {}'.format(save_num, args.output))
                    agent.save_model(output, save_num)

            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)

            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} real noise_level:{:.2f} interval_time:{:.2f} train_time:{:.2f}'.format(
                    episode, episode_reward, step, noise_level, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            
            # reset
            noise_level = args.noise_level * random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1