Example #1
    def __call__(self, env, policy, debug=False, visualize=False):

        episode_memory = queue()
        observation = None
        result = []
        for episode in range(self.validate_episodes):

            # reset at the start of episode
            episode_memory.clear()
            observation = env.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(self.window_length, observation, self.pic)
            episode_steps = 0
            episode_reward = 0.
            assert observation is not None
            # start episode
            done = False
            while not done and (episode_steps <= self.max_episode_length or not self.max_episode_length):
                action = policy(observation)
                observation, reward, done, info = env.step(action)
                episode_memory.append(observation)
                observation = episode_memory.getObservation(self.window_length, observation, self.pic)
                if visualize:
                    if self.bullet:
                        import pybullet
                        pybullet.resetDebugVisualizerCamera(cameraDistance=10, cameraYaw=0, cameraPitch=-6.6, cameraTargetPosition=[10, 0, 0])
                    env.render()
                episode_reward += reward
                episode_steps += 1
            result.append(episode_reward)
        if debug: prRed('[Evaluate] reward:{}'.format(result))
        return result
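
The queue helper used throughout these examples is never shown. The following is a minimal, hypothetical sketch of such an observation buffer, assuming getObservation simply stacks the most recent window_length entries (the pic flag toggling between frame stacking and flat concatenation is a guess):

import numpy as np
from collections import deque

class queue:
    # Hypothetical observation buffer matching the calls in the examples:
    # clear(), append(obs), getObservation(window_length, obs, pic).
    def __init__(self, maxlen=100):
        self.buffer = deque(maxlen=maxlen)

    def clear(self):
        self.buffer.clear()

    def append(self, observation):
        self.buffer.append(observation)

    def getObservation(self, window_length, observation, pic=False):
        # Take the last window_length entries, padding with the oldest
        # available one right after a reset.
        frames = list(self.buffer)[-window_length:]
        while len(frames) < window_length:
            frames.insert(0, frames[0])
        if pic:
            # stack image frames along a new leading axis
            return np.stack(frames, axis=0)
        # concatenate flat state vectors
        return np.concatenate([np.asarray(f).ravel() for f in frames])
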
Example #2
    def __call__(self,
                 env,
                 policy,
                 debug=False,
                 visualize=False,
                 window_length=1):

        episode_memory = queue()
        observation = None
        result = []

        for episode in range(self.num_episodes):

            # reset at the start of episode
            episode_memory.clear()
            observation = env.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation)
            episode_steps = 0
            episode_reward = 0.
            assert observation is not None

            # start episode
            done = False
            while not done:
                action = policy(observation)
                observation, reward, done, info = env.step(action)
                episode_memory.append(observation)
                observation = episode_memory.getObservation(
                    window_length, observation)
                if self.max_episode_length and episode_steps >= self.max_episode_length - 1:
                    done = True
                if visualize:
                    env.render()
                # update
                episode_reward += reward
                episode_steps += 1

            result.append(episode_reward)

        return np.mean(result)
Example #3
    def __call__(self, env, policy, debug=False, visualize=False):

        episode_memory = queue()
        observation = None
        result = []
        for episode in range(self.validate_episodes):

            # reset at the start of episode
            episode_memory.clear()
            observation = env.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                self.window_length, observation)
            episode_steps = 0
            episode_reward = 0.
            assert observation is not None

            # start episode
            done = False
            while not done and (episode_steps <= self.max_episode_length
                                or not self.max_episode_length):
                action = policy(observation)
                observation, reward, done, info = env.step(action)
                episode_memory.append(observation)
                observation = episode_memory.getObservation(
                    self.window_length, observation)
                if visualize:
                    if self.bullet:
                        import pybullet
                        pybullet.resetDebugVisualizerCamera(
                            cameraDistance=10,
                            cameraYaw=0,
                            cameraPitch=-6.6,
                            cameraTargetPosition=[10, 0, 0])
                    env.render()
                episode_reward += reward
                episode_steps += 1
            result.append(episode_reward)
        if debug: prRed('[Evaluate] reward:{}'.format(result))
        return result
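
A hedged usage sketch for the evaluator __call__ methods above. The Evaluator name and its constructor arguments are assumptions; the deterministic policy lambda mirrors the one built in Example #11:

import numpy as np

# Hypothetical wiring; Evaluator stands in for whatever class defines the
# __call__ methods above, and agent/env come from the surrounding training setup.
evaluate = Evaluator(validate_episodes=10, window_length=3, max_episode_length=1000)
policy = lambda x: agent.select_action(x, decay_epsilon=False, noise_level=0)
rewards = evaluate(env, policy, debug=True, visualize=False)
print('mean validation reward:', np.mean(rewards))
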
Example #4
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes

    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0

    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            # print("observation shape:", observation.shape)
            action = agent.select_action(observation, noise_level=noise_level)

        # env response with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation,
                                                    args.pic)

        # print("observation = ", observation)
        # print("reward = ", reward)
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length
                     and max_episode_length)):  # end of episode
            # [optional] save
            # if args.env == "Paint":
            # writer.add_image(str(episode) + '.png', env.canvas)
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv,
                                               agent.select_action,
                                               debug=debug,
                                               visualize=False)
                    if debug:
                        prRed(
                            'Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                                step, np.mean(validate_reward),
                                np.var(validate_reward)))
                    writer.add_scalar('validate/reward',
                                      np.mean(validate_reward), step)
                    writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack(
                    '#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                    .format(episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)

            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    if debug: prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)
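
fastenv is likewise undefined in these snippets. A minimal sketch under the assumption that it is an action-repeat wrapper that replays each action action_repeat times and sums the rewards (the vis and atari flags are accepted but ignored here):

class fastenv:
    # Hypothetical wrapper matching fenv = fastenv(env, args.action_repeat, args.vis, args.atari).
    def __init__(self, env, action_repeat, vis=False, atari=False):
        self.env = env
        self.action_repeat = action_repeat

    def reset(self):
        return self.env.reset()

    def step(self, action):
        total_reward = 0.
        for _ in range(self.action_repeat):
            observation, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return observation, total_reward, done, info
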
Example #5
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes

    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            # print("observation shape:", observation.shape)            
            action = agent.select_action(observation, noise_level=noise_level)
            
        # env response with next_observation, reward, terminate_info
        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        
        # print("observation = ", observation)
        # print("reward = ", reward)
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)): # end of episode
            # [optional] save
            # if args.env == "Paint":
                # writer.add_image(str(episode) + '.png', env.canvas)       
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv, agent.select_action, debug=debug, visualize=False)
                    if debug: prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(step, np.mean(validate_reward), np.var(validate_reward)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if args.env == "Paint":
                        writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug: prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}' \
                .format(episode,episode_reward,step,noise_level,train_time_interval,time.time()-time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            
            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
            
    if debug: prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)
Example #6
    def sigint_handler(signum, frame):
        print('memory saving...'),
        agent.memory.save(output)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    max_reward = -100000.
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    while step < num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = env.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
Example #7
def train(num_iterations, agent, env):
    fenv = fastenv(env, args.action_repeat)
    window_length = args.window_length
    save_interval = args.save_interval
    debug = args.debug
    output = args.output

    time_stamp = 0.
    log = 0
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = args.noise_level * random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0

    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)

        # env response with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)

        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done:
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug:
                        prRed('[Save model] #{} in {}'.format(
                            save_num, args.output))
                    agent.save_model(output, save_num)

            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)

            if debug:
                prBlack(
                    '#{}: train_reward:{:.3f} steps:{} real noise_level:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                    .format(episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)

            # reset
            noise_level = args.noise_level * random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
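
The global writer used for add_scalar and add_image is created outside these functions. A plausible setup, assuming the tensorboardX SummaryWriter (the log directory is a guess):

from tensorboardX import SummaryWriter

# Assumed module-level writer; the train()/evaluate code above logs scalars
# such as 'train/Q' and 'train/train_reward' through it.
writer = SummaryWriter(args.output)
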
Example #8
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    ace = args.ace
    validate_episodes = args.validate_episodes

    # [optional] Actor-Critic Ensemble https://arxiv.org/pdf/1712.08987.pdf
    if ace != 1:
        ensemble = ACE(nb_status, nb_actions, args)
    
    if resume is not None:
        print('load weight')
        if ace != 1:
            ensemble.load(output)
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        print('memory saving...'),
        agent.memory.save(output)
        agent.save_model(output, 0)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)
            
        # env response with next_observation, reward, terminate_info

        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation, args.pic)
        
        # print("observation shape = ", np.shape(observation))
        # print("observation = ", observation)
        # print("reward = ", reward)
        # exit()       
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length and max_episode_length)): # end of episode
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug: prRed('[Save model] #{}'.format(save_num))
                    agent.save_model(output, save_num)
                    if ace != 1:
                        ensemble.append(output, save_num)

                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(env, agent.select_action, debug=debug, visualize=False)
                    if debug: prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(step, np.mean(validate_reward), np.var(validate_reward)))
                    if ace != 1 and save_num >= 1:
                        validate_reward2 = evaluate(env, ensemble, debug=debug, visualize=False)
                        if debug: prRed('ACE Step_{:07d}: mean_reward:{} reward_var:{}'.format(step, np.mean(validate_reward2), np.var(validate_reward2)))
#                    for i in range(validate_episodes):
#                        validate_num += 1
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if ace != 1 and save_num >= 1:
                        writer.add_scalar('validate/ACE_reward', np.mean(validate_reward2), step)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug: prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}' \
                .format(episode,episode_reward,step,noise_level,train_time_interval,time.time()-time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            
            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    sigint_handler(0, 0)
Example #9
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    ace = args.ace
    validate_episodes = args.validate_episodes

    # [optional] Actor-Critic Ensemble https://arxiv.org/pdf/1712.08987.pdf
    if ace != 1:
        ensemble = ACE(nb_status, nb_actions, args)

    if resume is not None:
        print('load weight')
        if ace != 1:
            ensemble.load(output)
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        print('memory saving...'),
        agent.memory.save(output)
        agent.save_model(output, 0)
        print('done')
        exit()

    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0

    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)

        # env response with next_observation, reward, terminate_info

        # print("action = ", action)
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation,
                                                    args.pic)

        # print("observation shape = ", np.shape(observation))
        # print("observation = ", observation)
        # print("reward = ", reward)
        # exit()
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update
        step += 1
        episode_steps += 1
        episode_reward += reward
        if (done or (episode_steps >= max_episode_length
                     and max_episode_length)):  # end of episode
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug: prRed('[Save model] #{}'.format(save_num))
                    agent.save_model(output, save_num)
                    if ace != 1:
                        ensemble.append(output, save_num)

                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(env,
                                               agent.select_action,
                                               debug=debug,
                                               visualize=False)
                    if debug:
                        prRed(
                            'Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                                step, np.mean(validate_reward),
                                np.var(validate_reward)))
                    if ace != 1 and save_num >= 1:
                        validate_reward2 = evaluate(env,
                                                    ensemble,
                                                    debug=debug,
                                                    visualize=False)
                        if debug:
                            prRed(
                                'ACE Step_{:07d}: mean_reward:{} reward_var:{}'
                                .format(step, np.mean(validate_reward2),
                                        np.var(validate_reward2)))


#                    for i in range(validate_episodes):
#                        validate_num += 1
                    writer.add_scalar('validate/reward',
                                      np.mean(validate_reward), step)
                    if ace != 1 and save_num >= 1:
                        writer.add_scalar('validate/ACE_reward',
                                          np.mean(validate_reward2), step)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack(
                    '#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} interval_time:{:.2f} train_time:{:.2f}'
                    .format(episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)

            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    sigint_handler(0, 0)
Example #10
def train(num_iterations, agent, env):
    fenv = fastenv(env, args.action_repeat)
    window_length = args.window_length
    save_interval = args.save_interval
    debug = args.debug
    output = args.output

    time_stamp = 0.
    log = 0
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = args.noise_level * random.uniform(0, 1) / 2.
    save_num = 0
    # validate_num = 0
    
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            # print('step = ', step)
            
        # env response with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done:
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug: prRed('[Save model] #{} in {}'.format(save_num, args.output))
                    agent.save_model(output, save_num)

            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    # print('updating', i)
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss', value_loss.data.cpu().numpy(), log)

            if debug: prBlack('#{}: train_reward:{:.3f} steps:{} real noise_level:{:.2f} interval_time:{:.2f} train_time:{:.2f}' \
                .format(episode,episode_reward,step,noise_level,train_time_interval,time.time()-time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            
            # reset
            noise_level = args.noise_level * random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
Example #11
def train(num_iterations, agent, env, evaluate, validate_interval, output, window_length, max_episode_length=None,
          debug=False, visualize=False, traintimes=None, resume=None):
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        print('memory saving...'),
        agent.memory.save(output)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    max_reward = -100000.
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    while step < num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = env.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)

        # agent pick action ...
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
            
        # env response with next_observation, reward, terminate_info

        # print("action = ", action)
        observation, reward, done, info = env.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        
        # print("observation shape = ", np.shape(observation))
        # print("observation = ", observation)
        # print("reward = ", reward)
        # exit()       
        # agent observe and update policy
        agent.observe(reward, observation, done)
        # update 
        step += 1
        episode_steps += 1
        episode_reward += reward

        if done or (episode_steps >= max_episode_length - 1 and max_episode_length): # end of episode

            # [optional] evaluate
            if evaluate is not None and validate_interval > 0 and episode % validate_interval == 0:
                policy = lambda x: agent.select_action(x, decay_epsilon=False, noise_level=0)
                validate_reward = evaluate(env, policy, debug=False, visualize=False, window_length=window_length)
                writer.add_scalar('data/validate_reward', validate_reward, episode / validate_interval)
                if debug: prRed('[Evaluate and save] Step_{:07d}: mean_reward:{}'.format(step, validate_reward))
                if validate_reward > max_reward and step != 0:
                    max_reward = validate_reward
                agent.save_model(output)
            
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                log += 1
                if step > args.warmup:
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('data/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('data/critic_loss', value_loss.data.cpu().numpy(), log)
            if debug: prBlack('#{}: train_reward:{:.2f} steps:{} noise:{:.2f} time:{:.2f},{:.2f}' \
                              .format(episode,episode_reward,step,noise_level,train_time_interval,time.time()-time_stamp))
            time_stamp = time.time()
            writer.add_scalar('data/train_reward', episode_reward, episode)
            
            # reset
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1

    sigint_handler(0, 0)
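
Finally, the colored logging helpers prRed and prBlack are not defined in any of the examples. A minimal sketch, assuming they are thin wrappers around ANSI escape codes:

def prRed(msg):
    # red terminal output, used for evaluation and checkpoint messages
    print('\033[91m{}\033[0m'.format(msg))

def prBlack(msg):
    # default-colored terminal output, used for per-episode training logs
    print('\033[0m{}\033[0m'.format(msg))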