Example #1
    def __init__(self,
                 hp,
                 token,
                 network_actor,
                 network_critic,
                 prioritized=False,
                 initial_epsilon=None,
                 finial_epsilon=None,
                 finial_epsilon_frame=None,
                 discount_factor=None,
                 minibatch_size=None,
                 reply_start=None,
                 reply_memory_size=None,
                 target_network_update_frequency=None):
        super().__init__(
            hp,
            token,
            None,  # network_build=None
            prioritized,
            initial_epsilon,
            finial_epsilon,
            finial_epsilon_frame,
            discount_factor,
            minibatch_size,
            reply_start,
            reply_memory_size,
            target_network_update_frequency)
        self.network_actor = network_actor
        self.network_critic = network_critic
        self.actor = Actor(self.sess, network=self.network_actor)
        self.critic = Critic(self.sess, network=self.network_critic)

        write_file(self.graph_path + 'loss_exp_v_q.txt',
                   'critic loss: | actor exp_v:\n', True)
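The constructor above logs through a write_file helper whose behaviour is only implied by its call sites (a path, the content, and a trailing boolean). A minimal sketch consistent with that usage, assuming the boolean selects truncate-and-write versus append; this helper is hypothetical, not the repository's implementation:

def write_file(path, content, overwrite):
    # Assumed contract: overwrite=True starts a fresh log file,
    # overwrite=False appends to an existing one.
    mode = 'w' if overwrite else 'a'
    with open(path, mode) as f:
        f.write(str(content))

Under this reading, the call in __init__ creates the log with a header line, and the later calls in learn() append one record per update.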
Example #2
    def learn(self, incre_epsilon):
        self.learn_step_counter += 1

        # sample a minibatch of transitions from the replay memory
        samples_batch = random.sample(self.memory,
                                      self.batch_size)  # list of (s, a, r, s_) tuples
        # zip(*batch) transposes the list of tuples into one tuple per field
        observation, eval_act_index, reward, observation_ = zip(*samples_batch)

        observation = np.array(observation)
        action = np.array(eval_act_index)
        reward = np.array(reward)
        observation_ = np.array(observation_)

        td_error, loss = self.critic.learn(observation, reward, observation_)
        exp_v = self.actor.learn(observation, action, td_error)

        # print('critic loss: {0} | actor exp_v: {1}'.format(loss, exp_v))
        content = '{0} | {1}\n'.format(loss, exp_v)
        write_file(self.graph_path + 'loss_exp_v_v.txt', content, False)
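Here critic.learn() is expected to return the temporal-difference error that serves as the advantage signal for the actor, together with the critic's own loss, while actor.learn() returns the expected value of the advantage-weighted log-probability it maximises. A small NumPy sketch of that TD-error computation, with toy values and an assumed discount factor gamma (illustration only, not the classes used above):

import numpy as np

def td_error_batch(v_s, v_s_next, reward, gamma=0.9):
    # One-step TD error, used as the advantage for the policy update:
    #   delta = r + gamma * V(s') - V(s)
    return reward + gamma * v_s_next - v_s

# Toy batch: critic value estimates and observed rewards.
v_s = np.array([0.5, 1.0, 0.2])
v_s_next = np.array([0.7, 0.9, 0.0])
reward = np.array([0.0, 2.0, -1.0])

delta = td_error_batch(v_s, v_s_next, reward)
# The critic would regress V(s) toward r + gamma * V(s') (loss ~ delta ** 2),
# while the actor scales each chosen action's log-probability gradient by delta.
print(delta)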
Example #3
def train_model(brain, if_REINFORCE=False, if_a2c=False):
    env = SingleMiniShooting()

    if if_REINFORCE is True or if_a2c is True:
        log_data_list = np.arange(5).reshape((1, 5))
    else:
        log_data_list = np.arange(6).reshape((1, 6))

    saver, load_episode = restore_parameters(brain.sess, brain.graph_path)

    # train for the configured number of episodes
    total_steps = 0
    probs_list = []
    write_file(brain.graph_path + 'probs', 'probs_list\n', True)

    for i_episode in range(brain.max_episode):

        observation = env.reset()
        state = np.stack([observation] * 4)

        ep_reward = 0
        num_step = 0
        start_time = time.time()

        while True:
            # time.sleep(0.1)
            if if_a2c is True:
                action, probs = brain.actor.choose_action(state)
            elif if_REINFORCE is True:
                action, probs = brain.choose_action(state)
            else:
                action = brain.choose_action(state)

            # print('action: {}'.format(action))
            observation_, reward, done = env.step(action)
            if reward == 2:
                print('------------------------- SCORE ---------------------------')
            next_state = np.concatenate([state[1:], np.expand_dims(observation_, 0)], axis=0)

            brain.store_transition(state, action, reward, next_state)

            ep_reward += reward
            num_step += 1
            state = next_state
            total_steps += 1

            if if_REINFORCE is False and i_episode > brain.replay_start:
                brain.learn(done)

            if done:

                if if_REINFORCE is True:
                    brain.learn()
                # Logging: record the episode statistics.
                record_num = 6

                if if_REINFORCE is True or if_a2c is True:
                    record_num = 5

                    print('episode: ', i_episode, ' | reward: ', ep_reward, 'probs', probs,
                          'episode_time', time.time() - start_time)

                    probs_list.append(probs)
                    if len(probs_list) % brain.hp.OUTPUT_SAVER_ITER == 0:
                        write_file(brain.graph_path + 'probs', probs_list, False)
                        probs_list = []

                    new_data = np.array([i_episode, ep_reward, num_step, total_steps,
                                         time.time() - start_time]).reshape((1, record_num))
                    log_data_list = np.concatenate((log_data_list, new_data))

                if if_REINFORCE is False and if_a2c is False:
                    # record_num = 6
                    print('episode: ', i_episode, ' | reward: ', ep_reward, 'epsilon', brain.epsilon,
                          'episode_time', time.time() - start_time)

                    new_data = np.array([i_episode, ep_reward, num_step, total_steps, brain.epsilon,
                                         time.time() - start_time]).reshape((1, record_num))
                    log_data_list = np.concatenate((log_data_list, new_data))

                if log_data_list.shape[0] % brain.hp.OUTPUT_SAVER_ITER == 0:
                    write_ndarray(brain.graph_path + 'data', np.array(log_data_list))
                    log_data_list = log_data_list[-1, :].reshape(1, record_num)
                if i_episode % brain.hp.WEIGHTS_SAVER_ITER == 0 and i_episode != 0:
                    save_parameters(brain.sess, brain.graph_path, saver,
                                    brain.graph_path + '-' + str(load_episode + i_episode))
                break
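train_model relies on restore_parameters and save_parameters helpers that pair a tf.train.Saver with an episode counter recovered from the checkpoint name (saver, load_episode = restore_parameters(sess, path); save_parameters(sess, path, saver, save_path)). A plausible sketch written against the TF1-style session API the examples use; the bodies below are assumptions, not the repository's code:

import os
import tensorflow as tf

def restore_parameters(sess, graph_path):
    # Hypothetical helper. Assumes graph_path is a filename-style prefix
    # (e.g. 'graphs/run1'), so checkpoints live in its parent directory and
    # their names end in '-<episode>'.
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    ckpt_dir = os.path.dirname(graph_path) or '.'
    checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    if checkpoint is not None:
        saver.restore(sess, checkpoint)
        load_episode = int(checkpoint.split('-')[-1])
    else:
        sess.run(tf.compat.v1.global_variables_initializer())
        load_episode = 0
    return saver, load_episode

def save_parameters(sess, graph_path, saver, save_path):
    # Hypothetical counterpart: save_path already encodes '-<episode>',
    # so restore_parameters can recover the episode count on resume.
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    saver.save(sess, save_path)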