Example #1
    def train(self):
        '''
        Call this function when the episode ends.
        '''

        if not self.is_training:
            self.logger.info("Not in training mode")
            return
        else:
            self.logger.info("Update a2c policy parameters.")

        self.episodecount += 1
        self.logger.info("Sample Num so far: %s" % self.samplecount)
        self.logger.info("Episode Num so far: %s" % self.episodecount)

        # Bump the episode counter shared across all worker threads and reload the
        # latest shared policy parameters before computing gradients.
        Settings.add_count()
        globalEpisodeCount = copy.deepcopy(Settings.get_count())
        self.loadLastestPolicy()

        # Train once at least one minibatch of samples has been collected and the
        # shared episode counter hits the training frequency.
        if self.samplecount >= self.minibatch_size * 1 and globalEpisodeCount % self.training_frequency == 0:
            self.logger.info('start training...')

            assert len(Settings.global_policysaver) == Settings.global_threadsnum
            # self.dqn.reset_noise()
            # Each worker thread samples a batch, computes V-trace targets and
            # advantages, trains its local a2c network, and stores the resulting
            # gradients under its thread key.
            total_batch_size = 0
            for k, thread_policy in Settings.global_policysaver.items():
                s_batch, a_batch_one_hot, V_trace, advantage = Settings.global_hackpolicysaver[k]._sample_and_updateV()
                grad, batch_size = thread_policy.train(s_batch, a_batch_one_hot, V_trace, advantage)
                total_batch_size += batch_size
                Settings.load_grad(grad, k)

            # Sum the gradients collected from all threads, apply them to the shared
            # policy, and save the updated parameters.
            assert len(Settings.global_gradsaver) == Settings.global_threadsnum
            grads_list = Settings.grad_sum()
            self._load_and_update(grads_list, total_batch_size)
            self.savePolicyInc()
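
The helpers Settings.load_grad and Settings.grad_sum are not shown in these examples. A minimal sketch of the bookkeeping they imply, assuming each grad is a list of numpy arrays stored under its worker's key (the module-level dictionary and function names below are illustrative stand-ins, not the actual Settings module):

    import numpy as np

    _global_gradsaver = {}

    def load_grad(grad, key):
        # Store one worker's gradient list (numpy arrays) under its thread key.
        _global_gradsaver[key] = grad

    def grad_sum():
        # Element-wise sum of the gradient lists collected from every worker.
        grad_lists = list(_global_gradsaver.values())
        summed = [np.zeros_like(g) for g in grad_lists[0]]
        for grad_list in grad_lists:
            for i, g in enumerate(grad_list):
                summed[i] += g
        return summed

    # Example: two workers, each holding gradients for two parameter tensors.
    load_grad([np.ones(3), np.full((2, 2), 2.0)], 'worker_0')
    load_grad([np.ones(3), np.full((2, 2), 2.0)], 'worker_1')
    print([g.tolist() for g in grad_sum()])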
Example #2
    def train(self):
        '''
        Call this function when the episode ends.
        '''

        # print(threading.currentThread().getName() + ' Domain: ' +self.domainString + ' Training')

        if not self.is_training:
            self.logger.info("Not in training mode")
            return
        else:
            self.logger.info("Update dqn policy parameters.")

        self.episodecount += 1

        # lock = Settings.load_lock()
        # lock.acquire()
        # try:
        # Bump the episode counter shared across all worker threads.
        Settings.add_count()
        globalEpisodeCount = copy.deepcopy(Settings.get_count())

        # print('###################################################')
        # print(threading.currentThread().getName() + ' ' + str(globalEpisodeCount))

        self.logger.info("Sample Num so far: %s" % (self.samplecount))
        self.logger.info("Episode Num so far: %s" % (self.episodecount))

        # if self.samplecount >= self.minibatch_size * 10 and self.episodecount % self.training_frequency == 0:
        # Train once at least ten minibatches of samples have been collected and the
        # shared episode counter hits the training frequency.
        if self.samplecount >= self.minibatch_size * 10 and globalEpisodeCount % self.training_frequency == 0:
            self.logger.info('start training...')

            assert len(Settings.global_policysaver) == Settings.global_threadsnum
            # self.dqn.reset_noise()
            # Each worker thread samples a batch, computes the reshaped target
            # distribution, trains its local dqn, and stores the resulting gradients
            # under its thread key.
            total_batch_size = 0
            for k, thread_policy in Settings.global_policysaver.items():
                s_batch, a_batch_one_hot, reshaped_targetdis = Settings.global_hackpolicysaver[k]._sample_and_updateQ()
                grad, batch_size = thread_policy.train(s_batch, a_batch_one_hot, reshaped_targetdis)
                total_batch_size += batch_size
                Settings.load_grad(grad, k)

            # Sum the gradients collected from all threads, apply them to the shared
            # network, and save the updated parameters. Passing total_batch_size keeps
            # the normalization consistent with the number of samples actually used.
            assert len(Settings.global_gradsaver) == Settings.global_threadsnum
            grads_list = Settings.grad_sum()
            self._load_and_update(grads_list, total_batch_size)
            self._savePolicyInc()
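
Both examples finish by handing the summed gradients and a batch size to self._load_and_update, which is not shown here. A hedged sketch of the A3C-style step it presumably performs, averaging the gradient sum over the samples before updating the shared parameters, with a plain SGD step and the function name apply_summed_grads standing in for whatever optimizer and method the repo actually uses:

    import numpy as np

    def apply_summed_grads(params, summed_grads, total_batch_size, lr=1e-3):
        # Average the per-thread gradient sum by the number of samples, then
        # take one plain SGD step on the shared parameters.
        averaged = [g / float(total_batch_size) for g in summed_grads]
        return [p - lr * g for p, g in zip(params, averaged)]

    # Example: two parameter tensors, gradients summed over several workers.
    params = [np.ones((2, 2)), np.zeros(3)]
    summed = [np.full((2, 2), 3.0), np.full(3, 3.0)]
    new_params = apply_summed_grads(params, summed, total_batch_size=96)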