Ejemplo n.º 1
0
    def train(self, sentences, min_eojeol_count=1):
        """Count eojeols in `sentences` and build the L-R graph for extraction.

        Parameters
        ----------
        sentences : iterable of str
            Training sentences.
        min_eojeol_count : int
            Eojeols occurring fewer times than this are dropped by the counter.
        """
        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        # Eojeols longer than max L + max R cannot form a valid (L, R) pair,
        # so the counter skips them.
        counter = EojeolCounter(
            sentences,
            min_eojeol_count,
            max_length=self.l_max_length + self.r_max_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose,
        )
        self._num_of_eojeols = counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')

        self.lrgraph = counter.to_lrgraph(self.l_max_length, self.r_max_length)

        if self.verbose:
            print('[Noun Extractor] has been trained. mem={} Gb'.format(
                '%.3f' % get_process_memory()))
Ejemplo n.º 2
0
    def train(self, sentences):
        """Count eojeols and build the L-R graph used by the noun extractor."""
        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        # An eojeol longer than max-L + max-R cannot be split into a valid
        # (L, R) pair, so it is not counted.
        counter = EojeolCounter(
            sentences,
            self.min_eojeol_frequency,
            max_length=self.max_left_length + self.max_right_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose)

        self._num_of_eojeols = counter._count_sum
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Noun Extractor] complete eojeol counter -> lr graph')

        self.lrgraph = counter.to_lrgraph(
            self.max_left_length, self.max_right_length)

        if self.verbose:
            memory = '%.3f' % get_process_memory()
            print('[Noun Extractor] has been trained. mem={} Gb'.format(memory))
Ejemplo n.º 3
0
    def _prepare_predicator_lrgraph(self):
        """Build an L-R graph from eojeols unlikely to contain a noun.

        Eojeols of length <= 1, and eojeols having a known-noun prefix of
        length >= 2, are excluded; the remainder are predicator
        (stem + eomi) candidates.

        Returns
        -------
        The lr-graph produced by ``EojeolCounter._to_lrgraph``.
        """
        def contains_noun(eojeol):
            # True if any prefix of length >= 2 is a registered noun.
            return any(
                eojeol[:e] in self._nouns
                for e in range(2, len(eojeol) + 1)
            )

        # Filter in a single pass; the original first built a full copy of
        # the counter dict and then filtered it, doubling peak memory.
        eojeols = {
            eojeol: count
            for eojeol, count in self.eojeol_counter._counter.items()
            if len(eojeol) > 1 and not contains_noun(eojeol)
        }
        return EojeolCounter()._to_lrgraph(eojeols)
Ejemplo n.º 4
0
    def _train_with_sentences(self, sentences, min_eojeol_frequency=1):
        """Count eojeols from raw sentences, then train from the counter."""
        if self.verbose:
            print('[Noun Extractor] counting eojeols')

        # Skip per-sentence normalization when the caller guarantees the
        # input is already normalized.
        if self.ensure_normalized:
            def preprocess(sent):
                return sent
        else:
            preprocess = normalize_sent_for_lrgraph

        counter = EojeolCounter(
            sentences,
            min_count=min_eojeol_frequency,
            max_length=self.max_left_length + self.max_right_length,
            filtering_checkpoint=self.eojeol_counter_filtering_checkpoint,
            verbose=self.verbose,
            preprocess=preprocess,
        )

        self._train_with_eojeol_counter(counter)
Ejemplo n.º 5
0
    def _train_with_sentences(self,
                              sentences,
                              min_eojeol_frequency=2,
                              filtering_checkpoint=100000):
        """Count eojeols from raw sentences, then train from the counter.

        Parameters
        ----------
        sentences : iterable of str
        min_eojeol_frequency : int
            Eojeols occurring fewer times are dropped by the counter.
        filtering_checkpoint : int
            Forwarded to ``EojeolCounter``: prune infrequent eojeols every
            this many sentences to bound memory.
        """
        if self.verbose:
            message = 'counting eojeols ... '
            self._print(message, replace=False, newline=False)

        # Skip per-sentence normalization when input is known-normalized.
        if self.ensure_normalized:
            def preprocess(sent):
                return sent
        else:
            preprocess = normalize_sent_for_lrgraph

        # Eojeol counting. Bug fix: forward `filtering_checkpoint`; the
        # original computed `check = filtering_checkpoint > 0` and then
        # ignored both, so the argument had no effect (sibling versions
        # pass it through to EojeolCounter).
        eojeol_counter = EojeolCounter(sentences,
                                       min_count=min_eojeol_frequency,
                                       filtering_checkpoint=filtering_checkpoint,
                                       verbose=self.verbose,
                                       preprocess=preprocess)

        self._train_with_eojeol_counter(eojeol_counter)
Ejemplo n.º 6
0
    def _train_with_sentences(self,
                              sentences,
                              min_eojeol_count=2,
                              filtering_checkpoint=100000):
        """Count eojeols not led by a known noun, then build the L-R graph.

        sentences : iterable of str, each a whitespace-tokenized sentence.
        min_eojeol_count : eojeols with a lower final count are discarded.
        filtering_checkpoint : prune infrequent eojeols every this many
            sentences to bound memory; a value <= 0 disables pruning.
        """

        check = filtering_checkpoint > 0

        if self.verbose:
            message = 'counting eojeols'
            self._print(message, replace=False, newline=False)

        # Eojeol counting
        counter = {}

        def contains_noun(eojeol, n):
            # True if any prefix (length >= 2) of the eojeol is a known noun.
            # NOTE(review): sibling implementations read `self._nouns`;
            # confirm this class actually defines a `nouns` attribute.
            for e in range(2, n + 1):
                if eojeol[:e] in self.nouns:
                    return True
            return False

        for i_sent, sent in enumerate(sentences):

            # Periodic pruning keeps the counter from growing unbounded.
            if check and i_sent > 0 and i_sent % filtering_checkpoint == 0:
                counter = {
                    eojeol: count
                    for eojeol, count in counter.items()
                    if count >= min_eojeol_count
                }

            # Progress report every 100k sentences.
            if self.verbose and i_sent % 100000 == 99999:
                message = 'n eojeol = {} from {} sents. mem={} Gb{}'.format(
                    len(counter), i_sent + 1, '%.3f' % get_process_memory(),
                    ' ' * 20)
                self._print(message, replace=True, newline=False)

            for eojeol in sent.split():

                n = len(eojeol)

                # 1-char or noun-led eojeols are not stem+eomi candidates.
                if n <= 1 or contains_noun(eojeol, n):
                    continue

                counter[eojeol] = counter.get(eojeol, 0) + 1

        if self.verbose:
            message = 'counting eojeols was done. {} eojeols, mem={} Gb{}'.format(
                len(counter), '%.3f' % get_process_memory(), ' ' * 20)
            self._print(message, replace=True, newline=True)

        # Final frequency filtering.
        counter = {
            eojeol: count
            for eojeol, count in counter.items() if count >= min_eojeol_count
        }

        self._num_of_eojeols = sum(counter.values())
        self._num_of_covered_eojeols = 0

        if self.verbose:
            message = 'complete eojeol counter -> lr graph'
            self._print(message, replace=False, newline=True)

        self.lrgraph = EojeolCounter()._to_lrgraph(counter,
                                                   l_max_length=10,
                                                   r_max_length=9)

        if self.verbose:
            message = 'has been trained. mem={} Gb'.format(
                '%.3f' % get_process_memory())
            self._print(message, replace=False, newline=True)
Ejemplo n.º 7
0
class EomiExtractor:
    def __init__(self,
                 nouns,
                 noun_pos_features=None,
                 stems=None,
                 eomis=None,
                 verbose=True):
        """Prepare dictionaries and conjugated stem forms for eomi extraction."""

        # Fall back to the bundled dictionaries for any resource not given.
        if not noun_pos_features:
            noun_pos_features = self._load_default_noun_pos_features()
        if not stems:
            stems = self._load_default_stems()
        if not eomis:
            eomis = self._load_default_eomis()

        self._nouns = nouns
        self._noun_pos_features = noun_pos_features
        self._stems = stems
        # All surface forms the stems can take: these are the L parts that
        # count as positive evidence for an eomi R part.
        self._pos_l = {form for stem in stems for form in _conjugate_stem(stem)}
        self._eomis = eomis
        self.verbose = verbose
        self.lrgraph = None

    def _load_default_noun_pos_features(self):
        """Load the bundled noun-predictor positive features (e.g. Josa).

        Returns a set containing the first whitespace token of each
        non-empty line of the bundled dictionary file.
        """
        path = '%s/trained_models/noun_predictor_ver2_pos' % installpath
        with open(path, encoding='utf-8') as f:
            # Skip blank lines: `''.split()` is empty, so indexing [0]
            # raised IndexError in the original comprehension.
            pos_features = {line.split()[0] for line in f if line.strip()}
        return pos_features

    def _load_default_stems(self, min_count=100):
        """Load bundled Adjective/Verb stems with count >= `min_count`.

        Each dictionary line is "<word> <count>". Blank lines are skipped;
        the original two-value unpack raised ValueError on them.
        """
        dirs = '%s/lemmatizer/dictionary/default/stem' % installpath
        paths = ['%s/Adjective.txt', '%s/Verb.txt']
        paths = [p % dirs for p in paths]
        stems = set()
        for path in paths:
            with open(path, encoding='utf-8') as f:
                for line in f:
                    # Guard against blank lines in the dictionary file.
                    if not line.strip():
                        continue
                    word, count = line.split()
                    if int(count) < min_count:
                        continue
                    stems.add(word)
        return stems

    def _load_default_eomis(self, min_count=20):
        """Load bundled eomis with count >= `min_count`.

        Each dictionary line is "<word> <count>". Blank lines are skipped;
        the original two-value unpack raised ValueError on them.
        """
        path = '%s/lemmatizer/dictionary/default/Eomi/Eomi.txt' % installpath
        eomis = set()
        with open(path, encoding='utf-8') as f:
            for line in f:
                # Guard against blank lines in the dictionary file.
                if not line.strip():
                    continue
                word, count = line.split()
                if int(count) < min_count:
                    continue
                eomis.add(word)
        return eomis

    @property
    def is_trained(self):
        return self.lrgraph

    def train(self,
              sentences,
              min_eojeol_count=2,
              filtering_checkpoint=100000):
        """Count eojeols that are not led by a known noun; build the L-R graph.

        Parameters
        ----------
        sentences : iterable of str
            Whitespace-tokenized sentences.
        min_eojeol_count : int
            Eojeols with a lower final count are discarded.
        filtering_checkpoint : int
            Prune infrequent eojeols every this many sentences to bound
            memory; a value <= 0 disables periodic pruning.
        """
        check = filtering_checkpoint > 0

        if self.verbose:
            print('[Eomi Extractor] counting eojeols', end='')

        # Eojeol counting
        counter = {}

        def contains_noun(eojeol, n):
            # True if any prefix (length >= 2) of the eojeol is a known noun.
            # Bug fix: __init__ stores the dictionary as `self._nouns`;
            # the original read `self.nouns`, raising AttributeError on the
            # first multi-character eojeol.
            for e in range(2, n + 1):
                if eojeol[:e] in self._nouns:
                    return True
            return False

        for i_sent, sent in enumerate(sentences):

            # Periodic pruning keeps the counter from growing unbounded.
            if check and i_sent > 0 and i_sent % filtering_checkpoint == 0:
                counter = {
                    eojeol: count
                    for eojeol, count in counter.items()
                    if count >= min_eojeol_count
                }

            # Progress report every 100k sentences.
            if self.verbose and i_sent % 100000 == 99999:
                message = '\r[Eomi Extractor] n eojeol = {} from {} sents. mem={} Gb{}'.format(
                    len(counter), i_sent + 1, '%.3f' % get_process_memory(),
                    ' ' * 20)
                print(message, flush=True, end='')

            for eojeol in sent.split():

                n = len(eojeol)

                # 1-char or noun-led eojeols are not stem+eomi candidates.
                if n <= 1 or contains_noun(eojeol, n):
                    continue

                counter[eojeol] = counter.get(eojeol, 0) + 1

        if self.verbose:
            message = '\r[Eomi Extractor] counting eojeols was done. {} eojeols, mem={} Gb{}'.format(
                len(counter), '%.3f' % get_process_memory(), ' ' * 20)
            print(message)

        # Final frequency filtering.
        counter = {
            eojeol: count
            for eojeol, count in counter.items() if count >= min_eojeol_count
        }

        self._num_of_eojeols = sum(counter.values())
        self._num_of_covered_eojeols = 0

        if self.verbose:
            print('[Eomi Extractor] complete eojeol counter -> lr graph')

        self.lrgraph = EojeolCounter()._to_lrgraph(counter,
                                                   l_max_length=10,
                                                   r_max_length=9)

        if self.verbose:
            print('[Eomi Extractor] has been trained. mem={} Gb'.format(
                '%.3f' % get_process_memory()))

    def extract(self,
                minimum_eomi_score=0.3,
                min_count=10,
                reset_lrgraph=True):
        """Extract eomi candidates from the trained L-R graph.

        NOTE(review): this implementation looks truncated — it computes
        `eomi_candidates` but never scores, filters, or returns them, and
        `minimum_eomi_score`, `min_count` and `reset_lrgraph` are unused.
        Presumably batch prediction and post-processing steps follow in the
        complete version; confirm against the upstream source.
        """

        # reset covered eojeol count
        self._num_of_covered_eojeols = 0

        # base prediction
        eomi_candidates = self._eomi_candidates_from_stems()

    def _eomi_candidates_from_stems(self, condition=None):
        def satisfy(word, e):
            return word[-e:] == condition

        # noun candidates from positive featuers such as Josa
        R_from_L = {}

        for l in self._pos_l:

            for r, c in self.lrgraph.get_r(l, -1):

                # candidates filtering for debugging
                # condition is last chars in R
                if not condition:
                    R_from_L[r] = R_from_L.get(r, 0) + c
                    continue

                # for debugging
                if satisfy(r, len(condition)):
                    R_from_L[r] = R_from_L.get(r, 0) + c

        # sort by length of word
        R_from_L = sorted(R_from_L.items(), key=lambda x: -len(x[0]))

        return R_from_L

    def predict_r(self,
                  r,
                  minimum_r_score=0.3,
                  min_num_of_features=5,
                  debug=False):

        features = self.lrgraph.get_l(r, -1)

        pos, neg, unk = self._predict_r(features, r)

        base = pos + neg
        score = 0 if base == 0 else (pos - neg) / base
        support = pos + unk if score >= minimum_r_score else neg + unk

        features_ = self._refine_features(features, r)
        n_features_ = len(features_)

        if debug:
            print('pos={}, neg={}, unk={}, n_features_={}'.format(
                pos, neg, unk, n_features_))

        if n_features_ >= min_num_of_features:
            return score, support
        else:
            # TODO
            return (0, 0)

    def _predict_r(self, features, r):

        pos, neg, unk = 0, 0, 0

        for l, freq in features:
            if self._exist_longer_pos(l, r):  # ignore
                continue
            if l in self._pos_l:
                pos += freq
            elif self._has_stem_at_last(l):
                unk += freq
            else:
                neg += freq

        return pos, neg, unk

    def _exist_longer_pos(self, l, r):
        for i in range(1, len(r) + 1):
            if (l + r[:i]) in self._pos_l:
                return True
        return False

    def _has_stem_at_last(self, l):
        for i in range(1, len(l)):
            if l[-i:] in self._pos_l:
                return True
        return False

    def _refine_features(self, features, r):
        return [(l, count) for l, count in features
                if ((l in self._pos_l) and (not self._exist_longer_pos(l, r)))]

    def _batch_prediction_order_by_word_length(self,
                                               eomi_candidates,
                                               minimum_eomi_score=0.3,
                                               min_num_of_features=5):
        """Score eomi candidates and prune accepted ones from the lr-graph.

        `eomi_candidates` : (r, count) pairs — presumably sorted longest-r
        first as produced by `_eomi_candidates_from_stems`, so accepted
        longer candidates remove their eojeols before shorter substrings
        are scored (TODO confirm ordering assumption against callers).
        Returns {r: (score, support)}.
        """

        prediction_scores = {}

        n = len(eomi_candidates)
        for i, (r, _) in enumerate(eomi_candidates):

            # Progress report every 1000 candidates.
            if self.verbose and i % 1000 == 999:
                percentage = '%.3f' % (100 * (i + 1) / n)
                print('\r  -- batch prediction {} % of {} words'.format(
                    percentage, n),
                      flush=True,
                      end='')

            # base prediction
            score, support = self.predict_r(r, minimum_eomi_score,
                                            min_num_of_features)
            prediction_scores[r] = (score, support)

            # if their score is higher than minimum_eomi_score,
            # remove eojeol pattern from lrgraph
            if score >= minimum_eomi_score:
                for l, count in self.lrgraph.get_l(r, -1):
                    if l in self._pos_l:
                        self.lrgraph.remove_eojeol(l + r, count)

        if self.verbose:
            print(
                '\r[Eomi Extractor] batch prediction was completed for {} words'
                .format(n),
                flush=True)

        return prediction_scores