Beispiel #1
0
    def _as_lemma_candidates(self, eojeol_counter=None):

        def is_noun_josa(eojeol):
            for i in range(1, len(eojeol)):
                if ((eojeol[:i] in self._nouns) and
                    (eojeol[i:] in self._josas)):
                    return True
            return False

        self._num_of_covered_eojeols = 0
        self._count_of_covered_eojeols = 0

        lemmas = {}
        eomi_to_word_count = defaultdict(lambda: [])
        num_eojeol = len(eojeol_counter)

        for i, (eojeol, count) in enumerate(eojeol_counter.items()):
            if self.verbose and i % 5000 == 4999:
                message = 'lemmatizing {} / {} words'.format(i+1, num_eojeol)
                self._print(message, replace=True, newline=False)
            if is_noun_josa(eojeol):
                continue

            n = len(eojeol)
            lemma_candidates = set()

            for i in range(1, n+1):
                l, r = eojeol[:i], eojeol[i:]
                for stem, eomi in _lemma_candidate(l, r):
                    if (stem in self._stems) and (eomi in self._eomis):
                        lemma_candidates.add((stem, eomi))

            lemma_candidates_ = set()
            for stem, eomi in lemma_candidates:
                if eojeol in conjugate(stem, eomi):
                    lemma_candidates_.add((stem, eomi))

            if lemma_candidates_:
                lemmas[eojeol] = Predicator(count, lemma_candidates_)
                for stem, eomi in lemma_candidates_:
                    eomi_to_word_count[eomi].append((eojeol, count))
                self._num_of_covered_eojeols += 1
                self._count_of_covered_eojeols += count

        lemmas = self._remove_wrong_eomis(lemmas, eomi_to_word_count)
        if self.verbose:
            perc = '%.3f' % (100 * self._count_of_covered_eojeols / self._count_of_eojeols)
            message = 'lemma candidating was done'
            self._print(message, replace=True, newline=True)

        return lemmas
Beispiel #2
0
    def _as_lemma_candidates(self, eojeols=None, min_count=10):

        if not eojeols:
            eojeols = {
                l: rdict.get('', 0)
                for l, rdict in self.lrgraph._lr.items()
            }
            eojeols = [
                eojeol for eojeol, count in eojeols.items()
                if count > min_count
            ]

        def all_character_are_complete_korean(s):
            for c in s:
                if not character_is_complete_korean(c):
                    return False
            return True

        eojeols = [
            eojeol for eojeol in eojeols
            if all_character_are_complete_korean(eojeol)
        ]

        n_eojeols = len(eojeols)
        lemmas = {}

        for i_eojeol, eojeol in enumerate(eojeols):

            if self.verbose and i_eojeol % 5000 == 0:
                perc = '%.3f' % (100 * i_eojeol / n_eojeols)
                message = 'lemma candidates ... {} %'.format(perc)
                self._print(message, replace=True, newline=False)

            n = len(eojeol)
            lemma_candidates = set()

            for i in range(1, n + 1):
                l, r = eojeol[:i], eojeol[i:]
                for stem, eomi in _lemma_candidate(l, r):
                    if (stem in self._stems) and (eomi in self._eomis):
                        lemma_candidates.add((stem, eomi))

            if lemma_candidates:
                lemmas[eojeol] = lemma_candidates

        if self.verbose:
            message = 'lemma candidating was done     '
            self._print(message, replace=True, newline=True)

        return lemmas
Beispiel #3
0
    def _lemmatize(self, word, stems, eomis):
        def only_knowns(lemmas):
            return [
                lemma for lemma in lemmas
                if (lemma[0] in stems) and (lemma[1] in eomis)
            ]

        for i in range(len(word) + 1, 0, -1):
            try:
                lemmas = _lemma_candidate(word[:i], word[i:])
            except:
                continue
            lemmas = only_knowns(lemmas)
            if lemmas:
                return lemmas
        return None
Beispiel #4
0
    def _to_stem(self, surfaces):
        def merge_score(freq0, score0, freq1, score1):
            return (freq0 + freq1,
                    (score0 * freq0 + score1 * freq1) / (freq0 + freq1))

        stems = {}
        for l, (freq0, score0) in surfaces.items():
            for r, count in self.lrgraph.get_r(l, -1):
                try:
                    for stem, eomi in _lemma_candidate(l, r):
                        if eomi in self.eomis:
                            continue
                        stems[stem] = merge_score(freq0, score0,
                                                  *stems.get(stem, (0, 0)))
                except:
                    continue

        return stems
Beispiel #5
0
    def _eomi_lemmatize(self, eomis):
        def merge_score(freq0, score0, freq1, score1):
            return (freq0 + freq1,
                    (score0 * freq0 + score1 * freq1) / (freq0 + freq1))

        lemmas = {}
        for eomi, (_, score0) in eomis.items():
            for stem_surface, count in self.lrgraph.get_l(eomi, -1):
                try:
                    for stem, lemma in _lemma_candidate(stem_surface, eomi):
                        if not (stem in self._stems):
                            continue
                        lemmas[lemma] = merge_score(count, score0,
                                                    *lemmas.get(lemma, (0, 0)))
                # stem 이 한글이 아닌 경우 불가
                except Exception as e:
                    continue

        return lemmas
Beispiel #6
0
    def _eomi_lemmatize(self, eomis):
        def merge_score(freq0, score0, freq1, score1):
            return (freq0 + freq1,
                    (score0 * freq0 + score1 * freq1) / (freq0 + freq1))

        eomis_ = {}
        #lrgraph = defaultdict(lambda: defaultdict(int))
        #lemma_to_word = defaultdict(lambda: [])
        for eomi, (_, score0) in eomis.items():
            for stem_surface, count in self.lrgraph.get_l(eomi, -1):
                try:
                    for stem_, eomi_ in _lemma_candidate(stem_surface, eomi):
                        if not (stem_ in self._stems):
                            continue
                        eomis_[eomi_] = merge_score(count, score0,
                                                    *eomis_.get(eomi_, (0, 0)))
                        #lrgraph[stem_][eomi_] += count
                        #lemma_to_word[(stem_, eomi_)].append(stem_surface + eomi)
                # stem 이 한글이 아닌 경우 불가
                except Exception as e:
                    continue

        return eomis_