def __init__(self, nouns, noun_pos_features=None, stems=None, eomis=None,
             extract_eomi=False, extract_stem=False, verbose=True):
    """Store the dictionaries and options, and precompute stem surfaces.

    Any of ``noun_pos_features``, ``stems`` or ``eomis`` that is falsy
    (``None`` or an empty container) is replaced by the value returned from
    the matching ``self._load_default_*()`` method.

    Args:
        nouns: Stored unchanged as ``self._nouns``.
        noun_pos_features: Stored as ``self._noun_pos_features``.
        stems: Iterable of stems, stored as ``self._stems``. Each one is
            passed to ``_conjugate_stem`` to build ``self._stem_surfaces``.
        eomis: Stored as ``self._eomis``.
        extract_eomi: Flag stored as ``self.extract_eomi``.
        extract_stem: Flag stored as ``self.extract_stem``.
        verbose: Flag stored as ``self.verbose``.
    """
    # Fall back to the bundled defaults for every falsy argument.
    noun_pos_features = (
        noun_pos_features or self._load_default_noun_pos_features())
    stems = stems or self._load_default_stems()
    eomis = eomis or self._load_default_eomis()

    self._nouns = nouns
    self._noun_pos_features = noun_pos_features
    self._stems = stems
    self._eomis = eomis
    self.verbose = verbose
    self.extract_eomi = extract_eomi
    self.extract_stem = extract_stem

    # Union of everything _conjugate_stem produces for the known stems.
    conjugated = set()
    for stem in stems:
        conjugated.update(_conjugate_stem(stem))
    self._stem_surfaces = conjugated

    # No L-R graph is attached at construction time.
    self.lrgraph = None
def main():
    """Print the result of ``_conjugate_stem`` for a fixed set of stems.

    This is a manual smoke check: one ``stem -> result`` line is printed per
    sample stem. The stems are visited in sorted order so the output is
    identical from run to run; iterating the set directly would make the
    line order depend on per-process string hash randomization.
    """
    testset = {
        '가', '감미롭', '곱', '구르', '그렇', '긋', '깨닫', '끄', '낫', '덥',
        '돌아오', '동그랗', '들', '벗', '사오', '삼가', '시퍼렇', '아깝',
        '아니꼽', '아니하', '아름답', '영원하', '오', '이', '이르', '좋', '주',
        '줍', '트', '파랗', '푸', '푸르', '하',
    }
    # sorted() gives a stable, reproducible order (by code point).
    for stem in sorted(testset):
        print('{} -> {}'.format(stem, _conjugate_stem(stem)))
def _transform_stem_as_surfaces(self):
    """Collect everything ``_conjugate_stem`` produces for ``self._stems``.

    Conjugation is best-effort: when it raises for a stem, the stem and the
    error are printed and processing moves on to the next stem. Items that
    were already added before the failure stay in the result.

    Returns:
        set: Union of the items produced for every stem.
    """
    conjugated = set()
    for stem in self._stems:
        try:
            conjugated.update(_conjugate_stem(stem))
        except Exception as error:
            # Report the offending stem and keep going with the rest.
            print('Exception stem = {}, {}'.format(stem, error))
    return conjugated
def extract(self, condition=None, min_eomi_score=0.3, min_eomi_frequency=1,
            reset_lrgraph=True):
    """Extract eomis and return the accepted ones with their scores.

    Steps, in order:

    1. Reset ``_num_of_covered_eojeols`` and ``_eomis``, and rebuild
       ``_stem_surfaces`` from ``self._stems`` via ``_conjugate_stem``.
    2. Get candidates from ``_candidates_from_stem_surfaces(condition)`` and
       score them with ``_batch_prediction``.
    3. Keep candidates whose ``score[1]`` reaches ``min_eomi_score``, reset
       the L-R graph and lemmatize them with ``_eomi_lemmatize``.
    4. Keep lemmas whose ``score[0]`` reaches ``min_eomi_frequency`` and
       whose ``score[1]`` reaches ``min_eomi_score``.
    5. If ``self.logpath`` is set, write all prediction scores to
       ``<logpath>_eomi_prediction_score.log``, highest ``score[1]`` first.
    6. Run ``_check_covered_eojeols`` on the lemmas, store them in
       ``self._eomis``, and drop the temporary ``_stem_surfaces``.

    Args:
        condition: Passed through to ``_candidates_from_stem_surfaces``.
        min_eomi_score: Lower bound applied to ``score[1]``.
        min_eomi_frequency: Lower bound applied to ``score[0]`` of the
            lemmatized eomis.
        reset_lrgraph: When true, ``self.lrgraph.reset_lrgraph()`` is called
            once more after extraction has finished.

    Returns:
        dict: Each accepted eomi mapped to ``EomiScore(score[0], score[1])``.
    """
    # Reset covered eojeol count and extracted eomis.
    self._num_of_covered_eojeols = 0
    self._eomis = {}

    stem_surfaces = set()
    for stem in self._stems:
        stem_surfaces.update(_conjugate_stem(stem))
    self._stem_surfaces = stem_surfaces

    # Base prediction.
    candidates = self._candidates_from_stem_surfaces(condition)
    prediction_scores = self._batch_prediction(
        candidates, min_eomi_score, self.min_num_of_features)

    eomi_surfaces = {}
    for surface, score in prediction_scores.items():
        if score[1] >= min_eomi_score:
            eomi_surfaces[surface] = score

    if self.verbose:
        self._print(
            'eomi lemmatization with {} candidates'.format(len(eomi_surfaces)),
            replace=False, newline=True)

    # The graph is reset before lemmatization regardless of reset_lrgraph.
    self.lrgraph.reset_lrgraph()
    lemmatized = self._eomi_lemmatize(eomi_surfaces)

    lemmas = {}
    for eomi, score in lemmatized.items():
        if score[0] >= min_eomi_frequency and score[1] >= min_eomi_score:
            lemmas[eomi] = score

    if self.logpath:
        log_file = self.logpath + '_eomi_prediction_score.log'
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write('eomi frequency score\n')
            # The log holds every prediction, not only the accepted lemmas.
            ranked = sorted(
                prediction_scores.items(), key=lambda item: -item[1][1])
            for word, score in ranked:
                f.write('{} {} {}\n'.format(word, score[0], score[1]))

    if self.verbose:
        summary = (
            '{} eomis extracted with min frequency = {}, min score = {}'
        ).format(len(lemmas), min_eomi_frequency, min_eomi_score)
        self._print(summary, replace=False, newline=True)

    self._check_covered_eojeols(lemmas)  # TODO with lemma

    self._eomis = lemmas

    if reset_lrgraph:
        self.lrgraph.reset_lrgraph()

    # The conjugated surfaces are only needed while extracting.
    del self._stem_surfaces

    result = {}
    for eomi, score in lemmas.items():
        result[eomi] = EomiScore(score[0], score[1])
    return result