def test(self):
    """Check per-character tagging of a plain multi-morpheme analysis.

    Every character of the three-character morpheme '421' is expected to
    carry that morpheme's tag, followed by one tag for each of the two
    single-character morphemes.
    """
    analysis = '421/nb+년/nu+째/xn'
    morpheme = Morpheme(analysis)

    expected_chars = list('421년째')
    actual_chars = [char for char, _ in morpheme.tags()]
    self.assertEqual(expected_chars, actual_chars)

    expected_tags = ['nb', 'nb', 'nb', 'nu', 'xn']
    actual_tags = [label for _, label in morpheme.tags()]
    self.assertEqual(expected_tags, actual_tags)
def test_plus(self):
    """Check that a literal '+' morpheme survives parsing.

    '+' is also the separator between morphemes in the analysis string,
    so '+/so' exercises the case where the morpheme text itself is '+'.
    """
    analysis = '(/sl++/so+)/sr+반파/nc'
    morpheme = Morpheme(analysis)

    expected_chars = list('(+)반파')
    actual_chars = [char for char, _ in morpheme.tags()]
    self.assertEqual(expected_chars, actual_chars)

    expected_tags = ['sl', 'so', 'sr', 'nc', 'nc']
    actual_tags = [label for _, label in morpheme.tags()]
    self.assertEqual(expected_tags, actual_tags)
def test_match3(self):
    """Check match() on '선각자였습니다.'.

    The surface form has eight characters and eight tags are expected;
    the fourth character ('였') is expected to carry the merged tag
    'ppep', where the analysis lists '이/pp' and '었/ep' separately.
    """
    surface = '선각자였습니다.'
    analysis = '선/nc+각/nc+자/nc+이/pp+었/ep+습/ef+니/ef+다/ef+./sf'
    expected = ['nc', 'nc', 'nc', 'ppep', 'ef', 'ef', 'ef', 'sf']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_slash(self):
    """Check that a literal '/' morpheme survives parsing.

    '/' also separates a morpheme from its tag in the analysis string,
    so '//sc' exercises the case where the morpheme text itself is '/'.
    """
    analysis = 'TCP/ne+//sc+IP/ne+를/po'
    morpheme = Morpheme(analysis)

    expected_chars = list('TCP/IP를')
    actual_chars = [char for char, _ in morpheme.tags()]
    self.assertEqual(expected_chars, actual_chars)

    expected_tags = ['ne', 'ne', 'ne', 'sc', 'ne', 'ne', 'po']
    actual_tags = [label for _, label in morpheme.tags()]
    self.assertEqual(expected_tags, actual_tags)
def run_testing(self, *, batch_size: int, start_of_morpheme: str,
                end_of_morpheme: str) -> None:
    """Evaluate the corpus in batches and print original vs. predicted.

    For every morpheme in every batch, the prediction is cut down to the
    span running from the first start symbol (inclusive) up to the first
    end symbol (exclusive), and one tab-separated line
    ``<original>\\t<trimmed prediction>`` is printed.

    Args:
        batch_size: Number of corpus items per DataLoader batch.
        start_of_morpheme: Key looked up in the corpus alphabet to obtain
            the start symbol.
        end_of_morpheme: Key looked up in the corpus alphabet to obtain
            the end symbol.

    NOTE(review): ``graphemes.index(...)`` presumably raises ValueError
    when a prediction lacks the start or end symbol (true if graphemes is
    a list) -- confirm whether that should abort the whole run.
    """
    alphabet = self.corpus.morphemes.alphabet
    start_symbol: Symbol = alphabet[start_of_morpheme]
    end_symbol: Symbol = alphabet[end_of_morpheme]

    # shuffle=False keeps the output in corpus order.
    data_loader: DataLoader = DataLoader(
        dataset=self.corpus,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=MorphemeVectors.collate_morphemes,
    )

    for morphemes in data_loader:  # type: List[Morpheme]
        predicted_morphemes: List[Morpheme] = self.evaluate(morphemes)

        for position, original in enumerate(morphemes):
            predicted = predicted_morphemes[position]
            graphemes = predicted.graphemes

            start_index = graphemes.index(start_symbol)
            end_index = graphemes.index(end_symbol)

            # The slice keeps the start symbol and drops the end symbol.
            # NOTE(review): confirm that including the start symbol is intended.
            predicted_morpheme = Morpheme(
                graphemes=graphemes[start_index:end_index],
                tpr=predicted.tpr[start_index:end_index],
            )

            print(f"{original}\t{predicted_morpheme}")
def unbind(self, predicted_tpr: torch.Tensor) -> List[Morpheme]:
    """Decode a batch of predicted TPR tensors into Morpheme objects.

    The cosine-similarity scores are reshaped into rows of
    ``dimensions.a`` entries, the arg-max of each row is taken as the
    predicted label, and the labels are reshaped to
    ``(dimensions.b, dimensions.m)``. Each label is then mapped through
    ``self.alphabet``.

    Args:
        predicted_tpr: Predicted tensor, indexed by batch item along its
            first axis. Presumably ``a`` is the alphabet size, ``b`` the
            batch size and ``m`` the positions per morpheme -- TODO
            confirm against ``check_dimensions``.

    Returns:
        One Morpheme per batch item, built from the decoded graphemes and
        that item's slice of ``predicted_tpr`` converted to nested lists.
    """
    dimensions: Dimensions = self.check_dimensions(predicted_tpr)
    similarity = self.calculate_cosine_similarity(predicted_tpr, dimensions)

    # One row per (batch item, position); columns are the candidate labels.
    scores_per_position = similarity.view(-1, dimensions.a)
    best_label = scores_per_position.argmax(dim=-1)
    predicted_labels = best_label.view(dimensions.b, dimensions.m)

    decoded = []
    for b in range(dimensions.b):
        symbols = [self.alphabet[label] for label in predicted_labels[b].tolist()]
        decoded.append(Morpheme(graphemes=symbols, tpr=predicted_tpr[b].tolist()))
    return decoded
def parse(txt: list, pos: list):
    """Build character-level B/I/O tagged items from sentences and analyses.

    Each sentence in ``txt`` is paired with the analysis at the same
    position in ``pos``. Both are split on single spaces; every word is
    matched against its analysis token to get one tag per character, and
    words are joined by a ' ' character tagged 'o'. Tags other than 'o'
    are then prefixed with 'b-' or 'i-'.

    Sentences whose matching raises ValueError are skipped. The number of
    collected errors is printed and their messages are written, one per
    line, to ``./pattern_errors.txt`` (the file is rewritten on every
    call, even when there are no errors).

    Args:
        txt: Raw sentences.
        pos: Morpheme analyses, parallel to ``txt``.

    Returns:
        A list of ``SejongCorpusParser.Item(chars, tags)``, one per
        sentence that parsed without a ValueError.
    """
    items = []
    errors = set()

    for text, label in zip(txt, pos):
        chars = []
        tags = []
        try:
            for word, analysis in zip(text.split(' '), label.split(' ')):
                chars.extend(word)
                tags.extend(Morpheme(analysis).match(word))
                # Word boundary: a space character with the outside tag.
                chars.append(' ')
                tags.append('o')

            # Drop the separator appended after the final word.
            chars = chars[:-1]
            tags = tags[:-1]

            for idx, tag in enumerate(tags):
                if tag == 'o':
                    continue
                # The previous entry already carries its 'b-'/'i-' prefix,
                # so continuation is detected by suffix comparison.
                # NOTE(review): a suffix test also matches a shorter tag
                # following a merged one (e.g. 'ep' after 'ppep') --
                # confirm that is intended.
                continues_previous = idx > 0 and tags[idx - 1].endswith(tag)
                prefix = 'i-' if continues_previous else 'b-'
                tags[idx] = prefix + tag

            items.append(SejongCorpusParser.Item(chars, tags))
        except ValueError as e:
            errors.add(e)

    print('Pattern Errors: %d' % len(errors))
    with open('./pattern_errors.txt', mode='w', encoding='utf-8') as fp:
        fp.write('\n'.join(map(str, errors)))

    return items
def test_match8(self):
    """Check match() on '일컬어진다.'.

    The analysis spells the second morpheme '컫' while the surface form
    has '컬', and the fourth surface character ('진') is expected to
    carry the merged tag 'vxef' where the analysis lists '지/vx' and
    'ㄴ/ef' separately.
    """
    surface = '일컬어진다.'
    analysis = '일/vb+컫/vb+어/ex+지/vx+ㄴ/ef+다/ef+./sf'
    expected = ['vb', 'vb', 'ex', 'vxef', 'ef', 'sf']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_match7(self):
    """Check match() on '있으므로'.

    The surface character '으' has no counterpart in the analysis and is
    expected to be tagged 'mg'.
    """
    surface = '있으므로'
    analysis = '있/vx+므/ec+로/ec'
    expected = ['vx', 'mg', 'ec', 'ec']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_match6(self):
    """Check match() on '웃으면'.

    The surface character '으' has no counterpart in the analysis and is
    expected to be tagged 'mg'.
    """
    surface = '웃으면'
    analysis = '웃/vb+면/ef'
    expected = ['vb', 'mg', 'ef']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_match5(self):
    """Check match() on '보십시오.'.

    The second surface character ('십') is expected to carry the merged
    tag 'epef', where the analysis lists '시/ep' and 'ㅂ/ef' separately.
    """
    surface = '보십시오.'
    analysis = '보/vx+시/ep+ㅂ/ef+시/ef+오/ef+./sf'
    expected = ['vx', 'epef', 'ef', 'ef', 'sf']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_match4(self):
    """Check match() on '활용한다든가'.

    The third surface character ('한') is expected to carry the merged
    tag 'xvec', where the analysis lists '하/xv' and 'ㄴ/ec' separately.
    """
    surface = '활용한다든가'
    analysis = '활/na+용/na+하/xv+ㄴ/ec+다/ec+든/ec+가/ec'
    expected = ['na', 'na', 'xvec', 'ec', 'ec', 'ec']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_match2(self):
    """Check match() on '끝내!'.

    The analysis contains '아/ef', which does not appear in the
    three-character surface form; no 'ef' tag is expected in the result.
    """
    surface = '끝내!'
    analysis = '끝내/vb+아/ef+!/sf'
    expected = ['vb', 'vb', 'sf']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)
def test_match(self):
    """Check match() on '썼거니와'.

    The first surface character ('썼') is expected to carry the merged
    tag 'vbep', where the analysis lists '쓰/vb' and '었/ep' separately.
    """
    surface = '썼거니와'
    analysis = '쓰/vb+었/ep+거니와/ec'
    expected = ['vbep', 'ec', 'ec', 'ec']

    actual = Morpheme(analysis).match(surface)

    self.assertEqual(expected, actual)