Esempio n. 1
0
 def test(self):
     """Check that tags() yields one (character, tag) pair per character.

     The multi-character morpheme '421' is expected to contribute its
     tag 'nb' once for each of its three characters.
     """
     analysis = '421/nb+년/nu+째/xn'
     parsed = Morpheme(analysis)

     expected_chars = list('421년째')
     self.assertEqual(expected_chars,
                      [char for char, _ in parsed.tags()])

     expected_tags = ['nb', 'nb', 'nb', 'nu', 'xn']
     self.assertEqual(expected_tags,
                      [label for _, label in parsed.tags()])
Esempio n. 2
0
 def test_plus(self):
     """Check that a literal '+' is parsed as a morpheme of its own.

     In the analysis string '+' also separates morphemes, so '+/so'
     appears as '++/so' after the preceding separator.
     """
     analysis = '(/sl++/so+)/sr+반파/nc'
     parsed = Morpheme(analysis)

     expected_chars = list('(+)반파')
     self.assertEqual(expected_chars,
                      [char for char, _ in parsed.tags()])

     expected_tags = ['sl', 'so', 'sr', 'nc', 'nc']
     self.assertEqual(expected_tags,
                      [label for _, label in parsed.tags()])
Esempio n. 3
0
 def test_match3(self):
     """Check match() when two morphemes share one surface character.

     '이/pp' and '었/ep' both correspond to the surface character '였',
     for which the expected tag is the concatenation 'ppep'.
     """
     surface, analysis = (
         '선각자였습니다.',
         '선/nc+각/nc+자/nc+이/pp+었/ep+습/ef+니/ef+다/ef+./sf',
     )
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['nc', 'nc', 'nc', 'ppep', 'ef', 'ef', 'ef', 'sf']
     self.assertEqual(expected, matched)
Esempio n. 4
0
 def test_slash(self):
     """Check that a literal '/' is parsed as a morpheme of its own.

     In the analysis string '/' also separates a morpheme from its tag,
     so the slash character tagged 'sc' is written as '//sc'.
     """
     analysis = 'TCP/ne+//sc+IP/ne+를/po'
     parsed = Morpheme(analysis)

     expected_chars = list('TCP/IP를')
     self.assertEqual(expected_chars,
                      [char for char, _ in parsed.tags()])

     expected_tags = ['ne', 'ne', 'ne', 'sc', 'ne', 'ne', 'po']
     self.assertEqual(expected_tags,
                      [label for _, label in parsed.tags()])
    def run_testing(self, *, batch_size: int, start_of_morpheme: str,
                    end_of_morpheme: str) -> None:
        """Evaluate the corpus in batches and print each prediction.

        For every morpheme in every batch, the prediction returned by
        ``self.evaluate`` is cut down to the span between the start and
        end symbols (start included, end excluded) and printed next to
        the original morpheme, separated by a tab.

        Args:
            batch_size: Number of corpus items per ``DataLoader`` batch.
            start_of_morpheme: Key of the start symbol in the corpus
                morpheme alphabet.
            end_of_morpheme: Key of the end symbol in the corpus
                morpheme alphabet.

        NOTE(review): ``graphemes.index(...)`` presumably raises when a
        prediction lacks the start or end symbol (it does for a plain
        list) -- confirm against the ``Morpheme`` implementation.
        """
        alphabet = self.corpus.morphemes.alphabet
        start_symbol: Symbol = alphabet[start_of_morpheme]
        end_symbol: Symbol = alphabet[end_of_morpheme]

        # Order is preserved (shuffle=False) so output follows the corpus.
        loader: DataLoader = DataLoader(
            dataset=self.corpus,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=MorphemeVectors.collate_morphemes)

        for morphemes in loader:  # type: List[Morpheme]
            predictions: List[Morpheme] = self.evaluate(morphemes)
            for position, gold in enumerate(morphemes):
                predicted = predictions[position]
                graphemes = predicted.graphemes

                first = graphemes.index(start_symbol)
                last = graphemes.index(end_symbol)

                trimmed = Morpheme(graphemes=graphemes[first:last],
                                   tpr=predicted.tpr[first:last])

                print(f"{gold}\t{trimmed}")
    def unbind(self, predicted_tpr: torch.Tensor) -> List[Morpheme]:
        """Decode a predicted tensor into one ``Morpheme`` per batch entry.

        The cosine-similarity scores are flattened to rows of width
        ``dimensions.a``; the arg-max of each row selects an index into
        ``self.alphabet``. The indices are then arranged as a
        ``(dimensions.b, dimensions.m)`` grid, one row per morpheme.

        NOTE(review): ``a``, ``b`` and ``m`` are presumably alphabet size,
        batch size and morpheme length -- confirm against ``Dimensions``.

        Args:
            predicted_tpr: Tensor accepted by ``self.check_dimensions``.

        Returns:
            ``dimensions.b`` morphemes, each built from the selected
            alphabet symbols and the matching slice of ``predicted_tpr``
            converted to nested lists.
        """
        dimensions: Dimensions = self.check_dimensions(predicted_tpr)

        similarity = self.calculate_cosine_similarity(predicted_tpr,
                                                      dimensions)

        scores_per_slot = similarity.view(-1, dimensions.a)
        best_symbol = scores_per_slot.argmax(dim=-1)
        label_grid = best_symbol.view(dimensions.b, dimensions.m)

        decoded: List[Morpheme] = []
        for batch_index in range(dimensions.b):
            symbol_ids = label_grid[batch_index].tolist()
            graphemes = [self.alphabet[symbol_id] for symbol_id in symbol_ids]
            decoded.append(
                Morpheme(graphemes=graphemes,
                         tpr=predicted_tpr[batch_index].tolist()))
        return decoded
    def parse(txt: list, pos: list):
        """Build character-level BIO-tagged items from text/analysis pairs.

        Each text and its analysis are split on single spaces and paired
        token by token. Every token contributes its characters and the
        tags returned by ``Morpheme(...).match(...)``; tokens are joined
        by a ' ' character tagged 'o'. Tags other than 'o' then get a
        'b-' prefix, or 'i-' when the previous (already prefixed) tag
        ends with the current one.

        Pairs that raise ``ValueError`` are skipped. The number of such
        errors is printed and their messages are written to
        ``./pattern_errors.txt``, which is rewritten on every call.

        Args:
            txt: Raw text strings.
            pos: Morpheme analysis strings, parallel to ``txt``.

        Returns:
            A list of ``SejongCorpusParser.Item(chars, tags)`` objects.
        """
        parsed_items = []
        failures = set()
        for sentence, analysis in zip(txt, pos):
            chars = []
            tags = []

            try:
                token_pairs = zip(sentence.split(' '), analysis.split(' '))
                for word, tagged_word in token_pairs:
                    chars.extend(word)
                    tags.extend(Morpheme(tagged_word).match(word))
                    chars.append(' ')
                    tags.append('o')

                # Drop the separator appended after the final token.
                chars = chars[:-1]
                tags = tags[:-1]

                # Rewrite in place, left to right: the comparison below
                # relies on the previous tag already carrying its prefix.
                for index, tag in enumerate(tags):
                    if tag == 'o':
                        continue

                    continues = index != 0 and tags[index - 1].endswith(tag)
                    tags[index] = ('i-' if continues else 'b-') + tag

                parsed_items.append(SejongCorpusParser.Item(chars, tags))

            except ValueError as error:
                failures.add(error)

        failure_count = len(failures)
        print('Pattern Errors: %d' % failure_count)
        with open('./pattern_errors.txt', mode='w', encoding='utf-8') as fp:
            fp.write('\n'.join(map(str, failures)))

        return parsed_items
Esempio n. 8
0
 def test_match8(self):
     """Check match() on '일컬어진다.'.

     The analysis stem '컫' differs from the surface character '컬' and
     is still expected to yield 'vb'; '지/vx' and 'ㄴ/ef' share the
     surface character '진', expected as 'vxef'.
     """
     surface, analysis = ('일컬어진다.',
                          '일/vb+컫/vb+어/ex+지/vx+ㄴ/ef+다/ef+./sf')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['vb', 'vb', 'ex', 'vxef', 'ef', 'sf']
     self.assertEqual(expected, matched)
Esempio n. 9
0
 def test_match7(self):
     """Check match() when the surface has an extra character.

     The surface character '으' has no counterpart in the analysis and
     is expected to receive the tag 'mg'.
     """
     surface, analysis = ('있으므로', '있/vx+므/ec+로/ec')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['vx', 'mg', 'ec', 'ec']
     self.assertEqual(expected, matched)
Esempio n. 10
0
 def test_match6(self):
     """Check match() on '웃으면', whose '으' is absent from the analysis.

     The unmatched surface character is expected to be tagged 'mg'.
     """
     surface, analysis = ('웃으면', '웃/vb+면/ef')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['vb', 'mg', 'ef']
     self.assertEqual(expected, matched)
Esempio n. 11
0
 def test_match5(self):
     """Check match() on '보십시오.'.

     '시/ep' and 'ㅂ/ef' share the surface character '십', for which the
     expected tag is the concatenation 'epef'.
     """
     surface, analysis = ('보십시오.',
                          '보/vx+시/ep+ㅂ/ef+시/ef+오/ef+./sf')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['vx', 'epef', 'ef', 'ef', 'sf']
     self.assertEqual(expected, matched)
Esempio n. 12
0
 def test_match4(self):
     """Check match() on '활용한다든가'.

     '하/xv' and 'ㄴ/ec' share the surface character '한', for which the
     expected tag is the concatenation 'xvec'.
     """
     surface, analysis = ('활용한다든가',
                          '활/na+용/na+하/xv+ㄴ/ec+다/ec+든/ec+가/ec')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['na', 'na', 'xvec', 'ec', 'ec', 'ec']
     self.assertEqual(expected, matched)
Esempio n. 13
0
 def test_match2(self):
     """Check match() when an analysed morpheme has no surface character.

     '아/ef' does not appear in the surface form '끝내!', and the expected
     result carries no 'ef' tag.
     """
     surface, analysis = ('끝내!', '끝내/vb+아/ef+!/sf')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['vb', 'vb', 'sf']
     self.assertEqual(expected, matched)
Esempio n. 14
0
 def test_match(self):
     """Check match() on the contracted form '썼거니와'.

     '쓰/vb' and '었/ep' share the surface character '썼', for which the
     expected tag is the concatenation 'vbep'.
     """
     surface, analysis = ('썼거니와', '쓰/vb+었/ep+거니와/ec')
     parsed = Morpheme(analysis)
     matched = parsed.match(surface)

     expected = ['vbep', 'ec', 'ec', 'ec']
     self.assertEqual(expected, matched)