Esempio n. 1
0
    def test_update(self):
        """Check that ``update_separation`` agrees with a from-scratch separation.

        The paragraphs of the module-level ``text`` are separated once, then a
        reordered list with four inserted paragraphs is processed both ways:
        freshly via ``separate_paragraphs`` and incrementally via
        ``update_separation``.  Every column of the fresh frame must match the
        updated frame, and the bookkeeping columns (``updated``,
        ``original_word_id``, ``original_sentence_id``,
        ``original_paragraph_id``) must hold the expected values.
        """
        pars = Query.en(text.split('\n')).where(lambda z: z != '').to_list()
        df = Separator.separate_paragraphs(pars)

        new_pars = [
            pars[0], pars[1], 'Нулевая. Вставка', pars[4], pars[3],
            'Первая. Вставка', 'Вторая. Вставка', pars[6], 'Третья. Вставка'
        ]
        df1 = Separator.separate_paragraphs(new_pars)

        # The third argument lines up with ``new_pars``: the index of the
        # paragraph in ``pars`` it was taken from, or None for inserted text.
        df2 = Separator.update_separation(
            df, new_pars, [0, 1, None, 4, 3, None, None, 6, None])

        # Every column produced by a fresh separation must be reproduced.
        for c in df1.columns:
            self.assertListEqual(list(df1[c]), list(df2[c]))

        # NOTE: a leftover debugging statement that changed the global pandas
        # display width used to live here; it was removed so this test no
        # longer alters process-wide state seen by other tests.

        # In the expected lists below the rows of inserted paragraphs are
        # flagged True in ``updated`` and carry -1 in all ``original_*`` ids.
        self.assertListEqual([
            False, False, False, False, False, False, True, True, True, False,
            False, False, False, False, False, True, True, True, True, True,
            True, False, False, False, True, True, True
        ], list(df2.updated))
        self.assertListEqual([
            0, 1, 2, 3, 4, 5, -1, -1, -1, 12, 13, 14, 9, 10, 11, -1, -1, -1,
            -1, -1, -1, 18, 19, 20, -1, -1, -1
        ], list(df2.original_word_id))
        self.assertListEqual([
            0, 0, 1, 2, 2, 3, -1, -1, -1, 8, 8, 9, 6, 6, 7, -1, -1, -1, -1, -1,
            -1, 12, 12, 13, -1, -1, -1
        ], list(df2.original_sentence_id))
        self.assertListEqual([
            0, 0, 0, 1, 1, 1, -1, -1, -1, 4, 4, 4, 3, 3, 3, -1, -1, -1, -1, -1,
            -1, 6, 6, 6, -1, -1, -1
        ], list(df2.original_paragraph_id))
Esempio n. 2
0
 def test_separation_multi(self):
     """Separate two paragraphs and check sentence, word and paragraph ids.

     The first paragraph holds two sentences, the second holds one; the
     expected id lists below have one entry per resulting row.
     """
     paragraphs = ['первое предожение. Второе.', 'Второй параграф']
     frame = Separator.separate_paragraphs(paragraphs)

     sentence_ids = list(frame.sentence_id)
     self.assertListEqual([0, 0, 0, 1, 1, 2, 2], sentence_ids)

     word_ids = list(frame.word_id)
     self.assertListEqual([0, 1, 2, 3, 4, 5, 6], word_ids)

     paragraph_ids = list(frame.paragraph_id)
     self.assertListEqual([0, 0, 0, 0, 0, 1, 1], paragraph_ids)
Esempio n. 3
0
 def test_highlight(self):
     """Render a frame whose second word is flagged in a highlight column.

     The row with ``word_id == 1`` is marked True in ``highlight``; the
     viewer output is expected to wrap that word in a coloured ``<span>``.
     """
     frame = Separator.separate_string('Мама мыла раму')
     frame['highlight'] = frame.word_id == 1

     viewer = DfViewer(as_html_object=False, highlight_column='highlight')
     expected_markup = (
         'Мама <span style="background-color:#ffdddd;">мыла</span> раму'
     )
     self.assertEqual(expected_markup, viewer.convert(frame))
Esempio n. 4
0
 def test_separation(self):
     """Check word offsets and lengths for a string with mixed punctuation.

     The sample contains guillemets, a hyphenated word, runs of spaces, a
     dash, an ellipsis character and a ``!..`` sequence.
     """
     sample = '«Какой-нибудь»   текст —  с знаками… И еще словами!.. Вот так.'
     frame = Separator.separate_string(sample)

     expected_offsets = [
         0, 1, 13, 17, 23, 26, 28, 35, 37, 39, 43, 50, 54, 58, 61
     ]
     expected_lengths = [1, 12, 1, 5, 1, 1, 7, 1, 1, 3, 7, 3, 3, 3, 1]

     self.assertListEqual(list(frame.word_offset), expected_offsets)
     self.assertListEqual(list(frame.word_length), expected_lengths)
Esempio n. 5
0
 @classmethod
 def setUpClass(cls) -> None:
     """Build the featurizers and their results once for the whole class.

     The module-level ``text`` is separated, its ``word_id``,
     ``sentence_id`` and ``paragraph_id`` columns are shifted by 100, and
     the shifted frame is passed through ``SlovnetFeaturizer``; that result
     is then passed through ``SlovnetContextFeaturizer``.  Both results are
     stored on the class for the individual tests.

     unittest invokes this hook on the class itself with no arguments, so
     it has to be a classmethod.
     """
     super(SlovnetFeaturizersTestCase, cls).setUpClass()
     cls.analyzer = SlovnetFeaturizer()
     cls.context_featurizer = SlovnetContextFeaturizer()
     df = Separator.separate_string(text)
     # NOTE(review): the +100 shift presumably guards against featurizers
     # that assume zero-based ids -- confirm against the tests using it.
     for c in ['word_id', 'sentence_id', 'paragraph_id']:
         df[c] += 100
     cls.result = cls.analyzer.featurize(df)
     cls.context_result = cls.context_featurizer.featurize(cls.result)
Esempio n. 6
0
 def test_tikhonov(self):
     """Enrich a two-word bundle and check the Tikhonov morpheme split.

     The bundle is first enriched by the PyMorphy featurizer (wrapped as an
     enricher) and then by ``MorphemeTikhonovEnricher``; the resulting
     ``tikhonov_morphemes`` frame must list the expected morphemes and
     morpheme types in order.
     """
     bundle = Separator.separate_string_into_bundle(
         "Безответственное пароходство")

     pymorphy_enricher = PyMorphyFeaturizer().as_enricher()
     pymorphy_enricher.enrich(bundle)
     tikhonov_enricher = MorphemeTikhonovEnricher()
     tikhonov_enricher.enrich(bundle)

     expected_morphemes = [
         'без', 'ответственн', 'ый', 'пар', 'о', 'ход', 'ств', 'о'
     ]
     self.assertListEqual(
         expected_morphemes, list(bundle.tikhonov_morphemes.morpheme))

     expected_types = [
         'PREF', 'ROOT', 'END', 'ROOT', 'LINK', 'ROOT', 'SUFF', 'END'
     ]
     self.assertListEqual(
         expected_types, list(bundle.tikhonov_morphemes.morpheme_type))
    def test_update_enrich_pymorphy(self):
        """Compare ``update_enrich`` with a full ``enrich`` on edited text.

        The original paragraphs are separated and enriched.  A new paragraph
        list (two paragraphs swapped, one inserted, one kept) is then
        processed twice: incrementally through ``update_separation`` plus
        ``update_enrich``, and from scratch through ``separate_paragraphs``
        plus ``enrich``.  Both ``pymorphy`` frames, sorted by ``word_id``,
        are compared column by column with ``self.check``.
        """
        enricher = PyMorphyFeaturizer().as_enricher()

        # Baseline: the original paragraphs, fully enriched.
        original_pars = (
            Query.en(text.split('\n')).where(lambda z: z != '').to_list()
        )
        original_df = Separator.separate_paragraphs(original_pars)
        original_bundle = DataBundle(src=original_df)
        enricher.enrich(original_bundle)

        edited_pars = [
            original_pars[1],
            original_pars[0],
            'Вставленное предложение.',
            original_pars[2],
        ]

        # Incremental path: reuse the baseline separation and enrichment.
        updated_df = Separator.update_separation(
            original_df, edited_pars, [1, 0, None, 2])
        updated_bundle = DataBundle(src=updated_df)
        enricher.update_enrich(original_bundle, updated_bundle)

        # Control path: everything recomputed from the edited paragraphs.
        control_df = Separator.separate_paragraphs(edited_pars)
        control_bundle = DataBundle(src=control_df)
        enricher.enrich(control_bundle)

        updated_morph = updated_bundle.pymorphy.reset_index()
        updated_morph = updated_morph.sort_values('word_id')
        control_morph = control_bundle.pymorphy.reset_index()
        control_morph = control_morph.sort_values('word_id')

        for column in control_morph.columns:
            self.check(control_morph, updated_morph, column)
Esempio n. 8
0
 def test_pymorphy(self):
     """Featurize the module-level ``text`` with word ids shifted by 100.

     The result must be indexed by the shifted ids 100..110, carry the
     expected normal forms, and expose the expected column set in order.
     """
     frame = Separator.separate_string(text)
     frame.word_id += 100
     featurized = PyMorphyFeaturizer().featurize(frame)

     expected_index = list(range(100, 111))
     self.assertListEqual(expected_index, list(featurized.index))

     expected_normal_forms = [
         'он', 'подойти', 'к', 'дверь', '.', 'за', 'она', 'никто', 'не',
         'быть', '.'
     ]
     self.assertListEqual(
         expected_normal_forms, list(featurized.normal_form))

     expected_columns = [
         'normal_form', 'alternatives', 'score', 'delta_score', 'POS',
         'animacy', 'gender', 'number', 'case', 'aspect', 'transitivity',
         'person', 'tense', 'mood', 'voice', 'involvement'
     ]
     self.assertListEqual(expected_columns, list(featurized.columns))
Esempio n. 9
0
    def test_usage_of_provided_pymorphy_column(self):
        """Run ``RepetitionsAlgorithm`` with and without a supplied pymorphy frame.

        Three scenarios on the same two-word string:

        1. No ``pymorphy`` frame in the bundle: every row is expected to
           have a truthy ``repetition_status``.
        2. A ``pymorphy`` frame giving both words the normal form 'окно':
           not every row may have a truthy ``repetition_status``.
        3. A ``pymorphy`` frame with two different normal forms and a
           different flag combination for the algorithm: again not every
           row may have a truthy ``repetition_status``.

        The meaning of the positional flags passed to the algorithm is not
        visible from this test.
        """
        base = Separator.separate_string("окно открыто")
        base['check_requested'] = True

        # Scenario 1: bundle without a pymorphy frame.
        plain = base.copy()
        algorithm = RepetitionsAlgorithm(50, False, True, False)
        algorithm.run_on_bundle(DataBundle(src=plain))
        self.assertTrue(plain.repetition_status.all())

        # Scenario 2: identical normal form for both words.
        same_form = base.copy()
        same_form_morph = (
            same_form[['word_id']]
            .assign(normal_form='окно')
            .set_index('word_id')
        )
        algorithm = RepetitionsAlgorithm(50, False, True, False)
        algorithm.run_on_bundle(
            DataBundle(src=same_form, pymorphy=same_form_morph))
        self.assertFalse(same_form.repetition_status.all())

        # Scenario 3: distinct normal forms, other flag combination.
        distinct_form = base.copy()
        distinct_form_morph = (
            distinct_form[['word_id']]
            .assign(normal_form=['двуединый', 'единообразие'])
            .set_index('word_id')
        )
        algorithm = RepetitionsAlgorithm(50, False, False, True)
        algorithm.run_on_bundle(
            DataBundle(src=distinct_form, pymorphy=distinct_form_morph))
        self.assertFalse(distinct_form.repetition_status.all())
Esempio n. 10
0
 def test_viewer(self):
     """A separated string converted by a plain viewer must round-trip."""
     source = 'Мама мыла раму'
     frame = Separator.separate_string(source)
     viewer = DfViewer(as_html_object=False)
     rendered = viewer.convert(frame)
     self.assertEqual(source, rendered)
Esempio n. 11
0
 def test_separation_columns(self):
     """The separated frame must expose exactly ``Separator.COLUMNS``."""
     sample = '«Какой-нибудь»   текст —  с знаками… И еще словами!.. Вот так.'
     frame = Separator.separate_string(sample)
     actual_columns = list(frame.columns)
     self.assertListEqual(Separator.COLUMNS, actual_columns)
Esempio n. 12
0
 def test_separator_types(self):
     """Check the ``word_type`` assigned to each token of a mixed string.

     The input mixes Cyrillic words, a word containing the character with
     code point 8242, Latin text, punctuation and an apostrophe form.
     """
     prime = chr(8242)
     sample = f"Слово сло{prime}во! Qwe - йцу it's"
     frame = Separator.separate_string(sample)

     expected_types = [
         'ru', 'ru', 'punct', 'unk', 'punct', 'ru', 'unk', 'unk', 'unk'
     ]
     self.assertListEqual(expected_types, list(frame.word_type))
Esempio n. 13
0
 def test_separation_string_with_nl(self):
     """A newline inside the string must start a new paragraph id."""
     frame = Separator.separate_string('Строка\nВторая строка')
     paragraph_ids = list(frame.paragraph_id)
     self.assertListEqual([0, 1, 1], paragraph_ids)