class BundleCreator:
    """Callable that enriches a bundle and attaches a repetition index to it.

    Calling an instance runs every enricher on the bundle, builds the index
    with :meth:`build_index`, stores it on the bundle as ``index_frame`` and
    passes the bundle plus the indexed word ids to
    ``filter_bundle_by_words``. Intermediate results of the most recent run
    are kept on ``self.last_`` for inspection.
    """

    def __init__(self, enrichers):
        """Set up the algorithm and remember the enrichers.

        Args:
            enrichers: Iterable of callables; each is invoked with the
                bundle in :meth:`__call__`.
        """
        self.bundles = []
        self.alg = RepetitionsAlgorithm(100, True, True, True, True)
        self.enrichers = enrichers
        # Created here so build_index() also works when it is called
        # directly, before the first __call__(); __call__ replaces it with a
        # fresh holder on every run.
        self.last_ = Obj()

    def build_index(self, db):
        """Run the repetition algorithm on ``db`` and return the index frame.

        The ``'src'`` frame of ``db`` is modified in place: a
        ``check_requested`` column set to ``True`` is added before the
        algorithm runs on it.

        Args:
            db: Bundle exposing ``data_frames`` with ``'src'`` and
                ``'pymorphy'`` entries. The ``'pymorphy'`` frame must have a
                ``POS`` column and is joined by its index against
                ``word_id``.

        Returns:
            A frame with columns ``word_id``, ``file_id``, ``match_type``
            and ``another_word_id`` for the rows whose ``repetition_status``
            is ``False``, excluding rows with a filtered-out ``POS`` value.
        """
        idf = db.data_frames['src']
        idf['check_requested'] = True
        self.alg.run(idf)
        self.last_.after_algorithm = idf

        # Keep only the rows with a False status and the columns that
        # describe them.
        idf = idf.loc[~idf.repetition_status]
        idf = idf[[
            'word_id', 'file_id', 'repetition_algorithm',
            'repetition_reference'
        ]]
        # Rename by name rather than by position so the mapping stays
        # correct even if the selection above is reordered.
        idf = idf.rename(columns={
            'repetition_algorithm': 'match_type',
            'repetition_reference': 'another_word_id',
        })

        idf = idf.merge(db.data_frames['pymorphy'][['POS']],
                        left_on='word_id',
                        right_index=True)
        # NOTE(review): presumably pymorphy tags for pronouns, prepositions,
        # conjunctions and particles -- confirm against the tag set in use.
        idf = idf.loc[~idf.POS.isin(['NPRO', 'PREP', 'CONJ', 'PRCL'])]
        idf = idf.drop('POS', axis=1)
        self.last_.after_filtration = idf

        return idf

    def __call__(self, db):
        """Enrich ``db``, attach the index and filter the bundle by it.

        Args:
            db: Bundle to process; it is modified in place.

        Returns:
            The same ``db`` object, with ``index_frame`` set.
        """
        self.last_ = Obj()
        # Snapshot of the untouched input, taken before any enricher runs.
        self.last_.incoming = deepcopy(db)
        for enricher in self.enrichers:
            enricher(db)
        idf = self.build_index(db)
        db.index_frame = idf
        # Both ends of every index row are passed on to the filter.
        wids = list(idf.word_id) + list(idf.another_word_id)
        filter_bundle_by_words(db, wids)
        return db
 def __init__(self, enrichers):
     """Remember the enrichers and create the repetition algorithm.

     Args:
         enrichers: Stored unchanged on ``self.enrichers``.
     """
     self.enrichers = enrichers
     self.bundles = []
     self.alg = RepetitionsAlgorithm(
         100, True, True, True, True
     )
Example #3
0
 def test_tikhonov_join(self):
     """Expect statuses ``[True, False]`` for the two-word phrase."""
     algorithm = RepetitionsAlgorithm(50, True, True, True)
     result = algorithm.run_on_string('двуединое единообразие')
     statuses = list(result.repetition_status)
     self.assertListEqual([True, False], statuses)
Example #4
0
    def test_usage_of_provided_pymorphy_column(self):
        """Check that a ``pymorphy`` frame in the bundle changes the result.

        Three bundles are built from the same separated string: one without
        a ``pymorphy`` frame (all statuses expected ``True``) and two with
        hand-written ``normal_form`` values (not all statuses ``True``).
        """
        base = Separator.separate_string("окно открыто")
        base['check_requested'] = True

        # Case 1: no pymorphy frame supplied.
        plain = base.copy()
        RepetitionsAlgorithm(50, False, True, False).run_on_bundle(
            DataBundle(src=plain))
        self.assertTrue(plain.repetition_status.all())

        # Case 2: every word gets the same normal form.
        same_form = base.copy()
        forms = same_form[['word_id']].assign(normal_form='окно')
        forms = forms.set_index('word_id')
        RepetitionsAlgorithm(50, False, True, False).run_on_bundle(
            DataBundle(src=same_form, pymorphy=forms))
        self.assertFalse(same_form.repetition_status.all())

        # Case 3: distinct normal forms, with the algorithm's last flag
        # enabled instead of the middle one.
        distinct_forms = base.copy()
        forms = distinct_forms[['word_id']].assign(
            normal_form=['двуединый', 'единообразие'])
        forms = forms.set_index('word_id')
        RepetitionsAlgorithm(50, False, False, True).run_on_bundle(
            DataBundle(src=distinct_forms, pymorphy=forms))
        self.assertFalse(distinct_forms.repetition_status.all())
Example #5
0
 def test_extended_mode(self):
     """Check the reference and algorithm columns for a four-word string."""
     algorithm = RepetitionsAlgorithm(50, True, True, True)
     result = algorithm.run_on_string('Един един единый двуединый')

     # Expected reference value for each word, in order.
     expected_references = [-1, 0, 1, 2]
     self.assertListEqual(expected_references,
                          list(result.repetition_reference))

     # Expected algorithm label for each word, in order.
     expected_algorithms = [None, 'simple', 'normal', 'tikhonov']
     self.assertListEqual(expected_algorithms,
                          list(result.repetition_algorithm))
Example #6
0
 def test_on_empty(self):
     """Expect every status to be ``True`` when an empty list is passed."""
     algorithm = RepetitionsAlgorithm(50, True, True, True)
     result = algorithm.run_on_string("Здесь нет ошибок", [])
     statuses = list(result.repetition_status)
     self.assertListEqual([True, True, True], statuses)
Example #7
0
 def test_multiline(self):
     """Expect only the last word to be ``False`` across a line break."""
     algorithm = RepetitionsAlgorithm(50, True, True, True)
     result = algorithm.run_on_string("Повтор\nЕще повтор")
     statuses = list(result.repetition_status)
     self.assertListEqual([True, True, False], statuses)
Example #8
0
 def test_all(self):
     """Check which word ids get a False status with all three flags on."""
     algorithm = RepetitionsAlgorithm(50, True, True, True)
     result = algorithm.run_on_string(text)
     flagged = result.loc[~result.repetition_status]
     self.assertListEqual([2, 5, 7, 10, 12, 14], list(flagged.word_id))
Example #9
0
 def test_normal(self):
     """Check which word ids get a False status with only the middle flag on."""
     algorithm = RepetitionsAlgorithm(50, False, True, False)
     result = algorithm.run_on_string(text)
     flagged = result.loc[~result.repetition_status]
     self.assertListEqual([2, 7, 10], list(flagged.word_id))