class BundleCreator:
    """Callable step that enriches a bundle and attaches a repetition index.

    Calling an instance runs every enricher on the bundle, builds the index
    frame with :meth:`build_index`, stores it on the bundle as
    ``index_frame`` and hands the bundle to ``filter_bundle_by_words``
    together with the word ids mentioned in the index.

    Intermediate results of the most recent call are kept on ``self.last_``
    (``incoming``, ``after_algorithm``, ``after_filtration``).
    """

    def __init__(self, enrichers):
        """Remember the enrichers and create the repetition algorithm.

        Args:
            enrichers: iterable of callables; each one is invoked with the
                bundle at the start of ``__call__``.
        """
        self.bundles = []
        self.alg = RepetitionsAlgorithm(100, True, True, True, True)
        self.enrichers = enrichers

    def build_index(self, db):
        """Run the repetition algorithm on ``src`` and return the index frame.

        The ``src`` frame of ``db`` is modified in place: a
        ``check_requested`` column set to ``True`` is added before the
        algorithm runs.

        NOTE: writes to ``self.last_``, which is only created in
        ``__call__``.

        Args:
            db: bundle exposing ``data_frames`` with ``'src'`` and
                ``'pymorphy'`` frames.

        Returns:
            Frame with columns ``word_id``, ``file_id``, ``match_type`` and
            ``another_word_id``. It holds the rows whose
            ``repetition_status`` is false, minus words whose ``POS`` is one
            of ``NPRO``, ``PREP``, ``CONJ``, ``PRCL``.
        """
        source = db.data_frames['src']
        source['check_requested'] = True
        self.alg.run(source)
        self.last_.after_algorithm = source

        # Keep only the rows the algorithm marked with a false status.
        flagged = source.loc[~source.repetition_status]
        index = flagged[[
            'word_id',
            'file_id',
            'repetition_algorithm',
            'repetition_reference',
        ]].rename(columns={
            'repetition_algorithm': 'match_type',
            'repetition_reference': 'another_word_id',
        })

        # POS is joined in only to filter on it; it is dropped afterwards.
        pos_column = db.data_frames['pymorphy'][['POS']]
        with_pos = index.merge(pos_column, left_on='word_id', right_index=True)
        excluded = with_pos['POS'].isin(['NPRO', 'PREP', 'CONJ', 'PRCL'])
        result = with_pos.loc[~excluded].drop(columns='POS')

        self.last_.after_filtration = result
        return result

    def __call__(self, db):
        """Enrich ``db``, attach the index frame, filter the bundle, return it.

        Args:
            db: bundle to process; it is modified in place.

        Returns:
            The same ``db`` object.
        """
        self.last_ = Obj()
        # Snapshot of the input as it was before any enricher touched it.
        self.last_.incoming = deepcopy(db)

        for enrich in self.enrichers:
            enrich(db)

        index = self.build_index(db)
        db.index_frame = index

        # Both ends of every match are passed to the filter.
        word_ids = [*index.word_id, *index.another_word_id]
        filter_bundle_by_words(db, word_ids)
        return db
def __init__(self, enrichers):
    """Set up an empty bundle list, the repetition algorithm and the enrichers.

    Args:
        enrichers: stored unchanged on ``self.enrichers``.
    """
    self.enrichers = enrichers
    self.bundles = []
    self.alg = RepetitionsAlgorithm(100, True, True, True, True)
def test_tikhonov_join(self):
    """Two-word input: the first word gets status True, the second False."""
    algorithm = RepetitionsAlgorithm(50, True, True, True)
    result = algorithm.run_on_string('двуединое единообразие')
    statuses = list(result.repetition_status)
    self.assertListEqual([True, False], statuses)
def test_usage_of_provided_pymorphy_column(self):
    """A ``pymorphy`` frame in the bundle changes the algorithm's outcome.

    The same two-word frame is run three times:

    1. without a ``pymorphy`` frame: every status is True;
    2. with both words given the same ``normal_form``: not every status is
       True;
    3. with two different ``normal_form`` values and a different flag set:
       again not every status is True.
    """
    base = Separator.separate_string("окно открыто")
    base['check_requested'] = True

    # 1. No pymorphy frame supplied.
    plain = base.copy()
    RepetitionsAlgorithm(50, False, True, False).run_on_bundle(
        DataBundle(src=plain))
    self.assertTrue(plain.repetition_status.all())

    # 2. Both words share one supplied normal form.
    same_form = base.copy()
    forms = same_form[['word_id']].copy()
    forms['normal_form'] = 'окно'
    forms = forms.set_index('word_id')
    RepetitionsAlgorithm(50, False, True, False).run_on_bundle(
        DataBundle(src=same_form, pymorphy=forms))
    self.assertFalse(same_form.repetition_status.all())

    # 3. Distinct supplied normal forms, different algorithm flags.
    distinct_forms = base.copy()
    forms = distinct_forms[['word_id']].copy()
    forms['normal_form'] = ['двуединый', 'единообразие']
    forms = forms.set_index('word_id')
    RepetitionsAlgorithm(50, False, False, True).run_on_bundle(
        DataBundle(src=distinct_forms, pymorphy=forms))
    self.assertFalse(distinct_forms.repetition_status.all())
def test_extended_mode(self):
    """Check the per-word reference and algorithm-name columns."""
    algorithm = RepetitionsAlgorithm(50, True, True, True)
    result = algorithm.run_on_string('Един един единый двуединый')

    references = list(result.repetition_reference)
    self.assertListEqual([-1, 0, 1, 2], references)

    algorithm_names = list(result.repetition_algorithm)
    self.assertListEqual(
        [None, 'simple', 'normal', 'tikhonov'], algorithm_names)
def test_on_empty(self):
    """With an empty list as second argument every word gets status True."""
    algorithm = RepetitionsAlgorithm(50, True, True, True)
    result = algorithm.run_on_string("Здесь нет ошибок", [])
    statuses = list(result.repetition_status)
    self.assertListEqual([True, True, True], statuses)
def test_multiline(self):
    """Input containing a newline: only the third word gets status False."""
    algorithm = RepetitionsAlgorithm(50, True, True, True)
    result = algorithm.run_on_string("Повтор\nЕще повтор")
    statuses = list(result.repetition_status)
    self.assertListEqual([True, True, False], statuses)
def test_all(self):
    """All three flags on: check which word ids end up with a false status."""
    algorithm = RepetitionsAlgorithm(50, True, True, True)
    result = algorithm.run_on_string(text)
    flagged = result.loc[~result.repetition_status]
    self.assertListEqual([2, 5, 7, 10, 12, 14], list(flagged.word_id))
def test_normal(self):
    """Only the second flag on: check which word ids get a false status."""
    algorithm = RepetitionsAlgorithm(50, False, True, False)
    result = algorithm.run_on_string(text)
    flagged = result.loc[~result.repetition_status]
    self.assertListEqual([2, 7, 10], list(flagged.word_id))