# some spaces again, just to make sure modifiers.re_replace(r'\s+', ' '), modifiers.re_replace(r'\s+(?=[,.!?])', ''), modifiers.strip(), modifiers.re_replace('^[^*ЁёА-Яа-я]+', ''), modifiers.re_fullmatch_ban(''), modifiers.remove_to_much_gap_percentage(r'\W+', r'\*(\w+)[?]?\*', 0.5), modifiers.calculate_key() ]
# NOTE(review): the line above appears to be the tail of the
# `definitions_mods` list; its opening is outside this view, so it is left
# exactly as found -- restore its line breaks together with the rest of the
# list.


@gen_resource('DefinitionsResource', modifiers=definitions_mods)
def read_articles():
    """
    Yield raw Explanation objects read from the definitions source file.

    The file at ``_raw_data`` is consumed as consecutive pairs of lines: a
    title line followed by a description line.  Reading stops at end of file
    or at the first empty title line.
    """
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for title_line in source:
            title = title_line.strip('\n')
            if not title:
                # An empty title line terminates the data.
                break
            # A missing description line (odd line count) yields ''.
            desc = next(source, '').strip('\n')
            yield Explanation(title, desc)
# NOTE(review): the four statements below appear to be the tail of a loop over
# the already known explanations; the loop header is outside this view, so
# they are reproduced verbatim at the column of the original line -- confirm
# their nesting against the full file.
all_expls.append(expl)
by_key[expl.key] = expl
by_text[expl.text] = expl
by_title.setdefault(expl.title, []).append(expl)

# Largest Levenshtein distance at which a selected text is still treated as a
# reworded version of a known explanation.
MAX_FUZZY_DISTANCE = 12

# Keys of known explanations already claimed by an exact (key or text) match;
# such explanations are excluded from fuzzy matching.
abused = set()
dmp = diff_match_patch()

with FileExplanationStorage(SEL_PATH) as inp, \
        FileExplanationStorage(OUT_PATH) as out:
    out.clear()
    for sel in inp.entries():
        if sel.key in by_key:
            # The selected entry's key is still present among known entries.
            abused.add(sel.key)
        elif sel.text in by_text:
            # Same text under a different key: adopt the known key.
            known = by_text[sel.text]
            abused.add(known.key)
            sel.key = known.key
        else:
            # Look for the closest unclaimed explanation with the same title.
            # An unknown title or an empty candidate set simply means "no
            # match" instead of raising KeyError / ValueError, and distances
            # are compared on their own so a tie never compares Explanation
            # objects (the first candidate with the smallest distance wins).
            best_dist = None
            best_expl = None
            for cand in by_title.get(sel.title, ()):
                if cand.key in abused:
                    continue
                dist = dmp.diff_levenshtein(
                    dmp.diff_main(cand.text, sel.text))
                if best_dist is None or dist < best_dist:
                    best_dist, best_expl = dist, cand

            if best_expl is not None and best_dist <= MAX_FUZZY_DISTANCE:
                # fuzzy match
                sel.key = best_expl.key
            else:
                # something completely new
                # NOTE(review): indentation was lost in this view; key
                # recalculation is assumed to belong to this branch only.
                sel.key = None
                sel = calculate_key()(sel)
        # NOTE(review): assumed to run for every selected entry (loop level).
        out.add_entry(sel)
import itertools

from preparation.resources.Resource import gen_resource
from preparation import modifiers
from hb_res.explanations.Explanation import Explanation

# Modifiers passed to gen_resource for 'SampleResource', in this order.
sample_modifiers = (
    modifiers.str_replace('?', 'ё'),
    modifiers.shadow_cognates(length_threshold=3, sep_re='(\\s|!|\\.)+'),
    modifiers.re_replace('\\.', ' '),
    modifiers.normalize_title(),
    modifiers.calculate_key(),
)


@gen_resource('SampleResource', sample_modifiers)
def sample_parser():
    """
    Return an iterator of raw sample Explanations.

    Each hard-coded (title, text) pair is turned into an Explanation by
    passing the pair's items as positional arguments.
    """
    sample_rows = (
        ('ПОРА', 'Однажды.в.студ?ную.зимнюю.пору.я.из.лесу.вышел.'),
        ('ИВАН', 'Один день Ивана Денисовича'),
        ('унылая', 'Унылая пора! очей очарованье!'),
    )
    return itertools.starmap(Explanation, sample_rows)
# NOTE(review): the two statements below appear to be the tail of a loop over
# the already known explanations; the loop header is outside this view, so
# they are reproduced verbatim at the column of the original line -- confirm
# their nesting against the full file.
by_text[expl.text] = expl
by_title.setdefault(expl.title, []).append(expl)

# Largest Levenshtein distance at which a selected text is still treated as a
# reworded version of a known explanation.
MAX_FUZZY_DISTANCE = 12

# Keys of known explanations already claimed by an exact (key or text) match;
# such explanations are excluded from fuzzy matching.
abused = set()
dmp = diff_match_patch()

with FileExplanationStorage(SEL_PATH) as inp, \
        FileExplanationStorage(OUT_PATH) as out:
    out.clear()
    for sel in inp.entries():
        if sel.key in by_key:
            # The selected entry's key is still present among known entries.
            abused.add(sel.key)
        elif sel.text in by_text:
            # Same text under a different key: adopt the known key.
            known = by_text[sel.text]
            abused.add(known.key)
            sel.key = known.key
        else:
            # Look for the closest unclaimed explanation with the same title.
            # An unknown title or an empty candidate set simply means "no
            # match" instead of raising KeyError / ValueError, and distances
            # are compared on their own so a tie never compares Explanation
            # objects (the first candidate with the smallest distance wins).
            best_dist = None
            best_expl = None
            for cand in by_title.get(sel.title, ()):
                if cand.key in abused:
                    continue
                dist = dmp.diff_levenshtein(
                    dmp.diff_main(cand.text, sel.text))
                if best_dist is None or dist < best_dist:
                    best_dist, best_expl = dist, cand

            if best_expl is not None and best_dist <= MAX_FUZZY_DISTANCE:
                # fuzzy match
                sel.key = best_expl.key
            else:
                # something completely new
                # NOTE(review): indentation was lost in this view; key
                # recalculation is assumed to belong to this branch only.
                sel.key = None
                sel = calculate_key()(sel)
        # NOTE(review): assumed to run for every selected entry (loop level).
        out.add_entry(sel)