# some spaces again, just to make sure

    modifiers.re_replace(r'\s+', ' '),
    modifiers.re_replace(r'\s+(?=[,.!?])', ''),

    modifiers.strip(),
    modifiers.re_replace('^[^*ЁёА-Яа-я]+', ''),

    modifiers.re_fullmatch_ban(''),

    modifiers.remove_to_much_gap_percentage(r'\W+', r'\*(\w+)[?]?\*', 0.5),

    modifiers.calculate_key()
]


@gen_resource('DefinitionsResource', modifiers=definitions_mods)
def read_articles():
    """
    Yield one raw Explanation per (title, description) pair of lines.

    The file at ``_raw_data`` is opened as UTF-8 text and consumed two lines
    at a time: the title first, then the description.  Reading stops at the
    first empty title line, which is also what the end of the file looks like.
    """
    def next_line(stream):
        # Only newline characters are stripped; other whitespace is kept.
        return stream.readline().strip('\n')

    with open(_raw_data, 'r', encoding='utf-8') as source:
        # iter(callable, sentinel) ends the loop once an empty title is read.
        for title in iter(lambda: next_line(source), ''):
            yield Explanation(title, next_line(source))
            all_expls.append(expl)
            by_key[expl.key] = expl
            by_text[expl.text] = expl
            by_title.setdefault(expl.title, []).append(expl)

# Keys of known explanations already claimed by an exact (key or text) match;
# they are excluded from the fuzzy-match candidates below.
abused = set()

dmp = diff_match_patch()
with FileExplanationStorage(SEL_PATH) as inp:
    with FileExplanationStorage(OUT_PATH) as out:
        # Start from an empty output storage; every input entry is re-added.
        out.clear()

        for sel in inp.entries():
            if sel.key in by_key:
                # The key is already known: keep the entry unchanged.
                abused.add(sel.key)
            elif sel.text in by_text:
                # Same text is known under another key: adopt that key.
                known = by_text[sel.text]
                abused.add(known.key)
                sel.key = known.key
            else:
                # Score every unclaimed explanation with the same title by
                # its edit distance to the selected text.  An unknown title
                # or an exhausted candidate list simply yields no scores
                # instead of a KeyError / ValueError from min().
                scored = [
                    (dmp.diff_levenshtein(dmp.diff_main(e.text, sel.text)), e)
                    for e in by_title.get(sel.title, ())
                    if e.key not in abused
                ]
                # Compare by distance only, so equal distances never fall
                # back to comparing the explanation objects themselves.
                best = min(scored, key=lambda pair: pair[0]) if scored else None
                if best is not None and best[0] <= 12:
                    # fuzzy match: the texts differ by at most 12 edits
                    # NOTE(review): the matched key is not added to `abused`,
                    # so several entries may share it - confirm intended.
                    sel.key = best[1].key
                else:
                    # something completely new
                    sel.key = None
                    sel = calculate_key()(sel)
            out.add_entry(sel)
import itertools

from preparation.resources.Resource import gen_resource
from preparation import modifiers
from hb_res.explanations.Explanation import Explanation


# Modifiers handed to gen_resource for 'SampleResource' (see sample_parser),
# listed in this fixed order.
sample_modifiers = (
    # presumably restores 'ё' where the sample text has '?' (e.g. 'студ?ную')
    # - TODO confirm against modifiers.str_replace
    modifiers.str_replace('?', 'ё'),
    # sep_re matches runs of whitespace, '!' and '.'
    modifiers.shadow_cognates(length_threshold=3, sep_re='(\\s|!|\\.)+'),
    # the pattern matches a literal '.', the replacement is a single space
    modifiers.re_replace('\\.', ' '),
    modifiers.normalize_title(),
    modifiers.calculate_key(),
)


@gen_resource('SampleResource', sample_modifiers)
def sample_parser():
    """
    Build the raw Explanations of the sample resource.

    Returns a lazy ``itertools.starmap`` iterator which constructs one
    Explanation from each hard-coded (title, text) pair below.
    """
    title_text_pairs = (
        ('ПОРА', 'Однажды.в.студ?ную.зимнюю.пору.я.из.лесу.вышел.'),
        ('ИВАН', 'Один день Ивана Денисовича'),
        ('унылая', 'Унылая пора! очей очарованье!')
    )
    return itertools.starmap(Explanation, title_text_pairs)
            by_text[expl.text] = expl
            by_title.setdefault(expl.title, []).append(expl)

# Keys of known explanations already claimed by an exact (key or text) match;
# they are excluded from the fuzzy-match candidates below.
abused = set()

dmp = diff_match_patch()
with FileExplanationStorage(SEL_PATH) as inp:
    with FileExplanationStorage(OUT_PATH) as out:
        # Start from an empty output storage; every input entry is re-added.
        out.clear()

        for sel in inp.entries():
            if sel.key in by_key:
                # The key is already known: keep the entry unchanged.
                abused.add(sel.key)
            elif sel.text in by_text:
                # Same text is known under another key: adopt that key.
                known = by_text[sel.text]
                abused.add(known.key)
                sel.key = known.key
            else:
                # Score every unclaimed explanation with the same title by
                # its edit distance to the selected text.  An unknown title
                # or an exhausted candidate list simply yields no scores
                # instead of a KeyError / ValueError from min().
                scored = [
                    (dmp.diff_levenshtein(dmp.diff_main(e.text, sel.text)), e)
                    for e in by_title.get(sel.title, ())
                    if e.key not in abused
                ]
                # Compare by distance only, so equal distances never fall
                # back to comparing the explanation objects themselves.
                best = min(scored, key=lambda pair: pair[0]) if scored else None
                if best is not None and best[0] <= 12:
                    # fuzzy match: the texts differ by at most 12 edits
                    # NOTE(review): the matched key is not added to `abused`,
                    # so several entries may share it - confirm intended.
                    sel.key = best[1].key
                else:
                    # something completely new
                    sel.key = None
                    sel = calculate_key()(sel)
            out.add_entry(sel)