Ejemplo n.º 1
0
import itertools

from preparation.resources.Resource import gen_resource
from preparation import modifiers
from hb_res.explanations.Explanation import Explanation


# Modifier pipeline passed to ``gen_resource`` for the sample resource below.
# NOTE(review): the modifiers are presumably applied in the listed order --
# confirm against ``gen_resource``.
sample_modifiers = (
    # Presumably turns each '?' placeholder into 'ё'; the sample text contains
    # 'студ?ную' -- verify which field str_replace targets by default.
    modifiers.str_replace('?', 'ё'),
    # The separator regex matches runs of whitespace, '!' and '.'.
    # NOTE(review): exact shadowing semantics live in ``modifiers`` -- confirm.
    modifiers.shadow_cognates(length_threshold=3, sep_re='(\\s|!|\\.)+'),
    # The pattern matches a literal '.', the replacement is a single space.
    modifiers.re_replace('\\.', ' '),
    modifiers.normalize_title(),
    modifiers.calculate_key(),
)


@gen_resource('SampleResource', sample_modifiers)
def sample_parser():
    """Produce the raw explanations for the sample resource.

    Returns:
        A lazy iterator that calls ``Explanation`` once per hard-coded
        two-string tuple, unpacking the tuple as positional arguments.
    """
    # Hard-coded sample rows; each tuple is unpacked into Explanation(...).
    sample_rows = (
        ('ПОРА', 'Однажды.в.студ?ную.зимнюю.пору.я.из.лесу.вышел.'),
        ('ИВАН', 'Один день Ивана Денисовича'),
        ('унылая', 'Унылая пора! очей очарованье!'),
    )
    return itertools.starmap(Explanation, sample_rows)
        return ret
    return apply

definitions_mods = [

    # there is even a board on trello for almost all of these modifiers: trello.com/b/IEP8jusD
    modifiers.strip(' -3', target_field='title'),
    modifiers.translate(
        '?і3',  # the second symbol is not i but \xd1\x96 in utf8
        'ЁЁЗ',
        '",.+:124567890',
        target_field='title'
    ),
    modifiers.str_replace('||', 'П', target_field='title'),
    modifiers.re_search_ban(r'[^-ЁёА-Яа-я]', target_field='title'),
    modifiers.normalize_title(),
    # Text OCR problems

    # modifiers.translate(
    #     '?~[]{}',
    #     'ё-()()',
    #     '|*o'  # the o is latin
    # ),
    # modifiers.re_replace(r'знай\.', 'знач.'),
    # modifiers.str_replace('3а', 'За'),
    # modifiers.re_replace(r'(?<={alph})\d+(-\d+)?'.format(alph=modifiers.ALPH_RE), ''),
    modifiers.str_replace(r' :', ':'),
    modifiers.re_replace(r'([,:])(?=[^ 0-9])', r'\1 '),

    # Text quality heuristics
Ejemplo n.º 3
0
__author__ = 'Алексей'

# noinspection PyProtectedMember
from preparation.resources.synonyms import _raw_data
from preparation import modifiers
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from ._synonyms_quality import choose_best_synonyms

# Modifier pipeline passed to ``gen_resource`` for the synonyms resource.
# '#' is the separator handed to the separator-aware modifiers below; the
# str_replace near the end swaps it for ', '.
# NOTE(review): the modifiers are presumably applied in the listed order --
# confirm against ``gen_resource``.
synonyms_mods = [
    # NOTE(review): meaning of the positional (0.01, True) arguments is not
    # visible here -- check the normalize_title signature.
    modifiers.normalize_title(0.01, True),

    modifiers.re_replace('[^#]+? [^#]+?(#|$)', ''),  # remove multi-word synonyms (containing spaces)
    # NOTE(review): presumably drops entries whose text is empty after the
    # removal above -- confirm in ``modifiers``.
    modifiers.re_fullmatch_ban(''),

    modifiers.delete_cognates(4, '#'),
    modifiers.choose_normal_words_in_explanation('#'),

    # Project-local helper imported from ._synonyms_quality.
    choose_best_synonyms(5, '#'),
    modifiers.calculate_prior_frequency_rate('#'),

    # Replace the '#' separator with ', ' once the separator-aware steps are done.
    modifiers.str_replace('#', ', '),
    modifiers.calculate_key()
]


@gen_resource('SynonymsResource', synonyms_mods)
def read_data():
    explanations = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source: