),
    modifiers.str_replace('||', 'П', target_field='title'),
    modifiers.re_search_ban(r'[^-ЁёА-Яа-я]', target_field='title'),
    modifiers.normalize_title(),
    # Text OCR problems

    # modifiers.translate(
    #     '?~[]{}',
    #     'ё-()()',
    #     '|*o'  # the o is latin
    # ),
    # modifiers.re_replace(r'знай\.', 'знач.'),
    # modifiers.str_replace('3а', 'За'),
    # modifiers.re_replace(r'(?<={alph})\d+(-\d+)?'.format(alph=modifiers.ALPH_RE), ''),
    modifiers.str_replace(r' :', ':'),
    modifiers.re_replace(r'([,:])(?=[^ 0-9])', r'\1 '),

    # Text quality heuristics

    modifiers.re_replace(r'\s+', ' '),

    modifiers.re_replace(r' *[вк]о? *(\d|I)+( *, *(\d|I)+)*( *и *(\d|I)+)? *знач[,.]?', '', re.IGNORECASE),

    modifiers.re_replace(r' *см\. *\S+((, ?| и )\S+)*', ''),

    modifiers.re_replace(r'N((\d+)/)*(\d)+', ''),

    modifiers.re_replace('<=', ''),
    modifiers.re_replace('==', ''),
    modifiers.re_replace(r'\|', ''),
    modifiers.re_replace('Anti', 'противоположность'),
__author__ = 'moskupols'

import os

from hb_res.storage import get_storage, FileExplanationStorage
from preparation import modifiers
from preparation.resources.Resource import gen_resource, applied_modifiers

CUR_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(CUR_DIR, 'Selected.asset')
OUTPUT_PATH = os.path.join(CUR_DIR, 'SelectedAfterMissedModifiers.asset')

missed_modifiers = [
    modifiers.str_replace('p', 'р'),  # replace Latin 'p' with Cyrillic 'р'
    modifiers.re_replace(r'\s+', ' '),
    modifiers.re_replace(r'([,:])(?=[^ ])', r'\1 '),
    modifiers.str_replace(r' :', ':'),
    modifiers.str_replace(r' ,', ','),
]

with FileExplanationStorage(INPUT_PATH) as inp:
    PatchedResource = gen_resource('SelectedResource',
                                   missed_modifiers)(inp.entries)
    with FileExplanationStorage(OUTPUT_PATH) as outp:
        outp.clear()
        for e in applied_modifiers(PatchedResource()):
            outp.add_entry(e)
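
# For illustration (assuming each modifier rewrites the explanation text):
# the pipeline above collapses whitespace, inserts a space after ',' and ':',
# and removes stray spaces before them, so
#     'Пример ,текст :тут   и  ещё'
# becomes
#     'Пример, текст: тут и ещё'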
import itertools

from preparation.resources.Resource import gen_resource
from preparation import modifiers
from hb_res.explanations.Explanation import Explanation


sample_modifiers = (
    modifiers.str_replace('?', 'ё'),
    modifiers.shadow_cognates(length_threshold=3, sep_re='(\\s|!|\\.)+'),
    modifiers.re_replace('\\.', ' '),
    modifiers.normalize_title(),
    modifiers.calculate_key(),
)


@gen_resource('SampleResource', sample_modifiers)
def sample_parser():
    raw_expls = itertools.starmap(Explanation, (
        ('ПОРА', 'Однажды.в.студ?ную.зимнюю.пору.я.из.лесу.вышел.'),
        ('ИВАН', 'Один день Ивана Денисовича'),
        ('унылая', 'Унылая пора! очей очарованье!')
    ))
    return raw_expls
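

# A minimal usage sketch, assuming the decorated sample_parser can be consumed
# the same way as applied_modifiers(PatchedResource()) in the script above,
# and that Explanation exposes .title and .text:
if __name__ == '__main__':
    from preparation.resources.Resource import applied_modifiers

    for expl in applied_modifiers(sample_parser()):
        print(expl.title, '->', expl.text)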
__author__ = 'Алексей'

# noinspection PyProtectedMember
from preparation.resources.synonyms import _raw_data
from preparation import modifiers
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from ._synonyms_quality import choose_best_synonyms

synonyms_mods = [
    modifiers.normalize_title(0.01, True),

    modifiers.re_replace('[^#]+? [^#]+?(#|$)', ''),  # remove multi-word synonyms (containing spaces)
    modifiers.re_fullmatch_ban(''),

    modifiers.delete_cognates(4, '#'),
    modifiers.choose_normal_words_in_explanation('#'),

    choose_best_synonyms(5, '#'),
    modifiers.calculate_prior_frequency_rate('#'),

    modifiers.str_replace('#', ', '),
    modifiers.calculate_key()
]


@gen_resource('SynonymsResource', synonyms_mods)
def read_data():
    explanations = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
__author__ = 'shkiper'

# noinspection PyProtectedMember
from preparation.resources.crosswords import _raw_data
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers

crosswords_mods = [
    modifiers.re_replace('p', 'р'),
    modifiers.strip(target_field='title'),
    modifiers.strip(),
    modifiers.normalize_title(),
    modifiers.re_replace(r'\s+', ' '),
    modifiers.re_replace(r'([,:])(?=[^ ])', r'\1 '),
    modifiers.str_replace(r' :', ':'),
    modifiers.str_replace(r' ,', ','),
    modifiers.shadow_cognates(8, r'\W+', with_pronoun=True),
    modifiers.remove_to_much_gap_percentage(r'\W+', r'\*(\w+)[?]?\*', 0.5),
    modifiers.calculate_key()
]


@gen_resource('CrosswordsResource', crosswords_mods)
def read_data():
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
            tokens = line.split('$')
            word_and_text = tokens[1], tokens[2]
            yield Explanation(*word_and_text)
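
# For illustration, a hypothetical '$'-delimited raw line such as
#     '1$СЛОВО$определение с пропуском *слово*'
# yields Explanation('СЛОВО', 'определение с пропуском *слово*'); only the '$'
# splitting and the field positions are taken from the code above, the rest of
# the raw format is an assumption.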
__author__ = 'shkiper'

# noinspection PyProtectedMember
from preparation.resources.phraseological import _raw_data
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers
import re

phraseological_mods = [
    modifiers.check_contains_valid_parts(2, 0.1, r'\W+'),
    modifiers.shadow_title_with_question(),
    modifiers.normalize_title(),
    modifiers.shadow_cognates(5, r'\W+'),
    modifiers.delete_multiple_gaps(0),
    modifiers.re_replace(' ([,!?])', r'\1'),
    modifiers.strip(),
    modifiers.calculate_key()
]


@gen_resource('PhraseologicalResource', phraseological_mods)
def read_data():
    phrases = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
            phrases.add(line)
    for line in sorted(phrases):
        for word in sorted(set(re.split(r'\W+', line))):
            if len(word) > 0:
                yield Explanation(word, line.strip('\n'))
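
# For illustration: a raw line 'бить баклуши\n' produces Explanation('баклуши',
# 'бить баклуши') and Explanation('бить', 'бить баклуши'), since every distinct
# word of the phrase becomes a title for the whole phrase.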
__author__ = 'ryad0m'

# noinspection PyProtectedMember
from preparation.resources.ngram import _raw_data
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers

ngram_mods = [
    modifiers.ensure_russian_title(),
    modifiers.re_replace('p', 'р'),
    modifiers.check_contains_valid_parts(2, 0.1, r'\W+'),
    modifiers.shadow_cognates(5, r'\W+', with_question=True),
    modifiers.delete_multiple_gaps(0),
    modifiers.calculate_key()
]

@gen_resource('NgramResource', ngram_mods)
def read_data():
    with open(_raw_data, 'r', encoding='utf-8') as source:
        explanations = dict()
        for line in source:
            word, expl, rate = line.strip('\n').split('\t')
            # accumulate numeric rates for repeated (word, expl) pairs
            rate = int(rate)
            if explanations.get((word, expl)) is None:
                explanations[(word, expl)] = rate
            else:
                explanations[(word, expl)] += rate
        for (word, expl), rate in sorted(explanations.items()):
            yield Explanation(word, expl, prior_rating=rate / 400000)
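
# For illustration: two raw lines sharing the same (word, expl) pair with rates
# 250000 and 150000 accumulate to 400000, giving
# prior_rating = 400000 / 400000 = 1.0 (the sample numbers are hypothetical).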

# noinspection PyProtectedMember
from preparation.resources.antonyms import _raw_data  # path assumed by analogy with the sibling resources
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers
import copy


@modifiers.modifier_factory
def add_antonyms_common_text():
    def apply(e: Explanation):
        if e.text.find(',') == -1:
            text = "антоним к слову " + e.text
        else:
            text = "антоним к словам " + e.text
        ret = copy.copy(e)
        ret.text = text
        return ret
    return apply
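
# For illustration: by the time this modifier runs, antonyms_mods below has
# already replaced the '#' separators with ', ', so a text of 'горячий' becomes
# 'антоним к слову горячий', while 'горячий, жаркий' becomes
# 'антоним к словам горячий, жаркий'.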

antonyms_mods = [
    modifiers.normalize_title(0.01, True),

    modifiers.re_replace('[^#]+ [^#]+(#|$)', ''),  # remove multi-word antonyms (containing spaces)
    modifiers.re_fullmatch_ban(''),

    modifiers.delete_cognates(6, '#'),
    modifiers.choose_normal_words_in_explanation('#'),

    modifiers.calculate_prior_frequency_rate('#'),

    modifiers.str_replace('#', ', ', target_field='text'),

    add_antonyms_common_text(),

    modifiers.calculate_key()
]


@gen_resource('AntonymsResource', antonyms_mods)
def read_data():
    explanations = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source: