    ),
    modifiers.str_replace('||', 'П', target_field='title'),
    modifiers.re_search_ban(r'[^-ЁёА-Яа-я]', target_field='title'),
    modifiers.normalize_title(),

    # Text OCR problems
    # modifiers.translate(
    #     '?~[]{}',
    #     'ё-()()',
    #     '|*o'  # the o is latin
    # ),
    # modifiers.re_replace(r'знай\.', 'знач.'),
    # modifiers.str_replace('3а', 'За'),
    # modifiers.re_replace(r'(?<={alph})\d+(-\d+)?'.format(alph=modifiers.ALPH_RE), ''),
    modifiers.str_replace(r' :', ':'),
    modifiers.re_replace(r'([,:])(?=[^ 0-9])', r'\1 '),

    # Text quality heuristics
    modifiers.re_replace(r'\s+', ' '),
    modifiers.re_replace(r' *[вк]о? *(\d|I)+( *, *(\d|I)+)*( *и *(\d|I)+)? *знач[,.]?', '', re.IGNORECASE),
    modifiers.re_replace(r' *см\. *\S+((, ?| и )\S+)*', ''),
    modifiers.re_replace(r'N((\d+)/)*(\d)+', ''),
    modifiers.re_replace('<=', ''),
    modifiers.re_replace('==', ''),
    modifiers.re_replace(r'\|', ''),
    modifiers.re_replace('Anti', 'противоположность'),
# One-off patch script: re-applies modifiers that were missing when Selected.asset
# was generated and writes the patched entries to SelectedAfterMissedModifiers.asset.
__author__ = 'moskupols'

import os

from hb_res.storage import get_storage, FileExplanationStorage
from preparation import modifiers
from preparation.resources.Resource import gen_resource, applied_modifiers

CUR_DIR = os.path.dirname(os.path.abspath(__file__))
INPUT_PATH = os.path.join(CUR_DIR, 'Selected.asset')
OUTPUT_PATH = os.path.join(CUR_DIR, 'SelectedAfterMissedModifiers.asset')

missed_modifiers = [
    modifiers.str_replace('p', 'р'),
    modifiers.re_replace(r'\s+', ' '),
    modifiers.re_replace(r'([,:])(?=[^ ])', r'\1 '),
    modifiers.str_replace(r' :', ':'),
    modifiers.str_replace(r' ,', ','),
]

with FileExplanationStorage(INPUT_PATH) as inp:
    PatchedResource = gen_resource('SelectedResource', missed_modifiers)(inp.entries)

with FileExplanationStorage(OUTPUT_PATH) as outp:
    outp.clear()
    for e in applied_modifiers(PatchedResource()):
        outp.add_entry(e)
import itertools

from preparation.resources.Resource import gen_resource
from preparation import modifiers
from hb_res.explanations.Explanation import Explanation

sample_modifiers = (
    modifiers.str_replace('?', 'ё'),
    modifiers.shadow_cognates(length_threshold=3, sep_re=r'(\s|!|\.)+'),
    modifiers.re_replace(r'\.', ' '),
    modifiers.normalize_title(),
    modifiers.calculate_key(),
)


@gen_resource('SampleResource', sample_modifiers)
def sample_parser():
    raw_expls = itertools.starmap(Explanation, (
        ('ПОРА', 'Однажды.в.студ?ную.зимнюю.пору.я.из.лесу.вышел.'),
        ('ИВАН', 'Один день Ивана Денисовича'),
        ('унылая', 'Унылая пора! очей очарованье!'),
    ))
    return raw_expls
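# Usage sketch for the sample resource above: feed the raw explanations through the
# registered modifiers and print the results, mirroring the pattern of the Selected
# patch script. This is an illustrative snippet, assuming it sits at the bottom of the
# same module (so sample_parser is in scope) and that applied_modifiers yields
# Explanation objects with .title and .text, as the patch script suggests.
from preparation.resources.Resource import applied_modifiers

if __name__ == '__main__':
    for expl in applied_modifiers(sample_parser()):
        print(expl.title, expl.text)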
__author__ = 'Алексей'

# noinspection PyProtectedMember
from preparation.resources.synonyms import _raw_data
from preparation import modifiers
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation

from ._synonyms_quality import choose_best_synonyms

synonyms_mods = [
    modifiers.normalize_title(0.01, True),
    modifiers.re_replace('[^#]+? [^#]+?(#|$)', ''),  # remove multi-word synonyms (containing spaces)
    modifiers.re_fullmatch_ban(''),
    modifiers.delete_cognates(4, '#'),
    modifiers.choose_normal_words_in_explanation('#'),
    choose_best_synonyms(5, '#'),
    modifiers.calculate_prior_frequency_rate('#'),
    modifiers.str_replace('#', ', '),
    modifiers.calculate_key()
]


@gen_resource('SynonymsResource', synonyms_mods)
def read_data():
    explanations = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
__author__ = 'shkiper'

# noinspection PyProtectedMember
from preparation.resources.crosswords import _raw_data
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers

crosswords_mods = [
    modifiers.re_replace('p', 'р'),  # replace latin 'p' with cyrillic 'р'
    modifiers.strip(target_field='title'),
    modifiers.strip(),
    modifiers.normalize_title(),
    modifiers.re_replace(r'\s+', ' '),
    modifiers.re_replace(r'([,:])(?=[^ ])', r'\1 '),
    modifiers.str_replace(r' :', ':'),
    modifiers.str_replace(r' ,', ','),
    modifiers.shadow_cognates(8, r'\W+', with_pronoun=True),
    modifiers.remove_to_much_gap_percentage(r'\W+', r'\*(\w+)[?]?\*', 0.5),
    modifiers.calculate_key()
]


@gen_resource('CrosswordsResource', crosswords_mods)
def read_data():
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
            tokens = line.split('$')
            word_and_text = tokens[1], tokens[2]
            yield Explanation(*word_and_text)
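# Tiny illustration of the '$'-separated line format that read_data above expects:
# the word sits in field 1 and the explanation text in field 2. The sample line is
# invented for demonstration purposes only.
sample_line = "1$ЗАГАДКА$*вопрос* с подвохом$\n"
tokens = sample_line.split('$')
word, text = tokens[1], tokens[2]
assert (word, text) == ("ЗАГАДКА", "*вопрос* с подвохом")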
__author__ = 'shkiper'

# noinspection PyProtectedMember
from preparation.resources.phraseological import _raw_data
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers

import re

phraseological_mods = [
    modifiers.check_contains_valid_parts(2, 0.1, r'\W+'),
    modifiers.shadow_title_with_question(),
    modifiers.normalize_title(),
    modifiers.shadow_cognates(5, r'\W+'),
    modifiers.delete_multiple_gaps(0),
    modifiers.re_replace(' ([,!?])', r'\1'),
    modifiers.strip(),
    modifiers.calculate_key()
]


@gen_resource('PhraseologicalResource', phraseological_mods)
def read_data():
    phrases = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
            phrases.add(line)
    for line in sorted(phrases):
        for word in sorted(set(re.split(r'\W+', line))):
            if len(word) > 0:
                yield Explanation(word, line.strip('\n'))
__author__ = 'ryad0m'

# noinspection PyProtectedMember
from preparation.resources.ngram import _raw_data
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation
from preparation import modifiers

ngram_mods = [
    modifiers.ensure_russian_title(),
    modifiers.re_replace('p', 'р'),
    modifiers.check_contains_valid_parts(2, 0.1, r'\W+'),
    modifiers.shadow_cognates(5, r'\W+', with_question=True),
    modifiers.delete_multiple_gaps(0),
    modifiers.calculate_key()
]


@gen_resource('NgramResource', ngram_mods)
def read_data():
    with open(_raw_data, 'r', encoding='utf-8') as source:
        explanations = dict()
        for line in source:
            word, expl, rate = line.strip('\n').split('\t')
            rate = int(rate)  # keep rates numeric so duplicates are summed, not string-concatenated
            if explanations.get((word, expl)) is None:
                explanations[(word, expl)] = rate
            else:
                explanations[(word, expl)] += rate
    for (word, expl), rate in sorted(explanations.items()):
        yield Explanation(word, expl, prior_rating=rate / 400000)
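# Standalone sketch of the rate-aggregation step in read_data above, written with
# collections.defaultdict; the tab-separated sample lines are invented for illustration.
from collections import defaultdict

sample_lines = [
    "пора\tзимняя пора\t120000",
    "пора\tзимняя пора\t80000",
]

rates = defaultdict(int)
for line in sample_lines:
    word, expl, rate = line.strip('\n').split('\t')
    rates[(word, expl)] += int(rate)  # duplicates of the same (word, explanation) pair are summed

assert rates[("пора", "зимняя пора")] == 200000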
import copy

# noinspection PyProtectedMember
from preparation.resources.antonyms import _raw_data  # assumed import path, by analogy with the other resource modules
from preparation import modifiers
from preparation.resources.Resource import gen_resource
from hb_res.explanations import Explanation


@modifiers.modifier_factory
def add_antonyms_common_text():
    def apply(e: Explanation):
        # prepend a common prefix: singular for one antonym, plural for a comma-separated list
        if e.text.find(',') == -1:
            text = "антоним к слову " + e.text
        else:
            text = "антоним к словам " + e.text
        ret = copy.copy(e)
        ret.text = text
        return ret

    return apply


antonyms_mods = [
    modifiers.normalize_title(0.01, True),
    modifiers.re_replace('[^#]+ [^#]+(#|$)', ''),  # remove multi-word antonyms (containing spaces)
    modifiers.re_fullmatch_ban(''),
    modifiers.delete_cognates(6, '#'),
    modifiers.choose_normal_words_in_explanation('#'),
    modifiers.calculate_prior_frequency_rate('#'),
    modifiers.str_replace('#', ', ', target_field='text'),
    add_antonyms_common_text(),
    modifiers.calculate_key()
]


@gen_resource('AntonymsResource', antonyms_mods)
def read_data():
    explanations = set()
    with open(_raw_data, 'r', encoding='utf-8') as source:
        for line in source:
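# Standalone sketch of the text transformation that str_replace('#', ', ', target_field='text')
# followed by add_antonyms_common_text() aims for; the '#'-separated raw value is an
# invented example.
raw_text = "жара#зной"
text = raw_text.replace('#', ', ')
prefix = "антоним к слову " if ',' not in text else "антоним к словам "
assert prefix + text == "антоним к словам жара, зной"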