import itertools

from preparation.resources.Resource import gen_resource
from preparation import modifiers
from hb_res.explanations.Explanation import Explanation

# Modifiers handed to gen_resource for the sample resource below.
sample_modifiers = (
    modifiers.str_replace('?', 'ё'),
    modifiers.shadow_cognates(length_threshold=3, sep_re='(\\s|!|\\.)+'),
    modifiers.re_replace('\\.', ' '),
    modifiers.normalize_title(),
    modifiers.calculate_key(),
)


@gen_resource('SampleResource', sample_modifiers)
def sample_parser():
    """Return a lazy iterator of Explanation objects for the sample resource.

    Each hard-coded two-string tuple is unpacked positionally into the
    Explanation constructor (presumably title first, then text -- confirm
    against the Explanation signature).
    """
    sample_rows = (
        ('ПОРА', 'Однажды.в.студ?ную.зимнюю.пору.я.из.лесу.вышел.'),
        ('ИВАН', 'Один день Ивана Денисовича'),
        ('унылая', 'Унылая пора! очей очарованье!'),
    )
    # starmap keeps construction lazy: an Explanation is built only when
    # the consumer pulls the next item.
    return itertools.starmap(Explanation, sample_rows)
return ret return apply definitions_mods = [ # there is even a board on trello for almost all of these modifiers: trello.com/b/IEP8jusD modifiers.strip(' -3', target_field='title'), modifiers.translate( '?і3', # the second symbol is not i but \xd1\x96 in utf8 'ЁЁЗ', '",.+:124567890', target_field='title' ), modifiers.str_replace('||', 'П', target_field='title'), modifiers.re_search_ban(r'[^-ЁёА-Яа-я]', target_field='title'), modifiers.normalize_title(), # Text OCR problems # modifiers.translate( # '?~[]{}', # 'ё-()()', # '|*o' # the o is latin # ), # modifiers.re_replace(r'знай\.', 'знач.'), # modifiers.str_replace('3а', 'За'), # modifiers.re_replace(r'(?<={alph})\d+(-\d+)?'.format(alph=modifiers.ALPH_RE), ''), modifiers.str_replace(r' :', ':'), modifiers.re_replace(r'([,:])(?=[^ 0-9])', r'\1 '), # Text quality heuristics
__author__ = 'Алексей' # noinspection PyProtectedMember from preparation.resources.synonyms import _raw_data from preparation import modifiers from preparation.resources.Resource import gen_resource from hb_res.explanations import Explanation from ._synonyms_quality import choose_best_synonyms synonyms_mods = [ modifiers.normalize_title(0.01, True), modifiers.re_replace('[^#]+? [^#]+?(#|$)', ''), # remove multi-word synonyms (containing spaces) modifiers.re_fullmatch_ban(''), modifiers.delete_cognates(4, '#'), modifiers.choose_normal_words_in_explanation('#'), choose_best_synonyms(5, '#'), modifiers.calculate_prior_frequency_rate('#'), modifiers.str_replace('#', ', '), modifiers.calculate_key() ] @gen_resource('SynonymsResource', synonyms_mods) def read_data(): explanations = set() with open(_raw_data, 'r', encoding='utf-8') as source: for line in source: