def test_get_existing(fixer, allforms):
    """Existing form declarations are keyed by (form, pos, formtype, lemma)."""

    # Feminine noun: headword template names the masculine counterpart.
    title = "busera"
    text = """
==Spanish==

===Noun===
{{es-noun|f|m=busero}}

# [[bus driver]]
"""
    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    assert fixer.get_existing_forms(title, parsed) == {
        ('busera', 'n', 'f', 'busero'): None
    }

    # Masculine lemma with f= param; this direction is not currently asserted.
    title = "busero"
    text = """
==Spanish==

===Noun===
{{es-noun|m|f=busera}}

# [[bus driver]]
"""
#    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
#    assert fixer.get_existing_forms(title, parsed) == {('busera', 'n', 'm', 'busera'): None}

    # Past-participle verb form: the matched sense line is returned as the value.
    title = "abacorada"
    text = """
==Spanish==

===Verb===
{{head|es|past participle form|g=f-s}}

# {{es-verb form of|ending=ar|mood=participle|gender=f|number=s|abacorar}}
"""
    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    assert fixer.get_existing_forms(title, parsed) == {
        ('abacorada', 'v', 'pp_fs', 'abacorar'): '# {{es-verb form of|ending=ar|mood=participle|gender=f|number=s|abacorar}}\n'
    }
def test_ababillarse(fixer, allforms):
    """A reflexive gerund compound matches its declared form exactly."""
    title = "ababillándose"
    text = """
==Spanish==

===Verb===
{{head|es|verb form}}

# {{es-compound of|ababill|ar|ababillando|se|mood=gerund}}
"""
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    print(declared_forms)
    assert declared_forms == [('ababillándose', 'v', 'gerund_comb_se', 'ababillarse', [])]

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    existing_forms = fixer.get_existing_forms(title, parsed)

    # Declared and existing agree, so nothing is missing or unexpected.
    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == []
    assert unexpected_forms == set()
def test_errar_verb_multi_forms(fixer, allforms):
    """'yerras' resolves to the single pres_2s form of 'errar'."""
    title = "yerras"
    text = """
==Spanish==

===Verb===
{{head|es|verb form}}

# {{es-verb form of|mood=ind|tense=pres|num=s|pers=2|formal=n|ending=ar|errar}}
"""
#    print(allforms.all_forms["erras"])
#    print(allforms.all_forms["yerras"])
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    assert declared_forms == [('yerras', 'v', 'pres_2s', 'errar', [])]

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    existing_forms = fixer.get_existing_forms(title, parsed)

    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == []
    assert unexpected_forms == set()
def test_reflexive_stripping(fixer, allforms):
    """Declared lemma keeps the reflexive -se, so the non-reflexive page entry
    is reported as unexpected and the reflexive form as missing."""
    title = "aborregas"
    text = """
==Spanish==

===Verb===
{{head|es|verb form}}

# {{es-verb form of|aborregar|ending=-ar|mood=indicative|tense=present|number=s|person=2|formal=n}}
"""
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    assert declared_forms == [('aborregas', 'v', 'pres_2s', 'aborregarse', [])]

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    existing_forms = fixer.get_existing_forms(title, parsed)
    assert existing_forms == {
        ('aborregas', 'v', 'pres_2s', 'aborregar'): '# {{es-verb form of|aborregar|ending=-ar|mood=indicative|tense=present|number=s|person=2|formal=n}}\n'
    }

    # Lemmas differ (aborregarse vs aborregar) so neither side matches the other.
    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == [('aborregas', 'v', 'pres_2s', 'aborregarse', [])]
    assert unexpected_forms == {('aborregas', 'v', 'pres_2s', 'aborregar')}
def test_imp2_se(fixer, allforms):
    """Second-person-plural reflexive imperative matches its declared form."""
    title = "aborrascaos"
    text = """
==Spanish==

===Verb===
{{head|es|verb form}}

# {{es-verb form of|aborrascarse|ending=-ar|mood=imperative|number=p|person=2|formal=n|sense=affirmative|region=Spain}}
"""
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    assert declared_forms == [('aborrascaos', 'v', 'imp_2p', 'aborrascarse', [])]

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    existing_forms = fixer.get_existing_forms(title, parsed)
    assert existing_forms == {
        ('aborrascaos', 'v', 'imp_2p', 'aborrascarse'): '# {{es-verb form of|aborrascarse|ending=-ar|mood=imperative|number=p|person=2|formal=n|sense=affirmative|region=Spain}}\n'
    }

    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == []
    assert unexpected_forms == set()
def test_actriz(fixer, allforms):
    """Irregular feminine noun 'actriz' pairs with lemma 'actor'."""
    title = "actriz"
    text = """
==Spanish==

===Noun===
{{es-noun|f|m=actor}}

# [[actress]]
"""
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    assert declared_forms == [('actriz', 'n', 'f', 'actor', ['m'])]

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    existing_forms = fixer.get_existing_forms(title, parsed)
    assert existing_forms == {('actriz', 'n', 'f', 'actor'): None}

    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == []
    assert unexpected_forms == set()
def test_remove_unexpected_inner_pos(fixer, allforms):
    """An undeclared form POS between two gloss POS sections is removed whole."""

    text = """
==Spanish==

===Adjective===
{{es-adj}}

# gloss

===Noun===
{{es-noun}}

# {{plural of|es|blah}}

===Verb===
{{es-verb}}

# gloss
"""

    result = """
==Spanish==

===Adjective===
{{es-adj}}

# gloss

===Verb===
{{es-verb}}

# gloss
"""
    title = "test"
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    missing_forms, unexpected_forms = fixer.compare_forms(
        declared_forms, fixer.get_existing_forms(title, parsed))
    assert missing_forms == []
    assert unexpected_forms == {('test', 'n', 'pl', 'blah')}

    res = fixer.remove_undeclared_forms(title, text, declared_forms)
    assert res.split("\n") == result.split("\n")
    assert res == result
def check_page(title, page_text, log_function):
    """Scan a Spanish entry for form-of senses that need manual attention.

    Every problem found is reported by calling ``log_function(code, title,
    item, detail)``; the function itself returns nothing.  Pages with no
    ``{{head|es`` template, no Spanish entry, or no form header are skipped.
    """
    log = log_function

    # All forms use the head template,
    # this is a fast way of finding the pages that don't
    if "{{head|es" not in page_text:
        return

    body = get_lean_spanish_entry(page_text)
    if not body:
        return

    if not has_form_header(body):
        return

    wikt = parse_page(body, title, None)
    for item in wikt.ifilter_words(matches=lambda x: is_form(x)):
        subsections = [x.name for x in item._parent.ifilter_sections()]
        if subsections:
            log("has_subsection", title, item, "; ".join(subsections))

        for sense in item.ifilter_wordsenses():
            sense_text = str(sense).strip("\n #:")
            # Strip label templates that are acceptable on form entries.
            # NOTE: raw string — the escaped pipes are regex escapes, not
            # Python string escapes.
            sense_text = re.sub(
                r"({{lb\|es\|uds.}}|{{lb\|es\|obsolete}}|{{lb\|es\|Latin America\|uds.}})",
                "", sense_text)
            sense_text = wiki_to_text(sense_text, title)
            if "\n" in sense_text:
                # Multi-line sense: everything after the first line is extra
                # detail that a plain form entry shouldn't have.
                details = "\n".join(str(sense).splitlines()[1:])
                details = details.strip()
                if details:
                    log("has_sense_details", title, item, details)
                continue

            formtype, lemma, nonform = Sense.parse_form_of(sense_text)
            if not formtype:
                log("has_gloss", title, item, str(sense))
                continue

            # find t= or gloss= params that aren't in {{ux}} templates
            match = re.search(r"{{(?!ux)[^}]*\|(t|gloss)=([^|}]*)", str(sense))
            if match:
                detail = match.group(2)  # currently unused, kept for the TODO below
                log("has_gloss_param", title, item, str(sense))  # TODO: when higlighting, add line and highlight detail
            elif nonform:
                log("has_text_outside_form", title, item, str(sense))
def test_remove_unexpected_first_pos(fixer, allforms):
    """An undeclared form POS at the start of the entry is removed whole."""

    text = """
==Spanish==

===Adjective===
{{head|es|adjective form|g=f-p}}

# {{adj form of|es|test||f|p}}

===Noun===
{{es-noun}}

# gloss
"""

    result = """
==Spanish==

===Noun===
{{es-noun}}

# gloss
"""
    title = "test"
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    missing_forms, unexpected_forms = fixer.compare_forms(
        declared_forms, fixer.get_existing_forms(title, parsed))
    assert missing_forms == []
    assert unexpected_forms == {('test', 'adj', 'fpl', 'test')}

    res = fixer.remove_undeclared_forms(title, text, declared_forms)
    assert res.split("\n") == result.split("\n")
    assert res == result
def test_descomida(fixer, allforms):
    """A form shared by two lemmas: only the undeclared neg_imp variant is missing."""
    title = "descomida"
    text = """
==Spanish==

===Verb===
{{head|es|verb form}}

# {{es-verb form of|mood=subjunctive|tense=present|person=1|number=s|ending=ir|descomedirse}}
# {{es-verb form of|mood=subjunctive|tense=present|formal=y|person=2|number=s|ending=ir|descomedirse}}
# {{es-verb form of|mood=subjunctive|tense=present|person=3|number=s|ending=ir|descomedirse}}
# {{es-verb form of|mood=participle|gender=f|number=s|ending=er|descomer}}
"""
#    print(allforms.all_forms["erras"])
#    print(allforms.all_forms["yerras"])
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    assert declared_forms == [
        ('descomida', 'v', 'neg_imp_2sf', 'descomedirse', []),
        ('descomida', 'v', 'pres_sub_1s', 'descomedirse', []),
        ('descomida', 'v', 'pres_sub_2sf', 'descomedirse', []),
        ('descomida', 'v', 'pres_sub_3s', 'descomedirse', []),
        ('descomida', 'v', 'pp_fs', 'descomer', []),
    ]

    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    existing_forms = fixer.get_existing_forms(title, parsed)

    # Only the negative imperative isn't on the page yet; nothing is spurious.
    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == [('descomida', 'v', 'neg_imp_2sf', 'descomedirse', [])]
    assert unexpected_forms == set()
def test_fix_feminine_plural(fixer, allforms):
    """A deprecated {{feminine plural of}} noun sense is replaced by
    {{noun form of}} while the adjective and verb sections are untouched."""

    text = """
==Spanish==

===Adjective===
{{head|es|adjective form|g=f-p}}

# {{adj form of|es|académico||f|p}}

===Noun===
{{head|es|noun form|g=f-p}}

# {{feminine plural of|es|académico}}

===Verb===
{{es-verb}}

# blah"""

    result = """
==Spanish==

===Adjective===
{{head|es|adjective form|g=f-p}}

# {{adj form of|es|académico||f|p}}

===Noun===
{{head|es|noun form|g=f-p}}

# {{noun form of|es|académica||p}}

===Verb===
{{es-verb}}

# blah"""

    title = "académicas"
    parsed = wtparser.parse_page(text, title=title, parent=None, skip_style_tags=True)
    declared_forms = fixer.get_declared_forms(title, fixer.wordlist, allforms)
    entry = fixer.get_language_entry(title, parsed, "Spanish")
    existing_forms = fixer.get_existing_forms(title, entry)

    assert declared_forms == [('académicas', 'adj', 'fpl', 'académico', ['m']),
                              ('académicas', 'n', 'pl', 'académica', ['f'])]
    assert existing_forms == {
        ('académicas', 'adj', 'fpl', 'académico'): '# {{adj form of|es|académico||f|p}}\n',
        ('académicas', 'n', 'fpl', 'académico'): '# {{feminine plural of|es|académico}}\n',
    }

    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    assert missing_forms == [('académicas', 'n', 'pl', 'académica', ['f'])]
    assert unexpected_forms == {('académicas', 'n', 'fpl', 'académico')}

    # Add the declared noun form, then strip the undeclared one.
    res = fixer.add_missing_forms(title, text, declared_forms)
    res = fixer.remove_undeclared_forms(title, res, declared_forms)
    assert res.split("\n") == result.split("\n")
    assert res == result
def merge_pairs(self, match, title, replacement):
    """
    This will be called twice for each pair
    On the first call, it should add data to the target page
    On the second call, it should remove data from the source page
    """
    # Odd iterations (iter_count % 2) handle the lemma/target page; the
    # following even iteration handles the form/source page using state
    # (_section, _matched, _lemma) stashed on self by the odd pass.
    self.iter_count += 1
    page_text = match.group(0)

    # When the odd pass decided nothing can be merged, it sets skip_next so
    # the paired even pass leaves the source page untouched.
    if self.skip_next:
        self.skip_next = False
        print("skipping")
        return page_text
    self.skip_next = False

    if self.iter_count % 2:
        self._lemma = title
        lemma = title
        form = self.lemma2form[lemma]
        # Fetch the form page's live text to find what should move here.
        wiki_page = pywikibot.Page(self.site, form)
        src_text = wiki_page.text

        # Collect problems reported by check_page via a closure logger.
        fixes = []
        def log(*args):
            print(args)
            fixes.append(args)
        check_page(title, src_text, log)

        if not fixes:
            print("no fixes found")
            self.skip_next = True
            return page_text

        # Only safe to merge when exactly one problem was found.
        if len(fixes) != 1:
            print("too many changes, can't merge", fixes)
            self.skip_next = True
            return page_text

        error, page, item, line = fixes[0]
        if error != "has_sense_details":
            print("error is not has_sense_details", error)
            self.skip_next = True
            return page_text

        # Remember which POS section and which line the even pass must remove.
        self._section = item._parent._name
        self._pos = self.section_to_pos[self._section]
        if not self._pos:
            print("can't find pos", self._section)
            self.skip_next = True
            return page_text

        self._matched = line

        body = get_lean_spanish_entry(page_text)
        if not body:
            print(page_text)
            raise ValueError("no spanish found")

        wikt = parse_page(body, title, None)
        if not wikt:
            print("no page data", title)
            print(page_text)
            return page_text

        # The target must have exactly one word in the matching POS section.
        items = wikt.filter_words(
            matches=lambda x: x._parent._name == self._section)
        print("checking", page)
        if not items:
            print("no matches", self._section)
            self.skip_next = True
            return page_text
        if len(items) > 1:
            print("too many matches", self._section)
            self.skip_next = True
            return page_text

        orig_entry = str(wikt)
        wt_section = items[0]
        # Append the moved line, ensuring it sits on its own line.
        if not str(wt_section).endswith("\n"):
            wt_section.add_text("\n" + line + "\n")
        else:
            wt_section.add_text(line + "\n")

        new_page_text = page_text.replace(orig_entry, str(wikt))
        if new_page_text == page_text:
            print("no changes")
            self.skip_next = True
            return page_text

        replacement._edit_summary = \
            f"Spanish: {self._section}: moved form data from {form} (manually assisted)"
        return new_page_text

    else:
        # Even pass: strip the line that was copied to the lemma page.
        replacement._edit_summary = f"Spanish: {self._section}: moved form data to lemma {self._lemma} (manually assisted)"
        return page_text.replace(self._matched + "\n", "").replace(self._matched, "")
def replace_nym_section_with_tag(self, language_text, nym_title, title=None):
    """Merge an L3+ nym section (e.g. Synonyms) into sense-level tags.

    Returns the rewritten language entry as a string, or None when the
    entry has no matching nym section.  Unresolvable nymsenses are left in
    place and flagged via flag_problem for manual review.
    """
    # Make sure there's a nym section at L3 or deeper
    header_level = 3
    header_tag = "=" * header_level
    if f"{header_tag}{nym_title}{header_tag}" not in language_text:
        return

    wikt = wtparser.parse_page(language_text, title=title, parent=self,
                               skip_style_tags=True)

    self._stats[nym_title] = self._stats.get(nym_title, 0) + 1

    # Don't use ifilter because it gets confused when we remove the nym section
    for nym in wikt.filter_nyms(matches=lambda x: x.name == nym_title):
        unhandled_problems = False

        search_pos = self.get_search_pos(nym, wikt)
        if not any(sense for p in search_pos for sense in p.ifilter_wordsenses()):
            nym.flag_problem("pos_has_no_defs", nym_title)
            continue

        for nymsense in nym.filter_nymsenses():
            # Matching cascade: exact sense match, then fuzzy, then matching
            # the nymsense gloss against the wordsense (exact, then fuzzy).
            matches = self.get_matching_senses(nymsense, search_pos, fuzzy=False)

            # Try fuzzy match if we don't have an exact match
            if len(matches) != 1:
                alt_matches = self.get_matching_senses(nymsense, search_pos, fuzzy=True)
                if len(alt_matches) == 1:
                    nymsense.flag_problem("nymsense_fuzzy_match", nym_title)
                    matches = alt_matches

            # Still no exact match
            # Try matching the nymsense to the wordsense
            if len(matches) != 1:
                alt_matches = self.get_matching_senses_by_nymsense(
                    nymsense, search_pos, fuzzy=False)
                if len(alt_matches) == 1:
                    nymsense.flag_problem(
                        "nymsense_gloss_matches_wordsense", nym_title)
                    matches = alt_matches

                # There were no matches, try again with fuzzy matching
                elif not alt_matches:
                    alt_matches = self.get_matching_senses_by_nymsense(
                        nymsense, search_pos, fuzzy=True)
                    if len(alt_matches) == 1:
                        nymsense.flag_problem(
                            "nymsense_gloss_matches_wordsense_fuzzy", nym_title)
                        matches = alt_matches

            # no_merge records whether the original lookup was ambiguous; it
            # is passed to add_nymsense_to_def so imperfect matches are kept
            # separate from existing nym data.
            no_merge = len(matches) != 1
            if len(matches) == 1:
                if nymsense.sense != "":
                    nymsense.flag_problem("automatch_sense", matches, nymsense.sense)
            elif not matches:
                nymsense.flag_problem("nymsense_matches_no_defs", nymsense.sense)
                # Default to matching the first def
                matches = [ next(sense for p in search_pos for sense in p.ifilter_wordsenses()) ]
            else:
                nymsense.flag_problem("nymsense_matches_multiple_defs")

            # If this isn't a perfect match, don't merge it into existing nymsense
            # This makes it easy to manually review and move to the correct location
            match = matches[0]
            if self.can_handle(nym.local_problems) \
                    and self.can_handle(match.problems) \
                    and self.can_handle(nymsense.problems) \
                    and self.add_nymsense_to_def(nymsense, match, no_merge=no_merge):
                nymsense._parent.remove_child(nymsense)
            else:
                unhandled_problems = True

        # IF the nym has subsections, move them to the nym's parent object
        # NOTE(review): flag_problem's return value gates this branch —
        # presumably it returns truthy when the problem is fatal; confirm.
        if any(nym.ifilter_sections(recursive=False)):
            if not self.flag_problem("autofix_nymsection_has_subsections"):
                if not unhandled_problems and self.can_handle(nym.local_problems):
                    nym.raise_subsections()
                    nym._parent.remove_child(nym)
        elif not unhandled_problems and self.can_handle(nym.local_problems):
            nym._parent.remove_child(nym)

    # if str(wikt) == language_text:
    #     self.flag_problem("no_change")

    return str(wikt)