def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "German", pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  if "=Etymology 1=" in secbody:
    notes = []
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    for k in xrange(2, len(etym_sections), 2):
      retval = process_text_in_section(index, pagetitle, etym_sections[k])
      if retval:
        newsectext, newnotes = retval
        etym_sections[k] = newsectext
        notes.extend(newnotes)
    secbody = "".join(etym_sections)
    sections[j] = secbody + sectail
    return "".join(sections), notes
  else:
    retval = process_text_in_section(index, pagetitle, secbody)
    if retval:
      secbody, notes = retval
      sections[j] = secbody + sectail
      return "".join(sections), notes

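# A minimal standalone sketch (not part of the scripts here) of why the loops
# over re.split() results step by 2 starting at index 2: splitting on a
# capturing group yields [text-before, header, body, header, body, ...], so
# headers sit at odd indices and their section bodies at the following even
# indices.
import re

demo = "intro\n===Etymology 1===\nbody 1\n===Etymology 2===\nbody 2\n"
parts = re.split("(^===Etymology [0-9]+===\n)", demo, 0, re.M)
assert parts[0] == "intro\n"               # text before the first header
assert parts[1] == "===Etymology 1===\n"   # header at odd index
assert parts[2] == "body 1\n"              # body at the following even index
for k in xrange(2, len(parts), 2):         # visit each etymology body
  parts[k] = parts[k].upper()              # stand-in for real per-section work
assert "".join(parts).count("BODY") == 2
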
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  retval = blib.find_modifiable_lang_section(
    text, None if args.partial_page else args.langname, pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  for k in xrange(1, len(subsections), 2):
    if re.search("==Anagrams==", subsections[k]):
      if k + 2 < len(subsections):
        subsections = (subsections[0:k] + subsections[k + 2:len(subsections)]
          + subsections[k:k + 2])
        notes.append("put Anagrams last in %s section" % args.langname)
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes

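# A standalone sketch of the list surgery above: the header/body pair at
# indices [k, k+2) is moved to the end of the alternating split list.
parts = ["", "==Anagrams==\n", "* {{anagrams|x}}\n\n", "==Noun==\n", "body\n"]
k = 1
parts = parts[0:k] + parts[k + 2:len(parts)] + parts[k:k + 2]
assert parts == ["", "==Noun==\n", "body\n", "==Anagrams==\n", "* {{anagrams|x}}\n\n"]
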
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  pagemsg("Processing")
  text = unicode(page.text)
  retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Hungarian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval
  if "==Alternative forms==" in secbody:
    pagemsg("WARNING: Skipping page with 'Alternative forms' section")
    return
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in ["compound", "affix", "af"] and getparam(t, "1") == "hu" and not getparam(t, "pos"):
      t.add("pos", "noun")
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("add pos=noun to {{%s|hu}}" % tn)
  sections[j] = unicode(parsed) + sectail
  text = "".join(sections)
  return text, notes

def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = blib.find_modifiable_lang_section(text, "Chinese", pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  m = re.search(r"\A(.*?)(\n*)\Z", secbody, re.S)
  secbody, secbody_finalnl = m.groups()
  secbody += "\n\n"
  notes = []
  new_secbody = secbody
  new_secbody = re.sub(r"^\* http://www\.trade\.gov\.bt/administration/mktbriefs/10\.pdf\n",
    "", new_secbody, 0, re.M)
  new_secbody = re.sub(r"^\* http://www\.koreantk\.com/en/m_sta/med_stat_search\.jsp\?searchGbn=statis\n",
    "", new_secbody, 0, re.M)
  new_secbody = re.sub(r"^\* http://www1\.dict\.li/?\n", "", new_secbody, 0, re.M)
  new_secbody = re.sub(r"^\* http://www1\.dict\.li/ and ", "* ", new_secbody, 0, re.M)
  if new_secbody != secbody:
    notes.append("remove bad Chinese links (see [[Wiktionary:Grease pit/2019/September#Requesting bot help]])")
    secbody = new_secbody
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  subsections_to_delete = []
  for k in xrange(1, len(subsections), 2):
    if (subsections[k] in ["===References===\n", "====References====\n"]
        and not subsections[k + 1].strip()):
      subsections_to_delete.append(k)
  if subsections_to_delete:
    for k in reversed(subsections_to_delete):
      del subsections[k:k + 2]
    notes.append("remove empty References section")
  secbody = "".join(subsections)
  sections[j] = secbody.rstrip("\n") + secbody_finalnl + sectail
  return "".join(sections), notes

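# A small sketch of the trailing-newline bookkeeping above: the regex captures
# the body and its final run of newlines separately, so the body can be edited
# freely and the exact original amount of trailing whitespace reattached.
import re

m = re.search(r"\A(.*?)(\n*)\Z", "some section text\n\n\n", re.S)
body, finalnl = m.groups()
assert body == "some section text" and finalnl == "\n\n\n"
rebuilt = body.rstrip("\n") + finalnl  # edits in between leave finalnl intact
assert rebuilt == "some section text\n\n\n"
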
def process_page_for_modification(index, pagetitle, text, new_pronuns):
  if pagetitle not in new_pronuns:
    return
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  retval = blib.find_modifiable_lang_section(text, "Old English", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Old English section")
    return
  sections, j, secbody, sectail, has_non_lang = retval
  heads = None
  if "Etymology 1" in secbody:
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    for k in xrange(2, len(etym_sections), 2):
      parsed = blib.parse_text(etym_sections[k])
      secheads = []
      for t in parsed.filter_templates():
        this_heads = get_head_param(t, pagetitle)
        if this_heads:
          this_heads = [blib.remove_links(x) for x in this_heads]
          for head in this_heads:
            if head not in secheads:
              secheads.append(head)
      if heads is None:
        heads = secheads
      elif set(heads) != set(secheads):
        pagemsg("Saw head(s) %s in one etym section and %s in another, splitting pronuns per etym section"
          % (",".join(heads), ",".join(secheads)))
        for k in xrange(2, len(etym_sections), 2):
          etym_sections[k] = process_section_for_modification(
            index, pagetitle, etym_sections[k], 4, new_pronuns[pagetitle])
        sections[j] = "".join(etym_sections) + sectail
        return "".join(sections), "add pronunciation(s) to Old English lemma(s)"
    pagemsg("All etym sections have same head(s) %s, creating a single pronun section" % ",".join(heads))
  secbody = process_section_for_modification(index, pagetitle, secbody, 3, new_pronuns[pagetitle])
  sections[j] = secbody + sectail
  return "".join(sections), "add pronunciation(s) to Old English lemma(s)"

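# A small sketch (with invented heads) of the decision rule above: per-section
# head lists are compared as sets, so ordering differences alone don't force a
# per-etymology split of the pronunciation section.
heads_sec1 = ["hund", "hynd"]
heads_sec2 = ["hynd", "hund"]
assert set(heads_sec1) == set(heads_sec2)  # one shared Pronunciation section
heads_sec3 = ["hund"]
assert set(heads_sec1) != set(heads_sec3)  # pronunciations split per section
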
def process_page(page, index, parsed):
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  origtext = text
  retval = blib.find_modifiable_lang_section(text, "Japanese", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  notes = []
  newsecbody = re.sub("^====Compounds====$", "====Derived terms====", secbody, 0, re.M)
  if newsecbody != secbody:
    notes.append("Compounds -> Derived terms in Japanese section (see [[Wiktionary:Grease pit/2019/September#Requesting bot help]])")
    secbody = newsecbody
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  for k in xrange(1, len(subsections), 2):
    if subsections[k] == "====Derived terms====\n":
      endk = k + 2
      while endk < len(subsections) and re.search("^====(Synonyms|Antonyms)====\n$", subsections[endk]):
        endk += 2
      if endk > k + 2:
        subsections = (subsections[0:k] + subsections[k + 2:endk]
          + subsections[k:k + 2] + subsections[endk:])
        notes.append("reorder Derived terms after Synonyms/Antonyms")
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes

def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Hungarian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  for k in xrange(1, len(subsections), 2):
    if ("===Verb===" in subsections[k] and "{{head|hu|verb form" in subsections[k + 1]
        and "{{participle of|hu|" in subsections[k + 1]):
      if args.split_participle:
        newsubsec = re.sub(r"^(#.*\{\{participle of\|hu\|.*)\n(#.*\{\{inflection of\|hu\|.*)\n\n",
          r"\2\n\1\n\n", subsections[k + 1], 0, re.M)
        if newsubsec != subsections[k + 1]:
          notes.append("reorder {{inflection of|hu|...}} before {{participle of|hu|...}}")
          subsections[k + 1] = newsubsec
        elif re.search(r"\{\{participle of\|hu\|.*\{\{inflection of\|hu\|", subsections[k + 1], re.S):
          pagemsg("WARNING: Saw {{participle of|hu|...}} before {{inflection of|hu|...}} with likely usage examples")
          continue
      if args.split_participle and "{{inflection of|hu|" in subsections[k + 1]:
        subsections[k + 1] = re.sub(r"^(#.*\{\{participle of\|hu\|)",
          r"\n===Participle===\n{{head|hu|participle}}\n\n\1", subsections[k + 1], 0, re.M)
        notes.append("split Hungarian verb form from participle")
      else:
        subsections[k] = subsections[k].replace("===Verb===", "===Participle===")
        subsections[k + 1] = re.sub(r"\{\{head\|hu\|verb form", "{{head|hu|participle", subsections[k + 1])
        notes.append("Hungarian verb form -> participle in section with {{participle of}}")
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes

def process_page(page, index, parsed):
  notes = []
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(page.text)
  retval = blib.find_modifiable_lang_section(text, "Belarusian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Belarusian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval
  if "Etymology 1" in secbody:
    etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M)
    for k in xrange(2, len(etym_sections), 2):
      etym_sections[k], this_notes = process_section(index, pagetitle, etym_sections[k])
      notes.extend(this_notes)
    secbody = "".join(etym_sections)
  else:
    secbody, this_notes = process_section(index, pagetitle, secbody)
    notes.extend(this_notes)
  sections[j] = secbody + sectail
  if notes:
    sections[j] = re.sub(r"\{\{cln\|be\|(in)?transitive verbs\}\}\n?", "", sections[j])
  return "".join(sections), notes

def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  retval = blib.find_modifiable_lang_section(
    text, None if args.partial_page else "English", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  for k in xrange(1, len(subsections) - 2, 2):
    if (re.search(r"==%s==" % headers_to_swap_regex, subsections[k])
        and re.search("==Translations==", subsections[k + 2])):
      notes.append("swap %s and %s sections" % (subsections[k].strip(), subsections[k + 2].strip()))
      temp = subsections[k]
      subsections[k] = subsections[k + 2]
      subsections[k + 2] = temp
      temp = subsections[k + 1]
      subsections[k + 1] = subsections[k + 3]
      subsections[k + 3] = temp
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  text = "".join(sections)
  return text, notes

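# A sketch of the pairwise swap above on a toy alternating list ("Foo" is a
# stand-in for whatever headers_to_swap_regex matches): the headers at k and
# k+2 trade places, as do the bodies at k+1 and k+3, exchanging two whole
# adjacent subsections.
parts = ["", "===Foo===\n", "foo body\n", "===Translations===\n", "trans body\n"]
k = 1
temp = parts[k]
parts[k] = parts[k + 2]
parts[k + 2] = temp
temp = parts[k + 1]
parts[k + 1] = parts[k + 3]
parts[k + 3] = temp
assert parts == ["", "===Translations===\n", "trans body\n", "===Foo===\n", "foo body\n"]
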
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  retval = blib.find_modifiable_lang_section(
    text, None if args.partial_page else "Italian", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  need_ref_section = False
  for k in xrange(2, len(subsections), 2):
    if "==Pronunciation==" in subsections[k - 1]:
      parsed = blib.parse_text(subsections[k])
      all_pronun_templates = []
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-pr" or tn == "IPA" and getparam(t, "1") == "it":
          all_pronun_templates.append(t)
      saw_it_pr = False
      pronun_based_respellings = []
      for t in parsed.filter_templates():
        origt = unicode(t)
        def tmsg(txt):
          other_templates = []
          for t in all_pronun_templates:
            thist = unicode(t)
            if thist != origt:
              other_templates.append(thist)
          pagemsg("%s: %s%s" % (txt, origt,
            ", other templates %s" % ", ".join(other_templates) if len(other_templates) > 0 else ""))
        tn = tname(t)
        if tn == "it-pr":
          saw_it_pr = True
          respellings = blib.fetch_param_chain(t, "1")  # FIXME, need to split on comma
          pronun_based_respellings.extend(respellings)
          break
        if tn == "IPA" and getparam(t, "1") == "it":
          saw_it_pr = True
          pronuns = blib.fetch_param_chain(t, "2")
          this_phonemic_pronun = None
          this_phonemic_respelling = None
          this_phonetic_pronun = None
          this_phonetic_respelling = None
          respellings = []
          all_warnings = []
          hack_respelling_warnings = []
          main_warnings = []
          unable = [False]
          for pronun in pronuns:
            respelling = ipa_to_respelling(pronun)
            respelling, this_hack_respelling_warnings = hack_respelling(pagetitle, respelling)
            hack_respelling_warnings.extend(this_hack_respelling_warnings)
            def set_unable(msg):
              main_warnings.append(msg)
              unable[0] = True
            tmsg("For pronun %s, generated respelling %s" % (pronun, respelling))
            respelling_words = respelling.split(" ")
            for rw in respelling_words:
              if rw.endswith("-"):  # prefix
                continue
              # pretend vowels with secondary or no stress are 'e'
              hacked_rw = re.sub(u".[\u0323\u0331]", "e", rw)
              if (not re.search(u"[àèéìòóùÀÈÉÌÒÓÙ]", hacked_rw)
                  and len(re.sub("[^aeiouAEIOU]", "", hacked_rw)) > 1):
                set_unable("WARNING: For respelling %s for pronun %s, word %s is missing stress"
                  % (respelling, pronun, rw))
            if not re.search(u"^[a-zA-ZàèéìòóùÀÈÉÌÒÓÙ. ʒʃ\[\]-]+$", respelling):
              set_unable("WARNING: Strange char in respelling %s for pronun %s" % (respelling, pronun))
            else:
              putative_pagetitle = re.sub(u"([àèéìòóùÀÈÉÌÒÓÙ])([^ ])",
                lambda m: vowel_respelling_to_spelling[m.group(1)] + m.group(2), respelling)
              pagetitle_words = pagetitle.split(" ")
              putative_pagetitle_words = putative_pagetitle.split(" ")
              if len(pagetitle_words) != len(putative_pagetitle_words):
                set_unable("WARNING: Page title has %s words but putative page title %s has %s words"
                  % (len(pagetitle_words), putative_pagetitle, len(putative_pagetitle_words)))
              else:
                hacked_putative_pagetitle_words = []
                for ptw, puptw in zip(pagetitle_words, putative_pagetitle_words):
                  split_ptw = re.split("([Zz]+)", ptw)
                  split_puptw = re.split("([Tt]?[Tt]s|[Dd]?[Dd]z)", puptw)
                  if len(split_ptw) != len(split_puptw):
                    set_unable("WARNING: Different # of z's in pagetitle word %s vs. (t)ts/(d)dz's in putative pagetitle word %s"
                      % (ptw, puptw))
                    hacked_putative_pagetitle_words.append(puptw)
                  else:
                    parts = []
                    for i in xrange(len(split_puptw)):
                      if i % 2 == 0:
                        parts.append(split_puptw[i])
                      else:
                        parts.append(split_ptw[i])
                    hacked_putative_pagetitle_words.append("".join(parts))
                putative_pagetitle = " ".join(hacked_putative_pagetitle_words)
                if putative_pagetitle != pagetitle:
                  # If respelling already seen, we already warned about it.
                  if respelling in respellings:
                    assert unable[0]
                  else:
                    set_unable("WARNING: Respelling %s doesn't match page title (putative page title %s, pronun %s)"
                      % (respelling, putative_pagetitle, pronun))
            def append_respelling(respelling):
              if respelling not in respellings:
                respellings.append(respelling)
            def append_warnings(warning):
              if warning:
                all_warnings.append(warning)
              for warning in hack_respelling_warnings:
                all_warnings.append(warning)
              del hack_respelling_warnings[:]
              for warning in main_warnings:
                all_warnings.append(warning)
              del main_warnings[:]
            append_respelling(respelling)
            if pronun.startswith("/"):
              if this_phonemic_pronun is not None:
                append_warnings("WARNING: Saw two phonemic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonetic pronun"
                  % (this_phonemic_pronun, this_phonemic_respelling, pronun, respelling))
              this_phonemic_pronun = pronun
              this_phonemic_respelling = respelling
              this_phonetic_pronun = None
              this_phonetic_respelling = None
            elif pronun.startswith("["):
              if this_phonemic_pronun is None:
                if this_phonetic_pronun is not None:
                  unable[0] = True
                  append_warnings("WARNING: Saw two phonetic pronuns %s (respelling %s) and %s (respelling %s) without intervening phonemic pronun"
                    % (this_phonetic_pronun, this_phonetic_respelling, pronun, respelling))
                else:
                  append_warnings("WARNING: Saw phonetic pronun %s (respelling %s) without preceding phonemic pronun"
                    % (pronun, respelling))
                this_phonetic_pronun = pronun
                this_phonetic_respelling = respelling
              elif this_phonemic_respelling != respelling:
                unable[0] = True
                append_warnings("WARNING: Phonemic respelling %s (pronun %s) differs from phonetic respelling %s (pronun %s)"
                  % (this_phonemic_respelling, this_phonemic_pronun, respelling, pronun))
              else:
                if unable[0] and len(main_warnings) > 0:
                  # `unable` could be set from a previous pronunciation but no main warnings this time around
                  # because the previously generated warnings have already been appended to all_warnings.
                  mesg = main_warnings[0]
                  del main_warnings[0]
                  append_warnings(mesg)
                else:
                  append_warnings(None)
                this_phonemic_pronun = None
                this_phonemic_respelling = None
            else:
              unable[0] = True
              append_warnings("WARNING: Pronun %s (respelling %s) not marked as phonemic or phonetic"
                % (pronun, respelling))
          if this_phonemic_pronun is not None:
            append_warnings("WARNING: Saw phonemic pronun %s (respelling %s) without corresponding phonetic pronun"
              % (this_phonemic_pronun, this_phonemic_respelling))
          if not unable[0]:
            for param in t.params:
              pn = pname(param)
              if not re.search("^[0-9]+$", pn) and pn != "nocount":
                unable[0] = True
                append_warnings("WARNING: Saw unrecognized param %s=%s" % (pn, unicode(param.value)))
          manual_assist = ""
          if unable[0]:
            if pagetitle in ipa_directives:
              respellings = ipa_directives[pagetitle]
              unable[0] = False
              manual_assist = " (manually assisted)"
              tmsg("%sUsing manually-specified IPA-based respelling%s %s; original warnings follow: %s"
                % ("[MULTIPLE PRONUN TEMPLATES] " if len(all_pronun_templates) > 1 else "",
                   "s" if len(respellings) > 1 else "",
                   ",".join(respellings), " ||| ".join(all_warnings)))
            else:
              tmsg("%s<respelling> %s <end> %s"
                % ("[MULTIPLE PRONUN TEMPLATES] " if len(all_pronun_templates) > 1 else "",
                   " ".join(respellings), " ||| ".join(all_warnings)))
          if not unable[0]:
            del t.params[:]
            nextparam = 0
            for param in respellings:
              if "=" in param:
                paramname, paramval = param.split("=", 1)
              else:
                nextparam += 1
                paramname = str(nextparam)
                paramval = param
              if re.search("^n[0-9]*$", paramname):
                need_ref_section = True
              t.add(paramname, paramval)
            blib.set_template_name(t, "it-pr")
            notes.append("replace raw {{IPA|it}} with {{it-pr|%s}}%s"
              % ("|".join(respellings), manual_assist))
            pronun_based_respellings.extend(respellings)
            if unicode(t) != origt:
              pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      subsections[k] = unicode(parsed)
      rhymes_template = None
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["rhyme", "rhymes"] and getparam(t, "1") == "it":
          if rhymes_template:
            pagemsg("WARNING: Saw two {{rhymes|it}} templates: %s and %s"
              % (unicode(rhymes_template), unicode(t)))
          rhymes_template = t
      if rhymes_template:
        rhyme_based_respellings = []
        all_warnings = []
        def append_respelling(respelling):
          if respelling not in rhyme_based_respellings:
            rhyme_based_respellings.append(respelling)
        def append_warnings(warning):
          all_warnings.append(warning)
        rhymes = blib.fetch_param_chain(rhymes_template, "2")
        unable = False
        for rhy in rhymes:
          spellings = rhyme_to_spelling(rhy)
          matched = False
          bad_rhyme_msgs = []
          for ending, ending_respelling in spellings:
            if pagetitle.endswith(ending):
              prevpart = pagetitle[:-len(ending)]
              respelling = prevpart + ending_respelling
              saw_oso_ese = False
              if ending_respelling == u"óso":
                saw_oso_ese = True
                append_respelling(respelling)
                append_respelling("#" + prevpart + u"ó[s]o")
              elif ending_respelling == u"ése":
                saw_oso_ese = True
                append_respelling(respelling)
                append_respelling("#" + prevpart + u"é[s]e")
              else:
                if respelling.endswith(u"zióne"):
                  new_respelling = re.sub(u"zióne$", u"tsióne", respelling)
                  pagemsg("Replaced respelling '%s' with '%s'" % (respelling, new_respelling))
                  respelling = new_respelling
                  prevpart = respelling[:-len(ending)] + ending_respelling
                append_respelling(respelling)
              if (re.search(u"[aeiouàèéìòóù]s([aeiouàèéìòóù]|$)", prevpart.lower())
                  or not saw_oso_ese and re.search(u"[aeiouàèéìòóù][sz][aeiouàèéìòóù]", ending_respelling.lower())):
                append_warnings("WARNING: Unable to add pronunciation due to /s/ or /z/ between vowels: %s" % rhy)
                unable = True
                break
              if "z" in prevpart:
                append_warnings("WARNING: Unable to add pronunciation due to z in part before rhyme: %s" % rhy)
                unable = True
                break
              hacked_prevpart = re.sub("([gq])u", r"\1w", prevpart)
              hacked_prevpart = hacked_prevpart.replace("gli", "gl")
              hacked_prevpart = re.sub("([cg])i", r"\1", hacked_prevpart)
              if re.search("[^aeiou][iu]([aeiou]|$)", hacked_prevpart.lower()):
                append_warnings("WARNING: Unable to add pronunciation due to hiatus in part before rhyme %s" % rhy)
                unable = True
                break
              if re.search(u"[aeiouàèéìòóù]i([^aeiouàèéìòóù]|$)", respelling.lower()):
                append_warnings("WARNING: Unable to add pronunciation due to falling diphthong in -i: %s" % rhy)
                unable = True
                break
              matched = True
              break
            else:
              bad_rhyme_msgs.append("WARNING: Unable to match rhyme %s, spelling %s, respelling %s"
                % (rhy, ending, ending_respelling))
          if not matched and not unable and bad_rhyme_msgs:
            for bad_rhyme_msg in bad_rhyme_msgs:
              pagemsg(bad_rhyme_msg)
        if rhyme_based_respellings:
          if not saw_it_pr:
            manual_assist = ""
            if pagetitle in rhyme_directives:
              rhyme_based_respellings = rhyme_directives[pagetitle]
              manual_assist = " (manually assisted)"
              pagemsg("Using manually-specified rhyme-based respelling%s %s; original warnings follow: %s: %s"
                % ("s" if len(rhyme_based_respellings) > 1 else "",
                   ",".join(rhyme_based_respellings),
                   " ||| ".join(all_warnings), unicode(rhymes_template)))
              subsections[k] = "* {{it-pr|%s}}\n" % ",".join(rhyme_based_respellings) + subsections[k]
              notes.append("add Italian rhyme-based respelling%s %s%s"
                % ("s" if len(rhyme_based_respellings) > 1 else "",
                   ",".join(rhyme_based_respellings), manual_assist))
            else:
              different_headers = []
              for pos in ["Noun", "Verb", "Adjective", "Adverb", "Participle"]:
                if "==%s==" % pos in secbody:
                  different_headers.append(pos)
              if len(different_headers) > 1:
                all_warnings[0:0] = ["WARNING: Multiple headers %s seen" % ",".join(different_headers)]
              if "Etymology 1" in secbody:
                all_warnings[0:0] = ["WARNING: Multiple etymologies seen"]
              pagemsg("<respelling> all: %s <end>%s: <from> %s <to> %s <end>"
                % (" ".join(rhyme_based_respellings),
                   " " + " ||| ".join(all_warnings) if all_warnings else "",
                   unicode(rhymes_template), unicode(rhymes_template)))
          else:
            for respelling in rhyme_based_respellings:
              if (not re.search("^qual[0-9]*=", respelling) and pronun_based_respellings
                  and respelling not in pronun_based_respellings):
                pagemsg("WARNING: Rhyme-based respelling%s %s doesn't match it-pr respelling(s) %s%s"
                  % (" (with problems)" if len(all_warnings) > 0 else "",
                     respelling, ",".join(pronun_based_respellings),
                     ": %s" % " ||| ".join(all_warnings) if len(all_warnings) > 0 else ""))
  if need_ref_section:
    for k in xrange(len(subsections) - 1, 2, -2):
      if re.search(r"^===\s*References\s*===$", subsections[k - 1].strip()):
        if not re.search(r"<references\s*/?\s*>", subsections[k]):
          subsections[k] = subsections[k].rstrip("\n") + "\n<references />\n\n"
          notes.append("add <references /> to existing ===References=== section for pronunciation refs")
        break
    else:  # no break
      for k in xrange(len(subsections) - 1, 2, -2):
        if not re.search(r"==\s*(Anagrams|Further reading)\s*==", subsections[k - 1]):
          subsections[k + 1:k + 1] = ["===References===\n", "<references />\n\n"]
          notes.append("add new ===References=== section for pronunciation refs")
          break
      else:  # no break
        pagemsg("WARNING: Something wrong, couldn't find location to insert ===References=== section")
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes

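# A minimal sketch (invented data, not the real rhyme_to_spelling() table) of
# how a rhyme-based respelling is assembled above: the page title's ending is
# replaced by a stressed respelling of that ending.
pagetitle = "cantare"
spellings = [("are", u"àre")]  # hypothetical (ending, ending_respelling) pair
for ending, ending_respelling in spellings:
  if pagetitle.endswith(ending):
    prevpart = pagetitle[:-len(ending)]
    respelling = prevpart + ending_respelling
    assert respelling == u"cantàre"
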
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Polish",
    pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  has_etym_sections = "==Etymology 1==" in secbody
  if has_etym_sections:
    # Check if either Pronunciation with pronunciation template above Etymology 1, or every
    # Etymology N section has Pronunciation with pronunciation template.
    saw_etym_1 = False
    cur_etym_header = None
    saw_pron_in_etym = False
    for k in range(1, len(subsections), 2):
      if "==Pronunciation==" in subsections[k]:
        secparsed = blib.parse_text(subsections[k + 1])
        for t in secparsed.filter_templates():
          tn = tname(t)
          if tn in pronun_templates:
            if saw_etym_1:
              saw_pron_in_etym = True
              break
            else:
              pagemsg("Already saw pronunciation template above ==Etymology 1==: %s" % unicode(t))
              return
        else:  # no break
          pagemsg("WARNING: Saw ==Pronunciation== section without pronunciation template, along with ==Etymology 1==; can't handle, skipping")
          return
      if "==Etymology 1==" in subsections[k]:
        saw_etym_1 = True
        cur_etym_header = subsections[k].strip()
      elif re.search("==Etymology [0-9]+==", subsections[k]):
        if not saw_pron_in_etym:
          pagemsg("WARNING: No ==Pronunciation== section above ==Etymology N== headers and saw %s without pronunciation template; can't handle, skipping"
            % cur_etym_header)
          return
        saw_pron_in_etym = False
        cur_etym_header = subsections[k].strip()
    if not saw_pron_in_etym:
      # Last Etymology N section didn't have pronunciation template.
      pagemsg("WARNING: No ==Pronunciation== section above ==Etymology N== headers and saw %s without pronunciation template; can't handle, skipping"
        % cur_etym_header)
      return
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in pronun_templates:
      pagemsg("Already saw pronunciation template: %s" % unicode(t))
      return
  if not args.ignore_lemma_respelling:
    lemmas = set()
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in infl_templates:
        def getp(param):
          return getparam(t, param)
        if getp("1") != "pl":
          pagemsg("WARNING: Wrong language in {{%s}}, skipping: %s" % (tn, unicode(t)))
          return
        lemma = getparam(t, "2")
        lemmas.add(lemma)
    if len(lemmas) > 1:
      pagemsg("WARNING: Saw inflection of multiple lemmas %s, skipping" % ",".join(lemmas))
      return
    if not lemmas:
      pagemsg("WARNING: Didn't see inflection template, skipping")
      return
    lemma = list(lemmas)[0]
    pl_p_prop, pl_p_respellings = get_pl_p_property(index, lemma)
    if pl_p_prop == "no-pl-p":
      pagemsg("WARNING: Lemma page %s has no {{pl-p}}, not sure what to do, skipping" % lemma)
      return
    elif pl_p_prop == "pl-p-respelling":
      pagemsg("WARNING: Lemma page %s has respelling(s) %s, skipping"
        % (lemma, ",".join(pl_p_respellings)))
      return
    else:
      pagemsg("Lemma page %s has {{pl-p}} without respelling, proceeding" % lemma)
  def construct_new_pron_template():
    return "{{pl-p}}", ""
  def insert_into_existing_pron_section(k):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in pronun_templates:
        pagemsg("Already saw pronunciation template: %s" % unicode(t))
        break
    else:  # no break
      new_pron_template, pron_prefix = construct_new_pron_template()
      # Remove existing rhymes/hyphenation/pl-IPA lines
      for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl", "hyphenation|pl"]:
        re_template = template.replace("|", r"\|")
        regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
        m = re.search(regex, subsections[k], re.M)
        if m:
          pagemsg("Removed existing %s" % m.group(1).strip())
          notes.append("remove existing {{%s}}" % template)
          subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
      for template in ["audio|pl"]:
        re_template = template.replace("|", r"\|")
        regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
        all_audios = re.findall(regex, subsections[k], re.M)
        if len(all_audios) > 1:
          pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s"
            % ",".join(x.strip() for x in all_audios))
          return
        if len(all_audios) == 1:
          audio_line = all_audios[0].strip()
          audiot = list(blib.parse_text(audio_line).filter_templates())[0]
          assert(tname(audiot) == "audio")
          if getparam(audiot, "1") != "pl":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
            return
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s"
                % (pn, pv, audio_line))
              return
          if audiogloss in ["Audio", "audio"]:
            audiogloss = ""
          params = "|a=%s" % audiofile
          if audiogloss:
            params += "|ac=%s" % audiogloss
          new_pron_template = new_pron_template[:-2] + params + new_pron_template[-2:]
          pagemsg("Removed existing %s in order to incorporate into {{pl-p}}" % audio_line)
          notes.append("incorporate existing {{%s}} into {{pl-p}}" % template)
          subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
      subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
      notes.append("insert %s into existing Pronunciation section" % new_pron_template)
      return True
  def insert_new_l3_pron_section(k):
    new_pron_template, pron_prefix = construct_new_pron_template()
    subsections[k:k] = ["===Pronunciation===\n", pron_prefix + new_pron_template + "\n\n"]
    notes.append("add top-level Polish pron %s" % new_pron_template)
  for k in xrange(2, len(subsections), 2):
    if "==Pronunciation==" in subsections[k - 1]:
      if not insert_into_existing_pron_section(k):
        return
      break
  else:  # no break
    k = 2
    while k < len(subsections) and re.search("==(Alternative forms|Etymology)==", subsections[k - 1]):
      k += 2
    if k - 1 >= len(subsections):
      pagemsg("WARNING: No lemma or non-lemma section at top level")
      return
    insert_new_l3_pron_section(k - 1)
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes

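# A standalone check of the line-removal regex used above: it matches a whole
# bulleted template line such as "* {{pl-IPA}}" or "* {{pl-IPA|foo}}", so the
# line can be captured or deleted in a single re.search/re.sub call.
import re

regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % "pl-IPA"
section = "* {{pl-IPA|zupa}}\n* {{audio|pl|Pl-zupa.ogg}}\n"
m = re.search(regex, section, re.M)
assert m.group(1) == "* {{pl-IPA|zupa}}\n"
assert re.sub(regex, "", section, 0, re.M) == "* {{audio|pl|Pl-zupa.ogg}}\n"
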
def process_text_on_page(pageindex, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (pageindex, pagetitle, txt))
  notes = []
  retval = blib.find_modifiable_lang_section(
    text, None if args.partial_page else args.langname, pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  defn_subsection = None
  saw_two_defn_subsections = False
  for k in xrange(2, len(subsections), 2):
    if re.search("=Etymology", subsections[k - 1]):
      defn_subsection = None
      saw_two_defn_subsections = False
    if "\n#" in subsections[k] and not re.search("=(Etymology|Pronunciation|Usage notes)", subsections[k - 1]):
      if defn_subsection:
        saw_two_defn_subsections = True
      defn_subsection = k
      defn_subsection_level = get_subsection_level(subsections[k - 1])
      saw_nyms_already = set()
    m = re.search("=(Synonyms|Antonyms)=", subsections[k - 1])
    if m:
      syntype = m.group(1).lower()[:-1]
      if defn_subsection is None:
        pagemsg("WARNING: Encountered %ss section #%s without preceding definition section"
          % (syntype, k // 2 + 1))
        continue
      synant_subsection_level = get_subsection_level(subsections[k - 1])
      if saw_two_defn_subsections and synant_subsection_level <= defn_subsection_level:
        pagemsg("WARNING: Saw two definition sections followed by %s section #%s at same level or higher, skipping section"
          % (syntype, k // 2 + 1))
        continue
      if syntype in saw_nyms_already:
        pagemsg("WARNING: Encountered two %s sections without intervening definition section" % syntype)
        continue
      def parse_syns(syns):
        retval = []
        syns = syns.strip()
        orig_syns = syns
        qualifier = None
        while True:
          # check for qualifiers specified using a qualifier template
          m = re.search("^(.*?)\{\{(?:qualifier|qual|q|i)\|([^{}|=]*)\}\}(.*?)$", syns)
          if m:
            before_text, qualifier, after_text = m.groups()
            syns = before_text + after_text
            break
          # check for qualifiers using e.g. {{lb|ru|...}}
          m = re.search("^(.*?)\{\{(?:lb)\|%s\|([^{}=]*)\}\}(.*?)$" % re.escape(args.lang), syns)
          if m:
            before_text, qualifier, after_text = m.groups()
            # do this before handling often/sometimes/etc. in case the label has often|_|pejorative or similar
            qualifier = qualifier.replace("|_|", " ")
            terms_no_following_comma = [
              "also", "and", "or", "by", "with", "except", "outside", "in", "chiefly",
              "mainly", "mostly", "primarily", "especially", "particularly", "excluding",
              "extremely", "frequently", "humorously", "including", "many", "markedly",
              "mildly", "now", "occasionally", "of", "often", "sometimes", "originally",
              "possibly", "rarely", "slightly", "somewhat", "strongly", "then",
              "typically", "usually", "very"
            ]
            qualifier = re.sub(r"\b(%s)\|" % "|".join(terms_no_following_comma), r"\1 ", qualifier)
            qualifier = qualifier.replace("|", ", ")
            syns = before_text + after_text
            break
          # check for qualifier-like ''(...)''
          m = re.search("^(.*?)''\(([^'{}]*)\)''(.*?)$", syns)
          if m:
            before_text, qualifier, after_text = m.groups()
            syns = before_text + after_text
            break
          # check for qualifier-like (''...'')
          m = re.search("^(.*?)\(''([^'{}]*)''\)(.*?)$", syns)
          if m:
            before_text, qualifier, after_text = m.groups()
            syns = before_text + after_text
            break
          break
        # Split on commas, semicolons, slashes but don't split commas etc. inside of braces or brackets
        split_by_brackets_braces = re.split(r"(\{\{[^{}]*\}\}|\[\[[^\[\]]*\]\])", syns.strip())
        comma_separated_runs = blib.split_alternating_runs(split_by_brackets_braces, "(?: *[,;] *| +/ +)")
        syns = ["".join(comma_separated_run) for comma_separated_run in comma_separated_runs]
        if qualifier and len(syns) > 1:
          pagemsg("WARNING: Saw qualifier along with multiple synonyms, not sure how to proceed: <%s>" % orig_syns)
          return None
        joiner_after = ";" if qualifier or len(syns) > 1 else ","
        for synindex, syn in enumerate(syns):
          orig_syn = syn
          m = re.search(r"^\{\{[lm]\|%s\|([^{}]*)\}\}$" % re.escape(args.lang), syn)
          if m:
            decl = blib.parse_text(syn).filter_templates()[0]
            gender = None
            translit = None
            raw_syn = None
            alt = None
            gloss = None
            lit = None
            pos = None
            for param in decl.params:
              pn = pname(param)
              pv = unicode(param.value)
              if pn in ["1"]:
                pass
              elif pn == "2":
                raw_syn = pv
              elif pn == "3":
                alt = pv
              elif pn in ["4", "t", "gloss"]:
                gloss = pv
              elif pn == "g":
                gender = pv
              elif pn in ["g2", "g3", "g4"]:
                if not gender:
                  pagemsg("WARNING: Saw %s=%s without g= in %s <%s> in line: %s"
                    % (pn, pv, syntype, orig_syn, line))
                  return None
                gender += "," + pv
              elif pn == "tr":
                translit = pv
              elif pn == "lit":
                lit = pv
              elif pn == "pos":
                pos = pv
              else:
                pagemsg("WARNING: Unrecognized param %s=%s in %s <%s> in line: %s"
                  % (pn, pv, syntype, orig_syn, line))
                return None
            if not raw_syn:
              pagemsg("WARNING: Couldn't find raw synonym in %s <%s> in line: %s"
                % (syntype, orig_syn, line))
              return None
            if raw_syn and alt:
              if "[[" in raw_syn or "[[" in alt:
                pagemsg("WARNING: Saw both synonym=%s and alt=%s with brackets in one or both in %s <%s> in line: %s"
                  % (raw_syn, alt, syntype, orig_syn, line))
                return None
              syn = "[[%s|%s]]" % (raw_syn, alt)
            elif raw_syn:
              if "[[" in raw_syn:
                syn = raw_syn
              else:
                syn = "[[%s]]" % raw_syn
            elif alt:
              pagemsg("WARNING: Saw alt=%s but no link text in %s <%s> in line: %s"
                % (alt, syntype, orig_syn, line))
              return
          else:
            def add_brackets_if_not_already(m):
              raw_syn = m.group(1)
              if "[[" not in raw_syn:
                raw_syn = "[[%s]]" % raw_syn
              return raw_syn
            syn = re.sub(r"\{\{[lm]\|%s\|([^{}=]*)\}\}" % re.escape(args.lang),
              add_brackets_if_not_already, syn)
            gender = None
            translit = None
            gloss = None
            lit = None
            pos = None
            if "{{" in syn or "}}" in syn:
              pagemsg("WARNING: Unmatched braces in %s <%s> in line: %s" % (syntype, orig_syn, line))
              return None
            if "''" in syn:
              pagemsg("WARNING: Italicized text in %s <%s> in line: %s" % (syntype, orig_syn, line))
              return None
            if "(" in syn or ")" in syn:
              pagemsg("WARNING: Unmatched parens in %s <%s> in line: %s" % (syntype, orig_syn, line))
              return None
            if ":" in syn:
              pagemsg("WARNING: Unmatched colon in %s <%s> in line: %s" % (syntype, orig_syn, line))
              return None
            # Strip brackets around entire synonym
            syn = re.sub(r"^\[\[([^\[\]|{}]*)\]\]$", r"\1", syn)
            # If there are brackets around some words but not all, put brackets around the remaining words
            if "[[" in syn:
              split_by_brackets = re.split(r"([^ ]*\[\[[^\[\]]*\]\][^ ]*)", syn)
              def maybe_add_brackets(m):
                text = m.group(1)
                if "[" in text or "]" in text:
                  pagemsg("WARNING: Saw nested brackets in %s in %s <%s> in line: %s"
                    % (text, syntype, orig_syn, line))
                  return text
                if not re.search(r"\w", text, re.U):
                  pagemsg("Not adding brackets around '%s', saw no letters in %s <%s> in line: %s"
                    % (text, syntype, orig_syn, line))
                  return text
                return "[[%s]]" % text
              # Put brackets around the remaining words not already bracketed or partially bracketed. But don't put
              # brackets around words inside of HTML comments, and don't include punctuation inside the brackets.
              for i in xrange(0, len(split_by_brackets), 2):
                split_out_comments = re.split("(<!--.*?-->)", split_by_brackets[i])
                for j in xrange(0, len(split_out_comments), 2):
                  split_out_comments[j] = re.sub("([^ ,*/{}:;()?!+<>]+)",
                    maybe_add_brackets, split_out_comments[j])
                split_by_brackets[i] = "".join(split_out_comments)
              new_syn = "".join(split_by_brackets)
              if new_syn != syn:
                pagemsg("Add brackets to '%s', producing '%s'" % (syn, new_syn))
                syn = new_syn
          other_params = [
            ("tr", translit),
            ("t", gloss),
            ("q", qualifier),
            ("g", gender),
            ("pos", pos),
            ("lit", lit),
          ]
          # Set the joiner_after to None for everything but the last synonym on the row; we will then change
          # all commas to semicolons if there is any semicolon, so we are consistently using commas or
          # semicolons to separate groups of synonyms.
          retval.append((syn, other_params, joiner_after if synindex == len(syns) - 1 else None))
        return retval
      def find_defns():
        m = re.search(r"\A(.*?)((?:^#[^\n]*\n)+)(.*?)\Z", subsections[defn_subsection], re.M | re.S)
        if not m:
          pagemsg("WARNING: Couldn't find definitions in definition subsection #%s"
            % (defn_subsection // 2 + 1))
          return None, None, None
        before_defn_text, defn_text, after_defn_text = m.groups()
        if re.search("^#", before_defn_text, re.M) or re.search("^#", after_defn_text, re.M):
          pagemsg("WARNING: Saw definitions in before or after text in definition subsection #%s, not sure what to do"
            % (defn_subsection // 2 + 1))
          return None, None, None
        if re.search("^##", defn_text, re.M):
          pagemsg("WARNING: Found ## definition in definition subsection #%s, not sure what to do"
            % (defn_subsection // 2 + 1))
          return None, None, None
        defns = re.split("^(#[^*:].*\n(?:#[*:].*\n)*)", defn_text, 0, re.M)
        for between_index in xrange(0, len(defns), 2):
          if defns[between_index]:
            pagemsg("WARNING: Saw unknown text <%s> between definitions, not sure what to do"
              % defns[between_index].strip())
            return None, None, None
        defns = [x for i, x in enumerate(defns) if i % 2 == 1]
        return before_defn_text, defns, after_defn_text
      def add_syns_to_defn(syns, defn, add_fixme):
        for syn, other_params, joiner_after in syns:
          if not syn and joiner_after is not None:
            pagemsg("WARNING: Would remove last synonym from a group: %s"
              % ",".join(syn for syn, other_params, joiner_after in syns))
            return None
        syns = [(syn, other_params, joiner_after) for syn, other_params, joiner_after in syns if syn]
        if len(syns) == 0:
          return defn
        any_semicolon = any(joiner_after == ";" for sy, other_params, joiner_after in syns)
        if any_semicolon:
          syns = [(syn, other_params, ";" if joiner_after is not None and any_semicolon else joiner_after)
            for syn, other_params, joiner_after in syns]
        saw_nyms_already.add(syntype)
        joined_syns = "|".join(
          "%s%s%s" % (syn,
            "".join("<%s:%s>" % (param, val) if val else "" for param, val in other_params),
            "|" + joiner_after if i < len(syns) - 1 and joiner_after is not None and joiner_after != "," else "")
          for i, (syn, other_params, joiner_after) in enumerate(syns))
        fixme_msg = " FIXME" if add_fixme else ""
        if syntype == "synonym":
          if re.search(r"\{\{(syn|synonyms)\|", defn):
            pagemsg("WARNING: Already saw inline synonyms in definition: <%s>" % defn)
            return None
          return re.sub(r"^(.*\n)",
            r"\1#: {{syn|%s|%s}}%s" % (args.lang, joined_syns, fixme_msg) + "\n", defn)
        else:
          if re.search(r"\{\{(ant|antonyms)\|", defn):
            pagemsg("WARNING: Already saw inline antonyms in definition: <%s>" % defn)
            return None
          # Need to put antonyms after any inline synonyms
          return re.sub(r"^(.*\n(?:#: *\{\{(?:syn|synonyms)\|.*\n)*)",
            r"\1#: {{ant|%s|%s}}%s" % (args.lang, joined_syns, fixme_msg) + "\n", defn)
      # Find definitions
      before_defn_text, defns, after_defn_text = find_defns()
      if before_defn_text is None:
        continue
      def put_back_new_defns(defns, syndesc, skipped_a_line, lines, skipped_linenos):
        subsections[defn_subsection] = before_defn_text + "".join(defns) + after_defn_text
        if skipped_a_line:
          skipped_linenos = sorted(skipped_linenos)
          skipped_lines = [lines[lineno] for lineno in skipped_linenos]
          subsections[k] = "\n".join(skipped_lines)
        else:
          subsections[k - 1] = ""
          subsections[k] = ""
        notes.append("Convert %ss in %s subsection %s to inline %ss in subsection %s based on %s"
          % (syntype, args.langname, k // 2 + 1, syntype, defn_subsection // 2 + 1, syndesc))
      # Pull out all synonyms by number
      unparsable = False
      syns_by_number = defaultdict(list)
      skipped_lines = []
      skipped_a_line = False
      lines = subsections[k].split("\n")
      for lineno, line in enumerate(lines):
        if not line.strip():
          skipped_lines.append(lineno)
          continue
        # Look for '* (1) {{l|...}}'
        m = re.search(r"^\* *\(([0-9]+)\) *(.*?)$", line)
        if m:
          defnum, syns = m.groups()
        else:
          # Look for '* {{l|...}} (1)'
          m = re.search(r"^\* *(.*?) *\(([0-9]+)\)$", line)
          if m:
            syns, defnum = m.groups()
          else:
            # Look for '* {{sense|1}} {{l|...}}'
            m = re.search(r"^\* *\{\{(?:s|sense)\|([0-9]+)\}\} *(.*?)$", line)
            if m:
              defnum, syns = m.groups()
            else:
              # couldn't parse line
              pagemsg("Couldn't parse %s line for numbers: %s" % (syntype, line))
              unparsable = True
              break
        parsed_syns = parse_syns(syns)
        if parsed_syns is None:
          skipped_a_line = True
          skipped_lines.append(lineno)
        else:
          syns_by_number[int(defnum)] += parsed_syns
      if not unparsable:
        # Find definitions
        before_defn_text, defns, after_defn_text = find_defns()
        if before_defn_text is None:
          continue
        # Don't consider definitions with {{reflexive of|...}} in them
        reindexed_defns = {}
        next_index = 1
        for index, defn in enumerate(defns):
          if "{{reflexive of|" in defn:
            continue
          reindexed_defns[next_index] = index
          next_index += 1
        # Make sure synonyms don't refer to nonexistent definition
        max_syn = max(syns_by_number.keys())
        max_defn = max(reindexed_defns.keys())
        if max_syn > max_defn:
          pagemsg("WARNING: Numbered synonyms refer to maximum %s > maximum defn %s" % (max_syn, max_defn))
          continue
        # Add inline synonyms
        must_continue = False
        for synno, syns in syns_by_number.iteritems():
          index = reindexed_defns[synno]
          new_defn = add_syns_to_defn(syns, defns[index], False)
          if new_defn is None:
            must_continue = True
            break
          defns[index] = new_defn
        if must_continue:
          continue
        # Put back new definition text and clear out synonyms
        put_back_new_defns(defns, "numbered %ss" % syntype, skipped_a_line, lines, skipped_lines)
        continue
      # Try checking for {{sense|...}} or (''...'') indicators
      unparsable = False
      syns_by_tag = {}
      skipped_lines = []
      skipped_a_line = False
      must_continue = False
      lines = subsections[k].split("\n")
      for lineno, line in enumerate(lines):
        if not line.strip():
          skipped_lines.append(lineno)
          continue
        m = re.search(r"^\* *\(''([^']*?)''\) *(.*?)$", line)
        if m:
          tag, syns = m.groups()
        else:
          m = re.search(r"^\* *''\(([^']*?)\)'' *(.*?)$", line)
          if m:
            tag, syns = m.groups()
          else:
            m = re.search(r"^\* *\{\{(?:s|sense)\|([^{}|]*?)\}\} *(.*?)$", line)
            if m:
              tag, syns = m.groups()
            else:
              # couldn't parse line
              pagemsg("Couldn't parse %s line for tags: %s" % (syntype, line))
              unparsable = True
              break
        tag = re.sub(r",? +etc\.?$", "", tag)
        parsed_syns = parse_syns(syns)
        if parsed_syns is None:
          skipped_a_line = True
          skipped_lines.append(lineno)
        else:
          if tag in syns_by_tag:
            pagemsg("WARNING: Saw the same tag '%s' twice" % tag)
            must_continue = True
            break
          syns_by_tag[tag] = (parsed_syns, lineno)
      if must_continue:
        continue
      if not unparsable:
        # Pull out each definition (not including continuations) and remove links
        unlinked_defns = []
        must_continue = False
        for defn in defns:
          m = re.search("^# *(.*)\n", defn)
          if not m:
            pagemsg("WARNING: Something wrong, can't pull out definition from <%s>" % defn)
            must_continue = True
            break
          unlinked_defns.append(blib.remove_links(m.group(1)))
        if must_continue:
          continue
        # Match tags against definitions
        tag_to_defn = {}
        defn_to_tag = {}
        must_continue = False
        bad = False
        for tag in syns_by_tag.keys():
          matching_defn = None
          must_break = False
          for defno, unlinked_defn in enumerate(unlinked_defns):
            tag_re = r"\b" + re.sub(r"[ ,.*/{}:;()?!\[\]+]+", r"\\b.*\\b", tag) + r"\b"
            if re.search(tag_re, unlinked_defn):
              if matching_defn is not None:
                pagemsg("WARNING: Matched tag '%s' against both defn <%s> and <%s>"
                  % (tag, unlinked_defns[matching_defn], unlinked_defn))
                if args.do_your_best:
                  bad = True
                else:
                  must_break = True
                  must_continue = True
                  break
              else:
                matching_defn = defno
          if must_break:
            break
          if not bad and matching_defn is None:
            pagemsg("WARNING: Couldn't match tag '%s' against definitions %s"
              % (tag, ", ".join("<%s>" % unlinked_defn for unlinked_defn in unlinked_defns)))
            if args.do_your_best:
              bad = True
            else:
              must_continue = True
              break
          if not bad and matching_defn in defn_to_tag:
            pagemsg("WARNING: Matched two tags '%s' and '%s' against the same defn <%s>"
              % (tag, defn_to_tag[matching_defn], unlinked_defns[matching_defn]))
            if args.do_your_best:
              bad = True
            else:
              must_continue = True
              break
          if not bad:
            defn_to_tag[matching_defn] = tag
            tag_to_defn[tag] = matching_defn
        if must_continue:
          continue
        # Add inline synonyms
        must_continue = False
        for tag, (syns, lineno) in syns_by_tag.iteritems():
          if tag in tag_to_defn:
            index = tag_to_defn[tag]
            new_defn = add_syns_to_defn(syns, defns[index], bad)
            if new_defn is None:
              must_continue = True
              break
            defns[index] = new_defn
          else:
            skipped_a_line = True
            skipped_lines.append(lineno)
        if must_continue:
          continue
        # Put back new definition text and clear out synonyms
        put_back_new_defns(defns, "tagged %ss" % syntype, skipped_a_line, lines, skipped_lines)
        continue
      # Add synonyms if only one definition or --do-your-best
      if len(defns) > 1:
        pagemsg("WARNING: Saw %s subsection %s with %s definitions and don't know where to add, %s"
          % (syntype, k // 2 + 1, len(defns),
             "adding to first definition" if args.do_your_best else "can't add"))
      if len(defns) == 1 or args.do_your_best:
        unparsable = False
        all_syns = []
        syns_by_tag = {}
        skipped_lines = []
        skipped_a_line = False
        lines = subsections[k].split("\n")
        total_syns = 0
        for lineno, line in enumerate(lines):
          if not line.strip():
            skipped_lines.append(lineno)
            continue
          m = re.search(r"^\* *(.*?)$", line)
          if m:
            syns = m.group(1)
          else:
            # couldn't parse line
            pagemsg("WARNING: Couldn't parse %s line in last stage: %s" % (syntype, line))
            unparsable = True
            break
          parsed_syns = parse_syns(syns)
          if parsed_syns is None:
            skipped_a_line = True
            skipped_lines.append(lineno)
          else:
            all_syns.append((lineno, total_syns, parsed_syns))
            total_syns += 1
        if not unparsable:
          changed = False
          if total_syns > 1 and len(defns) == total_syns:
            # only happens when --do-your-best
            pagemsg("Saw %s definitions and %s synonym lines, matching definitions and synonym lines"
              % (len(defns), total_syns))
            for lineno, synno, parsed_syns in all_syns:
              # Add inline synonyms
              new_defn = add_syns_to_defn(parsed_syns, defns[synno], True)
              if new_defn is None:
                pagemsg("WARNING: Couldn't add %s line when matching definitions and synonym lines: %s"
                  % (syntype, lines[lineno]))
                skipped_a_line = True
                skipped_lines.append(lineno)
                continue
              defns[synno] = new_defn
              changed = True
          else:
            if len(defns) > 1:
              # only happens when --do-your-best
              pagemsg("WARNING: Saw %s definitions but %s synonym lines, adding to first definition"
                % (len(defns), total_syns))
            # If more than one synonym line, add a qualifier specifying the original synonym line number
            # to the first synonym on the line to make it easier to manually line up synonyms with definitions.
            if total_syns > 1:
              all_syns = [(lineno, synno, [
                (syn, other_params + [("qq", "l%s" % (synno + 1))] if synindex == 0 else other_params, joiner_after)
                for synindex, (syn, other_params, joiner_after) in enumerate(parsed_syns)
              ]) for lineno, synno, parsed_syns in all_syns]
            # Add inline synonyms
            all_syns = [syn for lineno, synno, parsed_syns in all_syns for syn in parsed_syns]  # flatten
            new_defn = add_syns_to_defn(all_syns, defns[0], len(defns) > 1)
            if new_defn is None:
              continue
            defns[0] = new_defn
            changed = True
          # Put back new definition text and clear out moved synonyms
          if changed:
            put_back_new_defns(defns, "%ss with only one definition" % syntype,
              skipped_a_line, lines, skipped_lines)
            continue
  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes

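# A standalone check of the tag-matching regex built above: runs of spaces and
# punctuation inside a tag become \b.*\b, so a multi-word tag matches any
# definition containing those words in order.
import re

tag = "of a person"
tag_re = r"\b" + re.sub(r"[ ,.*/{}:;()?!\[\]+]+", r"\\b.*\\b", tag) + r"\b"
assert tag_re == r"\bof\b.*\ba\b.*\bperson\b"
assert re.search(tag_re, "of or relating to a young person")
assert not re.search(tag_re, "a person")
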
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  m = re.search("^Category:(Japanese|Okinawan) terms spelled with (.*) read as (.*)$", pagetitle)
  if not m:
    pagemsg("Skipped")
    return
  notes = []
  lang, spelling, reading = m.groups()
  langcode = lang == "Japanese" and "ja" or "ryu"
  spelling_page = pywikibot.Page(site, spelling)
  def pagemsg_with_spelling(txt):
    pagemsg("%s: %s" % (spelling, txt))
  def errandpagemsg_with_spelling(txt):
    pagemsg_with_spelling(txt)
    errmsg("Page %s %s: %s: %s" % (index, pagetitle, spelling, txt))
  if not blib.safe_page_exists(spelling_page, pagemsg_with_spelling):
    pagemsg_with_spelling("Spelling page doesn't exist, skipping")
    return
  spelling_page_text = blib.safe_page_text(spelling_page, pagemsg_with_spelling)
  retval = blib.find_modifiable_lang_section(spelling_page_text, lang, pagemsg_with_spelling)
  if retval is None:
    pagemsg_with_spelling("WARNING: Couldn't find %s section" % lang)
    return
  sections, j, secbody, sectail, has_non_lang = retval
  parsed = blib.parse_text(secbody)
  saw_readings_template = False
  reading_types = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "%s-readings" % langcode:
      saw_readings_template = True
      for reading_type in allowed_reading_types:
        readings = getparam(t, reading_type).strip()
        if readings:
          readings = re.split(r"\s*,\s*", readings)
          readings = [re.sub("[<-].*", "", r) for r in readings]
          if reading in readings:
            reading_type = canonicalize_reading_types.get(reading_type, reading_type)
            pagemsg_with_spelling("Appending reading type %s based on %s" % (reading_type, unicode(t)))
            if reading_type not in reading_types:
              reading_types.append(reading_type)
              notes.append("add %s reading based on {{%s-readings}} on page [[%s]]"
                % (reading_type, langcode, spelling))
      if not reading_types:
        pagemsg_with_spelling("WARNING: Can't find reading %s among readings listed in %s"
          % (reading, unicode(t).replace("\n", r"\n")))
  if not saw_readings_template:
    pagemsg_with_spelling("WARNING: Couldn't find reading template {{%s-readings}}" % langcode)
  if reading_types:
    contents = "{{auto cat|%s}}" % "|".join(reading_types)
    return contents, notes
  else:
    pagemsg_with_spelling("WARNING: Can't find reading %s on page" % reading)
  for i, contents_page in blib.cat_articles(re.sub("^Category:", "", pagetitle)):
    contents_title = unicode(contents_page.title())
    def pagemsg_with_contents(txt):
      pagemsg("%s: %s" % (contents_title, txt))
    def errandpagemsg_with_contents(txt):
      pagemsg_with_contents(txt)
      errmsg("Page %s %s: %s: %s" % (index, pagetitle, contents_title, txt))
    contents_page_text = blib.safe_page_text(contents_page, pagemsg_with_contents)
    retval = blib.find_modifiable_lang_section(contents_page_text, lang, pagemsg_with_contents)
    if retval is None:
      pagemsg_with_contents("WARNING: Couldn't find %s section" % lang)
      return
    sections, j, secbody, sectail, has_non_lang = retval
    saw_kanjitab = False
    must_continue = False
    for ch in contents_title:
      if 0xD800 <= ord(ch) <= 0xDFFF:
        pagemsg_with_contents("WARNING: Surrogates in page name, skipping: %s" % ord(ch))
        must_continue = True
        break
    if must_continue:
      continue
    chars_in_contents_title = [x for x in contents_title]
    for i, ch in enumerate(chars_in_contents_title):
      if ch == u"々":  # kanji repeat char
        if i == 0:
          pagemsg_with_contents(u"Repeat char 々 found at beginning of contents title")
          must_continue = True
          break
        else:
          chars_in_contents_title[i] = chars_in_contents_title[i - 1]
    if must_continue:
      continue
    kanji_in_contents_title = [x for x in chars_in_contents_title
      if unicodedata.name(x).startswith("CJK UNIFIED IDEOGRAPH")]
    parsed = blib.parse_text(secbody)
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "%s-kanjitab" % langcode:
        saw_kanjitab = True
        readings = []
        for i in range(1, 10):
          contents_reading = getparam(t, str(i))
          if contents_reading:
            readings.append(contents_reading)
        if len(kanji_in_contents_title) != len(readings):
          pagemsg_with_contents("WARNING: Saw %s chars in contents title but %s readings %s, skipping: %s"
            % (len(kanji_in_contents_title), len(readings), ",".join(readings), unicode(t)))
          continue
        yomi = getparam(t, "yomi")
        if not yomi:
          pagemsg_with_contents("WARNING: No yomi, skipping: %s" % unicode(t))
          continue
        if "," in yomi or re.search("[0-9]$", yomi):
          yomi = yomi.split(",")
        if type(yomi) is list:
          expanded_yomi = []
          for y in yomi:
            m = re.search("^(.*?)([0-9]+)$", y)
            if m:
              baseyomi, numyomi = m.groups()
              numyomi = int(numyomi)
              expanded_yomi.extend([baseyomi] * numyomi)
            else:
              expanded_yomi.append(y)
          if expanded_yomi != yomi:
            pagemsg_with_contents("Expanding yomi %s to %s" % (",".join(yomi), ",".join(expanded_yomi)))
          yomi = expanded_yomi
        if type(yomi) is list and len(yomi) != len(kanji_in_contents_title):
          pagemsg_with_contents("WARNING: %s values in yomi=%s but %s chars in contents, skipping: %s"
            % (len(yomi), ",".join(yomi), len(kanji_in_contents_title), unicode(t)))
          continue
        saw_spelling_in_contents = False
        must_continue = False
        for i, (ch, contents_reading) in enumerate(zip(kanji_in_contents_title, readings)):
          if ch == spelling:
            saw_spelling_in_contents = True
            if contents_reading == reading:
              if type(yomi) is list:
                reading_type = yomi[i]
              else:
                reading_type = yomi
              yomi_to_canonical_reading_type = {
                "o": "on", "on": "on", "kanon": "kanon", "goon": "goon", "soon": "soon",
                "toon": "toon", "kan": "kanyoon", "kanyo": "kanyoon", "kanyoon": "kanyoon",
                "k": "kun", "kun": "kun", "juku": "jukujikun", "jukuji": "jukujikun",
                "jukujikun": "jukujikun", "n": "nanori", "nanori": "nanori",
                "ok": "jubakoyomi", "j": "jubakoyomi", "ko": "yutoyomi", "y": "yutoyomi",
                "irr": "irregular", "irreg": "irregular", "irregular": "irregular",
              }
              if reading_type not in yomi_to_canonical_reading_type:
                pagemsg_with_contents("WARNING: Unrecognized reading type %s: %s"
                  % (reading_type, unicode(t)))
                must_continue = True
                break
              reading_type = yomi_to_canonical_reading_type[reading_type]
              if reading_type not in allowed_reading_types:
                pagemsg_with_contents("WARNING: Disallowed reading type %s: %s"
                  % (reading_type, unicode(t)))
                must_continue = True
                break
              reading_type = canonicalize_reading_types.get(reading_type, reading_type)
              pagemsg_with_contents("Appending reading type %s based on %s" % (reading_type, unicode(t)))
              if reading_type not in reading_types:
                reading_types.append(reading_type)
                notes.append("add %s reading based on {{%s-kanjitab}} on page [[%s]]"
                  % (reading_type, langcode, contents_title))
        if must_continue:
          continue
        if not saw_spelling_in_contents:
          pagemsg_with_contents("WARNING: Didn't see spelling in contents: %s" % unicode(t))
          continue
    if not saw_kanjitab:
      pagemsg_with_contents("WARNING: Didn't see {{%s-kanjitab}}" % langcode)
  if reading_types:
    contents = "{{auto cat|%s}}" % "|".join(reading_types)
    return contents, notes
  else:
    pagemsg_with_spelling("WARNING: Can't find reading %s by looking through category contents" % reading)

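# A standalone sketch of the yomi expansion above: a numeric suffix repeats
# the preceding reading type, so "on2,kun" describes three kanji.
import re

yomi = "on2,kun".split(",")
expanded_yomi = []
for y in yomi:
  m = re.search("^(.*?)([0-9]+)$", y)
  if m:
    baseyomi, numyomi = m.groups()
    expanded_yomi.extend([baseyomi] * int(numyomi))
  else:
    expanded_yomi.append(y)
assert expanded_yomi == ["on", "on", "kun"]
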
def process_text_on_page(index, pagetitle, text): global args def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) notes = [] retval = blib.find_modifiable_lang_section( text, None if args.partial_page else "Italian", pagemsg, force_final_nls=True) if retval is None: return sections, j, secbody, sectail, has_non_lang = retval def extract_pronouns(form1, form2): prons = [] if form1: prons.append(form1) if form2.startswith("glie"): prons.extend(["glie", form2[4:]]) else: prons.append(form2) return prons def extract_base(pron1, pron2): if pron1: prontext = pron1 + pron2 else: prontext = pron2 m = re.search(r"^(.*)%s$" % prontext, pagetitle) if not m: pagemsg("WARNING: Page title should end in '%s' but doesn't" % prontext) return None return m.group(1) def fix_compound_of(m): origtext = m.group(0) m = re.search(r"^# Compound of (.*?)\.*\n$", origtext) if not m: pagemsg("WARNING: Internal error: Can't match line: %s" % origtext) return origtext text = m.group(1) def do_fix_compound_of(text): # Convert {{m|it|ci}} to [[ci]] text = re.sub(r"\{\{m\|it\|([^{}]*?)\}\}", r"[[\1]]", text) # Convert [[ci#Italian|ci]] to [[ci]] text = re.sub(r"\[\[[^\[\]|]*?#Italian\|([^\[\]|]*?)\]\]", r"[[\1]]", text) m = re.search( r"^(?:the )?gerund of '*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$", text) if m: inf, pron1, pron2 = m.groups() prons = extract_pronouns(pron1, pron2) base = extract_base(pron1, pron2) if not base: return None notes.append( "templatize Italian gerund compound-of expression") if len(prons) == 1 and base.endswith("ando"): return "" elif len(prons) == 1: return "|inf=%s" % inf elif base.endswith("ando"): return "|%s" % "|".join(prons) else: return "|%s|inf=%s" % ("|".join(prons), inf) m = re.search( r"^imperative(?: \(\[*(tu|noi|voi?|singular|plural|let's|)\]*(?: (?:form|person))?\))? of '*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$", text) if m: imp_pers, inf, pron1, pron2 = m.groups() if not imp_pers: base = extract_base(pron1, pron2) if not base: return None if base.endswith("te"): imp_pers = "voi" else: imp_pers = "tu" prons = extract_pronouns(pron1, pron2) imp_pers_to_pos = { "tu": "imp2s", "noi": "imp1p", "voi": "imp2p", "vo": "imp2p", "singular": "imp2s", "plural": "imp2p", "let's": "imp1p" } pos = imp_pers_to_pos[imp_pers] notes.append( "templatize Italian imperative compound-of expression") if len(prons) == 1: return "|pos=%s|inf=%s" % (pos, inf) else: return "|%s|pos=%s|inf=%s" % ("|".join(prons), pos, inf) m = re.search( r"^'*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? 
and '*\[\[([^\[\]|]*?)\]\]'*$", text) if m: inf, pron1, pron2 = m.groups() prons = extract_pronouns(pron1, pron2) if inf.endswith("ando"): notes.append( "templatize Italian gerund compound-of expression") if len(prons) == 1: return "" else: return "|%s" % "|".join(prons) if not inf.endswith("re") and not re.search("r[mtscv]i$", inf): pagemsg("WARNING: Unrecognized infinitive %s: %s" % (inf, origtext.strip())) return None notes.append( "templatize Italian infinitive compound-of expression") if len(prons) == 1 and inf.endswith("re"): return "" inf_pron_to_pos = { "mi": "inf1s", "ti": "inf2s", "ci": "inf1p", "vi": "inf2p" } if re.search("[mtcv]i$", inf): pos = inf_pron_to_pos[inf[-2:]] return "|%s|%s|pos=%s" % (inf, "|".join(prons), pos) elif len(prons) == 1 and pagetitle.endswith(prons[0]): return "|pos=inf|inf=%s" % inf elif inf.endswith("re"): return "|%s" % "|".join(prons) else: return "|%s|pos=inf|inf=%s" % ("|".join(prons), inf) m = re.search( r"^(feminine|plural|masculine plural|feminine plural|) *past participle of '*\[\[([^\[\]|]*?)\]\]'*(?:, '*\[\[([^\[\]|]*?)\]\]'*)? and '*\[\[([^\[\]|]*?)\]\]'*$", text) if m: ppform, inf, pron1, pron2 = m.groups() prons = extract_pronouns(pron1, pron2) ppform_to_pos = { "": "ppms", "feminine": "ppfs", "plural": "ppmp", "masculine plural": "ppmp", "feminine plural": "ppfp" } pos = ppform_to_pos[ppform] notes.append( "templatize Italian past participle compound-of expression" ) if len(prons) == 1: return "|pos=%s|inf=%s" % (pos, inf) else: return "|%s|pos=%s|inf=%s" % ("|".join(prons), pos, inf) pagemsg("WARNING: Unrecognized raw compound-of expression: %s" % origtext.strip()) return None retval = do_fix_compound_of(text) if retval is None: return origtext return "# {{it-compound of%s}}\n" % retval hacked_secbody = re.sub(r"# \[\[[Cc]ompound\|[Cc]ompound\]\]", "# Compound", secbody) hacked_secbody = re.sub(r"# compound", "# Compound", hacked_secbody) hacked_secbody = re.sub( r"# \{\{(?:non-gloss definition|n-g)\|[Cc]ompound (.*)\}\}", r"# Compound \1", hacked_secbody) fixed_secbody = re.sub(r"# (Compound of.*?\.*)\n", fix_compound_of, hacked_secbody) if "{{it-compound of" in fixed_secbody: newsecbody = re.sub(r"\{\{head\|it\|combined forms?\}\}", "{{head|it|verb form}}", fixed_secbody) if newsecbody != fixed_secbody: notes.append( "replace {{head|it|combined form}} with {{head|it|verb form}}") fixed_secbody = newsecbody secbody = fixed_secbody # Strip extra newlines added to secbody sections[j] = secbody.rstrip("\n") + sectail text = "".join(sections) return text, notes
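# Illustrative sketch of the line-level rewriting pattern used above: re.sub with a
# function replacement lets each "# Compound of ..." definition line be parsed and
# either templatized or left untouched (by returning m.group(0) unchanged). The
# regex and parameter layout here are simplified stand-ins; the script above derives
# the actual pos=/inf= parameters from the verb form and the page title.
import re

def templatize_line(m):
  line = m.group(1)
  m2 = re.search(r"^Compound of '*\[\[([^\[\]|]*?)\]\]'* and '*\[\[([^\[\]|]*?)\]\]'*$", line)
  if not m2:
    return m.group(0)  # leave unrecognized lines untouched
  inf, pron = m2.groups()
  return "# {{it-compound of|%s|inf=%s}}\n" % (pron, inf)

text = "# Compound of [[vedere]] and [[si]]\n"
print(re.sub(r"^# (Compound of.*)\n", templatize_line, text, 0, re.M))
# -> # {{it-compound of|si|inf=vedere}}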
def process_text_on_page(index, pagetitle, text): global args def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) notes = [] retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Polish", pagemsg, force_final_nls=True) if retval is None: return sections, j, secbody, sectail, has_non_lang = retval # Add missing space between * and { in case of {{R:pl:WSJP}} or {{R:pl:PWN}} directly after * without space newsecbody = re.sub("^\*\{", "* {", secbody, 0, re.M) if newsecbody != secbody: notes.append("add missing space after bullet *") secbody = newsecbody # Remove trailing spaces to avoid issues with spaces after {{R:pl:WSJP}} or {{R:pl:PWN}} newsecbody = re.sub(" *\n", "\n", secbody) if newsecbody != secbody: notes.append("remove extraneous trailing spaces") secbody = newsecbody # See if there are definition lines that do not contain {{surname}}, {{given name}}, {{verbal noun of}}, # {{inflection of}} and {{infl of}}. lines = secbody.split("\n") saw_good_defn_line = False bad_templates = ["surname", "given name", "verbal noun of", "inflection of", "infl of"] for line in lines: if line.startswith("#") and not re.search(r"\{\{(%s)\|pl[|}]" % "|".join(bad_templates), line): saw_good_defn_line = True if not saw_good_defn_line: saw_bad_templates = [] for bad_template in bad_templates: if re.search(r"\{\{%s\|pl[|}]" % bad_template, secbody): saw_bad_templates.append(bad_template) if saw_bad_templates: pagemsg("Skipping page because saw no good definition lines, and saw %s" % ( " and ".join("{{%s|pl}}" % bad_template for bad_template in saw_bad_templates))) else: pagemsg("WARNING: Skipping page because saw no good definition lines; didn't see any of %s" % ( ", ".join("{{%s|pl}}" % bad_template for bad_template in bad_templates))) return subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M) # Check for templates in sections outside of 'Further reading' for k in xrange(2, len(subsections), 2): if not re.search("^==+Further reading==+\n", subsections[k - 1]): if "{{R:pl:WSJP}}" in subsections[k] or "{{R:pl:PWN}}" in subsections[k]: if re.search("^==+References==+\n", subsections[k - 1]): pagemsg("WARNING: Saw {{R:pl:WSJP}} or {{R:pl:PWN}} in %s section, can't handle" % subsections[k - 1].strip()) return else: pagemsg("WARNING: Saw {{R:pl:WSJP}} or {{R:pl:PWN}} in %s section, need to review manually" % subsections[k - 1].strip()) # Check for References or Further reading already present for k in xrange(2, len(subsections), 2): if re.search("^==+Further reading==+\n", subsections[k - 1]): newsubsecval = "===Further reading===\n" if subsections[k - 1] != newsubsecval: for l in xrange(k + 2, len(subsections), 2): if not re.search("^===Anagrams===\n", subsections[l - 1]): pagemsg("WARNING: Saw level > 3 Further reading and a following non-Anagrams section %s, can't handle" % subsections[l - 1].strip()) return notes.append("replaced %s with level-3 %s" % (subsections[k - 1].strip(), newsubsecval.strip())) subsections[k - 1] = newsubsecval newsubsec = re.sub(r"^(\* \{\{R:pl:PWN\}\}\n)(.*)(\* \{\{R:pl:WSJP\}\}\n)", r"\3\1\2", subsections[k], 0, re.M | re.S) if newsubsec != subsections[k]: notes.append("standardize order of ===Further reading=== with {{R:pl:WSJP}} followed by {{R:pl:PWN}} followed by anything else") subsections[k] = newsubsec else: has_wsjp = "{{R:pl:WSJP}}" in subsections[k] has_pwn = "{{R:pl:PWN}}" in subsections[k] if has_wsjp and not has_pwn: newsubseck = subsections[k].replace("* {{R:pl:WSJP}}\n", "* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n") if 
newsubseck == subsections[k]: pagemsg("WARNING: Unable to add {{R:pl:PWN}} after {{R:pl:WSJP}}") else: subsections[k] = newsubseck notes.append("add {{R:pl:PWN}} to Polish lemma in ===Further reading===") elif has_pwn and not has_wsjp: newsubseck = subsections[k].replace("* {{R:pl:PWN}}\n", "* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n") if newsubseck == subsections[k]: pagemsg("WARNING: Unable to add {{R:pl:WSJP}} before {{R:pl:PWN}}") else: subsections[k] = newsubseck notes.append("add {{R:pl:WSJP}} to Polish lemma in ===Further reading===") elif has_wsjp and has_pwn: pagemsg("Already has {{R:pl:WSJP}} and {{R:pl:PWN}}") else: subsections[k] = "* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n" + subsections[k] notes.append("add {{R:pl:WSJP}} and {{R:pl:PWN}} to Polish lemma in ===Further reading===") break else: # no break k = len(subsections) - 1 while k >= 2 and re.search(r"==\s*Anagrams\s*==", subsections[k - 1]): k -= 2 if k < 2: pagemsg("WARNING: No lemma or non-lemma section") return subsections[k + 1:k + 1] = ["===Further reading===\n* {{R:pl:WSJP}}\n* {{R:pl:PWN}}\n\n"] notes.append("add new ===Further reading=== section to Polish lemma with {{R:pl:WSJP}} and {{R:pl:PWN}}") secbody = "".join(subsections) # Strip extra newlines added to secbody sections[j] = secbody.rstrip("\n") + sectail return "".join(sections), notes
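# These scripts all rely on the same decomposition: re.split with a capturing group
# keeps the separators, so after splitting on header lines, odd indices hold the
# "==...==" headers and even indices hold the section bodies, and "".join(subsections)
# reconstructs the text exactly. A minimal demonstration:
import re

secbody = "intro\n===Noun===\nbody1\n===Further reading===\nbody2\n"
subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
assert subsections[0] == "intro\n"           # text before the first header
assert subsections[1] == "===Noun===\n"      # headers at odd indices
assert subsections[2] == "body1\n"           # bodies at even indices
assert "".join(subsections) == secbody       # lossless round trip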
def process_text_on_page(index, pagetitle, text): def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") notes = [] retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg) if retval is None: pagemsg("WARNING: Couldn't find Hungarian section") return sections, j, secbody, sectail, has_non_lang = retval parsed = blib.parse_text(secbody) saw_mpos_inflection_of = False for t in parsed.filter_templates(): origt = unicode(t) tn = tname(t) if tn == "inflection of": if getparam(t, "1") != "hu": pagemsg( "WARNING: Saw non-Hungarian {{inflection of}}, skipping") return for i in range(4, 30): if getparam(t, str(i)) == "(single possession)": t.add(str(i), "spos") notes.append( "(single possession) -> spos in {{inflection of|hu}}") if getparam(t, str(i)) in [ "(multiple possessions)", "(multiple possession)" ]: t.add(str(i), "mpos") notes.append( "(multiple possessions) -> mpos in {{inflection of|hu}}" ) if getparam(t, str(i)) == "mpos" and getparam( t, str(i + 1)) == "poss": saw_mpos_inflection_of = True if tn == "hu-infl-nom" and saw_mpos_inflection_of: n = getparam(t, "n") if n == "isg": pass elif n == "sg": t.add("n", "isg") notes.append( "n=sg -> n=isg in {{hu-infl-nom}} in the context of {{inflection of|hu|...|mpos|poss}}" ) else: pagemsg("WARNING: Saw strange value n=%s in %s" % (n, unicode(t))) if unicode(t) != origt: pagemsg("Replaced %s with %s" % (origt, unicode(t))) secbody = unicode(parsed) if notes and "==Etymology 1==" in secbody: pagemsg( "WARNING: Would make a change, but saw ==Etymology 1==, skipping") return sections[j] = secbody + sectail text = "".join(sections) return text, notes
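# The template surgery above (tname/getparam plus t.add and unicode() round-tripping)
# matches the mwparserfromhell API, which blib.parse_text appears to wrap -- an
# assumption for this sketch, not something the script itself states:
import mwparserfromhell

code = mwparserfromhell.parse(
  "{{inflection of|hu|ad||3|s|pres|indc|(multiple possessions)|poss}}")
for t in code.filter_templates():
  if str(t.name).strip() == "inflection of" and str(t.get("1").value) == "hu":
    for i in range(3, 10):
      if t.has(str(i)) and str(t.get(str(i)).value) == "(multiple possessions)":
        t.add(str(i), "mpos")  # replaces the positional parameter in place
print(str(code))  # {{inflection of|hu|ad||3|s|pres|indc|mpos|poss}}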
def process_text_on_page(index, pagetitle, text, lang, pos):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  cappos = pos.capitalize()
  notes = []
  pagemsg("Processing")
  retval = blib.find_modifiable_lang_section(text, lang_to_name[lang], pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find %s section" % lang_to_name[lang])
    return
  sections, j, secbody, sectail, has_non_lang = retval
  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)
  k = 1
  last_pos = None
  if "indeclinable %ss" % pos in secbody + sectail:
    pagemsg("Saw 'indeclinable %ss' in text, skipping" % pos)
    return
  while k < len(subsections):
    if re.search(r"=\s*%s\s*=" % cappos, subsections[k]):
      level = get_indentation_level(subsections[k])
      last_pos = cappos
      endk = k + 2
      while endk < len(subsections) and get_indentation_level(subsections[endk]) > level:
        endk += 2
      if endk < len(subsections) and re.search(
          r"=\s*(Declension|Inflection|Conjugation)\s*=", subsections[endk]):
        pagemsg("WARNING: Found probably misindented inflection header after ==%s== header: %s"
          % (cappos, subsections[endk].strip()))
        k = endk + 2
        continue
      pos_text = "".join(subsections[k:endk])
      parsed = blib.parse_text(pos_text)
      saw_head = False
      saw_head_form = False
      head_is_indeclinable = False
      saw_inflection_of = False
      inflt = None
      found_rfinfl = False  # currently unused
      for t in parsed.filter_templates():
        tn = tname(t)
        if re.search("^" + pos_to_headword_template[lang][pos] + "$", tn) or (
            tn == "head" and getparam(t, "1") == lang
            and getparam(t, "2") in [pos, "%ss" % pos]):
          if saw_head:
            pagemsg("WARNING: Found two heads under one POS section: second is %s" % unicode(t))
          saw_head = True
          if tn != "head" and lemma_is_indeclinable[lang](t, pagetitle, pagemsg):
            pagemsg("Headword template is indeclinable: %s" % unicode(t))
            head_is_indeclinable = True
            break
        if re.search("^" + pos_to_infl_template[lang][pos] + "$", tn):
          exclude_re = pos_to_infl_template_exclude.get(lang, {}).get(pos, None)
          if not exclude_re or not re.search("^" + exclude_re + "$", tn):
            if inflt:
              pagemsg("WARNING: Found two inflection templates under one POS section: %s and %s"
                % (unicode(inflt), unicode(t)))
            inflt = t
            pagemsg("Found %s inflection: %s" % (pos, unicode(t)))
        if tn in ["inflection of", "infl of"]:
          pagemsg("Saw 'inflection of': %s" % unicode(t))
          saw_inflection_of = True
        if (pos_to_nonlemma_template[lang]
            and re.search("^" + pos_to_nonlemma_template[lang] + "$", tn)) or (
            tn == "head" and getparam(t, "1") == lang
            and re.search(" forms?$", getparam(t, "2"))):
          pagemsg("Saw non-lemma headword template: %s" % unicode(t))
          saw_head_form = True
      if not inflt:
        pagemsg("Didn't find %s inflection" % pos)
        if saw_head_form:
          pagemsg("Saw non-lemma headword template, not adding {{rfinfl}}")
        elif saw_inflection_of:
          pagemsg("WARNING: Didn't see non-lemma headword template but saw 'inflection of'; not adding {{rfinfl}}")
        elif not saw_head:
          pagemsg("WARNING: Didn't see lemma or non-lemma headword template; not adding {{rfinfl}}")
        elif head_is_indeclinable:
          pagemsg("Headword template is indeclinable, not adding {{rfinfl}}")
        else:
          for l in xrange(k, endk, 2):
            if re.search(r"=\s*(Declension|Inflection|Conjugation)\s*=", subsections[l]):
              secparsed = blib.parse_text(subsections[l + 1])
              for t in secparsed.filter_templates():
                tn = tname(t)
                if tname(t) != "rfinfl":
                  pagemsg("WARNING: Saw unknown template %s in existing inflection section, skipping"
                    % (unicode(t)))
                  break
              else: # no break
                pagemsg("Found %s" % unicode(t))
              break
          else: # no break
            insert_k = k + 2
            while insert_k < endk and "Usage notes" in subsections[insert_k]:
              insert_k += 2
            if not subsections[insert_k - 1].endswith("\n\n"):
              subsections[insert_k - 1] = re.sub("\n*$", "\n\n", subsections[insert_k - 1] + "\n\n")
            subsections[insert_k:insert_k] = [
              "%s%s%s\n" % ("=" * (level + 1),
                "Conjugation" if pos == "verb" else "Declension", "=" * (level + 1)),
              "{{rfinfl|%s|%s}}\n\n" % (lang, pos)
            ]
            pagemsg("Inserted level-%s inflection section with {{rfinfl|%s|%s}}"
              % (level + 1, lang, pos))
            notes.append("add {{rfinfl|%s|%s}}" % (lang, pos))
            endk += 2 # for the two subsections we inserted
      k = endk
    else:
      m = re.search(
        r"=\s*(Noun|Proper noun|Pronoun|Determiner|Verb|Adverb|Adjective|Interjection|Conjunction)\s*=",
        subsections[k])
      if m:
        last_pos = m.group(1)
      if re.search(r"=\s*(Declension|Inflection|Conjugation)\s*=", subsections[k]):
        if not last_pos:
          pagemsg("WARNING: Found inflection header before seeing any parts of speech: %s"
            % (subsections[k].strip()))
        elif last_pos == cappos:
          pagemsg("WARNING: Found probably misindented inflection header after ==%s== header: %s"
            % (cappos, subsections[k].strip()))
      k += 2
  secbody = "".join(subsections)
  sections[j] = secbody + sectail
  text = "".join(sections)
  newtext = re.sub("\n\n\n+", "\n\n", text)
  if newtext != text and not notes:
    # Only claim the newline cleanup if it actually changed something.
    notes.append("convert 3+ newlines to 2")
  return newtext, notes
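# get_indentation_level is used above but defined elsewhere in these scripts; one
# plausible implementation (an assumption, shown for self-containment) counts the
# leading equals signs of the header line, the same idiom that appears inline in
# delete_form_1 below:
import re

def get_indentation_level(header):
  return len(re.sub("^(=+).*", r"\1", header.strip()))

assert get_indentation_level("===Noun===\n") == 3
assert get_indentation_level("====Declension====\n") == 4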
def delete_form_1(page, index, lemma, formind, formval, lang): notes = [] def pagemsg(txt): msg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt)) def errandpagemsg(txt): errandmsg("Page %s %s: form %s %s: %s" % (index, lemma, formind, formval, txt)) text = unicode(page.text) origtext = text retval = blib.find_modifiable_lang_section(text, lang_to_langname[lang], pagemsg) if retval is None: return sections, j, secbody, sectail, has_non_lang = retval # FIXME! #if "==Etymology 1==" in secbody: # etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M) # for k in xrange(2, len(etym_sections), 2): # etym_sections[k] = fix_up_section(etym_sections[k], warn_on_multiple_heads=True) # secbody = "".join(etym_sections) subsections_to_delete = [] subsections_to_remove_inflections_from = [] subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M) for k in xrange(2, len(subsections), 2): parsed = blib.parse_text(subsections[k]) saw_head = False saw_infl = False saw_other_infl = False remove_deletable_tag_sets_from_subsection = False saw_bad_template = False for t in parsed.filter_templates(): tn = tname(t) if tn in lang_headword_templates[lang] or ( tn == "head" and getparam(t, "1") == lang and getparam(t, "2") in form_poses): saw_head = True elif tn in inflection_of_templates: langcode = getparam(t, "1") if langcode != lang: errandpagemsg( "WARNING: In %s section, found {{%s}} for different language %s: %s" % (lang_to_langname[lang], tn, langcode, unicode(t))) return actual_lemma = getparam(t, "2") if actual_lemma == lemma: saw_infl = True else: pagemsg("Found {{%s}} for different lemma %s: %s" % (tn, actual_lemma, unicode(t))) saw_other_infl = True elif tn in lang_inflection_of_templates[lang]: actual_lemma = getparam(t, "1") if actual_lemma == lemma: saw_infl = True else: pagemsg("Found {{%s}} for different lemma %s: %s" % (tn, actual_lemma, unicode(t))) saw_other_infl = True if saw_head and saw_infl: if saw_other_infl: pagemsg( "Found subsection #%s to delete but has inflection template for different lemma or nondeletable tag set, will remove only deletable tag sets" % (k // 2)) remove_deletable_tag_sets_from_subsection = True for t in parsed.filter_templates(): tn = tname(t) if tn not in lang_headword_templates[ lang] + lang_inflection_of_templates[ lang] + inflection_of_templates and not ( tn == "head" and getparam(t, "1") == lang and getparam(t, "2") in form_poses): pagemsg( "WARNING: Saw unrecognized template in otherwise deletable subsection #%s: %s" % (k // 2, unicode(t))) saw_bad_template = True break else: # No break if re.search("===(Noun|Verb|Adjective)===", subsections[k - 1]): indent_header = subsections[k - 1].strip() indent = len(re.sub("^(=+).*", r"\1", indent_header)) has_non_deletable_subsubsection = False extra_subsubsections_to_delete = [] l = k while l + 1 < len(subsections): nextindent = len( re.sub("^(=+).*", r"\1", subsections[l + 1].strip())) if nextindent <= indent: break # Italian verb forms often have Synonyms sections for alternative forms, and random Related terms sections if re.search("==(Synonyms|Related terms)==", subsections[l + 1]): extra_subsubsections_to_delete.append(l + 2) l += 2 else: has_non_deletable_subsubsection = True pagemsg( "WARNING: Subsection #%s (header %s, indent %s) has subsubsection with header %s (indent %s), not deleting" % (l // 2, indent_header, indent, subsections[l + 1].strip(), nextindent)) break if not has_non_deletable_subsubsection: if remove_deletable_tag_sets_from_subsection: 
subsections_to_remove_inflections_from.append(k) else: subsections_to_delete.append(k) subsections_to_delete.extend( extra_subsubsections_to_delete) else: pagemsg( "WARNING: Wrong header in otherwise deletable subsection #%s: %s" % (k // 2, subsections[k - 1].strip())) if not subsections_to_delete and not subsections_to_remove_inflections_from: pagemsg("Found %s section but no deletable or excisable subsections" % lang_to_langname[lang]) return #### Now, we can delete an inflection, a subsection or the whole section or page for k in subsections_to_remove_inflections_from: newsubsec = subsections[k] if not newsubsec.endswith("\n"): # This applies to the last subsection on the page newsubsec += "\n" def remove_inflections(m): parsed = blib.parse_text(m.group(0)) for t in parsed.filter_templates(): tn = tname(t) if tn in inflection_of_templates: langcode = getparam(t, "1") assert langcode == lang actual_lemma = getparam(t, "2") if actual_lemma == lemma: return "" if tn in lang_inflection_of_templates[lang]: actual_lemma = getparam(t, "1") if actual_lemma == lemma: return "" return unicode(parsed) for tn in lang_inflection_of_templates[lang] + inflection_of_templates: newnewsubsec = re.sub(r"^# \{\{%s\|[^{}\n]*\}\}\n" % re.escape(tn), remove_inflections, newsubsec, 0, re.M) if newnewsubsec != newsubsec: newsubsec = newnewsubsec notes.append( "removed {{%s}} inflection(s) for bad %s form(s) of [[%s]]" % (tn, lang_to_langname[lang], lemma)) subsections[k] = newsubsec for k in reversed(subsections_to_delete): # Do in reverse order so indices don't change del subsections[k] del subsections[k - 1] whole_section_deletable = False if len(subsections) == 1: whole_section_deletable = True else: for k in xrange(3, len(subsections), 2): if not re.search("^==+(References|Anagrams)==+$", subsections[k].strip()): break else: # no break whole_section_deletable = True if whole_section_deletable: # Whole section deletable if subsections[0].strip(): pagemsg( "WARNING: Whole %s section deletable except that there's text above all subsections: <%s>" % (lang_to_langname[lang], subsections[0].strip())) return if "[[Category:" in sectail: pagemsg( "WARNING: Whole %s section deletable except that there's a category at the end: <%s>" % (lang_to_langname[lang], sectail.strip())) return if not has_non_lang: # Can delete the whole page, but check for non-blank section 0 cleaned_sec0 = re.sub("^\{\{also\|.*?\}\}\n", "", sections[0]) if cleaned_sec0.strip(): pagemsg( "WARNING: Whole page deletable except that there's text above all sections: <%s>" % cleaned_sec0.strip()) return pagetitle = unicode(page.title()) pagemsg("Page %s should be deleted" % pagetitle) pages_to_delete.append(pagetitle) return del sections[j] del sections[j - 1] notes.append( "excised %s subsection%s for bad %s form(s) of [[%s]], leaving no %s section" % ((len(subsections_to_delete), "" if len(subsections_to_delete) == 1 else "s", lang_to_langname[lang], lemma, lang_to_langname[lang]))) if j > len(sections): # We deleted the last section, remove the separator at the end of the # previous section. 
sections[-1] = re.sub(r"\n+--+\n*\Z", "", sections[-1]) text = "".join(sections) else: # Some but not all subsections remain secbody = "".join(subsections) sections[j] = secbody + sectail if subsections_to_delete and subsections_to_remove_inflections_from: deletable_subsec_text = "Subsection(s) %s deletable and subsection(s) %s excisable" % ( ",".join(str(k // 2) for k in subsections_to_delete), ",".join( str(k // 2) for k in subsections_to_remove_inflections_from)) deletable_subsec_note_text = "deleted %s subsection%s and partly excised %s subsection%s" % ( len(subsections_to_delete), "" if len(subsections_to_delete) == 1 else "s", len(subsections_to_remove_inflections_from), "" if len(subsections_to_remove_inflections_from) == 1 else "s") elif subsections_to_delete: deletable_subsec_text = "Subsection(s) %s deletable" % (",".join( str(k // 2) for k in subsections_to_delete)) deletable_subsec_note_text = "deleted %s subsection%s" % ( len(subsections_to_delete), "" if len(subsections_to_delete) == 1 else "s") else: deletable_subsec_text = "Subsection(s) %s excisable" % (",".join( str(k // 2) for k in subsections_to_remove_inflections_from)) deletable_subsec_note_text = "partly excised %s subsection%s" % ( len(subsections_to_remove_inflections_from), "" if len(subsections_to_remove_inflections_from) == 1 else "s") if "==Etymology" in sections[j]: pagemsg( "WARNING: %s but found Etymology subsection, don't know how to handle" % deletable_subsec_text) return if "==Pronunciation" in sections[j]: pagemsg( "WARNING: %s but found Pronunciation subsection, don't know how to handle" % deletable_subsec_text) return notes.append( "%s for bad %s form(s) of %s, leaving some subsections remaining" % (deletable_subsec_note_text, lang_to_langname[lang], lemma)) text = "".join(sections) return text, notes
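# Deleting header/body pairs in reverse index order, as above, keeps the earlier
# indices valid while the list shrinks; deleting front-to-back would shift every
# later index by two per removal. A minimal demonstration of the same pattern:
subsections = ["", "==H1==\n", "body1\n", "==H2==\n", "body2\n", "==H3==\n", "body3\n"]
subsections_to_delete = [2, 6]  # body indices; each header sits at k - 1
for k in reversed(subsections_to_delete):
  del subsections[k]
  del subsections[k - 1]
assert subsections == ["", "==H2==\n", "body2\n"]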
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc): notes = [] pagetitle = unicode(nounpage.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) text = unicode(nounpage.text) retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg) if retval is None: pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % ( desc, ",".join(new_adj_or_dims))) return sections, j, secbody, sectail, has_non_lang = retval parsed = blib.parse_text(secbody) head = None for t in parsed.filter_templates(): tn = tname(t) if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]: if head: pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" % (unicode(head), unicode(t), desc, ",".join(new_adj_or_dims))) return head = t if not head: pagemsg("WARNING: Couldn't find head for noun of %s %s" % (desc, ",".join(new_adj_or_dims))) return orig_adjs_or_dims = blib.fetch_param_chain(head, param, param) adjs_or_dims = blib.fetch_param_chain(head, param, param) added_adjs_or_dims = [] for adj_or_dim in new_adj_or_dims: if adj_or_dim in adjs_or_dims: pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, unicode(head))) else: adjs_or_dims.append(adj_or_dim) added_adjs_or_dims.append(adj_or_dim) if adjs_or_dims != orig_adjs_or_dims: orighead = unicode(head) blib.set_param_chain(head, adjs_or_dims, param, param) pagemsg("Replaced %s with %s" % (orighead, unicode(head))) notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims))) secbody = unicode(parsed) subsecs = re.split("(^==.*==\n)", secbody, 0, re.M) for k in xrange(2, len(subsecs), 2): if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]: header = re.sub("=", "", subsecs[k - 1]).strip() for adj_or_dim in adjs_or_dims: def note_removed_text(m): if m.group(1): pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" % (header, desc, adj_or_dim, m.group(0))) return "" newsubsecsk = re.sub(r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % adj_or_dim, note_removed_text, subsecs[k]) if newsubsecsk != subsecs[k]: notes.append("remove %s %s from %s" % (desc, adj_or_dim, header)) subsecs[k] = newsubsecsk subsecs[k] = re.sub(", *,", ",", subsecs[k]) # Repeat in case adjacent terms removed (unlikely though). subsecs[k] = re.sub(", *,", ",", subsecs[k]) subsecs[k] = re.sub(" *, *$", "", subsecs[k], 0, re.M) subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], 0, re.M) subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], 0, re.M) if re.search(r"^\s*$", subsecs[k]): subsecs[k] = "" subsecs[k - 1] = "" secbody = "".join(subsecs) secj = secbody + sectail newsecj = re.sub(r"\n\n\n+", "\n\n", secj) if newsecj != secj and not notes: notes.append("eliminate sequences of 3 or more newlines") secj = newsecj sections[j] = secj return "".join(sections), notes
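# fetch_param_chain/set_param_chain appear to treat "adj", "adj2", "adj3", ... as one
# logical list parameter -- an assumption about blib's convention, sketched here with
# mwparserfromhell directly:
import mwparserfromhell

def fetch_param_chain(t, first, prefix):
  values, i = [], 1
  while True:
    pn = first if i == 1 else "%s%s" % (prefix, i)
    if not t.has(pn) or not str(t.get(pn).value).strip():
      return values
    values.append(str(t.get(pn).value).strip())
    i += 1

t = mwparserfromhell.parse("{{ru-noun+|adj=но́вый|adj2=ста́рый}}").filter_templates()[0]
assert fetch_param_chain(t, "adj", "adj") == ["но́вый", "ста́рый"]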
def process_page(page, index, pos): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) cappos = pos.capitalize() notes = [] pagemsg("Processing") text = unicode(page.text) retval = blib.find_modifiable_lang_section(text, "Old English", pagemsg) if retval is None: pagemsg("WARNING: Couldn't find Old English section") return sections, j, secbody, sectail, has_non_lang = retval subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M) k = 1 last_pos = None while k < len(subsections): if re.search(r"=\s*%s\s*=" % cappos, subsections[k]): level = get_indentation_level(subsections[k]) last_pos = cappos endk = k + 2 while endk < len(subsections) and get_indentation_level( subsections[endk]) > level: endk += 2 pos_text = "".join(subsections[k:endk]) parsed = blib.parse_text(pos_text) head = None inflt = None found_rfinfl = False for t in parsed.filter_templates(): tn = tname(t) if tn == pos_to_headword_template[pos] or ( tn == "head" and getparam(t, "1") == "ang" and getparam(t, "2") in [pos, "%ss" % pos]): newhead = getparam(t, "head").strip() or pagetitle if head: pagemsg( "WARNING: Found two heads under one POS section: %s and %s" % (head, newhead)) head = newhead if tn == pos_to_new_style_infl_template[pos] or ( pos_to_old_style_infl_template_prefix[pos] and tn.startswith( pos_to_old_style_infl_template_prefix[pos])): if inflt: pagemsg( "WARNING: Found two inflection templates under one POS section: %s and %s" % (unicode(inflt), unicode(t))) inflt = t pagemsg( "Found %s inflection for headword %s: <from> %s <to> {{%s|%s}} <end>" % (pos, head or pagetitle, unicode(t), pos_to_new_style_infl_template[pos], getparam(t, "1") if pos == "verb" else head or pagetitle)) if not inflt: pagemsg( "Didn't find %s inflection for headword %s: <new> {{%s|%s%s}} <end>" % (pos, head or pagetitle, pos_to_new_style_infl_template[pos], head or pagetitle, "" if pos == "noun" else "<>")) if pages_to_infls: for l in xrange(k, endk, 2): if re.search( r"=\s*(Declension|Inflection|Conjugation)\s*=", subsections[l]): secparsed = blib.parse_text(subsections[l + 1]) for t in secparsed.filter_templates(): tn = tname(t) if tname(t) != "rfinfl": pagemsg( "WARNING: Saw unknown template %s in existing inflection section, skipping" % (unicode(t))) break else: # no break if pagetitle not in pages_to_infls: pagemsg( "WARNING: Couldn't find inflection for headword %s" % (head or pagetitle)) else: m = re.search(r"\A(.*?)(\n*)\Z", subsections[l + 1], re.S) sectext, final_newlines = m.groups() subsections[l + 1] = pages_to_infls[ pagetitle] + final_newlines pagemsg( "Replaced existing decl text <%s> with <%s>" % (sectext, pages_to_infls[pagetitle])) notes.append( "replace decl text <%s> with <%s>" % (sectext, pages_to_infls[pagetitle])) break else: # no break if pagetitle not in pages_to_infls: pagemsg( "WARNING: Couldn't find inflection for headword %s" % (head or pagetitle)) else: insert_k = k + 2 while insert_k < endk and "Usage notes" in subsections[ insert_k]: insert_k += 2 if not subsections[insert_k - 1].endswith("\n\n"): subsections[insert_k - 1] = re.sub( "\n*$", "\n\n", subsections[insert_k - 1] + "\n\n") subsections[insert_k:insert_k] = [ "%s%s%s\n" % ("=" * (level + 1), "Conjugation" if pos == "verb" else "Declension", "=" * (level + 1)), pages_to_infls[pagetitle] + "\n\n" ] pagemsg( "Inserted level-%s inflection section with inflection <%s>" % (level + 1, pages_to_infls[pagetitle])) notes.append("add decl <%s>" % pages_to_infls[pagetitle]) endk += 2 # for the two 
subsections we inserted k = endk else: m = re.search( r"=\s*(Noun|Proper noun|Pronoun|Determiner|Verb|Adjective|Adverb|Interjection|Conjunction)\s*=", subsections[k]) if m: last_pos = m.group(1) if re.search(r"=\s*(Declension|Inflection|Conjugation)\s*=", subsections[k]): if not last_pos: pagemsg( "WARNING: Found inflection header before seeing any parts of speech: %s" % (subsections[k].strip())) elif last_pos == cappos: pagemsg( "WARNING: Found probably misindented inflection header after ==%s== header: %s" % (cappos, subsections[k].strip())) k += 2 secbody = "".join(subsections) sections[j] = secbody + sectail text = "".join(sections) newtext = re.sub("\n\n\n+", "\n\n", text) if newtext != text and not notes: notes.append("convert 3+ newlines to 2") return newtext, notes
def process_text_on_page(index, pagetitle, text): global args def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose) def verify_template_is_full_line(tn, line): line = line.strip() templates = list(blib.parse_text(line).filter_templates()) if type(tn) is list: tns = tn else: tns = [tn] tntext = "/".join(tns) if len(templates) == 0: pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line)) return None t = templates[0] if tname(t) not in tns: pagemsg( "WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s" % (tntext, tntext, line)) return None if unicode(t) != line: pagemsg( "WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line)) return None return t notes = [] if len(pagetitle) == 1 or pagetitle.endswith("-"): pagemsg("Page title is a single letter or a prefix, skipping") return retval = blib.find_modifiable_lang_section( text, None if args.partial_page else "Polish", pagemsg, force_final_nls=True) if retval is None: return sections, j, secbody, sectail, has_non_lang = retval subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M) for k in xrange(1, len(subsections), 2): if re.search(r"==\s*Pronunciation\s*==", subsections[k]): secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k]) if secheader != subsections[k]: subsections[k] = secheader notes.append( "remove extraneous spaces in ==Pronunciation== header") extra_notes = [] parsed = blib.parse_text(subsections[k + 1]) num_pl_IPA = 0 saw_pl_p = False for t in parsed.filter_templates(): tn = tname(t) if tn in ["pl-p", "pl-pronunciation"]: saw_pl_p = True break if tn in ["pl-IPA", "pl-IPA-auto"]: num_pl_IPA += 1 if saw_pl_p: pagemsg("Already saw {{pl-p}}, skipping: %s" % unicode(t)) continue if num_pl_IPA == 0: pagemsg( "WARNING: Didn't see {{pl-IPA}} in Pronunciation section, skipping" ) continue if num_pl_IPA > 1: pagemsg( "WARNING: Saw multiple {{pl-IPA}} in Pronunciation section, skipping" ) continue lines = subsections[k + 1].strip().split("\n") # Remove blank lines. lines = [line for line in lines if line] hyph_lines = [] homophone_lines = [] rhyme_lines = [] audio_lines = [] must_continue = False newtemp = None next_audio_param = 0 has_respelling = False ipat = None for line in lines: origline = line # In case of "* {{pl-IPA|...}}", chop off the "* ". 
line = re.sub(r"^\*\s*(\{\{pl-IPA)", r"\1", line) if line.startswith("{{pl-IPA"): if newtemp: pagemsg( "WARNING: Something wrong, already saw {{pl-IPA}}?: %s" % origline) must_continue = True break ipat = verify_template_is_full_line( ["pl-IPA", "pl-IPA-auto"], line) if ipat is None: must_continue = True break newtemp_str = "{{pl-p}}" newtemp = list( blib.parse_text(newtemp_str).filter_templates())[0] for param in ipat.params: pn = pname(param) pv = unicode(param.value) if re.search("^[0-9]+$", pn): has_respelling = True newtemp.add(pn, pv, preserve_spacing=False) elif re.search("^qual[0-9]*$", pn): newtemp.add(pn.replace("qual", "q"), pv, preserve_spacing=False) else: pagemsg( "WARNING: Unrecognized param %s=%s in {{pl-IPA}}, skipping: %s" % (pn, pv, origline)) must_continue = True break if has_respelling: pagemsg("WARNING: {{pl-IPA}} has respelling: %s" % unicode(ipat)) if must_continue: break continue if not line.startswith("* ") and not line.startswith("*{"): pagemsg( "WARNING: Pronunciation section line doesn't start with '* ', skipping: %s" % origline) must_continue = True break if line.startswith("* "): line = line[2:] else: line = line[1:] if line.startswith("{{hyph"): hyph_lines.append(line) elif line.startswith("{{homophone") or line.startswith( "{{hmp"): homophone_lines.append(line) elif line.startswith("{{audio"): audio_lines.append(line) elif line.startswith("{{rhyme"): rhyme_lines.append(line) else: pagemsg( "WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline) must_continue = True break if has_respelling and (rhyme_lines or hyph_lines): rhyme_hyph = [] if rhyme_lines: rhyme_hyph.append("rhyme line(s) %s" % ",".join(rhyme_lines)) if hyph_lines: rhyme_hyph.append("hyphenation line(s) %s" % ",".join(hyph_lines)) # We formerly skipped these pages, but [[User:Vininn126]] requested running the bot on them. 
pagemsg("WARNING: Has respelling %s along with %s" % (ipat and unicode(ipat) or "UNKNOWN", " and ".join(rhyme_hyph))) #continue if must_continue: continue if audio_lines: must_continue = False for audio_line in audio_lines: audiot = verify_template_is_full_line("audio", audio_line) if audiot is None: must_continue = True break if getparam(audiot, "1") != "pl": pagemsg( "WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line) must_continue = True break audiofile = getparam(audiot, "2") audiogloss = getparam(audiot, "3") for param in audiot.params: pn = pname(param) pv = unicode(param.value) if pn not in ["1", "2", "3"]: pagemsg( "WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (pn, pv, audio_line)) must_continue = True break if must_continue: break if audiogloss in ["Audio", "audio"]: audiogloss = "" if not newtemp: pagemsg( "WARNING: Saw %s without {{pl-IPA}}, skipping: %s" % (unicode(audiot), audio_line)) must_continue = True break next_audio_param += 1 if next_audio_param == 1: paramsuf = "" else: paramsuf = str(next_audio_param) newtemp.add("a%s" % paramsuf, audiofile, preserve_spacing=False) if audiogloss: newtemp.add("ac%s" % paramsuf, audiogloss, preserve_spacing=False) pagemsg("Replacing %s with %s" % (unicode(audiot), unicode(newtemp))) extra_notes.append("incorporate %s into {{pl-p}}" % unicode(audiot)) if must_continue: continue if rhyme_lines: if len(rhyme_lines) > 1: pagemsg("WARNING: Multiple rhyme lines, not removing: %s" % ", ".join(rhyme_lines)) continue rhyme_line = rhyme_lines[0] rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line) if not rhymet: continue if getparam(rhymet, "1") != "pl": pagemsg( "WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line)) continue pagemsg("Ignoring rhyme line: %s" % rhyme_line) extra_notes.append("remove rhyme template %s" % unicode(rhymet)) if hyph_lines: if len(hyph_lines) > 1: pagemsg( "WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines)) continue hyph_line = hyph_lines[0] hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_line) if not hypht: continue if getparam(hypht, "1") != "pl": pagemsg( "WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_line)) continue pagemsg("Ignoring hyphenation line: %s" % hyph_line) extra_notes.append("remove hyphenation template %s" % unicode(hypht)) if homophone_lines: next_homophone_param = 0 must_continue = False for homophone_line in homophone_lines: homophones = {} homophone_qualifiers = {} hmpt = verify_template_is_full_line( ["hmp", "homophone", "homophones"], homophone_line) if not hmpt: must_continue = True break if getparam(hmpt, "1") != "pl": pagemsg( "WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line)) must_continue = True break for param in hmpt.params: pn = pname(param) pv = unicode(param.value) if not re.search("^q?[0-9]+$", pn): pagemsg( "WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s" % (pn, pv, tname(hmpt), homophone_line)) must_continue = True break if pn.startswith("q"): homophone_qualifiers[int(pn[1:])] = pv elif int(pn) > 1: homophones[int(pn) - 1] = pv if must_continue: break if not newtemp: pagemsg( "WARNING: Something wrong, saw %s without {{pl-IPA}}, skipping" % unicode(hmpt)) must_continue = True break hhs = [] hhp_args = [] for pn, pv in sorted(homophones.items()): next_homophone_param += 1 hmp_param = "" if next_homophone_param == 1 else str( next_homophone_param) hhs.append(pv) if pn in 
homophone_qualifiers: hhp_args.append(("hhp%s" % hmp_param, homophone_qualifiers[pn])) if hhs: newtemp.add("hh", ",".join(hhs)) for pn, pv in hhp_args: newtemp.add(pn, pv, preserve_spacing=False) pagemsg("Replacing %s with %s" % (unicode(hmpt), unicode(newtemp))) extra_notes.append("incorporate homophones into {{pl-p}}") if must_continue: continue pagemsg("Replaced %s with %s" % (unicode(ipat), unicode(newtemp))) all_lines = "\n".join([unicode(newtemp)]) newsubsec = "%s\n\n" % all_lines if subsections[k + 1] != newsubsec: this_notes = ["convert {{pl-IPA}} to {{pl-p}}"] + extra_notes notes.extend(this_notes) subsections[k + 1] = newsubsec secbody = "".join(subsections) # Strip extra newlines added to secbody sections[j] = secbody.rstrip("\n") + sectail return "".join(sections), notes
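# Compact sketch of the parameter mapping performed above when folding {{pl-IPA}}
# plus audio lines into a single {{pl-p}}: numbered params carry over as respellings,
# qualN becomes qN, and the Nth {{audio}} file lands in a/a2/... (with its gloss in
# ac/ac2/...). This is restated from the code above, not an independent spec; the
# helper name and inputs are illustrative.
def build_pl_p(respellings, quals, audio_files):
  parts = list(respellings)
  parts += ["q%s=%s" % ("" if i == 1 else i, q) for i, q in quals]
  parts += ["a%s=%s" % ("" if i == 1 else i, f) for i, f in enumerate(audio_files, 1)]
  return "{{pl-p|%s}}" % "|".join(parts)

assert build_pl_p(["zadanie"], [(1, "colloquial")], ["Pl-zadanie.ogg"]) == \
    "{{pl-p|zadanie|q=colloquial|a=Pl-zadanie.ogg}}"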
def process_text_on_page(index, pagetitle, text): global args def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) notes = [] retval = blib.find_modifiable_lang_section(text, None if args.partial_page else "Italian", pagemsg, force_final_nls=True) if retval is None: return sections, j, secbody, sectail, has_non_lang = retval subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M) has_etym_sections = "==Etymology 1==" in secbody saw_pronun_section_at_top = False split_pronun_sections = False saw_pronun_section_this_etym_section = False saw_existing_pron = False saw_existing_pron_this_etym_section = False etymsection = "top" if has_etym_sections else "all" etymsections_to_first_subsection = {} if etymsection == "top": after_etym_1 = False for k in xrange(2, len(subsections), 2): if "==Etymology 1==" in subsections[k - 1]: after_etym_1 = True if "==Pronunciation==" in subsections[k - 1]: if after_etym_1: split_pronun_sections = True else: saw_pronun_section_at_top = True m = re.search("==Etymology ([0-9]*)==", subsections[k - 1]) if m: etymsections_to_first_subsection[int(m.group(1))] = k msgs = [] def append_msg(txt): if txt not in msgs: msgs.append(txt) def apply_default_pronun_to_pagetitle(): respellings, this_msgs = apply_default_pronun(pagetitle) for msg in this_msgs: append_msg(msg) return respellings for k in xrange(2, len(subsections), 2): msgs = [] def check_missing_pronun(etymsection): if split_pronun_sections and not saw_existing_pron_this_etym_section: pagemsg("WARNING: Missing pronunciations in etym section %s" % etymsection) append_msg("MISSING_PRONUN") append_msg("NEW_DEFAULTED") respellings = apply_default_pronun_to_pagetitle() pagemsg("<respelling> %s: %s <end> %s" % (etymsection, " ".join(respellings), " ".join(msgs))) #pagemsg("<respelling> %s: %s <end> %s" % ("top" if has_etym_sections else "all", # " ".join(x.replace(" ", "_") for x in respellings), " ".join(msgs))) m = re.search("==Etymology ([0-9]*)==", subsections[k - 1]) if m: if etymsection != "top": check_missing_pronun(etymsection) etymsection = m.group(1) saw_pronun_section_this_etym_section = False saw_existing_pron_this_etym_section = False if "==Pronunciation " in subsections[k - 1]: pagemsg("WARNING: Saw Pronunciation N section header: %s" % subsections[k - 1].strip()) if "==Pronunciation==" in subsections[k - 1]: if saw_pronun_section_this_etym_section: pagemsg("WARNING: Saw two Pronunciation sections under etym section %s" % etymsection) if saw_pronun_section_at_top and etymsection != "top": pagemsg("WARNING: Saw Pronunciation sections both at top and in etym section %s" % etymsection) saw_pronun_section_this_etym_section = True parsed = blib.parse_text(subsections[k]) respellings = [] prev_it_IPA_t = None prev_it_pr_t = None must_continue = False for t in parsed.filter_templates(): tn = tname(t) if tn == "it-IPA": saw_existing_pron = True saw_existing_pron_this_etym_section = True if prev_it_IPA_t: pronun_lines = re.findall(r"^.*\{\{it-IPA.*$", subsections[k], re.M) pagemsg("WARNING: Saw multiple {{it-IPA}} templates in a single Pronunciation section: %s" % " ||| ".join(pronun_lines)) must_continue = True break prev_it_IPA_t = t this_respellings = [] saw_pronun = False last_numbered_param = 0 for param in t.params: pn = pname(param) pv = unicode(param.value).strip().replace(" ", "_") if re.search("^[0-9]+$", pn): last_numbered_param += 1 saw_pronun = True if pv == "+": append_msg("EXISTING_DEFAULTED") this_respellings.extend(apply_default_pronun_to_pagetitle()) else: 
append_msg("EXISTING") this_respellings.append(pv) elif re.search("^ref[0-9]*$", pn) and int(pn[3:] or "1") == last_numbered_param: m = re.search(r"^\{\{R:it:(DiPI|Olivetti|Treccani|Trec)(\|[^{}]*)?\}\}$", pv) if m: refname, refparams = m.groups() refname = "Treccani" if refname == "Trec" else refname this_respellings.append("n:%s%s" % (refname, refparams or "")) else: this_respellings.append("%s=%s" % (pn, pv)) else: this_respellings.append("%s=%s" % (pn, pv)) if not saw_pronun: append_msg("EXISTING_DEFAULTED") this_respellings.extend(apply_default_pronun_to_pagetitle()) respellings.extend(this_respellings) if tn == "it-pr": saw_existing_pron = True saw_existing_pron_this_etym_section = True if prev_it_pr_t: pronun_lines = re.findall(r"^.*\{\{it-pr.*$", subsections[k], re.M) pagemsg("WARNING: Saw multiple {{it-pr}} templates in a single Pronunciation section: %s" % " ||| ".join(pronun_lines)) must_continue = True break prev_it_pr_t = t this_respellings = [] saw_pronun = False for param in t.params: pn = pname(param) pv = unicode(param.value).strip().replace(" ", "_") if re.search("^[0-9]+$", pn): saw_pronun = True #if pv == "+": # append_msg("EXISTING_DEFAULTED") # this_respellings.extend(apply_default_pronun_to_pagetitle()) #else: def fix_ref(m): refname, refparams = m.groups() refname = "Treccani" if refname == "Trec" else refname return "<r:%s%s>" % (refname, refparams or "") pv = re.sub(r"<ref:\{\{R:it:(DiPI|Olivetti|Treccani|Trec|DOP)(\|[^{}]*)?\}\}>", fix_ref, pv) append_msg("EXISTING") this_respellings.append(pv) else: this_respellings.append("%s=%s" % (pn, pv)) if not saw_pronun: append_msg("EXISTING_DEFAULTED") #this_respellings.extend(apply_default_pronun_to_pagetitle()) this_respellings.append("+") respellings.extend(this_respellings) if must_continue: continue if args.include_defns and etymsection not in ["top", "all"]: first_etym_subsec = etymsections_to_first_subsection.get(int(etymsection), None) next_etym_subsec = etymsections_to_first_subsection.get(1 + int(etymsection), None) if first_etym_subsec is None: pagemsg("WARNING: Internal error: Unknown first etym section for =Etymology %s=" % etymsection) else: if next_etym_subsec is None: next_etym_subsec = len(subsections) defns = blib.find_defns("".join(subsections[first_etym_subsec:next_etym_subsec]), "it") append_msg("defns: %s" % ";".join(defns)) if respellings: pagemsg("<respelling> %s: %s <end> %s" % (etymsection, " ".join(respellings), " ".join(msgs))) check_missing_pronun(etymsection) if not saw_existing_pron: if args.include_defns and has_etym_sections: for etymsec in sorted(list(etymsections_to_first_subsection.keys())): msgs = [] first_etym_subsec = etymsections_to_first_subsection[etymsec] next_etym_subsec = etymsections_to_first_subsection.get(1 + etymsec, None) if next_etym_subsec is None: next_etym_subsec = len(subsections) append_msg("NEW_DEFAULTED") defns = blib.find_defns("".join(subsections[first_etym_subsec:next_etym_subsec]), "it") append_msg("defns: %s" % ";".join(defns)) respellings = apply_default_pronun_to_pagetitle() pagemsg("<respelling> %s: %s <end> %s" % (etymsec, " ".join(respellings), " ".join(msgs))) else: msgs = [] append_msg("NEW_DEFAULTED") respellings = apply_default_pronun_to_pagetitle() pagemsg("<respelling> %s: %s <end> %s" % ("top" if has_etym_sections else "all", " ".join(respellings), " ".join(msgs)))
def process_text_on_page(index, pagetitle, text): def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) notes = [] if pagetitle in hu_pages_seen: pagemsg("Skipping because already seen") return hu_pages_seen.add(pagetitle) pagemsg("Processing") retval = blib.find_modifiable_lang_section(text, "Hungarian", pagemsg) if retval is None: pagemsg("WARNING: Couldn't find Hungarian section") return sections, j, secbody, sectail, has_non_lang = retval if "==Etymology 1==" not in secbody: return etym_sections = re.split("(^===Etymology [0-9]+===\n)", secbody, 0, re.M) if len(etym_sections) < 5: pagemsg("WARNING: Not enough etym sections, found %s, expected >= 5" % len(etym_sections)) return num_lemmas = 0 num_nonlemma_forms = 0 poses_seen_per_section = defaultdict(set) for k in range(2, len(etym_sections), 2): section = etym_sections[k] parsed = blib.parse_text(section) saw_lemma = False saw_nonlemma_form = False for t in parsed.filter_templates(): tn = tname(t) p2 = getparam(t, "2") recording_lemma = None if tn in hu_lemma_template_mapping: recording_lemma = hu_lemma_template_mapping[tn] elif tn == "head" and getparam(t, "1") == "hu" and p2 in hu_lemmas: recording_lemma = hu_lemma_mapping.get(p2, p2) elif tn == "head" and getparam( t, "1" ) == "hu" and p2 and p2[-1] == "s" and p2[:-1] in hu_lemmas: recording_lemma = hu_lemma_mapping.get(p2[:-1], p2[:-1]) if recording_lemma: poses_seen_per_section[k // 2 - 1].add(recording_lemma) if not saw_lemma: num_lemmas += 1 saw_lemma = True recording_nonlemma_form = None if tn == "head" and getparam( t, "1") == "hu" and p2 in hu_nonlemma_forms: recording_nonlemma_form = p2 elif tn == "head" and getparam(t, "1") == "hu" and p2 and p2[ -1] == "s" and p2[:-1] in hu_nonlemma_forms: recording_nonlemma_form = p2[:-1] if recording_nonlemma_form: poses_seen_per_section[k // 2 - 1].add(recording_nonlemma_form) if not saw_nonlemma_form: num_nonlemma_forms += 1 saw_nonlemma_form = True if not saw_lemma and not saw_nonlemma_form: pagemsg("WARNING: In %s, didn't see lemma or non-lemma" % etym_sections[k - 1].strip()) pagemsg("Saw num_lemmas=%s, num_nonlemma_forms=%s" % (num_lemmas, num_nonlemma_forms)) if num_lemmas and num_nonlemma_forms: secbody, sectail = add_category( secbody, sectail, pagemsg, notes, "terms with lemma and non-lemma form etymologies") if num_lemmas > 1: secbody, sectail = add_category( secbody, sectail, pagemsg, notes, "terms with multiple lemma etymologies") if num_nonlemma_forms > 1: secbody, sectail = add_category( secbody, sectail, pagemsg, notes, "terms with multiple non-lemma form etymologies") pairs_seen = set() for k in range((len(etym_sections) - 1) // 2): for l in range(k + 1, (len(etym_sections) - 1) // 2): for posk in poses_seen_per_section[k]: for posl in poses_seen_per_section[l]: if posk in hu_nonlemma_forms and posl in hu_lemmas: pairs_seen.add((posl, posk)) elif ((posk in hu_lemmas and posl in hu_lemmas or posk in hu_nonlemma_forms and posl in hu_nonlemma_forms) and posk > posl): pairs_seen.add((posl, posk)) else: pairs_seen.add((posk, posl)) pagemsg("; ".join("%s: %s" % (sec + 1, ",".join(poses)) for sec, poses in sorted(poses_seen_per_section.items(), key=lambda x: x[0]))) for posk, posl in pairs_seen: hu_pos_pos_pairs[(posk, posl)] += 1 if posk == posl: secbody, sectail = add_category( secbody, sectail, pagemsg, notes, "terms with multiple %s etymologies" % posk) else: secbody, sectail = add_category( secbody, sectail, pagemsg, notes, "terms with %s and %s etymologies" % (posk, posl)) sections[j] = secbody + sectail 
return "".join(sections), notes
def process_page(index, page, spec): global args pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing pronunciation spec: %s" % spec) m = re.search("^([a-z0-9]*): (.*)$", spec) if not m: pagemsg("WARNING: Unrecognized pronunciation spec: %s" % spec) return location, pronspecs = m.groups() if (pagetitle, location) in seen_pages: pagemsg("WARNING: Already saw page, skipping") return seen_pages.add((pagetitle, location)) pronspecs = [ pronspec.replace("_", " ") for pronspec in pronspecs.split(" ") ] if args.old_it_ipa: prons = [] refs = [] have_footnotes = False next_num_pron = 0 last_num_pron = None last_footnote_param_index = None for pronspec in pronspecs: if pronspec.startswith("r:"): ref = pronspec[2:] if not re.search(r"^%s\b" % refs_re, ref): pagemsg("WARNING: Unrecognized reference %s: pronspec=%s" % (pronspec, spec)) return refs.append("{{R:it:%s}}" % ref) elif pronspec.startswith("n:"): ref = pronspec[2:] if not re.search(r"^%s\b" % refs_re, ref): pagemsg("WARNING: Unrecognized reference %s: pronspec=%s" % (pronspec, spec)) return if next_num_pron == 0: pagemsg( "WARNING: No preceding pronunciations for footnote %s: %s" % (pronspec, spec)) return reftemp = "{{R:it:%s}}" % ref if next_num_pron == last_num_pron: prons[last_footnote_param_index] += " !!! " + reftemp else: last_footnote_param_index = len(prons) last_num_pron = next_num_pron prons.append( "ref%s=%s" % ("" if next_num_pron == 1 else next_num_pron, reftemp)) have_footnotes = True else: if re.search("^ref[0-9]*=", pronspec): have_footnotes = True if "=" not in pronspec: respellings, msgs = apply_default_pronun(pronspec) if "NEED_ACCENT" in msgs: pagemsg( "WARNING: Missing accent for pronunciation %s" % pronspec) return if "Z" in msgs: pagemsg("WARNING: Unconverted z in pronunciation %s" % pronspec) return next_num_pron += 1 prons.append(pronspec) else: prons = [] refs = [] have_footnotes = False for pronspec in pronspecs: pronspec_parts = re.split("(<r:[^<>]*)", pronspec) for i, pronspec_part in enumerate(pronspec_parts): if i % 2 == 1: # a reference if pronspec_part == "<r:": # a cross-reference to another reference pronspec_parts[i] = "<ref:" else: if not re.search(r"^<r:%s\b" % refs_re, pronspec_part): pagemsg( "WARNING: Unrecognized reference %s: pronspec=%s" % (pronspec_part, spec)) return ref_template_text = pronspec_part[3:] # If the argument to the reference template is the page title, remove it. m = re.search(r"^%s\|(.*)$" % refs_re, ref_template_text) if m and m.group(2) == pagetitle: ref_template_text = m.group(1) pronspec_parts[ i] = "<ref:{{R:it:%s}}" % ref_template_text pronspec = "".join(pronspec_parts) if "<ref:" in pronspec: have_footnotes = True # <r: or original <ref: # FIXME: Verify respellings checking for NEED_ACCENT and Z, as above. 
    prons.append(pronspec)

  if not re.search("^[0-9]+$", location) and location not in ["top", "all"]:
    pagemsg("WARNING: Unrecognized location %s: pronspec=%s" % (location, spec))
    return

  notes = []

  text = unicode(page.text)
  retval = blib.find_modifiable_lang_section(text, "Italian", pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  has_etym_sections = "==Etymology 1==" in secbody
  if has_etym_sections and location == "all":
    pagemsg("WARNING: With ==Etymology 1==, location cannot be 'all': %s" % spec)
    return
  if not has_etym_sections and location != "all":
    pagemsg("WARNING: Without split etymology sections, location must be 'all': %s" % spec)
    return

  def construct_new_pron_template():
    if args.old_it_ipa:
      return "{{it-IPA|%s}}" % "|".join(prons), "* "
    else:
      return "{{it-pr|%s}}" % "|".join(prons), ""

  def insert_into_existing_pron_section(k):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "it-IPA" and args.old_it_ipa:
        origt = unicode(t)
        # Compute set of current reference params
        current_refs = set()
        for param in t.params:
          pn = pname(param)
          m = re.search("^n([0-9]*)$", pn)
          if m:
            current_refs.add(m.group(1) or "1")
        # Compute params to add along with set of new reference params
        params_to_add = []
        new_refs = set()
        nextparam = 0
        for param in prons:
          if "=" in param:
            pn, pv = param.split("=", 1)
          else:
            nextparam += 1
            pn = str(nextparam)
            pv = param
          m = re.search("^n([0-9]*)$", pn)
          if m:
            new_refs.add(m.group(1) or "1")
          params_to_add.append((pn, pv))
        # Make sure we're not removing references
        if len(current_refs - new_refs) > 0 and not args.override_refs:
          pagemsg("WARNING: Saw existing refs not in new refs, not removing: existing=%s, new=%s"
            % (origt, "{{it-IPA|%s}}" % "|".join(prons)))
          return False
        # Now change the params
        del t.params[:]
        for pn, pv in params_to_add:
          t.add(pn, pv)
        if origt != unicode(t):
          pagemsg("Replaced %s with %s" % (origt, unicode(t)))
          notes.append("replace existing %s with %s (manually assisted)" % (origt, unicode(t)))
        subsections[k] = unicode(parsed)
        break
      if tn == "it-pr" and not args.old_it_ipa:
        origt = unicode(t)
        # Now change the params
        del t.params[:]
        for pn, pv in enumerate(prons):
          t.add(str(pn + 1), pv)
        if origt != unicode(t):
          # Make sure we're not removing references
          if "<ref:" in origt and not args.override_refs:
            pagemsg("WARNING: Saw existing refs not in new refs, not removing: existing=%s, new=%s"
              % (origt, unicode(t)))
            return False
          # Make sure we're not removing audio or other modifiers
          if re.search("<(audio|hmp|rhyme|hyph|pre|post):", origt) and not args.override_refs:
            pagemsg("WARNING: Saw existing audio/hmp/rhyme/hyph/pre/post not in new refs, not removing: existing=%s, new=%s"
              % (origt, unicode(t)))
            return False
          pagemsg("Replaced %s with %s" % (origt, unicode(t)))
          notes.append("replace existing %s with %s (manually assisted)" % (origt, unicode(t)))
        subsections[k] = unicode(parsed)
        break
    else: # no break
      new_pron_template, pron_prefix = construct_new_pron_template()
      if not args.old_it_ipa:
        # Remove existing rhymes/hyphenation/it-IPA lines
        for template in ["rhyme|it", "rhymes|it", "it-IPA", "hyph|it", "hyphenation|it"]:
          re_template = template.replace("|", r"\|")
          regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
          m = re.search(regex, subsections[k], re.M)
          if m:
            pagemsg("Removed existing %s" % m.group(1).strip())
            notes.append("remove existing {{%s}}" % template)
            subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
      subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
      notes.append("insert %s into existing Pronunciation section (manually assisted)" % new_pron_template)
    return True

  def insert_new_l3_pron_section(k):
    new_pron_template, pron_prefix = construct_new_pron_template()
    subsections[k:k] = ["===Pronunciation===\n", pron_prefix + new_pron_template + "\n\n"]
    notes.append("add top-level Italian pron %s (manually assisted)" % new_pron_template)

  if location == "all":
    for k in xrange(2, len(subsections), 2):
      if "==Pronunciation==" in subsections[k - 1]:
        if not insert_into_existing_pron_section(k):
          return
        break
    else: # no break
      k = 2
      while k < len(subsections) and re.search("==(Alternative forms|Etymology)==", subsections[k - 1]):
        k += 2
      if k - 1 >= len(subsections):
        pagemsg("WARNING: No lemma or non-lemma section at top level")
        return
      insert_new_l3_pron_section(k - 1)
  elif location == "top":
    for k in xrange(2, len(subsections), 2):
      if "==Pronunciation==" in subsections[k - 1]:
        if not insert_into_existing_pron_section(k):
          return
        break
    else: # no break
      for k in xrange(2, len(subsections), 2):
        if "==Etymology 1==" in subsections[k - 1]:
          insert_new_l3_pron_section(k - 1)
          break
      else: # no break
        pagemsg("WARNING: Something wrong, location == 'top' but can't find Etymology 1 section")
        return
  else:
    begin_etym_n_section = None

    def insert_pron_section_in_etym_section():
      k = begin_etym_n_section + 2
      while k < len(subsections) and re.search("==Alternative forms==", subsections[k - 1]):
        k += 2
      if k - 1 >= len(subsections):
        pagemsg("WARNING: No lemma or non-lemma section in Etymology N section: %s"
          % subsections[begin_etym_n_section].strip())
        return
      new_pron_template, pron_prefix = construct_new_pron_template()
      subsections[k - 1:k - 1] = ["====Pronunciation====\n", pron_prefix + new_pron_template + "\n\n"]
      notes.append("add Italian pron %s to Etymology %s (manually assisted)" % (new_pron_template, location))

    for k in xrange(2, len(subsections), 2):
      if "==Etymology %s==" % location in subsections[k - 1]:
        begin_etym_n_section = k
      elif re.search("==Etymology [0-9]", subsections[k - 1]):
        if begin_etym_n_section:
          # We encountered the next Etymology section and didn't see Pronunciation; insert a Pronunciation section.
          insert_pron_section_in_etym_section()
          break
      elif begin_etym_n_section and "==Pronunciation==" in subsections[k - 1]:
        if not insert_into_existing_pron_section(k):
          return
        break
    else: # no break
      # We reached the end.
      if begin_etym_n_section:
        # We found the Etymology section to insert in; it was the last one and didn't see Pronunciation.
        # Insert a pronunciation section.
        insert_pron_section_in_etym_section()
      else:
        pagemsg("WARNING: Didn't find Etymology N section for location=%s: spec=%s" % (location, spec))
        return

  if refs or have_footnotes:
    # Check for refs in References or Further reading embedded in Etym section
    begin_etym_n_section = None
    for k in xrange(2, len(subsections), 2):
      if "==Etymology %s==" % location in subsections[k - 1]:
        begin_etym_n_section = k - 1
      elif re.search("==Etymology [0-9]", subsections[k - 1]):
        # next etym section
        break
      elif begin_etym_n_section:
        if refs and re.search(r"====\s*(References|Further reading)\s*====", subsections[k - 1]):
          # Found References or Further reading embedded in Etym section
          pagemsg("Found %s in Etymology %s section" % (subsections[k - 1].strip(), location))
          needed_refs = []
          for ref in refs:
            if ref in subsections[k]:
              pagemsg("Already found %s in %s section %s under Etymology %s"
                % (ref, subsections[k - 1].strip(), k // 2, location))
            else:
              needed_refs.append(ref)
          refs = needed_refs
        if have_footnotes and re.search(r"====\s*References\s*====", subsections[k - 1]):
          # Check for <references/> in References embedded in Etym section
          if re.search(r"<references\s*/?\s*>", subsections[k]):
            pagemsg("Already found <references /> in ===References=== section %s under Etymology %s"
              % (k // 2, location))
            have_footnotes = False

  if refs:
    # Check for references already present
    for k in xrange(2, len(subsections), 2):
      if re.search("^===(References|Further reading)===\n", subsections[k - 1]):
        needed_refs = []
        for ref in refs:
          if ref in subsections[k]:
            pagemsg("Already found %s in %s section %s" % (ref, subsections[k - 1].strip(), k // 2))
          else:
            needed_refs.append(ref)
        refs = needed_refs

  if refs:
    added_ref_text = "\n".join("* " + ref for ref in refs) + "\n\n"
    # Still some references, need to add them to existing References section or create new one
    for k in xrange(2, len(subsections), 2):
      if re.search("^===References===\n", subsections[k - 1]):
        subsections[k] = subsections[k].rstrip("\n") + "\n" + added_ref_text
        notes.append("add Italian pronun reference%s %s to existing ===References=== section"
          % ("s" if len(refs) > 1 else "", ", ".join(refs)))
        break
    else: # no break
      k = len(subsections) - 1
      while k >= 2 and re.search(r"==\s*(Anagrams|Further reading)\s*==", subsections[k - 1]):
        k -= 2
      if k < 2:
        pagemsg("WARNING: No lemma or non-lemma section")
        return
      subsections[k + 1:k + 1] = ["===References===\n", added_ref_text]
      notes.append("add new ===References=== section for pron reference%s %s"
        % ("s" if len(refs) > 1 else "", ", ".join(refs)))

  if have_footnotes:
    # Need <references/>; check if already present
    for k in xrange(len(subsections) - 1, 2, -2):
      if re.search(r"^===\s*References\s*===$", subsections[k - 1].strip()):
        if re.search(r"<references\s*/?\s*>", subsections[k]):
          pagemsg("Already found <references /> in ===References=== section %s" % (k // 2))
        else:
          subsections[k] = "<references />\n" + subsections[k]
          notes.append("add <references /> to existing ===References=== section for pron footnotes")
        break
    else: # no break
      k = len(subsections) - 1
      while k >= 2 and re.search(r"==\s*(Anagrams|Further reading)\s*==", subsections[k - 1]):
        k -= 2
      if k < 2:
        pagemsg("WARNING: No lemma or non-lemma section")
        return
      subsections[k + 1:k + 1] = ["===References===\n", "<references />\n\n"]
      notes.append("add new ===References=== section for pron footnotes")

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
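# A standalone sketch (the sample text is hypothetical) of the header-splitting idiom
# shared by these scripts: re.split() on a *capturing* group of header lines returns
# [preamble, header1, body1, header2, body2, ...], so odd indices are headers, even
# indices are bodies, and "".join() reassembles the section losslessly.
import re

sample = "intro\n===Pronunciation===\npron body\n===References===\nref body\n"
subsections = re.split("(^==+[^=\n]+==+\n)", sample, 0, re.M)
for k in xrange(1, len(subsections), 2):
  print("header %s has body %r" % (subsections[k].strip(), subsections[k + 1]))
assert "".join(subsections) == sample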
def process_text_on_page(index, pagetitle, text):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  retval = blib.find_modifiable_lang_section(text, "Georgian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Georgian section")
    return
  sections, j, secbody, sectail, has_non_lang = retval

  #newtext = re.sub(r"====[ ]?Declension[ ]?====\n\{\{ka-decl-adj-auto\}\}\n", "", secbody)
  #newtext = re.sub(r"====[ ]?Declension[ ]?====\n\{\{ka-adj-decl.*?\}\}\n", "", newtext)
  #if secbody != newtext:
  #  notes.append("remove Georgian adjectival declension for noun")
  #  secbody = newtext

  # {{ka-noun-c}}, {{ka-noun-a}}, {{ka-noun-o}}, {{ka-noun-u}} and {{ka-noun-e}} all
  # convert the same way: a 'plural' param maps to {{ka-infl-noun|-}}, otherwise to
  # {{ka-infl-noun}}.
  for letter in ["c", "a", "o", "u", "e"]:
    newtext = re.sub(r"\{\{ka-noun-%s\|.*plural.*\}\}" % letter, "{{ka-infl-noun|-}}", secbody)
    newtext = re.sub(r"\{\{ka-noun-%s\|.*\}\}" % letter, "{{ka-infl-noun}}", newtext)
    if secbody != newtext:
      notes.append("convert {{ka-noun-%s}} to {{ka-infl-noun}}" % letter)
      secbody = newtext

  newtext = re.sub(r"\{\{ka-noun-c-2\|.*?\|.*?\|(.*?)\|.*plural.*\}\}", r"{{ka-infl-noun|\1|-}}", secbody)
  newtext = re.sub(r"\{\{ka-noun-c-2\|.*?\|.*?\|(.*?)\|.*\}\}", r"{{ka-infl-noun|\1}}", newtext)
  if secbody != newtext:
    notes.append("convert {{ka-noun-c-2}} to {{ka-infl-noun}}")
    secbody = newtext

  #newtext = re.sub(r"==\s*Declension\s*==", "==Inflection==", secbody)
  #if secbody != newtext:
  #  notes.append("==Declension== -> ==Inflection== in Georgian section")
  #  secbody = newtext

  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, program_args.verbose)

  def verify_template_is_full_line(tn, line):
    templates = list(blib.parse_text(line).filter_templates())
    if type(tn) is list:
      tns = tn
    else:
      tns = [tn]
    tntext = "/".join(tns)
    if len(templates) == 0:
      pagemsg("WARNING: No templates on {{%s}} line?, skipping: %s" % (tntext, line))
      return None
    t = templates[0]
    if tname(t) not in tns:
      pagemsg("WARNING: Putative {{%s}} line doesn't have {{%s...}} as the first template, skipping: %s"
        % (tntext, tntext, line))
      return None
    if unicode(t) != line:
      pagemsg("WARNING: {{%s}} line has text other than {{%s...}}, skipping: %s" % (tntext, tntext, line))
      return None
    return t

  notes = []

  retval = blib.find_modifiable_lang_section(text, None if program_args.partial_page else "Italian",
    pagemsg, force_final_nls=True)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_lang = retval

  subsections = re.split("(^==+[^=\n]+==+\n)", secbody, 0, re.M)

  sect_for_wiki = 0
  for k in xrange(1, len(subsections), 2):
    if re.search(r"==\s*Etymology [0-9]+\s*==", subsections[k]):
      sect_for_wiki = k + 1
    elif re.search(r"==\s*Pronunciation\s*==", subsections[k]):
      secheader = re.sub(r"\s*Pronunciation\s*", "Pronunciation", subsections[k])
      if secheader != subsections[k]:
        subsections[k] = secheader
        notes.append("remove extraneous spaces in ==Pronunciation== header")
      extra_notes = []
      parsed = blib.parse_text(subsections[k + 1])
      num_it_IPA = 0
      saw_it_pr = False
      for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["it-pr", "it-pronunciation"]:
          saw_it_pr = True
          break
        if tn == "it-IPA":
          num_it_IPA += 1
      if saw_it_pr:
        pagemsg("Already saw {{it-pr}}, skipping: %s" % unicode(t))
        continue
      if num_it_IPA == 0:
        pagemsg("WARNING: Didn't see {{it-IPA}} in Pronunciation section, skipping")
        continue
      if num_it_IPA > 1:
        pagemsg("WARNING: Saw multiple {{it-IPA}} in Pronunciation section, skipping")
        continue

      lines = subsections[k + 1].strip().split("\n")
      # Remove blank lines.
      lines = [line for line in lines if line]
      hyph_lines = []
      homophone_lines = []
      rfap_lines = []
      rhyme_lines = []
      must_continue = False
      audioarg = ""
      args = []
      bare_args = []
      args_for_hyph = []
      lines_so_far = []
      for lineind, line in enumerate(lines):
        origline = line
        lines_so_far.append(line)
        # In case of "* {{it-IPA|...}}", chop off the "* ".
        line = re.sub(r"^\*\s*(\{\{it-IPA)", r"\1", line)
        if line.startswith("{{it-IPA"):
          if args:
            pagemsg("WARNING: Something wrong, already saw {{it-IPA}}?: %s" % origline)
            must_continue = True
            break
          outer_ref_arg = None
          m = re.search("^(.*?) *<ref>(.*?)</ref>$", line)
          if m:
            line, outer_ref_arg = m.groups()
          ipat = verify_template_is_full_line("it-IPA", line)
          if ipat is None:
            must_continue = True
            break
          bare_args = blib.fetch_param_chain(ipat, "1") or [u"+"]
          bare_args = [u"+" if arg == pagetitle else arg for arg in bare_args]
          bare_args = [adjust_initial_capital(arg, pagetitle, pagemsg, origline) for arg in bare_args]
          bare_args = [re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], arg) for arg in bare_args]
          normalized_bare_args = [
            normalize_bare_arg(arg, pagetitle, lambda msg: pagemsg("%s: %s" % (msg, origline)))
            for arg in bare_args
          ]
          if None in normalized_bare_args:
            must_continue = True
            break
          args = [x for x in bare_args]

          args_for_hyph = []
          for arg in normalized_bare_args:
            hypharg = (
              arg.replace("ddz", "zz").replace("tts", "zz").replace("dz", "z").replace("ts", "z")
              .replace("Dz", "Z").replace("Ts", "Z").replace("[s]", "s").replace("[z]", "z")
            )
            hypharg = re.sub(pron_sign_c, "", hypharg)
            putative_pagetitle = remove_secondary_stress(hypharg.replace(".", "").replace("_", ""))
            putative_pagetitle = remove_non_final_accents(putative_pagetitle)
            # Check if the normalized pronunciation is the same as the page title, if so use the semi-normalized
            # pronunciation for hyphenation. If a word in the page title is a single syllable, it may or may not
            # have an accent on it, so also remove final monosyllabic accents from the normalized pronunciation
            # when comparing. (Don't remove from both normalized pronunciation and page title because we don't want
            # pronunciation rè to match page title ré or vice versa.)
            if putative_pagetitle == pagetitle or remove_final_monosyllabic_accents(putative_pagetitle) == pagetitle:
              args_for_hyph.append(hypharg)

          for param in ipat.params:
            pn = pname(param)
            pv = unicode(param.value)
            if re.search("^[0-9]+$", pn):
              continue
            m = re.search("^(ref|qual)([0-9]*)$", pn)
            if m:
              parampref, argnum = m.groups()
              argnum = int(argnum or "1") - 1
              if argnum >= len(args):
                pagemsg("WARNING: Argument %s=%s specifies nonexistent pronun, skipping: %s" % (pn, pv, origline))
                must_continue = True
                break
              args[argnum] += "<%s:%s>" % (parampref, pv)
            else:
              pagemsg("WARNING: Unrecognized param %s=%s in {{it-IPA}}, skipping: %s" % (pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break

          if outer_ref_arg:
            if "<ref:" in args[-1]:
              pagemsg("WARNING: Trying to add outside ref %s into {{it-IPA}} but already has ref in arg %s, skipping: %s"
                % (outer_ref_arg, args[-1], origline))
              must_continue = True
              break
            else:
              args[-1] += "<ref:%s>" % outer_ref_arg
              extra_notes.append("incorporate outer <ref>...</ref> into {{it-pr}}")
          continue

        if line.startswith("{{rfap"):
          line = "* " + line
        if line.startswith("{{wiki"):
          subsections[sect_for_wiki] = line + "\n" + subsections[sect_for_wiki]
          # Remove the {{wikipedia}} line from lines seen so far. Put back the remaining lines in case we
          # run into a problem later on, so we don't end up duplicating the {{wikipedia}} line. We accumulate
          # lines like this in case for some reason we have two {{wikipedia}} lines in the Pronunciation section.
          del lines_so_far[-1]
          subsections[k + 1] = "%s\n\n" % "\n".join(lines_so_far + lines[lineind + 1:])
          notes.append("move {{wikipedia}} line to top of etym section")
          continue
        if not line.startswith("* ") and not line.startswith("*{"):
          pagemsg("WARNING: Pronunciation section line doesn't start with '* ', skipping: %s" % origline)
          must_continue = True
          break
        if line.startswith("* "):
          line = line[2:]
        else:
          line = line[1:]
        if line.startswith("{{hyph"):
          hyph_lines.append("* " + line)
        elif line.startswith("{{homophone"):
          homophone_lines.append("* " + line)
        elif line.startswith("{{rfap"):
          rfap_lines.append(line)
        elif line.startswith("{{audio"):
          audiot = verify_template_is_full_line("audio", line)
          if audiot is None:
            must_continue = True
            break
          if getparam(audiot, "1") != "it":
            pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % origline)
            must_continue = True
            break
          audiofile = getparam(audiot, "2")
          audiogloss = getparam(audiot, "3")
          for param in audiot.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn not in ["1", "2", "3"]:
              pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (pn, pv, origline))
              must_continue = True
              break
          if must_continue:
            break
          if audiogloss in ["Audio", "audio"]:
            audiogloss = ""
          if audiogloss:
            audiogloss = ";%s" % audiogloss
          audiopart = "<audio:%s%s>" % (audiofile, audiogloss)
          audioarg += audiopart
          pagemsg("Replacing %s with argument part %s" % (unicode(audiot), audiopart))
          extra_notes.append("incorporate %s into {{it-pr}}" % unicode(audiot))
        elif line.startswith("{{rhyme"):
          rhyme_lines.append(line)
        elif remove_accents(line) == remove_accents(pagetitle):
          pagemsg("Ignoring Pronunciation section line that looks like a possibly-accented page title: %s" % origline)
        else:
          pagemsg("WARNING: Unrecognized Pronunciation section line, skipping: %s" % origline)
          must_continue = True
          break
      if must_continue:
        continue

      if rhyme_lines:
        rhyme_error = False
        rhyme_pronuns = []
        for bare_arg in normalized_bare_args:
          pronun = expand_text(u"{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" % re.sub(pron_sign_c, "", bare_arg))
          if not pronun:
            rhyme_error = True
            break
          rhyme_pronun = (
            re.sub(u"^[^aeiouɛɔ]*", "", re.sub(u".*[ˌˈ]", "", pronun)).replace(TIE, "").replace(".", ""))
          if rhyme_pronun not in rhyme_pronuns:
            rhyme_pronuns.append(rhyme_pronun)
        if not rhyme_error:
          saw_non_matching_rhyme = False
          normalized_rhymes = []
          rhyme_line_text = ", ".join(rhyme_lines)
          normalized_bare_arg_text = ",".join(normalized_bare_args)
          rhyme_pronun_text = ",".join(rhyme_pronuns)
          for rhyme_line in rhyme_lines:
            rhymet = verify_template_is_full_line(["rhyme", "rhymes"], rhyme_line)
            if not rhymet:
              break
            if getparam(rhymet, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(rhymet), rhyme_line))
              break
            rhymes = []
            must_break = False
            num_syl = ""
            rhyme_specific_num_syl = []
            for param in rhymet.params:
              pn = pname(param)
              pv = unicode(param.value)
              if not re.search("^s?[0-9]*$", pn):
                pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s"
                  % (pn, pv, tname(rhymet), rhyme_line))
                must_break = True
                break
              if pn == "s":
                num_syl = "<s:%s>" % pv
              elif pn.startswith("s"):
                rhyme_no = int(pn[1:]) - 1
                rhyme_specific_num_syl.append((rhyme_no, pv))
              elif int(pn) > 1:
                if pv:
                  rhymes.append([pv, ""])
            if must_break:
              break
            for rhyme_no, this_num_syl in rhyme_specific_num_syl:
              if rhyme_no >= len(rhymes):
                pagemsg("WARNING: Argument s%s=%s specifies nonexistent rhyme, skipping: %s"
                  % (rhyme_no + 1, this_num_syl, rhyme_line))
                must_break = True
                break
              rhymes[rhyme_no][1] = "<s:%s>" % this_num_syl
            if must_break:
              break
            for rhyme, this_num_syl in rhymes:
              normalized_rhyme = re.sub(u"([aeɛoɔu])i", r"\1j", rhyme).replace("sm", "zm")
              normalized_rhyme = re.sub(u"a[uu̯](" + C + ")", r"aw\1", normalized_rhyme)
              this_num_syl = this_num_syl or num_syl
              if this_num_syl and not args_for_hyph and not hyph_lines:
                pagemsg("WARNING: Explicit number of syllables %s given for explicit rhyme %s and no default or explicit hyphenation: %s"
                  % (this_num_syl, rhyme, rhyme_line_text))
                saw_non_matching_rhyme = True
                normalized_rhymes.append(normalized_rhyme + this_num_syl)
              else:
                normalized_rhymes.append(normalized_rhyme)
              if rhyme in rhyme_pronuns:
                pagemsg("Removing explicit rhyme %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                  % (rhyme, normalized_bare_arg_text, rhyme_line_text))
              elif normalized_rhyme in rhyme_pronuns:
                pagemsg("Removing explicit rhyme %s normalized to %s, same as pronunciation-based rhyme for spelling(s) '%s': %s"
                  % (rhyme, normalized_rhyme, normalized_bare_arg_text, rhyme_line_text))
              elif rhyme != normalized_rhyme:
                pagemsg("WARNING: Explicit rhyme %s normalized to %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                  % (rhyme, normalized_rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                saw_non_matching_rhyme = True
              else:
                pagemsg("WARNING: Explicit rhyme %s not same as pronunciation-based rhyme(s) (%s) for spelling(s) '%s': %s"
                  % (rhyme, rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
                saw_non_matching_rhyme = True
          else: # no break
            if saw_non_matching_rhyme:
              pagemsg("Not all explicit rhymes (%s) could be matched against pronunciation-based rhyme(s) (%s) for spelling(s) '%s', adding explicitly: %s"
                % (",".join(normalized_rhymes), rhyme_pronun_text, normalized_bare_arg_text, rhyme_line_text))
              args[-1] += "<rhyme:%s>" % ",".join(normalized_rhymes)
              extra_notes.append("incorporate non-default rhymes into {{it-pr}}")
            else:
              extra_notes.append("remove rhymes that are generated automatically by {{it-pr}}")
            rhyme_lines = []

      if not args:
        pagemsg("WARNING: Something wrong, didn't see {{it-IPA}}?")
        continue
      args[-1] += audioarg

      if hyph_lines:
        if len(hyph_lines) > 1:
          pagemsg("WARNING: Multiple hyphenation lines, not removing: %s" % ", ".join(hyph_lines))
        else:
          assert hyph_lines[0].startswith("* ")
          hyph_line = hyph_lines[0][2:]
          hyph_templates = re.split(", *", hyph_line)
          hyphs = []
          for hyph_template in hyph_templates:
            hypht = verify_template_is_full_line(["hyph", "hyphenation"], hyph_template)
            if not hypht:
              break
            syls = []
            if getparam(hypht, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hypht), hyph_template))
              break
            else:
              must_break = False
              for param in hypht.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^[0-9]+$", pn) and pn != "nocaption":
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s"
                    % (pn, pv, tname(hypht), hyph_line))
                  must_break = True
                  break
                if pn != "nocaption" and int(pn) > 1:
                  if not pv:
                    hyphs.append(syls)
                    syls = []
                  else:
                    syls.append(pv)
              if must_break:
                break
              if syls:
                hyphs.append(syls)
          else: # no break
            if hyphs:
              specified_hyphenations = [".".join(syls) for syls in hyphs]
              specified_hyphenations = [
                re.sub(u"([áíúÁÍÚ])", lambda m: acute_to_grave[m.group(1)], hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub("''+", "", hyph) for hyph in specified_hyphenations]
              specified_hyphenations = [
                adjust_initial_capital(hyph, pagetitle, pagemsg, hyph_line) for hyph in specified_hyphenations]
              specified_hyphenations = [re.sub(u"î([ -]|$)", r"i\1", hyph) for hyph in specified_hyphenations]
              hyphenations = [syllabify_from_spelling(arg) for arg in args_for_hyph]
              if set(specified_hyphenations) < set(hyphenations):
                pagemsg("Removing explicit hyphenation(s) %s that are a subset of auto-hyphenation(s) %s: %s"
                  % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
              elif set(specified_hyphenations) != set(hyphenations):
                hyphenations_without_accents = [remove_accents(hyph) for hyph in hyphenations]
                rehyphenated_specified_hyphenations = [syllabify_from_spelling(hyph) for hyph in specified_hyphenations]
                def indices_of_syllable_markers(hyph):
                  # Get the character indices of the syllable markers, but not counting the syllable markers themselves
                  # (i.e. return the number of characters preceding the syllable marker).
                  raw_indices = [ind for ind, ch in enumerate(hyph) if ch == "."]
                  adjusted_indices = [ind - offset for offset, ind in enumerate(raw_indices)]
                  return set(adjusted_indices)
                if set(specified_hyphenations) == set(hyphenations_without_accents):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing accents but otherwise same as auto-hyphenation(s) %s: %s"
                    % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif set(rehyphenated_specified_hyphenations) == set(hyphenations):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified by rehyphenation): %s"
                    % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                elif (len(specified_hyphenations) == 1 and len(hyphenations) == 1
                    and specified_hyphenations[0].replace(".", "") == hyphenations[0].replace(".", "")
                    and indices_of_syllable_markers(specified_hyphenations[0]) < indices_of_syllable_markers(hyphenations[0])):
                  pagemsg("Removing explicit hyphenation(s) %s that are missing syllable breaks but otherwise same as auto-hyphenation(s) %s (verified that explicit hyphenation indices are subset of auto-hyphenation indices): %s"
                    % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                else:
                  if not hyphenations:
                    pagemsg("WARNING: Explicit hyphenation(s) %s but no auto-hyphenations, adding explicitly: %s"
                      % (",".join(specified_hyphenations), hyph_line))
                  else:
                    pagemsg("WARNING: Explicit hyphenation(s) %s not equal to auto-hyphenation(s) %s, adding explicitly: %s"
                      % (",".join(specified_hyphenations), ",".join(hyphenations), hyph_line))
                  args[-1] += "<hyph:%s>" % ",".join(specified_hyphenations)
                  extra_notes.append("incorporate non-default hyphenations into {{it-pr}}")
              else:
                pagemsg("Removed explicit hyphenation(s) same as auto-hyphenation(s): %s" % hyph_line)
                extra_notes.append("remove hyphenations that are generated automatically by {{it-pr}}")
              hyph_lines = []

      if homophone_lines:
        if len(homophone_lines) > 1:
          pagemsg("WARNING: Multiple homophone lines, not removing: %s" % ", ".join(homophone_lines))
        else:
          assert homophone_lines[0].startswith("* ")
          homophone_line = homophone_lines[0][2:]
          homophones = {}
          homophone_qualifiers = {}
          hmpt = verify_template_is_full_line(["hmp", "homophone", "homophones"], homophone_line)
          if hmpt:
            if getparam(hmpt, "1") != "it":
              pagemsg("WARNING: Wrong language in {{%s}}, not removing: %s" % (tname(hmpt), homophone_line))
            else:
              for param in hmpt.params:
                pn = pname(param)
                pv = unicode(param.value)
                if not re.search("^q?[0-9]+$", pn):
                  pagemsg("WARNING: Unrecognized param %s=%s in {{%s}}, not removing: %s"
                    % (pn, pv, tname(hmpt), homophone_line))
                  break
                if pn.startswith("q"):
                  homophone_qualifiers[int(pn[1:])] = pv
                elif int(pn) > 1:
                  homophones[int(pn) - 1] = pv
              else: # no break
                hmp_args = []
                for pn, pv in sorted(homophones.items()):
                  hmp_args.append(pv)
                  if pn in homophone_qualifiers:
                    hmp_args[-1] += "<qual:%s>" % homophone_qualifiers[pn]
                args[-1] += "<hmp:%s>" % ",".join(hmp_args)
                extra_notes.append("incorporate homophones into {{it-pr}}")
                homophone_lines = []

      if args == ["+"]:
        it_pr = "{{it-pr}}"
      else:
        it_pr = "{{it-pr|%s}}" % ",".join(args)
      pagemsg("Replaced %s with %s" % (unicode(ipat), it_pr))

      all_lines = "\n".join([it_pr] + rhyme_lines + rfap_lines + hyph_lines + homophone_lines)
      newsubsec = "%s\n\n" % all_lines
      if subsections[k + 1] != newsubsec:
        this_notes = ["convert {{it-IPA}} to {{it-pr}}"] + extra_notes
        notes.extend(this_notes)
        subsections[k + 1] = newsubsec

  secbody = "".join(subsections)
  # Strip extra newlines added to secbody
  sections[j] = secbody.rstrip("\n") + sectail
  return "".join(sections), notes
def find_latin_section(text, pagemsg):
  return blib.find_modifiable_lang_section(text, "Latin", pagemsg)
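# A short usage sketch (illustrative only, not one of the conversion scripts): every
# script in this file consumes the return value of blib.find_modifiable_lang_section(),
# or a wrapper like find_latin_section() above, the same way. The tuple layout is
# taken from the calls in this file; the name example_consumer is hypothetical.
def example_consumer(text, pagemsg):
  retval = find_latin_section(text, pagemsg)
  if retval is None:
    return  # page has no Latin section
  sections, j, secbody, sectail, has_non_lang = retval
  # ... edit secbody here; sectail preserves trailing separators/categories ...
  sections[j] = secbody + sectail
  return "".join(sections)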
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

  if not args.stdin:
    pagemsg("Processing")

  if "==French==" not in text or "{{IPA|" not in text:
    return

  retval = blib.find_modifiable_lang_section(text, "French", pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_french = retval
  if "{{IPA|" not in secbody:
    return

  notes = []

  def fix_up_section(sectext):
    parsed = blib.parse_text(sectext)

    pronun_templates = []
    verb_templates = []
    nonverb_templates = []
    for t in parsed.filter_templates():
      tn = tname(t)
      if tn in french_nonverb_head_templates:
        nonverb_templates.append(t)
      elif tn in french_verb_head_templates:
        verb_templates.append(t)
      elif tn == "head":
        if getparam(t, "1").strip() != "fr":
          pagemsg("WARNING: Saw wrong-language {{head}} template: %s" % unicode(t))
        else:
          pos = getparam(t, "2").strip()
          if pos in french_verb_head_pos:
            verb_templates.append(t)
          else:
            nonverb_templates.append(t)
    if verb_templates and nonverb_templates:
      pagemsg("WARNING: Saw both verb template(s) %s and non-verb template(s) %s, using pos=vnv"
        % (",".join(unicode(x) for x in verb_templates),
           ",".join(unicode(x) for x in nonverb_templates)))
    if not verb_templates and not nonverb_templates:
      pagemsg("WARNING: Didn't see any French templates")

    for t in parsed.filter_templates():
      tn = tname(t)
      if tn == "IPA":
        m = re.search("^.*?%s.*$" % re.escape(unicode(t)), sectext, re.M)
        if not m:
          pagemsg("WARNING: Couldn't find template %s in section text" % unicode(t))
          line = "(unknown)"
        else:
          line = m.group(0)
        if t.has("lang"):
          first_param = 1
          lang = getparam(t, "lang")
        else:
          first_param = 2
          lang = getparam(t, "1")
        if lang != "fr":
          pagemsg("WARNING: Saw wrong-language {{IPA}} template: %s in line <%s>" % (unicode(t), line))
          continue
        pron = getparam(t, str(first_param))
        if not pron:
          pagemsg("WARNING: No pronun in {{IPA}} template: %s in line <%s>" % (unicode(t), line))
          continue
        if (getparam(t, str(first_param + 1)) or getparam(t, str(first_param + 2))
            or getparam(t, str(first_param + 3))):
          pagemsg("WARNING: Multiple pronuns in {{IPA}} template: %s in line <%s>" % (unicode(t), line))
          continue
        pos_val = ("vnv" if verb_templates and nonverb_templates else
          "v" if verb_templates else "")
        pos_arg = "|pos=%s" % pos_val if pos_val else ""
        #autopron = expand_text("{{#invoke:User:Benwing2/fr-pron|show|%s%s}}" % (
        autopron = expand_text("{{#invoke:fr-pron|show|%s%s}}" % (pagetitle, pos_arg))
        if not autopron:
          continue
        pron = re.sub("^/(.*)/$", r"\1", pron)
        pron = re.sub(r"^\[(.*)\]$", r"\1", pron)
        pron = pron.strip()
        pron = pron.replace("r", u"ʁ")
        # account for various common errors in Dawnraybot's generated pronunciations:
        # #1
        if pagetitle.endswith("rez") and pron.endswith(u"ʁɔe"):
          pron = re.sub(u"ʁɔe$", u"ʁe", pron)
        # #2
        if re.search("ai(s|t|ent)$", pagetitle) and pron.endswith(u"e"):
          pron = re.sub(u"e$", u"ɛ", pron)
        # #3
        if pos_val == "v" and pagetitle.endswith("ai") and pron.endswith(u"ɛ"):
          pron = re.sub(u"ɛ$", u"e", pron)
        if "." not in pron:
          autopron = autopron.replace(".", "")
        if autopron.endswith(u"ɑ") and pron.endswith("a"):
          autopron = autopron[:-1] + "a"
        if re.search(ur"ɑ[mt]$", autopron) and re.search(u"a[mt]$", pron):
          autopron = re.sub(ur"ɑ([mt])$", r"a\1", autopron)
        for i in xrange(2):
          # {{fr-IPA}} deletes schwa in the sequence V.Cə.CV esp. in the
          # sequence V.Cə.ʁV in verbs, whereas the bot-generated pronunciation
          # doesn't. We have separate cases depending on the identity of C,
          # which may go before or after the syllable break. Do it twice in
          # case it occurs twice in a row in a single word.
          pron = re.sub(ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([jlmnɲwʃʒ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])", r"\1\2.\3", pron)
          pron = re.sub(ur"([aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])\.([szfvtdpbkɡ])ə\.(ʁ[aɑɛeiɔouyœøɑ̃ɛ̃ɔ̃])", r"\1.\2\3", pron)
        # {{fr-IPA}} converts sequences of Crj and Clj to Cri.j and Cli.j,
        # which is correct, but Dawnraybot doesn't do that.
        pron = re.sub(u"([szfvtdpbkɡ][ʁl])j", r"\1i.j", pron)
        allow_mismatch = False
        if pron != autopron:
          tempcall = "{{fr-IPA%s}}" % pos_arg
          if pron.replace(u"ɑ", "a") == autopron.replace(u"ɑ", "a"):
            pagemsg(u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɑ vs. a only: line <%s>"
              % (unicode(t), tempcall, autopron, pron, line))
          elif re.sub(u"ɛ(.)", r"e\1", pron) == re.sub(u"ɛ(.)", r"e\1", autopron):
            pagemsg(u"WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in ɛ vs. e only: line <%s>"
              % (unicode(t), tempcall, autopron, pron, line))
          elif pron.replace(".", "") == autopron.replace(".", ""):
            pagemsg("WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable division only: line <%s>"
              % (unicode(t), tempcall, autopron, pron, line))
            allow_mismatch = True
          elif pron.replace(".", "").replace(" ", "") == autopron.replace(".", "").replace(" ", ""):
            pagemsg("WARNING: Would replace %s with %s but auto-generated pron %s disagrees with %s in syllable/word division only: line <%s>"
              % (unicode(t), tempcall, autopron, pron, line))
          else:
            pagemsg("WARNING: Can't replace %s with %s because auto-generated pron %s doesn't match %s: line <%s>"
              % (unicode(t), tempcall, autopron, pron, line))
          if not allow_mismatch:
            continue
        origt = unicode(t)
        rmparam(t, "lang")
        rmparam(t, "1")
        rmparam(t, str(first_param))
        blib.set_template_name(t, "fr-IPA")
        if pos_val:
          t.add("pos", pos_val)
        notes.append("replace manually-specified {{IPA|fr}} pronun with {{fr-IPA}}")
        pagemsg("Replaced %s with %s: line <%s>" % (origt, unicode(t), line))
        if "{{a|" in line:
          pagemsg("WARNING: Replaced %s with %s on a line with an accent spec: line <%s>"
            % (origt, unicode(t), line))
    return unicode(parsed)
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  if not re.search(r"\{\{head\|de\|(adjective (|comparative |superlative )|participle )form", text):
    return

  pagemsg("Processing")

  notes = []

  retval = blib.find_modifiable_lang_section(text, "German", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find German section")
    return
  sections, j, secbody, sectail, has_non_lang = retval

  if re.search("== *Etymology 1 *==", secbody):
    pagemsg("WARNING: Multiple etymology sections, skipping")
    return

  parsed = blib.parse_text(secbody)
  headt = None
  comparative_of_t = None
  superlative_of_t = None
  inflection_of_t = None
  need_superlative_of_t_lemma = None
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)

    def do_comparative_superlative_of(pos, existing_t, should_end):
      if getparam(t, "1") != "de":
        pagemsg("WARNING: Saw wrong language in {{%s of}}, skipping: %s" % (pos, origt))
        return False
      if existing_t:
        pagemsg("WARNING: Saw two {{%s of}} templates, skipping: %s and %s" % (pos, unicode(existing_t), origt))
        return False
      if not headt:
        pagemsg("WARNING: Saw {{%s of}} without head template, skipping: %s" % (pos, origt))
        return False
      if not pagetitle.endswith(should_end):
        pagemsg("WARNING: Incorrect ending for %s, should be -%s, skipping" % (pos, should_end))
        return False
      param2 = getparam(headt, "2")
      if param2 != "%s adjective" % pos:
        headt.add("2", "%s adjective" % pos)
        notes.append("convert {{head|de|%s}} to {{head|de|%s adjective}}" % (param2, pos))
      return t

    if tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") in [
        "adjective form", "adjective comparative form", "adjective superlative form", "participle form"]:
      if headt:
        pagemsg("WARNING: Saw two head templates, skipping: %s and %s" % (unicode(headt), origt))
        return
      headt = t
    elif tn == "head" and getparam(t, "1") == "de" and getparam(t, "2") == "verb form":
      pagemsg("Allowing and ignoring {{head|de|verb form}}: %s" % origt)
    elif tn == "head":
      pagemsg("WARNING: Saw unrecognized head template, skipping: %s" % origt)
      return
    elif tn == "comparative of":
      comparative_of_t = do_comparative_superlative_of("comparative", comparative_of_t, "er")
      if not comparative_of_t:
        return
    elif tn == "superlative of":
      superlative_of_t = do_comparative_superlative_of("superlative", superlative_of_t, "sten")
      if not superlative_of_t:
        return
    elif tn == "de-adj form of":
      pagemsg("Saw {{de-adj form of}}, assuming already converted: %s" % origt)
      return
    elif tn in ["inflection of", "infl of"]:
      if getparam(t, "1") != "de":
        pagemsg("WARNING: Saw wrong language in {{inflection of}}, skipping: %s" % origt)
        return
      if not headt:
        pagemsg("WARNING: Saw {{inflection of}} without head template, skipping: %s" % origt)
        return
      if inflection_of_t:
        pagemsg("WARNING: Saw {{inflection of}} twice, skipping: %s and %s" % (unicode(inflection_of_t), origt))
        return
      inflection_of_t = t
      lemma = getparam(t, "2")
      if getparam(t, "3"):
        pagemsg("WARNING: Saw alt form in {{inflection of}}, skipping: %s" % origt)
        return
      infl_tags = []
      for param in t.params:
        pn = pname(param)
        pv = unicode(param.value)
        if not re.search("^[0-9]+$", pn):
          pagemsg("WARNING: Saw unrecognized param %s=%s in {{inflection of}}, skipping: %s" % (pn, pv, origt))
          return
        if int(pn) >= 4:
          infl_tags.append(pv)
      tags = "|".join(infl_tags)
      if tags not in tags_to_ending:
        pagemsg("WARNING: Saw unrecognized tags in {{inflection of}}, skipping: %s" % origt)
        return
      del t.params[:]
      ending = tags_to_ending[tags]
      if ending in ["sten", "esten"]:
        need_superlative_of_t_lemma = lemma
      blib.set_template_name(t, "de-adj form of")
      t.add("1", lemma)
      no_explicit = check_if_lemma_and_ending_match_pagetitle(lemma, ending, pagetitle, allow_umlaut=True)
      if not no_explicit:
        pagemsg("WARNING: Explicit ending %s required for lemma %s" % (ending, lemma))
        t.add("2", ending)
      notes.append("convert {{inflection of|de|...}} to {{de-adj form of}}")
      if "comd" in tags:
        param2 = getparam(headt, "2")
        if param2 != "comparative adjective form":
          headt.add("2", "comparative adjective form")
          notes.append("convert {{head|de|%s}} to {{head|de|comparative adjective form}}" % param2)
      elif "supd" in tags:
        param2 = getparam(headt, "2")
        if param2 != "superlative adjective form":
          headt.add("2", "superlative adjective form")
          notes.append("convert {{head|de|%s}} to {{head|de|superlative adjective form}}" % param2)

  secbody = unicode(parsed)

  def add_adj_form_of(secbody, pos, comparative_superlative_t, ending):
    lemma = getparam(comparative_superlative_t, "2")
    if check_if_lemma_and_ending_match_pagetitle(lemma, ending, pagetitle, allow_umlaut=False):
      form_pos = "superlative adjective form" if pos == "superlative" else "adjective form"
      newsec = """

===Adjective===
{{head|de|%s}}

# {{de-adj form of|%s}}""" % (form_pos, lemma)
      secbody, replaced = blib.replace_in_text(secbody, unicode(comparative_superlative_t),
        unicode(comparative_superlative_t) + newsec, pagemsg, abort_if_warning=True)
      if not replaced:
        pagemsg("WARNING: Couldn't add -%s inflection, skipping: %s"
          % (ending, unicode(comparative_superlative_t)))
        return secbody, False
      notes.append("add {{de-adj form of}} for %s" % pos)
    else:
      pagemsg("WARNING: Lemma %s + %s ending %s doesn't match pagetitle" % (lemma, pos, ending))
    return secbody, True

  if comparative_of_t and not inflection_of_t:
    secbody, ok = add_adj_form_of(secbody, "comparative", comparative_of_t, "er")
    if not ok:
      return
  if superlative_of_t and not inflection_of_t:
    secbody, ok = add_adj_form_of(secbody, "superlative", superlative_of_t, "sten")
    if not ok:
      return
  if inflection_of_t and not superlative_of_t and need_superlative_of_t_lemma:
    cursec = """===Adjective===
{{head|de|superlative adjective form}}

# %s""" % unicode(inflection_of_t)
    newsec = """===Adjective===
{{head|de|superlative adjective}}

# {{superlative of|de|%s}}

""" % need_superlative_of_t_lemma
    secbody, replaced = blib.replace_in_text(secbody, cursec, newsec + cursec, pagemsg,
      abort_if_warning=True)
    if not replaced:
      pagemsg("WARNING: Couldn't add {{superlative of}}, skipping: %s" % unicode(inflection_of_t))
      return
    notes.append("add {{superlative of|de|...}}")

  sections[j] = secbody + sectail
  text = "".join(sections)

  if not notes:
    pagemsg("WARNING: Couldn't convert page")
  return text, notes
def process_text_on_page(index, pagetitle, text):
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  origtext = text
  notes = []

  def get_templated_self_link(link):
    if args.self_links_use_raw:
      return "[[#English|%s]]" % link
    else:
      return "{{l|en|%s}}" % link

  def fix_sec_links(sectext):
    lines = sectext.split("\n")
    new_lines = []
    for line in lines:
      if line.startswith("#"):
        if args.convert_raw_self_links:
          template_split_re = r"(\{\{(?:[^{}]|\{\{[^{}]*\}\})*\}\})"
          # Split templates and only change non-template text
          split_templates = re.split(template_split_re, line)
          for l in xrange(0, len(split_templates), 2):
            while True:
              newtext = re.sub(r"^#(.*?)\[\[%s\]\]" % pagetitle,
                r"#\1" + get_templated_self_link(pagetitle), split_templates[l], 0, re.M)
              if newtext == split_templates[l]:
                break
              changed = True
              notes.append("replace raw self link to English terms with templated one")
              split_templates[l] = newtext
          line = "".join(split_templates)
        else:
          def replace_templated(m):
            origm1 = m.group(1)
            m1 = origm1
            if "[[" not in m1:
              m1 = "[[%s]]" % m1
            m1_new = m1.replace("[[%s]]" % pagetitle, get_templated_self_link(pagetitle))
            saw_self_link = False
            if m1_new != m1:
              saw_self_link = True
              m1 = m1_new
            if m1 != get_templated_self_link(origm1):
              notes.append("replace templated link to English terms in defns with raw link(s)"
                + (", keeping self-links templated" if saw_self_link else ""))
            return m1
          line = re.sub(r"\{\{l\|en\|((?:[^{}|]|\[\[[^{}\[\]]*\]\])*?)\}\}", replace_templated, line)
      new_lines.append(line)
    return "\n".join(new_lines)

  if args.lang:
    retval = blib.find_modifiable_lang_section(text, None if args.partial_page else args.lang, pagemsg)
    if retval is None:
      pagemsg("WARNING: Couldn't find %s section" % args.lang)
      return
    sections, j, secbody, sectail, has_non_lang = retval
    secbody = fix_sec_links(secbody)
    sections[j] = secbody + sectail
    text = "".join(sections)
  else:
    text = fix_sec_links(text)

  return text, notes
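# A minimal sketch (hypothetical inputs; simplified from the script above, which
# additionally anchors on "#" and loops until no more replacements occur) showing
# the effect of the template-splitting regex: re.split() with a capturing group
# puts templates at odd indices, so only even-index (non-template) text is rewritten.
import re

pagetitle = "example"
line = "# A raw [[example]] link and a templated {{l|en|example}} link."
template_split_re = r"(\{\{(?:[^{}]|\{\{[^{}]*\}\})*\}\})"
split_templates = re.split(template_split_re, line)
for l in xrange(0, len(split_templates), 2):  # even indices = non-template text
  split_templates[l] = split_templates[l].replace("[[%s]]" % pagetitle, "{{l|en|%s}}" % pagetitle)
print("".join(split_templates))
# -> # A raw {{l|en|example}} link and a templated {{l|en|example}} link.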