def test_aquellos():

    wordlist_data = """\
aquél {pron-meta} :: {{head|es|pronoun|demonstrative, feminine|aquélla|neuter|aquello|masculine plural|aquéllos|feminine plural|aquéllas|g=m}}
aquél {pron-forms} :: demonstrative_feminine=aquélla; feminine_plural=aquéllas; masculine_plural=aquéllos; neuter=aquello
aquél {pron} [demonstrative] :: that one (far from speaker and listener)
aquéllos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}}
aquéllos {pron} :: plural of "aquél"; those ones (far from speaker and listener)
aquel {pron-meta} :: {{head|es|pronoun|g=m|feminine|aquella|neutrum|aquello|masculine plural|aquellos|neutrum plural|aquellos|feminine plural|aquellas}}
aquel {pron-forms} :: feminine=aquella; feminine_plural=aquellas; masculine_plural=aquellos; neutrum=aquello; neutrum_plural=aquellos
aquel {pron} [demonstrative] :: alternative spelling of "aquél"
aquellos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}}
aquellos {pron} :: alternative spelling of "aquéllos"; those ones (over there; implying some distance). The unaccented form can function as a pronoun if it can be unambiguously deduced as such from context.
aquellos {pron-meta} :: {{head|es|pronoun|g=n-p}}
aquellos {pron} :: Those ones. (over there; implying some distance)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("aquellos", "pron") == ['aquellos', 'aquél']
    assert freq.get_best_lemma("aquellos", ['aquellos', 'aquél'], "pron") == "aquél"

    flist_data = """\
aquellos 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_asco():

    wordlist_data = """\
asca {n-meta} :: x
asca {n-forms} :: pl=ascas
asca {m} [mycology] | teca :: ascus
asco {n-meta} :: x
asco {n-forms} :: pl=ascos
asco {m} :: disgust
asco {m} :: nausea
asco {n-meta} :: x
asco {n-forms} :: pl=ascos
asco {m} :: alternative form of "asca"
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("asco") == ['n|asco', 'n|asca']
    assert freq.get_lemmas("asco", "n") == ["asca", "asco"]
    assert freq.get_best_lemma("asco", ["asca", "asco"], "n") == "asco"

    flist_data = """\
asco 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_diva():

    wordlist_data = """\
_____
diva
pos: adj
  meta: {{head|es|adjective form}}
  gloss: adjective form of "divo"
pos: n
  meta: {{es-noun|f|m=divo}}
  g: f
  gloss: diva
_____
divo
pos: adj
  meta: {{es-adj}}
  gloss: star (famous)
pos: n
  meta: {{es-noun|m|f=diva}}
  g: m
  gloss: star, celeb
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("diva") == ['adj|divo', 'n|divo']
    assert freq.get_lemmas("diva", "n") == ["divo"]

    flist_data = """\
diva 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_izquierdas():

    wordlist_data = """\
_____
izquierda
pos: adj
  meta: {{head|es|adjective form|g=f-s}}
  g: f-s
  gloss: adjective form of "izquierdo"
pos: n
  meta: {{es-noun|f|-}}
  g: f
  gloss: left (side, direction)
  gloss: left
    q: politics
_____
izquierdas
pos: adj
  meta: {{head|es|adjective form}}
  gloss: adjective form of "izquierdo"
pos: n
  meta: {{head|es|noun form|g=f-p}}
  g: f-p
  gloss: plural of "izquierda"
_____
izquierdo
pos: adj
  meta: {{es-adj}}
  gloss: left; on the left side or toward the left; the opposite of right
    syn: siniestro
  gloss: left-handed
  gloss: crooked
_____
izquierdos
pos: adj
  meta: {{head|es|adjective form|g=m-p}}
  g: m-p
  gloss: plural of "izquierdo"
_____
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    print(allforms.all_forms)

    assert freq.get_lemmas("izquierdas", "n") == ["izquierda"]
    assert freq.get_lemmas("izquierdo", "adj") == ["izquierdo"]
    assert freq.get_lemmas("izquierdos", "adj") == ["izquierdo"]
    assert freq.get_lemmas("izquierdas", "adj") == ["izquierdo"]

    assert freq.get_ranked_pos("izquierda") == ['n', 'adj']
    assert freq.get_ranked_pos("izquierdas") == ['n', 'adj']

    flist_data = """\
izquierda 34629
izquierdo 8150
izquierdas 436
izquierdos 234
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_format_def():

    wordlist_data = """\
_____
rendir
pos: v
  meta: {{es-verb|rend|ir|pres=rindo}} {{es-conj-ir|r|nd|p=e-i|combined=1}}
  gloss: to conquer
    q: transitive
  gloss: to tire, exhaust
    q: transitive
  gloss: to yield, pay, submit, pass down
    q: ditransitive
  gloss: to vomit
    q: intransitive
  gloss: to make headway
    q: intransitive
  gloss: to surrender, give in, give up
    q: reflexive
  gloss: to be paid (homage or tribute)
    q: reflexive
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("rendir", "v")
    print(usage)
    assert usage == {
        'v': {'ditransitive': ['to yield, pay, submit, pass down']},
        'vi': {'': ['to vomit', 'to make headway']},
        'vr': {'': ['to surrender, give in, give up', 'to be paid (homage or tribute)']},
        'vt': {'': ['to conquer', 'to tire, exhaust']}}

    item = {'m/f': {'': ['retiree, pensioner (retired person)']}}
    assert DeckBuilder.format_def(item, hide_word="jubilado") == '<span class="pos n m_f mf">{mf} <span class="usage">retiree, pensioner (retired person)</span></span>'

    format_def = DeckBuilder.format_def

    item = {"m": {"tag": ["def1", "def2"]}}
    assert format_def(item) == """<span class="pos n m">
<span class="tag">[tag]:</span><span class="usage">def1; def2</span></span>"""

    item = {"m": {"Peru": ["def1", "def2"]}}
    assert format_def(item) == """<span class="pos n m only-latin-america only-peru only-south-america peru">
<span class="tag">[Peru]:</span><span class="usage">def1; def2</span></span>"""

    item = {"m": {"South America": ["def1", "def2"]}}
    assert format_def(item) == """<span class="pos n m only-latin-america only-south-america south-america">
<span class="tag">[South America]:</span><span class="usage">def1; def2</span></span>"""

    item = {'f': {'': ['sewer, storm drain'], 'zoology': ['cloaca']}}
    assert format_def(item, hide_word='cloaca') == """<span class="pos n f">
<span class="usage">sewer, storm drain</span></span>"""
def test_filters():

    ignore_data = """\
# comment
#
- abuela {f}
- abuelo {m} :: loose tufts
"""

    wordlist_data = """\
abuela {n-meta} :: {{es-noun|m=abuelo}}
abuela {f} :: grandmother, female equivalent of "abuelo"
abuela {f} [colloquial] :: old woman
abuela {f} [Mexico] :: a kind of flying ant
abuelo {n-meta} :: {{es-noun|f=abuela}}
abuelo {m} :: grandfather
abuelo {m} [colloquial, endearing] :: an elderly person
abuelo {m} | tolano :: loose tufts of hair in the nape when one's hair is messed up
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    assert deck.filter_gloss("abuela", "", "", "grandmother") == "grandmother"
    assert deck.filter_gloss("abuela", "", "", 'grandmother, female equivalent of "abuelo"') == "grandmother"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f': {
            'f': ['grandmother'],
            'f, colloquial': ['old woman'],
            'f, Mexico': ['a kind of flying ant'],
            'm': ['grandfather', "loose tufts of hair in the nape when one's hair is messed up"],
            'm, colloquial, endearing': ['an elderly person']
        }}

    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    assert deck.filter_gloss("abuela", "x", "", "grandmother") == "grandmother"
    assert deck.filter_gloss("abuela", "f", "", "grandmother") is None
    assert deck.filter_gloss("abuela", "f", "colloquial", "old woman") is None
    assert deck.filter_gloss("abuelo", "m", "", "loose tufts of hair") is None
    assert deck.filter_gloss("abuelo", "m", "", "grandfather") == "grandfather"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f': {
            '': ['grandfather'],
            'colloquial, endearing': ['an elderly person']
        }}
def test_filters2():

    ignore_data = """\
- test {f}
"""

    wordlist_data = """\
test {n-meta} :: x
test {n-forms} :: pl=tests
test {m} :: masculine
test {n-meta} :: x
test {n-forms} :: pl=tests
test {f} :: feminine
"""

    xwordlist_data = """\
_____
test
forms: pl=tests
pos: n
form: m
gloss: masculine
____
test
forms: pl=tests
pos: n
form: f
gloss: feminine
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm-f': {
            'f': ['feminine'],
            'm': ['masculine']
        }}

    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm': {
            '': ['masculine'],
        }}
def test_lemma_filters():

    wordlist_data = """\
_____
ir
pos: v
  meta: {{es-verb}} {{es-conj}} {{es-conj|irse}}
  gloss: to go (away from speaker and listener)
    q: intransitive
  gloss: to come (towards or with the listener)
    q: intransitive
  gloss: to be going to (near future), to go (+ a + infinitive)
    q: auxiliary
  gloss: to go away, to leave, to be off (see irse)
    q: reflexive
_____
irse
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to go away, to leave, to depart, to go (when the destination is not essential; when something or someone is going somewhere else)
    syn: andarse; marcharse
  gloss: to leak out (with liquids and gasses), to boil away, to go flat (gas in drinks)
  gloss: to overflow
  gloss: to go out (lights)
  gloss: to finish, to wear out, to disappear (e.g. money, paint, pains, mechanical parts)
  gloss: to die
  gloss: to break wind, to fart
    q: informal
  gloss: to wet/soil oneself (i.e., urinate or defecate in one's pants)
    q: informal
  gloss: to come, to cum, to e*******e, to o****m
    q: vulgar
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    print(allforms.all_forms["nos vamos"])
    assert freq.all_forms.get_lemmas("vamos") == ['v|ir']
    assert freq.all_forms.get_lemmas("nos vamos") == ['v|ir', 'v|irse']

    assert freq.get_lemmas("vamos", "v") == ["ir"]
    assert freq.get_lemmas("ir", "v") == ["ir"]

    assert freq.include_word("vamos", "v") == True
    assert freq.filter_pos("vamos", ["v"]) == ["v"]
    # assert len(freq.wordlist.get_words("vamos", "v")) > 0
    assert freq.get_ranked_pos("vamos") == ["v"]
    assert freq.get_lemmas("vamos", "v") == ["ir"]

    flist_data = """\
vamos 10
va 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def main():
    import argparse

    argparser = argparse.ArgumentParser(description="Find -ismo without -ista and vice versa")
    argparser.add_argument("file", help="Extract file to read")
    argparser.add_argument("--allforms", help="Allforms for checking lemmas")
    argparser.add_argument("--save", help="Save to wiktionary with specified commit message")
    argparser.add_argument("--date", help="Date of the database dump (used to generate page messages)")
    argparser.add_argument("--limit", type=int, help="Limit processing to first N articles")
    argparser.add_argument("--progress", help="Display progress", action='store_true')
    args = argparser.parse_args()

    count = 0
    allforms = AllForms.from_file(args.allforms) if args.allforms else None
    all_lemmas = set(allforms.all_lemmas)

    for article in WikiExtract.iter_articles_from_bz2(args.file):

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count >= args.limit:
            break
        count += 1

        text = article.text
        path = article.title.split(":")
        page = path[0]
        pos = path[-1]

        if page.endswith("ismo"):
            error = "ismo_without_ista"
            search = page[:-4] + "ista"
        elif page.endswith("ista"):
            error = "ista_without_ismo"
            search = page[:-4] + "ismo"
        else:
            continue

        if page not in all_lemmas:
            continue

        if search in all_lemmas and search not in article.text:
            log(error, page)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
def test_veros():

    wordlist_data = """\
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
vero {n-meta} :: {{es-noun|m}}
vero {m} [heraldry] :: vair
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_ranked_pos("veros") == ["v", "n"]
def test_filters():

    wordlist_data = """\
test {n-meta} :: x
test {m} :: test
test {adj-meta} :: x
test {adj} :: obsolete form of "test"
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.filter_pos("test", ["n", "adj"]) == ["n"]
    assert freq.get_ranked_pos("test") == ["n"]
def test_piernas():

    wordlist_data = """\
pierna {n-meta} :: {{es-noun|f}}
pierna {n-forms} :: pl=piernas
pierna {f} | pata :: leg (lower limb of a human)
piernas {n-meta} :: {{es-noun|m|piernas}}
piernas {n-forms} :: pl=piernas
piernas {m} [dated] :: twit; idiot
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    lemmas = ['pierna', 'piernas']
    assert freq.get_lemmas("piernas", "n") == lemmas
    assert freq.get_best_lemma("piernas", lemmas, "n") == "pierna"
def test_hamburguesa():

    wordlist_data = """\
hamburgués {n-meta} :: {{es-noun|m|hamburgueses|f=hamburguesa|fpl=hamburguesas}}
hamburgués {n-forms} :: f=hamburguesa; fpl=hamburguesas; pl=hamburgueses
hamburgués {m} :: Hamburger, a person from Hamburg
hamburguesa {n-meta} :: {{es-noun|f}}
hamburguesa {n-forms} :: pl=hamburguesas
hamburguesa {f} :: hamburger
hamburguesa {f} :: female equivalent of "hamburgués"; Hamburger
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    lemmas = ['hamburguesa', 'hamburgués']
    assert freq.get_lemmas("hamburguesa", "n") == lemmas
    assert freq.get_best_lemma("hamburguesa", lemmas, "n") == "hamburguesa"
def test_simple2():

    wordlist_data = """\
rojo {adj-meta} :: {{es-adj|f=roja}}
rojo {adj} :: red (colour)
rojo {n-meta} :: {{es-noun|m}}
rojo {m} :: red (colour)
rojo {m} [Costa Rica] :: a 1000 colón bill
rojo {m} [Spain, derogatory] :: a left-wing, especially communist
roja {n-meta} :: {{es-noun|f|m=rojo}}
roja {f} :: Red (Communist)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_ranked_pos("roja") == ["adj", "n"]
def test_microondas():

    wordlist_data = """\
microonda {n-meta} :: {{es-noun|f}}
microonda {n-forms} :: pl=microondas
microonda {f} :: microwave (electromagnetic wave)
microondas {n-meta} :: {{es-noun|m|microondas}}
microondas {n-forms} :: pl=microondas
microondas {m} | horno de microondas :: microwave oven, microwave
microondas {m} :: necklacing (execution by burning tyre)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    lemmas = ["microonda", "microondas"]
    assert freq.get_lemmas("microondas", "n") == lemmas
    assert freq.get_best_lemma("microondas", lemmas, "n") == "microondas"
def test_veras():

    wordlist_data = """\
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} [poetic] | lado :: side, face
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} :: verawood (Bulnesia arborea)
veras {n-meta} :: {{es-noun|f-p}}
veras {fp} :: truth; reality
veras {fp} :: serious things
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("veras", "n") == ["vera", "veras"]
    assert freq.get_best_lemma("veras", ["vera", "veras"], "n") == "veras"
def test_vete():

    wordlist_data = """\
ir {v-meta} :: {{es-verb}} {{es-conj}} {{es-conj|irse}}
ir {v} :: x
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
verse {v-meta} :: {{es-verb}} {{es-conj}}
verse {v} :: x
vetar {v-meta} :: {{es-verb}} {{es-conj}}
vetar {v} :: x
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("vete", "v") == ['ir', 'ver', 'vetar']
    assert freq.get_best_lemma("vete", ['ir', 'ver', 'vetar'], "v") == "ir"
def test_rasguno():

    wordlist_data = """\
rasguñar {v-meta} :: {{es-verb}} {{es-conj}}
rasguñar {vt} | arañar; rascar :: to scratch
rasguño {n-meta} :: {{es-noun}}
rasguño {m} | arañazo :: scratch
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("rasguño") == ['v|rasguñar', 'n|rasguño']
    assert freq.get_ranked_pos("rasguño") == ["n", "v"]

    flist_data = """\
rasguño 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_bienes():

    wordlist_data = """\
bien {n-meta} :: {{es-noun|m|bienes}}
bien {m} :: good (as opposed to evil)
bienes {n-meta} :: {{es-noun|m-p}}
bienes {mp} :: goods (that which is produced, traded, bought or sold)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("bienes") == ['n|bien', 'n|bienes']
    assert freq.get_lemmas("bienes", "n") == ["bien", "bienes"]
    assert freq.get_best_lemma("bienes", ["bien", "bienes"], "n") == "bienes"

    flist_data = """\
bienes 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_hijo():

    wordlist_data = """\
hija {n-meta} :: x
hija {n-forms} :: m=hijo; mpl=hijos; pl=hijas
hija {f} :: daughter; feminine noun of "hijo"
hijo {n-meta} :: x
hijo {n-forms} :: f=hija; fpl=hijas; pl=hijos
hijo {m} :: son
hijo {m} :: child (when the gender of the child is unknown)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("hijo") == ['n|hijo']
    assert freq.get_lemmas("hijo", "n") == ["hijo"]

    flist_data = """\
hijo 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_dios():

    wordlist_data = """\
dios {n-meta} :: {{es-noun|m|dioses|f=diosa}}
dios {n-forms} :: f=diosa; fpl=diosas; pl=dioses
dios {m} :: god
diosa {n-meta} :: {{es-noun|f|m=dios}}
diosa {n-forms} :: m=dios; mpl=dios; pl=diosas
diosa {f} :: goddess
diosa {n-meta} :: {{es-noun|f}}
diosa {n-forms} :: pl=diosas
diosa {f} [biochemistry] :: diose
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("dioses", "n") == ["dios"]
    assert freq.get_lemmas("diosas", "n") == ["dios", "diosa"]
    assert freq.get_lemmas("diosa", "n") == ["dios", "diosa"]
    assert freq.get_best_lemma("diosa", ["dios", "diosa"], "n") == "dios"

    # assert list(freq.all_forms.get_lemmas("dios", {})) == ['n:dios:m']
    # assert list(freq.all_forms.get_lemmas("dioses", {})) == ['n:dios:pl']
    # assert list(freq.all_forms.get_lemmas("diosa", {})) == ["n:dios:f"]
    # assert list(freq.all_forms.get_lemmas("diosas", {})) == ["n:diosa:pl"]

    flist_data = """\
dios 10
dioses 10
diosa 10
diosas 10
"""

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
def allforms(wordlist):
    return AllForms.from_wordlist(wordlist)
def allforms(fixer):
    return AllForms.from_wordlist(fixer.wordlist)
def main():

    global args
    global ngprobs

    parser = argparse.ArgumentParser(description="Summarize ngram usage")
    parser.add_argument("--allforms", help="Exclude coordinate terms that have standalone entries")
    parser.add_argument("--min-count", help="Ignore forms with less than N uses", type=int)
    parser.add_argument("--min-percent", help="Ignore coordinate terms used less than N percent of the form's uses", type=int)
    parser.add_argument("--save", help="Save to wiktionary with specified commit message")
    parser.add_argument("--ignore2", help="Ignore coords containing the specified word (can be used more than once)", action='append')
    parser.add_argument("--ngprobs", help="Ngram probability data file")
    parser.add_argument("--coord2", help="File containing 2 word coordinate terms to check")
    parser.add_argument("--coord3", help="File containing 3 word coordinate terms to check")
    parser.add_argument("--coord4", help="File containing 4 word coordinate terms to check")
    args = parser.parse_args()

    allforms = AllForms.from_file(args.allforms)
    all_forms = set(allforms.all_forms)
    print("all_forms")

    lemma_forms = defaultdict(list)
    for form, pos, lemma in allforms.all:
        lemma_forms[(lemma, pos)].append(form)
    print("lemma_forms")

    alt_case = {form.lower(): form for form in all_forms if form != form.lower()}

    ngprobs = NgramPosProbability(args.ngprobs)

    if False:
        coord = "reharás tu vida"
        words = coord.split(" ")
        print([coord, get_coord_lemma(ngprobs, allforms, words)])
        exit()

        form = "fijamente"
        form_pos = ngprobs.get_preferred_pos(form)
        form_lemma = get_form_lemma(ngprobs, allforms, form)
        count = get_lemma_count(ngprobs, lemma_forms, form_lemma, form_pos)
        print([form, form_pos, form_lemma, count])
        exit()

    all_coords = {}
    if args.coord2:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case, args.coord2, args.ignore2)
    if args.coord3:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case, args.coord3)
    if args.coord4:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case, args.coord4)

    # seen2 = set()
    # all_coords = {}
    # if args.coord3:
    #     coord_lemmas = find_coords(allforms, all_forms, ngprobs, alt_case, args.coord3)
    #     for k in coord_lemmas.keys():
    #         coord_lemma, form, form_pos = k
    #         words = coord_lemma.split(" ")
    #         seen2.add(words[0:2])
    #         seen2.add(words[1:3])
    #     all_coords |= coord_lemmas
    #
    # if args.coord2:
    #     coord_lemmas = find_coords(allforms, all_forms, ngprobs, alt_case, args.coord2, args.ignore2)
    #     for k, count in coord_lemmas.items():
    #         coord_lemma, form, form_pos = k
    #         if coord_lemma in seen2 and coord_lemma not in all_forms:
    #             continue
    #         all_coords[k] = count

    for k, coord_count in all_coords.items():
        coord_lemma, form_lemma, form_pos = k
        form_count = get_lemma_count(ngprobs, lemma_forms, form_lemma, form_pos)

        # Skip uncommon forms
        if form_count < args.min_count:
            continue

        # Min ratio
        percent = int(coord_count / form_count * 100)
        if percent < args.min_percent:
            continue

        existing = coord_lemma if coord_lemma in all_forms \
                else coord_lemma.lower() if coord_lemma.lower() in all_forms \
                else None
        if existing:
            coord_lemma = f"[[{existing}]]"

        log(form_lemma, coord_lemma, form_count, coord_count, percent)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
def allforms(self):
    if not self._allforms:
        self._allforms = AllForms.from_file(self.allforms_file)
    return self._allforms
def test_simple():

    wordlist_data = """\
_____
protector
pos: n
  meta: {{es-noun|m|f=+|f2=protectriz}}
  g: m
  gloss: protector (someone who protects or guards)
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: protector (a device or mechanism which is designed to protect)
_____
protectora
pos: n
  meta: {{es-noun|f|m=protector}}
  forms: m=protector; mpl=protectores; pl=protectoras
  g: f
  gloss: female equivalent of "protector"
pos: n
  meta: {{es-noun|f}}
  forms: pl=protectoras
  g: f
  gloss: animal shelter (an organization that provides temporary homes for stray pet animals)
    syn: protectora de animales
_____
protectoras
pos: n
  meta: {{head|es|noun plural form|g=f-p}}
  g: f-p
  gloss: inflection of "protector"
_____
protectores
pos: n
  meta: {{head|es|noun plural form|g=m-p}}
  g: m-p
  gloss: inflection of "protector"
_____
protectrices
pos: n
  meta: {{head|es|noun plural form|g=f-p}}
  g: f-p
  gloss: inflection of "protector"
_____
protectriz
pos: n
  meta: {{es-noun|f|m=protector}}
  forms: m=protector; mpl=protectores; pl=protectrices
  g: f
  gloss: alternative form of "protectora"
    q: uncommon
"""

    flist_data = """\
protector 10
protectora 10
protectoras 10
protectores 10
protectriz 10
protectrices 10
unknown 10
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.wordlist.has_lemma("protectora", "n") == False
    assert freq.get_lemmas("protectores", "n") == ["protector"]
    assert freq.get_lemmas("protectoras", "n") == ["protector", "protectora"]
    assert freq.get_lemmas("notaword", "n") == ["notaword"]

    assert freq.get_ranked_pos("protectoras") == ["n"]

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
if not args.data_dir:
    args.data_dir = os.environ.get("SPANISH_DATA_DIR", "spanish_data")

if not args.custom_dir:
    args.custom_dir = os.environ.get("SPANISH_CUSTOM_DIR", "spanish_custom")

with open(args.dictionary) as wordlist_data:
    cache_words = not args.low_mem
    wordlist = Wordlist(wordlist_data, cache_words=cache_words)

print("wordlist", mem_use(), file=sys.stderr)

ignore_data = open(args.ignore) if args.ignore else []

if args.allforms:
    allforms = AllForms.from_file(args.allforms)
else:
    allforms = AllForms.from_wordlist(wordlist)

print("all_forms", mem_use(), file=sys.stderr)

sentences = spanish_sentences(sentences=args.sentences, data_dir=args.data_dir, custom_dir=args.custom_dir)

flist = FrequencyList(wordlist, allforms, sentences)

with open(args.file) as infile:
    for line in flist.process(infile, ignore_data):
        print(line)

# Only close ignore_data when it is an open file, not the empty-list fallback
if args.ignore:
    ignore_data.close()
def main():

    import argparse
    argparser = argparse.ArgumentParser(description="Find lemmas with only 'form of' senses")
    argparser.add_argument("--trans", help="Extract file to read")
    argparser.add_argument("--allforms", help="Allforms for resolving forms to lemmas")
    argparser.add_argument("--save", help="Save to wiktionary with specified commit message")
    argparser.add_argument("--date", help="Date of the database dump (used to generate page messages)")
    argparser.add_argument("--limit", type=int, help="Limit processing to first N articles")
    argparser.add_argument("--progress", help="Display progress", action='store_true')
    argparser.add_argument("--dump-aliases", help="Dump likely language aliases", action='store_true')
    argparser.add_argument("--dump-parents", help="Dump likely parent languages", action='store_true')
    args = argparser.parse_args()

    allforms = AllForms.from_file(args.allforms) if args.allforms else None

    if not os.path.isfile(args.trans):
        raise FileNotFoundError(f"Cannot open: {args.trans}")

    fixer = T9nFixer(allforms)
    logger = Logger()

    def log(error, page, pos, gloss, language, line="", highlight=""):
        if error is None:
            raise ValueError("error is none")
        if page is None:
            raise ValueError("page is none")
        if pos is None:
            raise ValueError("pos is none")
        if gloss is None:
            gloss = ""
        if language is None:
            language = ""
        if line is None:
            line = ""
        if highlight is None:
            highlight = ""
        logger.add(error, page, pos, gloss, language, line, highlight)
        # if language:
        #     langlogger.add(error, page, pos, gloss, language, line, highlight)
        # if error != "text_outside_template":
        #     logger.add(error, page, pos, gloss, line, highlight)

    count = 0
    max_val = 0
    pages_with_tables = set()
    for article in WikiExtract.iter_articles_from_bz2(args.trans):
        text = article.text
        path = article.title.split(":")
        page = path[0]
        pos = path[-1]
        if pos not in ALL_POS:
            log("outside_pos", page, pos, None, None, path)

        count += 1
        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count > args.limit:
            break

        # if page != "pie-eyed":
        #     continue
        # if pathstr != "veggie:English:Adjective":
        # if pathstr != "I love you:English:Phrase":
        #     continue

        # print("\n", count)
        # val = timeit.timeit(lambda: list(TranslationTable.find_tables(text, page, pos)), number=1)
        # if val > max_val:
        #     max_val = val
        #     max_page = pathstr
        # continue

        tables = list(TranslationTable.find_tables(text))
        if not len(tables) and not re.search(r"{{\s*(trans-see|checktrans|see translation)", text):
            log("no_tables", page, pos, None, None)
            # max_page = "X"

        pages_with_tables.add(page)
        stats["sections_with_tables"] += 1

        for table_lines in tables:
            table_lines = table_lines.splitlines()
            # print(table_lines)
            # exit()

            # max_val += len(table_lines)
            # continue

            table = TranslationTable(page, pos, table_lines, log_function=log)
            stats["total_tables"] += 1

            seen = set()
            for item in table.items:
                if isinstance(item, TranslationLine) and item.lang_id not in seen:
                    stats["total_entries"] += len(item.entries)
                    stats["lang_entries"][lang_ids[item.lang_id]] += 1
                    seen.add(item.lang_id)  # Don't count more than one entry per table

            if len(tables) > 1 and not table.gloss and table.template in ["trans-top", "trans-top-see", "trans-top-also"]:
                table.log("no_gloss")

            fixer.cleanup_table(table)

            # if "\n".join(map(str.strip, table_lines)) != str(table):
            #     table.log("botfix_formatting")
            #     print("OLD", page, pos, file=sys.stderr)
            #     print("\n".join(table_lines), file=sys.stderr)
            #     print("NEW", page, pos)
            #     print(str(table))
            #     #exit()

    stats["pages_with_tables"] = len(pages_with_tables)

    # print(max_val, max_page)

    # base_url = "User:JeffDoozan/lists/translations" if args.save else "Xtranslations"
    # langlogger.save(base_url, args.save)

    if args.save:
        base_url = "User:JeffDoozan/lists/translations"
        logger.save(base_url, WikiByLanguage, commit_message=args.save, page_limit=1000, data_date=args.date)
        logger.save(base_url + "/by_error", WikiByError, commit_message=args.save, data_date=args.date)
    else:
        dest = "Xtranslations"
        logger.save(dest, FileByLanguage, page_limit=1000, data_date=args.date)
        logger.save(dest + "/by_error", FileByError, data_date=args.date)

    # Dump nested language aliases
    if args.dump_aliases:
        print("language_aliases = {")
        # for lang, codes in sorted(UNKNOWN_LANGS.items(), key=lambda x: sum(x[1].values())*-1):
        for lang, codes in sorted(UNKNOWN_LANGS.items()):
            for code, count in sorted(codes.items(), key=lambda x: x[1] * -1):
                if count > 20:
                    print(f" '{lang}': '{lang_ids[code]}', # {code} found in {count} entries")
                break
        print("}")

    if args.dump_parents:
        print("language_parents = {")
        for lang, count in sorted(LANG_PARENTS.items()):
            if count > 20:
                print(f" '{lang}', # used in {count} entries")
        print("}")

    colons = [x for x in lang_ids.values() if ":" in x]
    if colons:
        raise ValueError("A language exists with a colon in the name, this may cause problems for nested languages that use : as a separator")

    print(f"Total pages with tables: {stats['pages_with_tables']}")
    print(f"Total sections with tables: {stats['sections_with_tables']}")
    total_lines = sum(stats["lang_entries"].values())
    print(f"Total language lines in tables: {total_lines}")
    print(f"Total translation entries: {stats['total_entries']}")
def main():

    global fixer
    global fixrunner

    import argparse
    parser = argparse.ArgumentParser(description="Generate list of missing forms")
    parser.add_argument("wordlist", help="wordlist")
    parser.add_argument("--allforms", required=True, help="all_forms file")
    parser.add_argument("--allpages", required=True, help="wiki.allpages")
    parser.add_argument("--articles", required=True, help="Language extract with raw articles, used for checking autofixes")
    parser.add_argument("--save", help="wiktionary commit message")
    parser.add_argument("--limit", type=int, help="Limit processing to first N articles")
    parser.add_argument("--progress", help="Display progress", action='store_true')
    args = parser.parse_args()

    global ARTICLE_FILE
    ARTICLE_FILE = args.articles

    wordlist = Wordlist.from_file(args.wordlist)
    allforms = AllForms.from_file(args.allforms)
    fixer = FormFixer(wordlist)
    fixrunner = FixRunner("es", wordlist, allforms)

    with open(args.allpages) as infile:
        # Loading the entire contents of allpages takes 600M
        # To conserve memory, temporarily load allforms into a set
        # and then create a set of entries in allpages that are also in allforms
        allforms_set = set(allforms.all_forms)
        allpages = {x.strip() for x in infile if x.strip() in allforms_set}
        del allforms_set

    # form = "achaparrándolo"
    # declared_forms = fixer.get_declared_forms(form, wordlist, allforms)
    # existing_forms = get_existing_forms(form, wordlist)
    # missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
    # print("declared", declared_forms)
    # print("existing", existing_forms)
    # print("missing", missing_forms)
    # print("unexpected", unexpected_forms)
    # exit()

    count = 0
    for form in allforms.all_forms:

        # Fix for conversion from <sup>x</sup> -> ^x
        if "^" in form:
            continue

        try:
            declared_forms = fixer.get_declared_forms(form, wordlist, allforms)
        except ValueError as e:
            print("ERROR", e)
            #error("form_errors", form, str(e))
            continue

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count >= args.limit:
            break
        count += 1

        existing_forms = get_existing_forms(form, wordlist)
        missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)

        missing_pos = []
        for item in missing_forms:
            if item.form != form:
                raise ValueError(form, item)

            if not FormFixer.can_handle_formtype(item.formtype):
                continue

            # TODO: for now skip multi word verbs
            if item.pos == "v" and " " in item.lemma:
                continue

            if item.pos == "n" and item.formtype == "m":
                error("should_be_lemma", form, item)
                continue

            words = list(wordlist.get_words(form, item.pos))
            if not words:
                matches = list(wordlist.get_words(form))
                if matches:
                    if item.pos in missing_pos:
                        continue
                    ety = {w.etymology for w in matches}
                    level = 4 if len(ety) > 1 else 3
                    # error("missing_pos_multi_ety", form, item)
                    items = [i for i in missing_forms if i.pos == item.pos]
                    if fixer.can_handle(item):
                        pos_text = "\n".join(fixer.full_pos(item.form, level, items))
                    else:
                        pos_text = ""
                    error("missing_pos", form, item, pos_text)
                    missing_pos.append(item.pos)
                else:
                    if form in allpages:
                        error("missing_entry", form, item)
                continue

            # if pos == "n" and formtype == "pl" and unexpected_forms:
            #     masculines = get_masculines_from_fpl(words[0])
            #     masculine_links = [m for m in masculines if (pos, "fpl", m) in unexpected_forms]
            #     if masculine_links:
            #         for m in masculine_links:
            #             unexpected_forms.remove((pos, "fpl", m))
            #         print(f"{form}:{pos} links to masculine {masculine_links} instead of feminine $is_doublet")
            #         continue

            error("missing_sense", form, item)

        for item in sorted(unexpected_forms):
            words = list(wordlist.get_words(item.lemma, item.pos))
            if words:
                error("unexpected_form", form, item)
            else:
                error("missing_lemma", form, item)

    if args.save:
        base_url = "User:JeffDoozan/lists/es/forms"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        logger.save("forms", FileSaver)