def test_asco(): wordlist_data = """\ asca {n-meta} :: x asca {n-forms} :: pl=ascas asca {m} [mycology] | teca :: ascus asco {n-meta} :: x asco {n-forms} :: pl=ascos asco {m} :: disgust asco {m} :: nausea asco {n-meta} :: x asco {n-forms} :: pl=ascos asco {m} :: alternative form of "asca" """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.all_forms.get_lemmas("asco") == ['n|asco', 'n|asca'] assert freq.get_lemmas("asco", "n") == ["asca", "asco"] assert freq.get_best_lemma("asco", ["asca", "asco"], "n") == "asco" flist_data = """\ asco 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_diva(): wordlist_data = """\ _____ diva pos: adj meta: {{head|es|adjective form}} gloss: adjective form of "divo" pos: n meta: {{es-noun|f|m=divo}} g: f gloss: diva _____ divo pos: adj meta: {{es-adj}} gloss: star (famous) pos: n meta: {{es-noun|m|f=diva}} g: m gloss: star, celeb """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.all_forms.get_lemmas("diva") == ['adj|divo', 'n|divo'] assert freq.get_lemmas("diva", "n") == ["divo"] flist_data = """\ diva 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_izquierdas(): wordlist_data = """\ _____ izquierda pos: adj meta: {{head|es|adjective form|g=f-s}} g: f-s gloss: adjective form of "izquierdo" pos: n meta: {{es-noun|f|-}} g: f gloss: left (side, direction) gloss: left q: politics _____ izquierdas pos: adj meta: {{head|es|adjective form}} gloss: adjective form of "izquierdo" pos: n meta: {{head|es|noun form|g=f-p}} g: f-p gloss: plural of "izquierda" _____ izquierdo pos: adj meta: {{es-adj}} gloss: left; on the left side or toward the left; the opposite of right syn: siniestro gloss: left-handed gloss: crooked _____ izquierdos pos: adj meta: {{head|es|adjective form|g=m-p}} g: m-p gloss: plural of "izquierdo" _____ """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) print(allforms.all_forms) assert freq.get_lemmas("izquierdas", "n") == ["izquierda"] assert freq.get_lemmas("izquierdo", "adj") == ["izquierdo"] assert freq.get_lemmas("izquierdos", "adj") == ["izquierdo"] assert freq.get_lemmas("izquierdas", "adj") == ["izquierdo"] assert freq.get_ranked_pos("izquierda") == ['n', 'adj'] assert freq.get_ranked_pos("izquierdas") == ['n', 'adj'] flist_data = """\ izquierda 34629 izquierdo 8150 izquierdas 436 izquierdos 234 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def wordlist():
    """Return a small Wordlist holding actor, actriz, alegre, dentista and rojo.

    The entries are written in the ``_____``-separated record format and
    handed to ``Wordlist`` as a list of lines.
    """
    records = """\
_____
actor
pos: n
  meta: {{es-noun|m|f=actriz|f2=+}}
  g: m
  etymology: From Latin "actor".
  gloss: An actor (person who performs in a theatrical play or movie)
_____
actriz
pos: n
  meta: {{es-noun|f|m=actor}}
  gloss: actress
_____
alegre
pos: adj
  meta: {{es-adj}}
  gloss: joyful, cheerful
_____
dentista
pos: n
  meta: {{es-noun|mf}}
  g: mf
  etymology: diente + -ista
  gloss: dentist
_____
rojo
pos: adj
  meta: {{es-adj}}
  gloss: red
"""
    lines = records.splitlines()
    return Wordlist(lines)
def main():
    """Print, sorted, every verb that has a sense without a type qualifier.

    Reads the wordlist named on the command line.  Only entries whose pos
    is "v", that have at least one sense, and whose meta does not contain
    "verb form" are examined.  A sense counts as typed when its qualifier
    contains "transitive", "reflexive" or "pronominal" (a substring search,
    so "intransitive" and "ditransitive" also qualify).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="List verbs missing a type label in at least one sense")
    parser.add_argument("wordlist", help="wordlist")
    args = parser.parse_args()

    wordlist = Wordlist.from_file(args.wordlist)

    def lacks_type(sense):
        # A missing/empty qualifier counts as untyped.
        if not sense.qualifier:
            return True
        return not re.search(
            r"(transitive|reflexive|pronominal)", sense.qualifier)

    untyped = set()
    for word in wordlist.iter_all_words():
        if (word.pos != "v"
                or not word.senses
                or (word.meta and "verb form" in word.meta)):
            continue

        # One untyped sense is enough to report the verb.
        if any(lacks_type(sense) for sense in word.senses):
            untyped.add(word.word)

    for verb in sorted(untyped):
        print(verb)
def test_aquellos(): wordlist_data = """\ aquél {pron-meta} :: {{head|es|pronoun|demonstrative, feminine|aquélla|neuter|aquello|masculine plural|aquéllos|feminine plural|aquéllas|g=m}} aquél {pron-forms} :: demonstrative_feminine=aquélla; feminine_plural=aquéllas; masculine_plural=aquéllos; neuter=aquello aquél {pron} [demonstrative] :: that one (far from speaker and listener) aquéllos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}} aquéllos {pron} :: plural of "aquél"; those ones (far from speaker and listener) aquel {pron-meta} :: {{head|es|pronoun|g=m|feminine|aquella|neutrum|aquello|masculine plural|aquellos|neutrum plural|aquellos|feminine plural|aquellas}} aquel {pron-forms} :: feminine=aquella; feminine_plural=aquellas; masculine_plural=aquellos; neutrum=aquello; neutrum_plural=aquellos aquel {pron} [demonstrative] :: alternative spelling of "aquél" aquellos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}} aquellos {pron} :: alternative spelling of "aquéllos"; those ones (over there; implying some distance). The unaccented form can function as a pronoun if it can be unambiguously deduced as such from context. aquellos {pron-meta} :: {{head|es|pronoun|g=n-p}} aquellos {pron} :: Those ones. (over there; implying some distance) """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.get_lemmas("aquellos", "pron") == ['aquellos', 'aquél'] assert freq.get_best_lemma("aquellos", ['aquellos', 'aquél'], "pron") == "aquél" flist_data = """\ aquellos 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_format_def():
    """Check DeckBuilder.get_usage grouping and DeckBuilder.format_def HTML.

    The first half builds a deck from a single verb entry and checks how
    its senses are grouped by qualifier.  The second half feeds hand-made
    usage dicts to ``format_def`` and compares the generated markup.
    """
    rendir_entry = """\
_____
rendir
pos: v
  meta: {{es-verb|rend|ir|pres=rindo}} {{es-conj-ir|r|nd|p=e-i|combined=1}}
  gloss: to conquer
    q: transitive
  gloss: to tire, exhaust
    q: transitive
  gloss: to yield, pay, submit, pass down
    q: ditransitive
  gloss: to vomit
    q: intransitive
  gloss: to make headway
    q: intransitive
  gloss: to surrender, give in, give up
    q: reflexive
  gloss: to be paid (homage or tribute)
    q: reflexive
"""

    wordlist = Wordlist(rendir_entry.splitlines())
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list (no sentences, empty ignore list)
    deck = DeckBuilder(wordlist, None, [], allforms)

    usage = deck.get_usage("rendir", "v")
    print(usage)
    expected_usage = {
        'v': {'ditransitive': ['to yield, pay, submit, pass down']},
        'vi': {'': ['to vomit', 'to make headway']},
        'vr': {'': ['to surrender, give in, give up',
                    'to be paid (homage or tribute)']},
        'vt': {'': ['to conquer', 'to tire, exhaust']},
    }
    assert usage == expected_usage

    item = {'m/f': {'': ['retiree, pensioner (retired person)']}}
    assert DeckBuilder.format_def(item, hide_word="jubilado") == '<span class="pos n m_f mf">{mf} <span class="usage">retiree, pensioner (retired person)</span></span>'

    format_def = DeckBuilder.format_def

    # (usage dict, expected markup) pairs for tagged masculine nouns
    tagged_cases = [
        (
            {"m": {"tag": ["def1", "def2"]}},
            """<span class="pos n m"> <span class="tag">[tag]:</span><span class="usage">def1; def2</span></span>""",
        ),
        (
            {"m": {"Peru": ["def1", "def2"]}},
            """<span class="pos n m only-latin-america only-peru only-south-america peru"> <span class="tag">[Peru]:</span><span class="usage">def1; def2</span></span>""",
        ),
        (
            {"m": {"South America": ["def1", "def2"]}},
            """<span class="pos n m only-latin-america only-south-america south-america"> <span class="tag">[South America]:</span><span class="usage">def1; def2</span></span>""",
        ),
    ]
    for item, expected in tagged_cases:
        assert format_def(item) == expected

    # With hide_word set, only the 'sewer, storm drain' sense is expected
    item = {'f': {'': ['sewer, storm drain'], 'zoology': ['cloaca']}}
    assert format_def(item, hide_word='cloaca') == """<span class="pos n f"> <span class="usage">sewer, storm drain</span></span>"""
def test_filters():
    """Check DeckBuilder gloss filtering with and without an ignore list.

    Without an ignore list every sense of abuela/abuelo is reported by
    ``get_usage``.  With the ignore list loaded, all feminine "abuela"
    senses and the "loose tufts" sense of "abuelo" are filtered out
    (``filter_gloss`` returns None for them).
    """
    ignore_data = """\
# comment
#
- abuela {f}
- abuelo {m} :: loose tufts
"""

    wordlist_data = """\
abuela {n-meta} :: {{es-noun|m=abuelo}}
abuela {f} :: grandmother, female equivalent of "abuelo"
abuela {f} [colloquial] :: old woman
abuela {f} [Mexico] :: a kind of flying ant
abuelo {n-meta} :: {{es-noun|f=abuela}}
abuelo {m} :: grandfather
abuelo {m} [colloquial, endearing] :: an elderly person
abuelo {m} | tolano :: loose tufts of hair in the nape when one's hair is messed up
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    assert deck.filter_gloss("abuela", "", "", "grandmother") == "grandmother"
    assert deck.filter_gloss("abuela", "", "", 'grandmother, female equivalent of "abuelo"') == "grandmother"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f': {
            'f': ['grandmother'],
            'f, colloquial': ['old woman'],
            'f, Mexico': ['a kind of flying ant'],
            'm': ['grandfather', "loose tufts of hair in the nape when one's hair is messed up"],
            'm, colloquial, endearing': ['an elderly person'],
        }}

    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    # "x" is not a pos named in the ignore list, so the gloss passes through
    assert deck.filter_gloss("abuela", "x", "", "grandmother") == "grandmother"
    # Filtered glosses come back as None; compare by identity, not equality
    assert deck.filter_gloss("abuela", "f", "", "grandmother") is None
    assert deck.filter_gloss("abuela", "f", "colloquial", "old woman") is None
    assert deck.filter_gloss("abuelo", "m", "", "loose tufts of hair") is None
    assert deck.filter_gloss("abuelo", "m", "", "grandfather") == "grandfather"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f': {
            '': ['grandfather'],
            'colloquial, endearing': ['an elderly person'],
        }}
def test_lemma_filters(): wordlist_data = """\ _____ ir pos: v meta: {{es-verb}} {{es-conj}} {{es-conj|irse}} gloss: to go (away from speaker and listener) q: intransitive gloss: to come (towards or with the listener) q: intransitive gloss: to be going to (near future), to go (+ a + infinitive) q: auxiliary gloss: to go away, to leave, to be off (see irse) q: reflexive _____ irse pos: v meta: {{es-verb}} {{es-conj}} gloss: to go away, to leave, to depart, to go (when the destination is not essential; when something or someone is going somewhere else) syn: andarse; marcharse gloss: to leak out (with liquids and gasses), to boil away, to go flat (gas in drinks) gloss: to overflow gloss: to go out (lights) gloss: to finish, to wear out, to disappear (e.g. money, paint, pains, mechanical parts) gloss: to die gloss: to break wind, to fart q: informal gloss: to wet/soil oneself (i.e., urinate or defecate in one's pants) q: informal gloss: to come, to cum, to e*******e, to o****m q: vulgar """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) print(allforms.all_forms["nos vamos"]) assert freq.all_forms.get_lemmas("vamos") == ['v|ir'] assert freq.all_forms.get_lemmas("nos vamos") == ['v|ir', 'v|irse'] assert freq.get_lemmas("vamos", "v") == ["ir"] assert freq.get_lemmas("ir", "v") == ["ir"] assert freq.include_word("vamos", "v") == True assert freq.filter_pos("vamos", ["v"]) == ["v"] # assert len(freq.wordlist.get_words("vamos", "v")) > 0 assert freq.get_ranked_pos("vamos") == ["v"] assert freq.get_lemmas("vamos", "v") == ["ir"] flist_data = """\ vamos 10 va 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_filters2():
    """Check that ignoring one gender of a two-gender noun leaves the other.

    "test" has a masculine and a feminine entry.  Without an ignore list
    both appear under 'm-f'; once ``- test {f}`` is ignored only the
    masculine sense remains, reported under 'm'.
    """
    ignore_data = """\
- test {f}
"""

    wordlist_data = """\
test {n-meta} :: x
test {n-forms} :: pl=tests
test {m} :: masculine
test {n-meta} :: x
test {n-forms} :: pl=tests
test {f} :: feminine
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm-f': {
            'f': ['feminine'],
            'm': ['masculine'],
        }}

    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm': {
            '': ['masculine'],
        }}
def __init__(self, lang_id, wordlist=None, debug=()):
    """Initialise per-language state and, optionally, load a wordlist.

    lang_id  -- key looked up in the module-level ``lang_ids`` table; the
                looked-up value is stored as ``LANG_SECTION``.
    wordlist -- when a ``str`` it is passed to ``Wordlist.from_file``;
                any other value (including None) is stored unchanged.
    debug    -- iterable stored as the set ``_debug_fix``.
    """
    self.LANG_SECTION = lang_ids[lang_id]
    self.LANG_ID = lang_id

    self._problems = {}
    self._stats = {}
    self._debug_fix = set(debug)
    self.fixes = set()

    if isinstance(wordlist, str):
        # A string is treated as a filename to load
        self.wordlist = Wordlist.from_file(wordlist)
    else:
        self.wordlist = wordlist
def test_veros():
    """"veros" should be ranked as a verb first and a noun second."""
    entries = """\
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
vero {n-meta} :: {{es-noun|m}}
vero {m} [heraldry] :: vair
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    ranked = flist.get_ranked_pos("veros")
    assert ranked == ["v", "n"]
def test_filters():
    """The adjective that is only an obsolete form of "test" is filtered out.

    Both ``filter_pos`` and ``get_ranked_pos`` are expected to keep only
    the noun.
    """
    entries = """\
test {n-meta} :: x
test {m} :: test
test {adj-meta} :: x
test {adj} :: obsolete form of "test"
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    noun_only = ["n"]
    assert flist.filter_pos("test", ["n", "adj"]) == noun_only
    assert flist.get_ranked_pos("test") == noun_only
def test_piernas():
    """"piernas" resolves to both pierna and piernas; pierna is preferred."""
    entries = """\
pierna {n-meta} :: {{es-noun|f}}
pierna {n-forms} :: pl=piernas
pierna {f} | pata :: leg (lower limb of a human)
piernas {n-meta} :: {{es-noun|m|piernas}}
piernas {n-forms} :: pl=piernas
piernas {m} [dated] :: twit; idiot
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    candidates = ['pierna', 'piernas']
    assert flist.get_lemmas("piernas", "n") == candidates

    best = flist.get_best_lemma("piernas", candidates, "n")
    assert best == "pierna"
def test_hamburguesa():
    """"hamburguesa" has two candidate lemmas; hamburguesa itself is best."""
    entries = """\
hamburgués {n-meta} :: {{es-noun|m|hamburgueses|f=hamburguesa|fpl=hamburguesas}}
hamburgués {n-forms} :: f=hamburguesa; fpl=hamburguesas; pl=hamburgueses
hamburgués {m} :: Hamburger, a person from Hamburg
hamburguesa {n-meta} :: {{es-noun|f}}
hamburguesa {n-forms} :: pl=hamburguesas
hamburguesa {f} :: hamburger
hamburguesa {f} :: female equivalent of "hamburgués"; Hamburger
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    candidates = ['hamburguesa', 'hamburgués']
    assert flist.get_lemmas("hamburguesa", "n") == candidates

    best = flist.get_best_lemma("hamburguesa", candidates, "n")
    assert best == "hamburguesa"
def test_simple2():
    """"roja" should be ranked as an adjective first and a noun second."""
    entries = """\
rojo {adj-meta} :: {{es-adj|f=roja}}
rojo {adj} :: red (colour)
rojo {n-meta} :: {{es-noun|m}}
rojo {m} :: red (colour)
rojo {m} [Costa Rica] :: a 1000 colón bill
rojo {m} [Spain, derogatory] :: a left-wing, especially communist
roja {n-meta} :: {{es-noun|f|m=rojo}}
roja {f} :: Red (Communist)
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    ranked = flist.get_ranked_pos("roja")
    assert ranked == ["adj", "n"]
def test_microondas():
    """"microondas" maps to microonda and microondas; microondas is best."""
    entries = """\
microonda {n-meta} :: {{es-noun|f}}
microonda {n-forms} :: pl=microondas
microonda {f} :: microwave (electromagnetic wave)
microondas {n-meta} :: {{es-noun|m|microondas}}
microondas {n-forms} :: pl=microondas
microondas {m} | horno de microondas :: microwave oven, microwave
microondas {m} :: necklacing (execution by burning tyre)
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    candidates = ["microonda", "microondas"]
    assert flist.get_lemmas("microondas", "n") == candidates

    best = flist.get_best_lemma("microondas", candidates, "n")
    assert best == "microondas"
def test_rasguno(): wordlist_data = """\ rasguñar {v-meta} :: {{es-verb}} {{es-conj}} rasguñar {vt} | arañar; rascar :: to scratch rasguño {n-meta} :: {{es-noun}} rasguño {m} | arañazo :: scratch """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.all_forms.get_lemmas("rasguño") == ['v|rasguñar', 'n|rasguño'] assert freq.get_ranked_pos("rasguño") == ["n", "v"] flist_data = """\ rasguño 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_veras():
    """"veras" maps to vera and veras; the plural-only noun veras is best."""
    entries = """\
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} [poetic] | lado :: side, face
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} :: verawood (Bulnesia arborea)
veras {n-meta} :: {{es-noun|f-p}}
veras {fp} :: truth; reality
veras {fp} :: serious things
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    assert flist.get_lemmas("veras", "n") == ["vera", "veras"]

    best = flist.get_best_lemma("veras", ["vera", "veras"], "n")
    assert best == "veras"
def test_vete():
    """"vete" maps to ir, ver and vetar; ir is chosen as the best lemma."""
    entries = """\
ir {v-meta} :: {{es-verb}} {{es-conj}} {{es-conj|irse}}
ir {v} :: x
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
verse {v-meta} :: {{es-verb}} {{es-conj}}
verse {v} :: x
vetar {v-meta} :: {{es-verb}} {{es-conj}}
vetar {v} :: x
"""

    words = Wordlist(entries.splitlines())
    forms = AllForms.from_wordlist(words)
    flist = FrequencyList(words, forms, sentences)

    assert flist.get_lemmas("vete", "v") == ['ir', 'ver', 'vetar']

    best = flist.get_best_lemma("vete", ['ir', 'ver', 'vetar'], "v")
    assert best == "ir"
def main():
    """Run ``check_word`` over a wordlist and save whatever was logged.

    Command-line options:
      --wordlist  file to load (required; must exist)
      --limit     stop after this many words
      --progress  print a running count to stderr every 1000 words
      --save      commit message; when given, results are saved through
                  WikiSaver, otherwise through FileSaver

    Raises FileNotFoundError when the wordlist path is not a file.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Find lemmas with only 'form of' senses")
    cli.add_argument("--wordlist", help="wordlist to load", required=True)
    cli.add_argument("--limit", type=int,
                     help="Limit processing to first N articles")
    cli.add_argument("--progress", help="Display progress",
                     action='store_true')
    cli.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = cli.parse_args()

    if not os.path.isfile(args.wordlist):
        raise FileNotFoundError(f"Cannot open: {args.wordlist}")

    wordlist = Wordlist.from_file(args.wordlist)

    # ``seen`` is the number of words already handed to check_word
    for seen, word in enumerate(wordlist.iter_all_words()):
        if args.progress and seen % 1000 == 0:
            print(seen, end='\r', file=sys.stderr)

        if args.limit and seen >= args.limit:
            break

        check_word(word)

    if not args.save:
        logger.save("", FileSaver)
    else:
        logger.save("User:JeffDoozan/lists", WikiSaver,
                    commit_message=args.save)
def test_bienes(): wordlist_data = """\ bien {n-meta} :: {{es-noun|m|bienes}} bien {m} :: good (as opposed to evil) bienes {n-meta} :: {{es-noun|m-p}} bienes {mp} :: goods (that which is produced, traded, bought or sold) """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.all_forms.get_lemmas("bienes") == ['n|bien', 'n|bienes'] assert freq.get_lemmas("bienes", "n") == ["bien", "bienes"] assert freq.get_best_lemma("bienes", ["bien", "bienes"], "n") == "bienes" flist_data = """\ bienes 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def main():
    """Log every single-word verb ending in "r" that also exists with "se".

    For each verb ``X`` in the dictionary (pos "v", no space in the word,
    ending in "r") the word is logged when ``Xse`` is also present as a
    verb.  Results are saved through WikiSaver when --save is given,
    otherwise through FileSaver.
    """
    cli = argparse.ArgumentParser(description="Find verbs with split data")
    cli.add_argument("--dictionary", help="Dictionary file name",
                     required=True)
    cli.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = cli.parse_args()

    wordlist = Wordlist.from_file(args.dictionary)

    for entry in wordlist.iter_all_words():
        is_candidate = (
            entry.pos == "v"
            and " " not in entry.word
            and entry.word.endswith("r")
        )
        if is_candidate and wordlist.has_word(entry.word + "se", "v"):
            log(entry.word)

    if not args.save:
        logger.save("", FileSaver)
    else:
        logger.save("User:JeffDoozan/lists", WikiSaver,
                    commit_message=args.save)
def test_hijo(): wordlist_data = """\ hija {n-meta} :: x hija {n-forms} :: m=hijo; mpl=hijos; pl=hijas hija {f} :: daughter; feminine noun of "hijo" hijo {n-meta} :: x hijo {n-forms} :: f=hija; fpl=hijas; pl=hijos hijo {m} :: son hijo {m} :: child (when the gender of the child is unknown) """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.all_forms.get_lemmas("hijo") == ['n|hijo'] assert freq.get_lemmas("hijo", "n") == ["hijo"] flist_data = """\ hijo 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
def test_dios(): wordlist_data = """\ dios {n-meta} :: {{es-noun|m|dioses|f=diosa}} dios {n-forms} :: f=diosa; fpl=diosas; pl=dioses dios {m} :: god diosa {n-meta} :: {{es-noun|f|m=dios}} diosa {n-forms} :: m=dios; mpl=dios; pl=diosas diosa {f} :: goddess diosa {n-meta} :: {{es-noun|f}} diosa {n-forms} :: pl=diosas diosa {f} [biochemistry] :: diose """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.get_lemmas("dioses", "n") == ["dios"] assert freq.get_lemmas("diosas", "n") == ["dios", "diosa"] assert freq.get_lemmas("diosa", "n") == ["dios", "diosa"] assert freq.get_best_lemma("diosa", ["dios", "diosa"], "n") == "dios" # assert list(freq.all_forms.get_lemmas("dios", {})) == ['n:dios:m'] # assert list(freq.all_forms.get_lemmas("dioses", {})) == ['n:dios:pl'] # assert list(freq.all_forms.get_lemmas("diosa", {})) == ["n:dios:f"] # assert list(freq.all_forms.get_lemmas("diosas", {})) == ["n:diosa:pl"] flist_data = """\ dios 10 dioses 10 diosa 10 diosas 10 """ assert "\n".join(freq.process(flist_data.splitlines())) == """\
) parser.add_argument("--low-mem", help="Use less memory", action='store_true', default=False) args = parser.parse_args() if not args.sentences: args.sentences = "sentences.tsv" if not args.data_dir: args.data_dir = os.environ.get("SPANISH_DATA_DIR", "spanish_data") if not args.custom_dir: args.custom_dir = os.environ.get("SPANISH_CUSTOM_DIR", "spanish_custom") with open(args.dictionary) as wordlist_data: cache_words = not args.low_mem wordlist = Wordlist(wordlist_data, cache_words=cache_words) print("wordlist", mem_use(), file=sys.stderr) ignore_data = open(args.ignore) if args.ignore else [] if args.allforms: allforms = AllForms.from_file(args.allforms) else: allforms = AllForms.from_wordlist(wordlist) print("all_forms", mem_use(), file=sys.stderr) sentences = spanish_sentences( sentences=args.sentences, data_dir=args.data_dir, custom_dir=args.custom_dir ) flist = FrequencyList(wordlist, allforms, sentences)
def main():
    """Log nouns whose plural is used at least as often as the singular.

    For every form in the ngram probability data, the noun usage of the
    form is compared against the noun usage of each plural listed for it
    in the dictionary.  A (form, plural, singular usage, plural usage)
    record is logged whenever the plural's usage is >= the singular's.
    Results are saved through WikiSaver when --save is given, otherwise
    through FileSaver.
    """
    cli = argparse.ArgumentParser(description="Find usually plural nouns")
    cli.add_argument("--dictionary", help="Dictionary file name",
                     required=True)
    cli.add_argument("--ngprobs", help="Ngram probability data file")
    cli.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    args = cli.parse_args()

    probs = NgramPosProbability(args.ngprobs)
    wordlist = Wordlist.from_file(args.dictionary)

    def noun_usage(total, form_count):
        """Return the usage to credit to the noun, or None to reject.

        With no detected POS at all the whole total is credited.
        Otherwise the first POS key must be "n" and the noun count must
        be at least 60% of the total.
        """
        if not form_count:
            return total
        if next(iter(form_count.keys())) != "n":
            return None
        count = form_count.get("n", 0)
        if count / total < .6:
            return None
        return count

    for form, _data in probs.form_probs.items():
        s_total, s_form_count = probs.get_data(form)
        s_usage = noun_usage(s_total, s_form_count)
        # Rejected (None) and zero usage are both skipped
        if not s_usage:
            continue

        words = wordlist.get_words(form, "n")
        if not words:
            continue

        plurals = [pl for word in words for pl in word.forms.get("pl", [])]
        for plural in plurals:
            # Invariant nouns list themselves as their own plural
            if plural == form:
                continue

            pl_total, pl_form_count = probs.get_data(plural)
            if not pl_total:
                continue

            pl_usage = noun_usage(pl_total, pl_form_count)
            if pl_usage is None:
                continue

            if pl_usage >= s_usage:
                log(form, plural, s_usage, pl_usage)

    if not args.save:
        logger.save("", FileSaver)
    else:
        logger.save("User:JeffDoozan/lists", WikiSaver,
                    commit_message=args.save)
) parser.add_argument("--tags", nargs=1, help="Merged tagged data with original data") parser.add_argument("--dictionary", help="Dictionary file", required=True) parser.add_argument("--allforms", help="Load word forms from file") parser.add_argument("--low-mem", help="Use less memory", action='store_true', default=False) args = parser.parse_args() if not os.path.isfile(args.sentences): raise FileNotFoundError(f"Cannot open: {args.sentences}") if args.tags and not os.path.isfile(args.tags[0]): raise FileNotFoundError(f"Cannot open: {args.tags}") cache_words = not args.low_mem with open(args.dictionary) as infile: wordlist = Wordlist(infile, cache_words=cache_words) if args.allforms: all_forms = AllForms.from_file(args.allforms) else: all_forms = AllForms.from_wordlist(wordlist) def tag_to_pos(tag, word): lemma = tag["lemma"] ctag = tag["ctag"] pos = None if ctag.startswith("A"): # and lemma not in ["el", "la", "uno"]: pos = "adj" elif ctag.startswith("C"): # and lemma not in ["si", "que"]:
def test_simple(): wordlist_data = """\ _____ protector pos: n meta: {{es-noun|m|f=+|f2=protectriz}} g: m gloss: protector (someone who protects or guards) pos: n meta: {{es-noun|m}} g: m gloss: protector (a device or mechanism which is designed to protect) _____ protectora pos: n meta: {{es-noun|f|m=protector}} forms: m=protector; mpl=protectores; pl=protectoras g: f gloss: female equivalent of "protector" pos: n meta: {{es-noun|f}} forms: pl=protectoras g: f gloss: animal shelter (an organization that provides temporary homes for stray pet animals) syn: protectora de animales _____ protectoras pos: n meta: {{head|es|noun plural form|g=f-p}} g: f-p gloss: inflection of "protector" _____ protectores pos: n meta: {{head|es|noun plural form|g=m-p}} g: m-p gloss: inflection of "protector" _____ protectrices pos: n meta: {{head|es|noun plural form|g=f-p}} g: f-p gloss: inflection of "protector" _____ protectriz pos: n meta: {{es-noun|f|m=protector}} forms: m=protector; mpl=protectores; pl=protectrices g: f gloss: alternative form of "protectora" q: uncommon """ flist_data = """\ protector 10 protectora 10 protectoras 10 protectores 10 protectriz 10 protectrices 10 unknown 10 """ wordlist = Wordlist(wordlist_data.splitlines()) allforms = AllForms.from_wordlist(wordlist) freq = FrequencyList(wordlist, allforms, sentences) assert freq.wordlist.has_lemma("protectora", "n") == False assert freq.get_lemmas("protectores", "n") == ["protector"] assert freq.get_lemmas("protectoras", "n") == ["protector", "protectora"] assert freq.get_lemmas("notaword", "n") == ["notaword"] assert freq.get_ranked_pos("protectoras") == ["n"] assert "\n".join(freq.process(flist_data.splitlines())) == """\
def wordlist():
    """Return a Wordlist built from a fixed set of Spanish dictionary entries.

    The entries (Señor through vosotres) are written in the
    ``_____``-separated record format and handed to ``Wordlist`` as a list
    of lines.
    """
    records = """\
_____
Señor
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  gloss: alternative letter-case form of "señor", used before a name (also Sr.)
_____
Señora
pos: n
  meta: {{es-noun|f|m=Señor}}
  g: f
  gloss: alternative letter-case form of "señora", used before a name
_____
ababillarse
pos: v
  meta: {{es-verb}} {{es-conj|nocomb=1}}
  etymology: From a- + babilla ("the stifle (as of a horse)") + -ar.
  gloss: to be sick with the stifle (of horses and other quadrupeds)
    q: veterinary medicine, Chile, Mexico
_____
abjad
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: abjad (writing system)
    q: linguistics
_____
aborregarse
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: verb
_____
aborrascarse
pos: v
  meta: {{es-verb}} {{es-conj}}
  etymology: a + borrasca
  gloss: to get stormy
    q: reflexive
_____
abuelito
pos: n
  meta: {{es-noun|m|f=abuelita}}
  g: m
  gloss: diminutive of "abuelo", grandfather, gramps, grandpa
_____
abyad
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: alternative form of "abjad"
_____
académico
pos: adj
  meta: {{es-adj}}
  gloss: academic
pos: n
  meta: {{es-noun|m|f=académica}}
  g: m
  gloss: academician, academic
_____
accidentar
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to cause an accident
_____
accidentarse
pos: v
  meta: {{es-verb}} {{es-conj|nocomb=1}}
  gloss: to have an accident, get into an accident, crash
_____
actor
pos: n
  meta: {{es-noun|m|f=actriz|f2=+}}
  g: m
  etymology: From Latin "actor".
  gloss: An actor (person who performs in a theatrical play or movie)
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  etymology: From Latin "actor".
  gloss: A defendant
    q: law
_____
aduanero
pos: adj
  meta: {{es-adj}}
  etymology: From aduana + -ero.
  gloss: customs
    q: relational
    syn: aduanal
pos: n
  meta: {{es-noun|mf|f=aduanera}}
  g: mf
  etymology: From aduana + -ero.
  gloss: customs officer
_____
alegre
pos: adj
  meta: {{es-adj}}
  gloss: joyful, cheerful
_____
ambos
pos: adj
  meta: {{head|es|adjective|g=m-p|feminine plural|ambas}}
  g: m-p
  gloss: both
    syn: los dos, las dos
pos: num
  meta: {{head|es|numeral}}
  gloss: both
pos: pron
  meta: {{head|es|pronoun}}
  gloss: both
_____
amigar
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to cause (people) to be friends
_____
amigue
pos: n
  meta: {{es-noun|m|g2=f|m=amigo|f=amiga}}
  g: m; f
  gloss: friend
    q: gender-neutral, neologism
_____
aparecido
pos: adj
  meta: {{es-adj}}
  gloss: appeared
pos: adj
  meta: {{es-noun|m}}
  g: m
  gloss: ghost, apparition, revenant
_____
aquél
pos: pron
  meta: {{head|es|pronoun|demonstrative||feminine|aquélla|neuter|aquello|masculine plural|aquéllos|feminine plural|aquéllas|g=m}}
  g: m
  gloss: that one (far from speaker and listener)
_____
ayuda
pos: n
  meta: {{es-noun|f}}
  g: f
  etymology: From ayudar (“to help”).
  gloss: help, aid, assistance
    syn: asistencia
pos: n
  meta: {{es-noun|mf}}
  g: mf
  etymology: From ayudar (“to help”).
  gloss: helper
    syn: ayudante
_____
bosniaca
pos: n
  meta: {{es-noun|f|m=bosniaco}}
  g: f
  gloss: female equivalent of "bosniaco"
_____
bosniaco
pos: n
  meta: {{es-noun|m|f=bosniaca}}
  g: m
  gloss: alternative spelling of "bosníaco"
_____
cabra
pos: n
  meta: {{es-noun|f|m=cabro}}
  g: f
  gloss: goat (unknown gender)
_____
caldeo
pos: adj
  meta: {{es-adj}}
  etymology: From Latin "Chaldaeus", from Ancient Greek "Χαλδαῖος", from Akkadian "𒅗𒀠𒌅".
  gloss: Chaldean
pos: n
  meta: {{es-noun|m|f=caldea}}
  g: m
  etymology: From Latin "Chaldaeus", from Ancient Greek "Χαλδαῖος", from Akkadian "𒅗𒀠𒌅".
  gloss: Chaldean
pos: v
  meta: {{head|es|verb form}}
  etymology: See caldear
  gloss: inflection of "caldear"
_____
chama
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: chama
_____
chamo
pos: n
  meta: {{es-noun|m|f=chama}}
  g: m
  gloss: kid, child
    q: Venezuela, colloquial
_____
comer
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to eat
_____
comida
pos: n
  meta: {{es-noun|f}}
  g: f
  gloss: food
_____
comidas
pos: n
  meta: {{head|es|noun form|g=f-p}}
  g: f-p
  gloss: plural of "comida"
_____
comido
pos: v
  meta: {{es-past participle|comid}}
  gloss: pp_ms of "comer"
_____
crudívoro
pos: adj
  meta: {{es-adj}}
  gloss: crudivorous
pos: n
  meta: {{es-noun|m|f=crudívora}}
  g: m
  gloss: crudivore
_____
del mismo
pos: adj
  meta: {{es-adj|f=de la misma|mpl=de los mismos|fpl=de las mismas}}
  gloss: of it, them (substantive, refers back to a previous word in the text [see usage notes])
_____
dentista
pos: n
  meta: {{es-noun|mf}}
  g: mf
  etymology: diente + -ista
  gloss: dentist
_____
descomer
pos: v
  meta: {{es-verb}} {{es-conj|nocomb=1}}
  etymology: des + comer
  gloss: to defecate
    q: euphemistic
_____
descomedirse
pos: v
  meta: {{es-verb|<i>}} {{es-conj|<i>}}
  gloss: to be rude or disrespectful
    q: reflexive
_____
errar
pos: v
  meta: {{es-verb|<ye[Spain],+[Latin America]>}} {{es-conj|<ye[Spain],+[Latin America]>}}
  gloss: to miss
_____
estanciera
pos: n
  meta: {{es-noun|f|m=estanciero}}
  g: f
  gloss: ranch owner
_____
exconseller
pos: n
  meta: {{es-noun|m|+|pl2=exconsellers}}
  g: m
  etymology: ex- + conseller
  gloss: former conseller
_____
fulano
pos: prop
  meta: {{head|es|proper noun|g=m|plural|fulanos|feminine|fulana|feminine plural|fulanas}}
  g: m
  gloss: alternative letter-case form of "Fulano", what's-his-name, so-and-so
_____
gongo
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: alternative form of "gong"
  gloss: bell or cowbell
    q: Puerto Rico
    syn: campana; cencerro
_____
granado
pos: adj
  meta: {{es-adj}}
  gloss: grained
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: pomegranate tree
_____
hijodalgo
pos: n
  meta: {{es-noun|m|hijosdalgo|f=hijadalgo|fpl=hijasdalgo|pl2=hijosdalgos}}
  g: m
  etymology: contraction of "hijo de algo"
  gloss: alternative form of "hidalgo"
_____
huila
pos: n
  meta: {{es-noun|f}}
  g: f
  etymology: From Mapudungun.
  gloss: rags (tattered clothes)
    q: colloquial, Chile
pos: n
  meta: {{es-noun|f}}
  g: f
  gloss: female equivalent of "huilo"
pos: adj
  meta: {{head|es|adjective form}}
  gloss: feminine singular of "huilo"
_____
huilo
pos: adj
  meta: {{es-adj}}
  gloss: crippled
    q: colloquial, Mexico
    syn: tullido
pos: n
  meta: {{es-noun|m|f=huila}}
  g: m
  gloss: a crippled person
    q: colloquial, Mexico
_____
kirguiso
pos: adj
  meta: {{es-adj}}
  gloss: of Kyrgyzstan; Kyrgyzstani (of or relating to Kyrgyzstan)
pos: n
  meta: {{es-noun|m|f=kirguisa}}
  g: m
  gloss: Kyrgyzstani (native or inhabitant of Kyrgyzstan)
_____
kirguís
pos: adj
  meta: {{es-adj}}
  gloss: Kyrgyz (Turkic ethnic group)
  gloss: alternative form of "kirguiso"
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  gloss: Kyrgyz (Turkic ethnic group)
  gloss: alternative form of "kirguiso" (inhabitant)
_____
malayo
pos: adj
  meta: {{es-adj}}
  gloss: Malay (from Malaysia)
pos: n
  meta: {{es-noun|m|f=+}}
  g: m
  gloss: Malay (person)
pos: n
  meta: {{es-noun|m|-}}
  g: m
  gloss: Malay (language)
_____
parada
pos: n
  meta: {{es-noun|f}}
  g: f
  etymology: From the feminine past participle of parar.
  gloss: stop (the act of stopping)
  gloss: station (a location where things stop)
pos: n
  meta: {{es-noun|f|m=parado}}
  g: f
  etymology: From the feminine past participle of parar.
  gloss: female equivalent of "parado"
_____
parado
pos: n
  meta: {{es-noun|m|f=parada}}
  g: m
  gloss: unemployed person
    syn: desempleado; cesante
_____
sumar
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to add
_____
sumir
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to submerge
_____
vosotres
pos: pron
  meta: {{head|es|pronoun|masculine|vosotros|feminine|vosotras|g=m|g2=f}}
  g: m; f
  gloss: you (plural)
    q: gender-neutral, neologism
"""
    lines = records.splitlines()
    return Wordlist(lines)