Example #1
def test_aquellos():

    wordlist_data = """\
aquél {pron-meta} :: {{head|es|pronoun|demonstrative, feminine|aquélla|neuter|aquello|masculine plural|aquéllos|feminine plural|aquéllas|g=m}}
aquél {pron-forms} :: demonstrative_feminine=aquélla; feminine_plural=aquéllas; masculine_plural=aquéllos; neuter=aquello
aquél {pron} [demonstrative] :: that one (far from speaker and listener)
aquéllos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}}
aquéllos {pron} :: plural of "aquél"; those ones (far from speaker and listener)
aquel {pron-meta} :: {{head|es|pronoun|g=m|feminine|aquella|neutrum|aquello|masculine plural|aquellos|neutrum plural|aquellos|feminine plural|aquellas}}
aquel {pron-forms} :: feminine=aquella; feminine_plural=aquellas; masculine_plural=aquellos; neutrum=aquello; neutrum_plural=aquellos
aquel {pron} [demonstrative] :: alternative spelling of "aquél"
aquellos {pron-meta} :: {{head|es|pronoun|demonstrative|g=m-p}}
aquellos {pron} :: alternative spelling of "aquéllos"; those ones (over there; implying some distance). The unaccented form can function as a pronoun if it can be unambiguously deduced as such from context.
aquellos {pron-meta} :: {{head|es|pronoun|g=n-p}}
aquellos {pron} :: Those ones. (over there; implying some distance)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("aquellos", "pron") == ['aquellos', 'aquél']

    assert freq.get_best_lemma("aquellos", ['aquellos', 'aquél'],
                               "pron") == "aquél"

    flist_data = """\
aquellos 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #2
def test_asco():

    wordlist_data = """\
asca {n-meta} :: x
asca {n-forms} :: pl=ascas
asca {m} [mycology] | teca :: ascus
asco {n-meta} :: x
asco {n-forms} :: pl=ascos
asco {m} :: disgust
asco {m} :: nausea
asco {n-meta} :: x
asco {n-forms} :: pl=ascos
asco {m} :: alternative form of "asca"
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("asco") == ['n|asco', 'n|asca']
    assert freq.get_lemmas("asco", "n") == ["asca", "asco"]
    assert freq.get_best_lemma("asco", ["asca", "asco"], "n") == "asco"

    flist_data = """\
asco 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #3
def test_diva():

    wordlist_data = """\
_____
diva
pos: adj
  meta: {{head|es|adjective form}}
  gloss: adjective form of "divo"
pos: n
  meta: {{es-noun|f|m=divo}}
  g: f
  gloss: diva
_____
divo
pos: adj
  meta: {{es-adj}}
  gloss: star (famous)
pos: n
  meta: {{es-noun|m|f=diva}}
  g: m
  gloss: star, celeb
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("diva") == ['adj|divo', 'n|divo']
    assert freq.get_lemmas("diva", "n") == ["divo"]

    flist_data = """\
diva 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #4
def test_izquierdas():
    wordlist_data = """\
_____
izquierda
pos: adj
  meta: {{head|es|adjective form|g=f-s}}
  g: f-s
  gloss: adjective form of "izquierdo"
pos: n
  meta: {{es-noun|f|-}}
  g: f
  gloss: left (side, direction)
  gloss: left
    q: politics
_____
izquierdas
pos: adj
  meta: {{head|es|adjective form}}
  gloss: adjective form of "izquierdo"
pos: n
  meta: {{head|es|noun form|g=f-p}}
  g: f-p
  gloss: plural of "izquierda"
_____
izquierdo
pos: adj
  meta: {{es-adj}}
  gloss: left; on the left side or toward the left; the opposite of right
    syn: siniestro
  gloss: left-handed
  gloss: crooked
_____
izquierdos
pos: adj
  meta: {{head|es|adjective form|g=m-p}}
  g: m-p
  gloss: plural of "izquierdo"
_____
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    print(allforms.all_forms)

    assert freq.get_lemmas("izquierdas", "n") == ["izquierda"]
    assert freq.get_lemmas("izquierdo", "adj") == ["izquierdo"]
    assert freq.get_lemmas("izquierdos", "adj") == ["izquierdo"]
    assert freq.get_lemmas("izquierdas", "adj") == ["izquierdo"]
    assert freq.get_ranked_pos("izquierda") == ['n', 'adj']
    assert freq.get_ranked_pos("izquierdas") == ['n', 'adj']

    flist_data = """\
izquierda 34629
izquierdo 8150
izquierdas 436
izquierdos 234
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #5
def test_format_def():

    wordlist_data = """\
_____
rendir
pos: v
  meta: {{es-verb|rend|ir|pres=rindo}} {{es-conj-ir|r|nd|p=e-i|combined=1}}
  gloss: to conquer
    q: transitive
  gloss: to tire, exhaust
    q: transitive
  gloss: to yield, pay, submit, pass down
    q: ditransitive
  gloss: to vomit
    q: intransitive
  gloss: to make headway
    q: intransitive
  gloss: to surrender, give in, give up
    q: reflexive
  gloss: to be paid (homage or tribute)
    q: reflexive
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("rendir", "v")
    print(usage)
    assert usage == {
          'v': {'ditransitive': ['to yield, pay, submit, pass down']},
          'vi': {'': ['to vomit', 'to make headway']},
          'vr': {'': ['to surrender, give in, give up',
                      'to be paid (homage or tribute)']},
          'vt': {'': ['to conquer', 'to tire, exhaust']}}


    item = {'m/f': {'': ['retiree, pensioner (retired person)']}}
    assert DeckBuilder.format_def(item, hide_word="jubilado") == '<span class="pos n m_f mf">{mf} <span class="usage">retiree, pensioner (retired person)</span></span>'


    format_def = DeckBuilder.format_def

    item = { "m": { "tag": [ "def1", "def2" ] } }
    assert format_def(item) == """<span class="pos n m"> <span class="tag">[tag]:</span><span class="usage">def1; def2</span></span>"""

    item = { "m": { "Peru": [ "def1", "def2" ] } }
    assert format_def(item) == """<span class="pos n m only-latin-america only-peru only-south-america peru"> <span class="tag">[Peru]:</span><span class="usage">def1; def2</span></span>"""

    item = { "m": { "South America": [ "def1", "def2" ] } }
    assert format_def(item) == """<span class="pos n m only-latin-america only-south-america south-america"> <span class="tag">[South America]:</span><span class="usage">def1; def2</span></span>"""

    item = {'f': {'': ['sewer, storm drain'], 'zoology': ['cloaca']}}
    assert format_def(item, hide_word='cloaca') == """<span class="pos n f"> <span class="usage">sewer, storm drain</span></span>"""
Example #6
def test_filters():
    ignore_data = """\
# comment
#
- abuela {f}
- abuelo {m} :: loose tufts
"""

    wordlist_data = """\
abuela {n-meta} :: {{es-noun|m=abuelo}}
abuela {f} :: grandmother, female equivalent of "abuelo"
abuela {f} [colloquial] :: old woman
abuela {f} [Mexico] :: a kind of flying ant
abuelo {n-meta} :: {{es-noun|f=abuela}}
abuelo {m} :: grandfather
abuelo {m} [colloquial, endearing] :: an elderly person
abuelo {m} | tolano :: loose tufts of hair in the nape when one's hair is messed up
"""


    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)
    assert deck.filter_gloss("abuela", "", "", "grandmother") == "grandmother"
    assert deck.filter_gloss("abuela", "", "", 'grandmother, female equivalent of "abuelo"') == "grandmother"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f':
        {'f': ['grandmother'],
         'f, colloquial': ['old woman'],
         'f, Mexico': ['a kind of flying ant'],
         'm': ['grandfather', "loose tufts of hair in the nape when one's hair is messed up"],
         'm, colloquial, endearing': ['an elderly person']
        }}


    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    assert deck.filter_gloss("abuela", "x", "", "grandmother") == "grandmother"
    assert deck.filter_gloss("abuela", "f", "", "grandmother") == None
    assert deck.filter_gloss("abuela", "f", "colloquial", "old woman") == None
    assert deck.filter_gloss("abuelo", "m", "", "loose tufts of hair") == None
    assert deck.filter_gloss("abuelo", "m", "", "grandfather") == "grandfather"

    usage = deck.get_usage("abuelo", "n")
    assert usage == {
        'm/f':
        {
         '': ['grandfather'],
         'colloquial, endearing': ['an elderly person']
        }}
Example #7
def test_filters2():
    ignore_data = """\
- test {f}
"""

    wordlist_data = """\
test {n-meta} :: x
test {n-forms} :: pl=tests
test {m} :: masculine
test {n-meta} :: x
test {n-forms} :: pl=tests
test {f} :: feminine
"""

    xwordlist_data = """\
_____
test
  forms: pl=tests
  pos: n
  form: m
  gloss: masculine
____
test
  forms: pl=tests
  pos: n
  form: f
  gloss: feminine
"""


    wordlist = Wordlist(wordlist_data.splitlines())
    sentences = None
    ignore = []
    allforms = AllForms.from_wordlist(wordlist)

    # Full definition without ignore list
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm-f':
        {'f': ['feminine'],
         'm': ['masculine']
        }}


    # With ignore list
    ignore = DeckBuilder.load_ignore_data(ignore_data.splitlines())
    deck = DeckBuilder(wordlist, sentences, ignore, allforms)

    usage = deck.get_usage("test", "n")
    assert usage == {
        'm':
        {
         '': ['masculine'],
        }}
Example #8
def test_lemma_filters():

    wordlist_data = """\
_____
ir
pos: v
  meta: {{es-verb}} {{es-conj}} {{es-conj|irse}}
  gloss: to go (away from speaker and listener)
    q: intransitive
  gloss: to come (towards or with the listener)
    q: intransitive
  gloss: to be going to (near future), to go (+ a + infinitive)
    q: auxiliary
  gloss: to go away, to leave, to be off (see irse)
    q: reflexive
_____
irse
pos: v
  meta: {{es-verb}} {{es-conj}}
  gloss: to go away, to leave, to depart, to go (when the destination is not essential; when something or someone is going somewhere else)
    syn: andarse; marcharse
  gloss: to leak out (with liquids and gasses), to boil away, to go flat (gas in drinks)
  gloss: to overflow
  gloss: to go out (lights)
  gloss: to finish, to wear out, to disappear (e.g. money, paint, pains, mechanical parts)
  gloss: to die
  gloss: to break wind, to fart
    q: informal
  gloss: to wet/soil oneself (i.e., urinate or defecate in one's pants)
    q: informal
  gloss: to come, to cum, to e*******e, to o****m
    q: vulgar
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    print(allforms.all_forms["nos vamos"])

    assert freq.all_forms.get_lemmas("vamos") == ['v|ir']
    assert freq.all_forms.get_lemmas("nos vamos") == ['v|ir', 'v|irse']
    assert freq.get_lemmas("vamos", "v") == ["ir"]
    assert freq.get_lemmas("ir", "v") == ["ir"]

    assert freq.include_word("vamos", "v") == True
    assert freq.filter_pos("vamos", ["v"]) == ["v"]
    #    assert len(freq.wordlist.get_words("vamos", "v")) > 0
    assert freq.get_ranked_pos("vamos") == ["v"]
    assert freq.get_lemmas("vamos", "v") == ["ir"]

    flist_data = """\
vamos 10
va 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #9
def main():

    import argparse
    argparser = argparse.ArgumentParser(description="Find -ismo without -ista and vice versa")
    argparser.add_argument("file", help="Extract file to read")
    argparser.add_argument("--allforms", help="Allforms for checking lemmas")
    argparser.add_argument("--save", help="Save to wiktionary with specified commit message")
    argparser.add_argument("--date", help="Date of the database dump (used to generate page messages)")
    argparser.add_argument("--limit", type=int, help="Limit processing to first N articles")
    argparser.add_argument("--progress", help="Display progress", action='store_true')
    args = argparser.parse_args()

    count = 0

    allforms = AllForms.from_file(args.allforms) if args.allforms else None
    all_lemmas = set(allforms.all_lemmas) if allforms else set()
    for article in WikiExtract.iter_articles_from_bz2(args.file):

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)

        if args.limit and count >= args.limit:
            break
        count += 1

        text = article.text
        path = article.title.split(":")
        page = path[0]
        pos = path[-1]

        if page.endswith("ismo"):
            error = "ismo_without_ista"
            search = page[:-4] + "ista"
        elif page.endswith("ista"):
            error = "ista_without_ismo"
            search = page[:-4] + "ismo"
        else:
            continue

        if page not in all_lemmas:
            continue

        if search in all_lemmas and search not in article.text:
            log(error, page)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
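The suffix swap above is plain string slicing: the last four characters ("ismo" or "ista") are dropped and the counterpart ending is appended before checking whether that lemma exists. A quick illustration of the slicing with a made-up input:

page = "altruismo"
search = page[:-4] + "ista"  # "altru" + "ista"
assert search == "altruista"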
Example #10
def test_veros():

    wordlist_data = """\
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
vero {n-meta} :: {{es-noun|m}}
vero {m} [heraldry] :: vair
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_ranked_pos("veros") == ["v", "n"]
Example #11
def test_filters():

    wordlist_data = """\
test {n-meta} :: x
test {m} :: test
test {adj-meta} :: x
test {adj} :: obsolete form of "test"
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.filter_pos("test", ["n", "adj"]) == ["n"]
    assert freq.get_ranked_pos("test") == ["n"]
Example #12
def test_piernas():
    wordlist_data = """\
pierna {n-meta} :: {{es-noun|f}}
pierna {n-forms} :: pl=piernas
pierna {f} | pata :: leg (lower limb of a human)
piernas {n-meta} :: {{es-noun|m|piernas}}
piernas {n-forms} :: pl=piernas
piernas {m} [dated] :: twit; idiot
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    lemmas = ['pierna', 'piernas']
    assert freq.get_lemmas("piernas", "n") == lemmas
    assert freq.get_best_lemma("piernas", lemmas, "n") == "pierna"
Example #13
def test_hamburguesa():
    wordlist_data = """\
hamburgués {n-meta} :: {{es-noun|m|hamburgueses|f=hamburguesa|fpl=hamburguesas}}
hamburgués {n-forms} :: f=hamburguesa; fpl=hamburguesas; pl=hamburgueses
hamburgués {m} :: Hamburger, a person from Hamburg
hamburguesa {n-meta} :: {{es-noun|f}}
hamburguesa {n-forms} :: pl=hamburguesas
hamburguesa {f} :: hamburger
hamburguesa {f} :: female equivalent of "hamburgués"; Hamburger
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    lemmas = ['hamburguesa', 'hamburgués']
    assert freq.get_lemmas("hamburguesa", "n") == lemmas
    assert freq.get_best_lemma("hamburguesa", lemmas, "n") == "hamburguesa"
Example #14
def test_simple2():

    wordlist_data = """\
rojo {adj-meta} :: {{es-adj|f=roja}}
rojo {adj} :: red (colour)
rojo {n-meta} :: {{es-noun|m}}
rojo {m} :: red (colour)
rojo {m} [Costa Rica] :: a 1000 colón bill
rojo {m} [Spain, derogatory] :: a left-wing, especially communist
roja {n-meta} :: {{es-noun|f|m=rojo}}
roja {f} :: Red (Communist)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_ranked_pos("roja") == ["adj", "n"]
Example #15
def test_microondas():

    wordlist_data = """\
microonda {n-meta} :: {{es-noun|f}}
microonda {n-forms} :: pl=microondas
microonda {f} :: microwave (electromagnetic wave)
microondas {n-meta} :: {{es-noun|m|microondas}}
microondas {n-forms} :: pl=microondas
microondas {m} | horno de microondas :: microwave oven, microwave
microondas {m} :: necklacing (execution by burning tyre)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    lemmas = ["microonda", "microondas"]
    assert freq.get_lemmas("microondas", "n") == lemmas
    assert freq.get_best_lemma("microondas", lemmas, "n") == "microondas"
Example #16
def test_veras():

    wordlist_data = """\
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} [poetic] | lado :: side, face
vera {n-meta} :: {{es-noun|f}}
vera {n-forms} :: pl=veras
vera {f} :: verawood (Bulnesia arborea)
veras {n-meta} :: {{es-noun|f-p}}
veras {fp} :: truth; reality
veras {fp} :: serious things
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("veras", "n") == ["vera", "veras"]
    assert freq.get_best_lemma("veras", ["vera", "veras"], "n") == "veras"
Example #17
def test_vete():

    wordlist_data = """\
ir {v-meta} :: {{es-verb}} {{es-conj}} {{es-conj|irse}}
ir {v} :: x
ver {v-meta} :: {{es-verb}} {{es-conj}}
ver {v} :: x
verse {v-meta} :: {{es-verb}} {{es-conj}}
verse {v} :: x
vetar {v-meta} :: {{es-verb}} {{es-conj}}
vetar {v} :: x
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("vete", "v") == ['ir', 'ver', 'vetar']

    assert freq.get_best_lemma("vete", ['ir', 'ver', 'vetar'], "v") == "ir"
Example #18
def test_rasguno():

    wordlist_data = """\
rasguñar {v-meta} :: {{es-verb}} {{es-conj}}
rasguñar {vt} | arañar; rascar :: to scratch
rasguño {n-meta} :: {{es-noun}}
rasguño {m} | arañazo :: scratch
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("rasguño") == ['v|rasguñar', 'n|rasguño']
    assert freq.get_ranked_pos("rasguño") == ["n", "v"]

    flist_data = """\
rasguño 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #19
def test_bienes():

    wordlist_data = """\
bien {n-meta} :: {{es-noun|m|bienes}}
bien {m} :: good (as opposed to evil)
bienes {n-meta} :: {{es-noun|m-p}}
bienes {mp} :: goods (that which is produced, traded, bought or sold)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("bienes") == ['n|bien', 'n|bienes']
    assert freq.get_lemmas("bienes", "n") == ["bien", "bienes"]
    assert freq.get_best_lemma("bienes", ["bien", "bienes"], "n") == "bienes"

    flist_data = """\
bienes 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #20
def test_hijo():

    wordlist_data = """\
hija {n-meta} :: x
hija {n-forms} :: m=hijo; mpl=hijos; pl=hijas
hija {f} :: daughter; feminine noun of "hijo"
hijo {n-meta} :: x
hijo {n-forms} :: f=hija; fpl=hijas; pl=hijos
hijo {m} :: son
hijo {m} :: child (when the gender of the child is unknown)
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.all_forms.get_lemmas("hijo") == ['n|hijo']
    assert freq.get_lemmas("hijo", "n") == ["hijo"]

    flist_data = """\
hijo 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #21
def test_dios():

    wordlist_data = """\
dios {n-meta} :: {{es-noun|m|dioses|f=diosa}}
dios {n-forms} :: f=diosa; fpl=diosas; pl=dioses
dios {m} :: god
diosa {n-meta} :: {{es-noun|f|m=dios}}
diosa {n-forms} :: m=dios; mpl=dios; pl=diosas
diosa {f} :: goddess
diosa {n-meta} :: {{es-noun|f}}
diosa {n-forms} :: pl=diosas
diosa {f} [biochemistry] :: diose
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.get_lemmas("dioses", "n") == ["dios"]
    assert freq.get_lemmas("diosas", "n") == ["dios", "diosa"]
    assert freq.get_lemmas("diosa", "n") == ["dios", "diosa"]

    assert freq.get_best_lemma("diosa", ["dios", "diosa"], "n") == "dios"

    #    assert list(freq.all_forms.get_lemmas("dios", {})) == ['n:dios:m']
    #    assert list(freq.all_forms.get_lemmas("dioses", {})) == ['n:dios:pl']
    #    assert list(freq.all_forms.get_lemmas("diosa", {})) == ["n:dios:f"]
    #    assert list(freq.all_forms.get_lemmas("diosas", {})) == ["n:diosa:pl"]

    flist_data = """\
dios 10
dioses 10
diosa 10
diosas 10
"""
    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #22
def allforms(wordlist):
    return AllForms.from_wordlist(wordlist)
Example #23
def allforms(fixer):
    return AllForms.from_wordlist(fixer.wordlist)
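Examples #22 and #23 read like pytest fixtures (any @pytest.fixture decorators would have been stripped by the extraction): each builds the form-to-lemma index once from a shared wordlist. A minimal sketch of how such fixtures might fit together, with a hypothetical wordlist body rather than the suite's real data:

import pytest

@pytest.fixture
def wordlist():
    # Hypothetical minimal entry; the real suite loads its own test data.
    wordlist_data = """\
test {n-meta} :: x
test {m} :: test
"""
    return Wordlist(wordlist_data.splitlines())

@pytest.fixture
def allforms(wordlist):
    # Build the form-to-lemma index once per test from the fixture above.
    return AllForms.from_wordlist(wordlist)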
Example #24
def main():
    global args
    global ngprobs

    parser = argparse.ArgumentParser(description="Summarize ngram usage")
    parser.add_argument(
        "--allforms",
        help="Exclude coordinate terms that have standalone entries")
    parser.add_argument("--min-count",
                        help="Ignore forms with less than N uses",
                        type=int)
    parser.add_argument(
        "--min-percent",
        help="Ignore coordinate terms used less than N percent of the form's uses",
        type=int)
    parser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    parser.add_argument(
        "--ignore2",
        help="Ignore coords containing the specified word (can be used more than once)",
        action='append')
    parser.add_argument("--ngprobs", help="Ngram probability data file")
    parser.add_argument(
        "--coord2", help="File containing 2 word coordinate terms to check")
    parser.add_argument(
        "--coord3", help="File containing 3 word coordinate terms to check")
    parser.add_argument(
        "--coord4", help="File containing 4 word coordinate terms to check")
    args = parser.parse_args()

    allforms = AllForms.from_file(args.allforms)
    all_forms = set(allforms.all_forms)
    print("all_forms")
    lemma_forms = defaultdict(list)
    for form, pos, lemma in allforms.all:
        lemma_forms[(lemma, pos)].append(form)
    print("lemma_forms")

    alt_case = {
        form.lower(): form
        for form in all_forms if form != form.lower()
    }

    ngprobs = NgramPosProbability(args.ngprobs)

    if False:
        coord = "reharás tu vida"
        words = coord.split(" ")
        print([coord, get_coord_lemma(ngprobs, allforms, words)])
        exit()

        form = "fijamente"
        form_pos = ngprobs.get_preferred_pos(form)
        form_lemma = get_form_lemma(ngprobs, allforms, form)
        count = get_lemma_count(ngprobs, lemma_forms, form_lemma, form_pos)
        print([form, form_pos, form_lemma, count])
        exit()

    all_coords = {}
    if args.coord2:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case,
                                  args.coord2, args.ignore2)
    if args.coord3:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case,
                                  args.coord3)
    if args.coord4:
        all_coords |= find_coords(allforms, all_forms, ngprobs, alt_case,
                                  args.coord4)


#    seen2 = set()
#    all_coords = {}
#    if args.coord3:
#        coord_lemmas = find_coords(allforms, all_forms, ngprobs, alt_case, args.coord3)
#        for k in coord_lemmas.keys():
#            coord_lemma, form, form_pos = k
#            words = coord_lemma.split(" ")
#            seen2.add(words[0:2])
#            seen2.add(words[1:3])
#        all_coords |= coord_lemmas
#
#    if args.coord2:
#        coord_lemmas = find_coords(allforms, all_forms, ngprobs, alt_case, args.coord2, args.ignore2)
#        for k,count in coord_lemmas.items():
#            coord_lemma, form, form_pos = k
#            if coord_lemma in seen2 and coord_lemma not in all_forms:
#                continue
#            all_coords[k] = count

    for k, coord_count in all_coords.items():
        coord_lemma, form_lemma, form_pos = k
        form_count = get_lemma_count(ngprobs, lemma_forms, form_lemma,
                                     form_pos)

        # Skip uncommon forms
        if form_count < args.min_count:
            continue

        # Min ratio
        percent = int(coord_count / form_count * 100)
        if percent < args.min_percent:
            continue

        if coord_lemma in all_forms:
            existing = coord_lemma
        elif coord_lemma.lower() in all_forms:
            existing = coord_lemma.lower()
        else:
            existing = None
        if existing:
            coord_lemma = f"[[{existing}]]"

        log(form_lemma, coord_lemma, form_count, coord_count, percent)

    if args.save:
        base_url = "User:JeffDoozan/lists"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        dest = ""
        logger.save(dest, FileSaver)
Example #25
def allforms(self):
    if not self._allforms:
        self._allforms = AllForms.from_file(self.allforms_file)
    return self._allforms
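Example #25 is a lazily-cached accessor, presumably a @property on a class whose allforms_file attribute holds a path: the file is parsed on first access and the parsed object is reused afterwards. A self-contained sketch of the same pattern; the wrapper class here is invented for illustration:

class AllFormsCache:
    # Hypothetical holder class; only the accessor mirrors Example #25.
    def __init__(self, allforms_file):
        self.allforms_file = allforms_file
        self._allforms = None

    @property
    def allforms(self):
        # Parse the (potentially large) allforms file only when first needed.
        if not self._allforms:
            self._allforms = AllForms.from_file(self.allforms_file)
        return self._allforms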
Example #26
def test_simple():

    wordlist_data = """\
_____
protector
pos: n
  meta: {{es-noun|m|f=+|f2=protectriz}}
  g: m
  gloss: protector (someone who protects or guards)
pos: n
  meta: {{es-noun|m}}
  g: m
  gloss: protector (a device or mechanism which is designed to protect)
_____
protectora
pos: n
  meta: {{es-noun|f|m=protector}}
  forms: m=protector; mpl=protectores; pl=protectoras
  g: f
  gloss: female equivalent of "protector"
pos: n
  meta: {{es-noun|f}}
  forms: pl=protectoras
  g: f
  gloss: animal shelter (an organization that provides temporary homes for stray pet animals)
    syn: protectora de animales
_____
protectoras
pos: n
  meta: {{head|es|noun plural form|g=f-p}}
  g: f-p
  gloss: inflection of "protector"
_____
protectores
pos: n
  meta: {{head|es|noun plural form|g=m-p}}
  g: m-p
  gloss: inflection of "protector"
_____
protectrices
pos: n
  meta: {{head|es|noun plural form|g=f-p}}
  g: f-p
  gloss: inflection of "protector"
_____
protectriz
pos: n
  meta: {{es-noun|f|m=protector}}
  forms: m=protector; mpl=protectores; pl=protectrices
  g: f
  gloss: alternative form of "protectora"
    q: uncommon
"""

    flist_data = """\
protector 10
protectora 10
protectoras 10
protectores 10
protectriz 10
protectrices 10
unknown 10
"""

    wordlist = Wordlist(wordlist_data.splitlines())
    allforms = AllForms.from_wordlist(wordlist)
    freq = FrequencyList(wordlist, allforms, sentences)

    assert freq.wordlist.has_lemma("protectora", "n") == False

    assert freq.get_lemmas("protectores", "n") == ["protector"]
    assert freq.get_lemmas("protectoras", "n") == ["protector", "protectora"]
    assert freq.get_lemmas("notaword", "n") == ["notaword"]

    assert freq.get_ranked_pos("protectoras") == ["n"]

    assert "\n".join(freq.process(flist_data.splitlines())) == """\
Example #27
    if not args.data_dir:
        args.data_dir = os.environ.get("SPANISH_DATA_DIR", "spanish_data")

    if not args.custom_dir:
        args.custom_dir = os.environ.get("SPANISH_CUSTOM_DIR", "spanish_custom")

    with open(args.dictionary) as wordlist_data:
        cache_words = not args.low_mem
        wordlist = Wordlist(wordlist_data, cache_words=cache_words)

    print("wordlist", mem_use(), file=sys.stderr)
    ignore_data = open(args.ignore) if args.ignore else []

    if args.allforms:
        allforms = AllForms.from_file(args.allforms)
    else:
        allforms = AllForms.from_wordlist(wordlist)
    print("all_forms", mem_use(), file=sys.stderr)

    sentences = spanish_sentences(
        sentences=args.sentences, data_dir=args.data_dir, custom_dir=args.custom_dir
    )

    flist = FrequencyList(wordlist, allforms, sentences)

    with open(args.file) as infile:
        for line in flist.process(infile, ignore_data):
            print(line)

    if args.ignore:
        ignore_data.close()
Example #28
def main():

    import argparse
    argparser = argparse.ArgumentParser(
        description="Find lemmas with only 'form of' senses")
    argparser.add_argument("--trans", help="Extract file to read")
    argparser.add_argument("--allforms",
                           help="Allforms for resolving forms to lemmas")
    argparser.add_argument(
        "--save", help="Save to wiktionary with specified commit message")
    argparser.add_argument(
        "--date",
        help="Date of the database dump (used to generate page messages)")
    argparser.add_argument("--limit",
                           type=int,
                           help="Limit processing to first N articles")
    argparser.add_argument("--progress",
                           help="Display progress",
                           action='store_true')
    argparser.add_argument("--dump-aliases",
                           help="Dump likely language aliases",
                           action='store_true')
    argparser.add_argument("--dump-parents",
                           help="Dump likely parent languages",
                           action='store_true')
    args = argparser.parse_args()

    allforms = AllForms.from_file(args.allforms) if args.allforms else None
    if not os.path.isfile(args.trans):
        raise FileNotFoundError(f"Cannot open: {args.trans}")

    fixer = T9nFixer(allforms)
    logger = Logger()

    def log(error, page, pos, gloss, language, line="", highlight=""):
        if error is None:
            raise ValueError("error is none")
        if page is None:
            raise ValueError("page is none")
        if pos is None:
            raise ValueError("pos is none")
        if gloss is None:
            gloss = ""
        if language is None:
            language = ""
        if line is None:
            line = ""
        if highlight is None:
            highlight = ""

        logger.add(error, page, pos, gloss, language, line, highlight)

#        if language:
#            langlogger.add(error, page, pos, gloss, language, line, highlight)

#        if error != "text_outside_template":
#            logger.add(error, page, pos, gloss, line, highlight)

    count = 0
    max_val = 0
    pages_with_tables = set()

    for article in WikiExtract.iter_articles_from_bz2(args.trans):
        text = article.text
        path = article.title.split(":")
        page = path[0]
        pos = path[-1]

        if pos not in ALL_POS:
            log("outside_pos", page, pos, None, None, path)

        count += 1
        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)
        if args.limit and count > args.limit:
            break

#        if page != "pie-eyed":
#            continue

#        if pathstr != "veggie:English:Adjective":
#        if pathstr != "I love you:English:Phrase":
#            continue
#        print("\n", count)

#        val = timeit.timeit(lambda: list(TranslationTable.find_tables(text, page, pos)), number=1)
#        if val > max_val:
#            max_val = val
#            max_page = pathstr
#        continue

        tables = list(TranslationTable.find_tables(text))
        if not tables and not re.search(
                r"{{\s*(trans-see|checktrans|see translation)", text):
            log("no_tables", page, pos, None, None)

#            max_page = "X"

        pages_with_tables.add(page)
        stats["sections_with_tables"] += 1
        for table_lines in tables:
            table_lines = table_lines.splitlines()
            #            print(table_lines)
            #            exit()
            #            max_val += len(table_lines)
            #            continue

            table = TranslationTable(page, pos, table_lines, log_function=log)

            stats["total_tables"] += 1
            seen = set()
            for item in table.items:
                if isinstance(item, TranslationLine) and item.lang_id not in seen:
                    stats["total_entries"] += len(item.entries)
                    stats["lang_entries"][lang_ids[item.lang_id]] += 1
                    # Don't count more than one entry per table
                    seen.add(item.lang_id)

            if len(tables) > 1 and not table.gloss and table.template in [
                    "trans-top", "trans-top-see", "trans-top-also"]:
                table.log("no_gloss")
            fixer.cleanup_table(table)


#            if "\n".join(map(str.strip,table_lines)) != str(table):
#                table.log("botfix_formatting")
#                print("OLD", page, pos, file=sys.stderr)
#                print("\n".join(table_lines), file=sys.stderr)
#                print("NEW", page, pos)
#                print(str(table))
#exit()

    stats["pages_with_tables"] = len(pages_with_tables)

    #    print(max_val, max_page)

    #    base_url = "User:JeffDoozan/lists/translations" if args.save else "Xtranslations"
    #    langlogger.save(base_url, args.save)

    if args.save:
        base_url = "User:JeffDoozan/lists/translations"
        logger.save(base_url,
                    WikiByLanguage,
                    commit_message=args.save,
                    page_limit=1000,
                    data_date=args.date)
        logger.save(base_url + "/by_error",
                    WikiByError,
                    commit_message=args.save,
                    data_date=args.date)
    else:
        dest = "Xtranslations"
        logger.save(dest, FileByLanguage, page_limit=1000, data_date=args.date)
        logger.save(dest + "/by_error", FileByError, data_date=args.date)

    # Dump nested language aliases
    if args.dump_aliases:
        print("language_aliases = {")
        #for lang,codes in sorted(UNKNOWN_LANGS.items(), key=lambda x: sum(x[1].values())*-1):
        for lang, codes in sorted(UNKNOWN_LANGS.items()):
            for code, count in sorted(codes.items(), key=lambda x: x[1] * -1):
                if count > 20:
                    print(
                        f"    '{lang}': '{lang_ids[code]}', # {code} found in {count} entries"
                    )
                break
        print("}")

    if args.dump_parents:
        print("language_parents = {")
        for lang, count in sorted(LANG_PARENTS.items()):
            if count > 20:
                print(f"    '{lang}', # used in {count} entries")
        print("}")

    colons = [x for x in lang_ids.values() if ":" in x]
    if colons:
        raise ValueError(
            "A language exists with a colon in the name, this may cause problems for nested languages that use : as a separator"
        )

    print(f"Total pages with tables: {stats['pages_with_tables']}")
    print(f"Total sections with tables: {stats['sections_with_tables']}")
    total_lines = sum(stats["lang_entries"].values())
    print(f"Total language lines in tables: {total_lines}")
    print(f"Total translation entries: {stats['total_entries']}")
Example #29
def main():

    global fixer
    global fixrunner

    import argparse

    parser = argparse.ArgumentParser(
        description="Generate list of missing forms")
    parser.add_argument("wordlist", help="wordlist")
    parser.add_argument("--allforms", required=True, help="all_forms file")
    parser.add_argument("--allpages", required=True, help="wiki.allpages")
    parser.add_argument(
        "--articles",
        required=True,
        help="Language extract with raw articles, used for checking autofixes")
    parser.add_argument("--save", help="wiktionary commit message")
    parser.add_argument("--limit",
                        type=int,
                        help="Limit processing to first N articles")
    parser.add_argument("--progress",
                        help="Display progress",
                        action='store_true')
    args = parser.parse_args()

    global ARTICLE_FILE
    ARTICLE_FILE = args.articles

    wordlist = Wordlist.from_file(args.wordlist)

    allforms = AllForms.from_file(args.allforms)
    fixer = FormFixer(wordlist)
    fixrunner = FixRunner("es", wordlist, allforms)

    with open(args.allpages) as infile:
        # Loading the entire contents of allpages takes 600M
        # To conserve memory, temporarily load allforms into a set
        # and then create a set of entries in allpages that are also in allforms

        allforms_set = set(allforms.all_forms)
        allpages = {line for x in infile if (line := x.strip()) in allforms_set}
        del allforms_set

#    form = "achaparrándolo"
#    declared_forms = fixer.get_declared_forms(form, wordlist, allforms)
#    existing_forms = get_existing_forms(form, wordlist)
#    missing_forms, unexpected_forms = fixer.compare_forms(declared_forms, existing_forms)
#    print("declared", declared_forms)
#    print("existing", existing_forms)
#    print("missing", missing_forms)
#    print("unexpected", unexpected_forms)
#    exit()

    count = 0
    for form in allforms.all_forms:

        # Fix for conversion from <sup>x</sup> -> ^x
        if "^" in form:
            continue

        try:
            declared_forms = fixer.get_declared_forms(form, wordlist, allforms)
        except ValueError as e:
            print("ERROR", e)
            #error("form_errors", form, str(e))
            continue

        if not count % 1000 and args.progress:
            print(count, end='\r', file=sys.stderr)

        if args.limit and count >= args.limit:
            break
        count += 1

        existing_forms = get_existing_forms(form, wordlist)

        missing_forms, unexpected_forms = fixer.compare_forms(
            declared_forms, existing_forms)

        missing_pos = []
        for item in missing_forms:

            if item.form != form:
                raise ValueError(form, item)

            if not FormFixer.can_handle_formtype(item.formtype):
                continue

            # TODO: for now skip multi word verbs
            if item.pos == "v" and " " in item.lemma:
                continue

            if item.pos == "n" and item.formtype == "m":
                error("should_be_lemma", form, item)
                continue

            words = list(wordlist.get_words(form, item.pos))
            if not words:
                matches = list(wordlist.get_words(form))
                if matches:
                    if item.pos in missing_pos:
                        continue
                    ety = {w.etymology for w in matches}
                    level = 4 if len(ety) > 1 else 3
                    #                    error("missing_pos_multi_ety", form, item)
                    items = [i for i in missing_forms if i.pos == item.pos]

                    if fixer.can_handle(item):
                        pos_text = "\n".join(
                            fixer.full_pos(item.form, level, items))
                    else:
                        pos_text = ""
                    error("missing_pos", form, item, pos_text)
                    missing_pos.append(item.pos)
                else:
                    if form in allpages:
                        error("missing_entry", form, item)

                continue


#            if pos == "n" and formtype == "pl" and unexpected_forms:
#                masculines = get_masculines_from_fpl(words[0])
#                masculine_links = [m for m in masculines if (pos, "fpl", m) in unexpected_forms]
#                if masculine_links:
#                    for m in masculine_links:
#                        unexpected_forms.remove((pos, "fpl", m))
#                    print(f"{form}:{pos} links to masculine {masculine_links} instead of feminine $is_doublet")
#                    continue

            error("missing_sense", form, item)

        for item in sorted(unexpected_forms):
            words = list(wordlist.get_words(item.lemma, item.pos))
            if words:
                error("unexpected_form", form, item)
            else:
                error("missing_lemma", form, item)

    if args.save:
        base_url = "User:JeffDoozan/lists/es/forms"
        logger.save(base_url, WikiSaver, commit_message=args.save)
    else:
        logger.save("forms", FileSaver)