Esempio n. 1
0
def ingest_lexica(session, num_lexica, num_wf_min, num_wf_max, vocabulary):
    """Generate several random lexica and add each one via ``add_lexicon``.

    The lexica come from ``generate_lexica``; every generated set of
    wordforms is stored under the name ``Lexicon <index>``, where the index
    is the zero-based position in the generated sequence.

    Args:
        session: Session object handed on to ``add_lexicon``.
        num_lexica: Forwarded to ``generate_lexica`` as its first argument.
        num_wf_min: Forwarded to ``generate_lexica`` as its second argument.
        num_wf_max: Forwarded to ``generate_lexica`` incremented by one.
        vocabulary: Forwarded to ``generate_lexica`` as its last argument.
    """
    # NOTE(review): the ``+ 1`` suggests generate_lexica treats its upper
    # bound as exclusive -- confirm against its implementation.
    generated = generate_lexica(
        num_lexica, num_wf_min, num_wf_max + 1, vocabulary
    )
    for index, wordforms in enumerate(generated):
        lexicon_name = f'Lexicon {index}'
        LOGGER.info('Generating %s', lexicon_name)
        add_lexicon(
            session,
            lexicon_name=lexicon_name,
            vocabulary=True,
            wfs=wordforms,
        )
Esempio n. 2
0
def test_delete_lexicon(dbsession):
    """Deleting a lexicon removes it and its wordform links, not the wordforms.

    A lexicon with three wordforms is added and deleted again. Afterwards
    the three wordforms must still exist, no lexicon may be left, and the
    ``lexical_source_wordform`` table must be empty.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})

    lexicon = add_lexicon(
        dbsession, lexicon_name='test lexicon', vocabulary=True, wfs=wfs
    )
    print('lexicon id:', lexicon.lexicon_id)

    delete_lexicon(dbsession, lexicon.lexicon_id)

    # The wordforms themselves must survive the deletion.
    remaining_wordforms = dbsession.query(Wordform).all()
    assert len(remaining_wordforms) == 3

    # No lexicon is left when checked through a select statement ...
    lexicon_rows = list(dbsession.execute(select([Lexicon])).fetchall())
    print(lexicon_rows)
    assert len(lexicon_rows) == 0

    # ... nor when checked through the session's query interface.
    assert len(dbsession.query(Lexicon).all()) == 0

    # None of the surviving wordforms refers to a lexicon any more.
    for wordform in remaining_wordforms:
        print('Wordform:', wordform.wordform)
        print('Wordform lexica:', wordform.wf_lexica)
        assert wordform.wf_lexica == []

    # The association table lexical_source_wordform holds no rows.
    link_rows = list(
        dbsession.execute(select([lexical_source_wordform])).fetchall()
    )
    print(len(link_rows))
    assert len(link_rows) == 0
Esempio n. 3
0
def test_add_lexicon(dbsession):
    """Adding a lexicon stores its wordforms and links them to the lexicon.

    After adding one lexicon with three wordforms there must be three
    ``Wordform`` rows and exactly one ``Lexicon`` whose name matches and
    whose ``lexicon_wordforms`` are the three input wordforms.
    """
    lexicon_name = 'test lexicon'
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})

    print(dbsession)

    add_lexicon(dbsession, lexicon_name=lexicon_name, vocabulary=True, wfs=wfs)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    assert len(stored) == 3

    lexicons = dbsession.query(Lexicon).all()
    assert len(lexicons) == 1

    lexicon = lexicons[0]
    assert lexicon.lexicon_name == lexicon_name
    assert len(lexicon.lexicon_wordforms) == 3

    # Sort the linked wordforms so the comparison with the (already sorted)
    # input column does not depend on retrieval order.
    linked = sorted(w.wordform for w in lexicon.lexicon_wordforms)
    assert linked == list(wfs['wordform'])
Esempio n. 4
0
def test_get_wf_mapping_lexicon(dbsession):
    """The mapping returned for a lexicon has a key for each of its wordforms."""
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})

    print(dbsession)

    lexicon = add_lexicon(
        dbsession, lexicon_name='test lexicon', vocabulary=True, wfs=wfs
    )

    mapped_wordforms = get_wf_mapping(dbsession, lexicon=lexicon).keys()

    for wordform in wfs['wordform']:
        assert wordform in mapped_wordforms
Esempio n. 5
0
def test_write_wf_links_data(dbsession, fs):
    """Check the counts and source records produced by write_wf_links_data.

    A lexicon with three lemma/variant pairs is added, after which
    ``write_wf_links_data`` writes the links and link sources to two files.
    The test expects every pair to be written in both directions (so
    ``3 * 2`` links and ``3 * 2`` sources) and the sources file to contain
    exactly the expected records, in order.

    Args:
        dbsession: Database session fixture.
        fs: Not used directly in the body.
            NOTE(review): presumably a fake-filesystem fixture that keeps
            the two output files off the real disk -- confirm.
    """
    wfl_file = 'wflinks'
    wfls_file = 'wflsources'

    name = 'linked test lexicon'

    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf1', 'wf2', 'wf3', 'wf1s', 'wf2s', 'wf3s']

    lex = add_lexicon(dbsession, lexicon_name=name, vocabulary=True, wfs=wfs)

    links_df = pd.DataFrame()
    links_df['lemma'] = ['wf1', 'wf2', 'wf3']
    links_df['variant'] = ['wf1s', 'wf2s', 'wf3s']

    wfm = get_wf_mapping(dbsession, lexicon=lex)

    # Context managers guarantee both files are closed (and flushed) even
    # when write_wf_links_data raises.
    with open(wfl_file, 'w') as links_file, \
            open(wfls_file, 'w') as sources_file:
        num_l, num_s = write_wf_links_data(
            dbsession,
            wf_mapping=wfm,
            links_df=links_df,
            wf_from_name='lemma',
            wf_to_name='variant',
            lexicon_id=lex.lexicon_id,
            wf_from_correct=True,
            wf_to_correct=True,
            links_file=links_file,
            sources_file=sources_file,
        )

    assert num_l == 3 * 2
    assert num_s == 3 * 2

    # Each lemma/variant pair is expected twice: once per direction.
    expected_sources = []
    for lemma, variant in zip(links_df['lemma'], links_df['variant']):
        for wf_from, wf_to in ((lemma, variant), (variant, lemma)):
            expected_sources.append({
                "wordform_from": wfm[wf_from],
                "wordform_to": wfm[wf_to],
                "lexicon_id": lex.lexicon_id,
                "wordform_from_correct": True,
                "wordform_to_correct": True
            })

    with open(wfls_file, 'r') as sources_file:
        # Materialise inside the ``with`` block in case read_json_lines
        # reads lazily from the open file.
        written_sources = list(read_json_lines(sources_file))

    # Compare whole lists rather than zipping: zip() stops at the shorter
    # input, so missing (or extra) records would go unnoticed.
    assert written_sources == expected_sources