def test_cost_train(threadnum, tmpdir, capfd):
    """Run the cost-training pipeline end to end and check its output.

    The files of the seed directory are copied into ``tmpdir`` and indexed,
    the ``train`` command writes a model from the training corpus, a
    dictionary is generated from that model and indexed, and finally the test
    data is converted, tagged and evaluated. The test passes when
    ``training_result`` occurs in the stdout captured by ``capfd``.

    Args:
        threadnum: Value handed to the ``-p`` option of the ``train`` command.
        tmpdir: pytest ``tmpdir`` fixture; every generated file lives below it.
        capfd: pytest ``capfd`` fixture used to read the captured stdout.
    """
    seed_dir = "tests/test-data/cost-train/seed"
    corpus_file = "tests/test-data/cost-train/training-data.txt"
    test_file = "tests/test-data/cost-train/test-data.txt"
    work_dir = str(tmpdir)
    model_file = str(tmpdir.join("model.bin"))
    dictionary_dir = str(tmpdir.mkdir("dic"))
    processed_test_file = str(tmpdir.join("test.txt"))
    eval_result_file = str(tmpdir.join("result.txt"))

    # Fail early if any of the checked-in fixtures is missing.
    for required in (corpus_file, seed_dir, test_file):
        assert os.path.exists(required)

    # Copy every file of the seed directory (not only dicrc) into tmpdir.
    for entry in os.listdir(seed_dir):
        _copy_file(os.path.join(seed_dir, entry), tmpdir.join(entry))

    run_mecab_dict_index(["index", "-d", seed_dir, "-o", work_dir])
    run_mecab_cost_train([
        "train", "-c", "1.0", "-p", threadnum, "-d", work_dir, "-f", "1",
        corpus_file, model_file,
    ])
    run_mecab_dict_gen(
        ["dgen", "-d", work_dir, "-m", model_file, "-o", dictionary_dir]
    )
    run_mecab_dict_index(
        ["index", "-d", dictionary_dir, "-o", dictionary_dir]
    )
    run_mecab_test_gen(["tgen", "-o", processed_test_file, test_file])
    run_mecab_main([
        "mecab", "-r", "/dev/null", "-d", dictionary_dir,
        "-o", eval_result_file, processed_test_file,
    ])
    run_mecab_system_eval(
        ["eval", "-l", "0 1 2 4", eval_result_file, test_file]
    )

    # NOTE(review): `training_result` is not defined in this function; it is
    # presumably provided at module level or by parametrization - confirm.
    stdout_text = capfd.readouterr().out
    assert training_result in stdout_text
def test_tagger_with_test_data_katakana(tmpdir):
    """Check ``Tagger`` output against the katakana gold file.

    The katakana dictionary is indexed into a fresh directory under
    ``tmpdir``, a ``Tagger`` is built on it, and every stripped line of the
    ``test`` file is parsed. The joined index-1 elements of each parse must
    equal the corresponding stripped line of ``test.gld``.

    Args:
        tmpdir: pytest ``tmpdir`` fixture that receives the indexed
            dictionary.
    """
    source_dir = "../../test-data/katakana"
    compiled_dir = tmpdir.mkdir("katakana")

    dicrc_file = os.path.join(source_dir, "dicrc")
    input_file = os.path.join(source_dir, "test")
    gold_file = os.path.join(source_dir, "test.gld")

    with open(input_file) as handle:
        sentences = list(map(str.strip, handle))

    with open(gold_file) as handle:
        expected = list(map(str.strip, handle))

    # The indexed dictionary directory needs its own copy of dicrc.
    _copy_file(dicrc_file, compiled_dir.join("dicrc"))
    run_mecab_dict_index(["index", "-d", source_dir, "-o", str(compiled_dir)])
    tagger = Tagger(str(compiled_dir))

    # Element 1 of each parsed item is the part that gets compared
    # (the original comment calls it the feature).
    actual = []
    for sentence in sentences:
        pieces = [item[1] for item in tagger.parse(sentence)]
        actual.append("".join(pieces))

    assert actual == expected
Exemple #3
0
def test_dics(test_data_path, tmpdir):
    """Index a test dictionary, run mecab on it and compare with the gold file.

    The dictionary found under ``../../test-data/<test_data_path>`` is
    indexed into a directory below ``tmpdir``; the ``mecab`` command then
    writes its result for the ``test`` file to ``output.txt``, whose content
    must equal that of ``test.gld``.

    Args:
        test_data_path: Name of the sub-directory of ``../../test-data``
            holding ``dicrc``, ``test`` and ``test.gld``.
        tmpdir: pytest ``tmpdir`` fixture that receives the indexed
            dictionary and the prediction file.
    """
    source_dir = os.path.join("../../test-data", test_data_path)
    compiled_dir = tmpdir.mkdir(test_data_path)

    dicrc_file = os.path.join(source_dir, "dicrc")
    input_file = os.path.join(source_dir, "test")
    gold_file = os.path.join(source_dir, "test.gld")
    predicted_file = compiled_dir.join("output.txt")

    # Fail early if any of the fixture files is missing.
    for required in (source_dir, dicrc_file, input_file, gold_file):
        assert os.path.exists(required)

    # The indexed dictionary directory needs its own copy of dicrc.
    _copy_file(dicrc_file, compiled_dir.join("dicrc"))

    index_args = ["index", "-d", source_dir, "-o", str(compiled_dir)]
    run_mecab_dict_index(index_args)

    mecab_args = [
        "mecab", "-r", "/dev/null",
        "-d", str(compiled_dir),
        "-o", str(predicted_file),
        input_file,
    ]
    run_mecab_main(mecab_args)

    with open(gold_file, "r") as handle:
        expected = handle.read()
    predicted = predicted_file.read()

    assert expected == predicted