def test_cost_train(threadnum, tmpdir, capfd):
    """Check that parameter estimation is possible.

    Drives the helper commands in sequence (index -> train -> dgen ->
    index -> tgen -> mecab -> eval) inside ``tmpdir`` and then asserts that
    the module-level ``training_result`` text appears in the captured stdout.

    Args:
        threadnum: value forwarded unchanged as the argument of ``-p`` to the
            training command.
        tmpdir: pytest temporary directory used as the working area.
        capfd: pytest fixture used to read back what was written to stdout.
    """
    seed_dir = "tests/test-data/cost-train/seed"
    corpus_file = "tests/test-data/cost-train/training-data.txt"
    gold_file = "tests/test-data/cost-train/test-data.txt"

    work_dir = str(tmpdir)
    model_file = str(tmpdir.join("model.bin"))
    trained_dic_dir = str(tmpdir.mkdir("dic"))
    eval_input_file = str(tmpdir.join("test.txt"))
    eval_output_file = str(tmpdir.join("result.txt"))

    # Fail early if any fixture is missing.
    for required in (corpus_file, seed_dir, gold_file):
        assert os.path.exists(required)

    # Copy every file of the seed directory (dicrc included) into tmpdir.
    for entry in os.listdir(seed_dir):
        _copy_file(os.path.join(seed_dir, entry), tmpdir.join(entry))

    # Index the seed dictionary, writing the output into the working directory.
    run_mecab_dict_index(["index", "-d", seed_dir, "-o", work_dir])

    # Train on the corpus and write the model file.
    run_mecab_cost_train([
        "train",
        "-c", "1.0",
        "-p", threadnum,
        "-d", work_dir,
        "-f", "1",
        corpus_file,
        model_file,
    ])

    # Generate a dictionary from the model, then index it in place.
    run_mecab_dict_gen(
        ["dgen", "-d", work_dir, "-m", model_file, "-o", trained_dic_dir])
    run_mecab_dict_index(
        ["index", "-d", trained_dic_dir, "-o", trained_dic_dir])

    # Produce the evaluation input, analyse it, and evaluate against the gold.
    run_mecab_test_gen(["tgen", "-o", eval_input_file, gold_file])
    run_mecab_main([
        "mecab",
        "-r", "/dev/null",
        "-d", trained_dic_dir,
        "-o", eval_output_file,
        eval_input_file,
    ])
    run_mecab_system_eval(
        ["eval", "-l", "0 1 2 4", eval_output_file, gold_file])

    stdout_text = capfd.readouterr().out
    assert training_result in stdout_text
def test_tagger_with_test_data_katakana(tmpdir):
    """Check ``Tagger`` output on the katakana test data against its gold file.

    Builds an index for the katakana dictionary under ``tmpdir``, parses each
    input line with ``Tagger`` and compares the per-line concatenation of the
    element at index 1 of every parsed item with the matching line of
    ``test.gld``.

    Args:
        tmpdir: pytest temporary directory that receives the built dictionary.
    """

    def _stripped_lines(path):
        # Read a text file and return its lines with surrounding whitespace
        # removed (platform default encoding, as with a bare ``open``).
        with open(path) as handle:
            return [raw.strip() for raw in handle]

    source_dir = "../../test-data/katakana"
    built_dir = tmpdir.mkdir("katakana")
    dicrc_file = os.path.join(source_dir, "dicrc")
    input_file = os.path.join(source_dir, "test")
    gold_file = os.path.join(source_dir, "test.gld")

    sentences = _stripped_lines(input_file)
    expected = _stripped_lines(gold_file)

    # The built dictionary directory gets its own copy of dicrc before indexing.
    _copy_file(dicrc_file, built_dir.join("dicrc"))
    run_mecab_dict_index(["index", "-d", source_dir, "-o", str(built_dir)])

    tagger = Tagger(str(built_dir))

    # Index 1 of each parsed item is taken to be the feature string
    # (per the original comment) -- TODO confirm against Tagger.parse.
    actual = []
    for sentence in sentences:
        features = [item[1] for item in tagger.parse(sentence)]
        actual.append("".join(features))

    assert actual == expected
def test_dics(test_data_path, tmpdir):
    """Check that indexing plus the mecab command reproduce the gold output.

    For the dictionary directory named by ``test_data_path``, build an index
    under ``tmpdir``, run the main command on the ``test`` file and require
    that the written output equals the contents of ``test.gld``.

    Args:
        test_data_path: directory name below ``../../test-data`` holding
            ``dicrc``, ``test`` and ``test.gld``.
        tmpdir: pytest temporary directory that receives the built dictionary
            and the prediction file.
    """
    source_dir = os.path.join("../../test-data", test_data_path)
    built_dir = tmpdir.mkdir(test_data_path)
    dicrc_file = os.path.join(source_dir, "dicrc")
    input_file = os.path.join(source_dir, "test")
    gold_file = os.path.join(source_dir, "test.gld")
    prediction_file = built_dir.join("output.txt")

    # Fail early if any fixture is missing.
    for required in (source_dir, dicrc_file, input_file, gold_file):
        assert os.path.exists(required)

    # The built dictionary directory gets its own copy of dicrc before indexing.
    _copy_file(dicrc_file, built_dir.join("dicrc"))
    run_mecab_dict_index(["index", "-d", source_dir, "-o", str(built_dir)])

    run_mecab_main([
        "mecab",
        "-r", "/dev/null",
        "-d", str(built_dir),
        "-o", str(prediction_file),
        input_file,
    ])

    with open(gold_file, "r") as gold:
        expected = gold.read()
        actual = prediction_file.read()
        assert expected == actual