def main():
    assert args.working_directory is not None
    try:
        os.mkdir(args.working_directory)
    except FileExistsError:
        pass

    # Prepare the text data used for training
    corpus_l = build_corpus(args.train_filename_l,
                            args.train_directory_l,
                            supervised=True)
    corpus_u = build_corpus(args.train_filename_u,
                            args.train_directory_u,
                            supervised=False)

    # Character dictionary
    dictionary = nlp.dictionary()

    # Build datasets split into training and development sets.
    # The dictionary is updated at the same time.
    dataset_l = nlp.dataset(corpus_l, dictionary, args.train_dev_split,
                            args.seed)  # labeled (supervised)
    dataset_u = nlp.dataset(corpus_u, dictionary, args.train_dev_split,
                            args.seed)  # unlabeled (unsupervised)

    # Save the dictionary
    dictionary.save(os.path.join(args.working_directory, "char.dict"))

    # Report dataset sizes
    size_train_l = dataset_l.get_size_train()
    size_dev_l = dataset_l.get_size_dev()
    size_train_u = dataset_u.get_size_train()
    size_dev_u = dataset_u.get_size_dev()
    table = [
        ["Labeled", size_train_l, size_dev_l, size_train_l + size_dev_l],
        ["Unlabeled", size_train_u, size_dev_u, size_train_u + size_dev_u],
        [
            "Total", size_train_u + size_train_l, size_dev_u + size_dev_l,
            size_train_u + size_train_l + size_dev_u + size_dev_l
        ],
    ]
    print(tabulate(table, headers=["Data", "Train", "Dev", "Total"]))

    num_character_ids = dictionary.get_num_characters()

    # Models
    crf = nlp.crf(
        dataset_labeled=dataset_l,
        num_character_ids=num_character_ids,
        feature_x_unigram_start=args.crf_feature_x_unigram_start,
        feature_x_unigram_end=args.crf_feature_x_unigram_end,
        feature_x_bigram_start=args.crf_feature_x_bigram_start,
        feature_x_bigram_end=args.crf_feature_x_bigram_end,
        feature_x_identical_1_start=args.crf_feature_x_identical_1_start,
        feature_x_identical_1_end=args.crf_feature_x_identical_1_end,
        feature_x_identical_2_start=args.crf_feature_x_identical_2_start,
        feature_x_identical_2_end=args.crf_feature_x_identical_2_end,
        initial_lambda_0=args.crf_lambda_0,
        sigma=args.crf_prior_sigma)
    npylm = nlp.npylm(max_word_length=args.max_word_length,
                      g0=1.0 / num_character_ids,
                      initial_lambda_a=args.lambda_a,
                      initial_lambda_b=args.lambda_b,
                      vpylm_beta_stop=args.vpylm_beta_stop,
                      vpylm_beta_pass=args.vpylm_beta_pass)
    npycrf = nlp.npycrf(npylm=npylm, crf=crf)

    num_features = crf.get_num_features()
    print(
        tabulate([["#characters", num_character_ids],
                  ["#features", num_features]]))

    # Set up the trainer
    trainer = nlp.trainer(dataset_labeled=dataset_l,
                          dataset_unlabeled=dataset_u,
                          dictionary=dictionary,
                          npycrf=npycrf,
                          crf_regularization_constant=1.0)

    # Check whether any string word IDs (hashes) collide.
    # Running this every time is a waste of time; once is enough.
    # It also consumes a large amount of memory.
    if False:
        print("Checking for hash collisions ...")
        num_checked_words = trainer.detect_hash_collision(args.max_word_length)
        print("No collisions found (total words: {})".format(num_checked_words))

    learning_rate = args.crf_learning_rate
    batchsize = 32
    start = time.time()

    # Initialization
    trainer.add_labeled_data_to_npylm()  # add the labeled data to the NPYLM
    trainer.sgd(learning_rate, batchsize, pure_crf=True)  # optimize the CRF alone, without the NPYLM
    print("Iteration {} / {} - {:.3f} sec".format(0, args.epochs,
                                                  time.time() - start))

    for epoch in range(1, args.epochs + 1):
        start = time.time()

        # Training
        ## Gibbs sampling of the NPYLM parameters.
        ## It seems better to include the labeled data as well.
        trainer.gibbs(include_labeled_data=True)
        ## Optimize the CRF
        trainer.sgd(learning_rate, batchsize)

        # Hyperparameter sampling
        ## Update the HPYLM and VPYLM hyperparameters
        trainer.sample_hpylm_vpylm_hyperparameters()
        ## Update the parameter λ of the Poisson distribution over word length
        trainer.sample_npylm_lambda()

        # This estimate is more accurate after a few iterations have passed.
        if epoch > 3:
            # Estimate P(k|VPYLM), the probability that the VPYLM generates a word of length k
            trainer.update_p_k_given_vpylm()

        # Logging
        print("Iteration {} / {} - {:.3f} sec".format(epoch, args.epochs,
                                                      time.time() - start))
        # log_likelihood_l = trainer.compute_log_likelihood_labeled_dev()
        # log_likelihood_u = trainer.compute_log_likelihood_unlabeled_dev()
        # table = [
        #     ["Labeled", log_likelihood_l],
        #     ["Unlabeled", log_likelihood_u]
        # ]
        # print(tabulate(table, headers=["Log-likelihood", "Dev"]))

        trainer.print_segmentation_labeled_dev(10)    # print segmentations of randomly chosen dev sentences
        trainer.print_segmentation_unlabeled_dev(10)  # print segmentations of randomly chosen dev sentences
        precision, recall = trainer.compute_precision_and_recall_labeled_dev()
        f_measure = 2 * precision * recall / (precision + recall)
        print(
            tabulate([["Labeled", precision, recall, f_measure]],
                     headers=["Precision", "Recall", "F-measure"]))

        # Save the models
        npylm.save(os.path.join(args.working_directory, "npylm.model"))
        crf.save(os.path.join(args.working_directory, "crf.model"))
        # trainer.print_p_k_vpylm()
        print("lambda_0:", crf.get_lambda_0())
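
# --- Illustrative driver (not part of the original script) ---
# The training main() above reads its settings from a module-level `args`
# object that is not shown in this section (the imports, build_corpus helper,
# and the `nlp` extension module are likewise assumed to be defined
# elsewhere in the script). The sketch below is one plausible way to define
# `args` with argparse: the flag names are derived from the attribute names
# used above, while every default value is an illustrative assumption, not
# the author's setting.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--working-directory", type=str, default="out")
    parser.add_argument("--train-filename-l", type=str, default=None)   # labeled corpus file
    parser.add_argument("--train-directory-l", type=str, default=None)  # directory of labeled corpus files
    parser.add_argument("--train-filename-u", type=str, default=None)   # unlabeled corpus file
    parser.add_argument("--train-directory-u", type=str, default=None)  # directory of unlabeled corpus files
    parser.add_argument("--train-dev-split", type=float, default=0.9)   # fraction of sentences used for training
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--max-word-length", type=int, default=16)
    parser.add_argument("--lambda-a", type=float, default=4.0)          # prior on the word-length Poisson λ
    parser.add_argument("--lambda-b", type=float, default=1.0)
    parser.add_argument("--vpylm-beta-stop", type=float, default=4.0)
    parser.add_argument("--vpylm-beta-pass", type=float, default=1.0)
    parser.add_argument("--crf-learning-rate", type=float, default=0.01)
    parser.add_argument("--crf-lambda-0", type=float, default=1.0)
    parser.add_argument("--crf-prior-sigma", type=float, default=1.0)
    # CRF feature windows (character offsets); values here are placeholders
    parser.add_argument("--crf-feature-x-unigram-start", type=int, default=-2)
    parser.add_argument("--crf-feature-x-unigram-end", type=int, default=2)
    parser.add_argument("--crf-feature-x-bigram-start", type=int, default=-2)
    parser.add_argument("--crf-feature-x-bigram-end", type=int, default=1)
    parser.add_argument("--crf-feature-x-identical-1-start", type=int, default=-2)
    parser.add_argument("--crf-feature-x-identical-1-end", type=int, default=1)
    parser.add_argument("--crf-feature-x-identical-2-start", type=int, default=-3)
    parser.add_argument("--crf-feature-x-identical-2-end", type=int, default=1)
    args = parser.parse_args()
    main()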
def main():
    assert args.working_directory is not None
    try:
        os.mkdir(args.working_directory)
    except FileExistsError:
        pass

    # Character dictionary
    dictionary = nlp.dictionary(
        os.path.join(args.working_directory, "char.dict"))

    # Models
    crf = nlp.crf(os.path.join(args.working_directory, "crf.model"))
    npylm = nlp.npylm(os.path.join(args.working_directory, "npylm.model"))
    npycrf = nlp.npycrf(npylm=npylm, crf=crf)

    num_features = crf.get_num_features()
    num_character_ids = dictionary.get_num_characters()
    print(
        tabulate([["#characters", num_character_ids],
                  ["#features", num_features]]))

    # Find the most likely segmentation with the Viterbi algorithm
    assert args.test_filename is not None or args.test_directory is not None
    sentence_list = []

    def preprocess(sentence):
        sentence = re.sub(r"[0-9.,]+", "#", sentence)  # collapse digit sequences into a single symbol
        sentence = sentence.strip()
        return sentence

    if args.test_filename is not None:
        with codecs.open(args.test_filename, "r", "utf-8") as f:
            for sentence_str in f:
                sentence_str = preprocess(sentence_str)
                sentence_list.append(sentence_str)

    if args.test_directory is not None:
        for filename in os.listdir(args.test_directory):
            with codecs.open(os.path.join(args.test_directory, filename), "r",
                             "utf-8") as f:
                for sentence_str in f:
                    sentence_str = preprocess(sentence_str)
                    sentence_list.append(sentence_str)

    # The reference segmentation is produced by MeCab
    tagger = MeCab.Tagger() if args.neologd_path is None else MeCab.Tagger(
        "-d " + args.neologd_path)
    tagger.parse("")  # parse an empty string to work around a MeCab bug

    for sentence_str in sentence_list:
        sentence_str = sentence_str.strip()
        m = tagger.parseToNode(sentence_str)  # morphological analysis
        words_true = []
        while m:
            word = m.surface
            if len(word) > 0:
                words_true.append(word)
            m = m.next
        if len(words_true) > 0:
            words_npycrf = npycrf.parse(sentence_str, dictionary)
            words_npylm = npylm.parse(sentence_str, dictionary)
            print("MeCab+NEologd")
            print(" / ".join(words_true))
            print("NPYCRF")
            print(" / ".join(words_npycrf))
            print("NPYLM")
            print(" / ".join(words_npylm))
            print()
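
# --- Illustrative driver (not part of the original script) ---
# As with the training script, this decoding main() expects a module-level
# `args` object. A minimal argparse sketch follows; the flag names are
# derived from the attributes used above, and the defaults are assumptions.
# The working directory should contain the char.dict, crf.model, and
# npylm.model files written by the training script.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--working-directory", type=str, default="out")  # directory with the trained models
    parser.add_argument("--test-filename", type=str, default=None)       # a single test file
    parser.add_argument("--test-directory", type=str, default=None)      # or a directory of test files
    parser.add_argument("--neologd-path", type=str, default=None)        # optional mecab-ipadic-NEologd dictionary path
    args = parser.parse_args()
    main()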