# Train a BiDAF model on SQuAD 1.1 with GloVe 100d embeddings.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks and the missing import
# header are restored here. sogou_mrc import paths are inferred from the
# sibling scripts in this file -- verify against the toolkit.
import logging

import tensorflow as tf

from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.model.bidaf import BiDAF

tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Paths are intentionally empty: fill in before running.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Read the raw datasets and build the official SQuAD evaluator.
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Vocabulary over train+dev; rare words/chars are dropped by the thresholds.
vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.6B.100d.txt")

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=60, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
# NOTE(review): the source was truncated mid-call here; the trailing
# arguments (evaluator, epoch count) are reconstructed from the parallel
# CMRC script in this file -- TODO confirm against the original.
model.train_and_evaluate(train_batch_generator, eval_batch_generator,
                         evaluator, epochs=15)
# Run inference and evaluation with a saved BiDAF model on SQuAD 1.1 dev.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks and the missing import
# header are restored. sogou_mrc import paths inferred from sibling scripts.
import logging

import tensorflow as tf

from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.model.bidaf import BiDAF
from sogou_mrc.train.trainer import Trainer

tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

base_folder = '/Users/huihui/git/SogouMRCToolkit/'
data_folder = base_folder + 'data/'
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Reuse the vocabulary that was saved at training time.
vocab = Vocabulary()
vocab_save_path = base_folder + 'data/vocab.json'
vocab.load(vocab_save_path)  # load vocab from save path

test_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

# Restore trained weights, then initialise local variables (e.g. metric
# accumulators) before running evaluation.
model_dir = base_folder + 'models/bidaf/best_weights'
model = BiDAF(vocab)
model.load(model_dir)
model.session.run(tf.local_variables_initializer())

model.inference(test_batch_generator)  # inference on test data
model.evaluate(test_batch_generator, evaluator)
# evaluator.exact_match_score(prediction=,ground_truth=)
# print(SquadEvaluator.exact_match_score())
# Build and save a fine-grained, case-sensitive SQuAD vocabulary plus the
# train/eval batch generators.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks and the missing import
# header are restored. sogou_mrc import paths inferred from sibling scripts.
import logging

import tensorflow as tf

from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator

tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Fine-grained tokenisation; case is preserved (do_lowercase=False).
reader = SquadReader(fine_grained=True)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# save vocab (second copy under the canonical name)
vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
# Train BiDAF on CMRC 2018 (Chinese machine reading comprehension).
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks and the missing import
# header are restored. sogou_mrc import paths inferred from sibling scripts.
import logging

import tensorflow as tf

from sogou_mrc.dataset.cmrc import CMRCReader, CMRCEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.model.bidaf import BiDAF

tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Paths are intentionally empty: fill in before running.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "cmrc2018_train.json"
dev_file = data_folder + "cmrc2018_dev.json"

reader = CMRCReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = CMRCEvaluator(dev_file)

# Case-sensitive vocab (Chinese text has no case, matching reader choice).
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# NOTE(review): make_word_embedding is given the folder itself, not
# folder + filename as the sibling scripts do -- verify this is intentional.
word_embedding = vocab.make_word_embedding(embedding_folder)

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=32, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model = BiDAF(vocab, pretrained_word_embedding=word_embedding,
              word_embedding_size=300)
model.compile(tf.train.AdamOptimizer, 0.001)
# NOTE(review): the source was truncated mid-call after `evaluator,`; the
# trailing keyword argument(s) are reconstructed -- TODO confirm.
model.train_and_evaluate(train_batch_generator, eval_batch_generator,
                         evaluator, epochs=15)
# Prepare SQuAD 1.1 data with linguistic features (lemma/lowercase match,
# POS, NER, term frequency) for a feature-augmented reader.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks restored and the missing
# reader/vocab imports added (paths inferred from sibling scripts).
import logging

from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.utils.feature_extractor import FeatureExtractor
from sogou_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Paths are intentionally empty: fill in before running.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# Fit feature vocabularies (pos/ner) on train, then transform both splits.
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)

# NOTE(review): the source was truncated mid-call after additional_fields=;
# the final argument is reconstructed from the parallel feature-extractor
# script in this file -- TODO confirm.
train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
# Prepare SQuAD 1.1 data for a BERT model; shuffles training data first
# with a fixed seed for reproducibility.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks restored and missing
# imports added (paths inferred from sibling scripts). The trailing
# additional_fields list was truncated; reconstructed from the standard
# BERT feature field names -- TODO confirm against the original.
import logging
import random

from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.libraries.BertWrapper import BertDataHelper
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator

random.seed(1234)  # reproducible shuffle of the training set
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = 'E:/dataset/SQuAD1.0/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
random.shuffle(train_data)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=True)
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)

# convert data to bert format
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')

train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=2,
    additional_fields=['input_ids', 'segment_ids', 'input_mask',
                       'start_position', 'end_position'])
# Prepare CoQA data for a BERT-based conversational QA model (BertCoQA).
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks restored and imports
# consolidated at the top. The final BatchGenerator(...) call was truncated
# mid-arguments; the completion is reconstructed from the parallel BERT
# script in this file -- TODO confirm against the original.
import logging
import sys

from sogou_mrc.dataset.coqa import CoQAReader, CoQAEvaluator
from sogou_mrc.libraries.BertWrapper import BertDataHelper
from sogou_mrc.model.bert_coqa import BertCoQA
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

coqa_reader = CoQAReader(-1)
# Path is intentionally empty: fill in before running.
data_folder = ''
train_filename = "coqa-train-v1.0.json"
eval_filename = 'coqa-dev-v1.0.json'

vocab = Vocabulary(do_lowercase=False)
train_data = coqa_reader.read(data_folder + train_filename, 'train')
eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
vocab.build_vocab(train_data + eval_data)
evaluator = CoQAEvaluator(data_folder + eval_filename)

# Convert plain examples into BERT input features.
bert_dir = 'model'
bert_data_helper = BertDataHelper(bert_dir)
train_data = bert_data_helper.convert(train_data, data='coqa')
eval_data = bert_data_helper.convert(eval_data, data='coqa')

train_batch_generator = BatchGenerator(
    vocab, train_data,
    training=True,
    batch_size=2,  # NOTE(review): truncated in source; value assumed -- verify
    additional_fields=['input_ids', 'segment_ids', 'input_mask',
                       'start_position', 'end_position'])
# Train-prep for SQuAD 2.0 with timing of the read and vocab-build stages.
# NOTE(review): this span was collapsed onto one line AND truncated at both
# ends -- it begins mid logging.basicConfig(...) and ends mid the eval
# BatchGenerator(...) call. The missing pieces (basicConfig opening, import
# header, trailing generator arguments) are reconstructed from the sibling
# scripts in this file -- TODO confirm against the original.
import logging
import time

from sogou_mrc.dataset.squadv2 import SquadV2Reader, SquadV2Evaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"

# Time the dataset read/segmentation stage.
t0 = time.time()
reader = SquadV2Reader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)
cost = time.time() - t0
logging.info("seg cost=%.3f" % cost)

# Time the vocabulary/embedding build stage.
t0 = time.time()
vocab = Vocabulary(do_lowercase=True)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding("glove.840B.300d.txt", init_scale=0.05)
cost = time.time() - t0
logging.info("make vocab cost=%.3f" % cost)

# SQuAD 2.0 needs the unanswerable-question mask alongside standard fields.
train_batch_generator = BatchGenerator(
    vocab, train_data, batch_size=16, training=True,
    additional_fields=["abstractive_answer_mask"])
eval_batch_generator = BatchGenerator(
    vocab, eval_data, batch_size=16,
    additional_fields=["abstractive_answer_mask"])
# Build or load a CoQA vocabulary, caching it under models/vocab.txt so the
# expensive dataset read happens only on the first run.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks restored and the missing
# `os` / CoQAReader imports added (paths inferred from sibling scripts).
import logging
import os
import sys

import tensorflow as tf

from sogou_mrc.dataset.coqa import CoQAReader
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

coqa_reader = CoQAReader(-1)
data_folder = os.path.join("/", "home", "baheti", "QADialogueSystem", "Data",
                           "QA_datasets", "coqa/")
train_filename = "coqa-train-v1.0.json"
eval_filename = "coqa-dev-v1.0.json"

vocab = Vocabulary(do_lowercase=True)
vocab_filepath = os.path.join("models", "vocab.txt")
if os.path.exists(vocab_filepath):
    print("loading from filepath")
    # load from the filepath
    vocab.load(vocab_filepath)
else:
    print("creating vocab as new")
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)

# Squad seq2seq_train_moses_tokenized
# DATA_DIR = os.path.join("/", "home", "baheti", "QADialogueSystem", "RuleBasedQuestionsToAnswer", "squad_seq2seq_train_moses_tokenized")
# coqa_format_test_save_file = os.path.join(DATA_DIR, "squad_seq2seq_predicted_responses_test_coqa_format.json")
# Train-prep that reloads a previously saved vocabulary instead of
# rebuilding it, then constructs batch generators.
# NOTE(review): this script was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks and the missing import
# header are restored. sogou_mrc import paths inferred from sibling scripts.
import logging

import tensorflow as tf

from sogou_mrc.dataset.cmrc import CMRCReader, CMRCEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator

tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
# NOTE(review): SQuAD v1.1 files are read with the CMRC reader/evaluator
# here -- the sibling scripts pair SquadReader with these files; verify.
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = CMRCReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = CMRCEvaluator(dev_file)

vocab = Vocabulary()
# NOTE(review): sibling scripts save the vocab under '/root/ZX/SMRCToolkit/'
# -- this path may be missing the 'ZX' segment; verify before running.
vocab_save_path = '/root/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.load(vocab_save_path)  # load vocab from save path
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()
#test_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)
# SQuAD 1.1 pipeline with linguistic features; original comments were in
# Chinese and are translated to English below (runtime print strings are
# preserved byte-for-byte).
# NOTE(review): this span was collapsed onto a single physical line in the
# source (a syntax error as written); line breaks restored. `data_folder`
# is referenced but not defined in the visible span -- presumably assigned
# just before this chunk; verify. sogou_mrc imports added defensively
# (re-import is harmless if the missing header already has them).
import logging

from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.utils.feature_extractor import FeatureExtractor
from sogou_mrc.data.batch_generator import BatchGenerator

embedding_folder = '../../embedding_folder'

# Prepare the dataset reader and evaluator.
print("step 1:准备数据集阅读器和鉴别器...")
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Build a vocabulary and load the pretrained embedding.
print("step 2:构建词汇表并加载预训练嵌入...")
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# BUG FIX: embedding_folder has no trailing slash, so the original
# `embedding_folder+"glove.840B.300d.txt"` produced
# '../../embedding_folderglove.840B.300d.txt'; insert the separator.
word_embedding = vocab.make_word_embedding(embedding_folder + "/glove.840B.300d.txt")

# Use the feature extractor, which is only necessary when using
# linguistic features.
print("step 3:用特征提取器(特征提取器只是在使用语言特征时才需要)...")
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)

# Build batch generators for training and evaluation; the extra fields and
# the feature vocabulary are needed only when linguistic features are used.
print("step 4:构建用于训练和评估的批处理生成器,其中在使用语言特征时需要附加特征和特征词汇表...")
train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
eval_batch_generator = BatchGenerator(
    vocab, eval_data, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)