# SQuAD v1.1 training run: read data, build vocab + GloVe embeddings,
# then train and evaluate a BiDAF model.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Paths are relative to the working directory (both prefixes left empty here).
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Load the raw SQuAD v1.1 train/dev sets.
squad_reader = SquadReader()
train_data = squad_reader.read(train_file)
eval_data = squad_reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Vocabulary is built over train + dev; rare words/chars are pruned.
vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.6B.100d.txt")

# Batch generators (training=True enables shuffling on the train side).
train_batch_generator = BatchGenerator(vocab, train_data, batch_size=60, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
# NOTE(review): "eposides" looks like a typo of "episodes", but it is passed
# through verbatim — presumably the toolkit's own keyword name; confirm before renaming.
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=15, eposides=2)
# SQuAD v1.1 data-preparation run (fine-grained tokenization, case-sensitive
# vocab): read data, build + persist the vocabulary, load 840B GloVe vectors,
# and construct batch generators.
# NOTE(review): the original line began mid-call ("level=logging.INFO, ...") —
# the "logging.basicConfig(" opening was truncated; reconstructed to match the
# identical call in the sibling scripts.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Absolute paths for this particular machine.
data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
# Hoisted: the vocab save prefix was previously hard-coded twice below,
# inconsistent with the data_folder/embedding_folder convention.
vocab_save_folder = '/root/ZX/SMRCToolkit/vocab_save_folder/'

train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader(fine_grained=True)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Case-sensitive vocabulary over train + dev; rare words/chars pruned.
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# NOTE(review): the vocabulary is saved twice (testvocab.json here, vocab.json
# below) — presumably a scratch copy plus the real one; confirm both are needed.
vocab.save(vocab_save_folder + "testvocab.json")

word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# save vocab
vocab_save_path = vocab_save_folder + 'vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()

# Placeholders, presumably filled in by code beyond this chunk.
content = ""
question = ""
# CMRC 2018 (Chinese machine reading comprehension) training run: read data,
# build a case-sensitive vocab with pretrained embeddings, train BiDAF.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = ''
embedding_folder = ''
train_file = data_folder + "cmrc2018_train.json"
dev_file = data_folder + "cmrc2018_dev.json"

# Load the CMRC 2018 train/dev sets.
cmrc_reader = CMRCReader()
train_data = cmrc_reader.read(train_file)
eval_data = cmrc_reader.read(dev_file)
evaluator = CMRCEvaluator(dev_file)

# Case is preserved; rare words/chars are pruned from the vocabulary.
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# NOTE(review): embedding_folder is passed directly (no file name appended) —
# presumably it holds the full path to an embedding file here; confirm.
word_embedding = vocab.make_word_embedding(embedding_folder)

# NOTE(review): train batch size (32) differs from eval (60) — verify intentional.
train_batch_generator = BatchGenerator(vocab, train_data, batch_size=32, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model = BiDAF(vocab, pretrained_word_embedding=word_embedding,
              word_embedding_size=300)
model.compile(tf.train.AdamOptimizer, 0.001)
# "eposides" is passed verbatim — presumably the toolkit's keyword name.
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=50, eposides=2)
# SQuAD v2.0 data-preparation run with stage timing: read data, build a
# lowercased vocab + 840B GloVe embeddings, and construct batch generators
# that carry the abstractive (no-answer) mask needed for v2.
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"

# perf_counter() is monotonic — unlike time.time(), wall-clock adjustments
# cannot skew the measured durations.
t0 = time.perf_counter()
reader = SquadV2Reader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)
cost = time.perf_counter() - t0
# Lazy %-args: the string is only formatted if the record is emitted.
logging.info("seg cost=%.3f", cost)

t0 = time.perf_counter()
vocab = Vocabulary(do_lowercase=True)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding("glove.840B.300d.txt", init_scale=0.05)
cost = time.perf_counter() - t0
logging.info("make vocab cost=%.3f", cost)

# "abstractive_answer_mask" marks unanswerable questions, required by SQuAD v2.
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    batch_size=16,
    training=True,
    additional_fields=["abstractive_answer_mask"])
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=16,
    training=False,
    additional_fields=["abstractive_answer_mask"])