Example #1
import logging
import tensorflow as tf
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.model.bidaf import BiDAF

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.6B.100d.txt")

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=60,
                                       training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=15,
                         eposides=2)  # 'eposides' (sic) is the keyword name the toolkit defines
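
# Example #2 below reloads this vocabulary from disk; it can be persisted with
# vocab.save, as shown in Examples #4 and #6 (the path here is illustrative):
vocab_save_path = data_folder + 'vocab.json'
vocab.save(vocab_save_path)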
Example #2
# imports as in Example #1
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

base_folder = '/Users/huihui/git/SogouMRCToolkit/'
data_folder = base_folder + 'data/'
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary()
vocab_save_path = base_folder + 'data/vocab.json'
vocab.load(vocab_save_path)  # load vocab from save path

test_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

model_dir = base_folder + 'models/bidaf/best_weights'
model = BiDAF(vocab)
model.load(model_dir)
model.session.run(tf.local_variables_initializer())
model.inference(test_batch_generator)  # inference on test data

model.evaluate(test_batch_generator, evaluator)

# SquadEvaluator also exposes static scoring helpers:
# SquadEvaluator.exact_match_score(prediction, ground_truth)
# SquadEvaluator.f1_score(prediction, ground_truth)
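
# A minimal sketch of those helpers, assuming official SQuAD-style signatures
# and normalization (lowercasing, article and punctuation removal); the sample
# strings are illustrative.
prediction = "in the left atrium"
ground_truth = "the left atrium"
print(SquadEvaluator.exact_match_score(prediction, ground_truth))  # 0.0: strings differ after normalization
print(SquadEvaluator.f1_score(prediction, ground_truth))           # 0.8: token-level F1 (precision 2/3, recall 1)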

eval_batch_generator = test_batch_generator
eval_batch_generator.init()
Example #3
# assumes train_data and eval_data were read with SquadReader, as in Example #1
vocab = Vocabulary(do_lowercase=True)
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)

# convert data to BERT format
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')

from sogou_mrc.data.batch_generator import BatchGenerator

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       training=True,
                                       batch_size=2,
                                       additional_fields=[
                                           'input_ids', 'segment_ids',
                                           'input_mask', 'start_position',
                                           'end_position'
                                       ])
eval_batch_generator = BatchGenerator(vocab,
                                      eval_data,
                                      training=False,
                                      batch_size=2,
                                      additional_fields=[
                                          'input_ids', 'segment_ids',
                                          'input_mask', 'start_position',
                                          'end_position'
                                      ])
model = BertBaseline(bert_dir=bert_dir, version_2_with_negative=False)
warmup_proportion = 0.1
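
# The snippet breaks off here. A hedged sketch of how training typically
# continues, following the train_and_evaluate pattern from the other examples;
# BertBaseline.compile's signature (learning rate plus warmup_proportion) is
# an assumption, not confirmed API.
model.compile(3e-5, warmup_proportion=warmup_proportion)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=2,   # illustrative values
                         eposides=1)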
Example #4
# assumes reader = SquadReader() and train_file/dev_file/embedding_folder
# defined as in Example #1
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.840B.300d.txt")

# save vocab
vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=50,
                                       training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()

content = ""
question = ""

# train and save checkpoint in save_dir
save_dir = '/root/ZX/SMRCToolkit/model_save_folder'  # define save_dir path
model = QANET(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 1e-3)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=15,  # closing arguments assumed, following Example #1
                         eposides=2,
                         save_dir=save_dir)  # assumed kwarg so checkpoints land in save_dir, per the comment above
Example #5
# assumes reader = SquadReader(), train_data = reader.read(train_file), and
# embedding_folder defined as in Example #1
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.840B.300d.txt")
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names={'pos', 'ner'},
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    training=True,
    batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
model = DrQA(vocab,
             word_embedding,
             features=feature_transformer.features,
             feature_vocab=feature_transformer.vocab)
# the original DrQA paper trains with the Adamax optimizer; compile() falls
# back to the model's default here
model.compile()
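
# A minimal sketch of the training step that would follow, reusing the
# train_and_evaluate pattern from Examples #1 and #7; the epoch count is
# illustrative rather than from the source.
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=15,
                         eposides=2)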
Example #6
reader = SquadV2Reader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)

vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.6B.100d.txt")

vocab_save_path = 'H:/result/bidafv2/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=60,
                                       training=True,
                                       additional_fields=['is_impossible'])
eval_batch_generator = BatchGenerator(vocab,
                                      eval_data,
                                      batch_size=60,
                                      additional_fields=['is_impossible'])

save_dir = 'H:/result/bidafv2'

model = BiDAF(vocab,
              pretrained_word_embedding=word_embedding,
              enable_na_answer=True)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=15,  # closing arguments follow Example #1
                         eposides=2)
Example #7
# assumes data_folder and embedding_folder are set up as in Example #1
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.42B.300d.txt")

train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    batch_size=60,
    training=True,
    additional_fields=['context_word_len', 'question_word_len'])
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=60,
    additional_fields=['context_word_len', 'question_word_len'])
model = BiDAFPlusSQuad(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=15,
                         eposides=2)
Example #8
# assumes coqa_reader (a CoQA reader), train_data, vocab, data_folder and
# eval_filename are already defined; this is the CoQA counterpart of the
# SQuAD setup above
eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
vocab.build_vocab(train_data + eval_data)

evaluator = CoQAEvaluator(data_folder + eval_filename)
bert_dir = 'model'
bert_data_helper = BertDataHelper(bert_dir)
train_data = bert_data_helper.convert(train_data, data='coqa')
eval_data = bert_data_helper.convert(eval_data, data='coqa')

from sogou_mrc.data.batch_generator import BatchGenerator

train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    training=True,
    batch_size=6,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',
        'extractive_mask', 'no_mask', 'unk_mask', 'qid'
    ])
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    training=False,
    batch_size=12,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',
        'extractive_mask', 'no_mask', 'unk_mask', 'qid'
    ])
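
# The snippet stops before a model is built. By analogy with Example #3, a
# BERT-based CoQA model would be constructed from bert_dir and trained with
# these generators; the class name BertCoQA and its arguments are assumptions.
model = BertCoQA(bert_dir=bert_dir)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=2,   # illustrative values
                         eposides=1)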
Example #9
import time
import logging

# assumes reader = SquadV2Reader(), train_data already read, and t0 = time.time()
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)
cost = time.time() - t0
logging.info("seg cost=%.3f" % cost)

t0 = time.time()
vocab = Vocabulary(do_lowercase=True)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding("glove.840B.300d.txt",
                                           init_scale=0.05)
cost = time.time() - t0
logging.info("make vocab cost=%.3f" % cost)

train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    batch_size=16,
    training=True,
    additional_fields=["abstractive_answer_mask"])
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=16,
    training=False,
    additional_fields=["abstractive_answer_mask"])

use_elmo = True
save_path = "squad2_elmo"

if use_elmo:
    model = BiDAFPlus(vocab,
                      pretrained_word_embedding=word_embedding,