# --- SQuAD 1.1 + BERT data preparation (script fragment) ---
# NOTE(review): hard-coded machine-specific Windows paths — parameterize before reuse.
data_folder = 'E:/dataset/SQuAD1.0/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
random.shuffle(train_data)  # shuffle once up front; BatchGenerator(training=True) presumably also shuffles — confirm

eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)
vocab = Vocabulary(do_lowercase=True)  # lowercased to match the uncased BERT checkpoint below
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
# Vocabulary built over train+dev; the min-count thresholds drop rare words/chars.
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)

# convert data to BERT input format (token ids, segment ids, masks, answer positions)
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')

from sogou_mrc.data.batch_generator import BatchGenerator

# additional_fields are the BERT feature columns the generator must batch
# alongside its defaults; batch_size=2 looks like a debug setting — confirm.
train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       training=True,
                                       batch_size=2,
                                       additional_fields=[
                                           'input_ids', 'segment_ids',
                                           'input_mask', 'start_position',
                                           'end_position'
                                       ])
eval_batch_generator = BatchGenerator(vocab,
                                      eval_data,
Beispiel #2
0
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# --- CoQA + BERT data preparation (script fragment) ---
coqa_reader = CoQAReader(-1)  # NOTE(review): -1 presumably means "use full dialog history" — confirm against CoQAReader
data_folder = ''
train_filename = "coqa-train-v1.0.json"
eval_filename = 'coqa-dev-v1.0.json'
vocab = Vocabulary(do_lowercase=False)  # case-sensitive here, unlike the uncased setups elsewhere in this file — confirm intent
train_data = coqa_reader.read(data_folder + train_filename, 'train')
eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
vocab.build_vocab(train_data + eval_data)

evaluator = CoQAEvaluator(data_folder + eval_filename)
bert_dir = 'model'
bert_data_helper = BertDataHelper(bert_dir)
# convert raw examples into BERT features for the CoQA task
train_data = bert_data_helper.convert(train_data, data='coqa')
eval_data = bert_data_helper.convert(eval_data, data='coqa')

from sogou_mrc.data.batch_generator import BatchGenerator

# CoQA needs extra fields beyond SQuAD: per-answer-type masks (yes/no/unk/
# extractive), rationale mask, and qid for evaluator lookup.
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    training=True,
    batch_size=6,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',
        'extractive_mask', 'no_mask', 'unk_mask', 'qid'
    ])
eval_batch_generator = BatchGenerator(
# SQUAD seq2seq dev moses tokenized
# --- Evaluate a trained BertCoQA model on SQuAD data converted to CoQA format ---
DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer",
                        "squad_seq2seq_dev_moses_tokenized")
coqa_format_test_save_file = os.path.join(
    DATA_DIR, "squad_seq2seq_dev_moses_test_coqa_format.json")
src_squad_seq2seq_predicted_responses_file = os.path.join(
    DATA_DIR, "src_squad_seq2seq_dev_moses_test.txt")
predictions_save_file = "coqa_predictions_on_squad_seq2seq_dev_moses_test.txt"

# NOTE(review): relies on a coqa_reader created earlier in the file.
test_data = coqa_reader.read(coqa_format_test_save_file, 'test')
evaluator = CoQAEvaluator(coqa_format_test_save_file)

best_model_path = os.path.join('models', 'best_weights')
bert_dir = 'uncased_L-12_H-768_A-12'
bert_data_helper = BertDataHelper(bert_dir)
test_data = bert_data_helper.convert(test_data, data='coqa')

# answer_verification=True presumably enables the answer-type verification head — confirm
model = BertCoQA(bert_dir=bert_dir, answer_verification=True)
print("loading model")
model.load(best_model_path)
print("model loaded")

my_batch_size = 6
test_batch_generator = BatchGenerator(
    vocab,
    test_data,
    training=False,
    batch_size=my_batch_size,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',
import sys
# --- CoQA + BERT training setup (script fragment) ---
# Process-wide logging config; fine for a script, but a library module should
# use logging.getLogger(__name__) instead of configuring the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

coqa_reader = CoQAReader(-1)  # NOTE(review): -1 presumably selects full dialog history — confirm
data_folder = ''
train_filename = "coqa-train-v1.0.json"
eval_filename = 'coqa-dev-v1.0.json'
vocab = Vocabulary(do_lowercase=True)  # lowercased to match the uncased BERT checkpoint
train_data = coqa_reader.read(data_folder + train_filename, 'train')
eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
vocab.build_vocab(train_data + eval_data)

evaluator = CoQAEvaluator(data_folder + eval_filename)
bert_dir = 'uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
# NOTE(review): other fragments in this file pass data='coqa' explicitly; here
# the helper's default task is relied on — confirm the default matches CoQA.
train_data = bert_data_helper.convert(train_data)
eval_data = bert_data_helper.convert(eval_data)


from sogou_mrc.data.batch_generator import BatchGenerator

# CoQA needs per-answer-type masks (yes/no/unk/extractive), rationale mask,
# and qid (for evaluator lookup) in addition to the standard BERT fields.
train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=12,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',
        'extractive_mask', 'no_mask', 'unk_mask', 'qid'
    ])
eval_batch_generator = BatchGenerator(
    vocab, eval_data, training=False, batch_size=8,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',
        'extractive_mask', 'no_mask', 'unk_mask', 'qid'
    ])

# FIX: keyword was misspelled 'answer_verificatioin', which would raise
# TypeError at call time — the parameter is spelled 'answer_verification'
# everywhere else in this file (e.g. BertCoQA(..., answer_verification=True)).
model = BertCoQA(bert_dir=bert_dir, answer_verification=False)
warmup_proportion = 0.1
num_epochs = 2
num_train_steps = int(
Beispiel #5
0
else:
    print("creating vocab as new")
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)

# --- Load a trained BertCoQA model and prepare the CoQA dev set for evaluation ---
DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer",
                        "squad_seq2seq_train_moses_tokenized")
# NOTE(review): DATA_DIR is assigned but unused in the visible lines below — confirm it is used later.
val_data = coqa_reader.read(data_folder + eval_filename, 'dev')
evaluator = CoQAEvaluator(data_folder + eval_filename)

best_model_path = os.path.join('models', 'best_weights')
bert_dir = 'uncased_L-12_H-768_A-12'
bert_data_helper = BertDataHelper(bert_dir)
val_data = bert_data_helper.convert(val_data, data='coqa')

# answer_verification=True presumably enables the answer-type verification head — confirm
model = BertCoQA(bert_dir=bert_dir, answer_verification=True)
print("loading model")
model.load(best_model_path)
print("model loaded")

my_batch_size = 6
eval_batch_generator = BatchGenerator(
    vocab,
    val_data,
    training=False,
    batch_size=my_batch_size,
    additional_fields=[
        'input_ids', 'segment_ids', 'input_mask', 'start_position',
        'end_position', 'question_mask', 'rationale_mask', 'yes_mask',