Esempio n. 1
0
# Train a BiDAF model on SQuAD v1.1 with 100-d GloVe embeddings.
tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Empty folders mean the data/embedding files are expected in the CWD.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Parse both SQuAD splits and build the official evaluator on the dev file.
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Vocabulary over both splits with rare words/chars pruned, then look up
# pretrained 100-d GloVe vectors for the retained words.
vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.6B.100d.txt")

# Mini-batch iterators; training=True presumably marks the shuffled
# training split — confirm against BatchGenerator.
train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=60,
                                       training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

# Build BiDAF with the pretrained embedding and optimize with Adam (lr=0.001).
model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
# NOTE(review): this call is truncated in the excerpt — the remaining
# arguments (evaluator, epochs, ...) are missing.
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
Esempio n. 2
0
from sogou_mrc.train.trainer import Trainer

# Run a previously trained BiDAF checkpoint on the SQuAD v1.1 dev set
# and report the official evaluation metrics.
tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# All paths hang off the toolkit checkout: dev data, cached vocabulary,
# and the best saved weights.
base_folder = '/Users/huihui/git/SogouMRCToolkit/'
data_folder = base_folder + 'data/'
dev_file = data_folder + "dev-v1.1.json"
vocab_save_path = base_folder + 'data/vocab.json'
model_dir = base_folder + 'models/bidaf/best_weights'

# Load the dev split and its evaluator.
reader = SquadReader()
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Restore the vocabulary that was persisted during training.
vocab = Vocabulary()
vocab.load(vocab_save_path)

test_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

# Rebuild the graph, restore the best weights, and initialize TF's local
# variables before running any ops.
model = BiDAF(vocab)
model.load(model_dir)
model.session.run(tf.local_variables_initializer())

# Predict on the dev set, then score the predictions.
model.inference(test_batch_generator)

model.evaluate(test_batch_generator, evaluator)
Esempio n. 3
0
# SQuAD v1.1 preprocessing: fine-grained reading, case-sensitive vocabulary,
# 840B GloVe embedding lookup, and batch generators for a training run.
tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Fine-grained tokenization; the vocabulary below preserves case.
reader = SquadReader(fine_grained=True)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Vocabulary over both splits with rare words/chars dropped; it is written
# to disk twice (a test copy first, the canonical vocab.json afterwards).
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

# Batch generators (batch size 50); training=True flags the training split.
train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
Esempio n. 4
0
# Train BiDAF on the CMRC 2018 (Chinese MRC) dataset.
tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Empty folders mean the files are expected in the current working directory.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "cmrc2018_train.json"
dev_file = data_folder + "cmrc2018_dev.json"

# Read both CMRC splits and build the matching evaluator on the dev file.
reader = CMRCReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = CMRCEvaluator(dev_file)

# Case-sensitive vocabulary over both splits; rare words/chars are pruned.
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# NOTE(review): the argument here is the folder itself, not an embedding
# file — presumably a filename should be appended; confirm.
word_embedding = vocab.make_word_embedding(embedding_folder)
train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=32,
                                       training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

# 300-d pretrained vectors feed a BiDAF model optimized with Adam (lr=0.001).
model = BiDAF(vocab,
              pretrained_word_embedding=word_embedding,
              word_embedding_size=300)
model.compile(tf.train.AdamOptimizer, 0.001)
# NOTE(review): call truncated in this excerpt — trailing arguments missing.
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
Esempio n. 5
0
import logging
from sogou_mrc.utils.feature_extractor import FeatureExtractor
from sogou_mrc.data.batch_generator import BatchGenerator

# SQuAD v1.1 preprocessing with hand-crafted linguistic token features.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Empty folders mean the files are expected in the current working directory.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)
# Case-sensitive vocabulary; rare tokens pruned, then 300-d GloVe loaded.
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.840B.300d.txt")
# Token-level features; 'pos' and 'ner' get their own feature vocabularies.
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
# Fit on the training set; the eval set is only transformed.
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)
# NOTE(review): call truncated in this excerpt — trailing arguments missing.
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    training=True,
    batch_size=32,
    additional_fields=feature_transformer.features,
import random

# SQuAD v1.1 preparation for a BERT-based model.
random.seed(1234)  # make the shuffle below reproducible
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = 'E:/dataset/SQuAD1.0/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
random.shuffle(train_data)  # one up-front shuffle, seeded above

eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)
vocab = Vocabulary(do_lowercase=True)
# Uncased BERT-base checkpoint directory; the helper converts examples
# into BERT input features.
bert_dir = 'H:/result/BERT/uncased_L-12_H-768_A-12/'
bert_data_helper = BertDataHelper(bert_dir)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)

# convert both splits to BERT input format
train_data = bert_data_helper.convert(train_data, data='squad')
eval_data = bert_data_helper.convert(eval_data, data='squad')

from sogou_mrc.data.batch_generator import BatchGenerator

# NOTE(review): call truncated in this excerpt — the additional_fields
# list and any trailing arguments are missing.
train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       training=True,
                                       batch_size=2,
                                       additional_fields=[
Esempio n. 7
0
from sogou_mrc.dataset.coqa import CoQAReader, CoQAEvaluator
from sogou_mrc.libraries.BertWrapper import BertDataHelper
from sogou_mrc.model.bert_coqa import BertCoQA
from sogou_mrc.data.vocabulary import Vocabulary
import logging
import sys

# CoQA preparation for a BERT-based conversational QA model.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# NOTE(review): the meaning of the -1 argument (history window?) is not
# visible here — confirm against the CoQAReader definition.
coqa_reader = CoQAReader(-1)
data_folder = ''
train_filename = "coqa-train-v1.0.json"
eval_filename = 'coqa-dev-v1.0.json'
vocab = Vocabulary(do_lowercase=False)
train_data = coqa_reader.read(data_folder + train_filename, 'train')
eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
vocab.build_vocab(train_data + eval_data)

evaluator = CoQAEvaluator(data_folder + eval_filename)
# Convert both splits into BERT input features using the checkpoint in 'model'.
bert_dir = 'model'
bert_data_helper = BertDataHelper(bert_dir)
train_data = bert_data_helper.convert(train_data, data='coqa')
eval_data = bert_data_helper.convert(eval_data, data='coqa')

from sogou_mrc.data.batch_generator import BatchGenerator

# NOTE(review): call truncated in this excerpt — trailing arguments missing.
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
Esempio n. 8
0
# SQuAD v2.0 preparation with timing logs.
# NOTE(review): the head of this snippet is missing — the next two lines
# are the tail of a logging.basicConfig(...) call whose opening is cut off.
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"

# Time the read/segmentation phase.
t0 = time.time()
reader = SquadV2Reader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)
cost = time.time() - t0
logging.info("seg cost=%.3f" % cost)

# Time vocabulary construction and embedding lookup.
t0 = time.time()
vocab = Vocabulary(do_lowercase=True)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding("glove.840B.300d.txt",
                                           init_scale=0.05)
cost = time.time() - t0
logging.info("make vocab cost=%.3f" % cost)

# "abstractive_answer_mask" presumably relates to SQuAD v2's unanswerable
# questions — confirm against the model's expected fields.
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    batch_size=16,
    training=True,
    additional_fields=["abstractive_answer_mask"])
# NOTE(review): call truncated in this excerpt — trailing arguments missing.
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
import tensorflow as tf
import logging
import sys

# Build a CoQA vocabulary, caching it under models/vocab.txt so later runs
# can reload it instead of re-reading the full corpus.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

coqa_reader = CoQAReader(-1)
data_folder = os.path.join("/", "home", "baheti", "QADialogueSystem", "Data",
                           "QA_datasets", "coqa/")
train_filename = "coqa-train-v1.0.json"
eval_filename = "coqa-dev-v1.0.json"

vocab = Vocabulary(do_lowercase=True)
vocab_filepath = os.path.join("models", "vocab.txt")
if not os.path.exists(vocab_filepath):
    # First run: read both CoQA splits, build the vocab, and cache it.
    print("creating vocab as new")
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)
else:
    # Cached vocabulary found: reload it from disk.
    print("loading from filepath")
    vocab.load(vocab_filepath)

# Squad seq2seq_train_moses_tokenized
# DATA_DIR = os.path.join("/", "home", "baheti", "QADialogueSystem", "RuleBasedQuestionsToAnswer", "squad_seq2seq_train_moses_tokenized")
# coqa_format_test_save_file = os.path.join(DATA_DIR, "squad_seq2seq_predicted_responses_test_coqa_format.json")
Esempio n. 10
0
# CMRC training-side data preparation that reuses a previously saved
# vocabulary instead of rebuilding it.
tf.logging.set_verbosity(tf.logging.ERROR)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Read both splits with the CMRC reader and build the matching evaluator.
reader = CMRCReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = CMRCEvaluator(dev_file)

# Restore the vocabulary saved by an earlier run, then look up 300-d GloVe
# vectors for its words.
vocab = Vocabulary()
vocab_save_path = '/root/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.load(vocab_save_path)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# Batch generators (batch size 50); only the training one is primed here.
train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50,
                                       training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()

#test_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)
Esempio n. 11
0
# SQuAD v1.1 pipeline with linguistic features: read data, build vocab,
# load embeddings, extract features, and create batch generators.
# FIX: trailing slash added to embedding_folder — the original
# '../../embedding_folder' + "glove.840B.300d.txt" concatenated to the
# nonexistent path '../../embedding_folderglove.840B.300d.txt'.
embedding_folder = '../../embedding_folder/'

# Prepare the dataset reader and evaluator.
print("step 1:准备数据集阅读器和鉴别器...")
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Build a case-sensitive vocabulary and load the pretrained embedding.
print("step 2:构建词汇表并加载预训练嵌入...")
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# Feature extractor — only necessary when using linguistic features;
# 'pos' and 'ner' get their own feature vocabularies.
print("step 3:用特征提取器(特征提取器只是在使用语言特征时才需要)...")
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)

# Batch generators for training and evaluation; the additional fields and
# feature vocabulary are required when linguistic features are enabled.
print("step 4:构建用于训练和评估的批处理生成器,其中在使用语言特征时需要附加特征和特征词汇表...")
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    training=True,
    batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)