Example 1
def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
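    # legacy spaCy 1.x API: load the English model without the dependency parser;
    # its tagger/entity label sets are used below to build the POS/NER vocabularies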
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)
    # load data
    logger.info('Loading data.')
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)
    vocab_tag = Vocabulary.build(nlp.tagger.tag_names, neat=True)
    vocab_ner = Vocabulary.build([''] + nlp.entity.cfg[u'actions']['1'],
                                 neat=True)
    logger.info('Build vocabulary')
    vocab = build_vocab(train_data + valid_data,
                        glove_vocab,
                        sort_all=args.sort_all,
                        clean_on=True)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }

    # If you want to check vocab token IDs, etc., load the meta file below (squad_meta.pick).
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    logger.info('started the function build_data')
    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               thread=args.threads)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               thread=args.threads)
Example 2
import os

from datetime import datetime
from collections import Counter, defaultdict
from src.model import DocReaderModel
from src.batcher import load_meta, BatchGen
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger

args = set_args()
# set model dir
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)

# set environment
set_environment(args.seed, args.cuda)
# setup logger
logger = create_logger(__name__, to_disk=True, log_file=args.log_file)


def main():
    logger.info('Launching the SAN')
    opt = vars(args)
    logger.info(opt)
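    # load_meta reads the preprocessing pickle (vocab + embedding built in the
    # prepro examples) and returns the embedding matrix, updated options, and vocabulary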
    embedding, opt, vocab = load_meta(opt, args.meta)
    max_doc = opt['max_doc']
    smooth = opt['smooth']
    is_rep = opt['is_rep']
    eval_step = opt['eval_step']
    curve_file = opt['curve_file']
Example 3
import os
import json

from src.batcher import load_meta, BatchGen
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger
from my_utils.squad_eval import evaluate
from my_utils.data_utils import predict_squad, gen_name, gen_gold_name, load_squad_v2_label, compute_acc
from my_utils.squad_eval_v2 import my_evaluation as evaluate_v2

args = set_args()
# set model dir
model_dir = args.model_dir  # default='checkpoint'
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)  # acquire absolute path

# set environment
set_environment(args.seed, args.cuda)  # seed default=2018
# setup logger
logger = create_logger(__name__, to_disk=True, log_file=args.log_file)


def load_squad(data_path):
    with open(data_path) as dataset_file:
        dataset_json = json.load(dataset_file)
        dataset = dataset_json['data']
        return dataset


def main():
    logger.info('Launching the SAN')
    opt = vars(args)
    logger.info('Loading data')
Example 4
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # train_path = 'train-v2.0.json'
        # dev_path = 'dev-v2.0.json'

        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'

        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('Train path is: {}'.format(train_path))

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)

    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    del meta
    del embedding
    logger.info('deleted meta and embedding')

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example 5
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.warning('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    # load data
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + valid_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True)

    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False)
    end_time = time.time()
    logger.info('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example 6
def main():
    args = set_args()
    args.datasets = args.datasets.split(',')
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    all_data = []
    all_datasets = []
    for dataset_name in args.datasets:
        test_file_prefix = 'test'
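        # test_mode (defined outside this excerpt) swaps in the smaller dev split
        # as training data for every dataset except MS MARCO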
        if test_mode:
            if 'marco' in dataset_name:
                train_file_prefix = 'train'
                dev_file_prefix = 'dev'
            else:
                train_file_prefix = 'dev'
                dev_file_prefix = 'dev'
        else:
            train_file_prefix = 'train'
            dev_file_prefix = 'dev'

        logger.info('Processing %s dataset' % dataset_name)
        this_data_dir = args.data_dir + dataset_name + '/'
        train_data = None
        train_path = os.path.join(this_data_dir, '%s.json' % train_file_prefix)
        logger.info('The path of training data: {}'.format(train_path))
        train_data = load_data(train_path)
        all_data += train_data

        valid_path = os.path.join(this_data_dir, '%s.json' % dev_file_prefix)
        logger.info('The path of validation data: {}'.format(valid_path))
        valid_data = load_data(valid_path, False)
        all_data += valid_data
        if args.include_test_set and 'squad' not in dataset_name and 'marco2.0' not in dataset_name:
            test_path = os.path.join(this_data_dir,
                                     '%s.json' % test_file_prefix)
            logger.info('The path of test data: {}'.format(test_path))
            test_data = load_data(test_path, False)
            all_data += test_data
            all_datasets.append((train_data, valid_data, test_data))
        else:
            all_datasets.append((train_data, valid_data))

    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    multitask_base_path = '../data/mtmrc/'
    with open(multitask_base_path + 'vocab_tag.pick', 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(multitask_base_path + 'vocab_ner.pick', 'rb') as f:
        vocab_ner = pickle.load(f)

    logger.info('Build vocabulary ')
    vocab = build_vocab(all_data,
                        glove_vocab,
                        sort_all=args.sort_all,
                        clean_on=True,
                        args=args)
    meta_path = os.path.join(args.output_path, args.meta)
    logger.info('building embedding ')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    for i, item in enumerate(all_datasets):
        dataset_name = args.datasets[i]
        if args.include_test_set and 'squad' not in dataset_name and 'marco2.0' not in dataset_name:
            train_data, valid_data, test_data = item
        else:
            train_data, valid_data = item
        print('building output file for ', dataset_name)
        train_fout = os.path.join(args.output_path,
                                  dataset_name + '_train.json')
        build_data(train_data,
                   vocab,
                   vocab_tag,
                   vocab_ner,
                   train_fout,
                   True,
                   dataset_name=dataset_name)
        dev_fout = os.path.join(args.output_path, dataset_name + '_dev.json')
        build_data(valid_data,
                   vocab,
                   vocab_tag,
                   vocab_ner,
                   dev_fout,
                   False,
                   dataset_name=dataset_name)
        if args.include_test_set and 'squad' not in dataset_name:
            test_fout = os.path.join(args.output_path,
                                     dataset_name + '_test.json')
            build_data(test_data,
                       vocab,
                       vocab_tag,
                       vocab_ner,
                       test_fout,
                       False,
                       dataset_name=dataset_name)
Example 7
                           n_threads=n_threads)

    dev_data = BatchGen(test_data,
                        batch_size,
                        have_gpu,
                        is_train=False,
                        with_label=True)
    #batches.reset()
    #batches = list(batches)

    model_path = model_root + 'best_checkpoint.pt'

    checkpoint = torch.load(model_path)
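    # the checkpoint bundles 'config' (training options) and 'state_dict' (model weights)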

    opt = checkpoint['config']
    set_environment(opt['seed'], have_gpu)
    opt['covec_path'] = mtlstm_path
    opt['cuda'] = have_gpu
    opt['multi_gpu'] = False
    opt['max_len'] = max_len
    state_dict = checkpoint['state_dict']
    model = DocReaderModel(opt, state_dict=state_dict)
    model.setup_eval_embed(torch.Tensor(test_embedding))
    logger.info('Loaded model!')

    if have_gpu:
        model.cuda()

    results, score_list = evaluate_squad_v2(model, dev_data)

    dev_gold = load_squad_v2(test_file)
Example 8
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True,
                           log_file=args.log_file)  # ./san.log
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(
        args.data_dir, train_path)  # args.data_dir=data/, data/train-v2.0.json
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))  # embedding_dim=300
    # could be fasttext embedding
    emb_path = args.glove  # data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:  # store_true
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    """From GLoVe to acquire tokens, to set()"""
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    """
    '--sort_all', action='store_true',
        sort the vocabulary by frequencies of all words, Otherwise consider question words first.
    """
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
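    # bundle the vocabularies and embedding matrix for pickling; load_meta()
    # reads this file back at training time (as in Example 2)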
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example 9
def main():
    # Create an argument parser and read arguments from the command line
    args = set_args()
    # logger will be a global variable
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
        version = 'v1'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)

    wemb_vocab = load_emb_vocab(emb_path, embedding_dim, fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab, sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    # pre-built POS-tag and NER vocabularies used to generate tagging features
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim, fast_vec_format=args.fasttext_on)
    meta = {'vocab': vocab, 'vocab_tag': vocab_tag, 'vocab_ner': vocab_ner, 'embedding': embedding}
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format((end_time - start_time) / 60.))