Example 1
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True,
                           log_file=args.log_file)  # ./san.log
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(
        args.data_dir, train_path)  # args.data_dir=data/, data/train-v2.0.json
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))  # embedding_dim=300
    # could be fasttext embedding
    emb_path = args.glove  # data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:  # store_true
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    """From GLoVe to acquire tokens, to set()"""
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    """
    '--sort_all', action='store_true',
        sort the vocabulary by frequencies of all words, Otherwise consider question words first.
    """
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It took {} minutes in total to process the data!'.format(
        (end_time - start_time) / 60.))
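
The attributes read from `args` imply the script's command-line interface. As a rough sketch only, here is a hypothetical `set_args` reconstructed from the attributes the example uses (`log_file`, `v2_on`, `data_dir`, `glove`, `embedding_dim`, `seed`, `fasttext_on`, `sort_all`, `meta`, `train_data`, `dev_data`); the real defaults and help strings may differ.

import argparse

def set_args():
    # Hypothetical reconstruction: flag names mirror the attributes used above;
    # defaults are guesses based on the inline comments (e.g. ./san.log, data/).
    parser = argparse.ArgumentParser(description='Preprocess SQuAD data.')
    parser.add_argument('--log_file', default='san.log')
    parser.add_argument('--v2_on', action='store_true')
    parser.add_argument('--data_dir', default='data')
    parser.add_argument('--glove', default='data/glove.840B.300d.txt')
    parser.add_argument('--embedding_dim', type=int, default=300)
    parser.add_argument('--seed', type=int, default=2018)
    parser.add_argument('--fasttext_on', action='store_true')
    parser.add_argument('--sort_all', action='store_true')
    parser.add_argument('--meta', default='meta')
    parser.add_argument('--train_data', default='train_data')
    parser.add_argument('--dev_data', default='dev_data')
    return parser.parse_args()
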
Example 2
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # train_path = 'train-v2.0.json'
        # dev_path = 'dev-v2.0.json'

        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'

        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)

    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    del meta
    del embedding
    logger.info('deleted meta and embedding')

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It took {} minutes in total to process the data!'.format(
        (end_time - start_time) / 60.))
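
Example 2 swaps in MS MARCO data converted to SQuAD format, caps the number of loaded examples with `limit=`, and explicitly deletes `meta` and `embedding` before tokenization, since the embedding matrix for a large vocabulary can take considerable memory. All three examples derive output paths through `gen_name`; a plausible sketch, assuming it simply joins the directory, base name, dataset version, and suffix (the real implementation may format names differently):

import os

def gen_name(data_dir, name, version, suffix='json'):
    # Hypothetical helper: e.g. gen_name('data', 'meta', 'v2', 'pick')
    # would yield 'data/meta_v2.pick'.
    return os.path.join(data_dir, '{}_{}.{}'.format(name, version, suffix))
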
Example 3
def main():
    # Create an argument parser and read arguments from the command line
    args = set_args()
    # logger will be a global variable
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
        version = 'v1'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)

    wemb_vocab = load_emb_vocab(emb_path, embedding_dim, fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab, sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    # Load the precomputed POS-tag and NER vocabularies used to encode tagging features
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim, fast_vec_format=args.fasttext_on)
    meta = {'vocab': vocab, 'vocab_tag': vocab_tag, 'vocab_ner': vocab_ner, 'embedding': embedding}
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)
    end_time = time.time()
    logger.warning('It took {} minutes in total to process the data!'.format((end_time - start_time) / 60.))
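
Example 3 additionally passes a BERT tokenizer into `build_data`. The `NLP` and `BERT_TOKENIZER` globals are defined elsewhere in the module; a minimal sketch of how they might be set up, assuming spaCy for tokenization/tagging and the Hugging Face `transformers` tokenizer (the model names here are assumptions, not the repo's pinned choices):

import spacy
from transformers import BertTokenizer

# Hypothetical module-level setup; both model names are assumptions.
NLP = spacy.load('en_core_web_sm', disable=['parser'])
BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')

if __name__ == '__main__':
    main()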