Example 1
import os
import pickle

import spacy

# set_args, create_logger, set_environment, load_glove_vocab, load_data, Vocabulary,
# build_vocab, build_embedding and build_data come from the repo's own modules,
# which are not shown in this excerpt.


def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)
    # load data
    logger.info('Loading data vocab.')
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)
    vocab_tag = Vocabulary.build(nlp.tagger.tag_names, neat=True)
    vocab_ner = Vocabulary.build([''] + nlp.entity.cfg[u'actions']['1'],
                                 neat=True)
    logger.info('Build vocabulary')
    vocab = build_vocab(train_data + valid_data,
                        glove_vocab,
                        sort_all=args.sort_all,
                        clean_on=True)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }

    # If you want to check vocab token IDs, etc., load the meta file below (squad_meta.pick).
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    logger.info('Starting build_data')
    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               thread=args.threads)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               thread=args.threads)
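The meta file written above can be loaded back to inspect the vocabularies and the embedding matrix, as the comment in the code suggests. A minimal sketch, assuming the file was written to data/squad_meta.pick (substitute your own meta_path):

import pickle

with open('data/squad_meta.pick', 'rb') as f:   # hypothetical path; use your meta_path
    meta = pickle.load(f)

print(meta.keys())      # expected keys: vocab, vocab_tag, vocab_ner, embedding
vocab = meta['vocab']   # the Vocabulary built by build_vocab above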
Example 2
import os

from src.model import DocReaderModel
from src.batcher import load_meta, BatchGen
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger

args = set_args()
# set model dir
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)

# set environment
set_environment(args.seed, args.cuda)
# setup logger
logger = create_logger(__name__, to_disk=True, log_file=args.log_file)


def main():
    logger.info('Launching the SAN')
    opt = vars(args)
    logger.info(opt)
    embedding, opt, vocab = load_meta(opt, args.meta)
    max_doc = opt['max_doc']
    smooth = opt['smooth']
    is_rep = opt['is_rep']
    eval_step = opt['eval_step']
    curve_file = opt['curve_file']

    training_step = 0
    cur_eval_step = 1
Example 3
def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('Processing dataset')
    train_path = os.path.join(args.raw_data_dir, 'train')
    valid_path = os.path.join(args.raw_data_dir, 'dev')
    test_path = os.path.join(args.raw_data_dir, 'test')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('The path of test data: {}'.format(test_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    # set_environment(args.seed)

    # load data
    train_data = load_reddit_data(train_path,
                                  anc_type='section',
                                  fact_len=12,
                                  just_anc=False,
                                  is_train=True)
    valid_data = load_reddit_data(valid_path,
                                  anc_type='section',
                                  fact_len=12,
                                  just_anc=False,
                                  is_train=False)
    test_data = load_reddit_data(test_path,
                                 anc_type='section',
                                 fact_len=12,
                                 just_anc=False,
                                 is_train=False)
    logger.info('#train data: {}'.format(len(train_data)))
    logger.info('#valid data: {}'.format(len(valid_data)))
    logger.info('#test data: {}'.format(len(test_data)))
    meta_path = args.meta

    if not os.path.exists(meta_path):
        logger.info('Build vocabulary')
        vocab = build_vocab(train_data + valid_data)
        logger.info('building embedding')
        embedding = build_embedding(glove_path, vocab, glove_dim)
        logger.info('emb done')
        meta = {'vocab': vocab, 'embedding': embedding}
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)
    else:
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
            vocab = meta['vocab']

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, train_fout)
    logger.info('train data done')

    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, dev_fout)
    logger.info('valid data done')

    test_fout = os.path.join(args.data_dir, args.test_data)
    build_data(test_data, vocab, test_fout)
    logger.info('test data done')

    write_files(args.data_dir + '/train', train_data)
    write_files(args.data_dir + '/dev', valid_data)
    write_files(args.data_dir + '/test', test_data)
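None of these excerpts show an entry point; as standalone preprocessing scripts they would normally end with the usual guard:

if __name__ == '__main__':
    main()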
Example 4
    def forward(self, batch):
        doc_input, query_input,\
        doc_emb, query_emb,\
        doc_cove_low, doc_cove_high,\
        query_cove_low, query_cove_high,\
        doc_mask, query_mask,\
        doc_elmo, query_elmo = self.lexicon_encoder(batch)

        query_list, doc_list = [], []
        query_list.append(query_input)
        doc_list.append(doc_input)

        # doc encode
        if self.opt['elmo_on']:
            doc_low = self.doc_encoder_low(
                torch.cat([doc_input, doc_cove_low, doc_elmo[0]], 2), doc_mask)
        else:
            doc_low = self.doc_encoder_low(
                torch.cat([doc_input, doc_cove_low], 2), doc_mask)
        doc_low = self.dropout(doc_low)

        if self.opt['elmo_on']:
            doc_high = self.doc_encoder_high(
                torch.cat([doc_low, doc_cove_high, doc_elmo[1]], 2), doc_mask)
        else:
            doc_high = self.doc_encoder_high(
                torch.cat([doc_low, doc_cove_high], 2), doc_mask)

        doc_high = self.dropout(doc_high)
        # query
        if self.opt['elmo_on']:
            query_low = self.query_encoder_low(
                torch.cat([query_input, query_cove_low, query_elmo[0]], 2),
                query_mask)
        else:
            query_low = self.query_encoder_low(
                torch.cat([query_input, query_cove_low], 2), query_mask)
        query_low = self.dropout(query_low)
        if self.opt['elmo_on']:
            query_high = self.query_encoder_high(
                torch.cat([query_low, query_cove_high, query_elmo[1]], 2),
                query_mask)
        else:
            query_high = self.query_encoder_high(
                torch.cat([query_low, query_cove_high], 2), query_mask)
        query_high = self.dropout(query_high)

        query_mem_hiddens = self.query_understand(
            torch.cat([query_low, query_high], 2), query_mask)
        query_mem_hiddens = self.dropout(query_mem_hiddens)
        query_list = [query_low, query_high, query_mem_hiddens]
        doc_list = [doc_low, doc_high]

        query_att_input = torch.cat(
            [query_emb, query_cove_high, query_low, query_high], 2)
        doc_att_input = torch.cat([doc_emb, doc_cove_high] + doc_list, 2)
        if self.opt['elmo_on'] and self.opt['elmo_att_on']:
            idx = -2 if self.opt['elmo_self_att_on'] else -1
            doc_att_input = torch.cat([doc_att_input, doc_elmo[idx]], 2)
            query_att_input = torch.cat([query_att_input, query_elmo[idx]], 2)
        # setup logger
        args = set_args()
        logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
        # logger.warning('doc_self_hiddens {}{}{}'.format(doc_self_hiddens.output_size,doc_mem_gen.output_size,query_sum_attn.output_size))
        # logger.warning('before att {}{}{}'.format(doc_att_input.shape,query_att_input.shape, query_mask.shape,query_low.shape,query_mem_hiddens.shape ))
        # before att torch.Size([64, 246, 1412])torch.Size([64, 37, 1412])torch.Size([64, 37])
        # s=ConvAtt(doc_att_input,query_att_input,0.5)
        # s=s.cuda()
        # a=s()
        doc_attn_hiddens = self.deep_attn(doc_att_input, query_att_input,
                                          query_list, query_mask)
        # logger.warning('before att {}'.format(doc_attn_hiddens.shape))
        # before att torch.Size([64, 246, 768])
        doc_attn_hiddens = self.dropout(doc_attn_hiddens)
        # doc_attn_hiddens = self.dropout(a)
        doc_mem_hiddens = self.doc_understand(
            torch.cat([doc_attn_hiddens] + doc_list, 2), doc_mask)
        doc_mem_hiddens = self.dropout(doc_mem_hiddens)
        doc_mem_inputs = torch.cat([doc_attn_hiddens] + doc_list, 2)
        if self.opt['self_attention_on']:
            doc_att = torch.cat(
                [doc_mem_inputs, doc_mem_hiddens, doc_cove_high, doc_emb], 2)
            if self.opt['elmo_on'] and self.opt['elmo_self_att_on']:
                doc_att = torch.cat([doc_att, doc_elmo[-1]], 2)

            doc_self_hiddens = self.doc_self_attn(doc_att,
                                                  doc_att,
                                                  doc_mask,
                                                  x3=doc_mem_hiddens)
            doc_mem = self.doc_mem_gen(
                torch.cat([doc_mem_hiddens, doc_self_hiddens], 2), doc_mask)
        else:
            doc_mem = doc_mem_hiddens
        query_mem = self.query_sum_attn(query_mem_hiddens, query_mask)
        start_scores, end_scores = self.decoder(doc_mem, query_mem, doc_mask)
        # logger.warning('query_mem {}'.format(query_mem.shape))
        # logger.warning('hiddens {}'.format(query_mem_hiddens.shape))

        pred_score = None
        if self.classifier is not None:
            doc_sum = self.doc_sum_attn(doc_mem, doc_mask)
            pred_score = F.sigmoid(
                self.classifier(doc_sum, query_mem, doc_mask))
        return start_scores, end_scores, pred_score
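For orientation, a hedged sketch of how the three returned tensors might be consumed at prediction time. The decoding actually used by the repo (whether the scores are log-probabilities, the maximum span length, how pred_score gates no-answer decisions) is not shown in this excerpt, so the names and shapes below are assumptions:

import torch

# start_scores, end_scores: assumed shape [batch, doc_len]; pred_score: answerability score
start_idx = torch.argmax(start_scores, dim=1)   # naive per-token argmax
end_idx = torch.argmax(end_scores, dim=1)
# A real span decoder would enforce start <= end and a maximum answer length,
# and (for SQuAD v2.0-style data) compare pred_score against a no-answer threshold.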
Example 5
import os

from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger
from my_utils.squad_eval import evaluate_file
from my_utils.squad_eval_v2 import evaluate_file_v2

args = set_args()
# set model dir
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)

# set environment
set_environment(args.seed, args.cuda)
# setup logger
logger = create_logger(__name__, to_disk=True,
                       log_file=os.path.join(model_dir, args.log_file))

def check(model, data, gold_path):
    data.reset()
    predictions = {}
    for batch in data:
        phrase, _ = model.predict(batch)
        uids = batch['uids']
        for uid, pred in zip(uids, phrase):
            predictions[uid] = pred

    if args.expect_version == 'v2.0':
        results = evaluate_file_v2(gold_path, predictions, args.na_prob_thresh)
    else:
        results = evaluate_file(gold_path, predictions)
    return results['exact_match'], results['f1'], predictions
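Because check() returns a uid -> answer-string dict alongside the metrics, the predictions can also be written out for the official SQuAD evaluation scripts. A minimal sketch, assuming a loaded model, a dev-set BatchGen called dev_batches, and the path of the gold dev file:

import json

em, f1, predictions = check(model, dev_batches, 'data/dev-v1.1.json')
with open(os.path.join(model_dir, 'predictions.json'), 'w') as f:
    json.dump(predictions, f)
logger.info('dev EM: {:.3f}, F1: {:.3f}'.format(em, f1))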
Example 6
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # train_path = 'train-v2.0.json'
        # dev_path = 'dev-v2.0.json'

        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'

        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('Train path is: {}'.format(train_path))

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)

    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    del meta
    del embedding
    logger.info('deleted meta and embedding')

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example 7
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.warning('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    # load data
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + valid_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True)

    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False)
    end_time = time.time()
    logger.info('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example 8
def main():
    args = set_args()
    args.datasets = args.datasets.split(',')
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    all_data = []
    all_datasets = []
    for dataset_name in args.datasets:
        test_file_prefix = 'test'
        if test_mode:
            if 'marco' in dataset_name:
                train_file_prefix = 'train'
                dev_file_prefix = 'dev'
            else:
                train_file_prefix = 'dev'
                dev_file_prefix = 'dev'
        else:
            train_file_prefix = 'train'
            dev_file_prefix = 'dev'

        logger.info('Processing %s dataset' % dataset_name)
        this_data_dir = args.data_dir + dataset_name + '/'
        train_data = None
        train_path = os.path.join(this_data_dir, '%s.json' % train_file_prefix)
        logger.info('The path of training data: {}'.format(train_path))
        train_data = load_data(train_path)
        all_data += train_data

        valid_path = os.path.join(this_data_dir, '%s.json' % dev_file_prefix)
        logger.info('The path of validation data: {}'.format(valid_path))
        valid_data = load_data(valid_path, False)
        all_data += valid_data
        if args.include_test_set and 'squad' not in dataset_name and 'marco2.0' not in dataset_name:
            test_path = os.path.join(this_data_dir,
                                     '%s.json' % test_file_prefix)
            logger.info('The path of test data: {}'.format(test_path))
            test_data = load_data(test_path, False)
            all_data += test_data
            all_datasets.append((train_data, valid_data, test_data))
        else:
            all_datasets.append((train_data, valid_data))

    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    multitask_base_path = '../data/mtmrc/'
    with open(multitask_base_path + 'vocab_tag.pick', 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(multitask_base_path + 'vocab_ner.pick', 'rb') as f:
        vocab_ner = pickle.load(f)

    logger.info('Build vocabulary ')
    vocab = build_vocab(all_data,
                        glove_vocab,
                        sort_all=args.sort_all,
                        clean_on=True,
                        args=args)
    meta_path = os.path.join(args.output_path, args.meta)
    logger.info('building embedding ')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    for i, item in enumerate(all_datasets):
        dataset_name = args.datasets[i]
        if args.include_test_set and 'squad' not in dataset_name and 'marco2.0' not in dataset_name:
            train_data, valid_data, test_data = item
        else:
            train_data, valid_data = item
        print('building output file for ', dataset_name)
        train_fout = os.path.join(args.output_path,
                                  dataset_name + '_train.json')
        build_data(train_data,
                   vocab,
                   vocab_tag,
                   vocab_ner,
                   train_fout,
                   True,
                   dataset_name=dataset_name)
        dev_fout = os.path.join(args.output_path, dataset_name + '_dev.json')
        build_data(valid_data,
                   vocab,
                   vocab_tag,
                   vocab_ner,
                   dev_fout,
                   False,
                   dataset_name=dataset_name)
        if args.include_test_set and 'squad' not in dataset_name:
            test_fout = os.path.join(args.output_path,
                                     dataset_name + '_test.json')
            build_data(test_data,
                       vocab,
                       vocab_tag,
                       vocab_ner,
                       test_fout,
                       False,
                       dataset_name=dataset_name)
Example 9
#elmo_options_path = 'data_resource/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
#elmo_weight_path = 'data_resource/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
glv = 'glove.840B.300d.updated.txt'

num_model = 20
have_gpu = True
test_file = 'data_v2/dev-v2.0.json'
#output_file = sys.argv[2]
#na_prob_file = sys.argv[3]
batch_size = 32
max_len = 5
num_tune = 89571
avg_on = False
do_count = True

logger = create_logger(__name__, to_disk=False)
workspace = '/home/aerin/Desktop/squad_vteam/'
model_root = workspace
glove_path = os.path.join(workspace, glv)
glove_dim = 300
meta_path = os.path.join(workspace, my_meta)
mtlstm_path = os.path.join(workspace, my_covec)
n_threads = 16

pad = '-' * 10
logger.info('{}Resource Path{}'.format(pad, pad))
logger.info('workspace:{}'.format(workspace))
logger.info('model path:{}'.format(model_root))
logger.info('test file:{}'.format(test_file))
#logger.info('output file:{}'.format(output_file))
#logger.info('no answer prob file:{}'.format(na_prob_file))
Example 10
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True,
                           log_file=args.log_file)  # ./san.log
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(
        args.data_dir, train_path)  # args.data_dir=data/, data/train-v2.0.json
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))  # embedding_dim=300
    # could be fasttext embedding
    emb_path = args.glove  # data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:  # store_true
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    """From GLoVe to acquire tokens, to set()"""
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    """
    '--sort_all', action='store_true',
        sort the vocabulary by frequencies of all words, Otherwise consider question words first.
    """
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example 11
def main():
    # Create an argument parser and read arguments from the command line
    args = set_args()
    # logger will be a global variable
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
        version = 'v1'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)

    wemb_vocab = load_emb_vocab(emb_path, embedding_dim, fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab, sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    # what do these vocab tags and vocab ners do?
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim, fast_vec_format=args.fasttext_on)
    meta = {'vocab': vocab, 'vocab_tag': vocab_tag, 'vocab_ner': vocab_ner, 'embedding': embedding}
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format((end_time - start_time) / 60.))
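build_data here also receives a global BERT_TOKENIZER, which is not defined in the excerpt. A hedged sketch of how such a tokenizer is typically constructed; the actual library (pytorch_pretrained_bert vs. transformers) and the pretrained model name used by this repo are assumptions:

# Hypothetical setup for the BERT_TOKENIZER global used above.
from transformers import BertTokenizer

BERT_TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')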