Example #1
import pickle
import re
import collections
import argparse
from sys import path
from data_utils.vocab import Vocabulary
from pytorch_pretrained_bert.tokenization import BertTokenizer
from data_utils.log_wrapper import create_logger
from data_utils.label_map import GLOBAL_MAP
from data_utils.glue_utils import *
DEBUG_MODE = False
MAX_SEQ_LEN = 512

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
logger = create_logger(__name__,
                       to_disk=True,
                       log_file='bert_data_proc_512.log')


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length.
    Copied from https://github.com/huggingface/pytorch-pretrained-BERT
    """
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        # Drop a token from the longer of the two sequences.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
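A small hypothetical usage of the helper above, trimming a tokenized pair down to a shared budget of eight tokens:

# Hypothetical usage of _truncate_seq_pair; the token lists are made up for illustration.
tokens_a = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
tokens_b = ['a', 'sleeping', 'dog']
_truncate_seq_pair(tokens_a, tokens_b, max_length=8)
print(len(tokens_a), len(tokens_b))  # 5 3: only the longer sequence was trimmed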
Example #2
import argparse
import json
import os
from pprint import pprint
from data_utils.log_wrapper import create_logger
from data_utils.utils import set_environment
from experiments.exp_def import TaskDefs

# model_config and train_config are argparse option builders from the repository's
# config module (model_config is shown in Example #6).
parser = argparse.ArgumentParser()
parser = model_config(parser)
parser = train_config(parser)
args = parser.parse_args()

output_dir = args.output_dir
data_dir = args.data_dir
args.train_datasets = args.train_datasets.split(',')
args.test_datasets = args.test_datasets.split(',')
pprint(args)

os.makedirs(output_dir, exist_ok=True)
output_dir = os.path.abspath(output_dir)

set_environment(args.seed, args.cuda)
log_path = args.log_file
logger = create_logger(__name__, to_disk=True, log_file=log_path)
logger.info(args.answer_opt)

task_defs = TaskDefs(args.task_def)
encoder_type = task_defs.encoderType
args.encoder_type = encoder_type


def dump(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)


def generate_decoder_opt(enable_san, max_opt):
    opt_v = 0
    if enable_san and max_opt < 3:
        opt_v = max_opt
    return opt_v
Example #3
import os
import argparse
import random
from sys import path

path.append(os.getcwd())
from experiments.common_utils import dump_rows
from data_utils.task_def import DataFormat
from data_utils.log_wrapper import create_logger
from experiments.glue.glue_utils import *

logger = create_logger(__name__, to_disk=True, log_file='xnli_prepro.log')


def load_xnli(file, header=True):
    rows = []
    cnt = 0
    with open(file, encoding="utf8") as f:
        for line in f:
            if header:
                header = False
                continue
            blocks = line.strip().split('\t')
            if blocks[1] == '-': continue
            lab = blocks[1]
            if lab is None:
                import pdb
                pdb.set_trace()
            sample = {
                'uid': blocks[9],        # pairID column
                'premise': blocks[6],    # sentence1 column
                'hypothesis': blocks[7],  # sentence2 column (assumed completion)
                'label': lab}
            rows.append(sample)
            cnt += 1
    return rows
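A quick hypothetical check of the loader (the file path below is a placeholder, not a path taken from the repository):

# Hypothetical usage of load_xnli with a placeholder path.
rows = load_xnli('data/XNLI/xnli.dev.tsv')
logger.info('Loaded {} XNLI examples'.format(len(rows)))
print(rows[0]['premise'], '->', rows[0]['label'])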
Example #4
import sys
from data_utils import load_data
from data_utils.task_def import TaskType, DataFormat
from data_utils.log_wrapper import create_logger
from experiments.exp_def import TaskDefs, EncoderModelType
#from experiments.squad import squad_utils
from pretrained_models import *

DEBUG_MODE = False
MAX_SEQ_LEN = 512
DOC_STRIDE = 180
MAX_QUERY_LEN = 64
MRC_MAX_SEQ_LEN = 384

logger = create_logger(__name__,
                       to_disk=True,
                       log_file='mt_dnn_data_proc_{}.log'.format(MAX_SEQ_LEN))


def feature_extractor(
    tokenizer,
    text_a,
    text_b=None,
    max_length=512,
    model_type=None,
    enable_padding=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=False
):  # set mask_padding_with_zero default value as False to keep consistent with original setting
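The body of feature_extractor is cut off above. As a rough sketch only, assumed rather than taken from the repository, a BERT-style extractor built from the pieces already shown (the tokenizer plus the _truncate_seq_pair helper from Example #1) would produce input ids, segment ids, and an attention mask along these lines:

def simple_bert_features(tokenizer, text_a, text_b=None, max_length=512):
    # Hypothetical sketch, not the repository's feature_extractor implementation.
    tokens_a = tokenizer.tokenize(text_a)
    tokens_b = tokenizer.tokenize(text_b) if text_b else []
    if tokens_b:
        # Reserve room for [CLS] and two [SEP] markers.
        _truncate_seq_pair(tokens_a, tokens_b, max_length - 3)
    else:
        tokens_a = tokens_a[:max_length - 2]
    tokens = ['[CLS]'] + tokens_a + ['[SEP]']
    segment_ids = [0] * len(tokens)
    if tokens_b:
        tokens = tokens + tokens_b + ['[SEP]']
        segment_ids = segment_ids + [1] * (len(tokens_b) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)  # 1 for real tokens; padded positions would get 0
    return input_ids, segment_ids, input_mask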
Example #5
import os
import argparse
import torch
import json
from pytorch_pretrained_bert.tokenization import BertTokenizer
from torch.utils.data import DataLoader
from data_utils.log_wrapper import create_logger
from data_utils.utils import set_environment
from mt_dnn.batcher import Collater, SingleTaskDataset
from mt_dnn.model import MTDNNModel
from prepro_std import _truncate_seq_pair
from data_utils.task_def import DataFormat, EncoderModelType, TaskType

logger = create_logger(__name__,
                       to_disk=True,
                       log_file='mt_dnn_feature_extractor.log')


def load_data(file):
    rows = []
    cnt = 0
    is_single_sentence = False
    with open(file, encoding="utf8") as f:
        for line in f:
            blocks = line.strip().split('|||')
            if len(blocks) == 2:
                sample = {
                    'uid': str(cnt),
                    'premise': blocks[0],
                    'hypothesis': blocks[1],
                    'label': 0}  # dummy label; the extractor only needs the text
            else:
                # a single field before '|||' means single-sentence input (assumed completion)
                is_single_sentence = True
                sample = {
                    'uid': str(cnt),
                    'premise': blocks[0],
                    'label': 0}
            rows.append(sample)
            cnt += 1
    return rows, is_single_sentence
Example #6
# by: xiaodl
from __future__ import absolute_import
from __future__ import division
import re
import os
import argparse
import tensorflow as tf
import torch
import numpy as np
from pytorch_pretrained_bert.modeling import BertConfig
from sys import path
path.append(os.getcwd())
from mt_dnn.matcher import SANBertNetwork
from data_utils.log_wrapper import create_logger

logger = create_logger(__name__, to_disk=False)


def model_config(parser):
    parser.add_argument('--update_bert_opt', default=0, type=int)
    parser.add_argument('--multi_gpu_on', action='store_true')
    parser.add_argument('--mem_cum_type',
                        type=str,
                        default='simple',
                        help='bilinear/simple/default')
    parser.add_argument('--answer_num_turn', type=int, default=5)
    parser.add_argument('--answer_mem_drop_p', type=float, default=0.1)
    parser.add_argument('--answer_att_hidden_size', type=int, default=128)
    parser.add_argument('--answer_att_type',
                        type=str,
                        default='bilinear',
                        help='bilinear/simple/default')
Example #7
import os
import argparse
import random
from sys import path

path.append(os.getcwd())
from experiments.common_utils import dump_rows
from data_utils.task_def import DataFormat
from data_utils.log_wrapper import create_logger
from experiments.glue.glue_utils import *

logger = create_logger(__name__, to_disk=True, log_file="glue_prepro.log")


def parse_args():
    parser = argparse.ArgumentParser(
        description="Preprocessing GLUE/SNLI/SciTail dataset.")
    parser.add_argument("--seed", type=int, default=13)
    parser.add_argument("--root_dir", type=str, default="data")
    parser.add_argument(
        "--old_glue",
        action="store_true",
        help="whether it is old GLUE, refer official GLUE webpage for details",
    )
    args = parser.parse_args()
    return args


def main(args):
    is_old_glue = args.old_glue
    root = args.root_dir
Example #8
import os
import argparse
from sys import path

path.append(os.getcwd())
from data_utils.task_def import DataFormat
from data_utils.log_wrapper import create_logger
from experiments.ner.ner_utils import load_conll_chunk, load_conll_ner, load_conll_pos
from experiments.common_utils import dump_rows

logger = create_logger(
    __name__, to_disk=True, log_file="bert_ner_data_proc_512_cased.log"
)


def parse_args():
    parser = argparse.ArgumentParser(description="Preprocessing English NER dataset.")
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--seed", type=int, default=13)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()
    return args


def main(args):
    data_dir = args.data_dir
    data_dir = os.path.abspath(data_dir)
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)

    train_path = os.path.join(data_dir, "train.txt")