import pickle
import re
import collections
import argparse
from sys import path
from data_utils.vocab import Vocabulary
from pytorch_pretrained_bert.tokenization import BertTokenizer
from data_utils.log_wrapper import create_logger
from data_utils.label_map import GLOBAL_MAP
from data_utils.glue_utils import *

DEBUG_MODE = False
MAX_SEQ_LEN = 512

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
logger = create_logger(__name__, to_disk=True, log_file='bert_data_proc_512.log')


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length.

    Copied from https://github.com/huggingface/pytorch-pretrained-BERT
    """
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
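
# A minimal usage sketch of _truncate_seq_pair (values are illustrative):
# the longer list is trimmed one token at a time until the pair fits.
if DEBUG_MODE:
    _tokens_a = ['tok'] * 8
    _tokens_b = ['tok'] * 3
    _truncate_seq_pair(_tokens_a, _tokens_b, max_length=6)
    assert len(_tokens_a) == 3 and len(_tokens_b) == 3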
import os
import json
import argparse
from pprint import pprint

from data_utils.utils import set_environment
from data_utils.log_wrapper import create_logger
from experiments.exp_def import TaskDefs

# model_config and train_config are argparse helpers provided elsewhere
# in this project; each registers its options and returns the parser.
parser = argparse.ArgumentParser()
parser = model_config(parser)
parser = train_config(parser)
args = parser.parse_args()

output_dir = args.output_dir
data_dir = args.data_dir
args.train_datasets = args.train_datasets.split(',')
args.test_datasets = args.test_datasets.split(',')
pprint(args)

os.makedirs(output_dir, exist_ok=True)
output_dir = os.path.abspath(output_dir)

set_environment(args.seed, args.cuda)
log_path = args.log_file
logger = create_logger(__name__, to_disk=True, log_file=log_path)
logger.info(args.answer_opt)

task_defs = TaskDefs(args.task_def)
encoder_type = task_defs.encoderType
args.encoder_type = encoder_type


def dump(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)


def generate_decoder_opt(enable_san, max_opt):
    opt_v = 0
    if enable_san and max_opt < 3:
        opt_v = max_opt
    return opt_v
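
# Usage sketch for the two helpers above (names on the right are
# illustrative; answer_opt is a CLI option, enable_san a per-task flag):
# decoder_opt = generate_decoder_opt(task_def.enable_san, args.answer_opt)
# dump(os.path.join(output_dir, 'config.json'), vars(args))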
import os
import argparse
import random
from sys import path
path.append(os.getcwd())
from experiments.common_utils import dump_rows
from data_utils.task_def import DataFormat
from data_utils.log_wrapper import create_logger
from experiments.glue.glue_utils import *

logger = create_logger(__name__, to_disk=True, log_file='xnli_prepro.log')


def load_xnli(file, header=True):
    rows = []
    cnt = 0
    with open(file, encoding="utf8") as f:
        for line in f:
            if header:
                header = False
                continue
            blocks = line.strip().split('\t')
            # a '-' label means the annotators reached no consensus; skip it
            if blocks[1] == '-':
                continue
            lab = blocks[1]
            if lab is None:
                import pdb
                pdb.set_trace()
            sample = {
                'uid': blocks[9],
                'premise': blocks[6],
                'hypothesis': blocks[7],
                'label': lab,
            }
            rows.append(sample)
            cnt += 1
    return rows
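
# Column layout assumed by load_xnli (MNLI/XNLI-style TSV with a header
# row): index 1 = gold label, 6 = premise, 7 = hypothesis, 9 = pair id.
# Usage sketch (the path is illustrative):
# dev_rows = load_xnli('XNLI/xnli.dev.tsv')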
import sys
from data_utils import load_data
from data_utils.task_def import TaskType, DataFormat
from data_utils.log_wrapper import create_logger
from experiments.exp_def import TaskDefs, EncoderModelType
#from experiments.squad import squad_utils
from pretrained_models import *

DEBUG_MODE = False
MAX_SEQ_LEN = 512
DOC_STRIDE = 180
MAX_QUERY_LEN = 64
MRC_MAX_SEQ_LEN = 384

logger = create_logger(__name__, to_disk=True,
                       log_file='mt_dnn_data_proc_{}.log'.format(MAX_SEQ_LEN))


def feature_extractor(
    tokenizer,
    text_a,
    text_b=None,
    max_length=512,
    model_type=None,
    enable_padding=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=False,
):
    # mask_padding_with_zero defaults to False to stay consistent with the
    # original setting
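    # Inferred contract (a sketch from the signature, not verified against
    # the full implementation): tokenize text_a, and text_b when given,
    # truncate to max_length, optionally pad on the left or right with
    # pad_token / pad_token_segment_id, and return input ids, segment ids,
    # and an attention mask.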
""" import os import argparse import torch import json from pytorch_pretrained_bert.tokenization import BertTokenizer from torch.utils.data import DataLoader from data_utils.log_wrapper import create_logger from data_utils.utils import set_environment from mt_dnn.batcher import Collater, SingleTaskDataset from mt_dnn.model import MTDNNModel from prepro_std import _truncate_seq_pair from data_utils.task_def import DataFormat, EncoderModelType, TaskType logger = create_logger(__name__, to_disk=True, log_file='mt_dnn_feature_extractor.log') def load_data(file): rows = [] cnt = 0 is_single_sentence = False with open(file, encoding="utf8") as f: for line in f: blocks = line.strip().split('|||') if len(blocks) == 2: sample = { 'uid': str(cnt), 'premise': blocks[0], 'hypothesis': blocks[1],
# by: xiaodl
from __future__ import absolute_import
from __future__ import division

import re
import os
import argparse
import tensorflow as tf
import torch
import numpy as np
from pytorch_pretrained_bert.modeling import BertConfig
from sys import path
path.append(os.getcwd())
from mt_dnn.matcher import SANBertNetwork
from data_utils.log_wrapper import create_logger

logger = create_logger(__name__, to_disk=False)


def model_config(parser):
    parser.add_argument('--update_bert_opt', default=0, type=int)
    parser.add_argument('--multi_gpu_on', action='store_true')
    parser.add_argument('--mem_cum_type', type=str, default='simple',
                        help='bilinear/simple/default')
    parser.add_argument('--answer_num_turn', type=int, default=5)
    parser.add_argument('--answer_mem_drop_p', type=float, default=0.1)
    parser.add_argument('--answer_att_hidden_size', type=int, default=128)
    parser.add_argument('--answer_att_type', type=str, default='bilinear',
                        help='bilinear/simple/default')
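    # Sketch of how such a config helper is meant to compose (assumes the
    # full model_config registers the rest of its options and ends with
    # `return parser`):
    # parser = argparse.ArgumentParser()
    # parser = model_config(parser)
    # args = parser.parse_args()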
import os
import argparse
import random
from sys import path
path.append(os.getcwd())
from experiments.common_utils import dump_rows
from data_utils.task_def import DataFormat
from data_utils.log_wrapper import create_logger
from experiments.glue.glue_utils import *

logger = create_logger(__name__, to_disk=True, log_file="glue_prepro.log")


def parse_args():
    parser = argparse.ArgumentParser(
        description="Preprocessing GLUE/SNLI/SciTail dataset.")
    parser.add_argument("--seed", type=int, default=13)
    parser.add_argument("--root_dir", type=str, default="data")
    parser.add_argument(
        "--old_glue",
        action="store_true",
        help="whether it is the old GLUE; refer to the official GLUE webpage for details",
    )
    args = parser.parse_args()
    return args


def main(args):
    is_old_glue = args.old_glue
    root = args.root_dir
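    # Illustrative sketch of what one task's conversion looks like further
    # down (paths and the load_mnli helper are assumptions; dump_rows and
    # DataFormat are imported above):
    # mnli_train = load_mnli(os.path.join(root, 'MNLI/train.tsv'))
    # dump_rows(mnli_train, 'mnli_train.tsv', DataFormat.PremiseAndOneHypothesis)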
import os
import argparse
from sys import path
path.append(os.getcwd())
from data_utils.task_def import DataFormat
from data_utils.log_wrapper import create_logger
from experiments.ner.ner_utils import load_conll_chunk, load_conll_ner, load_conll_pos
from experiments.common_utils import dump_rows

logger = create_logger(
    __name__, to_disk=True, log_file="bert_ner_data_proc_512_cased.log"
)


def parse_args():
    parser = argparse.ArgumentParser(description="Preprocessing English NER dataset.")
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--seed", type=int, default=13)
    parser.add_argument("--output_dir", type=str, required=True)
    args = parser.parse_args()
    return args


def main(args):
    data_dir = args.data_dir
    data_dir = os.path.abspath(data_dir)
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)
    train_path = os.path.join(data_dir, "train.txt")
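    # Dev/test splits are assumed to follow the same CoNLL-style naming
    # convention as train.txt (file names here are assumptions):
    dev_path = os.path.join(data_dir, "valid.txt")
    test_path = os.path.join(data_dir, "test.txt")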