def main(args):
    """Tokenize every split of every configured task and dump it as JSON.

    For each task named in ``args.task_def`` and each of its split names,
    reads ``<root_dir>/<task>_<split>.tsv`` with ``load_data`` and hands the
    rows to ``build_data``, which writes to
    ``<root_dir>/<model>/<task>_<split>.json``.

    Args:
        args: Parsed command-line namespace. This function reads
            ``root_dir``, ``model``, ``transformer_cache``, ``task_def`` and
            ``workers``.

    Raises:
        AssertionError: If ``args.root_dir`` does not exist.
        SystemExit: With status 1 when an expected input file is missing.
    """
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(
        args.model, cache_dir=args.transformer_cache
    )

    # Output lives in a per-model sub-directory of the data root.
    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                # The original message left "%s" un-interpolated, so the log
                # never said which file was missing.
                logger.warning("File %s doesnot exit" % file_path)
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(
                mt_dnn_root, "%s_%s.json" % (task, split_name)
            )
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
                workers=args.workers,
            )
def main(args):
    """Tokenize task splits with a BERT or XLNet tokenizer and dump JSON.

    The tokenizer is a ``BertTokenizer`` unless ``"xlnet"`` appears in
    ``args.model``, in which case a SentencePiece model is loaded from
    ``mt_dnn_models/``. Output goes to a sub-directory of ``args.root_dir``
    named ``mt_dnn_b``/``mt_dnn_x`` plus ``_uncased``/``_cased`` and an
    optional ``_lower`` suffix.

    Args:
        args: Parsed command-line namespace. This function reads
            ``do_lower_case``, ``root_dir``, ``model`` and ``task_def``.

    Raises:
        AssertionError: If ``args.root_dir`` does not exist.
    """
    ## hyper param
    do_lower_case = args.do_lower_case
    root = args.root_dir
    assert os.path.exists(root)

    is_uncased = 'uncased' in args.model
    # Anything that is not XLNet is treated as a BERT model.
    is_bert_model = 'xlnet' not in args.model

    if is_bert_model:
        tokenizer = BertTokenizer.from_pretrained(
            args.model, do_lower_case=do_lower_case)
    else:
        tokenizer = spm.SentencePieceProcessor()
        if 'large' in args.model:
            tokenizer.load('mt_dnn_models/xlnet_large_cased_spiece.model')
        else:
            tokenizer.load('mt_dnn_models/xlnet_base_cased_spiece.model')

    mt_dnn_suffix = 'mt_dnn_b' if is_bert_model else 'mt_dnn_x'
    if is_uncased:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)
    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    # Read the raw YAML as well; the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(args.task_def) as task_def_file:
        task_def_dic = yaml.safe_load(task_def_file)

    for task, task_def in task_def_dic.items():
        logger.info("Task %s" % task)
        data_format = DataFormat[task_def["data_format"]]
        task_type = TaskType[task_def["task_type"]]
        label_mapper = task_defs.global_map.get(task, None)
        split_names = task_def.get("split_names", ["train", "dev", "test"])
        for split_name in split_names:
            rows = load_data(
                os.path.join(root, "%s_%s.tsv" % (task, split_name)),
                data_format,
                task_type,
                label_mapper)
            dump_path = os.path.join(
                mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows, dump_path, tokenizer, data_format,
                       is_bert_model=is_bert_model)
def main(args):
    """Tokenize the ``test`` split of every configured task and dump JSON.

    Input files are read from ``dl/<root_dir>/<task>_test.tsv``. Output is
    written below that directory in a folder named after the model family
    (the part of ``args.model`` before the first ``-``), its size
    (``_base``/``_large`` when present), casing and an optional ``_lower``
    suffix.

    Args:
        args: Parsed command-line namespace. This function reads
            ``do_lower_case``, ``root_dir``, ``model`` and ``task_def``.

    Raises:
        AssertionError: If ``dl/<root_dir>`` does not exist.
        SystemExit: With status 1 when an expected input file is missing.
    """
    # hyper param
    do_lower_case = args.do_lower_case
    root = "dl/" + args.root_dir
    assert os.path.exists(root)

    literal_model_type = args.model.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()

    mt_dnn_suffix = literal_model_type
    if 'base' in args.model:
        mt_dnn_suffix += "_base"
    elif 'large' in args.model:
        mt_dnn_suffix += "_large"

    # Only the tokenizer class of the triple is needed here.
    _, _, tokenizer_class = MODEL_CLASSES[literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(
        "dl/mt-dnn-models/vocab.txt", do_lower_case=do_lower_case)

    if 'uncased' in args.model:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)
    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        # Only the test split is processed by this variant.
        for split_name in ['test']:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            print(file_path)
            if not os.path.exists(file_path):
                # The original message left "%s" un-interpolated, so the log
                # never said which file was missing.
                logger.warning("File %s doesnot exit" % file_path)
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(
                mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       encoderModelType=encoder_model,
                       lab_dict=task_def.label_vocab)
def load_model_for_viz_2(task_def_path, checkpoint_path, model_type='bert-base-cased', do_lower_case=False, use_cuda=True):
    """Build the encoder config and tokenizer needed for visualisation.

    The task name is the task-definition file name without its extension;
    the part before the first underscore selects the task definition. The
    checkpoint's stored config is patched (see below) and converted with
    ``BertConfig.from_dict``.

    Args:
        task_def_path: Path to the task-definition file.
        checkpoint_path: Path to a checkpoint readable by ``torch.load``.
        model_type: Pretrained model identifier; the part before the first
            ``-`` selects the entry in ``MODEL_CLASSES``.
        do_lower_case: Forwarded to the tokenizer.
        use_cuda: When false the checkpoint is mapped onto the CPU.

    Returns:
        A ``(config, tokenizer)`` tuple.
    """
    # load task info
    task_defs = TaskDefs(task_def_path)
    task_file_name = os.path.basename(task_def_path)
    task = os.path.splitext(task_file_name)[0]
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]

    # load model
    assert os.path.exists(checkpoint_path)
    if not use_cuda:
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device('cpu'))
    else:
        state_dict = torch.load(checkpoint_path)

    config = state_dict['config']
    config["cuda"] = use_cuda
    task_def = task_defs.get_task_def(prefix)
    config['task_def_list'] = [task_def]

    ## temp fix
    for key, value in (('fp16', False), ('answer_opt', 0), ('adv_train', False)):
        config[key] = value
    del state_dict['optimizer']
    for key, value in (('output_attentions', True),
                       ('output_hidden_states', True),
                       ('local_rank', -1)):
        config[key] = value

    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # Looking the family up in the enum rejects unknown model types early.
    model_family = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[model_family]
    model_family = model_family.lower()

    # load config and tokenizer
    config = BertConfig.from_dict(config)
    classes = MODEL_CLASSES[model_family]
    config_class, model_class, tokenizer_class = classes
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    return config, tokenizer
def main(args):
    """Score one preprocessed dataset with a saved checkpoint.

    Loads the checkpoint named by ``args.checkpoint``, evaluates it on
    ``args.prep_input`` and passes metrics, predictions, uids and scores to
    ``dump`` with ``args.score`` as the destination. Metrics are printed
    when ``args.with_label`` is set.

    Args:
        args: Parsed command-line namespace. This function reads
            ``task_def``, ``task``, ``checkpoint``, ``cuda``, ``prep_input``,
            ``max_seq_len``, ``batch_size_eval``, ``with_label`` and
            ``score``.
    """
    # load task info
    task_defs = TaskDefs(args.task_def)
    task = args.task
    assert task in task_defs.task_type_map
    assert task in task_defs.data_type_map
    assert task in task_defs.metric_meta_map
    data_type = task_defs.data_type_map[task]
    task_type = task_defs.task_type_map[task]
    metric_meta = task_defs.metric_meta_map[task]

    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    # Without CUDA the stored tensors are remapped onto the CPU.
    load_kwargs = {} if args.cuda else {"map_location": "cpu"}
    state_dict = torch.load(checkpoint_path, **load_kwargs)
    config = state_dict['config']
    config["cuda"] = args.cuda
    model = MTDNNModel(config, state_dict=state_dict)
    model.load(checkpoint_path)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # load data
    test_data_set = SingleTaskDataset(
        args.prep_input,
        False,
        task_type=task_type,
        maxlen=args.max_seq_len,
    )
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(
        test_data_set,
        batch_size=args.batch_size_eval,
        collate_fn=collater.collate_fn,
        pin_memory=args.cuda,
    )

    with torch.no_grad():
        outcome = eval_model(
            model,
            test_data,
            metric_meta=metric_meta,
            use_cuda=args.cuda,
            with_label=args.with_label,
        )
        test_metrics, test_predictions, scores, golds, test_ids = outcome

        results = {
            'metrics': test_metrics,
            'predictions': test_predictions,
            'uids': test_ids,
            'scores': scores,
        }
        dump(args.score, results)
        if args.with_label:
            print(test_metrics)
def main(args):
    """Tokenize GLUE-, CLUE- or Qianyan-style task splits and dump JSON.

    ``args.task_type`` picks both the input layout and the loader:
    ``"clue"`` reads ``<root>/<task>/<split>.json``; every other value
    reads ``<root>/<task>_<split>.tsv``. Output is written to
    ``<root>/<model>/<task>_<split>.json``.

    Args:
        args: Parsed command-line namespace. This function reads
            ``root_dir``, ``model``, ``task_def`` and ``task_type``.

    Raises:
        AssertionError: If ``args.root_dir`` does not exist.
        SystemExit: With status 1 when an expected input file is missing.
        ValueError: If ``args.task_type`` is not ``glue``, ``clue`` or
            ``qianyan``.
    """
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)
    tokenizer = AutoTokenizer.from_pretrained(args.model, mirror='tuna')

    out_dir = os.path.join(root, args.model)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            # CLUE keeps one directory per task; the others use flat files.
            if args.task_type != "clue":
                file_path = os.path.join(
                    root, "%s_%s.tsv" % (task, split_name))
            else:
                file_path = os.path.join(root, task, f"{split_name}.json")

            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit" % file_path)
                sys.exit(1)

            kind = args.task_type
            if kind == "glue":
                rows = load_data(file_path, task_def)
            elif kind == "clue":
                rows = load_clue_data(file_path, task_def)
            elif kind == "qianyan":
                rows = load_qianyan_data(file_path, task_def)
            else:
                raise ValueError(f"{args.task_type} not implemented")

            dump_name = "%s_%s.json" % (task, split_name)
            dump_path = os.path.join(out_dir, dump_name)
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
            )
# Script-level setup: normalise the CLI arguments, prepare the output
# directory, seed the environment and create the logger.
data_dir = args.data_dir
output_dir = args.output_dir
# Dataset lists arrive as comma-separated strings.
args.train_datasets = args.train_datasets.split(',')
args.test_datasets = args.test_datasets.split(',')
pprint(args)

os.makedirs(output_dir, exist_ok=True)
output_dir = os.path.abspath(output_dir)

set_environment(args.seed, args.cuda)
log_path = args.log_file
logger = create_logger(__name__, to_disk=True, log_file=log_path)
logger.info(args.answer_opt)

task_defs = TaskDefs(args.task_def)
encoder_type = task_defs.encoderType
args.encoder_type = encoder_type


def dump(path, data):
    """Write ``data`` as JSON to ``path``, replacing any existing file."""
    with open(path, 'w') as handle:
        handle.write(json.dumps(data))


def generate_decoder_opt(enable_san, max_opt):
    """Return ``max_opt`` when SAN is enabled and ``max_opt < 3``, else 0."""
    return max_opt if (enable_san and max_opt < 3) else 0
golds = [] predictions = [] scores = [] for sample_id, label in sample_id_2_label_dic.items(): golds.append(label) pred, score_seg = sample_id_2_pred_score_seg_dic[sample_id] predictions.append(pred) scores.extend(score_seg) return golds, predictions, scores args = parser.parse_args() task_def_path = args.task_def task = args.task task_defs = TaskDefs(task_def_path) n_class = task_defs.n_class_map[task] sample_id_2_pred_score_seg_dic = load_score_file(args.score, n_class) data_format = task_defs.data_format_map[task] task_type = task_defs.task_type_map[task] label_mapper = task_defs.global_map.get(task, None) sample_objs = load_data(args.std_input, data_format, task_type, label_mapper) golds, predictions, scores = generate_golds_predictions_scores( sample_id_2_pred_score_seg_dic, sample_objs) metrics = calc_metrics(task_defs.metric_meta_map[task], golds, predictions, scores) print(metrics)
def main(args):
    """Convert SQuAD-style JSON splits into tokenized feature dumps.

    For each task/split in ``args.task_def`` the file
    ``<root>/<task>_<split>.json`` is flattened with ``flat_squad`` and
    passed to ``prepare_train_feature`` (split name contains ``"train"``)
    or ``prepare_validation_features`` (otherwise). Output is written below
    ``<root>/<family>[_base|_large]_{cased|uncased}/``.

    Args:
        args: Parsed command-line namespace. This function reads
            ``root_dir``, ``model``, ``cache_dir``, ``model_revision``,
            ``task_def``, ``max_seq_length`` and ``doc_stride``.

    Raises:
        AssertionError: If ``args.root_dir`` does not exist.
        KeyError: If the model family is not an ``EncoderModelType`` member.
        SystemExit: With status 1 when an expected input file is missing.
    """
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    # "org/bert-base-uncased" -> "bert-base-uncased" -> "BERT"
    suffix = args.model.split("/")[-1]
    literal_model_type = suffix.split("-")[0].upper()
    # The value is unused here; the lookup is kept so an unknown family
    # still fails with KeyError.
    EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()

    mt_dnn_suffix = literal_model_type
    if "base" in args.model:
        mt_dnn_suffix += "_base"
    elif "large" in args.model:
        mt_dnn_suffix += "_large"

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        cache_dir=args.cache_dir,
        use_fast=True,
        from_slow=True,
        revision=args.model_revision,
    )
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if "uncased" in args.model:
        mt_dnn_suffix = "{}_uncased".format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = "{}_cased".format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            print(root)
            file_path = os.path.join(root, "%s_%s.json" % (task, split_name))
            print(file_path)
            if not os.path.exists(file_path):
                # The original message left "%s" un-interpolated, so the log
                # never said which file was missing.
                logger.warning("File %s doesnot exit" % file_path)
                sys.exit(1)
            logger.warning("processing %s" % file_path)

            is_training = "train" in split_name
            rows = flat_squad(file_path, is_training)
            dump_path = os.path.join(
                mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)

            # Both feature builders take the same arguments; only the
            # function differs between training and evaluation splits.
            if is_training:
                prepare = prepare_train_feature
            else:
                prepare = prepare_validation_features
            prepare(
                tokenizer,
                rows,
                dump_path,
                pad_on_right=pad_on_right,
                label_mapper=task_def.label_vocab,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
            )
def main(args):
    """Tokenize task splits with a BERT, XLNet or RoBERTa tokenizer.

    The encoder family is chosen by substring match on ``args.model``
    (``"roberta"`` wins over ``"xlnet"``, which wins over the BERT
    default). Output goes to ``<root>/<family>_{cased|uncased}[_lower]/``.

    Args:
        args: Parsed command-line namespace. This function reads
            ``do_lower_case``, ``root_dir``, ``model``, ``roberta_path``
            and ``task_def``.

    Raises:
        AssertionError: If ``args.root_dir`` does not exist.
        FileNotFoundError: If a RoBERTa model is requested but
            ``args.roberta_path`` is unset or does not exist.
    """
    # hyper param
    do_lower_case = args.do_lower_case
    root = args.root_dir
    assert os.path.exists(root)

    is_uncased = 'uncased' in args.model

    mt_dnn_suffix = 'bert'
    encoder_model = EncoderModelType.BERT
    if 'xlnet' in args.model:
        encoder_model = EncoderModelType.XLNET
        mt_dnn_suffix = 'xlnet'
    if 'roberta' in args.model:
        encoder_model = EncoderModelType.ROBERTA
        mt_dnn_suffix = 'roberta'

    if encoder_model == EncoderModelType.ROBERTA:
        if args.roberta_path is None or not os.path.exists(args.roberta_path):
            # Previously this only printed the message and then carried on
            # building file names from the bad path.
            raise FileNotFoundError('Please specify roberta model path')
        encoder = get_encoder('{}/encoder.json'.format(args.roberta_path),
                              '{}/vocab.bpe'.format(args.roberta_path))
        # NOTE(review): 'ict.txt' looks like it may be a typo for
        # 'dict.txt' - confirm against the RoBERTa release layout.
        vocab = load_dict('{}/ict.txt'.format(args.roberta_path))
        tokenizer = RoBERTaTokenizer(vocab, encoder)
    elif encoder_model == EncoderModelType.XLNET:
        tokenizer = spm.SentencePieceProcessor()
        if 'large' in args.model:
            tokenizer.load('mt_dnn_models/xlnet_large_cased_spiece.model')
        else:
            tokenizer.load('mt_dnn_models/xlnet_base_cased_spiece.model')
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.model, do_lower_case=do_lower_case)

    if is_uncased:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)
    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    # Read the raw YAML as well; the handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(args.task_def) as task_def_file:
        task_def_dic = yaml.safe_load(task_def_file)

    for task, task_def in task_def_dic.items():
        logger.info("Task %s" % task)
        data_format = DataFormat[task_def["data_format"]]
        task_type = TaskType[task_def["task_type"]]
        label_mapper = task_defs.global_map.get(task, None)
        split_names = task_def.get("split_names", ["train", "dev", "test"])
        for split_name in split_names:
            rows = load_data(
                os.path.join(root, "%s_%s.tsv" % (task, split_name)),
                data_format,
                task_type,
                label_mapper)
            dump_path = os.path.join(
                mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows, dump_path, tokenizer, data_format,
                       encoderModelType=encoder_model)
def load_model_for_viz_0(task_def_path, checkpoint_path, input_path, model_type='bert-base-cased', do_lower_case=False, use_cuda=True):
    """Load a checkpoint and a batch-size-1 data loader for visualisation.

    The task name is the task-definition file name without its extension;
    the part before the first underscore selects the task definition.

    Args:
        task_def_path: Path to the task-definition file.
        checkpoint_path: Path to a checkpoint readable by ``torch.load``.
        input_path: Preprocessed input file given to ``SingleTaskDataset``.
        model_type: Pretrained model identifier; the part before the first
            ``-`` selects the entry in ``MODEL_CLASSES``.
        do_lower_case: Forwarded to the tokenizer.
        use_cuda: When false the checkpoint is mapped onto the CPU and
            the loader does not pin memory.

    Returns:
        A ``(bert_module, config, test_data)`` tuple where ``config`` is
        the patched config dict stored in the checkpoint.
    """
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)

    # load model
    assert os.path.exists(checkpoint_path)
    if use_cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        # Without an explicit map_location a GPU-saved checkpoint cannot be
        # restored on a CPU-only host.
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device('cpu'))
    config = state_dict['config']
    config["cuda"] = use_cuda
    config['task_def_list'] = [task_def]

    ####### temp fix #######
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    # Tolerate checkpoints that were saved without optimizer state.
    state_dict.pop('optimizer', None)
    #########################

    model = MTDNNModel(config, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # The enum lookup and tokenizer construction are not used in the
    # return value; they are retained so an unsupported ``model_type``
    # still fails here as it did before.
    literal_model_type = model_type.split('-')[0].upper()
    EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()

    # load tokenizer
    _, _, tokenizer_class = MODEL_CLASSES[literal_model_type]
    tokenizer_class.from_pretrained(model_type, do_lower_case=do_lower_case)

    # load data
    test_data_set = SingleTaskDataset(input_path,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=use_cuda)
    return model.mnetwork.module.bert, config, test_data
def load_model_for_viz_1(task_def_path, checkpoint_path, input_path, model_type='bert-base-cased', do_lower_case=False, use_cuda=True, skip_batches=360):
    """Collect per-sample attention maps and tokens for visualisation.

    Loads the checkpoint, runs the encoder over the preprocessed input one
    batch at a time and, for every sample after the skipped prefix, keeps
    the attention tensors and tokens truncated just after the last
    ``[SEP]`` token.

    Args:
        task_def_path: Path to the task-definition file.
        checkpoint_path: Path to a checkpoint readable by ``torch.load``.
        input_path: Preprocessed input file given to ``SingleTaskDataset``.
        model_type: Pretrained model identifier; the part before the first
            ``-`` selects the entry in ``MODEL_CLASSES``.
        do_lower_case: Forwarded to the tokenizer.
        use_cuda: Selects the device; when false the checkpoint is mapped
            onto the CPU and the loader does not pin memory.
        skip_batches: Number of leading batches to ignore. Defaults to the
            previously hard-coded value of 360.

    Returns:
        A list of ``(attention, tokens)`` pairs, one per processed sample,
        where ``attention`` is a tuple of sliced tensors.
    """
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)

    # load model
    assert os.path.exists(checkpoint_path)
    if use_cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        # Without an explicit map_location a GPU-saved checkpoint cannot be
        # restored on a CPU-only host.
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device('cpu'))
    config = state_dict['config']
    config["cuda"] = use_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    config['task_def_list'] = [task_def]

    ## temp fix
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    config['output_attentions'] = True
    config['local_rank'] = -1

    model = MTDNNModel(config, device, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # The enum lookup is kept so an unknown model family still raises.
    literal_model_type = model_type.split('-')[0].upper()
    EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()

    # load tokenizer
    _, _, tokenizer_class = MODEL_CLASSES[literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)

    # load data
    test_data_set = SingleTaskDataset(input_path,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=use_cuda)

    results = []
    for batch_no, (batch_meta, batch_data) in enumerate(tqdm(test_data)):
        if batch_no < skip_batches:
            continue
        batch_meta, batch_data = Collater.patch_data(device, batch_meta,
                                                     batch_data)
        model.network.eval()

        inputs = batch_data[:batch_meta['input_len']]
        input_ids = inputs[0]
        token_type_ids = inputs[1]
        # The attention tensors are the last element of the encoder output.
        batch_attention = model.mnetwork.module.bert(
            input_ids, token_type_ids=token_type_ids)[-1]

        batch_size = batch_data[0].shape[0]
        for i in range(batch_size):
            tokens = tokenizer.convert_ids_to_tokens(input_ids[i].tolist())
            # Keep everything up to and including the last [SEP].
            idx_sep = listRightIndex(tokens, '[SEP]') + 1
            # Slice from the batch-level tensors into a fresh name. The
            # original rebound ``attention`` here, so any sample after the
            # first would have sliced an already-sliced tensor.
            sample_attention = tuple(
                layer[i:i + 1, :, :idx_sep, :idx_sep]
                for layer in batch_attention)
            results.append((sample_attention, tokens[:idx_sep]))
    return results
def main():
    """Compute attention-head entropy and importance for a saved model.

    Parses the command line, loads the checkpoint, joins gold complexity
    labels with fine-tuned and pretrained scores into a DataFrame, then
    calls ``compute_heads_importance`` once for each seed 2011..2020 and
    pickles the list of ``(attn_entropy, head_importance)`` results. With
    ``--try_masking`` and a threshold strictly between 0 and 1 it also
    calls ``mask_heads``.

    Several input and output paths are hard-coded in the body.
    """

    def _flag(value):
        """Parse a command-line boolean.

        ``type=bool`` / ``type=str`` treat the string ``"False"`` as true;
        here the usual negative spellings (and the empty string) are false
        and anything else stays true as before.
        """
        if isinstance(value, bool):
            return value
        negatives = ("", "0", "false", "f", "no", "n", "off")
        return value.strip().lower() not in negatives

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--task_def",
                        type=str,
                        required=True,
                        default="experiments/glue/glue_task_def.yml")
    parser.add_argument("--task", type=str, required=True)
    parser.add_argument("--task_id",
                        type=int,
                        default=0,
                        help="the id of this task when training")
    parser.add_argument("--checkpoint",
                        default='mt_dnn_models/bert_model_base_uncased.pt',
                        type=str)
    parser.add_argument(
        "--output_dir",
        default='/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/checkpoints/bert-cased_lcp-single_2020-12-23T2029/',
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--prep_input",
        default='/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/data_complex/bert_base_cased/lcp_dev.json',
        type=str,
        required=True,
    )
    parser.add_argument(
        '--bert_model_type',
        default='bert-base-cased',
        type=str,
        help="What type of bert model should we be using",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--dont_normalize_importance_by_layer",
                        action="store_true",
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action="store_true",
        help="Don't normalize all importance scores between 0 and 1",
    )
    parser.add_argument(
        "--try_masking",
        action="store_true",
        help="Whether to try to mask head until a threshold of accuracy.")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
    )
    parser.add_argument(
        "--masking_amount",
        default=0.1,
        type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name",
                        default="acc",
                        type=str,
                        help="Metric to use for head masking.")
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded.",
    )

    # temp fix: technically these parameters should've already bin in checkpoint's config...
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="For distributed training: world size")
    parser.add_argument("--batch_size",
                        default=8,
                        type=int,
                        help="Batch size.")
    parser.add_argument("--seed", type=int, default=2018)
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--cuda',
                        type=_flag,
                        default=torch.cuda.is_available(),
                        help='whether to use GPU acceleration.')
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument(
        "--do_proper",
        type=_flag,
        default=False,
        help="Restrict the evaluation data to samples whose token starts "
        "with an uppercase letter.")
    parser.add_argument(
        "--do_improper",
        type=_flag,
        default=False,
        help="Restrict the evaluation data to samples whose token does not "
        "start with an uppercase letter.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup devices and distributed training
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # load task info
    task = args.task
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs._task_type_map
    assert args.task in task_defs._data_type_map
    assert args.task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)

    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    opt = state_dict['config']
    args.bin_on = False
    # Command-line values override whatever the checkpoint stored.
    opt.update(vars(args))
    model = MTDNNModel(opt, device=device, state_dict=state_dict)

    # Load data
    data = pd.read_csv('data_complex/lcp_test.tsv',
                       sep='\t',
                       header=None,
                       names=['idx', 'complexity', 'sentence', 'token'])
    # The complexity column read from the TSV is replaced by these labels.
    data['complexity'] = np.load(
        '/content/gdrive/My Drive/Colab Notebooks/cs99/from_macbook/single_test_labels.npy'
    )
    data['class'] = pd.cut(data['complexity'],
                           labels=[1, 2, 3, 4, 5],
                           bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
                           include_lowest=True)
    data['sent_len'] = data['sentence'].str.len()

    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/lcp_test_scores_epoch_4.json',
            'r') as file:
        finetuned_scores = json.load(file)
    data['finetuned_complexity'] = finetuned_scores['scores']
    data['finetuned_error'] = (data['finetuned_complexity'] -
                               data['complexity'])
    data['finetuned_abs_error'] = (data['finetuned_complexity'] -
                                   data['complexity']).abs()

    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/pretrained.json',
            'r') as file:
        pretrained_scores = json.load(file)
    data['pretrained_complexity'] = pretrained_scores['scores']
    data['pretrained_error'] = (data['pretrained_complexity'] -
                                data['complexity'])
    data['pretrained_abs_error'] = (data['pretrained_complexity'] -
                                    data['complexity']).abs()
    data['improvement'] = (data['pretrained_abs_error'] -
                           data['finetuned_abs_error'])
    # A token counts as "proper" when its first character is uppercase.
    data['proper'] = data['token'].apply(lambda x: x[0].isupper())

    printable = opt['local_rank'] in [-1, 0]

    encoder_type = opt.get('encoder_type', EncoderModelType.BERT)
    collater = Collater(is_train=True,
                        encoder_type=encoder_type,
                        max_seq_len=opt['max_seq_len'],
                        do_padding=opt['do_padding'])
    dev_data = SingleTaskDataset(opt['prep_input'],
                                 True,
                                 maxlen=opt['max_seq_len'],
                                 task_id=opt['task_id'],
                                 task_def=task_def,
                                 printable=printable)
    # NOTE(review): if both flags are set, the second filter indexes the
    # already-filtered list - confirm that combination is never used.
    if args.do_proper:
        dev_data._data = np.array(
            dev_data._data)[data[data['proper']]['idx'].to_numpy()].tolist()
    if args.do_improper:
        dev_data._data = np.array(
            dev_data._data)[data[~data['proper']]['idx'].to_numpy()].tolist()
    dev_data_loader = DataLoader(dev_data,
                                 batch_size=opt['batch_size_eval'],
                                 collate_fn=collater.collate_fn,
                                 pin_memory=opt['cuda'])

    # Compute head entropy and importance score
    results = []
    for seed in tqdm(range(2010 + 1, 2020 + 1)):
        # Set seeds
        set_seed(seed)
        attn_entropy, head_importance, _, _ = compute_heads_importance(
            opt, model, dev_data_loader)
        results.append((attn_entropy, head_importance))
    # Close the output file explicitly so the pickle is flushed to disk.
    with open('checkpoints/bert-cased_lcp-single_2021-01-19T0309/results.pkl',
              'wb') as results_file:
        pkl.dump(results, results_file)

    # Try head masking (set heads to zero until the score goes under a threshold)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and 0.0 < args.masking_threshold < 1.0:
        mask_heads(opt, model, dev_data_loader)
def main():
    """Extract encoder-layer features for every input sample.

    Parses the command line, tokenizes the input with ``process_data``,
    writes it to ``<finput>.tmp``, runs the checkpointed model over it and
    writes one JSON object per sample to ``args.foutput``. Each object
    holds the sample's ``uid``, its ``tokens`` and, per requested layer
    index, the stringified embeddings cut to the number of unmasked
    positions.

    Returns early, after logging an error, when ``args.checkpoint`` does
    not exist.
    """
    task_def_path = 'data_complex/lcp.yml'
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)

    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)

    # process data
    data, _ = process_data(args)
    fout_temp = '{}.tmp'.format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(fout_temp,
                                False,
                                maxlen=args.max_seq_length,
                                task_def=task_def)
    batcher = DataLoader(dataset,
                         batch_size=args.batch_size,
                         collate_fn=collater.collate_fn,
                         pin_memory=args.cuda)
    opt = vars(args)

    # load model
    if not os.path.exists(args.checkpoint):
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return

    # Map onto the CPU when CUDA is off so a GPU-saved checkpoint can
    # still be restored; ``None`` keeps torch's default placement.
    state_dict = torch.load(args.checkpoint,
                            map_location=None if args.cuda else 'cpu')
    config = state_dict['config']
    config['dump_feature'] = True
    config['local_rank'] = -1
    opt.update(config)

    num_all_batches = len(batcher)
    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta,
                                                     batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [
            all_encoder_layers[idx].detach().cpu().numpy()
            for idx in layer_indexes
        ]

        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            # The mask sum gives the number of positions to keep.
            slen = sum(masks[idx])
            features_dict[uid] = {
                layer: str(embeddings[yidx][idx][:slen].tolist())
                for yidx, layer in enumerate(layer_indexes)
            }

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            feature = features_dict[uid]
            feature['tokens'] = sample['tokens']
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))