Code example #1
def main(args):
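    """Tokenize each task's <task>_<split>.tsv under root into MT-DNN JSON files with a Hugging Face AutoTokenizer."""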
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model,
                                              cache_dir=args.transformer_cache)

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
                workers=args.workers,
            )
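
A minimal, hypothetical argparse wiring for this entry point (flag names inferred from the attributes main() reads above, not taken from the project) might look like this:

import argparse

def parse_args():
    # Hypothetical flags inferred from the attributes main() reads;
    # the real prepro script may define them differently.
    parser = argparse.ArgumentParser()
    parser.add_argument("--root_dir", type=str, required=True)
    parser.add_argument("--model", type=str, default="bert-base-uncased")
    parser.add_argument("--transformer_cache", type=str, default=None)
    parser.add_argument("--task_def", type=str, default="task_def.yml")
    parser.add_argument("--workers", type=int, default=1)
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())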
Code example #2
File: prepro_std.py Project: r-terada/mt-dnn
def main(args):
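    """Tokenize task TSVs into MT-DNN JSON files, using BertTokenizer for BERT models and SentencePiece for XLNet."""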
    # hyper param
    do_lower_case = args.do_lower_case
    root = args.root_dir
    assert os.path.exists(root)

    is_uncased = 'uncased' in args.model

    is_bert_model = 'xlnet' not in args.model

    if is_bert_model:
        tokenizer = BertTokenizer.from_pretrained(args.model,
                                                  do_lower_case=do_lower_case)
    else:
        tokenizer = spm.SentencePieceProcessor()
        if 'large' in args.model:
            tokenizer.load('mt_dnn_models/xlnet_large_cased_spiece.model')
        else:
            tokenizer.load('mt_dnn_models/xlnet_base_cased_spiece.model')

    mt_dnn_suffix = 'mt_dnn_b' if is_bert_model else 'mt_dnn_x'
    if is_uncased:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)

    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    with open(args.task_def) as f:
        task_def_dic = yaml.safe_load(f)

    for task, task_def in task_def_dic.items():
        logger.info("Task %s" % task)
        data_format = DataFormat[task_def["data_format"]]
        task_type = TaskType[task_def["task_type"]]
        label_mapper = task_defs.global_map.get(task, None)
        split_names = task_def.get("split_names", ["train", "dev", "test"])
        for split_name in split_names:
            rows = load_data(
                os.path.join(root, "%s_%s.tsv" % (task, split_name)),
                data_format, task_type, label_mapper)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       data_format,
                       is_bert_model=is_bert_model)
Code example #3
def main(args):
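    """Tokenize only the test split of each task into MT-DNN JSON files, loading the tokenizer from a local vocab file."""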
    # hyper param
    do_lower_case = args.do_lower_case
    root = "dl/" + args.root_dir
    assert os.path.exists(root)

    literal_model_type = args.model.split('-')[0].upper()

    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in args.model:
        mt_dnn_suffix += "_base"
    elif 'large' in args.model:
        mt_dnn_suffix += "_large"

    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained("dl/mt-dnn-models/vocab.txt",
                                                do_lower_case=do_lower_case)

    if 'uncased' in args.model:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)

    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in ['test']:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            print(file_path)
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       encoderModelType=encoder_model,
                       lab_dict=task_def.label_vocab)
Code example #4
def load_model_for_viz_2(task_def_path,
                         checkpoint_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
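    """Load a checkpoint's config and the matching tokenizer for attention visualization; returns (config, tokenizer) without building the model."""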
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    if use_cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device('cpu'))
    config = state_dict['config']
    config["cuda"] = use_cuda
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ## temp fix
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    del state_dict['optimizer']
    config['output_attentions'] = True
    config['output_hidden_states'] = True
    config['local_rank'] = -1
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load config and tokenizer
    config = BertConfig.from_dict(config)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    return config, tokenizer
Code example #5
def main(args):
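    """Evaluate a trained MT-DNN checkpoint on a preprocessed test set and dump metrics, predictions, and scores."""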
    # load task info
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs.task_type_map
    assert args.task in task_defs.data_type_map
    assert args.task in task_defs.metric_meta_map
    data_type = task_defs.data_type_map[args.task]
    task_type = task_defs.task_type_map[args.task]
    metric_meta = task_defs.metric_meta_map[args.task]

    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    config = state_dict['config']
    config["cuda"] = args.cuda
    model = MTDNNModel(config, state_dict=state_dict)
    model.load(checkpoint_path)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # load data
    test_data_set = SingleTaskDataset(args.prep_input,
                                      False,
                                      task_type=task_type,
                                      maxlen=args.max_seq_len)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=args.batch_size_eval,
                           collate_fn=collater.collate_fn,
                           pin_memory=args.cuda)

    with torch.no_grad():
        test_metrics, test_predictions, scores, golds, test_ids = eval_model(
            model,
            test_data,
            metric_meta=metric_meta,
            use_cuda=args.cuda,
            with_label=args.with_label)

        results = {
            'metrics': test_metrics,
            'predictions': test_predictions,
            'uids': test_ids,
            'scores': scores
        }
        dump(args.score, results)
        if args.with_label:
            print(test_metrics)
Code example #6
def main(args):
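    """Tokenize GLUE-, CLUE-, or qianyan-format task files into MT-DNN JSON, dispatching the loader on args.task_type."""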
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model, mirror='tuna')

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            if args.task_type == "clue":
                file_path = os.path.join(root, task, f"{split_name}.json")
            else:
                file_path = os.path.join(root,
                                         "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit" % file_path)
                sys.exit(1)
            if args.task_type == "glue":
                rows = load_data(file_path, task_def)
            elif args.task_type == "clue":
                rows = load_clue_data(file_path, task_def)
            elif args.task_type == "qianyan":
                rows = load_qianyan_data(file_path, task_def)
            else:
                raise ValueError(f"{args.task_type} not implemented")
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       lab_dict=task_def.label_vocab)
Code example #7
File: train.py Project: tj1116/mt-dnn
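# Module-level setup from train.py: resolve output/data directories, seed the
# environment, set up logging, and read the task definitions.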
output_dir = args.output_dir
data_dir = args.data_dir
args.train_datasets = args.train_datasets.split(',')
args.test_datasets = args.test_datasets.split(',')
pprint(args)

os.makedirs(output_dir, exist_ok=True)
output_dir = os.path.abspath(output_dir)

set_environment(args.seed, args.cuda)
log_path = args.log_file
logger = create_logger(__name__, to_disk=True, log_file=log_path)
logger.info(args.answer_opt)

task_defs = TaskDefs(args.task_def)
encoder_type = task_defs.encoderType
args.encoder_type = encoder_type


def dump(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)


def generate_decoder_opt(enable_san, max_opt):
    opt_v = 0
    if enable_san and max_opt < 3:
        opt_v = max_opt
    return opt_v
Code example #8
File: calc_metrics.py Project: Mehrad0711/mt-dnn-1
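    # Tail of generate_golds_predictions_scores: align gold labels, predictions,
    # and score segments by sample id.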
    golds = []
    predictions = []
    scores = []
    for sample_id, label in sample_id_2_label_dic.items():
        golds.append(label)
        pred, score_seg = sample_id_2_pred_score_seg_dic[sample_id]
        predictions.append(pred)
        scores.extend(score_seg)
    return golds, predictions, scores


args = parser.parse_args()

task_def_path = args.task_def
task = args.task
task_defs = TaskDefs(task_def_path)

n_class = task_defs.n_class_map[task]
sample_id_2_pred_score_seg_dic = load_score_file(args.score, n_class)

data_format = task_defs.data_format_map[task]
task_type = task_defs.task_type_map[task]
label_mapper = task_defs.global_map.get(task, None)
sample_objs = load_data(args.std_input, data_format, task_type, label_mapper)

golds, predictions, scores = generate_golds_predictions_scores(
    sample_id_2_pred_score_seg_dic, sample_objs)

metrics = calc_metrics(task_defs.metric_meta_map[task], golds, predictions,
                       scores)
print(metrics)
Code example #9
def main(args):
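    """Flatten SQuAD-style JSON per task/split and build train or validation features with a fast Hugging Face tokenizer."""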
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)
    suffix = args.model.split("/")[-1]
    literal_model_type = suffix.split("-")[0].upper()

    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if "base" in args.model:
        mt_dnn_suffix += "_base"
    elif "large" in args.model:
        mt_dnn_suffix += "_large"

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        cache_dir=args.cache_dir,
        use_fast=True,
        from_slow=True,
        revision=args.model_revision,
    )
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if "uncased" in args.model:
        mt_dnn_suffix = "{}_uncased".format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = "{}_cased".format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            print(root)
            file_path = os.path.join(root, "%s_%s.json" % (task, split_name))
            print(file_path)

            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            logger.warning("processing %s" % file_path)
            is_training = "train" in split_name

            rows = flat_squad(file_path, is_training)
            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            if is_training:
                prepare_train_feature(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
            else:
                prepare_validation_features(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
Code example #10
def main(args):
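    """Tokenize task TSVs into MT-DNN JSON, selecting a BERT, XLNet, or RoBERTa tokenizer from the model name."""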
    # hyper param
    do_lower_case = args.do_lower_case
    root = args.root_dir
    assert os.path.exists(root)

    is_uncased = 'uncased' in args.model

    mt_dnn_suffix = 'bert'
    encoder_model = EncoderModelType.BERT
    if 'xlnet' in args.model:
        encoder_model = EncoderModelType.XLNET
        mt_dnn_suffix = 'xlnet'

    if 'roberta' in args.model:
        encoder_model = EncoderModelType.ROBERTA
        mt_dnn_suffix = 'roberta'

    if encoder_model == EncoderModelType.ROBERTA:
        if args.roberta_path is None or not os.path.exists(args.roberta_path):
            print('Please specify roberta model path')
            sys.exit(1)
        encoder = get_encoder('{}/encoder.json'.format(args.roberta_path),
                              '{}/vocab.bpe'.format(args.roberta_path))
        # fairseq RoBERTa checkpoints ship their vocabulary as dict.txt
        vocab = load_dict('{}/dict.txt'.format(args.roberta_path))
        tokenizer = RoBERTaTokenizer(vocab, encoder)

    elif encoder_model == EncoderModelType.XLNET:
        tokenizer = spm.SentencePieceProcessor()
        if 'large' in args.model:
            tokenizer.load('mt_dnn_models/xlnet_large_cased_spiece.model')
        else:
            tokenizer.load('mt_dnn_models/xlnet_base_cased_spiece.model')
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model,
                                                  do_lower_case=do_lower_case)

    if is_uncased:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)

    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)
    with open(args.task_def) as f:
        task_def_dic = yaml.safe_load(f)

    for task, task_def in task_def_dic.items():
        logger.info("Task %s" % task)
        data_format = DataFormat[task_def["data_format"]]
        task_type = TaskType[task_def["task_type"]]
        label_mapper = task_defs.global_map.get(task, None)
        split_names = task_def.get("split_names", ["train", "dev", "test"])
        for split_name in split_names:
            rows = load_data(
                os.path.join(root, "%s_%s.tsv" % (task, split_name)),
                data_format, task_type, label_mapper)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       data_format,
                       encoderModelType=encoder_model)
Code example #11
def load_model_for_viz_0(task_def_path,
                         checkpoint_path,
                         input_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
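    """Load an MT-DNN checkpoint plus a preprocessed dataset and return the raw BERT encoder, its config, and a DataLoader."""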
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ####### temp fix #######
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    del state_dict['optimizer']
    #########################
    model = MTDNNModel(config, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=True)
    return model.mnetwork.module.bert, config, test_data
Code example #12
def load_model_for_viz_1(task_def_path,
                         checkpoint_path,
                         input_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
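    """Collect per-example (attention, tokens) pairs from the BERT encoder, skipping the first 360 examples and truncating each at its last [SEP]."""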
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ## temp fix
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    #del state_dict['optimizer']
    config['output_attentions'] = True
    config['local_rank'] = -1
    model = MTDNNModel(config, device, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=True)
    idx = 0
    results = []
    for batch_meta, batch_data in tqdm(test_data):
        if idx < 360:
            idx += 1
            continue
        batch_meta, batch_data = Collater.patch_data(device, batch_meta,
                                                     batch_data)
        model.network.eval()
        task_id = batch_meta['task_id']
        task_def = TaskDef.from_dict(batch_meta['task_def'])
        task_type = task_def.task_type
        task_obj = tasks.get_task_obj(task_def)
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        input_ids = inputs[0]
        token_type_ids = inputs[1]
        attention = model.mnetwork.module.bert(
            input_ids, token_type_ids=token_type_ids)[-1]
        batch_size = batch_data[0].shape[0]
        for i in range(batch_size):
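            # attention is re-sliced in place each iteration; this is only
            # safe because the DataLoader above uses batch_size=1.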
            attention = tuple([item[i:i + 1, :, :, :] for item in attention])
            input_id_list = input_ids[i].tolist()
            tokens = tokenizer.convert_ids_to_tokens(input_id_list)
            idx_sep = listRightIndex(tokens, '[SEP]') + 1
            tokens = tokens[:idx_sep]
            attention = tuple(
                [item[:, :, :idx_sep, :idx_sep] for item in attention])
            results.append((attention, tokens))
        idx += batch_size
    return results
Code example #13
def main():
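    """Compute attention-head entropy and importance for an LCP checkpoint over several seeds, with optional head masking."""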
    parser = argparse.ArgumentParser()
    #   Required parameters
    parser.add_argument("--task_def",
                        type=str,
                        required=True,
                        default="experiments/glue/glue_task_def.yml")
    parser.add_argument("--task", type=str, required=True)
    parser.add_argument("--task_id",
                        type=int,
                        default=0,
                        help="the id of this task when training")
    parser.add_argument("--checkpoint",
                        default='mt_dnn_models/bert_model_base_uncased.pt',
                        type=str)
    parser.add_argument(
        "--output_dir",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/checkpoints/bert-cased_lcp-single_2020-12-23T2029/',
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--prep_input",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/data_complex/bert_base_cased/lcp_dev.json',
        type=str,
        required=True,
    )
    parser.add_argument(
        '--bert_model_type',
        default='bert-base-cased',
        type=str,
        help="What type of bert model should we be using",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Pretrained config name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Pretrained tokenizer name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")

    parser.add_argument("--dont_normalize_importance_by_layer",
                        action="store_true",
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action="store_true",
        help="Don't normalize all importance scores between 0 and 1",
    )

    parser.add_argument(
        "--try_masking",
        action="store_true",
        help="Whether to try to mask head until a threshold of accuracy.")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help=
        "masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
    )
    parser.add_argument(
        "--masking_amount",
        default=0.1,
        type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name",
                        default="acc",
                        type=str,
                        help="Metric to use for head masking.")

    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded.",
    )
    # temp fix: technically these parameters should've already been in the checkpoint's config...
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="For distributed training: world size")

    parser.add_argument("--batch_size",
                        default=8,
                        type=int,
                        help="Batch size.")
    parser.add_argument("--seed", type=int, default=2018)
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--cuda',
                        type=bool,
                        default=torch.cuda.is_available(),
                        help='whether to use GPU acceleration.')
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_proper",
                        type=str,
                        default=False,
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_improper",
                        type=str,
                        default=False,
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup devices and distributed training
    device = torch.device("cuda")
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # load task info
    task = args.task
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs._task_type_map
    assert args.task in task_defs._data_type_map
    assert args.task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[args.task]
    task_type = task_defs._task_type_map[args.task]
    metric_meta = task_defs._metric_meta_map[args.task]
    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    opt = state_dict['config']
    args.bin_on = False
    opt.update(vars(args))
    model = MTDNNModel(opt, device=device, state_dict=state_dict)

    # Load pretrained model and tokenizer
    # Load data
    data = pd.read_csv('data_complex/lcp_test.tsv',
                       sep='\t',
                       header=None,
                       names=['idx', 'complexity', 'sentence', 'token'])
    data['complexity'] = np.load(
        '/content/gdrive/My Drive/Colab Notebooks/cs99/from_macbook/single_test_labels.npy'
    )
    data['class'] = pd.cut(data['complexity'],
                           labels=[1, 2, 3, 4, 5],
                           bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
                           include_lowest=True)
    data['sent_len'] = data['sentence'].str.len()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/lcp_test_scores_epoch_4.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
        data['finetuned_complexity'] = single_dev_bert_scores['scores']
        data['finetuned_error'] = data['finetuned_complexity'] - data[
            'complexity']
        data['finetuned_abs_error'] = (data['finetuned_complexity'] -
                                       data['complexity']).abs()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/pretrained.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
        data['pretrained_complexity'] = single_dev_bert_scores['scores']
        data['pretrained_error'] = data['pretrained_complexity'] - data[
            'complexity']
        data['pretrained_abs_error'] = (data['pretrained_complexity'] -
                                        data['complexity']).abs()
    data['improvement'] = data['pretrained_abs_error'] - data[
        'finetuned_abs_error']
    data['proper'] = data['token'].apply(lambda x: x[0].isupper())
    # Distributed training:
    # download model & vocab.
    printable = opt['local_rank'] in [-1, 0]
    encoder_type = opt.get('encoder_type', EncoderModelType.BERT)
    collater = Collater(is_train=True,
                        encoder_type=encoder_type,
                        max_seq_len=opt['max_seq_len'],
                        do_padding=opt['do_padding'])
    dev_data = SingleTaskDataset(opt['prep_input'],
                                 True,
                                 maxlen=opt['max_seq_len'],
                                 task_id=opt['task_id'],
                                 task_def=task_def,
                                 printable=printable)
    if args.do_proper:
        dev_data._data = np.array(
            dev_data._data)[data[data['proper']]['idx'].to_numpy()].tolist()
    if args.do_improper:
        dev_data._data = np.array(
            dev_data._data)[data[~data['proper']]['idx'].to_numpy()].tolist()
    dev_data_loader = DataLoader(dev_data,
                                 batch_size=opt['batch_size_eval'],
                                 collate_fn=collater.collate_fn,
                                 pin_memory=opt['cuda'])

    # Compute head entropy and importance score
    results = []
    for seed in tqdm(range(2010 + 1, 2020 + 1)):  # Set seeds
        set_seed(seed)
        attn_entropy, head_importance, preds, labels = compute_heads_importance(
            opt, model, dev_data_loader)
        results.append((attn_entropy, head_importance))
    pkl.dump(
        results,
        open('checkpoints/bert-cased_lcp-single_2021-01-19T0309/results.pkl',
             'wb'))

    # Try head masking (set heads to zero until the score goes under a threshold)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and 0.0 < args.masking_threshold < 1.0:
        head_mask = mask_heads(opt, model, dev_data_loader)
Code example #14
def main():
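    """Extract per-layer token embeddings from an MT-DNN checkpoint and write one JSON record per sample."""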
    task_def_path = 'data_complex/lcp.yml'
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)
    # process data
    data, is_single_sentence = process_data(args)
    data_type = DataFormat.PremiseOnly if is_single_sentence else DataFormat.PremiseAndOneHypothesis
    fout_temp = '{}.tmp'.format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(fout_temp, False, maxlen=args.max_seq_length, task_def=task_def)#, data_type=data_type)
    batcher = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collater.collate_fn, pin_memory=args.cuda)
    opt = vars(args)
    # load model
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict['config']
        config['dump_feature'] = True
        config['local_rank'] = -1
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt,
        state_dict=state_dict,
        num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]

        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))