Example #1
def load_model_for_viz_2(task_def_path,
                         checkpoint_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    if use_cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path,
                                map_location=torch.device('cpu'))
    config = state_dict['config']
    config["cuda"] = use_cuda
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ## temp fix
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    del state_dict['optimizer']
    config['output_attentions'] = True
    config['output_hidden_states'] = True
    config['local_rank'] = -1
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load config and tokenizer
    config = BertConfig.from_dict(config)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    return config, tokenizer
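A minimal usage sketch for the loader above; the task-definition and checkpoint paths are hypothetical placeholders (the task name is derived from the stem of the task-definition file):

# Hypothetical paths, for illustration only.
config, tokenizer = load_model_for_viz_2(
    'data_complex/lcp.yml',
    'checkpoints/lcp_model.pt',
    model_type='bert-base-cased',
    use_cuda=False)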
Example #2
def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model,
                                              cache_dir=args.transformer_cache)

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
                workers=args.workers,
            )
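The snippet omits its argument parser; a minimal sketch of one that supplies every attribute main() reads (the flag names are inferred from the code above, the defaults are assumptions):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--root_dir', type=str, required=True)
    parser.add_argument('--model', type=str, default='bert-base-uncased')
    parser.add_argument('--transformer_cache', type=str, default='.cache')
    parser.add_argument('--task_def', type=str,
                        default='experiments/glue/glue_task_def.yml')
    parser.add_argument('--workers', type=int, default=1)
    main(parser.parse_args())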
Example #3
def main(args):
    # hyper param
    do_lower_case = args.do_lower_case
    root = "dl/" + args.root_dir
    assert os.path.exists(root)

    literal_model_type = args.model.split('-')[0].upper()

    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in args.model:
        mt_dnn_suffix += "_base"
    elif 'large' in args.model:
        mt_dnn_suffix += "_large"

    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained("dl/mt-dnn-models/vocab.txt",
                                                do_lower_case=do_lower_case)

    if 'uncased' in args.model:
        mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)

    if do_lower_case:
        mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in ['test']:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            print(file_path)
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       encoderModelType=encoder_model,
                       lab_dict=task_def.label_vocab)
Example #4
def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model, mirror='tuna')

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            if args.task_type == "clue":
                file_path = os.path.join(root, task, f"{split_name}.json")
            else:
                file_path = os.path.join(root,
                                         "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit" % file_path)
                sys.exit(1)
            if args.task_type == "glue":
                rows = load_data(file_path, task_def)
            elif args.task_type == "clue":
                rows = load_clue_data(file_path, task_def)
            elif args.task_type == "qianyan":
                rows = load_qianyan_data(file_path, task_def)
            else:
                raise ValueError(f"{args.task_type} not implemented")
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(rows,
                       dump_path,
                       tokenizer,
                       task_def.data_type,
                       lab_dict=task_def.label_vocab)
Example #5
def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)
    suffix = args.model.split("/")[-1]
    literal_model_type = suffix.split("-")[0].upper()

    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if "base" in args.model:
        mt_dnn_suffix += "_base"
    elif "large" in args.model:
        mt_dnn_suffix += "_large"

    # tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model,
        cache_dir=args.cache_dir,
        use_fast=True,
        from_slow=True,
        revision=args.model_revision,
    )
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if "uncased" in args.model:
        mt_dnn_suffix = "{}_uncased".format(mt_dnn_suffix)
    else:
        mt_dnn_suffix = "{}_cased".format(mt_dnn_suffix)

    mt_dnn_root = os.path.join(root, mt_dnn_suffix)
    if not os.path.isdir(mt_dnn_root):
        os.mkdir(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            print(root)
            file_path = os.path.join(root, "%s_%s.json" % (task, split_name))
            print(file_path)

            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            logger.warning("processing %s" % file_path)
            is_training = "train" in split_name

            rows = flat_squad(file_path, is_training)
            dump_path = os.path.join(mt_dnn_root, "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            if is_training:
                prepare_train_feature(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
            else:
                prepare_validation_features(
                    tokenizer,
                    rows,
                    dump_path,
                    pad_on_right=pad_on_right,
                    label_mapper=task_def.label_vocab,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride,
                )
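prepare_train_feature and prepare_validation_features are repo-specific, but the pad_on_right flag follows the standard Hugging Face question-answering recipe; a sketch of how such a flag typically selects the (question, context) order, where `examples` is an assumed dict of question and context lists:

# pad_on_right decides whether the question or the context comes first,
# and which segment is truncated when the pair exceeds max_seq_length.
encoded = tokenizer(
    examples['question' if pad_on_right else 'context'],
    examples['context' if pad_on_right else 'question'],
    truncation='only_second' if pad_on_right else 'only_first',
    max_length=args.max_seq_length,
    stride=args.doc_stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding='max_length',
)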
Example #6
def load_model_for_viz_0(task_def_path,
                         checkpoint_path,
                         input_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ####### temp fix #######
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    del state_dict['optimizer']
    #########################
    model = MTDNNModel(config, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=True)
    return model.mnetwork.module.bert, config, test_data
Example #7
def load_model_for_viz_1(task_def_path,
                         checkpoint_path,
                         input_path,
                         model_type='bert-base-cased',
                         do_lower_case=False,
                         use_cuda=True):
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ## temp fix
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    #del state_dict['optimizer']
    config['output_attentions'] = True
    config['local_rank'] = -1
    model = MTDNNModel(config, device, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input,
                                      False,
                                      maxlen=512,
                                      task_id=0,
                                      task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set,
                           batch_size=1,
                           collate_fn=collater.collate_fn,
                           pin_memory=True)
    idx = 0
    results = []
    for batch_meta, batch_data in tqdm(test_data):
        if idx < 360:
            # skip the first 360 examples (hard-coded offset)
            idx += 1
            continue
        batch_meta, batch_data = Collater.patch_data(device, batch_meta,
                                                     batch_data)
        model.network.eval()
        task_id = batch_meta['task_id']
        task_def = TaskDef.from_dict(batch_meta['task_def'])
        task_type = task_def.task_type
        task_obj = tasks.get_task_obj(task_def)
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        input_ids = inputs[0]
        token_type_ids = inputs[1]
        attention = model.mnetwork.module.bert(
            input_ids, token_type_ids=token_type_ids)[-1]
        batch_size = batch_data[0].shape[0]
        for i in range(batch_size):
            # reassigning `attention` here is only safe because batch_size == 1
            attention = tuple([item[i:i + 1, :, :, :] for item in attention])
            input_id_list = input_ids[i].tolist()
            tokens = tokenizer.convert_ids_to_tokens(input_id_list)
            idx_sep = listRightIndex(tokens, '[SEP]') + 1
            tokens = tokens[:idx_sep]
            attention = tuple(
                [item[:, :, :idx_sep, :idx_sep] for item in attention])
            results.append((attention, tokens))
        idx += batch_size
    return results
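listRightIndex is called above but not defined in the snippet; a minimal sketch, assuming it returns the index of the last occurrence of a value in a list:

def listRightIndex(alist, value):
    # Index of the last occurrence of value, e.g. the final '[SEP]' token.
    return len(alist) - alist[::-1].index(value) - 1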
Example #8
    golds = []
    predictions = []
    scores = []
    for sample_id, label in sample_id_2_label_dic.items():
        golds.append(label)
        pred, score_seg = sample_id_2_pred_score_seg_dic[sample_id]
        predictions.append(pred)
        scores.extend(score_seg)
    return golds, predictions, scores


args = parser.parse_args()

task_def_path = args.task_def
task_defs = TaskDefs(task_def_path)
task_def = task_defs.get_task_def(args.task)
n_class = task_def.n_class
sample_id_2_pred_score_seg_dic = load_score_file(args.score, n_class)

data_type = task_def.data_type
task_type = task_def.task_type
label_mapper = task_def.label_vocab
sample_objs = load_data(args.std_input, data_type, task_type, label_mapper)

golds, predictions, scores = generate_golds_predictions_scores(
    sample_id_2_pred_score_seg_dic, sample_objs)

metrics = calc_metrics(task_def.metric_meta, golds, predictions, scores)
print(metrics)
parser.add_argument("--task_def", type=str, default="experiments/glue/glue_task_def.yml")
parser.add_argument("--task", type=str)
parser.add_argument("--add_soft_label", action="store_true",
                    help="without this option, we replace hard label with soft label")

parser.add_argument("--std_input", type=str)
parser.add_argument("--score", type=str)
parser.add_argument("--std_output", type=str)

args = parser.parse_args()

task_def_path = args.task_def
task = args.task
task_defs = TaskDefs(task_def_path)

n_class = task_defs.get_task_def(task).n_class
sample_id_2_pred_score_seg_dic = load_score_file(args.score, n_class)

with open(args.std_output, "w", encoding="utf-8") as out_f:
    for line in open(args.std_input, encoding="utf-8"):
        fields = line.strip("\n").split("\t")
        sample_id = fields[0]
        target_score_idx = 1  # TODO: here we assume binary classification task
        score = sample_id_2_pred_score_seg_dic[sample_id][1][target_score_idx]
        if args.add_soft_label:
            fields = fields[:2] + [str(score)] + fields[2:]
        else:
            fields[1] = str(score)
        out_f.write("\t".join(fields))
        out_f.write("\n")
Example #10
# note: argparse's type=bool treats any non-empty string as True; rely on the default
parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available(),
                    help='whether to use GPU acceleration.')

parser.add_argument("--checkpoint", default='mt_dnn_models/mt_dnn_base_uncased.pt', type=str)
parser.add_argument("--encoder_type", default=EncoderModelType.BERT)

args = parser.parse_args()

# load task info
task = args.task
task_defs = TaskDefs(args.task_def)
assert args.task in task_defs._task_type_map
assert args.task in task_defs._data_type_map
assert args.task in task_defs._metric_meta_map
prefix = task.split('_')[0]
task_def = task_defs.get_task_def(prefix)
data_type = task_defs._data_type_map[args.task]
task_type = task_defs._task_type_map[args.task]
metric_meta = task_defs._metric_meta_map[args.task]
# load model
checkpoint_path = args.checkpoint
assert os.path.exists(checkpoint_path)
if args.cuda:
    state_dict = torch.load(checkpoint_path)
else:
    state_dict = torch.load(checkpoint_path, map_location="cpu")
config = state_dict['config']
config["cuda"] = args.cuda
task_def = task_defs.get_task_def(prefix)
task_def_list = [task_def]
config['task_def_list'] = task_def_list
Example #11
def main():
    parser = argparse.ArgumentParser()
    #   Required parameters
    parser.add_argument("--task_def",
                        type=str,
                        required=True,
                        default="experiments/glue/glue_task_def.yml")
    parser.add_argument("--task", type=str, required=True)
    parser.add_argument("--task_id",
                        type=int,
                        default=0,
                        help="the id of this task when training")
    parser.add_argument("--checkpoint",
                        default='mt_dnn_models/bert_model_base_uncased.pt',
                        type=str)
    parser.add_argument(
        "--output_dir",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/checkpoints/bert-cased_lcp-single_2020-12-23T2029/',
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--prep_input",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/data_complex/bert_base_cased/lcp_dev.json',
        type=str,
        required=True,
    )
    parser.add_argument(
        '--bert_model_type',
        default='bert-base-cased',
        type=str,
        help="What type of bert model should we be using",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Pretrained config name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Pretrained tokenizer name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")

    parser.add_argument("--dont_normalize_importance_by_layer",
                        action="store_true",
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action="store_true",
        help="Don't normalize all importance scores between 0 and 1",
    )

    parser.add_argument(
        "--try_masking",
        action="store_true",
        help="Whether to try to mask head until a threshold of accuracy.")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help=
        "masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
    )
    parser.add_argument(
        "--masking_amount",
        default=0.1,
        type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name",
                        default="acc",
                        type=str,
                        help="Metric to use for head masking.")

    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded.",
    )
    # temp fix: technically these parameters should already be in the checkpoint's config...
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="For distributed training: world size")

    parser.add_argument("--batch_size",
                        default=8,
                        type=int,
                        help="Batch size.")
    parser.add_argument("--seed", type=int, default=2018)
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--cuda',
                        type=bool,
                        default=torch.cuda.is_available(),
                        help='whether to use GPU acceleration.')
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_proper",
                        type=str,
                        default=False,
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_improper",
                        type=str,
                        default=False,
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup devices and distributed training
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # load task info
    task = args.task
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs._task_type_map
    assert args.task in task_defs._data_type_map
    assert args.task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    data_type = task_defs._data_type_map[args.task]
    task_type = task_defs._task_type_map[args.task]
    metric_meta = task_defs._metric_meta_map[args.task]
    # load model
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    opt = state_dict['config']
    args.bin_on = False
    opt.update(vars(args))
    model = MTDNNModel(opt, device=device, state_dict=state_dict)

    # Load pretrained model and tokenizer
    # Load data
    data = pd.read_csv('data_complex/lcp_test.tsv',
                       sep='\t',
                       header=None,
                       names=['idx', 'complexity', 'sentence', 'token'])
    data['complexity'] = np.load(
        '/content/gdrive/My Drive/Colab Notebooks/cs99/from_macbook/single_test_labels.npy'
    )
    data['class'] = pd.cut(data['complexity'],
                           labels=[1, 2, 3, 4, 5],
                           bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
                           include_lowest=True)
    data['sent_len'] = data['sentence'].str.len()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/lcp_test_scores_epoch_4.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
        data['finetuned_complexity'] = single_dev_bert_scores['scores']
        data['finetuned_error'] = data['finetuned_complexity'] - data[
            'complexity']
        data['finetuned_abs_error'] = (data['finetuned_complexity'] -
                                       data['complexity']).abs()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/pretrained.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
        data['pretrained_complexity'] = single_dev_bert_scores['scores']
        data['pretrained_error'] = data['pretrained_complexity'] - data[
            'complexity']
        data['pretrained_abs_error'] = (data['pretrained_complexity'] -
                                        data['complexity']).abs()
    data['improvement'] = data['pretrained_abs_error'] - data[
        'finetuned_abs_error']
    data['proper'] = data['token'].apply(lambda x: x[0].isupper())
    # Distributed training:
    # download model & vocab.
    printable = opt['local_rank'] in [-1, 0]
    encoder_type = opt.get('encoder_type', EncoderModelType.BERT)
    collater = Collater(is_train=True,
                        encoder_type=encoder_type,
                        max_seq_len=opt['max_seq_len'],
                        do_padding=opt['do_padding'])
    dev_data = SingleTaskDataset(opt['prep_input'],
                                 True,
                                 maxlen=opt['max_seq_len'],
                                 task_id=opt['task_id'],
                                 task_def=task_def,
                                 printable=printable)
    if args.do_proper:
        dev_data._data = np.array(
            dev_data._data)[data[data['proper']]['idx'].to_numpy()].tolist()
    if args.do_improper:
        dev_data._data = np.array(
            dev_data._data)[data[~data['proper']]['idx'].to_numpy()].tolist()
    dev_data_loader = DataLoader(dev_data,
                                 batch_size=opt['batch_size_eval'],
                                 collate_fn=collater.collate_fn,
                                 pin_memory=opt['cuda'])

    # Compute head entropy and importance score
    results = []
    for seed in tqdm(range(2010 + 1, 2020 + 1)):  # Set seeds
        set_seed(seed)
        attn_entropy, head_importance, preds, labels = compute_heads_importance(
            opt, model, dev_data_loader)
        results.append((attn_entropy, head_importance))
    pkl.dump(
        results,
        open('checkpoints/bert-cased_lcp-single_2021-01-19T0309/results.pkl',
             'wb'))

    # Try head masking (set heads to zero until the score goes under a threshold)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and 0.0 < args.masking_threshold < 1.0:
        head_mask = mask_heads(opt, model, dev_data_loader)
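set_seed is not shown in the snippet; a common implementation, given here as an assumption, seeds every RNG the evaluation touches so that the head-importance runs are repeatable:

import random

import numpy as np
import torch

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)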
Example #12
def main():
    task_def_path = 'data_complex/lcp.yml'
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)
    # process data
    data, is_single_sentence = process_data(args)
    data_type = DataFormat.PremiseOnly if is_single_sentence else DataFormat.PremiseAndOneHypothesis
    fout_temp = '{}.tmp'.format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(fout_temp, False, maxlen=args.max_seq_length, task_def=task_def)
    batcher = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collater.collate_fn, pin_memory=args.cuda)
    opt = vars(args)
    # load model
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict['config']
        config['dump_feature'] = True
        config['local_rank'] = -1
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model! Aborting.')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt,
        state_dict=state_dict,
        num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]

        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))