Ejemplo n.º 1
0
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Pretrained tokenizer name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir",
                        action='store_true',
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")

    parser.add_argument("--dont_normalize_importance_by_layer",
                        action='store_true',
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action='store_true',
        help="Don't normalize all importance scores between 0 and 1")

    parser.add_argument(
        "--try_masking",
        action='store_true',
        help="Whether to try to mask head until a threshold of accuracy.")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help=
        "masking threshold in term of metrics (stop masking when metric < threshold * original metric value)."
    )
    parser.add_argument(
        "--masking_amount",
        default=0.1,
        type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name",
                        default="acc",
                        type=str,
                        help="Metric to use for head masking.")

    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded."
    )
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        help="Batch size.")

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup devices and distributed training
    if args.local_rank == -1 or args.no_cuda:
        args.device = torch.device("cuda" if torch.cuda.is_available()
                                   and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        torch.distributed.init_process_group(
            backend='nccl')  # Initializes the distributed backend

    # Setup logging
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed: {}".format(
        args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = ""
    for key in MODEL_CLASSES:
        if key in args.model_name_or_path.lower():
            args.model_type = key  # take the first match in model types
            break
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        output_attentions=True,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
    eval_data = load_and_cache_examples(args,
                                        args.task_name,
                                        tokenizer,
                                        evaluate=True)
    if args.data_subset > 0:
        eval_data = Subset(eval_data,
                           list(range(min(args.data_subset, len(eval_data)))))
    eval_sampler = SequentialSampler(
        eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)

    # Try head masking (set heads to zero until the score goes under a threshole)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
        head_mask = mask_heads(args, model, eval_dataloader)
        prune_heads(args, model, eval_dataloader, head_mask)
Ejemplo n.º 2
0
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Pretrained config name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Pretrained tokenizer name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")

    parser.add_argument("--dont_normalize_importance_by_layer",
                        action="store_true",
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action="store_true",
        help="Don't normalize all importance scores between 0 and 1",
    )
    parser.add_argument("--use_train_data",
                        action="store_true",
                        help="Use training set for computing masks")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help=
        "masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
    )
    parser.add_argument(
        "--masking_amount",
        default=0.1,
        type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name",
                        default=None,
                        type=str,
                        help="Metric to use for head masking.")

    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded.",
    )
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        help="Batch size.")

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup devices and distributed training
    if args.local_rank == -1 or args.no_cuda:
        args.device = torch.device("cuda" if torch.cuda.is_available()
                                   and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        torch.distributed.init_process_group(
            backend="nccl")  # Initializes the distributed backend

    # Setup logging
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed: {}".format(
        args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
    set_seed(args)

    tracker = ImpactTracker(args.output_dir)
    tracker.launch_impact_monitor()

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))

    if args.metric_name is None:
        args.metric_name = {
            "cola": "mcc",
            "mnli": "acc",
            "mnli-mm": "acc",
            "mrpc": "acc",
            "sst-2": "acc",
            "sts-b": "corr",
            "qqp": "acc",
            "qnli": "acc",
            "rte": "acc",
            "wnli": "acc",
            "hans": "acc",
            "mnli_two": "acc",
            "hans_mnli": "acc"
        }[args.task_name]

    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    MODEL_CLASSES["bert"] = (BertConfig, BertForSequenceClassification,
                             MODEL_CLASSES["bert"][2])

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    if args.model_type != "bert":
        raise NotImplemented("Not implemented for non BERT classes yet.")

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        output_attentions=True,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task

    eval_data = load_and_cache_examples(args,
                                        args.task_name,
                                        tokenizer,
                                        evaluate=True)
    true_eval_len = len(eval_data)

    if args.use_train_data:
        train_data = load_and_cache_examples(args,
                                             args.task_name,
                                             tokenizer,
                                             evaluate=False)
        eval_data = random_split(
            train_data,
            [true_eval_len, len(train_data) - true_eval_len])[0]

    if args.data_subset > 0:
        eval_data = Subset(eval_data,
                           list(range(min(args.data_subset, len(eval_data)))))

    eval_sampler = SequentialSampler(
        eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    _, masked_amount = mask_global(args, model, eval_dataloader)

    # Generating a random mask of equal size
    fresh_model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    mask = prune_model(fresh_model, masked_amount, prune.RandomUnstructured)
    torch.save(mask, os.path.join(args.output_dir, "random_mask.p"))

    fresh_model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    mask = prune_model(fresh_model, masked_amount, L1UnstructuredInvert)
    torch.save(mask, os.path.join(args.output_dir, "bad_mask.pt"))
Ejemplo n.º 3
0
def main():
    parser = argparse.ArgumentParser(description="Training")

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    # parser.add_argument("--data_file", type=str, required=True,
    #                     help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
    # parser.add_argument("--token_counts", type=str, required=True,
    #                     help="The token counts in the data_file for MLM.")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite output_dir if it already exists.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    # parser.add_argument("--output_dir", default=None, type=str, required=True,
    #                     help="The output directory where the model predictions and checkpoints will be written.")

    parser.add_argument("--vocab_size", default=30522, type=int,
                        help="The vocabulary size.")
    parser.add_argument("--max_position_embeddings", default=512, type=int,
                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
    parser.add_argument("--n_layers", default=6, type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads", default=12, type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument("--dim", default=768, type=int,
                        help="Dimension through the network. Must be divisible by n_heads")
    parser.add_argument("--hidden_dim", default=3072, type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float,
                        help="Dropout.")
    parser.add_argument("--attention_dropout", default=0.1, type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation", default='gelu', type=str,
                        help="Activation to use in self-attention")
    parser.add_argument("--tie_weights_", action='store_false',
                        help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")

    # parser.add_argument("--from_pretrained_weights", default=None, type=str,
    #                     help="Load student initialization checkpoint.")
    # parser.add_argument("--from_pretrained_config", default=None, type=str,
    #                     help="Load student initialization architecture config.")
    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
                        help="The teacher model.")

    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=1.0, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
    # parser.add_argument("--alpha_mlm", default=0.5, type=float,
    #                     help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--alpha_cos", default=0.0, type=float,
                        help="Linear weight of the cosine embedding loss. Must be >=0.")
    # parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
    #                     help="Proportion of tokens for which we need to make a prediction.")
    # parser.add_argument("--word_mask", default=0.8, type=float,
    #                     help="Proportion of tokens to mask out.")
    # parser.add_argument("--word_keep", default=0.1, type=float,
    #                     help="Proportion of tokens to keep.")
    # parser.add_argument("--word_rand", default=0.1, type=float,
    #                     help="Proportion of tokens to randomly replace.")
    # parser.add_argument("--mlm_smoothing", default=0.7, type=float,
    #                     help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
    # parser.add_argument("--restrict_ce_to_mask", action='store_true',
    #                     help="If true, compute the distilation loss only the [MLM] prediction distribution.")

    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    # parser.add_argument("--tokens_per_batch", type=int, default=-1,
    #                     help="If specified, modify the batches so that they have approximately this number of tokens.")
    # parser.add_argument("--shuffle", action='store_false',
    #                     help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. Default is true.")

    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
                        help="Gradient accumulation for larger training batches.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_prop", default=0.05, type=float,
                        help="Linear warmup proportion.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--learning_rate", default=5e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=5.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--initializer_range", default=0.02, type=float,
                        help="Random initialization range.")

    # parser.add_argument('--fp16', action='store_true',
    #                     help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    # parser.add_argument('--fp16_opt_level', type=str, default='O1',
    #                     help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
    #                          "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--n_gpu", type=int, default=1,
                        help="Number of GPUs in the node.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Distributed training - Local rank")
    parser.add_argument("--seed", type=int, default=56,
                        help="Random seed")

    parser.add_argument("--log_interval", type=int, default=10,
                        help="Tensorboard logging interval.")
    parser.add_argument('--log_examples', action='store_false',
                        help="Show input examples on the command line during evaluation. Enabled by default.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--checkpoint_interval", type=int, default=100,
                        help="Checkpoint interval.")
    parser.add_argument("--no_cuda", type=bool, default=False,
                        help="Avoid using CUDA when available")
    parser.add_argument("--toy_mode", action='store_true', help="Toy mode for development.")
    parser.add_argument("--rich_eval", action='store_true', help="Rich evaluation (more metrics + mistake reporting).")

    args = parser.parse_args()
    args.no_cuda = False
    print("NO CUDA", args.no_cuda)

    ## ARGS ##
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.output_dir):
            if not args.force:
                raise ValueError(f'Serialization dir {args.output_dir} already exists, but you have not precised wheter to overwrite it'
                                   'Use `--force` if you want to overwrite it')
            else:
                shutil.rmtree(args.output_dir)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info(f'Experiment will be dumped and logged in {args.output_dir}')


        ### SAVE PARAMS ###
        logger.info(f'Param: {args}')
        with open(os.path.join(args.output_dir, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        # git_log(args.output_dir)
    print(args.local_rank)
    print(args.no_cuda)
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        args.local_rank = -1
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        device = torch.device("cuda", args.local_rank)
    args.device = device

    logger.info("DEVICE: {}".format(args.device))
    logger.info("N_GPU: {}".format(args.n_gpu))
    # exit(0)
    print(args.local_rank)

    ### TOKENIZER ###
    # if args.teacher_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained(args.teacher_name, do_lower_case=args.do_lower_case)
    # elif args.teacher_type == 'roberta':
    #     tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    args.model_name_or_path = args.teacher_name
    args.max_seq_length = args.max_position_embeddings
    args.model_type = "bert"
    args.output_mode = output_modes[args.task_name]
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)

    ## DATA LOADER ##
    # logger.info(f'Loading data from {args.data_file}')
    # with open(args.data_file, 'rb') as fp:
    #     data = pickle.load(fp)


    # assert os.path.isfile(args.token_counts)
    # logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
    # with open(args.token_counts, 'rb') as fp:
    #     counts = pickle.load(fp)
    #     assert len(counts) == args.vocab_size
    # token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
    # for idx in special_tok_ids.values():
    #     token_probs[idx] = 0.  # do not predict special tokens
    # token_probs = torch.from_numpy(token_probs)


    # train_dataloader = Dataset(params=args, data=data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.batch_size)
    logger.info(f'Data loader created.')


    ## STUDENT ##
    # if args.from_pretrained_weights is not None:
    #     assert os.path.isfile(args.from_pretrained_weights)
    #     assert os.path.isfile(args.from_pretrained_config)
    #     logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
    #     logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
    #     stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
    #     stu_architecture_config.output_hidden_states = True
    #     student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
    #                                                     config=stu_architecture_config)
    # else:
    #     args.vocab_size_or_config_json_file = args.vocab_size
    #     stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
    #     student = DistilBertForMaskedLM(stu_architecture_config)
    student_config = BertConfig(
        vocab_size_or_config_json_file=args.vocab_size,
        hidden_size=args.dim,
        num_hidden_layers=args.n_layers,
        num_attention_heads=args.n_heads,
        intermediate_size=args.hidden_dim,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.attention_dropout,
        max_position_embeddings=args.max_position_embeddings,
        hidden_act=args.activation,
        initializer_range=0.02)
    student = BertForSequenceClassification(student_config)
    if args.n_gpu > 0:
        student.to(f'cuda:{args.local_rank}')
    logger.info(f'Student loaded.')


    ## TEACHER ##
    teacher = BertForSequenceClassification.from_pretrained(args.teacher_name) # take outputs[1] for the logits
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
                          dataloader=train_dataloader,
                          # token_probs=token_probs,
                          student=student,
                          teacher=teacher,
                          tokenizer=tokenizer)
    distiller.train()
    logger.info("Let's go get some drinks.")
args["model_name_or_path"] = 'bert-multilingual-base'
args["max_seq_length"] = 128
args["overwrite_cache"] = True
args["data_dir"] = "/data/misc/cc/2_pairing/glue_data/MRPC"
args["model_type"] = "bert"
args["task_name"] = "mrpc"
args["output_dir"] = "/tmp/mrpc_eval_outputs"
args["per_gpu_eval_batch_size"] = 2
args["n_gpu"] = 1
args["device"] = "cuda"
args["output_mode"] = "classification"

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

train_dataset = load_and_cache_examples(args,
                                        "mrpc",
                                        tokenizer,
                                        evaluate=False)


def split_token_pair(ids_tensor, sep_token_id=102, target_len=128):
    ids_np = ids_tensor.numpy()
    split_idxs = np.argwhere(ids_np == sep_token_id).flatten()
    out = [ids_np[:split_idxs[0]], ids_np[split_idxs[0] + 1:split_idxs[1]]]
    out = [
        np.append(part, np.zeros(target_len - len(part)), axis=0)
        for part in out
    ]
    return torch.tensor(out)


train_tokens = torch.stack([