Example #1
def train(dim):
    logger = logging.getLogger()
    csv.field_size_limit(
        2147483647
    )  # Increase CSV reader's field limit in case we have long text.

    logging.basicConfig(level=logging.INFO)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # hyperparameters
    DATA_DIR = 'data/%s' % dim  # directory of where the train/test/dev files are stored
    OUTPUT_DIR = 'weights/BERT/%s' % dim  # where the model weights will be stored
    BERT_MODEL = 'bert-base-cased'  # BERT model type
    CACHE_DIR = 'cache/'  # where BERT will look for pre-trained models to load parameters from

    MAX_SEQ_LENGTH = 20
    TRAIN_BATCH_SIZE = 24
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 4
    GRADIENT_ACCUMULATION_STEPS = 1
    OUTPUT_MODE = 'classification'
    CONFIG_NAME = "config.json"
    WEIGHTS_NAME = "pytorch_model.bin"

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # load processor model
    from models.BERT_functions import BinaryClassificationProcessor, convert_example_to_feature
    processor = BinaryClassificationProcessor()

    train_examples = processor.get_train_examples(DATA_DIR)
    train_examples_len = len(train_examples)
    label_list = processor.get_labels()  # [0, 1] for binary classification
    num_labels = len(label_list)
    num_train_optimization_steps = int(
        train_examples_len / TRAIN_BATCH_SIZE /
        GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

    # Load the pre-trained BERT tokenizer (vocabulary); it is also needed later
    # to save the vocabulary alongside the fine-tuned weights.
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)

    if os.path.exists(join(DATA_DIR, "train_features.pkl")):
        with open(join(DATA_DIR, "train_features.pkl"), "rb") as f:
            train_features = pickle.load(f)
    else:
        label_map = {label: i for i, label in enumerate(label_list)}
        train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH,
                                          tokenizer, OUTPUT_MODE)
                                         for example in train_examples]
        process_count = min(cpu_count() - 1, 8)
        # process_count = 30
        if __name__ == '__main__':
            print(f'Preparing to convert {train_examples_len} examples..')
            print(f'Spawning {process_count} processes..')
            with Pool(process_count) as p:
                train_features = list(
                    tqdm_notebook(p.imap(convert_example_to_feature,
                                         train_examples_for_processing),
                                  total=train_examples_len))
        with open(join(DATA_DIR, "train_features.pkl"), "wb") as f:
            pickle.dump(train_features, f)

    model = BertForSequenceClassification.from_pretrained(
        BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)
    model.to(device)
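    # Group parameters so that weight decay is applied to everything except
    # biases and LayerNorm weights.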
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=LEARNING_RATE,
                      correct_bias=False)

    # optimizer = BertAdam(optimizer_grouped_parameters,
    #                      lr=LEARNING_RATE,
    #                      warmup=WARMUP_PROPORTION,
    #                      t_total=num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_examples_len)
    logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)

    if OUTPUT_MODE == "classification":
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
    elif OUTPUT_MODE == "regression":
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.float)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=TRAIN_BATCH_SIZE)

    model.train()
    for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(
                tqdm_notebook(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            outputs = model(input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=None)
            logits = outputs[0]

            if OUTPUT_MODE == "classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, num_labels),
                                label_ids.view(-1))
            elif OUTPUT_MODE == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), label_ids.view(-1))

            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS

            loss.backward()
            print("\r%f" % loss, end='')

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model itself

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
    output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

    torch.save(model_to_save.state_dict(), output_model_file)

    tokenizer.save_vocabulary(OUTPUT_DIR)
    print(model.config.to_json_string())
    model_to_save.config.to_json_file(output_config_file)
    return
Example #2
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                position_ids=None,
                head_mask=None,
                ext_mask=None,
                ext_start_labels=None,
                ext_end_labels=None,
                aug_mask=None,
                aug_start_labels=None,
                aug_end_labels=None,
                loss_lambda=1.0):
        # run through bert
        bert_outputs = self.bert(input_ids,
                                 position_ids=position_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask,
                                 head_mask=head_mask)

        # label classifier
        pooled_output = bert_outputs[1]
        pooled_output = self.dropout(pooled_output)
        label_logits = self.label_classifier(pooled_output)

        # extraction classifier
        output = bert_outputs[0]
        ext_mask = ext_mask.unsqueeze(-1)
        ext_start_logits = self.ext_start_classifier(output) * ext_mask
        ext_end_logits = self.ext_end_classifier(output) * ext_mask

        # augmentation classifier
        output = bert_outputs[0]
        aug_mask = aug_mask.unsqueeze(-1)
        aug_start_logits = self.aug_start_classifier(output) * aug_mask
        aug_end_logits = self.aug_end_classifier(output) * aug_mask

        span_logits = (
            ext_start_logits,
            ext_end_logits,
            aug_start_logits,
            aug_end_logits,
        )
        outputs = (label_logits, ) + span_logits + bert_outputs[2:]

        if labels is not None and \
                ext_start_labels is not None and ext_end_labels is not None and \
                aug_start_labels is not None and aug_end_labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(label_logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()

                # label loss
                labels_loss = loss_fct(label_logits.view(-1, self.num_labels),
                                       labels.view(-1))

                # extraction loss
                ext_start_loss = loss_fct(ext_start_logits.squeeze(),
                                          ext_start_labels)
                ext_end_loss = loss_fct(ext_end_logits.squeeze(),
                                        ext_end_labels)

                # augmentation loss
                aug_start_loss = loss_fct(aug_start_logits.squeeze(),
                                          aug_start_labels)
                aug_end_loss = loss_fct(aug_end_logits.squeeze(),
                                        aug_end_labels)

                span_loss = (ext_start_loss + ext_end_loss + aug_start_loss +
                             aug_end_loss) / 4

                # combined loss
                loss = labels_loss + loss_lambda * span_loss

            outputs = (loss, labels_loss, span_loss, ext_start_loss,
                       ext_end_loss, aug_start_loss, aug_end_loss) + outputs

        return outputs  # (loss), (logits), (hidden_states), (attentions)
Example #3
optimizers = {
    "adam": Adam,
    "sgd": SGD,
    "rmsprop": RMSprop,
    "adagrad": Adagrad,
}

losses = {
    "negative log likelihood": NLLLoss(),
    "nll": NLLLoss(),
    "binary cross entropy": BCELoss(),
    "bce": BCELoss(),
    "categorical cross entropy": CrossEntropyLoss(),
    "cce": CrossEntropyLoss(),
    "mean squared error": MSELoss(),
    "mse": MSELoss(),
    "mean absolute error": L1Loss(),
    "mae": L1Loss(),
    "huber loss": SmoothL1Loss(),
}


class RawDataset(Dataset):
    """ Simple Wrapper for torch dataset (used by classes such as TorchModel and Dataloader) """
    def __init__(self, X, y, output_dtype):
        self.X, self.y = assure_tensor(X, torch.float32), assure_tensor(
            y, output_dtype)

    def __len__(self):
        return len(self.X)
Example #4
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )

    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # =================model defination===================
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        if torch.cuda.is_available() and not args.no_cuda:
            device = torch.device("cuda:3")
        else:
            device = torch.device("cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = processor.get_train_examples(args.data_dir)
    # Total number of optimizer steps, needed for BertAdam's warmup schedule below.
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)

    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, num_labels=num_labels)
    hidden_layers = model.config.num_hidden_layers
    model.to(device)
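    # Hook each layer's self-attention dropout (via the custom BertHook) so the
    # attention probabilities can be read out during evaluation.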
    hookFs = [
        BertHook(model.bert.encoder.layer[i].attention.self.dropout)
        for i in range(hidden_layers)
    ]

    #==================End of Model Def=================
    #=====================Start Eval====================
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):

        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode,
                                                     logger)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        # att_ls = []
        first_prob = [[] for i in range(hidden_layers)]

        counter = 1
        save_folder_name = args.data_dir.split("/")[-1]

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)

            probs = [
                hookFs[i].input[0].data.cpu().numpy()
                for i in range(hidden_layers)
            ]
            seq_lens = input_mask.sum(dim=-1)

            for i in range(input_ids.size(0)):  # actual batch size (last batch may be smaller)
                for j in range(hidden_layers):
                    first_prob[j].append(probs[j][i, :, 0, :])
            if counter % 1000 == 0:
                print("saving ...")
                np.save(
                    "data/BERT_att_weights/" + save_folder_name +
                    "-bert-att-all" + str(counter), first_prob)
                first_prob = [[] for i in range(hidden_layers)]
            counter += 1

        print("-" * 80)
        print("len(first_prob): ", len(first_prob))
        print("len(first_prob[0]): ", len(first_prob[0]))
        print("len(first_prob[0][0]): ", len(first_prob[0][0]))

        print("-" * 80)
        print(save_folder_name)
        print("-" * 80)
        np.save("data/BERT_att_weights/" + save_folder_name + "-bert-att-all",
                first_prob)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss / nb_tr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
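Example #5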
def main():
    parser = build_argparse()
    parser.add_argument('--kd_decay', type=float, default=0.999)
    parser.add_argument('--kd_coeff', type=float, default=1.0)
    args = parser.parse_args()
    # output dir
    if args.model_name is None:
        args.model_name = args.model_path.split("/")[-1]
    args.output_dir = args.output_dir + '{}'.format(args.model_name)
    os.makedirs(args.output_dir, exist_ok=True)
    prefix = "_".join([args.model_name, args.task_name])
    logger = TrainLogger(log_dir=args.output_dir, prefix=prefix)
    # device
    logger.info("initializing device")
    args.device, args.n_gpu = prepare_device(args.gpu, args.local_rank)
    seed_everything(args.seed)
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    # data processor
    logger.info("initializing data processor")
    tokenizer = tokenizer_class.from_pretrained(
        args.model_path, do_lower_case=args.do_lower_case)
    processor = ColaProcessor(data_dir=args.data_dir,
                              tokenizer=tokenizer,
                              prefix=prefix)
    label_list = processor.get_labels()
    num_labels = len(label_list)
    args.num_labels = num_labels
    # model
    logger.info("initializing model and config")
    config = config_class.from_pretrained(
        args.model_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(args.model_path, config=config)
    model.to(args.device)
    # trainer
    logger.info("initializing traniner")
    trainer = SDATrainer(logger=logger,
                         args=args,
                         collate_fn=processor.collate_fn,
                         batch_input_keys=processor.get_batch_keys(),
                         kd_model=copy.deepcopy(model),
                         kd_loss_fct=MSELoss(),
                         metrics=[MattewsCorrcoef()])
    # do train
    if args.do_train:
        train_dataset = processor.create_dataset(args.train_max_seq_length,
                                                 'train.tsv', 'train')
        eval_dataset = processor.create_dataset(args.eval_max_seq_length,
                                                'dev.tsv', 'dev')
        trainer.train(model,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset)
    if args.do_eval and args.local_rank in [-1, 0]:
        results = {}
        eval_dataset = processor.create_dataset(args.eval_max_seq_length,
                                                'dev.tsv', 'dev')
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints or args.checkpoint_number > 0:
            checkpoints = get_checkpoints(args.output_dir,
                                          args.checkpoint_number, WEIGHTS_NAME)
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("/")[-1].split("-")[-1]
            model = model_class.from_pretrained(checkpoint, config=config)
            model.to(args.device)
            trainer.evaluate(model,
                             eval_dataset,
                             save_preds=True,
                             prefix=str(global_step))
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in trainer.records['result'].items()
                }
                results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        dict_to_text(output_eval_file, results)
    if args.do_predict:
        test_dataset = processor.create_dataset(args.eval_max_seq_length,
                                                'test.tsv', 'test')
        if args.checkpoint_number == 0:
            raise ValueError("checkpoint number should > 0,but get %d",
                             args.checkpoint_number)
        checkpoints = get_checkpoints(args.output_dir, args.checkpoint_number,
                                      WEIGHTS_NAME)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("/")[-1].split("-")[-1]
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            trainer.predict(model,
                            test_dataset=test_dataset,
                            prefix=str(global_step))
Example #6
def eval_model_multi_sent(eval_examples, processor, args, label_list, tokenizer, output_mode,
                          model, device, num_labels, task_name, tr_loss,
                          nb_tr_steps, global_step, out_prob):
    """return List of List of float"""
    # eval_examples = processor.get_train_examples(args.data_dir)

    eval_features, index_cls = examples2features_sents(
        eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)  # modify
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_cls_pos = torch.tensor(index_cls, dtype=torch.long)  # Modify
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_cls_pos)  # modify
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    preds = []
    prob_res = []

    for input_ids, input_mask, segment_ids, label_ids, cls_pos in tqdm(eval_dataloader, desc="Evaluating"):  # modify
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        cls_pos = cls_pos.to(device)  # modify

        with torch.no_grad():
            logits = model(input_ids, cls_pos, segment_ids, input_mask, labels=None)  # modify

        # create eval loss and other metric required by the task
        if output_mode == "classification":
            loss_fct = CrossEntropyLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        elif output_mode == "regression":
            loss_fct = MSELoss()
            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(
                preds[0], logits.detach().cpu().numpy(), axis=0)
        # -----------------------------------
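        # Convert logits to per-class probabilities and collect them for the
        # optional probability dump below.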
        sm_func = Softmax(dim=1)
        prob = sm_func(logits)  # [batch_size,num_class]
        prob_res = prob_res + list(prob.detach().cpu().numpy())
    prob_list = list(map(lambda x: list(x), prob_res))
    if out_prob:
        print(type(prob_list))
        print(type(prob_list[0]))
        print(prob_list[:3])
        import pickle
        pickle.dump(prob_list, open(os.path.join(args.output_dir, "oof_test.pkl"), "wb"))
    # ---------------------------------
    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)
    result = compute_metrics(task_name, preds, all_label_ids.numpy())
    loss = tr_loss / nb_tr_steps if args.do_train else None

    # Display the results on the validation set
    result['eval_loss'] = eval_loss
    result['global_step'] = global_step
    result['loss'] = loss

    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
    eval_result.append(result)
    return prob_list
Example #7
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=xlnet_collate_fn if
                                  args.model_type in ['xlnet'] else collate_fn)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = int(t_total * args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
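    # Self-distillation setup: keep a frozen copy of the model whose logits are
    # used as soft targets through an MSE loss.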
    if args.do_kd:
        kd_loss_fct = MSELoss()
        kd_model = copy.deepcopy(model)
        kd_model.eval()
    seed_everything(
        args.seed
    )  # Added here for reproducibility (even between python 2 and 3)
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet', 'albert', 'roberta'
                ] else None  # XLM, DistilBERT don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            if args.do_kd:
                inputs['labels'] = None
                with torch.no_grad():
                    kd_logits = kd_model(**inputs)[0]
                kd_loss = kd_loss_fct(outputs[1], kd_logits)
                loss += args.kd_coeff * kd_loss
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
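                # Update the distillation copy as an exponential moving average
                # of the trained parameters.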
                if args.do_kd:
                    decay = min(args.decay,
                                (1 + global_step) / (10 + global_step))
                    one_minus_decay = 1.0 - decay
                    with torch.no_grad():
                        parameters = [
                            p for p in model.parameters() if p.requires_grad
                        ]
                        for s_param, param in zip(kd_model.parameters(),
                                                  parameters):
                            s_param.sub_(one_minus_decay * (s_param - param))
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    print(" ")
                    # Log metrics
                    if args.local_rank == -1:  # Only evaluate when single GPU otherwise metrics may not average well
                        evaluate(args, model, tokenizer)

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    tokenizer.save_vocabulary(vocab_path=output_dir)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
Example #8
    discriminator = Discriminator(
        voc_size=dataset.size,
        hidden_size=args.hidden,
        device=args.device,
        num_layers=args.rnn_layers,
    ).to(args.device)

    print("Q")
    print(Q_func)
    print("R")
    print(reconstructor)
    print("D")
    print(discriminator)

    binary_crossentropy = BCELoss()
    mse_loss = MSELoss()
    categorical_crossentropy = CrossEntropyLoss()

    Q_optimizer = Adam(Q_func.parameters(), lr=args.lr)
    R_optimizer = Adam(reconstructor.parameters(), lr=args.lr)
    D_optimizer = Adam(discriminator.parameters(), lr=args.lr)

    for epoch in range(1, 1 + args.epochs):

        print("epoch {} / {}".format(epoch, args.epochs))
        avgloss = []

        for (q_input, dis_input) in zip(dataloader, dataloader):

            loss = torch.tensor(0.0, device=args.device)
Example #9
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
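Example #10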
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = feature_extractor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: maillot
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.classifier(sequence_output[:, 0, :])
        # we don't use the distillation token

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long
                                              or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
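Example #11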
y_train = y_train.values[order]
y_train = np.reshape(y_train, newshape=(y_train.shape[0], 1))
x_train = x_train.values[order, :]

x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(y_train)

Net = Sequential(
    # BatchNorm1d(num_features=2),
    Linear(in_features=2, out_features=10),
    ReLU(inplace=True),
    Linear(in_features=10, out_features=1),
)

optimizer = RMSprop(Net.parameters(), lr=0.001)
loss_func = MSELoss()

x_data, y_data = Variable(x_train), Variable(y_train)
bar = ProgressBar(1, STEPS, "train_loss:%.9f")

predict = []
myloss = []

for step in range(STEPS):
    prediction = Net(x_data)
    loss = loss_func(prediction, y_data)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    bar.show(1, loss.item())
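Example #12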
                                 max_size=val_size,
                                 transform=simple_transform,
                                 device=device)

    # Loaders
    batch_size = int(train_size / 4)
    loader_train = DataLoader(dataset_train,
                              batch_size=batch_size,
                              shuffle=True)
    loader_val = DataLoader(dataset_val, batch_size=batch_size)

    # Define baseline model
    baseline = Baseline(H1=H1, H2=H2).double().to(device)

    # Define Loss, metric and optimizer
    loss_function = MSELoss()
    challenge_metric = ChallengeMetric()
    opt = Adam(baseline.parameters())

    # Lists to record train and val scores
    train_losses = []
    val_losses = []
    val_scores = []
    best_val_score = 0.

    for epoch in range(1, 1 + epochs):
        print("epoch", epoch)
        train_loss = 0
        val_loss = 0
        val_score = 0
        baseline.train()
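Example #13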
    def forward(
        self,
        pixel_values=None,
        head_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the image classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples::

            >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassification
            >>> from PIL import Image
            >>> import requests

            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
            >>> image = Image.open(requests.get(url, stream=True).raw)

            >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
            >>> # so the head will be randomly initialized, hence the predictions will be random
            >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
            >>> model = DeiTForImageClassification.from_pretrained('facebook/deit-base-distilled-patch16-224')

            >>> inputs = feature_extractor(images=image, return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> logits = outputs.logits
            >>> # model predicts one of the 1000 ImageNet classes
            >>> predicted_class_idx = logits.argmax(-1).item()
            >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.deit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.classifier(sequence_output[:, 0, :])
        # we don't use the distillation token

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
Example #14
from torch.nn import MSELoss, BCELoss

# Losses
MSE = MSELoss()
BCE = BCELoss()
LOSSES = [MSE, BCE]
Example #15
    loss_function = semi_mse_loss(centers, lam=1)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    # Train the semi-supervised model
    tsne_dict = {}  # For visualizing
    nsamples = 5000
    for epoch in range(1, args.epochs + 1):
        t0 = time.time()
        if args.tsne:
            tsne_dict[epoch - 1] = getTSNE(model, epoch, data_loaders,
                                           nsamples, device)
        run_epoch(model, epoch, data_loaders, optimizer, device, args,
                  loss_function, writer)
        test_model(model, epoch, data_loaders, MSELoss(), centers, device,
                   writer)
        print('Wall clock time for epoch: {}'.format(time.time() - t0))
    if args.tsne:
        tsne_dict[epoch] = getTSNE(model, epoch, data_loaders, nsamples,
                                   device)

    # Train the supervised model for comparison
    if args.compare:
        # Reset model and accesories
        if args.dataset == 'Projection':
            model = SimpleNet(args.num_clusters, device)
        else:
            model = LeNet(args.dropout, device)

        optimizer = optim.SGD(model.parameters(),
Example #16
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--teacher_model',
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument('--student_model',
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument('--output_dir', default=None, type=str, required=True)

    # Other parameters
    parser.add_argument(
        '--max_seq_length',
        default=128,
        type=int,
        help=
        'The maximum total input sequence length after WordPiece tokenization. \n'
        'Sequences longer than this will be truncated, and sequences shorter \n'
        'than this will be padded.',
    )

    parser.add_argument(
        '--reduce_memory',
        action='store_true',
        help=
        'Store training data as on-disc memmaps to massively reduce memory usage',
    )
    parser.add_argument(
        '--do_eval',
        action='store_true',
        help='Whether to run eval on the dev set.',
    )
    parser.add_argument(
        '--do_lower_case',
        action='store_true',
        help='Set this flag if you are using an uncased model.',
    )
    parser.add_argument(
        '--train_batch_size',
        default=32,
        type=int,
        help='Total batch size for training.',
    )
    parser.add_argument(
        '--eval_batch_size',
        default=8,
        type=int,
        help='Total batch size for eval.',
    )
    parser.add_argument(
        '--learning_rate',
        default=5e-5,
        type=float,
        help='The initial learning rate for Adam.',
    )
    parser.add_argument(
        '--weight_decay',
        '--wd',
        default=1e-4,
        type=float,
        metavar='W',
        help='weight decay',
    )
    parser.add_argument(
        '--num_train_epochs',
        default=3.0,
        type=float,
        help='Total number of training epochs to perform.',
    )
    parser.add_argument(
        '--warmup_proportion',
        default=0.1,
        type=float,
        help=
        'Proportion of training to perform linear learning rate warmup for. '
        'E.g., 0.1 = 10%% of training.',
    )
    parser.add_argument(
        '--no_cuda',
        action='store_true',
        help='Whether not to use CUDA when available',
    )
    parser.add_argument(
        '--local_rank',
        type=int,
        default=-1,
        help='local_rank for distributed training on gpus',
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=42,
        help='random seed for initialization',
    )
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        'Number of updates steps to accumulate before performing a backward/update pass.',
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help='Whether to use 16-bit float precision instead of 32-bit',
    )
    parser.add_argument(
        '--continue_train',
        action='store_true',
        help='Whether to train from checkpoints',
    )

    # Additional arguments
    parser.add_argument('--eval_step', type=int, default=1000)

    # This is used for running on Huawei Cloud.
    parser.add_argument('--data_url', type=str, default='')

    args = parser.parse_args()
    logger.info('args:{}'.format(args))

    samples_per_epoch = []
    for i in range(int(args.num_train_epochs)):
        epoch_file = args.pregenerated_data / 'epoch_{}.json'.format(i)
        metrics_file = args.pregenerated_data / 'epoch_{}_metrics.json'.format(
            i)
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit('No training data was found!')
            print(
                'Warning! There are fewer epochs of pregenerated data ({}) than training epochs ({}).'
                .format(i, args.num_train_epochs))
            print(
                'This script will loop over the available data, but training diversity may be negatively impacted.'
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = int(args.num_train_epochs)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available()
                              and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    logger.info(
        'device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}'.
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            'Invalid gradient_accumulation_steps parameter: {}, should be >= 1'
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = (args.train_batch_size //
                             args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            'Output directory ({}) already exists and is not empty.'.format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    total_train_examples = 0
    for i in range(int(args.num_train_epochs)):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = (num_train_optimization_steps //
                                        torch.distributed.get_world_size())

    if args.continue_train:
        student_model = TinyBertForPreTraining.from_pretrained(
            args.student_model)
    else:
        student_model = TinyBertForPreTraining.from_scratch(args.student_model)
    teacher_model = BertModel.from_pretrained(args.teacher_model)

    # student_model = TinyBertForPreTraining.from_scratch(args.student_model, fit_size=teacher_model.config.hidden_size)
    student_model.to(device)
    teacher_model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.'
            )

        teacher_model = DDP(teacher_model)
    elif n_gpu > 1:
        student_model = torch.nn.DataParallel(student_model)
        teacher_model = torch.nn.DataParallel(teacher_model)

    size = 0
    for n, p in student_model.named_parameters():
        logger.info('n: {}'.format(n))
        logger.info('p: {}'.format(p.nelement()))
        size += p.nelement()

    logger.info('Total parameters: {}'.format(size))

    # Prepare optimizer
    param_optimizer = list(student_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01,
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0,
        },
    ]

    loss_mse = MSELoss()
    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        warmup=args.warmup_proportion,
        t_total=num_train_optimization_steps,
    )

    global_step = 0
    logging.info('***** Running training *****')
    logging.info('  Num examples = {}'.format(total_train_examples))
    logging.info('  Batch size = %d', args.train_batch_size)
    logging.info('  Num steps = %d', num_train_optimization_steps)

    for epoch in trange(int(args.num_train_epochs), desc='Epoch'):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory,
        )
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(
            epoch_dataset,
            sampler=train_sampler,
            batch_size=args.train_batch_size,
        )

        tr_loss = 0.0
        tr_att_loss = 0.0
        tr_rep_loss = 0.0
        student_model.train()
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader),
                  desc='Epoch {}'.format(epoch)) as pbar:
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc='Iteration', ascii=True)):
                batch = tuple(t.to(device) for t in batch)

                input_ids, input_mask, segment_ids, lm_label_ids, is_next = (
                    batch)
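                # Skip the last, smaller batch so every step uses a full batch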
                if input_ids.size()[0] != args.train_batch_size:
                    continue

                att_loss = 0.0
                rep_loss = 0.0

                student_atts, student_reps = student_model(
                    input_ids, segment_ids, input_mask)
                teacher_reps, teacher_atts, _ = teacher_model(
                    input_ids, segment_ids, input_mask)
                teacher_reps = [
                    teacher_rep.detach() for teacher_rep in teacher_reps
                ]  # speedup 1.5x
                teacher_atts = [
                    teacher_att.detach() for teacher_att in teacher_atts
                ]

                teacher_layer_num = len(teacher_atts)
                student_layer_num = len(student_atts)
                assert teacher_layer_num % student_layer_num == 0
                layers_per_block = int(teacher_layer_num / student_layer_num)
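                # Map each student layer onto the last teacher layer of its block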
                new_teacher_atts = [
                    teacher_atts[i * layers_per_block + layers_per_block - 1]
                    for i in range(student_layer_num)
                ]

                for student_att, teacher_att in zip(student_atts,
                                                    new_teacher_atts):
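                    # Replace masked attention scores (<= -1e2) with zeros before taking the MSE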
                    student_att = torch.where(
                        student_att <= -1e2,
                        torch.zeros_like(student_att).to(device),
                        student_att,
                    )
                    teacher_att = torch.where(
                        teacher_att <= -1e2,
                        torch.zeros_like(teacher_att).to(device),
                        teacher_att,
                    )
                    att_loss += loss_mse(student_att, teacher_att)

                new_teacher_reps = [
                    teacher_reps[i * layers_per_block]
                    for i in range(student_layer_num + 1)
                ]
                new_student_reps = student_reps

                for student_rep, teacher_rep in zip(new_student_reps,
                                                    new_teacher_reps):
                    rep_loss += loss_mse(student_rep, teacher_rep)

                loss = att_loss + rep_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_att_loss += att_loss.item()
                tr_rep_loss += rep_loss.item()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)

                mean_loss = (tr_loss * args.gradient_accumulation_steps /
                             nb_tr_steps)
                mean_att_loss = (tr_att_loss *
                                 args.gradient_accumulation_steps /
                                 nb_tr_steps)
                mean_rep_loss = (tr_rep_loss *
                                 args.gradient_accumulation_steps /
                                 nb_tr_steps)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    if (global_step + 1) % args.eval_step == 0:
                        result = {}
                        result['global_step'] = global_step
                        result['loss'] = mean_loss
                        result['att_loss'] = mean_att_loss
                        result['rep_loss'] = mean_rep_loss
                        output_eval_file = os.path.join(
                            args.output_dir, 'log.txt')
                        with open(output_eval_file, 'a') as writer:
                            logger.info('***** Eval results *****')
                            for key in sorted(result.keys()):
                                logger.info('  %s = %s', key, str(result[key]))
                                writer.write('%s = %s\n' %
                                             (key, str(result[key])))

                        # Save a trained model
                        model_name = 'step_{}_{}'.format(
                            global_step, WEIGHTS_NAME)
                        logging.info(
                            '** ** * Saving fine-tuned model ** ** * ')
                        # Only save the model itself
                        model_to_save = (student_model.module if hasattr(
                            student_model, 'module') else student_model)

                        output_model_file = os.path.join(
                            args.output_dir, model_name)
                        output_config_file = os.path.join(
                            args.output_dir, CONFIG_NAME)

                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        # tokenizer.save_vocabulary(args.output_dir)

                        if oncloud:
                            logging.info(
                                mox.file.list_directory(args.output_dir,
                                                        recursive=True))
                            logging.info(
                                mox.file.list_directory('.', recursive=True))
                            mox.file.copy_parallel(args.output_dir,
                                                   args.data_url)
                            mox.file.copy_parallel('.', args.data_url)

            model_name = 'step_{}_{}'.format(global_step, WEIGHTS_NAME)
            logging.info('** ** * Saving fine-tuned model ** ** * ')
            model_to_save = (student_model.module if hasattr(
                student_model, 'module') else student_model)

            output_model_file = os.path.join(args.output_dir, model_name)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            # tokenizer.save_vocabulary(args.output_dir)

            if oncloud:
                logging.info(
                    mox.file.list_directory(args.output_dir, recursive=True))
                logging.info(mox.file.list_directory('.', recursive=True))
                mox.file.copy_parallel(args.output_dir, args.data_url)
                mox.file.copy_parallel('.', args.data_url)
Example #17
    def forward(self,
                input_ids,
                attention_mask=None,
                e1_mask=None,
                e2_mask=None,
                labels=None):
        outputs = self.Bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0][:, :, :]
        pooled_output = outputs[0][:, 0, :]

        #sequence_output = outputs[0][0]

        def extract_entity(sequence_output, e_mask):
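            # Mask-weighted sum of the token vectors over the entity span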
            extended_e_mask = e_mask.unsqueeze(1)
            extended_e_mask = torch.bmm(extended_e_mask.float(),
                                        sequence_output).squeeze(1)
            return extended_e_mask.float()

        e1_h = extract_entity(sequence_output, e1_mask)
        e2_h = extract_entity(sequence_output, e2_mask)
        context = self.dropout(pooled_output)
        pooled_output = torch.cat([context, e1_h, e2_h], dim=-1)
        print(pooled_output.size())

        logits = self.classifier(pooled_output)

        # add hidden states and attention if they are here
        outputs = (logits, ) + outputs[2:]

        device = logits.get_device()
        l2 = l2_loss(self.parameters())
        #
        if device >= 0:
            l2 = l2.to(device)
        loss = l2 * self.l2_reg_lambda
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss += loss_fct(logits.view(-1), labels.view(-1))
            else:
                # loss_fct = CrossEntropyLoss()
                # loss += loss_fct(
                #     logits.view(-1, self.num_labels), labels.view(-1))
                # I thought that using Gumbel softmax should be better than the following code.

                probabilities = F.softmax(logits, dim=-1)
                log_probs = F.log_softmax(logits, dim=-1)
                one_hot_labels = F.one_hot(labels, num_classes=self.num_labels)
                if device >= 0:
                    one_hot_labels = one_hot_labels.to(device)

                dist = one_hot_labels[:, 1:].float() * log_probs[:, 1:]
                example_loss_except_other, _ = dist.min(dim=-1)
                per_example_loss = -example_loss_except_other.mean()

                rc_probabilities = probabilities - probabilities * one_hot_labels.float(
                )
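                # Penalize the highest probability assigned to a non-gold class (class 0 excluded)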
                second_pre, _ = rc_probabilities[:, 1:].max(dim=-1)
                rc_loss = -(1 - second_pre).log().mean()

                #
                loss += per_example_loss + 5 * rc_loss

            outputs = (loss, ) + outputs

        return outputs
Example #18
def get_tcn_model(dataset=None, plot=False, verbose=False):
    if dataset is None:
        df = pd.read_csv("jeans_day.csv")
    else:
        df = pd.DataFrame.from_dict(dataset)
    ts = TimeSeries.from_dataframe(df,
                                   time_col='time_interval',
                                   value_cols=['count'])

    train, val = ts.split_after(0.8)  #80% train, 20% val

    scaler = Scaler()
    train_transformed = scaler.fit_transform(train)
    val_transformed = scaler.transform(val)
    ts_transformed = scaler.transform(ts)

    params = dict()
    params['kernel_size'] = [4, 6]
    params['num_filters'] = [10]
    params['random_state'] = [0, 1]
    params['input_chunk_length'] = [14]
    params['output_chunk_length'] = [1]
    params['dilation_base'] = [2, 3]
    params['n_epochs'] = [100]
    params['dropout'] = [0]
    params['loss_fn'] = [MSELoss()]
    params['weight_norm'] = [True]
    tcn = TCNModel.gridsearch(parameters=params,
                              series=train_transformed,
                              val_series=val_transformed,
                              verbose=verbose,
                              metric=mse)

    params = tcn[1]
    tcn_model = tcn[0]
    tcn_model.fit(series=train_transformed)
    if plot:
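        # Backtest over the held-out 20% of the series with a one-step forecast horizon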
        backtest = tcn_model.historical_forecasts(series=ts_transformed,
                                                  start=0.8,
                                                  forecast_horizon=1,
                                                  stride=1,
                                                  retrain=False,
                                                  verbose=verbose)
        val = scaler.inverse_transform(val_transformed)
        backtest = scaler.inverse_transform(backtest)
        train = scaler.inverse_transform(train_transformed)
        print(scaler.inverse_transform(tcn_model.predict(7)))
        print("R2: {}".format(r2_score(val, backtest[1:], intersect=False)))
        print("MAPE: {}".format(mape(val, backtest[1:])))
        print("MASE: {}".format(mase(val, backtest[1:], train)))
        print("MAE: {}".format(mae(val, backtest[1:])))
        print("RMSE: {}".format(np.sqrt(mse(val, backtest[1:]))))
        backtest.plot(label='backtest')
        ts.plot(label='actual')
        plt.title("H&M Daily, TCN Model")
        plt.xlabel("Date")
        plt.ylabel("Count")
        plt.legend()
        plt.show()
    return [tcn_model, params]
Example #19
def get_loss_criterion(config):
    """
    Returns the loss function based on provided configuration
    :param config: (dict) a top level configuration object containing the 'loss' key
    :return: an instance of the loss function
    """
    assert 'loss' in config, 'Could not find loss function configuration'
    loss_config = config['loss']
    name = loss_config['name']

    ignore_index = loss_config.get('ignore_index', None)
    weight = loss_config.get('weight', None)

    if weight is not None:
        # convert to cuda tensor if necessary
        weight = torch.tensor(weight).to(config['device'])

    if name == 'BCEWithLogitsLoss':
        skip_last_target = loss_config.get('skip_last_target', False)
        if ignore_index is None and not skip_last_target:
            return nn.BCEWithLogitsLoss()
        else:
            return BCELossWrapper(nn.BCEWithLogitsLoss(),
                                  ignore_index=ignore_index,
                                  skip_last_target=skip_last_target)
    elif name == 'CrossEntropyLoss':
        if ignore_index is None:
            ignore_index = -100  # use the default 'ignore_index' as defined in the CrossEntropyLoss
        return nn.CrossEntropyLoss(weight=weight, ignore_index=ignore_index)
    elif name == 'WeightedCrossEntropyLoss':
        if ignore_index is None:
            ignore_index = -100  # use the default 'ignore_index' as defined in the CrossEntropyLoss
        return WeightedCrossEntropyLoss(weight=weight,
                                        ignore_index=ignore_index)
    elif name == 'PixelWiseCrossEntropyLoss':
        return PixelWiseCrossEntropyLoss(class_weights=weight,
                                         ignore_index=ignore_index)
    elif name == 'GeneralizedDiceLoss':
        return GeneralizedDiceLoss(weight=weight, ignore_index=ignore_index)
    elif name == 'DiceLoss':
        sigmoid_normalization = loss_config.get('sigmoid_normalization', True)
        skip_last_target = loss_config.get('skip_last_target', False)
        return DiceLoss(weight=weight,
                        ignore_index=ignore_index,
                        sigmoid_normalization=sigmoid_normalization,
                        skip_last_target=skip_last_target)
    elif name == 'TagsAngularLoss':
        tags_coefficients = loss_config['tags_coefficients']
        return TagsAngularLoss(tags_coefficients)
    elif name == 'MSEWithLogitsLoss':
        return MSEWithLogitsLoss()
    elif name == 'MSELoss':
        return MSELoss()
    elif name == 'SmoothL1Loss':
        return SmoothL1Loss()
    elif name == 'L1Loss':
        return L1Loss()
    else:
        raise RuntimeError(
            f"Unsupported loss function: '{name}'. Supported losses: {SUPPORTED_LOSSES}"
        )
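
A hedged usage sketch for get_loss_criterion above; the config values are illustrative, and only the built-in CrossEntropyLoss branch is exercised since the custom loss classes live elsewhere in this module:

config = {
    'device': 'cpu',  # any torch-compatible device spec
    'loss': {
        'name': 'CrossEntropyLoss',
        'ignore_index': -1,
        'weight': [1.0, 2.0, 0.5],  # per-class weights, moved onto config['device']
    },
}
criterion = get_loss_criterion(config)
# criterion is an nn.CrossEntropyLoss with the given weights and ignore_index=-1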
Example #20
def codeword_epoch_trainer(model,
                           optim_state,
                           data_loader,
                           device,
                           lr=0.0001,
                           epochs=1,
                           epoch_offset=0,
                           weight_decay=1e-4,
                           verbose=0,
                           multigpu=False):

    optimizer = optim.Adam

    model.train()
    # if codeword trainable, only train the codewords
    # otherwise, train other layers but with fixed codewords

    if multigpu:
        parameters = model.module.sparse_layers.parameters()
        model_optimizer = optimizer(parameters,
                                    lr=lr,
                                    weight_decay=weight_decay)
    else:
        parameters = model.sparse_layers.parameters()
        model_optimizer = optimizer(parameters,
                                    lr=lr,
                                    weight_decay=weight_decay)

    if optim_state is not None:
        print('loading old optimizer state')
        model_optimizer.load_state_dict(optim_state)
    else:
        print('--train with new optimizer state')

    losses = []

    L = MSELoss()

    for epoch_idx in range(epochs):
        if verbose:
            loader = tqdm(data_loader,
                          desc='epoch ' + str(epoch_offset + epoch_idx + 1) +
                          ': ',
                          ncols=80,
                          ascii=True)
        else:
            loader = data_loader

        train_loss, counter = 0, 0

        iteration = 0
        for inputs, _ in loader:
            model_optimizer.zero_grad()

            inputs = inputs.to(device)

            features, reps = model(inputs)

            loss = 0
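            # Sum the MSE between each returned feature map and its paired representation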
            for feature, rep in zip(features, reps):
                loss = loss + L(feature, rep)

            loss.backward()
            model_optimizer.step()

            train_loss += loss.item()
            counter += inputs.size(0)

            iteration += 1

        losses.append(train_loss / counter)
        print('epoch: %s' % str(epoch_offset + epoch_idx + 1))
        print('--loss: %.6f' % (train_loss / counter))

    model_optimizer.zero_grad()

    return losses, model_optimizer.state_dict()
Example #21
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        # device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
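        # Note: the device below is hard-coded to the first GPU; the auto-detection above is commented out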

        device = torch.device("cuda", 0)
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model_path = join(args.output_dir, WEIGHTS_NAME)

    # if os.path.exists(model_path):
    #     output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    #     config = BertConfig(output_config_file)
    #     model = BertForSequenceClassification(config, num_labels)
    #     model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
    #     print('Loaded Saved Model !!!!!!!!!!!!!!')
    # else:
    #     model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)

    if os.path.exists(model_path):
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        print('Loaded Saved Model !!!!!!!!!!!!!!')
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

    if args.local_rank == 0:
        torch.distributed.barrier()

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        cached_train_features_file = os.path.join(
            args.data_dir, 'train_{0}_{1}_{2}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length), str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except (OSError, EOFError, pickle.UnpicklingError):
            # Cache missing or unreadable: rebuild the train features below.
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for epoch in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids,
                               token_type_ids=segment_ids,
                               attention_mask=input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                print('Batch Loss {}/{}: {}'.format(step,
                                                    len(train_dataloader),
                                                    loss.item()),
                      end='\r')
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr',
                                             optimizer.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)
            print('Epoch Loss {}/{}: {}'.format(
                epoch, args.num_train_epochs, tr_loss / len(train_dataloader)))

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    ### Example:
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        cached_eval_features_file = os.path.join(
            args.data_dir, 'dev_{0}_{1}_{2}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length), str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except (OSError, EOFError, pickle.UnpicklingError):
            # Cache missing or unreadable: rebuild the eval features below.
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s",
                            cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(
                eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               token_type_ids=segment_ids,
                               attention_mask=input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)
                out_label_ids = np.append(out_label_ids,
                                          label_ids.detach().cpu().numpy(),
                                          axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir +
                              '-MM') and os.listdir(args.output_dir +
                                                    '-MM') and args.do_train:
                raise ValueError(
                    "Output directory ({}) already exists and is not empty.".
                    format(args.output_dir + '-MM'))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask,
                                   labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(preds[0],
                                         logits.detach().cpu().numpy(),
                                         axis=0)
                    out_label_ids = np.append(out_label_ids,
                                              label_ids.detach().cpu().numpy(),
                                              axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM',
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
Example #22
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.{kornli, korsts}")
    parser.add_argument("--checkpoint",
                        type=str,
                        required=True,
                        help="pretrained model checkpoint")
    # Other parameters
    parser.add_argument("--config_file",
                        type=str,
                        default="data/large_config.json",
                        help="model configuration file")
    parser.add_argument("--vocab_file",
                        type=str,
                        default="data/large_v1_32k_vocab.txt",
                        help="tokenizer vocab file")
    parser.add_argument(
        "--output_dir",
        default="output",
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--num_train_epochs",
                        default=10.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O2',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.n_gpu = n_gpu

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    set_seed(args)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer(args.vocab_file, max_len=args.max_seq_length)

    train_examples = processor.get_train_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    config = Config(args.config_file)
    model = SequenceClassification(config, num_labels=num_labels)
    model.bert.load_state_dict(torch.load(args.checkpoint))
    num_params = count_parameters(model)
    logger.info("Total Parameter: %d" % num_params)
    model.to(device)

    global_step, tr_loss = 0, 0.
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer, output_mode)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)

    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.float)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=t_total *
                                     args.warmup_proportion,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        if args.fp16_opt_level == "O2":
            keep_batchnorm_fp32 = False
        else:
            keep_batchnorm_fp32 = True
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=args.fp16_opt_level,
            keep_batchnorm_fp32=keep_batchnorm_fp32)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    writer = open(output_eval_file, "w", encoding="utf-8")
    model.train()
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        epoch_iterator = tqdm(train_dataloader,
                              desc="Train(XX Epoch) Step(X/X) (loss=X.X)",
                              disable=args.local_rank not in [-1, 0])
        model.train()
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            # define a new function to compute loss values for both output_modes
            logits = model(input_ids, segment_ids, input_mask, labels=None)

            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, num_labels),
                                label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), label_ids.view(-1))

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule after the optimizer step
                optimizer.zero_grad()
                global_step += 1
                epoch_iterator.set_description(
                    "Train(%d Epoch) Step(%d / %d) (loss=%5.5f)" %
                    (_, global_step, t_total, loss.item()))

        model_checkpoint = "%s_%d.bin" % (args.task_name, _)
        logger.info(model_checkpoint)
        output_model_file = os.path.join(args.output_dir, model_checkpoint)
        if n_gpu > 1 or args.local_rank != -1:
            logger.info("***** Saving file *****(module)")
            torch.save(model.module.state_dict(), output_model_file)
        else:
            logger.info("***** Saving file *****")
            torch.save(model.state_dict(), output_model_file)

        if args.do_eval and (args.local_rank == -1
                             or torch.distributed.get_rank() == 0):
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)

            if output_mode == "classification":
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.float)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids,
                                   segment_ids,
                                   input_mask,
                                   labels=None)

                # create eval loss and other metric required by the task
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                             label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    tmp_eval_loss = loss_fct(logits.view(-1),
                                             label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0],
                                         logits.detach().cpu().numpy(),
                                         axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            if output_mode == "classification":
                preds = np.argmax(preds, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(preds)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / global_step

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            logger.info("***** Eval results *****")
            writer.write("%d_epoch\n" % (_ + 1))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write("\n")
    writer.close()
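
The fine-tuning loop above relies on WarmupLinearSchedule for its learning-rate warmup and linear decay. As a minimal sketch (not part of the script; step counts are illustrative assumptions), the same curve can be rebuilt with torch.optim.lr_scheduler.LambdaLR to make the schedule easy to inspect:

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(10, 2)        # stand-in for the classification model
optimizer = AdamW(model.parameters(), lr=3e-5)

t_total = 1000                        # assumed total optimizer steps
warmup_steps = int(0.1 * t_total)     # warmup_proportion = 0.1, as in the script

def lr_lambda(step):
    # linear ramp-up during warmup, then linear decay down to zero
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))

scheduler = LambdaLR(optimizer, lr_lambda)

for step in range(t_total):
    optimizer.step()                  # parameter update (gradients omitted in this sketch)
    scheduler.step()                  # advance the schedule after the optimizer step
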
Example #23
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

        Examples::

            from transformers import AlbertTokenizer, AlbertForSequenceClassification
            import torch

            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids, labels=labels)
            loss, logits = outputs[:2]

        """
        t0 = time.time()
        outputs, t1, t11 = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        t2 = time.time()

        logits = logits.to(DEVICE)

        t22 = time.time()

        outputs = (logits, ) + outputs[
            2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))
                # loss = self.pow_loss(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss, ) + outputs

        # accumulate the wall-clock intervals measured above across forward calls
        self.whole_time += t22 - t0
        self.last_time += t2 - t1
        self.last_comu_time += t22 - t11
        return outputs  # (loss), logits, (hidden_states), (attentions)
    def __init__(self):
        super().__init__()
        self.loss_func = L1Loss() if SILHOUETTE_LOSS_FUNC == 'L1' else MSELoss()
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        adapter_names=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            adapter_names=adapter_names,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)
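
For reference, a standalone sketch of the regression-vs-classification branch used by this head, with random tensors standing in for real model outputs (shapes are illustrative):

import torch
from torch.nn import CrossEntropyLoss, MSELoss

batch_size = 4

# classification: num_labels > 1
num_labels = 3
logits = torch.randn(batch_size, num_labels)
labels = torch.randint(0, num_labels, (batch_size,))
cls_loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

# regression: num_labels == 1
reg_logits = torch.randn(batch_size, 1)
reg_labels = torch.rand(batch_size)
reg_loss = MSELoss()(reg_logits.view(-1), reg_labels.view(-1))

print(cls_loss.item(), reg_loss.item())
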
    return (
        [batch[0] for batch in batches],
        [batch[1] for batch in batches],
    ) 

dataset = DyadLSTMDataset()
idx = [x for x in range(dataset.__len__())]
random.shuffle(idx)
valid_idx = idx[:10000]
train_idx = idx[10000:]
valid_sampler = SubsetRandomSampler(valid_idx)
train_sampler = SubsetRandomSampler(train_idx)

train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collation, sampler=train_sampler) 
valid_loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collation, sampler=valid_sampler) 
loss_fn = MSELoss()

def compute_prediction(batch):
    labels, words = batch
    lengths = [len(x) for x in words]
    max_len = max(lengths)
    z = zip(words, lengths, labels)
    z = sorted(z, key=lambda x: x[1] * -1)
    words, lengths, labels = zip(*z)
    
    words = [x + [0] * (max_len - len(x)) for x in words]
    
    # words = [y for x in words for y in x]
   
    labels = torch.FloatTensor(labels)
    #lengths = torch.FloatTensor(lengths)
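
The collation above pads and length-sorts variable-length sequences by hand; a sketch of the same preparation with torch.nn.utils.rnn helpers (illustrative data, not the DyadLSTMDataset batches):

import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

words = [torch.tensor([5, 7, 9]), torch.tensor([3]), torch.tensor([4, 2])]
lengths = torch.tensor([len(w) for w in words])

padded = pad_sequence(words, batch_first=True, padding_value=0)   # (batch, max_len)
packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
# pack_padded_sequence handles the length sorting internally (enforce_sorted=False)
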
    def train(self, dbn, Xtrain, Ytrain, Xtest, Ytest, train_specs,
              global_specs):
        '''
        Train procedure of the RBM.
        Here the choice is to associate the gradients with the model itself, so as to avoid
        passing independent variables to the CD update function. When the model is saved, these
        gradients are deleted, so as not to serialize a huge object.
        Note that the training procedure is wholly contained in this block, which is the best
        choice for the sake of this work.
        Input:
            - dbn (dbns.DeepBeliefNetwork): the ``parent'' model which this
                                            RBM comes from. It is useful for saving
                                            the partially-trained DBN
            - Xtrain, Xtest, Ytrain, Ytest (torch.Tensor): dataset
            - train_specs (dict): specifications of the SGD learning algorithm
            - global_specs (dict): global variables
        Output:
            - _Xtrain, _Xtest (torch.Tensor): the dataset projected on the subsequent layer
            
        Note: a crucial aspect of this learning segment is the preparation and update of the
        _Xtrain and _Xtest variables. In the context of the bigger picture involving the DBN
        training, it is important, whether using the greedy or iterative learning strategy,
        to produce, update and swap the datasets in the correct way
        '''

        if train_specs['verbose']:
            print('---\nLayer {}: Input size, output size = ({},{})\n'.format(
                self.layer_id, self.Nv, self.Nh))
        #end

        self.dW = torch.zeros_like(self.W)
        self.da = torch.zeros_like(self.a)
        self.db = torch.zeros_like(self.b)

        _Xtrain = torch.zeros([Xtrain.shape[0], Xtrain.shape[1], self.Nh])
        _Xtest = torch.zeros([Xtest.shape[0], Xtest.shape[1], self.Nh])

        # initialization of the loss function.
        # In the scope of this function, we are training one RBM layer
        # hence it is legitimate to reset gradients, loss, ...
        criterion = MSELoss(reduction='mean')

        for epoch in range(train_specs['epochs']):
            if train_specs['verbose']: print('\nEpoch {}'.format(epoch))

            # mask vector for dropout implementation. Generated at each epoch
            self.dropmask = torch.tensor(np.random.binomial(
                1, p=train_specs['dropout'], size=(Xtrain.shape[1], self.Nh)),
                dtype=torch.float32, device=self.device)

            train_loss = 0.0

            # shuffle the indices to present the data batches in different orders
            batch_indices = list(range(Xtrain.shape[0]))
            random.shuffle(batch_indices)
            for n in batch_indices:

                pos_v = Xtrain[n, :, :]

                # parameters update and loss computation
                loss = self.contrastive_divergence_params_update(
                    criterion, pos_v, epoch, Xtrain.shape[1], train_specs)
                train_loss += loss
                # _Xtrain[n,:,:] = self.forward(Xtrain[n,:,:].clone())[0].clone()

                # if train_specs['prog_train']:
                #     # if we need to keep track of the performance of the DeltaRule classifier during
                #     # training, we can train it here
                #     linclass_loss, linclass_acc = self.linclass.train(self.forward(pos_v)[0].clone(), Ytrain[n,:,:].clone())
                #     self.loss_finer.append(linclass_loss)
                #     self.acc_finer.append(linclass_acc)
                # #end

                # Data forth-projection AFTER parameters update
                _Xtrain[n, :, :], _ = self.forward(pos_v)

            #end batches loops

            # test set projection
            for n in range(Xtest.shape[0]):
                _Xtest[n, :, :], _ = self.forward(Xtest[n, :, :])
            #end

            self.cost_profile.append(train_loss / Xtrain.shape[0])
            if global_specs['readout_acc']:
                # if we need readout, here we can compute it from the projected activities
                self.readout_profile.append(
                    self.linear_readout(_Xtrain, Ytrain, _Xtest, Ytest))
            #end

            if train_specs['verbose']:
                print('Training loss = {:.6f}'.format(train_loss /
                                                      Xtrain.shape[0]))
                if global_specs['readout_acc']:
                    print(
                        '[RidgeClassifier] (Test)  Read-out accuracy at layer {:d}: {:.4f}'
                        .format(self.layer_id, self.readout_profile[-1]))
                #end
            #end

            if global_specs['save_tmp']:
                # we may want to save the partially trained model, at an
                # epoch stamp lesser than the whole training span
                if epoch in global_specs['epochs_glws']:
                    dbn.save_model(global_specs,
                                   epoch=epoch,
                                   rbml=self.layer_id)
                #end
            #end

        #end epochs loop

        return _Xtrain, _Xtest
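
The docstring's note about producing and swapping _Xtrain/_Xtest corresponds to a greedy layer-wise scheme: each layer is trained on the previous layer's projections. A self-contained toy sketch of that data flow (ToyLayer is a stand-in, not the RBM class above):

import torch

class ToyLayer:
    def __init__(self, n_in, n_out):
        self.W = torch.randn(n_in, n_out)

    def train(self, X):
        # parameter updates are omitted; only the projection that the real
        # train() returns for the next layer is shown
        return torch.sigmoid(X @ self.W)

X = torch.rand(32, 10)                        # (batch, visible units)
for layer in [ToyLayer(10, 8), ToyLayer(8, 4)]:
    X = layer.train(X)                        # projected data becomes the next layer's input
print(X.shape)                                # torch.Size([32, 4])
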
Example #28
    },
]

###############################################################################
#                          Batchnorm evaluation mode                          #
###############################################################################

SQRT_GGN_SETTINGS += [
    {
        "input_fn":
        lambda: rand(2, 3, 4),
        "module_fn":
        lambda: initialize_training_false_recursive(
            Sequential(BatchNorm1d(num_features=3), Flatten())),
        "loss_function_fn":
        lambda: MSELoss(),
        "target_fn":
        lambda: regression_targets((2, 4 * 3)),
    },
    {
        "input_fn":
        lambda: rand(3, 2, 4, 3),
        "module_fn":
        lambda: initialize_training_false_recursive(
            Sequential(BatchNorm2d(num_features=2), Flatten())),
        "loss_function_fn":
        lambda: MSELoss(),
        "target_fn":
        lambda: regression_targets((3, 2 * 4 * 3)),
    },
]
    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.classifier(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."

        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=pooled_logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
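
A standalone sketch of the "pool at the last non-padding token" step used above, with illustrative ids and pad_token_id assumed to be 0:

import torch

pad_token_id = 0
input_ids = torch.tensor([[11, 12, 13, 0, 0],
                          [21, 22, 0, 0, 0]])
logits = torch.randn(2, 5, 3)                 # (batch, seq_len, num_labels)

# index of the last real token in each sequence
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
pooled_logits = logits[torch.arange(input_ids.size(0)), sequence_lengths]
print(pooled_logits.shape)                    # torch.Size([2, 3])
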
Example #30
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        intent_labels=None,
        entities_labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Trains the model when both entities_labels and intent_labels are passed; otherwise runs inference.

        :param input_ids: embedding ids of tokens
        :param attention_mask: attention_mask
        :param token_type_ids: token_type_ids
        :param position_ids: position_ids (optional)
        :param head_mask: head_mask (optional)
        :param inputs_embeds: inputs_embeds (optional)
        :param intent_labels: labels of intent
        :param entities_labels: labels of entities
        :param output_attentions: return attention weight or not
        :param output_hidden_states: return hidden_states or not
        :param return_dict: return dictionary or not
        :return: a dict with entities_loss, intent_loss, loss, logits, hidden_states and attentions (a plain tuple when return_dict is False)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0][:, 1:]
        sequence_output = self.dropout(sequence_output)

        pooled_output = outputs[0][:, :1]
        pooled_output = self.dropout(pooled_output)

        entities_logits = self.entities_classifier(sequence_output)
        intent_logits = self.intents_classifier(pooled_output)

        entities_loss = None
        if entities_labels is not None:
            entities_loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask[:, 1:].reshape(-1) == 1
                active_logits = entities_logits.view(-1, self.num_entities)
                active_labels = torch.where(
                    active_loss, entities_labels.view(-1),
                    torch.tensor(entities_loss_fct.ignore_index).type_as(
                        entities_labels))
                entities_loss = entities_loss_fct(active_logits, active_labels)
            else:
                entities_loss = entities_loss_fct(
                    entities_logits.view(-1, self.num_entities),
                    entities_labels.view(-1))

        intent_loss = None
        if intent_labels is not None:
            if self.num_intents == 1:
                intent_loss_fct = MSELoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1),
                                              intent_labels.view(-1))
            else:
                intent_loss_fct = CrossEntropyLoss()
                intent_loss = intent_loss_fct(
                    intent_logits.view(-1, self.num_intents),
                    intent_labels.view(-1))

        if (entities_labels is not None) and (intent_labels is not None):
            loss = entities_loss * 0.1 + intent_loss * 0.9
        else:
            loss = None

        if self.training:
            return_dict = True

        if not return_dict:
            output = (loss, ) + outputs[2:]
            return ((loss, ) +
                    output) if ((entities_loss is not None) and
                                (intent_loss is not None)) else output

        return dict(
            entities_loss=entities_loss,
            intent_loss=intent_loss,
            loss=loss,
            logits=(entities_logits, intent_logits),
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
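
The joint head above weights the two losses with fixed factors (0.1 for entities, 0.9 for intent) and masks the token-level loss through the attention mask. A standalone sketch of that active-loss masking, with illustrative shapes:

import torch
from torch.nn import CrossEntropyLoss

num_entities = 4
entities_logits = torch.randn(2, 3, num_entities)     # (batch, seq_len, num_entities)
entities_labels = torch.tensor([[1, 2, 0], [3, 0, 0]])
attention_mask = torch.tensor([[1, 1, 0], [1, 0, 0]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
# positions outside the mask are set to ignore_index so they do not contribute
active_labels = torch.where(active_loss,
                            entities_labels.view(-1),
                            torch.tensor(loss_fct.ignore_index))
entities_loss = loss_fct(entities_logits.view(-1, num_entities), active_labels)
print(entities_loss.item())
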