Example #1
def train(model,
          loader_train,
          loader_valid,
          num_train_epochs=70,
          x_for_rouge=None,
          x_sent_align=None,
          optim='adam',
          learning_rate=3e-5,
          unchanged_limit=20,
          weights=None,
          ofp_fname='PLT',
          batch_ids=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    rouge_sys_sent_path = 'data.nosync/rouge_sent/' + ofp_fname + '/'
    rouge_sys_segs_path = 'data.nosync/rouge_segs/' + ofp_fname + '/'

    output_model_file = 'saved_models/' + ofp_fname
    output_config_file = 'saved_configs/' + ofp_fname

    if not os.path.exists(rouge_sys_sent_path):
        os.mkdir(rouge_sys_sent_path)
    if not os.path.exists(rouge_sys_segs_path):
        os.mkdir(rouge_sys_segs_path)

    if not os.path.exists('saved_models'):
        os.mkdir('saved_models')
    if not os.path.exists('saved_configs'):
        os.mkdir('saved_configs')

    if optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=learning_rate,
                                    weight_decay=0.01)
    else:
        optimizer = BertAdam(model.parameters(), lr=learning_rate)

    model.train()

    loss_ls, loss_ls_s, loss_ls_qa, loss_valid_ls = [], [], [], []
    qa_acc, qa_f1, sent_acc, sent_f1 = [], [], [], []

    acc_loss, acc_loss_s, acc_loss_qa = [], [], []
    best_qa_f1, best_sent_f1 = None, None

    best_valid = 1e3
    unchanged = 0

    if weights is not None:
        weights = torch.tensor([weights, 1.0], dtype=torch.float32).to(device)

    cur_used_ls_mean, total_used, total_s, mean_seg_len = None, None, None, None

    for _ in trange(num_train_epochs, desc="Epoch"):
        for step, batch in enumerate(tqdm(loader_train, desc="Iteration")):
            optimizer.zero_grad()

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, start_positions, end_position, sent_labels, seg_ids = batch

            loss, loss_s, loss_q = model(input_ids,
                                         seg_ids,
                                         input_mask,
                                         sent_labels,
                                         start_positions,
                                         end_position,
                                         weights,
                                         train=True)

            loss.backward()
            optimizer.step()

            acc_loss.append(loss.cpu().data.numpy())
            acc_loss_s.append(loss_s.cpu().data.numpy())
            acc_loss_qa.append(loss_q.cpu().data.numpy())

            # every 10,000 steps, log the averaged training losses and run validation
            if (step + 1) % 10000 == 0:
                loss_ls.append(np.mean(acc_loss))
                loss_ls_s.append(np.mean(acc_loss_s))
                loss_ls_qa.append(np.mean(acc_loss_qa))

                acc_loss, acc_loss_s, acc_loss_qa = [], [], []

                with torch.no_grad():
                    eval_gt_start, eval_gt_end, eval_gt_sent = [], [], []
                    eval_sys_start, eval_sys_end, eval_sys_sent = [], [], []

                    valid_ls = []

                    for _, batch_valid in enumerate(
                            tqdm(loader_valid, desc="Validation")):
                        batch_valid = tuple(
                            t2.to(device) for t2 in batch_valid)

                        input_ids, input_mask, start_positions, end_position, sent_labels, seg_ids = batch_valid
                        start_l, end_l, sent_l, valid_l = model(
                            input_ids, seg_ids, input_mask, sent_labels,
                            start_positions, end_position, None)

                        eval_gt_start.extend(
                            start_positions.cpu().data.numpy())
                        eval_gt_end.extend(end_position.cpu().data.numpy())
                        eval_gt_sent.extend(sent_labels.cpu().data.numpy())

                        eval_sys_start.extend(start_l.cpu().data.numpy())
                        eval_sys_end.extend(end_l.cpu().data.numpy())
                        eval_sys_sent.extend(sent_l.cpu().data.numpy())

                        valid_ls.append(valid_l.cpu().data.numpy())

                    qa_acc_val, qa_f1_val, sent_acc_val, sent_f1_val = get_valid_evaluation(
                        eval_gt_start, eval_gt_end, eval_gt_sent,
                        eval_sys_start, eval_sys_end, eval_sys_sent)

                    avg_val_loss = np.mean(valid_ls)

                    qa_acc.append(qa_acc_val)
                    qa_f1.append(qa_f1_val)
                    sent_acc.append(sent_acc_val)
                    sent_f1.append(sent_f1_val)
                    loss_valid_ls.append(avg_val_loss)

                    # keep the checkpoint with the lowest validation loss; stop once it
                    # has not improved for `unchanged_limit` validation rounds
                    if avg_val_loss < best_valid:
                        best_valid = avg_val_loss
                        unchanged = 0

                        best_qa_f1 = qa_f1_val
                        best_sent_f1 = sent_f1_val

                        cur_used_ls_mean, total_used, total_s, mean_seg_len, _ = create_valid_rouge(
                            x_for_rouge, eval_sys_sent, eval_sys_start,
                            eval_sys_end, eval_gt_sent, eval_gt_start,
                            eval_gt_end, batch_ids, x_sent_align,
                            rouge_sys_sent_path, rouge_sys_segs_path,
                            ofp_fname)

                        torch.save(model, output_model_file)

                    elif unchanged > unchanged_limit:
                        create_metric_figure(ofp_fname, loss_ls, loss_ls_s,
                                             loss_ls_qa, loss_valid_ls, qa_f1,
                                             sent_f1, cur_used_ls_mean,
                                             total_used, total_s, mean_seg_len,
                                             best_qa_f1, best_sent_f1)
                        return
                    else:
                        unchanged += 1

    create_metric_figure(ofp_fname, loss_ls, loss_ls_s, loss_ls_qa,
                         loss_valid_ls, qa_f1, sent_f1, cur_used_ls_mean,
                         total_used, total_s, mean_seg_len, best_qa_f1,
                         best_sent_f1)
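
The checkpointing logic inside the validation block above keeps the model with the lowest validation loss and stops once unchanged_limit validations pass without improvement. A minimal, self-contained sketch of that bookkeeping (not part of the original example) looks like this:

class EarlyStopper:
    """Track the best validation loss and count non-improving checks."""

    def __init__(self, limit=20):
        self.best = float("inf")
        self.unchanged = 0
        self.limit = limit

    def should_stop(self, val_loss):
        # Reset the counter on improvement; otherwise count up and stop past the limit.
        if val_loss < self.best:
            self.best = val_loss
            self.unchanged = 0
            return False
        self.unchanged += 1
        return self.unchanged > self.limit
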
Example #2
class DualTrainer(object):
    def __init__(self, qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = DualNet(qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path)
        train_dir = os.path.join("./save", "dual")
        self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        # read data-set and prepare iterator
        self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
        self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")

        num_train_optimization_steps = len(self.train_loader) * config.num_epochs
        # optimizer
        param_optimizer = list(self.model.qa_model.named_parameters())
        # hack to remove the pooler, which is not used
        # and would otherwise produce None grads that break apex
        param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        self.qa_opt = BertAdam(optimizer_grouped_parameters,
                               lr=config.qa_lr,
                               warmup=config.warmup_proportion,
                               t_total=num_train_optimization_steps)

        params = list(self.model.ca2q_model.encoder.parameters()) \
                 + list(self.model.ca2q_model.decoder.parameters())
        # self.qg_lr = config.lr
        self.qg_opt = optim.Adam(params, config.qa_lr)

        # assign model to device and wrap it with DataParallel
        torch.cuda.set_device(0)
        self.model.cuda()
        self.model = nn.DataParallel(self.model)

    def get_data_loader(self, file):
        train_examples = read_squad_examples(file, is_training=True, debug=config.debug)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer=self.tokenizer,
                                                      max_seq_length=config.max_seq_len,
                                                      max_query_length=config.max_query_len,
                                                      doc_stride=128,
                                                      is_training=True)
        all_c_ids = torch.tensor([f.c_ids for f in train_features], dtype=torch.long)
        all_c_lens = torch.sum(torch.sign(all_c_ids), 1)
        all_tag_ids = torch.tensor([f.tag_ids for f in train_features], dtype=torch.long)
        all_q_ids = torch.tensor([f.q_ids for f in train_features], dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_noq_start_positions = torch.tensor([f.noq_start_position for f in train_features], dtype=torch.long)
        all_noq_end_positions = torch.tensor([f.noq_end_position for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_c_ids, all_c_lens, all_tag_ids,
                                   all_q_ids, all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions, all_end_positions,
                                   all_noq_start_positions, all_noq_end_positions)

        sampler = RandomSampler(train_data)
        batch_size = int(config.batch_size / config.gradient_accumulation_steps)
        train_loader = DataLoader(train_data, sampler=sampler, batch_size=batch_size)

        return train_loader

    def save_model(self, loss, epoch):
        loss = round(loss, 3)
        dir_name = os.path.join(self.save_dir, "bert_{}_{:.3f}".format(epoch, loss))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # save bert model
        model_to_save = self.model.module.qa_model if hasattr(self.model, "module") else self.model.qa_model
        model_file = os.path.join(dir_name, "pytorch_model.bin")
        config_file = os.path.join(dir_name, "bert_config.json")

        state_dict = model_to_save.state_dict()
        torch.save(state_dict, model_file)
        model_to_save.config.to_json_file(config_file)
        # save qg model
        model_to_save = self.model.module.ca2q_model if hasattr(self.model, "module") else self.model.ca2q_model
        file = os.path.join(self.save_dir, "{}_{:.3f}".format(epoch, loss))
        state_dict = {
            "encoder_state_dict": model_to_save.encoder.state_dict(),
            "decoder_state_dict": model_to_save.decoder.state_dict()
        }
        torch.save(state_dict, file)

    def train(self):
        global_step = 1
        batch_num = len(self.train_loader)
        best_loss = 1e10
        qa_loss_lst = []
        qg_loss_lst = []
        for epoch in range(1, config.num_epochs + 1):
            start = time.time()
            for step, batch in enumerate(self.train_loader, start=1):
                qa_loss, ca2q_loss = self.model(batch)

                # mean() to average across multiple GPUs before back-propagation
                qa_loss = qa_loss.mean() / config.gradient_accumulation_steps
                ca2q_loss = ca2q_loss.mean() / config.gradient_accumulation_steps

                # retain the graph so the second backward over shared computation succeeds
                qa_loss.backward(retain_graph=True)
                ca2q_loss.backward()

                qa_loss_lst.append(qa_loss.detach().item())
                qg_loss_lst.append(ca2q_loss.detach().item())
                # clip gradient
                nn.utils.clip_grad_norm_(self.model.module.ca2q_model.parameters(), config.max_grad_norm)

                # update params
                if step % config.gradient_accumulation_steps == 0:
                    self.qa_opt.step()
                    self.qg_opt.step()
                    # zero grad
                    self.qa_opt.zero_grad()
                    self.qg_opt.zero_grad()
                    global_step += 1
                    avg_qa_loss = sum(qa_loss_lst)
                    avg_qg_loss = sum(qg_loss_lst)
                    # empty list
                    qa_loss_lst = []
                    qg_loss_lst = []
                    msg = "{}/{} {} - ETA : {} - qa_loss: {:.2f}, ca2q_loss :{:.2f}" \
                        .format(step, batch_num, progress_bar(step, batch_num),
                                eta(start, step, batch_num),
                                avg_qa_loss, avg_qg_loss)
                    print(msg, end="\r")

            val_qa_loss, val_qg_loss = self.evaluate(msg)
            if val_qg_loss <= best_loss:
                best_loss = val_qg_loss
                self.save_model(val_qg_loss, epoch)

            print("Epoch {} took {} - final loss : {:.4f} -  qa_loss :{:.4f}, qg_loss :{:.4f}"
                  .format(epoch, user_friendly_time(time_since(start)), ca2q_loss, val_qa_loss, val_qg_loss))

    def evaluate(self, msg):
        self.model.module.qa_model.eval()
        self.model.module.ca2q_model.eval_mode()
        num_val_batches = len(self.dev_loader)
        val_qa_losses = []
        val_qg_losses = []
        for i, val_data in enumerate(self.dev_loader, start=1):
            with torch.no_grad():
                val_batch_loss = self.model(val_data)
                qa_loss, qg_loss = val_batch_loss
                val_qa_losses.append(qa_loss.mean().item())
                val_qg_losses.append(qg_loss.mean().item())
                msg2 = "{} => Evaluating :{}/{}".format(msg, i, num_val_batches)
                print(msg2, end="\r")
        val_qa_loss = np.mean(val_qa_losses)
        val_qg_loss = np.mean(val_qg_losses)
        self.model.module.qa_model.train()
        self.model.module.ca2q_model.train_mode()
        return val_qa_loss, val_qg_loss
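
The optimizer setup in DualTrainer.__init__ (and again in the examples below) uses the standard BERT fine-tuning pattern of excluding bias and LayerNorm parameters from weight decay. A small self-contained sketch of that grouping, with a toy module and torch.optim.AdamW standing in for BertAdam:

import torch.nn as nn
from torch.optim import AdamW

class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)  # attribute named to mirror BERT's parameter names

    def forward(self, x):
        return self.LayerNorm(self.dense(x))

model = TinyEncoder()
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
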
Example #3
    'best_acc_test': 0,
    'best_mac_test': 0,
    'best_mic_test': 0
}
for epoch in range(args.max_epoch):
    print('-' * 20, 'Epoch {}'.format(epoch), '-' * 20)
    start_time = time.time()

    epoch_loss = []
    progress = tqdm.tqdm(total=batch_num,
                         mininterval=1,
                         desc='Epoch: {}'.format(epoch))
    for batch_idx in range(batch_num):
        global_step += 1
        progress.update(1)
        optimizer.zero_grad()
        batch = train_set.next_batch(label_size,
                                     batch_size,
                                     drop_last=True,
                                     shuffle=True,
                                     gpu=gpu)
        (
            elmos,
            labels,
            men_masks,
            ctx_masks,
            dists,
            gathers,
            men_ids,
        ) = batch
        loss = model.forward(elmos, labels, men_masks, ctx_masks, dists,
Example #4
class MTDNNModel(MTDNNPretrainedModel):
    """Instance of an MTDNN Model

    Arguments:
        MTDNNPretrainedModel {BertPretrainedModel} -- Inherited from Bert Pretrained
        config  {MTDNNConfig} -- MTDNN Configuration Object
        pretrained_model_name {str} -- Name of the pretrained model to initial checkpoint
        num_train_step  {int} -- Number of steps to take each training

    Raises:
        RuntimeError: [description]
        ImportError: [description]

    Returns:
        MTDNNModel -- An Instance of an MTDNN Model
    """
    def __init__(
        self,
        config: MTDNNConfig,
        task_defs: MTDNNTaskDefs,
        data_processor: MTDNNDataProcess,
        pretrained_model_name: str = "mtdnn-base-uncased",
        test_datasets_list: list = [],
        output_dir: str = "checkpoint",
    ):

        # Input validation
        assert (
            config.init_checkpoint in self.supported_init_checkpoints()
        ), f"Initial checkpoint must be in {self.supported_init_checkpoints()}"

        num_train_step = data_processor.get_num_all_batches()
        decoder_opts = data_processor.get_decoder_options_list()
        task_types = data_processor.get_task_types_list()
        dropout_list = data_processor.get_tasks_dropout_prob_list()
        loss_types = data_processor.get_loss_types_list()
        kd_loss_types = data_processor.get_kd_loss_types_list()
        tasks_nclass_list = data_processor.get_task_nclass_list()

        # data loaders
        multitask_train_dataloader = data_processor.get_train_dataloader()
        dev_dataloaders_list = data_processor.get_dev_dataloaders()
        test_dataloaders_list = data_processor.get_test_dataloaders()

        assert decoder_opts, "Decoder options list is required!"
        assert task_types, "Task types list is required!"
        assert dropout_list, "Task dropout list is required!"
        assert loss_types, "Loss types list is required!"
        assert kd_loss_types, "KD Loss types list is required!"
        assert tasks_nclass_list, "Tasks nclass list is required!"
        assert (multitask_train_dataloader
                ), "DataLoader for multiple tasks cannot be None"

        super(MTDNNModel, self).__init__(config)

        # Initialize model config and update with training options
        self.config = config
        self.update_config_with_training_opts(
            decoder_opts,
            task_types,
            dropout_list,
            loss_types,
            kd_loss_types,
            tasks_nclass_list,
        )
        wandb.init(project='mtl-uncertainty-final',
                   entity='feifang24',
                   config=self.config.to_dict())
        self.tasks = data_processor.tasks  # {task_name: task_idx}
        self.task_defs = task_defs
        self.multitask_train_dataloader = multitask_train_dataloader
        self.dev_dataloaders_list = dev_dataloaders_list
        self.test_dataloaders_list = test_dataloaders_list
        self.test_datasets_list = self._configure_test_ds(test_datasets_list)
        self.output_dir = output_dir

        self.batch_bald = BatchBALD(num_samples=10,
                                    num_draw=500,
                                    shuffle_prop=0.0,
                                    reverse=True,
                                    reduction='mean')
        self.loss_weights = [None] * self.num_tasks

        # Create the output_dir if it doesn't exist
        MTDNNCommonUtils.create_directory_if_not_exists(self.output_dir)

        self.pooler = None

        # Resume from model checkpoint
        if self.config.resume and self.config.model_ckpt:
            assert os.path.exists(
                self.config.model_ckpt), "Model checkpoint does not exist"
            logger.info(f"loading model from {self.config.model_ckpt}")
            self = self.load(self.config.model_ckpt)
            return

        # Setup the baseline network
        # - Define the encoder based on config options
        # - Set state dictionary based on configuration setting
        # - Download pretrained model if flag is set
        # TODO - Use Model.pretrained_model() after configuration file is hosted.
        if self.config.use_pretrained_model:
            with MTDNNCommonUtils.download_path() as file_path:
                path = pathlib.Path(file_path)
                self.local_model_path = MTDNNCommonUtils.maybe_download(
                    url=self.
                    pretrained_model_archive_map[pretrained_model_name],
                    log=logger,
                )
            self.bert_model = MTDNNCommonUtils.load_pytorch_model(
                self.local_model_path)
            self.state_dict = self.bert_model["state"]
        else:
            # Set the config base on encoder type set for initial checkpoint
            if config.encoder_type == EncoderModelType.BERT:
                self.bert_config = BertConfig.from_dict(self.config.to_dict())
                self.bert_model = BertModel.from_pretrained(
                    self.config.init_checkpoint)
                self.state_dict = self.bert_model.state_dict()
                self.config.hidden_size = self.bert_config.hidden_size
            if config.encoder_type == EncoderModelType.ROBERTA:
                # Download and extract from PyTorch hub if not downloaded before
                self.bert_model = torch.hub.load("pytorch/fairseq",
                                                 config.init_checkpoint)
                self.config.hidden_size = self.bert_model.args.encoder_embed_dim
                self.pooler = LinearPooler(self.config.hidden_size)
                new_state_dict = {}
                for key, val in self.bert_model.state_dict().items():
                    if key.startswith("model.decoder.sentence_encoder"
                                      ) or key.startswith(
                                          "model.classification_heads"):
                        key = f"bert.{key}"
                        new_state_dict[key] = val
                    # backward compatibility PyTorch <= 1.0.0
                    if key.startswith("classification_heads"):
                        key = f"bert.model.{key}"
                        new_state_dict[key] = val
                self.state_dict = new_state_dict

        self.updates = (self.state_dict["updates"] if self.state_dict
                        and "updates" in self.state_dict else 0)
        self.local_updates = 0
        self.train_loss = AverageMeter()
        self.train_loss_by_task = [
            AverageMeter() for _ in range(len(self.tasks))
        ]
        self.network = SANBERTNetwork(
            init_checkpoint_model=self.bert_model,
            pooler=self.pooler,
            config=self.config,
        )
        if self.state_dict:
            self.network.load_state_dict(self.state_dict, strict=False)
        self.mnetwork = (nn.DataParallel(self.network)
                         if self.config.multi_gpu_on else self.network)
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])

        # Move network to GPU if device available and flag set
        if self.config.cuda:
            self.network.cuda(device=self.config.cuda_device)
        self.optimizer_parameters = self._get_param_groups()
        self._setup_optim(self.optimizer_parameters, self.state_dict,
                          num_train_step)
        self.para_swapped = False
        self.optimizer.zero_grad()
        self._setup_lossmap()

    @property
    def num_tasks(self):
        return len(self.tasks)

    def _configure_test_ds(self, test_datasets_list):
        if test_datasets_list: return test_datasets_list
        result = []
        for task in self.task_defs.get_task_names():
            if task == 'mnli':
                result.append('mnli_matched')
                result.append('mnli_mismatched')
            else:
                result.append(task)
        return result

    def _get_param_groups(self):
        no_decay = [
            "bias", "gamma", "beta", "LayerNorm.bias", "LayerNorm.weight"
        ]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in self.network.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in self.network.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        return optimizer_parameters

    def _setup_optim(self,
                     optimizer_parameters,
                     state_dict: dict = None,
                     num_train_step: int = -1):

        # Setup optimizer parameters
        if self.config.optimizer == "sgd":
            self.optimizer = optim.SGD(
                optimizer_parameters,
                self.config.learning_rate,
                weight_decay=self.config.weight_decay,
            )
        elif self.config.optimizer == "adamax":
            self.optimizer = Adamax(
                optimizer_parameters,
                self.config.learning_rate,
                warmup=self.config.warmup,
                t_total=num_train_step,
                max_grad_norm=self.config.grad_clipping,
                schedule=self.config.warmup_schedule,
                weight_decay=self.config.weight_decay,
            )

        elif self.config.optimizer == "radam":
            self.optimizer = RAdam(
                optimizer_parameters,
                self.config.learning_rate,
                warmup=self.config.warmup,
                t_total=num_train_step,
                max_grad_norm=self.config.grad_clipping,
                schedule=self.config.warmup_schedule,
                eps=self.config.adam_eps,
                weight_decay=self.config.weight_decay,
            )

            # The current radam does not support FP16.
            self.config.fp16 = False
        elif self.config.optimizer == "adam":
            self.optimizer = Adam(
                optimizer_parameters,
                lr=self.config.learning_rate,
                warmup=self.config.warmup,
                t_total=num_train_step,
                max_grad_norm=self.config.grad_clipping,
                schedule=self.config.warmup_schedule,
                weight_decay=self.config.weight_decay,
            )

        else:
            raise RuntimeError(
                f"Unsupported optimizer: {self.config.optimizer}")

        # Clear scheduler for certain optimizer choices
        if self.config.optimizer in ["adam", "adamax", "radam"]:
            if self.config.have_lr_scheduler:
                self.config.have_lr_scheduler = False

        if state_dict and "optimizer" in state_dict:
            self.optimizer.load_state_dict(state_dict["optimizer"])

        if self.config.fp16:
            try:
                global amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(
                self.network,
                self.optimizer,
                opt_level=self.config.fp16_opt_level)
            self.network = model
            self.optimizer = optimizer

        if self.config.have_lr_scheduler:
            if self.config.scheduler_type == "rop":
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode="max",
                                                   factor=self.config.lr_gamma,
                                                   patience=3)
            elif self.config.scheduler_type == "exp":
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=self.config.lr_gamma
                                               or 0.95)
            else:
                milestones = [
                    int(step) for step in (
                        self.config.multi_step_lr or "10,20,30").split(",")
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=self.config.lr_gamma)
        else:
            self.scheduler = None

    def _setup_lossmap(self):
        self.task_loss_criterion = []
        for idx, cs in enumerate(self.config.loss_types):
            assert cs is not None, "Loss type must be defined."
            lc = LOSS_REGISTRY[cs](name=f"Loss func of task {idx}: {cs}")
            self.task_loss_criterion.append(lc)

    def _setup_kd_lossmap(self):
        loss_types = self.config.kd_loss_types
        self.kd_task_loss_criterion = []
        if self.config.mkd_opt > 0:
            for idx, cs in enumerate(loss_types):
                assert cs, "Loss type must be defined."
                lc = LOSS_REGISTRY[cs](
                    name="Loss func of task {}: {}".format(idx, cs))
                self.kd_task_loss_criterion.append(lc)

    def _to_cuda(self, tensor):
        # Set tensor to gpu (non-blocking) if a PyTorch tensor
        if tensor is None:
            return tensor

        if isinstance(tensor, list) or isinstance(tensor, tuple):
            y = [
                e.cuda(device=self.config.cuda_device, non_blocking=True)
                for e in tensor
            ]
            for t in y:
                t.requires_grad = False
        else:
            y = tensor.cuda(device=self.config.cuda_device, non_blocking=True)
            y.requires_grad = False
        return y

    def train(self):
        if self.para_swapped:
            self.para_swapped = False

    def update(self, batch_meta, batch_data):
        self.network.train()
        target = batch_data[batch_meta["label"]]
        soft_labels = None

        task_type = batch_meta["task_type"]
        target = self._to_cuda(target) if self.config.cuda else target

        task_id = batch_meta["task_id"]
        inputs = batch_data[:batch_meta["input_len"]]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        weight = self.loss_weights[task_id]
        if self.config.weighted_on:
            if self.config.cuda:
                weight = batch_data[batch_meta["factor"]].cuda(
                    device=self.config.cuda_device, non_blocking=True)
            else:
                weight = batch_data[batch_meta["factor"]]
        logits = self.mnetwork(*inputs)

        # compute loss
        loss = 0
        if self.task_loss_criterion[task_id] and (target is not None):
            loss = self.task_loss_criterion[task_id](logits,
                                                     target,
                                                     weight,
                                                     ignore_index=-1)

        # compute kd loss
        if self.config.mkd_opt > 0 and ("soft_label" in batch_meta):
            soft_labels = batch_meta["soft_label"]
            soft_labels = (self._to_cuda(soft_labels)
                           if self.config.cuda else soft_labels)
            kd_lc = self.kd_task_loss_criterion[task_id]
            kd_loss = (kd_lc(logits, soft_labels, weight, ignore_index=-1)
                       if kd_lc else 0)
            loss = loss + kd_loss

        self.train_loss_by_task[task_id].update(
            loss.item() / (self.loss_weights[task_id]
                           if self.loss_weights[task_id] is not None else 1.),
            batch_data[batch_meta["token_id"]].size(0))
        self.train_loss.update(loss.item(),
                               batch_data[batch_meta["token_id"]].size(0))
        # scale loss
        loss = loss / (self.config.grad_accumulation_step or 1)
        if self.config.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.local_updates += 1
        if self.local_updates % self.config.grad_accumulation_step == 0:
            if self.config.global_grad_clipping > 0:
                if self.config.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer),
                        self.config.global_grad_clipping,
                    )
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self.network.parameters(),
                        self.config.global_grad_clipping)
            self.updates += 1
            # reset number of the grad accumulation
            self.optimizer.step()
            self.optimizer.zero_grad()

    def eval_mode(self,
                  data: DataLoader,
                  metric_meta,
                  use_cuda=True,
                  with_label=True,
                  label_mapper=None,
                  task_type=TaskType.Classification):
        eval_loss = AverageMeter()
        if use_cuda:
            self.cuda()
        predictions = []
        golds = []
        scores = []
        uncertainties = []
        ids = []
        metrics = {}
        for idx, (batch_info, batch_data) in enumerate(data):
            if idx % 100 == 0:
                logger.info(f"predicting {idx}")
            batch_info, batch_data = MTDNNCollater.patch_data(
                use_cuda, batch_info, batch_data)
            score, pred, gold, loss, uncertainty = self._predict_batch(
                batch_info, batch_data)
            predictions.extend(pred)
            golds.extend(gold)
            scores.extend(score)
            uncertainties.extend(uncertainty)
            ids.extend(batch_info["uids"])
            eval_loss.update(loss.item(), len(batch_info["uids"]))

        if task_type == TaskType.Span:
            golds = merge_answers(ids, golds)
            predictions, scores = select_answers(ids, predictions, scores)
        if with_label:
            metrics = calc_metrics(metric_meta, golds, predictions, scores,
                                   label_mapper)
        return metrics, predictions, scores, golds, ids, (
            eval_loss.avg, eval_loss.count), np.mean(uncertainties)

    def _predict_batch(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta["task_id"]
        task_type = batch_meta["task_type"]
        inputs = batch_data[:batch_meta["input_len"]]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)

        # get logits (and val loss if we have labels)
        label = batch_meta["label"]
        target = batch_data[label] if type(label) is int else torch.tensor(
            label)
        target = self._to_cuda(target) if self.config.cuda else target

        weight = None
        if self.config.weighted_on:
            if self.config.cuda:
                weight = batch_data[batch_meta["factor"]].cuda(
                    device=self.config.cuda_device, non_blocking=True)
            else:
                weight = batch_data[batch_meta["factor"]]

        score = self.mnetwork(*inputs)
        if self.config.mc_dropout_samples > 0:

            def apply_dropout(m):
                if isinstance(m, DropoutWrapper):
                    m.train()

            self.network.apply(apply_dropout)
            mc_sample_scores = torch.stack([
                self.mnetwork(*inputs)
                for _ in range(self.config.mc_dropout_samples)
            ], -1)
            mc_sample_scores = F.softmax(mc_sample_scores,
                                         dim=1).data.cpu().numpy()
            uncertainty = self.batch_bald.get_uncertainties(mc_sample_scores)
        else:
            uncertainty = 1.0

        loss = None
        if self.task_loss_criterion[task_id] and (target is not None):
            loss = self.task_loss_criterion[task_id](score,
                                                     target,
                                                     weight,
                                                     ignore_index=-1)

        if task_type == TaskType.Ranking:
            score = score.contiguous().view(-1, batch_meta["pairwise_size"])
            assert task_type == TaskType.Ranking
            score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta["true_label"], loss
        elif task_type == TaskType.SequenceLabeling:
            mask = batch_data[batch_meta["mask"]]
            score = score.contiguous()
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).reshape(mask.size()).tolist()
            valid_lengths = mask.sum(1).tolist()
            final_predict = []
            for idx, p in enumerate(predict):
                final_predict.append(p[:valid_lengths[idx]])
            score = score.reshape(-1).tolist()
            return score, final_predict, batch_meta["label"], loss, uncertainty
        elif task_type == TaskType.Span:
            start, end = score
            predictions = []
            if self.config.encoder_type == EncoderModelType.BERT:
                scores, predictions = extract_answer(
                    batch_meta,
                    batch_data,
                    start,
                    end,
                    self.config.get("max_answer_len", 5),
                )
            return scores, predictions, batch_meta["answer"], loss
        else:
            if task_type == TaskType.Classification:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta["label"], loss, uncertainty

    def _rerank_batches(self,
                        batches,
                        start_idx,
                        task_id_to_weights,
                        softmax_task_weights=False):
        def weights_to_probs(weights):
            if softmax_task_weights:
                probs = softmax(weights)
            else:
                probs = weights / np.sum(weights)
            return probs

        # reshuffle all batches; sort them by task_id
        new_batches = [list(self.multitask_train_dataloader) for _ in range(5)]
        for i in range(len(new_batches)):
            random.shuffle(new_batches[i])  # this line somehow helps?
        new_batches = [b for batches in new_batches
                       for b in batches]  # flatten
        task_id_by_batch = [
            batch_meta["task_id"] for batch_meta, _ in new_batches
        ]
        batches_by_task = [[] for _ in range(self.num_tasks)]
        for batch_idx, task_id in enumerate(task_id_by_batch):
            batches_by_task[task_id].append(batch_idx)

        task_probs = weights_to_probs(task_id_to_weights)

        # multiply weight by num batches per task
        # task_probs = weights_to_probs(task_id_to_weights * np.asarray([len(batches) for batches in batches_by_task]))  # comment out as see fit

        if self.config.uncertainty_based_weight:
            rel_loss_weights = (1. / task_id_to_weights)
            self.loss_weights = (rel_loss_weights * self.num_tasks / np.sum(rel_loss_weights)) * \
                                    (np.mean(self.dev_loss_by_task) / self.dev_loss_by_task)
            # self.loss_weights = rel_loss_weights * np.mean(task_id_to_weights)

        num_batches = len(batches[start_idx:])
        # sample num_batches many tasks w/ replacement
        task_indices_sampled = np.random.choice(self.num_tasks,
                                                num_batches,
                                                replace=True,
                                                p=task_probs)

        reranked_batches = [None] * num_batches
        counters = [0] * self.num_tasks
        for i, task_id in enumerate(task_indices_sampled):
            batch_idx = batches_by_task[task_id][counters[task_id] %
                                                 len(batches_by_task[task_id])]
            counters[task_id] += 1
            reranked_batches[i] = new_batches[batch_idx]

        weights_by_task_name = {}
        for task_name, task_id in self.tasks.items():
            weights_by_task_name[f'task_weight/{task_name}'] = task_probs[
                task_id]

        return [None] * start_idx + reranked_batches, weights_by_task_name

    def fit(self, epochs=0):
        """ Fit model to training datasets """
        epochs = epochs or self.config.epochs
        logger.info(f"Total number of params: {self.total_param}")
        FIRST_STEP_TO_LOG = 10
        for epoch in range(1, epochs + 1):
            logger.info(f"At epoch {epoch}")
            logger.info(
                f"Amount of data to go over: {len(self.multitask_train_dataloader)}"
            )

            start = datetime.now()
            # Create batches and train
            batches = list(self.multitask_train_dataloader)
            if self.config.uncertainty_based_sampling and epoch > 1:
                batches, weights_by_task_name = self._rerank_batches(
                    batches,
                    start_idx=0,
                    task_id_to_weights=self.smoothed_uncertainties_by_task)
            for idx in range(len(batches)):
                batch_meta, batch_data = batches[idx]
                batch_meta, batch_data = MTDNNCollater.patch_data(
                    self.config.cuda, batch_meta, batch_data)

                task_id = batch_meta["task_id"]
                self.update(batch_meta, batch_data)
                if (self.local_updates == FIRST_STEP_TO_LOG
                        or (self.local_updates) %
                    (self.config.log_per_updates *
                     self.config.grad_accumulation_step) == 0):

                    time_left = str((datetime.now() - start) / (idx + 1) *
                                    (len(self.multitask_train_dataloader) -
                                     idx - 1)).split(".")[0]
                    logger.info(
                        "Updates - [{0:6}] Training Loss - [{1:.5f}] Time Remaining - [{2}]"
                        .format(self.updates, self.train_loss.avg, time_left))
                    val_logs, uncertainties_by_task = self._eval(
                        epoch, save_scores=False, eval_type='dev')
                    test_logs, _ = self._eval(epoch,
                                              save_scores=False,
                                              eval_type='test')
                    if self.local_updates == FIRST_STEP_TO_LOG:
                        weights_by_task_name = {
                            f'task_weight/{task_name}': 1.0
                            for task_name in self.tasks
                        }
                    else:
                        if self.local_updates == self.config.log_per_updates * self.config.grad_accumulation_step:
                            self.smoothed_uncertainties_by_task = uncertainties_by_task
                            self.initial_train_loss_by_task = np.asarray(
                                [loss.avg for loss in self.train_loss_by_task])
                        else:
                            alpha = self.config.smooth_uncertainties
                            self.smoothed_uncertainties_by_task = alpha * self.smoothed_uncertainties_by_task + \
                                                                    (1 - alpha) * uncertainties_by_task
                        if self.config.uncertainty_based_sampling and idx < len(
                                batches) - 1:
                            batches, weights_by_task_name = self._rerank_batches(
                                batches,
                                start_idx=idx + 1,
                                task_id_to_weights=self.
                                smoothed_uncertainties_by_task)
                        if self.config.rate_based_weight:
                            current_train_loss_by_task = np.asarray(
                                [loss.avg for loss in self.train_loss_by_task])
                            rate_of_training_by_task = current_train_loss_by_task / self.initial_train_loss_by_task
                            self.loss_weights = (rate_of_training_by_task / np.mean(rate_of_training_by_task)) * \
                                                    (np.mean(current_train_loss_by_task) / current_train_loss_by_task)
                    self._log_training({
                        **val_logs,
                        **test_logs,
                        **weights_by_task_name
                    })

                if self.config.save_per_updates_on and (
                    (self.local_updates) %
                    (self.config.save_per_updates *
                     self.config.grad_accumulation_step) == 0):
                    model_file = os.path.join(
                        self.output_dir,
                        "model_{}_{}.pt".format(epoch, self.updates),
                    )
                    logger.info(f"Saving mt-dnn model to {model_file}")
                    self.save(model_file)

            # Eval and save checkpoint after each epoch
            logger.info('=' * 5 + f' End of EPOCH {epoch} ' + '=' * 5)
            logger.info(f'Train loss (epoch avg): {self.train_loss.avg}')
            val_logs, uncertainties_by_task = self._eval(epoch,
                                                         save_scores=True,
                                                         eval_type='dev')
            test_logs, _ = self._eval(epoch,
                                      save_scores=True,
                                      eval_type='test')
            self._log_training({
                **val_logs,
                **test_logs,
                **weights_by_task_name
            })

            # model_file = os.path.join(self.output_dir, "model_{}.pt".format(epoch))
            # logger.info(f"Saving mt-dnn model to {model_file}")
            # self.save(model_file)

    def _eval(self, epoch, save_scores, eval_type='dev'):
        if eval_type not in {'dev', 'test'}:
            raise ValueError(
                "eval_type must be one of the following: 'dev' or 'test'.")
        is_dev = eval_type == 'dev'

        log_dict = {}
        loss_agg = AverageMeter()
        loss_by_task = {}
        uncertainties_by_task = {}
        for idx, dataset in enumerate(self.test_datasets_list):
            logger.info(
                f"Evaluating on {eval_type} ds {idx}: {dataset.upper()}")
            prefix = dataset.split("_")[0]
            results = self._predict(idx,
                                    prefix,
                                    dataset,
                                    eval_type=eval_type,
                                    saved_epoch_idx=epoch,
                                    save_scores=save_scores)

            avg_loss = results['avg_loss']
            num_samples = results['num_samples']
            loss_agg.update(avg_loss, n=num_samples)
            loss_by_task[dataset] = avg_loss
            if is_dev:
                logger.info(
                    f"Task {dataset} -- {eval_type} loss: {avg_loss:.3f}")

            metrics = results['metrics']
            for key, val in metrics.items():
                if is_dev:
                    logger.info(
                        f"Task {dataset} -- {eval_type} {key}: {val:.3f}")
                log_dict[f'{dataset}/{eval_type}_{key}'] = val

            uncertainty = results['uncertainty']
            if is_dev:
                logger.info(
                    f"Task {dataset} -- {eval_type} uncertainty: {uncertainty:.3f}"
                )
            log_dict[
                f'{eval_type}_uncertainty_by_task/{dataset}'] = uncertainty
            if prefix not in uncertainties_by_task:
                uncertainties_by_task[prefix] = uncertainty
            else:
                # exploiting the fact that only mnli has two dev sets
                uncertainties_by_task[prefix] += uncertainty
                uncertainties_by_task[prefix] /= 2
        if is_dev: logger.info(f'{eval_type} loss: {loss_agg.avg}')
        log_dict[f'{eval_type}_loss'] = loss_agg.avg
        log_dict.update({
            f'{eval_type}_loss_by_task/{task}': loss
            for task, loss in loss_by_task.items()
        })

        loss_by_task_id = [None] * self.num_tasks
        for task_name, loss in loss_by_task.items():
            loss_by_task_id[self.tasks[task_name]] = loss
        loss_by_task_id = np.asarray(loss_by_task_id)

        if is_dev:
            self.dev_loss_by_task = loss_by_task_id
        else:
            self.test_loss_by_task = loss_by_task_id

        # convert uncertainties_by_task from dict to list, where list[i] = weight of task_id i
        uncertainties_by_task_id = [None] * self.num_tasks
        for task_name, weight in uncertainties_by_task.items():
            task_id = self.tasks[task_name]
            uncertainties_by_task_id[task_id] = weight
        uncertainties_by_task_id = np.asarray(uncertainties_by_task_id)

        return log_dict, uncertainties_by_task_id

    def _log_training(self, val_logs):
        train_loss_by_task = {
            f'train_loss_by_task/{task}': self.train_loss_by_task[task_idx].avg
            for task, task_idx in self.tasks.items()
        }
        train_loss_agg = {'train_loss': self.train_loss.avg}
        loss_weights_by_task = {}
        if self.config.uncertainty_based_weight or self.config.rate_based_weight:
            for task_name, task_id in self.tasks.items():
                loss_weights_by_task[
                    f'loss_weight/{task_name}'] = self.loss_weights[
                        task_id] if self.loss_weights[
                            task_id] is not None else 1.
        log_dict = {
            'global_step': self.updates,
            **train_loss_by_task,
            **train_loss_agg,
            **val_logs,
            **loss_weights_by_task
        }
        wandb.log(log_dict)

    def _predict(self,
                 eval_ds_idx,
                 eval_ds_prefix,
                 eval_ds_name,
                 eval_type='dev',
                 saved_epoch_idx=None,
                 save_scores=True):
        if eval_type not in {'dev', 'test'}:
            raise ValueError(
                "eval_type must be one of the following: 'dev' or 'test'.")
        is_dev = eval_type == 'dev'

        label_dict = self.task_defs.global_map.get(eval_ds_prefix, None)

        if is_dev:
            data: DataLoader = self.dev_dataloaders_list[eval_ds_idx]
        else:
            data: DataLoader = self.test_dataloaders_list[eval_ds_idx]

        if data is None:
            results = None
        else:
            with torch.no_grad():
                (
                    metrics,
                    predictions,
                    scores,
                    golds,
                    ids,
                    (eval_ds_avg_loss, eval_ds_num_samples),
                    uncertainty,
                ) = self.eval_mode(
                    data,
                    metric_meta=self.task_defs.metric_meta_map[eval_ds_prefix],
                    use_cuda=self.config.cuda,
                    with_label=True,
                    label_mapper=label_dict,
                    task_type=self.task_defs.task_type_map[eval_ds_prefix])
            results = {
                "metrics": metrics,
                "predictions": predictions,
                "uids": ids,
                "scores": scores,
                "uncertainty": uncertainty
            }
            if save_scores:
                score_file_prefix = f"{eval_ds_name}_{eval_type}_scores" \
                                    + (f'_{saved_epoch_idx}' if saved_epoch_idx is not None else "")
                score_file = os.path.join(self.output_dir,
                                          score_file_prefix + ".json")
                MTDNNCommonUtils.dump(score_file, results)
                if self.config.use_glue_format:
                    official_score_file = os.path.join(
                        self.output_dir, score_file_prefix + ".tsv")
                    submit(official_score_file, results, label_dict)

            results.update({
                "avg_loss": eval_ds_avg_loss,
                "num_samples": eval_ds_num_samples
            })

        return results

    def predict(self, trained_model_chckpt: str = None):
        """
        Inference of model on test datasets
        """
        # Load a trained checkpoint if a valid model checkpoint
        if trained_model_chckpt and gfile.exists(trained_model_chckpt):
            logger.info(
                f"Running predictions using: {trained_model_chckpt}. This may take 3 minutes."
            )
            self.load(trained_model_chckpt)
            logger.info("Checkpoint loaded.")

        self.config.batch_size_eval = 128
        self.config.use_glue_format = True

        # test eval
        for idx, dataset in enumerate(self.test_datasets_list):
            prefix = dataset.split("_")[0]
            results = self._predict(idx, prefix, dataset, eval_type='test')
            if results:
                logger.info(f"[new test scores saved for {dataset}.]")
            else:
                logger.info(f"Data not found for {dataset}.")

    def extract(self, batch_meta, batch_data):
        self.network.eval()
        # 'token_id': 0; 'segment_id': 1; 'mask': 2
        inputs = batch_data[:3]
        all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs)
        return all_encoder_layers, pooled_output

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        params = {
            "state": network_state,
            "optimizer": self.optimizer.state_dict(),
            "config": self.config,
        }
        torch.save(params, gfile.GFile(filename, mode='wb'))
        logger.info("model saved to {}".format(filename))

    def load(self, checkpoint):
        model_state_dict = torch.load(gfile.GFile(checkpoint, mode='rb'))
        self.network.load_state_dict(model_state_dict["state"], strict=False)
        self.optimizer.load_state_dict(model_state_dict["optimizer"])
        self.config = model_state_dict["config"]

    def cuda(self):
        self.network.cuda(device=self.config.cuda_device)

    def supported_init_checkpoints(self):
        """List of allowed check points
        """
        return [
            "bert-base-uncased",
            "bert-base-cased",
            "bert-large-uncased",
            "mtdnn-base-uncased",
            "mtdnn-large-uncased",
            "roberta.base",
            "roberta.large",
        ]

    def update_config_with_training_opts(
        self,
        decoder_opts,
        task_types,
        dropout_list,
        loss_types,
        kd_loss_types,
        tasks_nclass_list,
    ):
        # Update configurations with options obtained from preprocessing training data
        setattr(self.config, "decoder_opts", decoder_opts)
        setattr(self.config, "task_types", task_types)
        setattr(self.config, "tasks_dropout_p", dropout_list)
        setattr(self.config, "loss_types", loss_types)
        setattr(self.config, "kd_loss_types", kd_loss_types)
        setattr(self.config, "tasks_nclass_list", tasks_nclass_list)
Example #5
    def finetune(self, clean_file, corrupt_file, data_dir="", validation_split=0.2, n_epochs=2, new_vocab_list=[]):

        if new_vocab_list:
            raise NotImplementedError("Do not currently support modifying output vocabulary of the models")

        # load data and split in train-validation
        data_dir = DEFAULT_TRAINTEST_DATA_PATH if data_dir == "default" else data_dir
        train_data = load_data(data_dir, clean_file, corrupt_file)
        train_data, valid_data = train_validation_split(train_data, 1 - validation_split, seed=11690)
        print("len of train and valid data: ", len(train_data), len(valid_data))

        # load vocab and model
        self.__model_status()

        # finetune
        #############################################
        # training and validation
        #############################################
        model, vocab = self.model, self.vocab
        TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 16, 32
        GRADIENT_ACC = 4
        DEVICE = self.device
        START_EPOCH, N_EPOCHS = 0, n_epochs
        CHECKPOINT_PATH = os.path.join(data_dir, "new_models", os.path.split(self.bert_pretrained_name_or_path)[-1])
        if os.path.exists(CHECKPOINT_PATH):
            num = 1
            while True:
                NEW_CHECKPOINT_PATH = CHECKPOINT_PATH + f"-{num}"
                if not os.path.exists(NEW_CHECKPOINT_PATH):
                    break
                num += 1
            CHECKPOINT_PATH = NEW_CHECKPOINT_PATH
        VOCAB_PATH = os.path.join(CHECKPOINT_PATH, "vocab.pkl")
        if not os.path.exists(CHECKPOINT_PATH):
            os.makedirs(CHECKPOINT_PATH)
        print(f"CHECKPOINT_PATH: {CHECKPOINT_PATH}")

        # running stats
        max_dev_acc, argmax_dev_acc = -1, -1
        patience = 100

        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        t_total = int(len(train_data) / TRAIN_BATCH_SIZE / GRADIENT_ACC * N_EPOCHS)
        if t_total == 0:
            t_total = 1
        optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1, t_total=t_total)

        # model to device
        model.to(DEVICE)

        # load parameters if not training from scratch
        if START_EPOCH > 1:
            progress_write_file = (
                open(os.path.join(CHECKPOINT_PATH, f"progress_retrain_from_epoch{START_EPOCH}.txt"), 'w')
            )
            model, optimizer, max_dev_acc, argmax_dev_acc = load_pretrained(model, CHECKPOINT_PATH, optimizer=optimizer)
            progress_write_file.write(f"Training model params after loading from path: {CHECKPOINT_PATH}\n")
        else:
            progress_write_file = open(os.path.join(CHECKPOINT_PATH, "progress.txt"), 'w')
            print(f"Training model params")
            progress_write_file.write(f"Training model params\n")
        progress_write_file.flush()

        # train and eval
        for epoch_id in range(START_EPOCH, N_EPOCHS + 1):
            # check for patience
            if (epoch_id - argmax_dev_acc) > patience:
                print("patience count reached. early stopping initiated")
                print("max_dev_acc: {}, argmax_dev_acc: {}".format(max_dev_acc, argmax_dev_acc))
                break
            # print epoch
            print(f"In epoch: {epoch_id}")
            progress_write_file.write(f"In epoch: {epoch_id}\n")
            progress_write_file.flush()
            # train loss and backprop
            train_loss = 0.
            train_acc = 0.
            train_acc_count = 0.
            print("train_data size: {}".format(len(train_data)))
            progress_write_file.write("train_data size: {}\n".format(len(train_data)))
            progress_write_file.flush()
            train_data_iter = batch_iter(train_data, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
            nbatches = int(np.ceil(len(train_data) / TRAIN_BATCH_SIZE))
            optimizer.zero_grad()
            for batch_id, (batch_labels, batch_sentences) in enumerate(train_data_iter):
                st_time = time.time()
                # set batch data for bert
                batch_labels_, batch_sentences_, batch_bert_inp, batch_bert_splits = \
                    bert_tokenize_for_valid_examples(batch_labels, batch_sentences, self.bert_pretrained_name_or_path)
                if len(batch_labels_) == 0:
                    print("################")
                    print("Not training the following lines due to pre-processing mismatch: \n")
                    print([(a, b) for a, b in zip(batch_labels, batch_sentences)])
                    print("################")
                    continue
                else:
                    batch_labels, batch_sentences = batch_labels_, batch_sentences_
                batch_bert_inp = {k: v.to(DEVICE) for k, v in batch_bert_inp.items()}
                # set batch data for others
                batch_labels, batch_lengths = labelize(batch_labels, vocab)
                # batch_lengths = batch_lengths.to(device)
                batch_labels = batch_labels.to(DEVICE)
                # forward
                model.train()
                loss = model(batch_bert_inp, batch_bert_splits, targets=batch_labels)
                batch_loss = loss.cpu().detach().numpy()
                train_loss += batch_loss
                # backward
                if GRADIENT_ACC > 1:
                    loss = loss / GRADIENT_ACC
                loss.backward()
                # step
                if (batch_id + 1) % GRADIENT_ACC == 0 or batch_id >= nbatches - 1:
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    # scheduler.step()
                    optimizer.zero_grad()
                # compute accuracy in numpy
                if batch_id % 10000 == 0:
                    train_acc_count += 1
                    model.eval()
                    with torch.no_grad():
                        _, batch_predictions = model(batch_bert_inp, batch_bert_splits, targets=batch_labels)
                    model.train()
                    batch_labels = batch_labels.cpu().detach().numpy()
                    batch_lengths = batch_lengths.cpu().detach().numpy()
                    ncorr, ntotal = batch_accuracy_func(batch_predictions, batch_labels, batch_lengths)
                    batch_acc = ncorr / ntotal
                    train_acc += batch_acc
                    # update progress
                progressBar(batch_id + 1,
                            int(np.ceil(len(train_data) / TRAIN_BATCH_SIZE)),
                            ["batch_time", "batch_loss", "avg_batch_loss", "batch_acc", "avg_batch_acc"],
                            [time.time() - st_time, batch_loss, train_loss / (batch_id + 1), batch_acc,
                             train_acc / train_acc_count])
                if batch_id == 0 or (batch_id + 1) % 5000 == 0:
                    nb = int(np.ceil(len(train_data) / TRAIN_BATCH_SIZE))
                    progress_write_file.write(f"{batch_id + 1}/{nb}\n")
                    progress_write_file.write(
                        f"batch_time: {time.time() - st_time}, avg_batch_loss: {train_loss / (batch_id + 1)}, "
                        f"avg_batch_acc: {train_acc / train_acc_count}\n")
                    progress_write_file.flush()
            print(f"\nEpoch {epoch_id} train_loss: {train_loss / (batch_id + 1)}")

            # valid loss
            valid_loss = 0.
            valid_acc = 0.
            print("valid_data size: {}".format(len(valid_data)))
            progress_write_file.write("valid_data size: {}\n".format(len(valid_data)))
            progress_write_file.flush()
            valid_data_iter = batch_iter(valid_data, batch_size=VALID_BATCH_SIZE, shuffle=False)
            for batch_id, (batch_labels, batch_sentences) in enumerate(valid_data_iter):
                st_time = time.time()
                # set batch data for bert
                batch_labels_, batch_sentences_, batch_bert_inp, batch_bert_splits = \
                    bert_tokenize_for_valid_examples(batch_labels, batch_sentences, self.bert_pretrained_name_or_path)
                if len(batch_labels_) == 0:
                    print("################")
                    print("Not validating the following lines due to pre-processing mismatch: \n")
                    print([(a, b) for a, b in zip(batch_labels, batch_sentences)])
                    print("################")
                    continue
                else:
                    batch_labels, batch_sentences = batch_labels_, batch_sentences_
                batch_bert_inp = {k: v.to(DEVICE) for k, v in batch_bert_inp.items()}
                # set batch data for others
                batch_labels, batch_lengths = labelize(batch_labels, vocab)
                # batch_lengths = batch_lengths.to(device)
                batch_labels = batch_labels.to(DEVICE)
                # forward
                model.eval()
                with torch.no_grad():
                    batch_loss, batch_predictions = model(batch_bert_inp, batch_bert_splits, targets=batch_labels)
                model.train()
                valid_loss += batch_loss
                # compute accuracy in numpy
                batch_labels = batch_labels.cpu().detach().numpy()
                batch_lengths = batch_lengths.cpu().detach().numpy()
                ncorr, ntotal = batch_accuracy_func(batch_predictions, batch_labels, batch_lengths)
                batch_acc = ncorr / ntotal
                valid_acc += batch_acc
                # update progress
                progressBar(batch_id + 1,
                            int(np.ceil(len(valid_data) / VALID_BATCH_SIZE)),
                            ["batch_time", "batch_loss", "avg_batch_loss", "batch_acc", "avg_batch_acc"],
                            [time.time() - st_time, batch_loss, valid_loss / (batch_id + 1), batch_acc,
                             valid_acc / (batch_id + 1)])
                if batch_id == 0 or (batch_id + 1) % 2000 == 0:
                    nb = int(np.ceil(len(valid_data) / VALID_BATCH_SIZE))
                    progress_write_file.write(f"{batch_id}/{nb}\n")
                    progress_write_file.write(
                        f"batch_time: {time.time() - st_time}, avg_batch_loss: {valid_loss / (batch_id + 1)}, "
                        f"avg_batch_acc: {valid_acc / (batch_id + 1)}\n")
                    progress_write_file.flush()
            print(f"\nEpoch {epoch_id} valid_loss: {valid_loss / (batch_id + 1)}")

            # average validation accuracy over batches before comparing / saving
            valid_acc = valid_acc / (batch_id + 1)

            # save model, optimizer and test_predictions if val_acc is improved
            if valid_acc >= max_dev_acc:
                print(f"validation accuracy improved from {max_dev_acc:.4f} to {valid_acc:.4f}")
                # name = "model.pth.tar".format(epoch_id)
                # torch.save({
                #     'epoch_id': epoch_id,
                #     'max_dev_acc': max_dev_acc,
                #     'argmax_dev_acc': argmax_dev_acc,
                #     'model_state_dict': model.state_dict(),
                #     'optimizer_state_dict': optimizer.state_dict()},
                #     os.path.join(CHECKPOINT_PATH, name))
                name = "pytorch_model.bin"
                torch.save(model.state_dict(), os.path.join(CHECKPOINT_PATH, name))
                print("Model saved at {} in epoch {}".format(os.path.join(CHECKPOINT_PATH, name), epoch_id))
                save_vocab_dict(VOCAB_PATH, vocab)

                # re-assign
                max_dev_acc, argmax_dev_acc = valid_acc, epoch_id

        print(f"Model and logs saved at {CHECKPOINT_PATH}")
        return
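The optimizer setup in finetune above (and in several of the snippets below) uses the same pattern: split the named parameters into one group with weight decay and one group without it for biases and LayerNorm parameters. A minimal sketch of that grouping on a toy module, using torch.optim.AdamW as a stand-in for BertAdam:

import torch
import torch.nn as nn

class ToyModel(nn.Module):
    # toy module with a LayerNorm attribute so the no_decay filter has something to match (assumption)
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(8, 8)
        self.LayerNorm = nn.LayerNorm(8)
        self.classifier = nn.Linear(8, 2)

    def forward(self, x):
        return self.classifier(self.LayerNorm(self.dense(x)))

model = ToyModel()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    # ordinary weights get L2 weight decay
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    # biases and LayerNorm parameters are excluded from weight decay
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)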
def train_bert_cased(t_config, p_config, s_config):

    device = torch.device('cuda')
    seed_everything(s_config.seed)

    train = pd.read_csv('../input/train.csv').sample(
        t_config.num_to_load + t_config.valid_size, random_state=s_config.seed)
    train = prepare_train_text(train, p_config)
    train = train.fillna(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    train_processed = get_tokenized_samples(t_config.MAX_SEQUENCE_LENGTH,
                                            tokenizer, train['text_proc'])

    sequences = train_processed
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    MyModel = BertForSequenceClassification.from_pretrained(
        'bert-base-cased', num_labels=t_config.num_labels)
    MyModel.to(device)

    # Prepare target
    target_train = train['target'].values[:t_config.num_to_load]
    target_train_aux = train[[
        'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
    ]].values[:t_config.num_to_load]
    target_train_identity = train[identity_columns].values[:t_config.num_to_load]
    target_val = train['target'].values[t_config.num_to_load:]
    target_val_aux = train[[
        'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
    ]].values[t_config.num_to_load:]
    target_val_identity = train[identity_columns].values[t_config.num_to_load:]

    # Prepare training data
    inputs_train = train_processed[:t_config.num_to_load]
    inputs_val = train_processed[t_config.num_to_load:]
    weight_train = train['weight'].values[:t_config.num_to_load]
    weight_val = train['weight'].values[t_config.num_to_load:]
    lengths_train = lengths[:t_config.num_to_load]
    lengths_val = lengths[t_config.num_to_load:]

    inputs_train = torch.tensor(inputs_train, dtype=torch.int64)
    Target_train = torch.Tensor(target_train)
    Target_train_aux = torch.Tensor(target_train_aux)
    Target_train_identity = torch.Tensor(target_train_identity)
    weight_train = torch.Tensor(weight_train)
    Lengths_train = torch.tensor(lengths_train, dtype=torch.int64)

    inputs_val = torch.tensor(inputs_val, dtype=torch.int64)
    Target_val = torch.Tensor(target_val)
    Target_val_aux = torch.Tensor(target_val_aux)
    Target_val_identity = torch.Tensor(target_val_identity)
    weight_val = torch.Tensor(weight_val)
    Lengths_val = torch.tensor(lengths_val, dtype=torch.int64)

    # Prepare dataset
    train_dataset = data.TensorDataset(inputs_train, Target_train,
                                       Target_train_aux, Target_train_identity,
                                       weight_train, Lengths_train)
    val_dataset = data.TensorDataset(inputs_val, Target_val, Target_val_aux,
                                     Target_val_identity, weight_val,
                                     Lengths_val)

    # Bucket sequencing
    ids_train = lengths_train.argsort(kind="stable")
    ids_train_new = resort_index(ids_train, t_config.num_of_bucket,
                                 s_config.seed)
    train_loader = torch.utils.data.DataLoader(data.Subset(
        train_dataset, ids_train_new),
                                               batch_size=t_config.batch_size,
                                               collate_fn=clip_to_max_len,
                                               shuffle=False)

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in list(MyModel.named_parameters())
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in list(MyModel.named_parameters())
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=t_config.learning_rate,
                         betas=[0.9, 0.999],
                         warmup=t_config.warmup,
                         t_total=t_config.num_epoch * len(train_loader) //
                         t_config.accumulation_steps)

    # freeze the first 10 parameter tensors of the model
    for i, (n, p) in enumerate(MyModel.named_parameters()):
        if i < 10:
            p.requires_grad = False

    # initialize the classifier bias to the log-odds of the positive-class rate
    p = train['target'].mean()
    likelihood = np.log(p / (1 - p))
    model_bias = torch.tensor(likelihood).type(torch.float)
    MyModel.classifier.bias = nn.Parameter(model_bias.to(device))

    MyModel, optimizer = amp.initialize(MyModel,
                                        optimizer,
                                        opt_level="O1",
                                        verbosity=0)

    for epoch in range(t_config.num_epoch):
        i = 0

        print('Training start')

        optimizer.zero_grad()
        MyModel.train()
        for batch_idx, (input, target, target_aux, target_identity,
                        sample_weight) in tqdm_notebook(
                            enumerate(train_loader), total=len(train_loader)):

            y_pred = MyModel(
                input.to(device),
                attention_mask=(input > 0).to(device),
            )
            loss = F.binary_cross_entropy_with_logits(y_pred[0][:, 0],
                                                      target.to(device),
                                                      reduction='none')
            loss = (loss * sample_weight.to(device)).sum() / (
                sample_weight.sum().to(device))
            loss_aux = F.binary_cross_entropy_with_logits(
                y_pred[0][:, 1:6], target_aux.to(device),
                reduction='none').mean(axis=1)
            loss_aux = (loss_aux * sample_weight.to(device)).sum() / (
                sample_weight.sum().to(device))
            loss += loss_aux
            if t_config.num_labels == 15:
                loss_identity = F.binary_cross_entropy_with_logits(
                    y_pred[0][:, 6:],
                    target_identity.to(device),
                    reduction='none').mean(axis=1)
                loss_identity = (loss_identity * sample_weight.to(device)
                                 ).sum() / (sample_weight.sum().to(device))
                loss += loss_identity
            # Use apex for better gradients and smaller model sizes
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # Use accumulation steps to tune the effective batch size of training
            if (i + 1) % t_config.accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            i += 1

        torch.save(
            {
                'model_state_dict': MyModel.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{t_config.PATH}')
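train_bert_cased above (like GRADIENT_ACC in the earlier finetune) relies on gradient accumulation: the loss is divided by the number of accumulation steps, gradients from several mini-batches are summed, and the optimizer only steps every accumulation_steps batches. A standalone sketch of the pattern on toy data; the model, data, and step counts are assumptions:

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 4

optimizer.zero_grad()
for batch_idx in range(16):
    x = torch.randn(8, 10)                # toy mini-batch
    y = torch.randn(8, 1)
    loss = nn.functional.mse_loss(model(x), y)
    # scale the loss so the accumulated gradient matches one large batch
    (loss / accumulation_steps).backward()
    if (batch_idx + 1) % accumulation_steps == 0:
        optimizer.step()                  # effective batch size = 8 * accumulation_steps
        optimizer.zero_grad()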
    def run_train(self, train, dev):
        if not os.path.isdir(self.args.dsave):
            os.makedirs(self.args.dsave)

        logger = logging.getLogger(self.__class__.__name__)
        logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler(os.path.join(self.args.dsave, 'train.log'))
        fh.setLevel(logging.CRITICAL)
        logger.addHandler(fh)
        ch = logging.StreamHandler()
        ch.setLevel(logging.CRITICAL)
        logger.addHandler(ch)

        num_train_steps = int(len(train) / self.args.train_batch * self.args.epoch)

        # remove pooler
        param_optimizer = list(self.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]

        optimizer = BertAdam(optimizer_grouped_parameters, lr=self.args.learning_rate, warmup=self.args.warmup, t_total=num_train_steps)

        print('num_train', len(train))
        print('num_dev', len(dev))

        global_step = 0
        best_metrics = {self.args.early_stop: -float('inf')}
        for epoch in trange(self.args.epoch, desc='epoch',):
            self.epoch = epoch
            train = train[:]
            np.random.shuffle(train)

            train_stats = defaultdict(list)
            gates = []
            preds = []
            self.train()
            for i in trange(0, len(train), self.args.train_batch, desc='batch'):
                actual_train_batch = int(self.args.train_batch / self.args.gradient_accumulation_steps)
                batch_stats = defaultdict(list)
                batch = train[i: i + self.args.train_batch]

                for accu_i in range(0, len(batch), actual_train_batch):
                    actual_batch = batch[accu_i : accu_i + actual_train_batch]
                    out = self(actual_batch)
                    gates.extend(out['gate'])
                    pred = self.extract_preds(out, actual_batch)
                    loss = self.compute_loss(out, actual_batch)

                    for k, v in loss.items():
                        loss[k] = v / self.args.gradient_accumulation_steps
                        batch_stats[k].append(v.item()/ self.args.gradient_accumulation_steps)
                    sum(loss.values()).backward()
                    preds += pred

                lr_this_step = self.args.learning_rate * warmup_linear(global_step/num_train_steps, self.args.warmup)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                for k in batch_stats.keys():
                    train_stats['loss_' + k].append(sum(batch_stats[k]))

                if global_step % self.args.eval_every_steps == 0:
                    dev_stats = defaultdict(list)
                    dev_preds, dev_gates = self.run_pred(dev)
                    dev_metrics = {k: sum(v) / len(v) for k, v in dev_stats.items()}
                    dev_metrics.update(self.compute_metrics(dev_preds, dev))
                    dev_metrics.update({'gate_avg': mean(dev_gates)})
                    dev_metrics.update({'gate_std': stdev(dev_gates)})
                    metrics = {'global_step': global_step}
                    # metrics.update({'train_' + k: v for k, v in train_metrics.items()})
                    metrics.update({'dev_' + k: v for k, v in dev_metrics.items()})
                    logger.critical(pformat(metrics))

                    if metrics[self.args.early_stop] > best_metrics[self.args.early_stop]:
                        logger.critical('Found new best! Saving to ' + self.args.dsave)
                        best_metrics = metrics
                        self.save(best_metrics, self.args.dsave, self.args.early_stop)
                        with open(os.path.join(self.args.dsave, 'dev.preds.json'), 'wt') as f:
                            json.dump(dev_preds, f, indent=2)
                        with open(os.path.join(self.args.dsave, 'dev.best_metrics.json'), 'wt') as f:
                            json.dump(best_metrics, f, indent=2)

                    self.train()

            train_metrics = {k: sum(v) / len(v) for k, v in train_stats.items()}
            train_metrics.update(self.compute_metrics(preds, train))
            train_metrics.update({'gate_avg': mean(gates)})
            train_metrics.update({'gate_std': stdev(gates)})

            dev_stats = defaultdict(list)
            dev_preds, dev_gates = self.run_pred(dev)
            dev_metrics = {k: sum(v) / len(v) for k, v in dev_stats.items()}
            dev_metrics.update(self.compute_metrics(dev_preds, dev))
            dev_metrics.update({'gate_avg': mean(dev_gates)})
            dev_metrics.update({'gate_std': stdev(dev_gates)})
            metrics = {'global_step': global_step}
            metrics.update({'train_' + k: v for k, v in train_metrics.items()})
            metrics.update({'dev_' + k: v for k, v in dev_metrics.items()})
            logger.critical(pformat(metrics))

            if metrics[self.args.early_stop] > best_metrics[self.args.early_stop]:
                logger.critical('Found new best! Saving to ' + self.args.dsave)
                best_metrics = metrics
                self.save(best_metrics, self.args.dsave, self.args.early_stop)
                with open(os.path.join(self.args.dsave, 'dev.preds.json'), 'wt') as f:
                    json.dump(dev_preds, f, indent=2)
                with open(os.path.join(self.args.dsave, 'dev.best_metrics.json'), 'wt') as f:
                    json.dump(best_metrics, f, indent=2)

        logger.critical('Best dev')
        logger.critical(pformat(best_metrics))
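run_train above adjusts the learning rate by hand each step with warmup_linear: the rate ramps up over a warmup fraction of training and then decays linearly. A minimal sketch of that manual schedule applied to a plain optimizer; the warmup_linear helper here is a local approximation written for illustration, not the exact library function:

import torch
import torch.nn as nn

def warmup_linear(x, warmup=0.002):
    # ramp up linearly during the warmup fraction, then decay linearly towards zero
    if x < warmup:
        return x / warmup
    return max(1.0 - x, 0.0)

model = nn.Linear(4, 1)                    # toy model (assumption)
base_lr = 5e-5
num_train_steps = 100
optimizer = torch.optim.SGD(model.parameters(), lr=base_lr)

for global_step in range(num_train_steps):
    lr_this_step = base_lr * warmup_linear(global_step / num_train_steps, warmup=0.1)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_this_step
    # ... forward, backward and optimizer.step() would go here ...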
Exemple #8
0
    def train(self):
        with open(os.path.join(self.results_folder, "log.txt"), "w") as f_log:
            for train, test in LeaveOneOut().split(self.dfs):
                train_set = [self.dfs[i] for i in train]
                test_set = self.dfs[test[0]]
                # Create sentence and label lists
                sentences_list = []
                labels_list = []
                for i, book in enumerate(train_set):
                    sentences_list.extend(book.sentence.values)
                    labels_list.extend(book.label.values)
                    f_log.write("Length book: " + str(len(sentences_list[i])) +
                                '\n')
                f_log.write("Sentences: " + str(len(sentences_list)) +
                            ", labels:" + str(len(labels_list)) + '\n')

                MAX_LEN = 128
                # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
                sentences_train = [
                    self.tokenizer.encode_plus(sent,
                                               add_special_tokens=True,
                                               max_length=MAX_LEN)
                    for i, sent in enumerate(sentences_list)
                ]

                le = LabelEncoder()
                labels_train = labels_list
                f_log.write(str(labels_train[:10]) + '\n')
                f_log.write('Analyze labels' + '\n')
                le.fit(labels_train)
                le_name_mapping = dict(
                    zip(le.classes_, le.transform(le.classes_)))
                f_log.write(str(le_name_mapping) + '\n')
                labels_train = le.fit_transform(labels_train)

                # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
                input_ids_train = [
                    inputs["input_ids"] for inputs in sentences_train
                ]

                # Pad our input tokens
                input_ids_train = pad_sequences(input_ids_train,
                                                maxlen=MAX_LEN,
                                                truncating="post",
                                                padding="post")
                # Create attention masks
                attention_masks_train = []

                # Create a mask of 1s for each token followed by 0s for padding
                for seq in input_ids_train:
                    seq_mask_train = [float(i > 0) for i in seq]
                    attention_masks_train.append(seq_mask_train)

                # No separate validation split here: all of the data is used for training
                train_inputs, train_labels = input_ids_train, labels_train
                train_masks = attention_masks_train

                # Convert all of our data into torch tensors, the required datatype for our model
                train_inputs = torch.tensor(train_inputs).to(torch.int64)
                train_labels = torch.tensor(train_labels).to(torch.int64)
                train_masks = torch.tensor(train_masks).to(torch.int64)

                batch_size = 32
                # Create an iterator of our data with torch DataLoader. This helps save on memory during training
                # because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into
                # memory
                train_data = TensorDataset(train_inputs, train_masks,
                                           train_labels)
                train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(train_data,
                                              sampler=train_sampler,
                                              batch_size=batch_size)
                torch.cuda.empty_cache()

                # BINARY CLASSIFIER
                model = BertForSequenceClassification.from_pretrained(
                    "bert-base-uncased", num_labels=2)
                model.cuda()
                param_optimizer = list(model.named_parameters())
                no_decay = ['bias', 'gamma', 'beta']
                optimizer_grouped_parameters = [{
                    'params': [
                        p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)
                    ],
                    'weight_decay_rate':
                    0.01
                }, {
                    'params': [
                        p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay_rate':
                    0.0
                }]

                # This variable contains all of the hyperparemeter information our training loop needs
                optimizer = BertAdam(optimizer_grouped_parameters,
                                     lr=2e-5,
                                     warmup=.1)

                train_loss_set = []

                # Number of training epochs (authors recommend between 2 and 4)
                epochs = 10

                device = torch.device(
                    "cuda" if torch.cuda.is_available() else "cpu")
                torch.cuda.get_device_name(0)

                for _ in trange(epochs, desc="Epoch"):
                    # Training
                    # Set our model to training mode (as opposed to evaluation mode)
                    model.train()

                    # Tracking variables
                    tr_loss = 0
                    nb_tr_examples, nb_tr_steps = 0, 0

                    # Train the data for one epoch
                    for step, batch in enumerate(train_dataloader):
                        # Add batch to GPU
                        batch = tuple(t.to(device) for t in batch)
                        # Unpack the inputs from our dataloader
                        b_input_ids, b_input_mask, b_labels = batch
                        # Clear out the gradients (by default they accumulate)
                        optimizer.zero_grad()
                        # Forward pass
                        loss = model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
                        train_loss_set.append(loss.item())
                        # Backward pass
                        loss.backward()
                        # Update parameters and take a step using the computed gradient
                        optimizer.step()

                        # Update tracking variables
                        tr_loss += loss.item()
                        nb_tr_examples += b_input_ids.size(0)
                        nb_tr_steps += 1

                    f_log.write("Train loss: {}".format(tr_loss /
                                                        nb_tr_steps) + '\n')

                plt.figure(figsize=(15, 8))
                plt.title("Training loss")
                plt.xlabel("Batch")
                plt.ylabel("Loss")
                plt.plot(train_loss_set)
                plt.savefig(self.img_folder + 'train' + str(test[0]) + '.png')

                model_to_save = model
                WEIGHTS_NAME = "BERT_Novel_test" + str(test[0]) + ".bin"
                OUTPUT_DIR = self.models_folder
                output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
                f_log.write(str(output_model_file) + '\n')
                torch.save(model_to_save.state_dict(), output_model_file)
                state_dict = torch.load(output_model_file)
                model.load_state_dict(state_dict)

                sentences6 = test_set.sentence.values
                f_log.write(str(len(sentences6)) + '\n')
                labels6 = test_set.label.values

                labels_test = labels6
                sentences11 = sentences6
                sentences_test = [
                    self.tokenizer.encode_plus(sent,
                                               add_special_tokens=True,
                                               max_length=MAX_LEN)
                    for i, sent in enumerate(sentences11)
                ]

                f_log.write('Analyze labels test' + '\n')
                le.fit(labels_test)
                le_name_mapping = dict(
                    zip(le.classes_, le.transform(le.classes_)))
                f_log.write(str(le_name_mapping) + '\n')
                labels_test = le.fit_transform(labels_test)
                MAX_LEN = 128

                # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
                input_ids1 = [inputs["input_ids"] for inputs in sentences_test]
                # Pad our input tokens
                input_ids1 = pad_sequences(input_ids1,
                                           maxlen=MAX_LEN,
                                           truncating="post",
                                           padding="post")
                # Create attention masks
                attention_masks1 = []

                # Create a mask of 1s for each token followed by 0s for padding
                for seq in input_ids1:
                    seq_mask1 = [float(i > 0) for i in seq]
                    attention_masks1.append(seq_mask1)

                f_log.write(str(len(attention_masks1[0])) + '\n')

                prediction_inputs = torch.tensor(input_ids1).to(torch.int64)
                prediction_masks = torch.tensor(attention_masks1).to(
                    torch.int64)

                prediction_labels = torch.tensor(labels_test).to(torch.int64)

                batch_size = 32
                prediction_data = TensorDataset(prediction_inputs,
                                                prediction_masks,
                                                prediction_labels)
                prediction_sampler = SequentialSampler(prediction_data)
                prediction_dataloader = DataLoader(prediction_data,
                                                   sampler=prediction_sampler,
                                                   batch_size=batch_size)

                # Prediction on test set
                # Put model in evaluation mode
                model.eval()
                # Tracking variables
                predictions, true_labels = [], []
                # Predict
                for batch in prediction_dataloader:
                    # Add batch to GPU
                    batch = tuple(t.to(device) for t in batch)
                    # Unpack the inputs from our dataloader
                    b_input_ids, b_input_mask, b_labels = batch
                    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
                    with torch.no_grad():
                        # Forward pass, calculate logit predictions
                        logits = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask)

                    # Move logits and labels to CPU
                    logits = logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()

                    # Store predictions and true labels
                    predictions.append(logits)
                    true_labels.append(label_ids)

                f_log.write(
                    str(len(predictions)) + ' ' + str(len(true_labels)) + '\n')
                f_log.write(str(predictions[0][0]) + '\n')

                # Import and evaluate each test batch using Matthew's correlation coefficient
                matthews_set = []

                for i in range(len(true_labels)):
                    matthews = matthews_corrcoef(
                        true_labels[i],
                        np.argmax(predictions[i], axis=1).flatten())
                    matthews_set.append(matthews)

                # Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
                flat_predictions = [
                    item for sublist in predictions for item in sublist
                ]
                flat_predictions = np.argmax(flat_predictions,
                                             axis=1).flatten()
                flat_true_labels = [
                    item for sublist in true_labels for item in sublist
                ]

                f_log.write(
                    str(len(flat_predictions)) + ' ' +
                    str(len(flat_true_labels)) + '\n')
                f_log.write(
                    str(flat_predictions[989:994]) + ' ' +
                    str(flat_true_labels[989:994]) + '\n')
                f_log.write(
                    str(flat_predictions[0:11]) + ' ' +
                    str(flat_true_labels[0:11]) + '\n')
                f_log.write('Classification Report' + '\n')
                f_log.write(
                    str(
                        classification_report(flat_true_labels,
                                              flat_predictions)) + '\n')
                f_log.write(
                    str(confusion_matrix(flat_true_labels, flat_predictions)) +
                    '\n')
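The evaluation tail of the example above flattens the per-batch logits, reduces them to class predictions with argmax, and scores them with Matthew's correlation coefficient and a classification report. A self-contained sketch of that aggregation on made-up batch outputs:

import numpy as np
from sklearn.metrics import classification_report, matthews_corrcoef

# toy per-batch outputs: two batches of logits and their true labels (made up)
predictions = [np.array([[0.2, 0.8], [1.5, -0.3]]), np.array([[-0.1, 0.4]])]
true_labels = [np.array([1, 0]), np.array([1])]

# flatten the batches, then take argmax over the logits to get hard predictions
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)
flat_true_labels = np.concatenate(true_labels, axis=0)

print(matthews_corrcoef(flat_true_labels, flat_predictions))
print(classification_report(flat_true_labels, flat_predictions))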
Exemple #9
0
    def train(self):
        if self.debug_mode: self.epochs = 1
        # load the dataloaders
        train_loader, valid_loader = self.create_dataloader()
        # training
        self.seed_everything()
        lr = 2e-5
        accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
        # convert the pre-trained TensorFlow BERT checkpoint to PyTorch
        if os.path.exists(self.work_dir + 'pytorch_model.bin') is False:
            print("Convert pre-trained model")
            convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
                self.bert_model_path + 'bert_model.ckpt',
                self.bert_model_path + 'bert_config.json',
                self.work_dir + 'pytorch_model.bin')
        shutil.copyfile(self.bert_model_path + 'bert_config.json',
                        self.work_dir + 'bert_config.json')
        # load the pre-trained model
        print("Load checkpoint")
        model = BertNeuralNet.from_pretrained(self.work_dir, cache_dir=None)
        # TODO: load saved model weights
        model.load_state_dict(
            torch.load("../input/train48-bert-kernel/model_last.bin"))
        model.zero_grad()
        model = model.to(self.device)
        # use different weight_decay settings for different parameter groups
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        epoch_steps = int(self.train_len * 0.5 / self.base_batch_size /
                          accumulation_steps)
        num_train_optimization_steps = int(self.epochs * epoch_steps)
        valid_every = math.floor(epoch_steps * accumulation_steps / 5)
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=lr,
                             warmup=-1,
                             t_total=-1)
        # gradually decaying learning rate
        #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        # start training
        print("Train")
        best_auc_score_1 = 0
        best_auc_score_2 = 0
        best_auc_score_3 = 0
        best_auc_score_4 = 0
        f_log = open("train_log.txt", "w")

        model.eval()
        new_valid_loader = copy.deepcopy(valid_loader)
        y_pred = np.zeros((len(self.train_df) - self.train_len))
        for j, valid_batch_data in enumerate(new_valid_loader):
            x_batch = valid_batch_data[0]
            batch_y_pred = self.sigmoid(
                model(x_batch.to(self.device),
                      attention_mask=(x_batch > 0).to(self.device),
                      labels=None).detach().cpu().numpy())[:, 0]
            y_pred[j * self.base_batch_size:(j + 1) *
                   self.base_batch_size] = batch_y_pred
        # compute the score
        auc_score = self.evaluator.get_final_metric(y_pred)
        f_log.write("init auc_score: %.4f\n" % auc_score)
        print("init auc_score: %.4f" % auc_score)

        for epoch in range(self.epochs):
            model.train()
            optimizer.zero_grad()
            # load each batch and train on it
            train_start_time = time.time()
            for i, batch_data in enumerate(train_loader):
                x_batch = batch_data[0]
                y_batch = batch_data[1]
                target_weight_batch = batch_data[2]
                aux_weight_batch = batch_data[3]
                identity_weight_batch = batch_data[4]
                np_weight_batch = batch_data[5]
                np_identity_weight_batch = batch_data[6]
                y_pred = model(x_batch.to(self.device),
                               attention_mask=(x_batch > 0).to(self.device),
                               labels=None)
                target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(
                    y_pred, y_batch, epoch, target_weight_batch,
                    aux_weight_batch, identity_weight_batch, np_weight_batch)
                loss = target_loss + aux_loss + identity_loss + np_loss
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if (i + 1) % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                # validation
                if (i + 1) % valid_every == 0:
                    model.eval()
                    stage = int((i + 1) / valid_every)
                    train_stage_duration = int(
                        (time.time() - train_start_time) / 60)
                    valid_start_time = time.time()
                    y_pred = np.zeros((len(self.train_df) - self.train_len))
                    for j, valid_batch_data in enumerate(valid_loader):
                        x_batch = valid_batch_data[0]
                        batch_y_pred = self.sigmoid(
                            model(x_batch.to(self.device),
                                  attention_mask=(x_batch > 0).to(self.device),
                                  labels=None).detach().cpu().numpy())[:, 0]
                        y_pred[j * self.base_batch_size:(j + 1) *
                               self.base_batch_size] = batch_y_pred
                    # compute the score
                    auc_score = self.evaluator.get_final_metric(y_pred)
                    valid_duration = int((time.time() - valid_start_time) / 60)
                    train_start_time = time.time()
                    f_log.write(
                        "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n"
                        % (epoch, stage, train_stage_duration, valid_duration,
                           auc_score))
                    print(
                        "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f"
                        % (epoch, stage, train_stage_duration, valid_duration,
                           auc_score))
                    if auc_score > best_auc_score_4:
                        state_dict = model.state_dict()
                        if auc_score > best_auc_score_1:
                            best_auc_score_1 = auc_score
                            torch.save(state_dict, "model1.bin")
                        elif auc_score > best_auc_score_2:
                            best_auc_score_2 = auc_score
                            torch.save(state_dict, "model2.bin")
                        elif auc_score > best_auc_score_3:
                            best_auc_score_3 = auc_score
                            torch.save(state_dict, "model3.bin")
                        else:
                            best_auc_score_4 = auc_score
                            torch.save(state_dict, "model4.bin")
                        with open("model_score.txt", "w") as f:
                            f.write(
                                "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                                % (best_auc_score_1, best_auc_score_2,
                                   best_auc_score_3, best_auc_score_4))
                        print(
                            "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                            % (best_auc_score_1, best_auc_score_2,
                               best_auc_score_3, best_auc_score_4))
                    model.train()
            if self.last is True:
                state_dict = model.state_dict()
                torch.save(state_dict, "model_last.bin")
        # delete training-related inputs and the model so the memory can be reclaimed
        del train_loader, valid_loader, model, optimizer, param_optimizer
        del optimizer_grouped_parameters
        gc.collect()
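Both this example and train_bert_cased above build the attention mask on the fly as (x_batch > 0), relying on the padding token id being 0. A tiny sketch of that convention with made-up token ids:

import torch

# two padded sequences of token ids; 0 is assumed to be the [PAD] id
x_batch = torch.tensor([[101, 2023, 2003, 102, 0, 0],
                        [101, 7592, 102,   0, 0, 0]])

attention_mask = (x_batch > 0).long()  # 1 for real tokens, 0 for padding
print(attention_mask)
# tensor([[1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 0, 0, 0]])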
Exemple #10
0
class MTDNNModel(MTDNNPretrainedModel):
    """Instance of an MTDNN Model
    
    Arguments:
        MTDNNPretrainedModel {BertPretrainedModel} -- Inherited from Bert Pretrained
        config  {MTDNNConfig} -- MTDNN Configuration Object 
        pretrained_model_name {str} -- Name of the pretrained model to initial checkpoint
        num_train_step  {int} -- Number of steps to take each training
    
    Raises:
        RuntimeError: [description]
        ImportError: [description]
    
    Returns:
        MTDNNModel -- An Instance of an MTDNN Model
    """
    def __init__(
        self,
        config: MTDNNConfig,
        task_defs: MTDNNTaskDefs,
        pretrained_model_name: str = "mtdnn-base-uncased",
        num_train_step: int = -1,
        decoder_opts: list = None,
        task_types: list = None,
        dropout_list: list = None,
        loss_types: list = None,
        kd_loss_types: list = None,
        tasks_nclass_list: list = None,
        multitask_train_dataloader: DataLoader = None,
        dev_dataloaders_list: list = None,  # list of dataloaders
        test_dataloaders_list: list = None,  # list of dataloaders
        test_datasets_list: list = ["mnli_mismatched", "mnli_matched"],
        output_dir: str = "checkpoint",
        log_dir: str = "tensorboard_logdir",
    ):

        # Input validation
        assert (
            config.init_checkpoint in self.supported_init_checkpoints()
        ), f"Initial checkpoint must be in {self.supported_init_checkpoints()}"

        assert decoder_opts, "Decoder options list is required!"
        assert task_types, "Task types list is required!"
        assert dropout_list, "Task dropout list is required!"
        assert loss_types, "Loss types list is required!"
        assert kd_loss_types, "KD Loss types list is required!"
        assert tasks_nclass_list, "Tasks nclass list is required!"
        assert (multitask_train_dataloader
                ), "DataLoader for multiple tasks cannot be None"
        assert test_datasets_list, "Pass a list of test dataset prefixes"

        super(MTDNNModel, self).__init__(config)

        # Initialize model config and update with training options
        self.config = config
        self.update_config_with_training_opts(
            decoder_opts,
            task_types,
            dropout_list,
            loss_types,
            kd_loss_types,
            tasks_nclass_list,
        )
        self.task_defs = task_defs
        self.multitask_train_dataloader = multitask_train_dataloader
        self.dev_dataloaders_list = dev_dataloaders_list
        self.test_dataloaders_list = test_dataloaders_list
        self.test_datasets_list = test_datasets_list
        self.output_dir = output_dir
        self.log_dir = log_dir

        # Create the output_dir if it doesn't exist
        MTDNNCommonUtils.create_directory_if_not_exists(self.output_dir)
        self.tensor_board = SummaryWriter(log_dir=self.log_dir)

        self.pooler = None

        # Resume from model checkpoint
        if self.config.resume and self.config.model_ckpt:
            assert os.path.exists(
                self.config.model_ckpt), "Model checkpoint does not exist"
            logger.info(f"loading model from {self.config.model_ckpt}")
            self.load(self.config.model_ckpt)
            return

        # Setup the baseline network
        # - Define the encoder based on config options
        # - Set state dictionary based on configuration setting
        # - Download pretrained model if flag is set
        # TODO - Use Model.pretrained_model() after configuration file is hosted.
        if self.config.use_pretrained_model:
            with MTDNNCommonUtils.download_path() as file_path:
                path = pathlib.Path(file_path)
                self.local_model_path = MTDNNCommonUtils.maybe_download(
                    url=self.pretrained_model_archive_map[pretrained_model_name],
                    log=logger,
                )
            self.bert_model = MTDNNCommonUtils.load_pytorch_model(
                self.local_model_path)
            self.state_dict = self.bert_model["state"]
        else:
            # Set the config base on encoder type set for initial checkpoint
            if config.encoder_type == EncoderModelType.BERT:
                self.bert_config = BertConfig.from_dict(self.config.to_dict())
                self.bert_model = BertModel.from_pretrained(
                    self.config.init_checkpoint)
                self.state_dict = self.bert_model.state_dict()
                self.config.hidden_size = self.bert_config.hidden_size
            if config.encoder_type == EncoderModelType.ROBERTA:
                # Download and extract from PyTorch hub if not downloaded before
                self.bert_model = torch.hub.load("pytorch/fairseq",
                                                 config.init_checkpoint)
                self.config.hidden_size = self.bert_model.args.encoder_embed_dim
                self.pooler = LinearPooler(self.config.hidden_size)
                new_state_dict = {}
                for key, val in self.bert_model.state_dict().items():
                    if key.startswith("model.decoder.sentence_encoder"
                                      ) or key.startswith(
                                          "model.classification_heads"):
                        key = f"bert.{key}"
                        new_state_dict[key] = val
                    # backward compatibility PyTorch <= 1.0.0
                    if key.startswith("classification_heads"):
                        key = f"bert.model.{key}"
                        new_state_dict[key] = val
                self.state_dict = new_state_dict

        self.updates = (self.state_dict["updates"] if self.state_dict
                        and "updates" in self.state_dict else 0)
        self.local_updates = 0
        self.train_loss = AverageMeter()
        self.network = SANBERTNetwork(
            init_checkpoint_model=self.bert_model,
            pooler=self.pooler,
            config=self.config,
        )
        if self.state_dict:
            self.network.load_state_dict(self.state_dict, strict=False)
        self.mnetwork = (nn.DataParallel(self.network)
                         if self.config.multi_gpu_on else self.network)
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])

        # Move network to GPU if device available and flag set
        if self.config.cuda:
            self.network.cuda(device=self.config.cuda_device)
        self.optimizer_parameters = self._get_param_groups()
        self._setup_optim(self.optimizer_parameters, self.state_dict,
                          num_train_step)
        self.para_swapped = False
        self.optimizer.zero_grad()
        self._setup_lossmap()

    def _get_param_groups(self):
        no_decay = [
            "bias", "gamma", "beta", "LayerNorm.bias", "LayerNorm.weight"
        ]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in self.network.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in self.network.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        return optimizer_parameters

    def _setup_optim(self,
                     optimizer_parameters,
                     state_dict: dict = None,
                     num_train_step: int = -1):

        # Setup optimizer parameters
        if self.config.optimizer == "sgd":
            self.optimizer = optim.SGD(
                optimizer_parameters,
                self.config.learning_rate,
                weight_decay=self.config.weight_decay,
            )
        elif self.config.optimizer == "adamax":
            self.optimizer = Adamax(
                optimizer_parameters,
                self.config.learning_rate,
                warmup=self.config.warmup,
                t_total=num_train_step,
                max_grad_norm=self.config.grad_clipping,
                schedule=self.config.warmup_schedule,
                weight_decay=self.config.weight_decay,
            )

        elif self.config.optimizer == "radam":
            self.optimizer = RAdam(
                optimizer_parameters,
                self.config.learning_rate,
                warmup=self.config.warmup,
                t_total=num_train_step,
                max_grad_norm=self.config.grad_clipping,
                schedule=self.config.warmup_schedule,
                eps=self.config.adam_eps,
                weight_decay=self.config.weight_decay,
            )

            # The current radam does not support FP16.
            self.config.fp16 = False
        elif self.config.optimizer == "adam":
            self.optimizer = Adam(
                optimizer_parameters,
                lr=self.config.learning_rate,
                warmup=self.config.warmup,
                t_total=num_train_step,
                max_grad_norm=self.config.grad_clipping,
                schedule=self.config.warmup_schedule,
                weight_decay=self.config.weight_decay,
            )

        else:
            raise RuntimeError(
                f"Unsupported optimizer: {self.config.optimizer}")

        # Clear scheduler for certain optimizer choices
        if self.config.optimizer in ["adam", "adamax", "radam"]:
            if self.config.have_lr_scheduler:
                self.config.have_lr_scheduler = False

        if state_dict and "optimizer" in state_dict:
            self.optimizer.load_state_dict(state_dict["optimizer"])

        if self.config.fp16:
            try:
                global amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(
                self.network,
                self.optimizer,
                opt_level=self.config.fp16_opt_level)
            self.network = model
            self.optimizer = optimizer

        if self.config.have_lr_scheduler:
            if self.config.scheduler_type == "rop":
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode="max",
                                                   factor=self.config.lr_gamma,
                                                   patience=3)
            elif self.config.scheduler_type == "exp":
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=self.config.lr_gamma
                                               or 0.95)
            else:
                milestones = [
                    int(step) for step in (
                        self.config.multi_step_lr or "10,20,30").split(",")
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=self.config.lr_gamma)
        else:
            self.scheduler = None

    def _setup_lossmap(self):
        self.task_loss_criterion = []
        for idx, cs in enumerate(self.config.loss_types):
            assert cs is not None, "Loss type must be defined."
            lc = LOSS_REGISTRY[cs](name=f"Loss func of task {idx}: {cs}")
            self.task_loss_criterion.append(lc)

    def _setup_kd_lossmap(self):
        loss_types = self.config.kd_loss_types
        self.kd_task_loss_criterion = []
        if self.config.mkd_opt > 0:
            for idx, cs in enumerate(loss_types):
                assert cs, "Loss type must be defined."
                lc = LOSS_REGISTRY[cs](
                    name="Loss func of task {}: {}".format(idx, cs))
                self.kd_task_loss_criterion.append(lc)

    def _to_cuda(self, tensor):
        # Set tensor to gpu (non-blocking) if a PyTorch tensor
        if tensor is None:
            return tensor

        if isinstance(tensor, (list, tuple)):
            y = [
                e.cuda(device=self.config.cuda_device, non_blocking=True)
                for e in tensor
            ]
            for t in y:
                t.requires_grad = False
        else:
            y = tensor.cuda(device=self.config.cuda_device, non_blocking=True)
            y.requires_grad = False
        return y

    def train(self):
        if self.para_swapped:
            self.para_swapped = False

    def update(self, batch_meta, batch_data):
        self.network.train()
        target = batch_data[batch_meta["label"]]
        soft_labels = None

        task_type = batch_meta["task_type"]
        target = self._to_cuda(target) if self.config.cuda else target

        task_id = batch_meta["task_id"]
        inputs = batch_data[:batch_meta["input_len"]]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        weight = None
        if self.config.weighted_on:
            if self.config.cuda:
                weight = batch_data[batch_meta["factor"]].cuda(
                    device=self.config.cuda_device, non_blocking=True)
            else:
                weight = batch_data[batch_meta["factor"]]
        logits = self.mnetwork(*inputs)

        # compute loss
        loss = 0
        if self.task_loss_criterion[task_id] and (target is not None):
            loss = self.task_loss_criterion[task_id](logits,
                                                     target,
                                                     weight,
                                                     ignore_index=-1)

        # compute kd loss
        if self.config.mkd_opt > 0 and ("soft_label" in batch_meta):
            soft_labels = batch_meta["soft_label"]
            soft_labels = (self._to_cuda(soft_labels)
                           if self.config.cuda else soft_labels)
            kd_lc = self.kd_task_loss_criterion[task_id]
            kd_loss = (kd_lc(logits, soft_labels, weight, ignore_index=-1)
                       if kd_lc else 0)
            loss = loss + kd_loss

        self.train_loss.update(loss.item(),
                               batch_data[batch_meta["token_id"]].size(0))
        # scale loss
        loss = loss / (self.config.grad_accumulation_step or 1)
        if self.config.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.local_updates += 1
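        # e.g. with grad_accumulation_step = 4 and a per-call batch of 8, four
        # update() calls accumulate gradients before a single optimizer.step(),
        # for an effective batch size of 32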
        if self.local_updates % self.config.grad_accumulation_step == 0:
            if self.config.global_grad_clipping > 0:
                if self.config.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer),
                        self.config.global_grad_clipping,
                    )
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self.network.parameters(),
                        self.config.global_grad_clipping)
            self.updates += 1
            # apply the accumulated gradients, then reset them
            self.optimizer.step()
            self.optimizer.zero_grad()

    def eval_mode(
        self,
        data: DataLoader,
        metric_meta,
        use_cuda=True,
        with_label=True,
        label_mapper=None,
        task_type=TaskType.Classification,
    ):
        if use_cuda:
            self.cuda()
        predictions = []
        golds = []
        scores = []
        ids = []
        metrics = {}
        for idx, (batch_info, batch_data) in enumerate(data):
            if idx % 100 == 0:
                logger.info(f"predicting {idx}")
            batch_info, batch_data = MTDNNCollater.patch_data(
                use_cuda, batch_info, batch_data)
            score, pred, gold = self._predict_batch(batch_info, batch_data)
            predictions.extend(pred)
            golds.extend(gold)
            scores.extend(score)
            ids.extend(batch_info["uids"])

        if task_type == TaskType.Span:
            golds = merge_answers(ids, golds)
            predictions, scores = select_answers(ids, predictions, scores)
        if with_label:
            metrics = calc_metrics(metric_meta, golds, predictions, scores,
                                   label_mapper)
        return metrics, predictions, scores, golds, ids

    def _predict_batch(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta["task_id"]
        task_type = batch_meta["task_type"]
        inputs = batch_data[:batch_meta["input_len"]]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if task_type == TaskType.Ranking:
            score = score.contiguous().view(-1, batch_meta["pairwise_size"])
            assert task_type == TaskType.Ranking
            score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta["true_label"]
        elif task_type == TaskType.SequenceLabeling:
            mask = batch_data[batch_meta["mask"]]
            score = score.contiguous()
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).reshape(mask.size()).tolist()
            valid_length = mask.sum(1).tolist()
            final_predict = []
            for idx, p in enumerate(predict):
                final_predict.append(p[:valid_length[idx]])
            score = score.reshape(-1).tolist()
            return score, final_predict, batch_meta["label"]
        elif task_type == TaskType.Span:
            start, end = score
            predictions = []
            scores = []
            if self.config.encoder_type == EncoderModelType.BERT:
                scores, predictions = extract_answer(
                    batch_meta,
                    batch_data,
                    start,
                    end,
                    self.config.get("max_answer_len", 5),
                )
            return scores, predictions, batch_meta["answer"]
        else:
            if task_type == TaskType.Classification:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta["label"]

    def fit(self, epochs=0):
        """ Fit model to training datasets """
        epochs = epochs or self.config.epochs
        logger.info(f"Total number of params: {self.total_param}")
        for epoch in range(epochs):
            logger.info(f"At epoch {epoch}")
            logger.info(
                f"Amount of data to go over: {len(self.multitask_train_dataloader)}"
            )

            start = datetime.now()
            # Create batches and train
            for idx, (batch_meta, batch_data) in enumerate(
                    self.multitask_train_dataloader):
                batch_meta, batch_data = MTDNNCollater.patch_data(
                    self.config.cuda, batch_meta, batch_data)

                task_id = batch_meta["task_id"]
                self.update(batch_meta, batch_data)
                if (self.local_updates == 1 or (self.local_updates) %
                    (self.config.log_per_updates *
                     self.config.grad_accumulation_step) == 0):

                    time_left = str((datetime.now() - start) / (idx + 1) *
                                    (len(self.multitask_train_dataloader) -
                                     idx - 1)).split(".")[0]
                    logger.info(
                        "Task - [{0:2}] Updates - [{1:6}] Training Loss - [{2:.5f}] Time Remaining - [{3}]"
                        .format(
                            task_id,
                            self.updates,
                            self.train_loss.avg,
                            time_left,
                        ))
                    if self.config.use_tensor_board:
                        self.tensor_board.add_scalar(
                            "train/loss",
                            self.train_loss.avg,
                            global_step=self.updates,
                        )

                if self.config.save_per_updates_on and (
                    (self.local_updates) %
                    (self.config.save_per_updates *
                     self.config.grad_accumulation_step) == 0):
                    model_file = os.path.join(
                        self.output_dir,
                        "model_{}_{}.pt".format(epoch, self.updates),
                    )
                    logger.info(f"Saving mt-dnn model to {model_file}")
                    self.save(model_file)

            # TODO: Alternatively, we need to refactor save function
            # and move into prediction
            # Saving each checkpoint after model training
            model_file = os.path.join(self.output_dir,
                                      "model_{}.pt".format(epoch))
            logger.info(f"Saving mt-dnn model to {model_file}")
            self.save(model_file)

    def predict(self,
                trained_model_chckpt: str = None,
                saved_epoch_idx: int = 0):
        """ 
        Inference of model on test datasets
        """

        # Load a trained checkpoint if a valid model checkpoint path is given
        if trained_model_chckpt and os.path.exists(trained_model_chckpt):
            logger.info(f"Running predictions using: {trained_model_chckpt}")
            self.load(trained_model_chckpt)

        # Create batches and evaluate
        start = datetime.now()
        for idx, dataset in enumerate(self.test_datasets_list):
            prefix = dataset.split("_")[0]
            label_dict = self.task_defs.global_map.get(prefix, None)
            dev_data: DataLoader = self.dev_dataloaders_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    (
                        dev_metrics,
                        dev_predictions,
                        scores,
                        golds,
                        dev_ids,
                    ) = self.eval_mode(
                        dev_data,
                        metric_meta=self.task_defs.metric_meta_map[prefix],
                        use_cuda=self.config.cuda,
                        label_mapper=label_dict,
                        task_type=self.task_defs.task_type_map[prefix],
                    )
                for key, val in dev_metrics.items():
                    if self.config.use_tensor_board:
                        self.tensor_board.add_scalar(
                            f"dev/{dataset}/{key}",
                            val,
                            global_step=saved_epoch_idx)
                    if isinstance(val, str):
                        logger.info(
                            f"Task {dataset} -- epoch {saved_epoch_idx} -- Dev {key}:\n {val}"
                        )
                    else:
                        logger.info(
                            f"Task {dataset} -- epoch {saved_epoch_idx} -- Dev {key}: {val:.3f}"
                        )
                score_file = os.path.join(
                    self.output_dir,
                    f"{dataset}_dev_scores_{saved_epoch_idx}.json")
                results = {
                    "metrics": dev_metrics,
                    "predictions": dev_predictions,
                    "uids": dev_ids,
                    "scores": scores,
                }

                # Save results to file
                MTDNNCommonUtils.dump(score_file, results)
                if self.config.use_glue_format:
                    official_score_file = os.path.join(
                        self.output_dir,
                        "{}_dev_scores_{}.tsv".format(dataset,
                                                      saved_epoch_idx),
                    )
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data: DataLoader = self.test_dataloaders_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    (
                        test_metrics,
                        test_predictions,
                        scores,
                        golds,
                        test_ids,
                    ) = self.eval_mode(
                        test_data,
                        metric_meta=self.task_defs.metric_meta_map[prefix],
                        use_cuda=self.config.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=self.task_defs.task_type_map[prefix],
                    )
                score_file = os.path.join(
                    self.output_dir,
                    f"{dataset}_test_scores_{saved_epoch_idx}.json")
                results = {
                    "metrics": test_metrics,
                    "predictions": test_predictions,
                    "uids": test_ids,
                    "scores": scores,
                }
                MTDNNCommonUtils.dump(score_file, results)
                if self.config.use_glue_format:
                    official_score_file = os.path.join(
                        self.output_dir,
                        f"{dataset}_test_scores_{saved_epoch_idx}.tsv")
                    submit(official_score_file, results, label_dict)
                logger.info("[new test scores saved.]")

        # Close tensorboard connection if opened
        self.close_connections()

    def close_connections(self):
        # Close tensor board connection
        if self.config.use_tensor_board:
            self.tensor_board.close()

    def extract(self, batch_meta, batch_data):
        self.network.eval()
        # 'token_id': 0; 'segment_id': 1; 'mask': 2
        inputs = batch_data[:3]
        all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs)
        return all_encoder_layers, pooled_output

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        params = {
            "state": network_state,
            "optimizer": self.optimizer.state_dict(),
            "config": self.config,
        }
        torch.save(params, filename)
        logger.info("model saved to {}".format(filename))

    def load(self, checkpoint):
        model_state_dict = torch.load(checkpoint)
        self.network.load_state_dict(model_state_dict["state"], strict=False)
        self.optimizer.load_state_dict(model_state_dict["optimizer"])
        self.config = model_state_dict["config"]

    def cuda(self):
        self.network.cuda(device=self.config.cuda_device)

    def supported_init_checkpoints(self):
        """List of allowed check points
        """
        return [
            "bert-base-uncased",
            "bert-base-cased",
            "bert-large-uncased",
            "mtdnn-base-uncased",
            "mtdnn-large-uncased",
            "roberta.base",
            "roberta.large",
        ]

    def update_config_with_training_opts(
        self,
        decoder_opts,
        task_types,
        dropout_list,
        loss_types,
        kd_loss_types,
        tasks_nclass_list,
    ):
        # Update configurations with options obtained from preprocessing training data
        setattr(self.config, "decoder_opts", decoder_opts)
        setattr(self.config, "task_types", task_types)
        setattr(self.config, "tasks_dropout_p", dropout_list)
        setattr(self.config, "loss_types", loss_types)
        setattr(self.config, "kd_loss_types", kd_loss_types)
        setattr(self.config, "tasks_nclass_list", tasks_nclass_list)
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('mode', choices=['train', 'validate', 'predict', 'train_all'])
    arg('run_root')
    arg('--model', default='mybert')
    arg('--pretrained', type=int, default=0)
    arg('--batch-size', type=int, default=32)
    arg('--step', type=int, default=1)
    arg('--workers', type=int, default=2)
    arg('--lr', type=float, default=0.0002)
    arg('--patience', type=int, default=4)
    arg('--clean', action='store_true')
    arg('--n-epochs', type=int, default=1)
    arg('--kloss', type=float, default=1.0)
    arg('--loss_fn', default='loss1')
    arg('--fold_name', default='/folds_binary_weights_kernal.pkl')
    arg('--limit', type=int)
    arg('--fold', type=int, default=0)
    arg('--multi-gpu', type=int, default=0)
    arg('--lr_layerdecay', type=float, default=0.95)
    arg('--warmup', type=float, default=0.05)
    arg('--split_point', type=float, default=0.3)
    arg('--bsample', type=bool, default=True)
    args = parser.parse_args()

    set_seed()
    BERT_PRETRAIN_PATH = '../input/torch-bert-weights/%s/' % (args.model)
    run_root = Path('../experiments/' + args.run_root)
    DATA_ROOT = Path(
        '../input/jigsaw-unintended-bias-in-toxicity-classification')

    folds = pd.read_pickle(DATA_ROOT / 'folds.pkl')

    identity_columns = [
        'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
        'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
    ]

    weights = np.ones((len(folds), )) / 4
    # Subgroup
    weights += (folds[identity_columns].fillna(0).values >= 0.5).sum(
        axis=1).astype(bool).astype(int) / 4
    # Background Positive, Subgroup Negative
    weights += (((folds['target'].values >= 0.5).astype(bool).astype(int) +
                 (folds[identity_columns].fillna(0).values < 0.5).sum(
                     axis=1).astype(bool).astype(int)) >
                1).astype(bool).astype(int) / 4
    # Background Negative, Subgroup Positive
    weights += (((folds['target'].values < 0.5).astype(bool).astype(int) +
                 (folds[identity_columns].fillna(0).values >= 0.5).sum(
                     axis=1).astype(bool).astype(int)) >
                1).astype(bool).astype(int) / 4
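    # The weights above start at 0.25 and add 0.25 for each of the three
    # subgroup/background conditions that holds, so each example's weight lies
    # in [0.25, 1.0]; loss_weight below rescales by the inverse mean (times --kloss)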

    folds['weights'] = weights
    print(folds['weights'].mean())

    if args.mode == "train_all":
        train_fold = folds
    else:
        train_fold = folds[folds['fold'] != args.fold]
        valid_fold = folds[folds['fold'] == args.fold]
        valid_fold = valid_fold.sort_values(by=["len"])

    if args.limit:
        train_fold = train_fold[:args.limit]
        if args.mode != "train_all":
            valid_fold = valid_fold[:args.limit * 3]

    if args.mode == "train_all":
        valid_df = None
    else:
        valid_df = valid_fold[identity_columns + ["target"]]

    loss_weight = 1 / folds['weights'].mean() * args.kloss

    if args.loss_fn == "loss1":
        loss_fn = custom_loss
    elif args.loss_fn == "loss2":
        loss_fn = custom_loss2

    criterion = partial(loss_fn, loss_weight=loss_weight)
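    # functools.partial binds loss_weight here, so criterion(...) calls the
    # chosen custom loss with loss_weight=loss_weight already supplied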

    if args.mode == 'train' or args.mode == "train_all":
        if run_root.exists() and args.clean:
            shutil.rmtree(run_root)
        run_root.mkdir(exist_ok=True, parents=True)
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))

        training_set = TrainDataset(train_fold['comment_text'].tolist(),
                                    lens=train_fold['len'].tolist(),
                                    target=train_fold[[
                                        'binary_target', 'weights', 'target',
                                        'severe_toxicity', 'obscene',
                                        'identity_attack', 'insult', 'threat'
                                    ]].values.tolist(),
                                    identity_df=train_fold[identity_columns],
                                    weights=train_fold['weights'].tolist(),
                                    model=args.model,
                                    split_point=args.split_point)
        if args.bsample:
            bbsampler = BucketBatchSampler(training_set,
                                           batch_size=args.batch_size,
                                           drop_last=True,
                                           sort_key=lambda x: x[1],
                                           biggest_batches_first=None,
                                           bucket_size_multiplier=100,
                                           shuffle=True)
            batchsize = 1
            shuffle = False
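            # DataLoader forbids batch_size > 1 or shuffle=True when a
            # batch_sampler is supplied, so fall back to the defaults here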

        else:
            bbsampler = None
            batchsize = args.batch_size
            shuffle = True

        training_loader = DataLoader(training_set,
                                     batch_sampler=bbsampler,
                                     collate_fn=collate_fn,
                                     num_workers=args.workers,
                                     batch_size=batchsize,
                                     shuffle=shuffle)

        if args.mode == "train":
            valid_set = TrainDataset(
                valid_fold['comment_text'].tolist(),
                lens=valid_fold['len'].tolist(),
                target=valid_fold['binary_target'].values.tolist(),
                identity_df=valid_fold[identity_columns],
                weights=valid_fold['weights'].tolist(),
                model=args.model,
                split_point=args.split_point)
            valid_loader = DataLoader(valid_set,
                                      batch_size=args.batch_size,
                                      shuffle=False,
                                      collate_fn=collate_fn,
                                      num_workers=args.workers)
        else:
            valid_loader = None

        # model = BertForSequenceClassification.from_pretrained(BERT_PRETRAIN_PATH,cache_dir=None,num_labels=1)
        model = BertModel(BERT_PRETRAIN_PATH)
        model.cuda()

        if args.model in [
                "bert-base-uncased", "bert-base-cased", "mybert", "gpt2",
                'mybert-base-cased', 'mybert-base-uncased'
        ]:
            NUM_LAYERS = 12
        elif args.model in [
                "bert-large-uncased", "bert-large-cased", "mybertlarge", "wmm",
                "mybertlargecased", "mybert-large-uncased",
                'mybert-wwm-uncased'
        ]:
            NUM_LAYERS = 24
        else:
            raise ValueError('%s is not a valid model' % args.model)

        optimizer_grouped_parameters = [{
            'params':
            model.bert.bert.embeddings.parameters(),
            'lr':
            args.lr * (args.lr_layerdecay**NUM_LAYERS)
        }, {
            'params': model.main_head.parameters(),
            'lr': args.lr
        }, {
            'params': model.aux_head.parameters(),
            'lr': args.lr
        }, {
            'params':
            model.bert.bert.pooler.parameters(),
            'lr':
            args.lr
        }]

        for layer in range(NUM_LAYERS):
            optimizer_grouped_parameters.append({
                'params':
                model.bert.bert.encoder.layer[NUM_LAYERS - 1 - layer].parameters(),
                'lr': args.lr * (args.lr_layerdecay**layer)
            })
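        # Layer-wise LR decay: the top encoder layer keeps the full --lr, each
        # layer below it is scaled by another factor of --lr_layerdecay, and the
        # embeddings get the smallest rate, lr * lr_layerdecay ** NUM_LAYERS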
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.lr,
                             warmup=args.warmup,
                             t_total=len(training_loader) // args.step)

        scheduler = ReduceLROnPlateau(optimizer,
                                      patience=0,
                                      factor=0.1,
                                      verbose=True,
                                      mode='max',
                                      min_lr=1e-7)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          verbosity=0)

        optimizer.zero_grad()

        if args.multi_gpu == 1:
            model = nn.DataParallel(model)

        train(args,
              model,
              optimizer,
              scheduler,
              criterion,
              train_loader=training_loader,
              valid_df=valid_df,
              valid_loader=valid_loader,
              epoch_length=len(training_set))

    elif args.mode == 'validate':

        valid_set = TrainDataset(valid_fold['comment_text'].tolist(),
                                 lens=valid_fold['len'].tolist(),
                                 target=valid_fold[['binary_target'
                                                    ]].values.tolist(),
                                 identity_df=valid_fold[identity_columns],
                                 weights=valid_fold['weights'].tolist(),
                                 model=args.model,
                                 split_point=args.split_point)
        valid_loader = DataLoader(valid_set,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  collate_fn=collate_fn,
                                  num_workers=args.workers)
        model = BertModel(BERT_PRETRAIN_PATH)
        load_model(model,
                   run_root / ('best-model-%d.pt' % args.fold),
                   multi2single=False)
        model.cuda()

        optimizer = BertAdam(model.parameters(), lr=1e-5, warmup=0.95)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O2",
                                          verbosity=0)

        if args.multi_gpu == 1:
            model = nn.DataParallel(model)

        validation(model,
                   criterion,
                   valid_df,
                   valid_loader,
                   args,
                   save_result=True,
                   progress=True)
Exemple #12
0
def fine_tune(corpus_name, train_corpus, dev_corpus, column_names):

    device = get_cuda_device()

    train_corpus_df = parse_csv(train_corpus, column_names)
    input_ids, labels, attention_masks = RENAME_ME(train_corpus_df,
                                                   corpus_name, True, MAX_LEN)
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        input_ids, labels, random_state=RANDOM_STATE, test_size=TEST_SIZE)
    train_masks, test_masks, _, _ = train_test_split(attention_masks,
                                                     input_ids,
                                                     random_state=RANDOM_STATE,
                                                     test_size=TEST_SIZE)
    train_data_loader = get_data_loader(train_inputs, train_labels,
                                        train_masks, BATCH_SIZE)
    test_data_loader = get_data_loader(test_inputs, test_labels, test_masks,
                                       BATCH_SIZE)

    model = BertForSequenceClassification.from_pretrained(corpus_name,
                                                          num_labels=2)
    model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
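    # Parameters whose names contain 'bias', 'gamma' or 'beta' (e.g. LayerNorm
    # terms) go into the second group and are excluded from weight decay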

    optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)

    for _ in range(EPOCHS):

        model.train()

        for step, batch in enumerate(train_data_loader):
            input_ids, mask, labels = tuple(t.to(device) for t in batch)
            optimizer.zero_grad()
            loss = model(input_ids,
                         token_type_ids=None,
                         attention_mask=mask,
                         labels=labels)
            loss.backward()
            optimizer.step()

        model.eval()

        for batch in test_data_loader:
            input_ids, mask, labels = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                logits = model(input_ids,
                               token_type_ids=None,
                               attention_mask=mask)
            #logits = logits.detach().cpu().numpy()
            #label_ids = labels.to('cpu').numpy()

    dev_corpus_df = parse_csv(dev_corpus, column_names)
    dev_input_ids, dev_labels, dev_attention_masks = RENAME_ME(
        dev_corpus_df, corpus_name, True, MAX_LEN)
    dev_data_loader = get_data_loader(dev_input_ids, dev_labels,
                                      dev_attention_masks, BATCH_SIZE)

    model.eval()

    predictions = []
    true_labels = []
    for batch in dev_data_loader:
        input_ids, mask, labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(input_ids, token_type_ids=None, attention_mask=mask)
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        predictions.append(logits)
        true_labels.append(label_ids)

    matthews_set = []

    for true_label, prediction in zip(true_labels, predictions):
        matthews = matthews_corrcoef(true_label,
                                     np.argmax(prediction, axis=1).flatten())
        matthews_set.append(matthews)

    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    flat_true_labels = [item for sublist in true_labels for item in sublist]

    return matthews_corrcoef(flat_true_labels, flat_predictions)
Exemple #13
0
class MTDNNModel(object):
    def __init__(self,
                 opt,
                 state_dict=None,
                 num_train_step=-1,
                 use_parse=False,
                 embedding_matrix=None,
                 token2idx=None,
                 stx_parse_dim=None,
                 unked_words=None,
                 use_generic_features=False,
                 num_generic_features=None,
                 use_domain_features=False,
                 num_domain_features=None,
                 feature_dim=None):
        self.config = opt
        self.updates = state_dict[
            'updates'] if state_dict and 'updates' in state_dict else 0
        self.train_loss = AverageMeter()
        self.network = SANBertNetwork(
            opt,
            use_parse=use_parse,
            embedding_matrix=embedding_matrix,
            token2idx=token2idx,
            stx_parse_dim=stx_parse_dim,
            unked_words=unked_words,
            use_generic_features=use_generic_features,
            num_generic_features=num_generic_features,
            use_domain_features=use_domain_features,
            num_domain_features=num_domain_features,
            feature_dim=feature_dim)

        if state_dict:
            new_state = set(self.network.state_dict().keys())
            for k in list(state_dict['state'].keys()):
                if k not in new_state:
                    del state_dict['state'][k]
            for k, v in list(self.network.state_dict().items()):
                if k not in state_dict['state']:
                    state_dict['state'][k] = v
            self.network.load_state_dict(state_dict['state'])
        self.mnetwork = nn.DataParallel(
            self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])

        no_decay = [
            'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'
        ]
        optimizer_parameters = [{
            'params': [
                p for n, p in self.network.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params': [
                p for n, p in self.network.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.0
        }]
        # note that Adamax/Adam here are modified versions based on the BERT code
        if opt['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(optimizer_parameters,
                                       opt['learning_rate'],
                                       weight_decay=opt['weight_decay'])

        elif opt['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    opt['learning_rate'],
                                    warmup=opt['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=opt['grad_clipping'],
                                    schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        elif opt['optimizer'] == 'adadelta':
            self.optimizer = optim.Adadelta(optimizer_parameters,
                                            opt['learning_rate'],
                                            rho=0.95)
        elif opt['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=opt['learning_rate'],
                                  warmup=opt['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=opt['grad_clipping'],
                                  schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

        if state_dict and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])

        if opt.get('have_lr_scheduler', False):
            if opt.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode='max',
                                                   factor=opt['lr_gamma'],
                                                   patience=3)
            elif opt.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=opt.get('lr_gamma', 0.95))
            else:
                milestones = [
                    int(step)
                    for step in opt.get('multi_step_lr', '10,20,30').split(',')
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=opt.get('lr_gamma'))
        else:
            self.scheduler = None
        self.ema = None
        if opt['ema_opt'] > 0:
            self.ema = EMA(self.config['ema_gamma'], self.network)
        self.para_swapped = False

    def setup_ema(self):
        if self.config['ema_opt']:
            self.ema.setup()

    def update_ema(self):
        if self.config['ema_opt']:
            self.ema.update()

    def eval(self):
        if self.config['ema_opt']:
            self.ema.swap_parameters()
            self.para_swapped = True

    def train(self):
        if self.para_swapped:
            self.ema.swap_parameters()
            self.para_swapped = False

    def _value_for(self, key, batch_data, batch_meta):
        return batch_data[batch_meta[key]] if key in batch_meta else None

    def update(self,
               batch_meta,
               batch_data,
               bin_parse_as=None,
               bin_parse_bs=None,
               parse_as_mask=None,
               parse_bs_mask=None,
               generic_features=None,
               domain_features=None):
        self.network.train()
        labels = batch_data[batch_meta['label']]
        if batch_meta['pairwise']:
            labels = labels.contiguous().view(
                -1, batch_meta['pairwise_size'])[:, 0]
        if self.config['cuda']:
            y = Variable(labels.cuda(non_blocking=True), requires_grad=False)
        else:
            y = Variable(labels, requires_grad=False)
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        logits = self.mnetwork(
            *inputs,
            bin_parse_as=self._value_for('parse_ids_a', batch_data,
                                         batch_meta),
            bin_parse_bs=self._value_for('parse_ids_b', batch_data,
                                         batch_meta),
            parse_as_mask=self._value_for('parse_masks_a', batch_data,
                                          batch_meta),
            parse_bs_mask=self._value_for('parse_masks_b', batch_data,
                                          batch_meta),
            generic_features=self._value_for('generic_features', batch_data,
                                             batch_meta),
            domain_features=self._value_for('domain_features', batch_data,
                                            batch_meta))

        if batch_meta['pairwise']:
            logits = logits.view(-1, batch_meta['pairwise_size'])

        if self.config.get('weighted_on', False):
            if self.config['cuda']:
                weight = Variable(
                    batch_data[batch_meta['factor']].cuda(non_blocking=True))
            else:
                weight = Variable(batch_data[batch_meta['factor']])
            if task_type > 0:
                loss = torch.mean(
                    F.mse_loss(logits.squeeze(), y, reduction='none') * weight)
            else:
                loss = torch.mean(
                    F.cross_entropy(logits, y, reduction='none') * weight)
        else:
            if task_type > 0:
                loss = F.mse_loss(logits.squeeze(), y)
            else:
                loss = F.cross_entropy(logits, y)

        self.train_loss.update(loss.item(), logits.size(0))
        self.optimizer.zero_grad()

        loss.backward()
        if self.config['global_grad_clipping'] > 0:
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.config['global_grad_clipping'])
        self.optimizer.step()
        self.updates += 1
        self.update_ema()

    def predict(self,
                batch_meta,
                batch_data,
                bin_parse_as=None,
                bin_parse_bs=None,
                parse_as_mask=None,
                parse_bs_mask=None,
                generic_features=None,
                domain_features=None):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        with torch.no_grad():
            score = self.network(
                *inputs,
                bin_parse_as=self._value_for('parse_ids_a', batch_data,
                                             batch_meta),
                bin_parse_bs=self._value_for('parse_ids_b', batch_data,
                                             batch_meta),
                parse_as_mask=self._value_for('parse_masks_a', batch_data,
                                              batch_meta),
                parse_bs_mask=self._value_for('parse_masks_b', batch_data,
                                              batch_meta),
                generic_features=self._value_for('generic_features',
                                                 batch_data, batch_meta),
                domain_features=self._value_for('domain_features', batch_data,
                                                batch_meta))
        if batch_meta['pairwise']:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        else:
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta['label']

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        ema_state = dict([
            (k, v.cpu()) for k, v in self.ema.model.state_dict().items()
        ]) if self.ema is not None else dict()
        params = {
            'state': network_state,
            #'optimizer': self.optimizer.state_dict(),
            'ema': ema_state,
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def cuda(self):
        self.network.cuda()
        if self.config['ema_opt']:
            self.ema.cuda()
Exemple #14
0
def train(args):

    label_name = ['fake', 'real']

    device = torch.device("cuda:0" if args['CUDA'] == 'gpu' else "cpu")

    prefix = args['MODEL'] + '_' + args['BERT_CONFIG']

    bert_size = args['BERT_CONFIG'].split('-')[1]

    start_time = time.time()
    print('Importing data...', file=sys.stderr)
    df_train = pd.read_csv(args['--train'], index_col=0)
    df_val = pd.read_csv(args['--dev'], index_col=0)

    train_label = dict(df_train.information_label.value_counts())

    print("Train label", train_label)

    label_max = float(max(train_label.values()))

    print("Label max", label_max)

    train_label_weight = torch.tensor(
        [label_max / train_label[i] for i in range(len(train_label))],
        device=device)

    print(train_label_weight)

    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    start_time = time.time()
    print('Set up model...', file=sys.stderr)

    if args['MODEL'] == 'cnn':
        model = CustomBertConvModel(args['BERT_CONFIG'],
                                    device,
                                    float(args['--dropout']),
                                    len(label_name),
                                    out_channel=int(args['--out-channel']))
        optimizer = BertAdam([{
            'params': model.bert.parameters()
        }, {
            'params': model.conv.parameters(),
            'lr': float(args['--lr'])
        }, {
            'params': model.hidden_to_softmax.parameters(),
            'lr': float(args['--lr'])
        }],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'lstm':
        model = CustomBertLSTMModel(args['BERT_CONFIG'],
                                    device,
                                    float(args['--dropout']),
                                    len(label_name),
                                    lstm_hidden_size=int(
                                        args['--hidden-size']))

        optimizer = BertAdam([{
            'params': model.bert.parameters()
        }, {
            'params': model.lstm.parameters(),
            'lr': float(args['--lr'])
        }, {
            'params': model.hidden_to_softmax.parameters(),
            'lr': float(args['--lr'])
        }],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    else:
        print("Please specify a valid MODEL: 'cnn' or 'lstm'", file=sys.stderr)
        exit(1)

    model = model.to(device)
    print('Use device: %s' % device, file=sys.stderr)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    model.train()

    cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight,
                                        reduction='mean')
    torch.save(cn_loss, 'loss_func')  # for later testing

    train_batch_size = int(args['--batch-size'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = prefix + '_model.bin'

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = 0
    cum_examples = report_examples = epoch = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood training...')

    while True:
        epoch += 1

        for sents, targets in batch_iter(df_train,
                                         batch_size=train_batch_size,
                                         shuffle=True,
                                         bert=bert_size):  # for each epoch
            train_iter += 1  # increase training iteration
            # set gradients to zero before starting to do backpropagation.
            # PyTorch accumulates the gradients on subsequent backward passes.
            optimizer.zero_grad()
            batch_size = len(sents)
            pre_softmax = model(sents).double()

            loss = cn_loss(
                pre_softmax,
                torch.tensor(targets, dtype=torch.long, device=device))
            # The gradients are "stored" by the tensors themselves once you call backwards
            # on the loss.
            loss.backward()
            # After computing the gradients for all tensors in the model, calling
            # optimizer.step() makes the optimizer iterate over all parameters
            # (tensors) it is supposed to update and use their internally stored
            # .grad to update their values.
            optimizer.step()

            # loss.item() is the mean loss over the mini-batch (cn_loss uses
            # reduction='mean'), so multiply by batch_size to recover the summed loss
            batch_losses_val = loss.item() * batch_size
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'cum. examples %d, speed %.2f examples/sec, '
                      'time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       cum_examples, report_examples /
                       (time.time() - train_time), time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. examples %d' %
                    (epoch, train_iter, cum_loss / cum_examples, cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = 0

                print('begin validation....', file=sys.stderr)

                validation_loss = validation(
                    model, df_val, bert_size, cn_loss,
                    device)  # dev batch size can be a bit larger

                print('validation: iter %d, loss %f' %
                      (train_iter, validation_loss),
                      file=sys.stderr)

                is_better = len(
                    hist_valid_scores
                ) == 0 or validation_loss < min(hist_valid_scores)
                hist_valid_scores.append(validation_loss)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)

                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')

                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        print(
                            'load previously best model and decay learning rate to %f%%'
                            % (float(args['--lr-decay']) * 100),
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] *= float(args['--lr-decay'])

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)


def train_topic(st, tt):

    device = torch.device("cuda:1")

    # with open('train_sem_mask.pickle', 'rb') as f:
    #     train_dataeval_mask_set = pickle.load(f)

    # with open('test_sem_mask.pickle', 'rb') as f:
    #     test_dataeval_mask_set = pickle.load(f)

    # with open('train_sem.pickle', 'rb') as f:
    #     train_dataeval_set = pickle.load(f)

    # with open('test_sem.pickle', 'rb') as f:
    #     test_dataeval_set = pickle.load(f)

    # with open('framenet.pickle', 'rb') as f:
    #     test_framenet_set = pickle.load(f)

    # with open('framenet_mask.pickle', 'rb') as f:
    #     test_framenet_mask_set = pickle.load(f)

    # with open('data_seen.pickle', 'rb') as f:
    #     data = pickle.load(f)
    # train_set, test_set = data['train'], data['test']

    # with open('data_seen_mask.pickle', 'rb') as f:
    #     data = pickle.load(f)
    # train_set_mask, test_set_mask = data['train'], data['test']

    ### Reading data...
    with open('data.pickle', 'rb') as f:
        data = pickle.load(f)
    # train_set, test_set = split_train_test(data)
    train_set = get_topic(data, st)
    test_set = get_topic(data, tt)

    with open('data_mask.pickle', 'rb') as f:
        data_mask = pickle.load(f)
    #train_set_mask, test_set_mask = split_train_test(data)
    train_set_mask = get_topic(data_mask, st)
    test_set_mask = get_topic(data_mask, tt)

    train_pair = list(zip(train_set, train_set_mask))
    train_pair = negative_sampling(train_pair, 0.8)
    train_set = [d[0] for d in train_pair]
    train_set_mask = [d[1] for d in train_pair]

    ###
    test_dataset = Dataset(10, test_set)
    test_dataset_mask = Dataset(10, test_set_mask)

    test_dataset_batch = [
        batch for batch in test_dataset.reader(device, False)
    ]
    test_dataset_mask_batch = [
        batch for batch in test_dataset_mask.reader(device, False)
    ]

    test_dataset_mix = list(zip(test_dataset_batch, test_dataset_mask_batch))

    ###
    train_dataset = Dataset(20, train_set)
    train_dataset_mask = Dataset(20, train_set_mask)

    train_dataset_batch = [
        batch for batch in train_dataset.reader(device, False)
    ]
    train_dataset_mask_batch = [
        batch for batch in train_dataset_mask.reader(device, False)
    ]

    train_dataset_mix = list(zip(train_dataset_batch,
                                 train_dataset_mask_batch))

    model = BertCausalModel(3).to(device)
    model_mask = BertCausalModel(3).to(device)

    learning_rate = 1e-5
    optimizer = BertAdam(model.parameters(), lr=learning_rate)
    optimizer_mask = BertAdam(model_mask.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')

    for _ in range(0, 20):
        idx = 0
        for batch, batch_mask in tqdm(train_dataset_mix,
                                      mininterval=2,
                                      total=len(train_dataset_mix),
                                      file=sys.stdout,
                                      ncols=80):
            idx += 1
            model.train()
            model_mask.train()
            sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask, data_y, _ = batch
            sentences_s_mask = batch_mask[0]

            opt = model.forward_logits(sentences_s, mask_s, sentences_t,
                                       mask_t, event1, event1_mask, event2,
                                       event2_mask)
            opt_mask = model_mask.forward_logits(sentences_s_mask, mask_s,
                                                 sentences_t, mask_t, event1,
                                                 event1_mask, event2,
                                                 event2_mask)

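            # Fuse the plain and masked encoders: their event-pair representations
            # are concatenated and classified by a shared head (additional_fc)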
            opt_mix = torch.cat([opt, opt_mask], dim=-1)
            logits = model.additional_fc(opt_mix)
            loss = loss_fn(logits, data_y)

            optimizer.zero_grad()
            optimizer_mask.zero_grad()
            loss.backward()
            optimizer.step()
            optimizer_mask.step()

        model.eval()
        model_mask.eval()
        with torch.no_grad():
            predicted_all = []
            gold_all = []
            for batch, batch_mask in test_dataset_mix:
                sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask, data_y, _ = batch
                sentences_s_mask = batch_mask[0]

                opt = model.forward_logits(sentences_s, mask_s, sentences_t,
                                           mask_t, event1, event1_mask, event2,
                                           event2_mask)
                opt_mask = model_mask.forward_logits(sentences_s_mask, mask_s,
                                                     sentences_t, mask_t,
                                                     event1, event1_mask,
                                                     event2, event2_mask)

                opt_mix = torch.cat([opt, opt_mask], dim=-1)
                logits = model.additional_fc(opt_mix)

                predicted = torch.argmax(logits, -1)
                predicted = list(predicted.cpu().numpy())
                predicted_all += predicted

                gold = list(data_y.cpu().numpy())
                gold_all += gold
            p, r, f = compute_f1(gold_all, predicted_all)
            print(p, r, f)
            print('Here')
Exemple #16
0
class ClassificationModel:
    def __init__(self, bert_model=config.bert_model, gpu=False, seed=0):

        self.gpu = gpu
        self.bert_model = bert_model

        self.train_df = data_reader.load_train_dataset(config.data_path)
        self.val_df = data_reader.load_dev_dataset(config.data_path)
        self.test_df = data_reader.load_test_dataset(config.data_path)

        self.num_classes = len(LABELS)

        self.model = None
        self.optimizer = None
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

        # to plot loss during training process
        self.plt_x = []
        self.plt_y = []

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if self.gpu:
            torch.cuda.manual_seed_all(seed)

    def __init_model(self):
        if self.gpu:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.model.to(self.device)
        print(torch.cuda.memory_allocated(self.device))
        # log available cuda
        if self.device.type == 'cuda':
            print(torch.cuda.get_device_name(0))
            print('Memory Usage:')
            print('Allocated:',
                  round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
            print('Cached:   ',
                  round(torch.cuda.memory_cached(0) / 1024**3, 1), 'GB')

    def new_model(self):
        self.model = BertForSequenceClassification.from_pretrained(
            self.bert_model, num_labels=self.num_classes)
        self.__init_model()

    def load_model(self, path_model, path_config):
        self.model = BertForSequenceClassification(BertConfig(path_config),
                                                   num_labels=self.num_classes)
        self.model.load_state_dict(torch.load(path_model))
        self.__init_model()

    def save_model(self, path_model, path_config, epoch_n, acc, f1):

        if not os.path.exists(path_model):
            os.makedirs(path_model)

        model_save_path = os.path.join(
            path_model, 'model_{:.4f}_{:.4f}_{:.4f}'.format(epoch_n, acc, f1))

        torch.save(self.model.state_dict(), model_save_path)

        if not os.path.exists(path_config):
            os.makedirs(path_config)

        model_config_path = os.path.join(path_config, 'config.cf')
        with open(model_config_path, 'w') as f:
            f.write(self.model.config.to_json_string())

    def train(self,
              epochs,
              batch_size=config.batch_size,
              lr=config.lr,
              plot_path=None,
              model_path=None,
              config_path=None):

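        # Standard BERT fine-tuning recipe: exclude bias and LayerNorm parameters from weight decay.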
        model_params = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model_params
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in model_params if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        self.optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=lr,
            warmup=0.1,
            t_total=int(len(self.train_df) / batch_size) * epochs)

        nb_tr_steps = 0
        train_features = data_reader.convert_examples_to_features(
            self.train_df, config.MAX_SEQ_LENGTH, self.tokenizer)

        # create tensor of all features
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)

        # eval dataloader
        eval_features = data_reader.convert_examples_to_features(
            self.val_df, config.MAX_SEQ_LENGTH, self.tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)

        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=batch_size)

        # class weighting
        _, counts = np.unique(self.train_df['label'], return_counts=True)
        class_weights = [sum(counts) / c for c in counts]
        # assign a weight to each input sample (inverse class frequency)
        example_weights = [class_weights[e] for e in self.train_df['label']]
        sampler = WeightedRandomSampler(example_weights,
                                        len(self.train_df['label']))
        train_dataloader = DataLoader(train_data,
                                      sampler=sampler,
                                      batch_size=batch_size)

        self.model.train()
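        # Validation and checkpointing run at the start of every epoch after the first,
        # using the metrics produced by the previous epoch's weights.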
        for e in range(epochs):
            print("Epoch {}".format(e))
            if e != 0:
                f1, acc = self.val(eval_dataloader)
                print("\nF1 score: {}, Accuracy: {}".format(f1, acc))
            if model_path is not None and config_path is not None:
                if e != 0:
                    self.save_model(model_path, config_path, e, acc, f1)
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                loss = self.model(input_ids, segment_ids, input_mask,
                                  label_ids)
                loss.backward()

                #if plot_path is not None:
                #    self.plt_y.append(loss.item())
                #    self.plt_x.append(nb_tr_steps)
                #    self.save_plot(plot_path)

                nb_tr_steps += 1
                self.optimizer.step()
                self.optimizer.zero_grad()

                if self.gpu:
                    torch.cuda.empty_cache()

    def val(self, eval_dataloader, batch_size=config.batch_size):

        f1, acc = 0, 0
        nb_eval_examples = 0

        for input_ids, input_mask, segment_ids, gnd_labels in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)

            with torch.no_grad():
                logits = self.model(input_ids, segment_ids, input_mask)

            predicted_labels = np.argmax(logits.detach().cpu().numpy(), axis=1)
            acc += np.sum(predicted_labels == gnd_labels.numpy())
            tmp_eval_f1 = f1_score(predicted_labels,
                                   gnd_labels,
                                   average='macro')
            f1 += tmp_eval_f1 * input_ids.size(0)
            nb_eval_examples += input_ids.size(0)

        return f1 / nb_eval_examples, acc / nb_eval_examples

    def save_plot(self, path):

        fig, ax = plt.subplots()
        ax.plot(self.plt_x, self.plt_y)

        ax.set(xlabel='Training steps', ylabel='Loss')

        fig.savefig(path)
        plt.close()

    def create_test_predictions(self, path):
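        # Note: this method assumes self.x_test and self.x_test_ids are populated elsewhere;
        # they are not created in __init__ (which only loads self.test_df).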
        tests_features = data_reader.convert_examples_to_features(
            self.x_test, [-1] * len(self.test_df), config.MAX_SEQ_LENGTH,
            self.tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in tests_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in tests_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in tests_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in tests_features],
                                     dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)

        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=16)

        predictions = []
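        # Map predicted label ids back to label names (assumes LABELS is an iterable of
        # (name, id) pairs; use LABELS.items() here if it is a dict).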
        inverse_labels = {v: k for k, v in LABELS}

        for input_ids, input_mask, segment_ids, gnd_labels in tqdm(
                test_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)

            with torch.no_grad():
                encoded_layers, logits = self.model(input_ids, segment_ids,
                                                    input_mask)

            predictions += [
                inverse_labels[p]
                for p in list(np.argmax(logits.detach().cpu().numpy(), axis=1))
            ]
        with open(path, "w") as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            for i, prediction in enumerate(predictions):
                writer.writerow([int(self.x_test_ids[i]), prediction])

        return predictions
Exemple #17
0
class MTDNNModel(object):
    def __init__(self, opt, state_dict=None, num_train_step=-1):
        self.config = opt
        self.updates = state_dict[
            'updates'] if state_dict and 'updates' in state_dict else 0
        self.train_loss = AverageMeter()
        self.network = SANBertNetwork(opt)

        if state_dict:
            new_state = set(self.network.state_dict().keys())
            for k in list(state_dict['state'].keys()):
                if k not in new_state:
                    del state_dict['state'][k]
            for k, v in list(self.network.state_dict().items()):
                if k not in state_dict['state']:
                    state_dict['state'][k] = v
            self.network.load_state_dict(state_dict['state'])
        self.mnetwork = nn.DataParallel(
            self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])

        no_decay = [
            'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'
        ]

        optimizer_parameters = [{
            'params': [
                p for n, p in self.network.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in self.network.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        # note that adamax are modified based on the BERT code
        if opt['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(optimizer_parameters,
                                       opt['learning_rate'],
                                       weight_decay=opt['weight_decay'])

        elif opt['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    opt['learning_rate'],
                                    warmup=opt['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=opt['grad_clipping'],
                                    schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        elif opt['optimizer'] == 'adadelta':
            self.optimizer = optim.Adadelta(optimizer_parameters,
                                            opt['learning_rate'],
                                            rho=0.95)
        elif opt['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=opt['learning_rate'],
                                  warmup=opt['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=opt['grad_clipping'],
                                  schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

        if state_dict and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])

        if opt.get('have_lr_scheduler', False):
            if opt.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode='max',
                                                   factor=opt['lr_gamma'],
                                                   patience=3)
            elif opt.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=opt.get('lr_gamma', 0.95))
            else:
                milestones = [
                    int(step)
                    for step in opt.get('multi_step_lr', '10,20,30').split(',')
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=opt.get('lr_gamma'))
        else:
            self.scheduler = None
        self.ema = None
        if opt['ema_opt'] > 0:
            self.ema = EMA(self.config['ema_gamma'], self.network)
        self.para_swapped = False

    def setup_ema(self):
        if self.config['ema_opt']:
            self.ema.setup()

    def update_ema(self):
        if self.config['ema_opt']:
            self.ema.update()

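    # eval()/train() below swap the EMA-averaged parameters in and out of the network
    # when exponential moving averaging is enabled.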
    def eval(self):
        if self.config['ema_opt']:
            self.ema.swap_parameters()
            self.para_swapped = True

    def train(self):
        if self.para_swapped:
            self.ema.swap_parameters()
            self.para_swapped = False

    def update(self, batch_meta, batch_data):
        self.network.train()
        labels = batch_data[batch_meta['label']]
        soft_labels = None
        temperature = 1.0
        if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta):
            soft_labels = batch_meta['soft_label']

        if batch_meta['pairwise']:
            labels = labels.contiguous().view(
                -1, batch_meta['pairwise_size'])[:, 0]
        if self.config['cuda']:
            y = Variable(labels.cuda(non_blocking=True), requires_grad=False)
        else:
            y = Variable(labels, requires_grad=False)
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        logits = self.mnetwork(*inputs)
        if batch_meta['pairwise']:
            logits = logits.view(-1, batch_meta['pairwise_size'])

        if self.config.get('weighted_on', False):
            if self.config['cuda']:
                weight = Variable(
                    batch_data[batch_meta['factor']].cuda(non_blocking=True))
            else:
                weight = Variable(batch_data[batch_meta['factor']])
            if task_type > 0:
                loss = torch.mean(
                    F.mse_loss(logits.squeeze(), y, reduce=False) * weight)
            else:
                loss = torch.mean(
                    F.cross_entropy(logits, y, reduce=False) * weight)
                if soft_labels is not None:
                    # compute KL
                    label_size = soft_labels.size(1)
                    kd_loss = F.kl_div(
                        F.log_softmax(logits.view(-1, label_size).float(), 1),
                        soft_labels) * label_size
                    loss = loss + kd_loss
        else:
            if task_type > 0:
                loss = F.mse_loss(logits.squeeze(), y)
            else:
                loss = F.cross_entropy(logits, y)
                if soft_labels is not None:
                    # compute KL
                    label_size = soft_labels.size(1)
                    # note that this kl_div returns the element-wise mean, so it must be multiplied by the label size
                    # (in PyTorch 1.x the same effect is achieved with reduction='batchmean')
                    # TODO: update the package to support the latest PyTorch (xiaodl)
                    kd_loss = F.kl_div(
                        F.log_softmax(logits.view(-1, label_size).float(), 1),
                        soft_labels) * label_size
                    loss = loss + kd_loss

        self.train_loss.update(loss.item(), logits.size(0))
        self.optimizer.zero_grad()

        loss.backward()
        if self.config['global_grad_clipping'] > 0:
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.config['global_grad_clipping'])
        self.optimizer.step()
        self.updates += 1
        self.update_ema()

    def predict(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if batch_meta['pairwise']:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        else:
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta['label']

    def extract(self, batch_meta, batch_data):
        self.network.eval()
        # 'token_id': 0; 'segment_id': 1; 'mask': 2
        inputs = batch_data[:3]
        all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs)
        return all_encoder_layers, pooled_output

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        ema_state = dict([
            (k, v.cpu()) for k, v in self.ema.model.state_dict().items()
        ]) if self.ema is not None else dict()
        params = {
            'state': network_state,
            'optimizer': self.optimizer.state_dict(),
            'ema': ema_state,
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def cuda(self):
        self.network.cuda()
        if self.config['ema_opt']:
            self.ema.cuda()
Exemple #18
0
class RelationModel(object):
    """ A wrapper class for the training and evaluation of models. """
    def __init__(self, opt, batch_num):
        self.opt = opt
        self.model = Extraction(opt)
        self.model.cuda()

        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate':
            0.0
        }]
        num_train_optimization_steps = batch_num * (opt['num_epoch'] + 1)
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=opt['lr'],
                                  warmup=0.1,
                                  t_total=num_train_optimization_steps)
        self.bce = nn.BCELoss(reduction='none')

        self.ema = layers.EMA(self.model, opt['ema'])
        self.ema.register()

    def update(self, batch, i):
        """ Run a step of forward and backward model update. """
        if self.opt['cuda']:
            inputs = [Variable(torch.LongTensor(b).cuda()) for b in batch[:4]]
            o_labels = Variable(torch.FloatTensor(batch[4]).cuda())
            mask = Variable(torch.FloatTensor(batch[5]).cuda())

        # step forward
        self.model.train()
        self.optimizer.zero_grad()

        loss_mask = mask.unsqueeze(-1)
        o_probs = self.model(inputs, mask)

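        # Squaring the predicted probabilities pushes low-confidence scores towards zero
        # (assumed here to act as a sharpening heuristic; the same transform is applied at inference).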
        o_probs = o_probs.pow(2)

        o_loss = self.bce(o_probs, o_labels)  # .view(batch_size, seq_len, 2)
        o_loss = 0.5 * torch.sum(o_loss.mul(loss_mask)) / torch.sum(loss_mask)

        loss = o_loss

        # backward
        loss.backward()
        self.optimizer.step()

        self.ema.update()

        loss_val = loss.data.item()
        return loss_val

    def predict_obj_per_instance(self, inputs, mask, user_cuda=None):
        """ Run forward prediction. If unsort is True, recover the original order of the batch. """
        if self.opt['cuda']:
            if user_cuda is None:
                inputs = [Variable(torch.LongTensor(b).cuda()) for b in inputs]
                mask = Variable(torch.FloatTensor(mask).cuda())
            else:
                inputs = [
                    Variable(torch.LongTensor(b).cuda(user_cuda))
                    for b in inputs
                ]
                mask = Variable(torch.FloatTensor(mask).cuda(user_cuda))

        self.model.eval()

        words, distance_to_s, s_start, s_end = inputs
        hidden, sentence_rep = self.model.based_encoder(words)

        o_probs = self.model.o_sublayer(hidden, sentence_rep, distance_to_s,
                                        s_start, s_end, mask)

        o_probs = o_probs.pow(2)

        o_probs = o_probs.mul(mask.unsqueeze(-1)).data.cpu().numpy()

        return o_probs

    def update_lr(self, new_lr):
        torch_utils.change_lr(self.optimizer, new_lr)

    def save(self, filename, epoch):
        params = {
            'model': self.model.state_dict(),
            'config': self.opt,
            'epoch': epoch
        }
        try:
            torch.save(params, filename)
            print("model saved to {}".format(filename))
        except BaseException:
            print("[Warning: Saving failed... continuing anyway.]")

    def load(self, filename):
        try:
            checkpoint = torch.load(filename)
        except BaseException:
            print("Cannot load model from {}".format(filename))
            exit()
        self.model.load_state_dict(checkpoint['model'])
        self.opt = checkpoint['config']
Exemple #19
0
class MTDNNModel(object):
    def __init__(self, opt, state_dict=None, num_train_step=-1):
        self.config = opt
        self.updates = state_dict[
            'updates'] if state_dict and 'updates' in state_dict else 0
        self.local_updates = 0
        self.train_loss = AverageMeter()
        self.network = SANBertNetwork(opt)

        if state_dict:
            self.network.load_state_dict(state_dict['state'], strict=False)
        self.mnetwork = nn.DataParallel(
            self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])
        if opt['cuda']:
            self.network.cuda()

        no_decay = [
            'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'
        ]

        optimizer_parameters = [{
            'params': [
                p for n, p in self.network.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in self.network.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        # note that adamax are modified based on the BERT code
        if opt['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(optimizer_parameters,
                                       opt['learning_rate'],
                                       weight_decay=opt['weight_decay'])

        elif opt['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    opt['learning_rate'],
                                    warmup=opt['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=opt['grad_clipping'],
                                    schedule=opt['warmup_schedule'],
                                    weight_decay=opt['weight_decay'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        elif opt['optimizer'] == 'radam':
            self.optimizer = RAdam(optimizer_parameters,
                                   opt['learning_rate'],
                                   warmup=opt['warmup'],
                                   t_total=num_train_step,
                                   max_grad_norm=opt['grad_clipping'],
                                   schedule=opt['warmup_schedule'],
                                   eps=opt['adam_eps'],
                                   weight_decay=opt['weight_decay'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
            # The current radam does not support FP16.
            opt['fp16'] = False
        elif opt['optimizer'] == 'adadelta':
            self.optimizer = optim.Adadelta(optimizer_parameters,
                                            opt['learning_rate'],
                                            rho=0.95)
        elif opt['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=opt['learning_rate'],
                                  warmup=opt['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=opt['grad_clipping'],
                                  schedule=opt['warmup_schedule'],
                                  weight_decay=opt['weight_decay'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

        if state_dict and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])

        if opt['fp16']:
            try:
                from apex import amp
                global amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(self.network,
                                              self.optimizer,
                                              opt_level=opt['fp16_opt_level'])
            self.network = model
            self.optimizer = optimizer

        if opt.get('have_lr_scheduler', False):
            if opt.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode='max',
                                                   factor=opt['lr_gamma'],
                                                   patience=3)
            elif opt.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=opt.get('lr_gamma', 0.95))
            else:
                milestones = [
                    int(step)
                    for step in opt.get('multi_step_lr', '10,20,30').split(',')
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=opt.get('lr_gamma'))
        else:
            self.scheduler = None

        self.ema = None
        if opt['ema_opt'] > 0:
            self.ema = EMA(self.config['ema_gamma'], self.network)
            if opt['cuda']:
                self.ema.cuda()

        self.para_swapped = False
        # zero optimizer grad
        self.optimizer.zero_grad()

    def setup_ema(self):
        if self.config['ema_opt']:
            self.ema.setup()

    def update_ema(self):
        if self.config['ema_opt']:
            self.ema.update()

    def eval(self):
        if self.config['ema_opt']:
            self.ema.swap_parameters()
            self.para_swapped = True

    def train(self):
        if self.para_swapped:
            self.ema.swap_parameters()
            self.para_swapped = False

    def update(self, batch_meta, batch_data):
        self.network.train()
        labels = batch_data[batch_meta['label']]
        soft_labels = None
        if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta):
            soft_labels = batch_meta['soft_label']

        task_type = batch_meta['task_type']
        if task_type == TaskType.Span:
            start = batch_data[batch_meta['start']]
            end = batch_data[batch_meta['end']]
            if self.config["cuda"]:
                start = start.cuda(non_blocking=True)
                end = end.cuda(non_blocking=True)
            start.requires_grad = False
            end.requires_grad = False
        else:
            y = labels
            if task_type == TaskType.Ranking:
                y = y.contiguous().view(-1, batch_meta['pairwise_size'])[:, 0]
            if self.config['cuda']:
                y = y.cuda(non_blocking=True)
            y.requires_grad = False

        task_id = batch_meta['task_id']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)

        if self.config.get('weighted_on', False):
            if self.config['cuda']:
                weight = batch_data[batch_meta['factor']].cuda(
                    non_blocking=True)
            else:
                weight = batch_data[batch_meta['factor']]

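        # Span extraction: average the cross-entropy losses over the start and end positions.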
        if task_type == TaskType.Span:
            start_logits, end_logits = self.mnetwork(*inputs)
            ignored_index = start_logits.size(1)
            start.clamp_(0, ignored_index)
            end.clamp_(0, ignored_index)
            if self.config.get('weighted_on', False):
                loss = torch.mean(F.cross_entropy(start_logits, start, reduce=False) * weight) + \
                    torch.mean(F.cross_entropy(end_logits, end, reduce=False) * weight)
            else:
                loss = F.cross_entropy(start_logits, start, ignore_index=ignored_index) + \
                    F.cross_entropy(end_logits, end, ignore_index=ignored_index)
            loss = loss / 2
        elif task_type == TaskType.SequenceLabeling:
            y = y.view(-1)
            logits = self.mnetwork(*inputs)
            loss = F.cross_entropy(logits, y, ignore_index=-1)
        else:
            logits = self.mnetwork(*inputs)
            if task_type == TaskType.Ranking:
                logits = logits.view(-1, batch_meta['pairwise_size'])
            if self.config.get('weighted_on', False):
                if task_type == TaskType.Regression:
                    loss = torch.mean(
                        F.mse_loss(logits.squeeze(), y, reduce=False) * weight)
                else:
                    loss = torch.mean(
                        F.cross_entropy(logits, y, reduce=False) * weight)
                    if soft_labels is not None:
                        # compute KL
                        label_size = soft_labels.size(1)
                        kd_loss = F.kl_div(F.log_softmax(
                            logits.view(-1, label_size).float(), 1),
                                           soft_labels,
                                           reduction='batchmean')
                        loss = loss + kd_loss
            else:
                if task_type == TaskType.Regression:
                    loss = F.mse_loss(logits.squeeze(), y)
                else:
                    loss = F.cross_entropy(logits, y)
                    if soft_labels is not None:
                        # compute KL
                        label_size = soft_labels.size(1)
                        kd_loss = F.kl_div(F.log_softmax(
                            logits.view(-1, label_size).float(), 1),
                                           soft_labels,
                                           reduction='batchmean')
                        loss = loss + kd_loss

        self.train_loss.update(loss.item(), logits.size(0))
        # scale loss
        loss = loss / self.config.get('grad_accumulation_step', 1)
        if self.config['fp16']:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.local_updates += 1
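        # Gradient accumulation: clip and step the optimizer only every grad_accumulation_step mini-batches.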
        if self.local_updates % self.config.get('grad_accumulation_step',
                                                1) == 0:
            if self.config['global_grad_clipping'] > 0:
                if self.config['fp16']:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer),
                        self.config['global_grad_clipping'])
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self.network.parameters(),
                        self.config['global_grad_clipping'])

            self.updates += 1
            # step the optimizer, then clear the accumulated gradients
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.update_ema()

    def predict(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if task_type == TaskType.Ranking:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            assert task_type == TaskType.Ranking
            score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        elif task_type == TaskType.SequenceLabeling:
            mask = batch_data[batch_meta['mask']]
            score = score.contiguous()
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).reshape(mask.size()).tolist()
            valid_length = mask.sum(1).tolist()
            final_predict = []
            for idx, p in enumerate(predict):
                final_predict.append(p[:valid_length[idx]])
            score = score.reshape(-1).tolist()
            return score, final_predict, batch_meta['label']
        else:
            if task_type == TaskType.Classification:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta['label']

    def extract(self, batch_meta, batch_data):
        self.network.eval()
        # 'token_id': 0; 'segment_id': 1; 'mask': 2
        inputs = batch_data[:3]
        all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs)
        return all_encoder_layers, pooled_output

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        ema_state = dict([
            (k, v.cpu()) for k, v in self.ema.model.state_dict().items()
        ]) if self.ema is not None else dict()
        params = {
            'state': network_state,
            'optimizer': self.optimizer.state_dict(),
            'ema': ema_state,
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def load(self, checkpoint):

        model_state_dict = torch.load(checkpoint)
        if model_state_dict['config']['init_checkpoint'].rsplit('/', 1)[1] != \
                self.config['init_checkpoint'].rsplit('/', 1)[1]:
            logger.error(
                '*** SANBert network is pretrained on a different Bert Model. Please use that to fine-tune for other tasks. ***'
            )
            sys.exit()

        self.network.load_state_dict(model_state_dict['state'], strict=False)
        self.optimizer.load_state_dict(model_state_dict['optimizer'])
        self.config = model_state_dict['config']
        if self.ema:
            self.ema.model.load_state_dict(model_state_dict['ema'])

    def cuda(self):
        self.network.cuda()
        if self.config['ema_opt']:
            self.ema.cuda()
Exemple #20
0
def main():

    torch.cuda.empty_cache()

    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    print('Found GPU at: {}'.format(device_name))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    #model.cuda()
    scores_train = []
    first_sent_train = []
    second_sent_train = []

    scores_test = []
    first_sent_test = []
    second_sent_test = []

    sent_pairs = []

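    # Each tab-separated line is expected to hold a gold similarity score in column 4 and the two
    # sentences in columns 5 and 6 (STS-style); scores <= 2.5 are binarised to label 0, otherwise 1.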
    with open(train_data_path, encoding='utf-8') as fin:
        train_data = fin.read().split('\n')
    train_data = [line for line in train_data if line.strip()]
    for line in train_data:
        pair = []
        line1 = line.split('\t')
        if float(line1[4]) <= 2.5:
            scores_train.append(0)
        else:
            scores_train.append(1)
        first_sent_train.append(line1[5])
        second_sent_train.append(line1[6])
        pair.append(str(line1[5]))
        pair.append(str(line1[6]))
        sent_pairs.append(pair)

    with open(test_data_path, encoding='utf-8') as fin:
        test_data = fin.read().split('\n')
    test_data = [line for line in test_data if line.strip()]
    for line in test_data:
        line1 = line.split('\t')
        if float(line1[4]) <= 2.5:
            scores_test.append(0)
        else:
            scores_test.append(1)
        first_sent_test.append(line1[5])
        second_sent_test.append(line1[6])

    pairs_train = []
    pairs_test = []
    segment_ids_train = []
    segment_ids_test = []
    tokenized_pairs_train = []
    tokenized_pairs_test = []

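    # Build BERT sentence-pair inputs: [CLS] sent1 [SEP] sent2 [SEP], with segment id 0 for the
    # first sentence (including its [SEP]) and 1 for the second.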
    for sent1, sent2 in zip(first_sent_train, second_sent_train):
        token1 = tokenizer.tokenize(sent1)
        token2 = tokenizer.tokenize(sent2)
        pair_tokens = []
        pair_segment_ids = []
        pair_tokens.append("[CLS]")
        pair_segment_ids.append(0)
        for t in token1:
            pair_tokens.append(t)
            pair_segment_ids.append(0)
        pair_tokens.append('[SEP]')
        pair_segment_ids.append(0)
        for t in token2:
            pair_tokens.append(t)
            pair_segment_ids.append(1)
        pair_tokens.append('[SEP]')
        pair_segment_ids.append(1)
        tokenized_pairs_train.append(pair_tokens)
        segment_ids_train.append(pair_segment_ids)

    for sent1, sent2 in zip(first_sent_test, second_sent_test):
        token1 = tokenizer.tokenize(sent1)
        token2 = tokenizer.tokenize(sent2)
        pair_tokens = []
        pair_segment_ids = []
        pair_tokens.append("[CLS]")
        pair_segment_ids.append(0)
        for t in token1:
            pair_tokens.append(t)
            pair_segment_ids.append(0)
        pair_tokens.append('[SEP]')
        pair_segment_ids.append(0)
        for t in token2:
            pair_tokens.append(t)
            pair_segment_ids.append(1)
        pair_tokens.append('[SEP]')
        pair_segment_ids.append(1)
        tokenized_pairs_test.append(pair_tokens)
        segment_ids_test.append(pair_segment_ids)

    print("the first tokenized pair:")
    print(tokenized_pairs_train[0])
    print("the first segment ids:")
    print(segment_ids_train[0])

    input_ids_train = [
        tokenizer.convert_tokens_to_ids(x) for x in tokenized_pairs_train
    ]
    input_ids_train = pad_sequences(input_ids_train,
                                    maxlen=MAX_LEN,
                                    dtype="long",
                                    truncating="post",
                                    padding="post")
    input_ids_test = [
        tokenizer.convert_tokens_to_ids(x) for x in tokenized_pairs_test
    ]
    input_ids_test = pad_sequences(input_ids_test,
                                   maxlen=MAX_LEN,
                                   dtype="long",
                                   truncating="post",
                                   padding="post")
    segment_ids_train = pad_sequences(segment_ids_train,
                                      maxlen=MAX_LEN,
                                      dtype="long",
                                      truncating="post",
                                      padding="post")
    segment_ids_test = pad_sequences(segment_ids_test,
                                     maxlen=MAX_LEN,
                                     dtype="long",
                                     truncating="post",
                                     padding="post")

    #encoded = [tokenizer.encode(s, add_special_tokens=True) for s in sent_pairs]
    #input_ids2 = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in sent_pairs]).unsqueeze(0)

    attention_masks_train = []
    attention_masks_test = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids_train:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks_train.append(seq_mask)
    for seq in input_ids_test:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks_test.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model

    train_inputs = torch.tensor(input_ids_train).to(torch.int64)
    validation_inputs = torch.tensor(input_ids_test).to(torch.int64)
    train_labels = torch.tensor(scores_train).float()
    validation_labels = torch.tensor(scores_test).float()
    train_masks = torch.tensor(attention_masks_train).to(torch.int64)
    validation_masks = torch.tensor(attention_masks_test).to(torch.int64)
    segment_ids_train = torch.tensor(segment_ids_train).to(torch.int64)
    segment_ids_test = torch.tensor(segment_ids_test).to(torch.int64)

    # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
    # with an iterator the entire dataset does not need to be loaded into memory

    train_data = TensorDataset(train_inputs, train_masks, train_labels,
                               segment_ids_train)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels, segment_ids_test)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    #BertPreTrainedModel = BertModel.from_pretrained('bert-base-uncased')

    model = BertSimilarity.from_pretrained('bert-base-uncased')
    model = model.cuda()

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=.1)

    # Store our loss and accuracy for plotting
    train_loss_set = []
    accuracy = {}

    # trange is a tqdm wrapper around the normal python range
    for _ in trange(epochs, desc="Epoch"):

        # Training

        # Tracking variables
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        # Train the data for one epoch
        for step, batch in enumerate(train_dataloader):
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_segment_ids = batch
            # Clear out the gradients (by default they accumulate)
            optimizer.zero_grad()
            # Forward pass
            probs = model(b_input_ids,
                          attention_mask=b_input_mask,
                          token_type_ids=b_segment_ids)
            loss_func = torch.nn.BCELoss()
            batch_loss = loss_func(probs, b_labels)

            train_loss_set.append(batch_loss.item())
            # Backward pass
            batch_loss.backward()
            # Update parameters and take a step using the computed gradient
            optimizer.step()

            # Update tracking variables
            tr_loss += batch_loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        accuracy['train_loss'] = tr_loss / nb_tr_steps

        # Validation

        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels, b_segment_ids = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():
                # Forward pass, calculate logit predictions
                sigmoid = model(b_input_ids,
                                attention_mask=b_input_mask,
                                token_type_ids=b_segment_ids)

            # Move logits and labels to CPU
            sigmoid = sigmoid.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy = flat_accuracy(sigmoid, label_ids)

            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

        accuracy['valid_accuracy'] = eval_accuracy / nb_eval_steps

        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

    print("Saving to output folder")

    acc_filename = os.path.join(model_save_path, 'accuracy.json')
    with open(acc_filename, 'w') as f:
        json.dump(accuracy, f)

    train_loss_filename = os.path.join(model_save_path, 'trainloss.txt')
    with open(train_loss_filename, 'w') as f:
        f.writelines('{}\n'.format(l) for l in train_loss_set)

    model_to_save = model.module if hasattr(
        model,
        'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    #storage_client = storage.Client()
    bucket_name = 'gs://gridspace-tts-data'
    #bucket = storage_client.get_bucket(bucket_name)

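    # Note: plain cp does not understand gs:// destinations; copying to the bucket normally
    # requires gsutil (e.g. 'gsutil cp -r <dir> gs://...') or the google-cloud-storage client.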
    cp_to_bucket_cmd = 'cp {} {}'.format(model_save_path, bucket_name)

    subprocess.check_call(cp_to_bucket_cmd, shell=True)
Exemple #21
0
class MTDNNModel(object):
    def __init__(self, opt, state_dict=None, num_train_step=-1):
        self.config = opt
        self.updates = state_dict[
            'updates'] if state_dict and 'updates' in state_dict else 0
        self.local_updates = 0
        self.train_loss = AverageMeter()
        self.network = SANBertNetwork(opt)
        if state_dict:
            self.network.load_state_dict(state_dict['state'], strict=False)
        self.mnetwork = nn.DataParallel(
            self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum([
            p.nelement() for p in self.network.parameters() if p.requires_grad
        ])
        if opt['cuda']:
            self.network.cuda()
        optimizer_parameters = self._get_param_groups()
        self._setup_optim(optimizer_parameters, state_dict, num_train_step)
        self.para_swapped = False
        self.optimizer.zero_grad()
        self._setup_lossmap(self.config)

    def _get_param_groups(self):
        no_decay = [
            'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'
        ]
        optimizer_parameters = [{
            'params': [
                p for n, p in self.network.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in self.network.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        return optimizer_parameters

    def _setup_optim(self,
                     optimizer_parameters,
                     state_dict=None,
                     num_train_step=-1):
        if self.config['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(
                optimizer_parameters,
                self.config['learning_rate'],
                weight_decay=self.config['weight_decay'])

        elif self.config['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    self.config['learning_rate'],
                                    warmup=self.config['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=self.config['grad_clipping'],
                                    schedule=self.config['warmup_schedule'],
                                    weight_decay=self.config['weight_decay'])
            if self.config.get('have_lr_scheduler', False):
                self.config['have_lr_scheduler'] = False
        elif self.config['optimizer'] == 'radam':
            self.optimizer = RAdam(optimizer_parameters,
                                   self.config['learning_rate'],
                                   warmup=self.config['warmup'],
                                   t_total=num_train_step,
                                   max_grad_norm=self.config['grad_clipping'],
                                   schedule=self.config['warmup_schedule'],
                                   eps=self.config['adam_eps'],
                                   weight_decay=self.config['weight_decay'])
            if self.config.get('have_lr_scheduler', False):
                self.config['have_lr_scheduler'] = False
            # The current radam does not support FP16.
            self.config['fp16'] = False
        elif self.config['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=self.config['learning_rate'],
                                  warmup=self.config['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=self.config['grad_clipping'],
                                  schedule=self.config['warmup_schedule'],
                                  weight_decay=self.config['weight_decay'])
            if self.config.get('have_lr_scheduler', False):
                self.config['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % self.config['optimizer'])

        if state_dict and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])

        if self.config['fp16']:
            try:
                from apex import amp
                global amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(
                self.network,
                self.optimizer,
                opt_level=self.config['fp16_opt_level'])
            self.network = model
            self.optimizer = optimizer

        if self.config.get('have_lr_scheduler', False):
            if self.config.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(
                    self.optimizer,
                    mode='max',
                    factor=self.config['lr_gamma'],
                    patience=3)
            elif self.config.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=self.config.get(
                                                   'lr_gamma', 0.95))
            else:
                milestones = [
                    int(step) for step in self.config.get(
                        'multi_step_lr', '10,20,30').split(',')
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=self.config.get('lr_gamma'))
        else:
            self.scheduler = None

    def _setup_lossmap(self, config):
        loss_types = config['loss_types']
        self.task_loss_criterion = []
        for idx, cs in enumerate(loss_types):
            assert cs is not None
            lc = LOSS_REGISTRY[cs](
                name='Loss func of task {}: {}'.format(idx, cs))
            self.task_loss_criterion.append(lc)

    def _setup_kd_lossmap(self, config):
        loss_types = config['kd_loss_types']
        self.kd_task_loss_criterion = []
        if config.get('mkd_opt', 0) > 0:
            for idx, cs in enumerate(loss_types):
                assert cs is not None
                lc = LOSS_REGISTRY[cs](
                    name='Loss func of task {}: {}'.format(idx, cs))
                self.kd_task_loss_criterion.append(lc)

    def train(self):
        if self.para_swapped:
            self.para_swapped = False

    def _to_cuda(self, tensor):
        if tensor is None: return tensor

        if isinstance(tensor, list) or isinstance(tensor, tuple):
            y = [e.cuda(non_blocking=True) for e in tensor]
            for e in y:
                e.requires_grad = False
        else:
            y = tensor.cuda(non_blocking=True)
            y.requires_grad = False
        return y

    def update(self, batch_meta, batch_data):
        self.network.train()
        y = batch_data[batch_meta['label']]
        soft_labels = None

        task_type = batch_meta['task_type']
        y = self._to_cuda(y) if self.config['cuda'] else y

        task_id = batch_meta['task_id']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        weight = None
        if self.config.get('weighted_on', False):
            if self.config['cuda']:
                weight = batch_data[batch_meta['factor']].cuda(
                    non_blocking=True)
            else:
                weight = batch_data[batch_meta['factor']]
        logits = self.mnetwork(*inputs)

        # compute loss
        loss = 0
        if self.task_loss_criterion[task_id] and (y is not None):
            loss = self.task_loss_criterion[task_id](logits,
                                                     y,
                                                     weight,
                                                     ignore_index=-1)

        # compute kd loss
        if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta):
            soft_labels = batch_meta['soft_label']
            soft_labels = self._to_cuda(
                soft_labels) if self.config['cuda'] else soft_labels
            kd_lc = self.kd_task_loss_criterion[task_id]
            kd_loss = kd_lc(logits, soft_labels, weight,
                            ignore_index=-1) if kd_lc else 0
            loss = loss + kd_loss

        self.train_loss.update(loss.item(),
                               batch_data[batch_meta['token_id']].size(0))
        # scale loss
        loss = loss / self.config.get('grad_accumulation_step', 1)
        if self.config['fp16']:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.local_updates += 1
        if self.local_updates % self.config.get('grad_accumulation_step',
                                                1) == 0:
            if self.config['global_grad_clipping'] > 0:
                if self.config['fp16']:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer),
                        self.config['global_grad_clipping'])
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self.network.parameters(),
                        self.config['global_grad_clipping'])
            self.updates += 1
            # reset number of the grad accumulation
            self.optimizer.step()
            self.optimizer.zero_grad()

    def predict(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if task_type == TaskType.Ranking:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            assert task_type == TaskType.Ranking
            score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        elif task_type == TaskType.SeqenceLabeling:
            mask = batch_data[batch_meta['mask']]
            score = score.contiguous()
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).reshape(mask.size()).tolist()
            valid_length = mask.sum(1).tolist()
            final_predict = []
            for idx, p in enumerate(predict):
                final_predict.append(p[:valid_length[idx]])
            score = score.reshape(-1).tolist()
            return score, final_predict, batch_meta['label']
        else:
            if task_type == TaskType.Classification:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
        return score, predict, batch_meta['label']

    def extract(self, batch_meta, batch_data):
        self.network.eval()
        # 'token_id': 0; 'segment_id': 1; 'mask': 2
        inputs = batch_data[:3]
        all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs)
        return all_encoder_layers, pooled_output

    def save(self, filename):
        network_state = dict([(k, v.cpu())
                              for k, v in self.network.state_dict().items()])
        params = {
            'state': network_state,
            'optimizer': self.optimizer.state_dict(),
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def load(self, checkpoint):
        model_state_dict = torch.load(checkpoint)
        if model_state_dict['config']['init_checkpoint'].rsplit('/', 1)[1] != \
                self.config['init_checkpoint'].rsplit('/', 1)[1]:
            logger.error(
                '*** SANBert network is pretrained on a different Bert Model. Please use that to fine-tune for other tasks. ***'
            )
            sys.exit()

        self.network.load_state_dict(model_state_dict['state'], strict=False)
        self.optimizer.load_state_dict(model_state_dict['optimizer'])
        self.config = model_state_dict['config']

    def cuda(self):
        self.network.cuda()
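
# Below is a minimal, self-contained sketch of the decay/no-decay parameter grouping
# that _get_param_groups builds above, paired here with a stock torch.optim.AdamW
# instead of the project's Adamax/RAdam/Adam wrappers; the toy model and optimizer
# choice are assumptions for illustration only.
import torch
import torch.nn as nn


def build_param_groups(model: nn.Module, weight_decay: float = 0.01):
    # Same split as _get_param_groups: biases and LayerNorm parameters get no decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decay_params = [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)]
    no_decay_params = [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)]
    return [{'params': decay_params, 'weight_decay': weight_decay},
            {'params': no_decay_params, 'weight_decay': 0.0}]


if __name__ == '__main__':
    toy_model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8), nn.Linear(8, 2))
    optimizer = torch.optim.AdamW(build_param_groups(toy_model), lr=5e-5)
    print([len(g['params']) for g in optimizer.param_groups])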
Exemple #22
0
class ClassificationModel:
    def __init__(self,
                 task,
                 val=0.1,
                 bert_model=BERT_MODEL,
                 gpu=False,
                 seed=0):
        self.gpu = gpu
        self.task = task
        self.bert_model = bert_model
        self.x_train, self.y_train = load_train_dataset(self.task)
        # sample one index set so that x_val and y_val stay aligned
        val_idx = np.random.choice(len(self.x_train),
                                   size=int(val * len(self.x_train)),
                                   replace=False)
        self.x_val = np.asarray(self.x_train)[val_idx]
        self.y_val = np.asarray(self.y_train)[val_idx]
        self.x_test_ids, self.x_test = load_test_dataset(self.task)
        self.num_classes = len(TASK_LABELS[task])

        self.model = None
        self.optimizer = None
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_model)

        self.plt_x = []
        self.plt_y = []

        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if self.gpu:
            torch.cuda.manual_seed_all(seed)

    def __init_model(self):
        if self.gpu:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.model.to(self.device)
        if self.gpu:
            print(torch.cuda.memory_allocated(self.device))

    def new_model(self):
        self.model = BertForSequenceClassification.from_pretrained(
            self.bert_model, num_labels=self.num_classes)
        self.__init_model()

    def load_model(self, path_model, path_config):
        self.model = BertForSequenceClassification(BertConfig(path_config),
                                                   num_labels=self.num_classes)
        self.model.load_state_dict(torch.load(path_model))
        self.__init_model()

    def save_model(self, path_model, path_config):
        torch.save(self.model.state_dict(), path_model)
        with open(path_config, 'w') as f:
            f.write(self.model.config.to_json_string())

    # noinspection PyArgumentList
    def train(self,
              epochs,
              plot_path,
              batch_size=32,
              lr=5e-5,
              model_path=None,
              config_path=None):
        model_params = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model_params
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in model_params if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=lr,
                                  warmup=0.1,
                                  t_total=int(len(self.x_train) / batch_size) *
                                  epochs)

        nb_tr_steps = 0
        train_features = convert_examples_to_features(self.x_train,
                                                      self.y_train,
                                                      MAX_SEQ_LENGTH,
                                                      self.tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        _, counts = np.unique(self.y_train, return_counts=True)
        class_weights = [sum(counts) / c for c in counts]
        example_weights = [class_weights[e] for e in self.y_train]
        sampler = WeightedRandomSampler(example_weights, len(self.y_train))
        train_dataloader = DataLoader(train_data,
                                      sampler=sampler,
                                      batch_size=batch_size)

        self.model.train()
        for e in range(epochs):
            print(f"Epoch {e}")
            f1, acc = self.val()
            print(f"\nF1 score: {f1}, Accuracy: {acc}")
            if model_path is not None and config_path is not None:
                self.save_model(model_path, config_path)
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                loss = self.model(input_ids, segment_ids, input_mask,
                                  label_ids)
                loss.backward()

                self.plt_y.append(loss.item())
                self.plt_x.append(nb_tr_steps)
                self.save_plot(plot_path)

                nb_tr_steps += 1
                self.optimizer.step()
                self.optimizer.zero_grad()

                if self.gpu:
                    torch.cuda.empty_cache()

    def val(self, batch_size=32, test=False):
        eval_features = convert_examples_to_features(self.x_val, self.y_val,
                                                     MAX_SEQ_LENGTH,
                                                     self.tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)

        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=batch_size)

        f1, acc = 0, 0
        nb_eval_examples = 0

        for input_ids, input_mask, segment_ids, gnd_labels in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)

            with torch.no_grad():
                logits = self.model(input_ids, segment_ids, input_mask)

            predicted_labels = np.argmax(logits.detach().cpu().numpy(), axis=1)
            acc += np.sum(predicted_labels == gnd_labels.numpy())
            tmp_eval_f1 = f1_score(gnd_labels.numpy(),
                                   predicted_labels,
                                   average='macro')
            f1 += tmp_eval_f1 * input_ids.size(0)
            nb_eval_examples += input_ids.size(0)

        return f1 / nb_eval_examples, acc / nb_eval_examples

    def save_plot(self, path):
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        ax.plot(self.plt_x, self.plt_y)

        ax.set(xlabel='Training steps', ylabel='Loss')

        fig.savefig(path)
        plt.close()

    def create_test_predictions(self, path):
        eval_features = convert_examples_to_features(self.x_test,
                                                     [-1] * len(self.x_test),
                                                     MAX_SEQ_LENGTH,
                                                     self.tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)

        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=16)

        predictions = []
        inverse_labels = {v: k for k, v in TASK_LABELS[self.task].items()}

        for input_ids, input_mask, segment_ids, gnd_labels in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)

            with torch.no_grad():
                logits = self.model(input_ids, segment_ids, input_mask)

            predictions += [
                inverse_labels[p]
                for p in list(np.argmax(logits.detach().cpu().numpy(), axis=1))
            ]
        with open(path, "w") as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            for i, prediction in enumerate(predictions):
                writer.writerow([int(self.x_test_ids[i]), prediction])

        return predictions
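
# The class-balanced sampling used in ClassificationModel.train() above can be
# exercised on its own; the sketch below reproduces the WeightedRandomSampler setup
# on toy tensors (the toy labels and features are assumptions, not project data).
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

labels = np.array([0, 0, 0, 0, 1, 1, 2])      # imbalanced toy labels
features = torch.randn(len(labels), 4)        # toy inputs

_, counts = np.unique(labels, return_counts=True)
class_weights = [sum(counts) / c for c in counts]        # rarer class -> larger weight
example_weights = [class_weights[y] for y in labels]
sampler = WeightedRandomSampler(example_weights, num_samples=len(labels))

loader = DataLoader(TensorDataset(features, torch.tensor(labels)),
                    sampler=sampler, batch_size=4)
for xb, yb in loader:
    print(yb.tolist())   # rare classes now appear about as often as common ones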
Exemple #23
0
class MTDNNModel(object):
    def __init__(self, opt, state_dict=None, num_train_step=-1):
        self.config = opt
        self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
        self.local_updates = 0
        self.train_loss = AverageMeter()
        self.initial_from_local = True if state_dict else False
        self.network = SANBertNetwork(opt, initial_from_local=self.initial_from_local)
        if state_dict:
            missing_keys, unexpected_keys = self.network.load_state_dict(state_dict['state'], strict=False)
        self.mnetwork = nn.DataParallel(self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum([p.nelement() for p in self.network.parameters() if p.requires_grad])
        if opt['cuda']:
            self.network.cuda()
        optimizer_parameters = self._get_param_groups()
        #print(optimizer_parameters)
        self._setup_optim(optimizer_parameters, state_dict, num_train_step) 
        self.para_swapped = False
        self.optimizer.zero_grad()
        self._setup_lossmap(self.config)
        self._setup_kd_lossmap(self.config)
        self._setup_adv_lossmap(self.config)
        self._setup_adv_training(self.config)


    def _setup_adv_training(self, config):
        self.adv_teacher = None
        if config.get('adv_train', False):
            self.adv_teacher = SmartPerturbation(config['adv_epsilon'],
                    config['multi_gpu_on'],
                    config['adv_step_size'],
                    config['adv_noise_var'],
                    config['adv_p_norm'],
                    config['adv_k'],
                    config['fp16'],
                    config['encoder_type'],
                    loss_map=self.adv_task_loss_criterion)


    def _get_param_groups(self):
        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in self.network.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in self.network.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def _setup_optim(self, optimizer_parameters, state_dict=None, num_train_step=-1):  # error occurs here
        if self.config['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(optimizer_parameters, self.config['learning_rate'],
                                       weight_decay=self.config['weight_decay'])

        elif self.config['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    self.config['learning_rate'],
                                    warmup=self.config['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=self.config['grad_clipping'],
                                    schedule=self.config['warmup_schedule'],
                                    weight_decay=self.config['weight_decay'])
            if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False
        elif self.config['optimizer'] == 'radam':
            self.optimizer = RAdam(optimizer_parameters,
                                    self.config['learning_rate'],
                                    warmup=self.config['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=self.config['grad_clipping'],
                                    schedule=self.config['warmup_schedule'],
                                    eps=self.config['adam_eps'],
                                    weight_decay=self.config['weight_decay'])
            if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False
            # The current radam does not support FP16.
            self.config['fp16'] = False
        elif self.config['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=self.config['learning_rate'],
                                  warmup=self.config['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=self.config['grad_clipping'],
                                  schedule=self.config['warmup_schedule'],
                                  weight_decay=self.config['weight_decay'])
            if self.config.get('have_lr_scheduler', False): self.config['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % self.config['optimizer'])
        print("="*50)
        #print(state_dict['optimizer'])
        if state_dict and 'optimizer' in state_dict:
            #print("Optimizer's state_dict:")
            #state_dict['optimizer']['param_groups'][0]['params']=state_dict['optimizer']['param_groups'][0]['params'][:77]
            #print(len(state_dict['optimizer']['param_groups'][0]['params']))
            #for var_name in state_dict['optimizer']:
            #    print(var_name, "\t", state_dict['optimizer'][var_name])
            #print(self.optimizer.state_dict()) ######
            #state_dict['optimizer'][var_name] =
            self.optimizer.load_state_dict(state_dict['optimizer']) ###여기서 Error

        if self.config['fp16']:
            try:
                from apex import amp
                global amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(self.network, self.optimizer, opt_level=self.config['fp16_opt_level'])
            self.network = model
            self.optimizer = optimizer

        if self.config.get('have_lr_scheduler', False):
            if self.config.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(self.optimizer, mode='max', factor=self.config['lr_gamma'], patience=3)
            elif self.config.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer, gamma=self.config.get('lr_gamma', 0.95))
            else:
                milestones = [int(step) for step in self.config.get('multi_step_lr', '10,20,30').split(',')]
                self.scheduler = MultiStepLR(self.optimizer, milestones=milestones, gamma=self.config.get('lr_gamma'))
        else:
            self.scheduler = None

    def _setup_lossmap(self, config):
        task_def_list: List[TaskDef] = config['task_def_list']
        self.task_loss_criterion = []
        for idx, task_def in enumerate(task_def_list):
            cs = task_def.loss
            lc = LOSS_REGISTRY[cs](name='Loss func of task {}: {}'.format(idx, cs))
            self.task_loss_criterion.append(lc)

    def _setup_kd_lossmap(self, config):
        task_def_list: List[TaskDef] = config['task_def_list']
        self.kd_task_loss_criterion = []
        if config.get('mkd_opt', 0) > 0:
            for idx, task_def in enumerate(task_def_list):
                cs = task_def.kd_loss
                assert cs is not None
                lc = LOSS_REGISTRY[cs](name='KD Loss func of task {}: {}'.format(idx, cs))
                self.kd_task_loss_criterion.append(lc)

    def _setup_adv_lossmap(self, config):
        task_def_list: List[TaskDef] = config['task_def_list']
        self.adv_task_loss_criterion = []
        if config.get('adv_train', False):
            for idx, task_def in enumerate(task_def_list):
                cs = task_def.adv_loss
                assert cs is not None
                lc = LOSS_REGISTRY[cs](name='Adv Loss func of task {}: {}'.format(idx, cs))
                self.adv_task_loss_criterion.append(lc)


    def train(self):
        if self.para_swapped:
            self.para_swapped = False

    def _to_cuda(self, tensor):
        if tensor is None: return tensor

        if isinstance(tensor, list) or isinstance(tensor, tuple):
            y = [e.cuda(non_blocking=True) for e in tensor]
            for e in y:
                e.requires_grad = False
        else:
            y = tensor.cuda(non_blocking=True)
            y.requires_grad = False
        return y

    def update(self, batch_meta, batch_data, weight_alpha):
        self.network.train()
        y = batch_data[batch_meta['label']]
        y = self._to_cuda(y) if self.config['cuda'] else y

        task_id = batch_meta['task_id']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        weight = None
        if self.config['itw_on']:
            if self.config['cuda']:
                weight = torch.FloatTensor([batch_meta['weight']]).cuda(non_blocking=True) * weight_alpha
            else:
                weight = batch_meta['weight'] * weight_alpha

        # fw to get logits
        logits = self.mnetwork(*inputs)

        # compute loss
        loss = 0
        if self.task_loss_criterion[task_id] and (y is not None):
            loss_criterion = self.task_loss_criterion[task_id]
            if isinstance(loss_criterion, RankCeCriterion) and batch_meta['pairwise_size'] > 1:
                # reshape the logits for ranking.
                loss = self.task_loss_criterion[task_id](logits, y, weight, ignore_index=-1, pairwise_size=batch_meta['pairwise_size'])
            else:
                loss = self.task_loss_criterion[task_id](logits, y, weight, ignore_index=-1)

        # compute kd loss
        if self.config.get('mkd_opt', 0) > 0 and ('soft_label' in batch_meta):
            soft_labels = batch_meta['soft_label']
            soft_labels = self._to_cuda(soft_labels) if self.config['cuda'] else soft_labels
            kd_lc = self.kd_task_loss_criterion[task_id]
            kd_loss = kd_lc(logits, soft_labels, weight, ignore_index=-1) if kd_lc else 0
            loss = loss + kd_loss

        # adv training
        if self.config.get('adv_train', False) and self.adv_teacher:
            # task info
            task_type = batch_meta['task_def']['task_type']
            adv_inputs = [self.mnetwork, logits] + inputs + [task_type, batch_meta.get('pairwise_size', 1)]
            adv_loss = self.adv_teacher.forward(*adv_inputs)
            loss = loss + self.config['adv_alpha'] * adv_loss

        self.train_loss.update(loss.item(), batch_data[batch_meta['token_id']].size(0))
        # scale loss
        loss = loss / self.config.get('grad_accumulation_step', 1)
        if self.config['fp16']:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.local_updates += 1
        if self.local_updates % self.config.get('grad_accumulation_step', 1) == 0:
            if self.config['global_grad_clipping'] > 0:
                if self.config['fp16']:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer),
                                                   self.config['global_grad_clipping'])
                else:
                    torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                                  self.config['global_grad_clipping'])
            self.updates += 1
            # reset number of the grad accumulation
            self.optimizer.step()
            self.optimizer.zero_grad()

    def encode(self, batch_meta, batch_data):
        self.network.eval()
        inputs = batch_data[:3]
        sequence_output = self.network.encode(*inputs)[0]
        return sequence_output

    # TODO: similar to the function extract; preserved since it is used by extractor.py.
    # Will be removed after migrating to the transformers package.
    def extract(self, batch_meta, batch_data):
        self.network.eval()
        # 'token_id': 0; 'segment_id': 1; 'mask': 2
        inputs = batch_data[:3]
        all_encoder_layers, pooled_output = self.mnetwork.bert(*inputs)
        return all_encoder_layers, pooled_output

    def predict(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_def = TaskDef.from_dict(batch_meta['task_def'])
        task_type = task_def.task_type
        task_obj = tasks.get_task_obj(task_def)
        
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if task_obj is not None:
            score, predict = task_obj.test_predict(score)
        elif task_type == TaskType.Ranking:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            assert task_type == TaskType.Ranking
            score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        elif task_type == TaskType.SeqenceLabeling:
            mask = batch_data[batch_meta['mask']]
            score = score.contiguous()
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).reshape(mask.size()).tolist()
            valid_length = mask.sum(1).tolist()
            final_predict = []
            for idx, p in enumerate(predict):
                final_predict.append(p[:valid_length[idx]])
            score = score.reshape(-1).tolist()
            return score, final_predict, batch_meta['label']
        elif task_type == TaskType.Span:
            start, end = score
            scores, predictions = [], []
            if self.config['encoder_type'] == EncoderModelType.BERT:
                import experiments.squad.squad_utils as mrc_utils
                scores, predictions = mrc_utils.extract_answer(batch_meta, batch_data, start, end,
                                                               self.config.get('max_answer_len', 5),
                                                               do_lower_case=self.config.get('do_lower_case', False))
            return scores, predictions, batch_meta['answer']
        else:
            raise ValueError("Unknown task_type: %s" % task_type)
        return score, predict, batch_meta['label']

    def save(self, filename):
        network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()])
        params = {
            'state': network_state,
            'optimizer': self.optimizer.state_dict(),
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def load(self, checkpoint):
        model_state_dict = torch.load(checkpoint)
        if 'state' in model_state_dict:
            self.network.load_state_dict(model_state_dict['state'], strict=False)
        if 'optimizer' in model_state_dict:
            self.optimizer.load_state_dict(model_state_dict['optimizer'])
        if 'config' in model_state_dict:
            self.config.update(model_state_dict['config'])

    def cuda(self):
        self.network.cuda()
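
# The gradient-accumulation logic in MTDNNModel.update() above is easier to see in
# isolation; the following is a minimal sketch of the same scale-accumulate-clip-step
# pattern without the fp16/amp branch (the toy model, loss, and step counts are
# illustrative assumptions).
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
grad_accumulation_step = 4
global_grad_clipping = 1.0

optimizer.zero_grad()
for local_update in range(1, 17):
    x = torch.randn(8, 10)
    y = torch.randint(0, 2, (8,))
    loss = criterion(model(x), y) / grad_accumulation_step   # scale each micro-batch
    loss.backward()                                          # gradients accumulate in .grad
    if local_update % grad_accumulation_step == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), global_grad_clipping)
        optimizer.step()
        optimizer.zero_grad()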
Exemple #24
0
def model_go():
    seed = 12
    torch.manual_seed(seed)
    # bert_model_name = 'bert-large-uncased'
    bert_model_name = 'bert-base-uncased'
    lazy = True
    forward_size = 128
    # batch_size = 64
    batch_size = 128
    gradient_accumulate_step = int(batch_size / forward_size)
    warmup_proportion = 0.1
    learning_rate = 5e-5
    num_train_epochs = 3
    eval_frequency = 5000
    do_lower_case = True
    ignore_non_verifiable = True
    doc_filter_value = 0.005
    doc_top_k = 5
    experiment_name = f'fever_v0_slevel_retri_(ignore_non_verifiable:{ignore_non_verifiable})'

    debug_mode = False
    max_l = 128
    # est_datasize = 900_000

    num_class = 1
    # num_train_optimization_steps

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_num = 0 if torch.cuda.is_available() else -1

    n_gpu = torch.cuda.device_count()

    unk_token_num = {'tokens': 1}  # work around for initiating vocabulary.
    vocab = ExVocabulary(unk_token_num=unk_token_num)
    vocab.add_token_to_namespace("false", namespace="labels")  # 0
    vocab.add_token_to_namespace("true", namespace="labels")  # 1
    vocab.add_token_to_namespace("hidden", namespace="labels")
    vocab.change_token_with_index_to_namespace("hidden",
                                               -2,
                                               namespace='labels')

    # Load Dataset
    train_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_train_results.jsonl"
    )

    dev_upstream_doc_results = common.load_jsonl(
        config.PRO_ROOT /
        "data/p_fever/fever_paragraph_level/04-22-15:05:45_fever_v0_plevel_retri_(ignore_non_verifiable:True)/"
        "i(5000)|e(0)|v02_ofever(0.8947894789478947)|v05_ofever(0.8555355535553555)|seed(12)/fever_p_level_dev_results.jsonl"
    )

    # train_list = common.load_json(config.TRAIN_FILE)
    dev_list = common.load_jsonl(config.FEVER_DEV)

    train_fitems = fever_s_level_sampler.get_sentence_forward_pair(
        'train',
        train_upstream_doc_results,
        is_training=True,
        debug=debug_mode,
        ignore_non_verifiable=ignore_non_verifiable,
        top_k=doc_top_k,
        filter_value=doc_filter_value)

    dev_fitems = fever_s_level_sampler.get_sentence_forward_pair(
        'dev',
        dev_upstream_doc_results,
        is_training=False,
        debug=debug_mode,
        ignore_non_verifiable=ignore_non_verifiable,
        top_k=doc_top_k,
        filter_value=doc_filter_value)

    # Just to show the information
    fever_p_level_sampler.down_sample_neg(train_fitems, None)
    fever_p_level_sampler.down_sample_neg(dev_fitems, None)

    if debug_mode:
        dev_list = dev_list[:100]
        eval_frequency = 2
        # print(dev_list[-1]['_id'])
        # exit(0)

    # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
    est_datasize = len(train_fitems)

    dev_o_dict = list_dict_data_tool.list_to_dict(dev_list, 'id')
    # print(dev_o_dict)

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name,
                                                   do_lower_case=do_lower_case)
    bert_cs_reader = BertContentSelectionReader(
        bert_tokenizer,
        lazy,
        is_paired=True,
        example_filter=lambda x: len(x['context']) == 0,
        max_l=max_l,
        element_fieldname='element')

    bert_encoder = BertModel.from_pretrained(bert_model_name)
    model = BertMultiLayerSeqClassification(bert_encoder,
                                            num_labels=num_class,
                                            num_of_pooling_layer=1,
                                            act_type='tanh',
                                            use_pretrained_pooler=True,
                                            use_sigmoid=True)

    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    #
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    num_train_optimization_steps = int(est_datasize / forward_size / gradient_accumulate_step) * \
                                   num_train_epochs

    if debug_mode:
        num_train_optimization_steps = 100

    print("Estimated training size", est_datasize)
    print("Number of optimization steps:", num_train_optimization_steps)

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    dev_instances = bert_cs_reader.read(dev_fitems)

    biterator = BasicIterator(batch_size=forward_size)
    biterator.index_with(vocab)

    forbackward_step = 0
    update_step = 0

    logging_agent = save_tool.ScoreLogger({})

    if not debug_mode:
        # # # Create Log File
        file_path_prefix, date = save_tool.gen_file_prefix(
            f"{experiment_name}")
        # Save the source code.
        script_name = os.path.basename(__file__)
        with open(os.path.join(file_path_prefix, script_name),
                  'w') as out_f, open(__file__, 'r') as it:
            out_f.write(it.read())
            out_f.flush()
        # # # Log File end

    for epoch_i in range(num_train_epochs):
        print("Epoch:", epoch_i)
        # sampled_train_list = down_sample_neg(train_fitems_list, ratio=pos_ratio)
        random.shuffle(train_fitems)
        train_instance = bert_cs_reader.read(train_fitems)
        train_iter = biterator(train_instance, num_epochs=1, shuffle=True)

        for batch in tqdm(train_iter):
            model.train()
            batch = move_to_device(batch, device_num)

            paired_sequence = batch['paired_sequence']
            paired_segments_ids = batch['paired_segments_ids']
            labels_ids = batch['label']
            att_mask, _ = torch_util.get_length_and_mask(paired_sequence)
            s1_span = batch['bert_s1_span']
            s2_span = batch['bert_s2_span']

            loss = model(
                paired_sequence,
                token_type_ids=paired_segments_ids,
                attention_mask=att_mask,
                mode=BertMultiLayerSeqClassification.ForwardMode.TRAIN,
                labels=labels_ids)

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.

            if gradient_accumulate_step > 1:
                loss = loss / gradient_accumulate_step

            loss.backward()
            forbackward_step += 1

            if forbackward_step % gradient_accumulate_step == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_step += 1

                if update_step % eval_frequency == 0:
                    print("Update steps:", update_step)
                    dev_iter = biterator(dev_instances,
                                         num_epochs=1,
                                         shuffle=False)

                    cur_eval_results_list = eval_model(model,
                                                       dev_iter,
                                                       device_num,
                                                       make_int=True,
                                                       with_probs=True)
                    copied_dev_o_dict = copy.deepcopy(dev_o_dict)
                    copied_dev_d_list = copy.deepcopy(dev_list)
                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list,
                        copied_dev_o_dict,
                        'qid',
                        'fid',
                        check=True)

                    cur_results_dict_th0_5 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        score_field_name='prob',
                        top_k=5,
                        filter_value=0.5,
                        result_field='predicted_evidence')

                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_5, 'id',
                        'predicted_evidence')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
                        copied_dev_d_list, dev_list, max_evidence=5)
                    score_05 = {
                        'ss': strict_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list,
                        copied_dev_o_dict,
                        'qid',
                        'fid',
                        check=True)

                    cur_results_dict_th0_2 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        score_field_name='prob',
                        top_k=5,
                        filter_value=0.2,
                        result_field='predicted_evidence')

                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_2, 'id',
                        'predicted_evidence')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
                        copied_dev_d_list, dev_list, max_evidence=5)
                    score_02 = {
                        'ss': strict_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    list_dict_data_tool.append_subfield_from_list_to_dict(
                        cur_eval_results_list,
                        copied_dev_o_dict,
                        'qid',
                        'fid',
                        check=True)

                    cur_results_dict_th0_1 = select_top_k_and_to_results_dict(
                        copied_dev_o_dict,
                        score_field_name='prob',
                        top_k=5,
                        filter_value=0.1,
                        result_field='predicted_evidence')

                    list_dict_data_tool.append_item_from_dict_to_list_hotpot_style(
                        copied_dev_d_list, cur_results_dict_th0_1, 'id',
                        'predicted_evidence')
                    # mode = {'standard': False, 'check_doc_id_correct': True}
                    strict_score, pr, rec, f1 = fever_scorer.fever_sent_only(
                        copied_dev_d_list, dev_list, max_evidence=5)
                    score_01 = {
                        'ss': strict_score,
                        'pr': pr,
                        'rec': rec,
                        'f1': f1,
                    }

                    logging_item = {
                        'score_01': score_01,
                        'score_02': score_02,
                        'score_05': score_05,
                    }

                    print(json.dumps(logging_item, indent=2))

                    s01_ss_score = score_01['ss']
                    s02_ss_score = score_02['ss']
                    s05_ss_score = score_05['ss']

                    if not debug_mode:
                        save_file_name = f'i({update_step})|e({epoch_i})' \
                            f'|v01_ofever({s01_ss_score})' \
                            f'|v02_ofever({s02_ss_score})' \
                            f'|v05_ofever({s05_ss_score})|seed({seed})'

                        common.save_jsonl(
                            cur_eval_results_list,
                            Path(file_path_prefix) /
                            f"{save_file_name}_dev_s_level_results.jsonl")

                        # print(save_file_name)
                        logging_agent.incorporate_results({}, save_file_name,
                                                          logging_item)
                        logging_agent.logging_to_file(
                            Path(file_path_prefix) / "log.json")

                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = Path(
                            file_path_prefix) / save_file_name
                        torch.save(model_to_save.state_dict(),
                                   str(output_model_file))
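
# One detail worth isolating from the checkpointing code above is the model.module
# unwrapping for nn.DataParallel; a minimal sketch, assuming a toy model and an
# arbitrary output path.
import torch
import torch.nn as nn


def save_unwrapped(model: nn.Module, path: str):
    # nn.DataParallel stores the real network under .module; saving the inner
    # state_dict lets the checkpoint be reloaded with or without the wrapper.
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), path)


net = nn.Linear(4, 2)
if torch.cuda.device_count() > 1:
    net = nn.DataParallel(net)
save_unwrapped(net, 'toy_checkpoint.pt')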
Exemple #25
0
class QATrainer(object):
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
        train_dir = os.path.join("./save", "qa")
        self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)
        # read data-set and prepare iterator
        self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
        self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")

        num_train_optimization_steps = len(self.train_loader) * config.num_epochs
        # optimizer
        param_optimizer = list(self.model.named_parameters())
        # hack to remove the pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        self.qa_opt = BertAdam(optimizer_grouped_parameters,
                               lr=config.qa_lr,
                               warmup=config.warmup_proportion,
                               t_total=num_train_optimization_steps)

        # self.qg_lr = config.lr

        # assign model to device
        self.model = self.model.to(config.device)

    def get_data_loader(self, file):
        train_examples = read_squad_examples(file, is_training=True, debug=config.debug,
                                             reduce_size=config.reduce_size)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer=self.tokenizer,
                                                      max_seq_length=config.max_seq_len,
                                                      max_query_length=config.max_query_len,
                                                      doc_stride=128,
                                                      is_training=True)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)

        sampler = RandomSampler(train_data)
        batch_size = int(config.batch_size / config.gradient_accumulation_steps)
        train_loader = DataLoader(train_data, sampler=sampler, batch_size=batch_size)

        return train_loader

    def save_model(self, loss, epoch):
        loss = round(loss, 3)
        dir_name = os.path.join(self.save_dir, "bert_{}_{:.3f}".format(epoch, loss))
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # save bert model
        model_to_save = self.model.module if hasattr(self.model, "module") else self.model
        model_file = os.path.join(dir_name, "pytorch_model.bin")
        config_file = os.path.join(dir_name, "bert_config.json")

        state_dict = model_to_save.state_dict()
        torch.save(state_dict, model_file)
        model_to_save.config.to_json_file(config_file)

    def train(self):
        global_step = 1
        batch_num = len(self.train_loader)
        best_loss = 1e10
        qa_loss_lst = []
        self.model.train()
        for epoch in range(1, 4):
            start = time.time()
            for step, batch in enumerate(self.train_loader, start=1):

                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].to(config.device)
                input_mask = input_mask[:, : max_len].to(config.device)
                segment_ids = segment_ids[:, :max_len].to(config.device)
                start_positions = start_positions.to(config.device)
                end_positions = end_positions.to(config.device)
                loss = self.model(input_ids, segment_ids, input_mask, start_positions, end_positions)

                # scale the loss for gradient accumulation before back-propagation
                loss /= config.gradient_accumulation_steps
                loss.backward()
                qa_loss_lst.append(loss.item())  # store a float so the graph is not retained
                # update params
                if step % config.gradient_accumulation_steps == 0:
                    self.qa_opt.step()
                    # zero grad
                    self.qa_opt.zero_grad()
                    global_step += 1
                    avg_qa_loss = sum(qa_loss_lst)
                    # empty list
                    qa_loss_lst = []
                    msg = "{}/{} {} - ETA : {} - qa_loss: {:.2f}" \
                        .format(step, batch_num, progress_bar(step, batch_num),
                                eta(start, step, batch_num),
                                avg_qa_loss)
                    print(msg, end="\r")

            val_loss = self.evaluate(msg)
            if val_loss <= best_loss:
                best_loss = val_loss
                self.save_model(val_loss, epoch)

            print("Epoch {} took {} - final loss : {:.4f} -  val_loss :{:.4f}"
                  .format(epoch, user_friendly_time(time_since(start)), loss, val_loss))

    def evaluate(self, msg):
        self.model.eval()
        num_val_batches = len(self.dev_loader)
        val_losses = []
        for i, val_data in enumerate(self.dev_loader, start=1):
            with torch.no_grad():
                val_data = tuple(t.to(config.device) for t in val_data)
                input_ids, input_mask, segment_ids, start_positions, end_positions = val_data
                val_batch_loss = self.model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                qa_loss = val_batch_loss
                val_losses.append(qa_loss.mean().item())
                msg2 = "{} => Evaluating :{}/{}".format(msg, i, num_val_batches)
                print(msg2, end="\r")
        val_loss = np.mean(val_losses)
        self.model.train()
        return val_loss
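
# The batch-trimming trick at the top of QATrainer.train() above also stands alone;
# a minimal sketch, assuming token id 0 is padding as in the BERT vocabulary used here.
import torch

input_ids = torch.tensor([[101, 2054, 2003, 102, 0, 0, 0, 0],
                          [101, 2129, 102,    0, 0, 0, 0, 0]])
input_mask = (input_ids != 0).long()

seq_len = torch.sum(torch.sign(input_ids), 1)   # real (unpadded) length per sequence
max_len = int(torch.max(seq_len))               # longest real sequence in the batch
input_ids = input_ids[:, :max_len]              # drop padding columns before the forward pass
input_mask = input_mask[:, :max_len]
print(input_ids.shape)                          # torch.Size([2, 4]) instead of [2, 8]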
Exemple #26
0
    def train(self):
        device = torch.device("cuda:0")
        # pdb.set_trace()
        if self.debug_mode: self.epochs = 2
        print('Loading dataloaders')
        # train_loader, valid_loader = self.create_dataloader()
        train_dataloader, eval_dataloader, train_examples_length, valid_examples_length, eval_features = self.create_dataloader()
        print('Start training')

        num_train_optimization_steps = None
        if do_train:
            num_train_optimization_steps = int(
                train_examples_length / self.batch_size / self.gradient_accumulation_steps) * self.epochs
        model = BertForSequenceClassification.from_pretrained(self.bert_model_path, cache_dir=None,
                                                              num_labels=self.num_labels).cuda()
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=self.learning_rate,
                             warmup=self.warmup_proportion,
                             t_total=num_train_optimization_steps)

        global_step = 0
        tr_loss = 0
        best_F1 = 0

        tokenizer = BertTokenizer.from_pretrained(self.bert_model_path, cache_dir=None, do_lower_case=True)

        model.train()
        for epoch in range(int(self.epochs)):
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                print('epoch:', epoch, 'batchIndex:', step)
                batch = tuple(t.to(device) for t in batch)
                # pdb.set_trace()
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids.cuda(), segment_ids.cuda(), input_mask.cuda(), labels=None).cuda()
                loss_fct = BCEWithLogitsLoss()
                label_ids = label_ids.cuda()
                loss = loss_fct(logits.view(-1, 1), label_ids.view(-1, 1))

                if self.gradient_accumulation_steps > 1:
                    loss = loss / self.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % self.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    model.zero_grad()
                    global_step += 1

                if (step + 1) % self.period == 0:
                    model_to_save = model.module if hasattr(model, 'module') else model

                    model.eval()
                    torch.set_grad_enabled(False)

                    # start validation
                    idx = 0
                    TP, TN, FN, FP = 0, 0, 0, 0
                    output = {}
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        batch_size = input_ids.size(0)
                        with torch.no_grad():
                            logits = model(input_ids, segment_ids, input_mask, labels=None)
                            logits = torch.sigmoid(logits)
                        preds = (logits > 0.4).float()
                        preds_numpy = preds.cpu().long().data.numpy()
                        for i in range(idx, idx + batch_size):
                            if eval_features[i].file not in output:
                                output[eval_features[i].file] = {}
                            output[eval_features[i].file][eval_features[i].turn] = preds_numpy[i - idx].tolist()
                        TP, TN, FN, FP = obtain_TP_TN_FN_FP(preds, label_ids, TP, TN, FN, FP)
                        idx += batch_size

                    with open("data/BERT_{}_prediction.json".format(self.test_set), 'w') as f:
                        json.dump(output, f)

                    precision = TP / (TP + FP + 0.001)
                    recall = TP / (TP + FN + 0.001)
                    F1 = 2 * precision * recall / (precision + recall + 0.001)
                    logger.info(
                        "epoch is {} step is {} precision is {} recall is {} F1 is {} best_F1 is {}".format(epoch, step,
                                                                                                            precision,
                                                                                                            recall, F1,
                                                                                                            best_F1))

                    # F1 = evaluate(args, model, device, processor, label_list, num_labels, tokenizer, output_mode)

                    if F1 > best_F1:
                        output_dir = os.path.join("checkpoints/predictor/", 'save_step_{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)

                        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
                        output_config_file = os.path.join(output_dir, CONFIG_NAME)

                        torch.save(model_to_save.state_dict(), output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(output_dir)

                        best_F1 = F1

                    model.train()  # turn on train mode
                    torch.set_grad_enabled(True)  # start gradient tracking
                    tr_loss = 0
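Exemple #26 relies on a helper obtain_TP_TN_FN_FP that is not shown in the snippet. A plausible sketch, under the assumption that it simply accumulates confusion-matrix counts for thresholded binary predictions, is:

import torch

def obtain_TP_TN_FN_FP(preds, labels, TP, TN, FN, FP):
    # preds: 0/1 float tensor from (sigmoid(logits) > threshold); labels: gold 0/1 labels.
    preds = preds.view(-1).long()
    labels = labels.view(-1).long()
    TP += ((preds == 1) & (labels == 1)).sum().item()
    TN += ((preds == 0) & (labels == 0)).sum().item()
    FN += ((preds == 0) & (labels == 1)).sum().item()
    FP += ((preds == 1) & (labels == 0)).sum().item()
    return TP, TN, FN, FP

The running counts feed the smoothed precision/recall/F1 computation above, where the +0.001 terms guard against division by zero.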
Exemple #27
0
 def train(self):
     if self.debug_mode: self.epochs = 1
     # load the dataloaders
     train_loader, valid_loader = self.create_dataloader()
     # training
     self.seed_everything()
     lr = 2e-5
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # convert the pretrained TF BERT checkpoint to PyTorch
     if os.path.exists(self.bert_model_path + "pytorch_model.bin") is False:
         convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
             self.bert_model_path + 'bert_model.ckpt',
             self.bert_model_path + 'bert_config.json',
             self.bert_model_path + 'pytorch_model.bin')
     # load the pretrained model
     model = BertNeuralNet.from_pretrained(self.bert_model_path, cache_dir=None)
     model.zero_grad()
     model = model.to(self.device)
     # set a different weight_decay for different parameter groups
     param_optimizer = list(model.named_parameters())
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     epoch_steps = int(self.train_len / self.base_batch_size / accumulation_steps)
     num_train_optimization_steps = int(self.epochs * epoch_steps)
     valid_every = math.floor(epoch_steps / 10)
     optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=num_train_optimization_steps)
     # gradually decaying learning rate
     #scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
     model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
     # start training
     for epoch in range(self.epochs):
         train_start_time = time.time()
         model.train()
         optimizer.zero_grad()
         # load each batch and train on it
         for i, batch_data in enumerate(train_loader):
             x_batch = batch_data[0]
             y_batch = batch_data[1]
             target_weight_batch = batch_data[2]
             aux_weight_batch = batch_data[3]
             identity_weight_batch = batch_data[4]
             x_mask = batch_data[5]
             y_pred = model(x_batch, attention_mask=x_mask, labels=None)
             target_loss, aux_loss, identity_loss = self.custom_loss(y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch, identity_weight_batch)
             loss = target_loss + aux_loss + identity_loss
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
             if (i + 1) % accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
             # validation
             if (i + 1) % valid_every == 0:
                 valid_start_time = time.time()
                 model.eval()
                 y_pred = np.zeros((len(self.train_df) - self.train_len))
                 for j, valid_batch_data in enumerate(valid_loader):
                     x_batch = valid_batch_data[0]
                     x_mask = valid_batch_data[2]
                     batch_y_pred = self.sigmoid(model(x_batch, attention_mask=x_mask, labels=None).detach().cpu().numpy())[:, 0]
                     y_pred[j * self.base_batch_size: (j + 1) * self.base_batch_size] = batch_y_pred
                 # compute the score
                 auc_score = self.evaluator.get_final_metric(y_pred)
                 print("epoch: %d duration: %d min auc_score: %.4f" % (epoch, int((time.time() - train_start_time) / 60), auc_score))
                 if not self.debug_mode:
                     state_dict = model.state_dict()
                     stage = int((i + 1) / valid_every)
                     train_duration = int((time.time() - train_start_time) / 60)
                     valid_duration = int((time.time() - valid_start_time) / 60)
                     if epoch == 0 and stage == 1:
                         # model[bert][seed][epoch][stage][model_name][stage_train_duration][valid_duration][score].bin
                         model_name = "model/model_%d_%d_%d_%s_%dmin_%dmin_%.4f.bin" % (self.seed, epoch + 1, stage, self.model_name, train_duration, valid_duration, auc_score)
                     else:
                         # model[bert][seed][epoch][stage][model_name][score].bin
                         model_name = "model/model_%d_%d_%d_%s_%.4f.bin" % (self.seed, epoch + 1, stage, self.model_name, auc_score)
                     torch.save(state_dict, os.path.join(self.data_dir, model_name))
                 model.train()
     # free training-related inputs and the model
     del train_loader, valid_loader, model, optimizer, param_optimizer, optimizer_grouped_parameters
     gc.collect()
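Exemple #27 calls self.seed_everything() before training, but the helper itself is not included in the snippet. A common implementation (an assumption, not necessarily what this class does) fixes all relevant RNG seeds:

import os
import random
import numpy as np
import torch

def seed_everything(seed=1234):
    # Fix Python, NumPy and PyTorch (CPU + GPU) seeds for reproducible runs.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True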
            opt = model.forward_logits(sentences_s, mask_s, sentences_t,
                                       mask_t, event1, event1_mask, event2,
                                       event2_mask)
            opt_mask = model_mask.forward_logits(sentences_s_mask, mask_s,
                                                 sentences_t, mask_t, event1,
                                                 event1_mask, event2,
                                                 event2_mask)

            opt_mix = torch.cat([opt, opt_mask], dim=-1)
            logits = model.additional_fc(opt_mix)
            loss = loss_fn(logits, data_y)

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            torch.nn.utils.clip_grad_norm_(model_mask.parameters(), 1)

            optimizer.zero_grad()
            optimizer_mask.zero_grad()
            loss.backward()
            optimizer.step()
            optimizer_mask.step()

        model.eval()
        model_mask.eval()
        with torch.no_grad():
            predicted_all = []
            gold_all = []
            for batch, batch_mask in test_dataset_mix:
                sentences_s, mask_s, sentences_t, mask_t, event1, event1_mask, event2, event2_mask, data_y, _ = batch
                sentences_s_mask = batch_mask[0]

                opt = model.forward_logits(sentences_s, mask_s, sentences_t,
                                           mask_t, event1, event1_mask, event2,
                                           event2_mask)
    bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:, 1:2])(data[:, :1], targets[:, :1])
    #bce_loss_2 = nn.BCEWithLogitsLoss()(data[:,1:],targets[:,2:])
    return (bce_loss_1 * loss_weight)
    #return (bce_loss_1 * loss_weight) + bce_loss_2


tq = tqdm(range(EPOCHS))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=batch_size,
                                               shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None
    tk0 = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    optimizer.zero_grad()  # Bug fix - thanks to @chinhuic
    for i, (x_batch, y_batch) in tk0:
        #        optimizer.zero_grad()
        y_pred = model(x_batch.to(device),
                       attention_mask=(x_batch > 0).to(device),
                       labels=None)

        #loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
        loss = custom_loss(y_pred, y_batch.to(device))

        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i + 1) % accumulation_steps == 0:  # Wait for several backward steps
            optimizer.step()  # Now we can do an optimizer step
            optimizer.zero_grad()
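The fragment above moves optimizer.zero_grad() in front of the loop and only calls optimizer.step() every accumulation_steps iterations. Stripped of apex and tqdm, the underlying gradient-accumulation pattern looks roughly like this (a sketch with generic model/loader/loss_fn names, not the kernel's code):

import torch

def train_with_accumulation(model, loader, optimizer, loss_fn, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()                                  # clear gradients once, before the loop
    for i, (x, y) in enumerate(loader):
        loss = loss_fn(model(x), y) / accumulation_steps   # scale so the accumulated gradient matches one large batch
        loss.backward()                                    # gradients are summed into .grad
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()                               # one parameter update per accumulation window
            optimizer.zero_grad()

Note that the snippet above does not divide the loss by accumulation_steps; dividing (as in Exemple #26 and in this sketch) keeps the effective gradient scale independent of the accumulation factor.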
Exemple #30
0
    def train(self):
        model = self.agent
        config = self.config
        work_dir = Path(config['work_dir'])
        train_iter = 0
        save_every_niter = config['save_every_niter']
        entropy_reg_weight = config['entropy_reg_weight']
        summary_writer = SummaryWriter(
            os.path.join(config['work_dir'], 'tb_log/train'))
        max_train_step = config['max_train_step']
        save_program_cache_niter = config.get('save_program_cache_niter', 0)
        freeze_bert_for_niter = config.get('freeze_bert_niter', 0)
        gradient_accumulation_niter = config.get('gradient_accumulation_niter',
                                                 1)
        use_trainable_sketch_predictor = self.config.get(
            'use_trainable_sketch_predictor', False)

        bert_params = [(p_name, p) for (p_name, p) in model.named_parameters()
                       if 'bert_model' in p_name and p.requires_grad]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        bert_grouped_parameters = [
            {'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        bert_optimizer = BertAdam(bert_grouped_parameters,
                                  lr=self.config['bert_learning_rate'],
                                  warmup=0.1,
                                  t_total=max_train_step)

        # non bert parameters
        other_params = [
            p for n, p in model.named_parameters()
            if 'bert_model' not in n and p.requires_grad
        ]

        other_optimizer = torch.optim.Adam(other_params, lr=0.001)

        # eval batch loader
        self.load_dev_environments()
        dev_iter = nn_util.loop_iter(self.dev_environments,
                                     batch_size=self.config['batch_size'],
                                     shuffle=True)

        cum_loss = cum_examples = 0.
        t1 = time.time()

        while train_iter < max_train_step:
            if 'cuda' in self.devices[0].type:
                torch.cuda.set_device(self.devices[0])

            train_iter += 1
            other_optimizer.zero_grad()
            bert_optimizer.zero_grad()

            train_samples, samples_info = self.train_queue.get()
            sample_categories = samples_info['category']
            dev_batched_envs = next(dev_iter)  # get a batch of dev examples

            # model inference on dev examples
            dev_samples = model.decode_examples(
                dev_batched_envs, beam_size=self.config['beam_size'])
            dev_samples = dev_samples[0]  # list of list to list

            try:
                queue_size = self.train_queue.qsize()
                # queue_sizes = []
                # for cat in self.categories:
                #     queue_sizes.append(self.queues[cat].qsize())
                print(
                    f'[Learner] train_iter={train_iter} train queue size={queue_size}',
                    file=sys.stderr)
                summary_writer.add_scalar('train_queue_sizes', queue_size,
                                          train_iter)
            except NotImplementedError:
                pass

            train_trajectories = [
                sample.trajectory for sample in train_samples
            ]

            # dev
            dev_trajectories = [sample.trajectory for sample in dev_samples]

            # repeat for getting dev grad
            dev_loss, dev_log_prob = self.forward_single(dev_samples,
                                                         train_iter,
                                                         summary_writer,
                                                         batch_type='dev')
            # other_optimizer.step() # should we not do this

            grad_dev_nested = [p.grad for p in other_params]
            grad_dev = [torch.flatten(g) for g in grad_dev_nested]

            grad_dev = torch.cat(grad_dev)

            # print('dev gradient: ', len(grad_dev), grad_dev[0])
            # print('log pr dev: ', dev_log_prob)

            other_optimizer.zero_grad()
            bert_optimizer.zero_grad()

            # to save memory, for vertical tableBERT, we partition the training trajectories into small chunks
            # if isinstance(self.agent.encoder.bert_model, VerticalAttentionTableBert) and 'large' in self.agent.encoder.bert_model.config.base_model_name:
            #     chunk_size = 5
            #     # dev_chunk_size = 5
            # else:
            #     chunk_size = len(train_samples)
            #     dev_chunk_size = len(dev_samples)

            chunk_size = 1000000000  # effectively disables chunking (chunk_num below evaluates to 1)
            chunk_num = int(math.ceil(len(train_samples) / chunk_size))
            cum_loss = 0.
            log_pr_catwise_train = torch.zeros((len(self.categories), 1))

            if chunk_num > 1:
                for chunk_id in range(0, chunk_num):
                    train_samples_chunk = train_samples[chunk_size *
                                                        chunk_id:chunk_size *
                                                        chunk_id + chunk_size]
                    sample_categories_chunk = sample_categories[
                        chunk_size * chunk_id:chunk_size * chunk_id +
                        chunk_size]
                    for idx, cat in enumerate(self.categories):
                        cat_indices = [
                            j for j in range(len(train_samples_chunk))
                            if sample_categories_chunk[j] == cat
                        ]
                        train_cat_chunk = [
                            train_samples_chunk[j] for j in cat_indices
                        ]
                        loss_val, log_pr_chunk = self.forward_single(
                            train_cat_chunk,
                            train_iter,
                            summary_writer,
                            batch_type='train')
                        cum_loss += loss_val
                        grad_cat = torch.cat(
                            [torch.flatten(p.grad) for p in other_params])
                        reward = torch.dot(grad_dev, grad_cat)
                        self.current_psi[idx] = self.current_psi[
                            idx] + self.config['dds_lr'] * reward * log_pr_chunk

                grad_multiply_factor = 1 / len(train_samples)
                for p in self.agent.parameters():
                    if p.grad is not None:
                        p.grad.data.mul_(grad_multiply_factor)
            else:

                for idx, cat in enumerate(self.categories):
                    cat_indices = [
                        j for j in range(len(train_samples))
                        if sample_categories[j] == cat
                    ]
                    train_cat = [train_samples[j] for j in cat_indices]
                    if not train_cat:  # empty list, no samples from this category
                        print('no samples in current batch for: ', cat)
                        sys.stdout.flush()
                        continue
                    loss_val, log_pr = self.forward_single(train_cat,
                                                           train_iter,
                                                           summary_writer,
                                                           batch_type='train')
                    cum_loss = loss_val * len(train_samples)
                    grad_cat = [p.grad
                                for p in other_params]  # ignore bert_params
                    grad_cat = [torch.flatten(g) for g in grad_cat]
                    grad_cat = torch.cat(grad_cat)
                    other_optimizer.step()
                    other_optimizer.zero_grad()
                    # for every cat, fresh gradients
                    # print(type(grad_cat), grad_cat, grad_cat.shape)
                    # print(type(grad_dev), grad_dev, grad_dev.shape)
                    sys.stdout.flush()

                    # t1 = torch.FloatTensor(grad_dev)
                    # t2 = torch.FloatTensor(grad_cat)
                    # print(t1.shape)
                    # sys.stdout.flush()
                    # print(t2.shape)
                    # sys.stdout.flush()
                    reward = torch.dot(grad_dev, grad_cat) / (
                        torch.norm(grad_cat) * torch.norm(grad_dev))
                    print('reward: ', reward)
                    sys.stderr.flush()
                    sys.stdout.flush()
                    self.current_psi[idx] = self.current_psi[
                        idx] + self.config['dds_lr'] * reward * log_pr

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(other_params, 5.)

            # cumulative gradient backprop
            if train_iter % gradient_accumulation_niter == 0:
                # other_optimizer.step()
                if train_iter > freeze_bert_for_niter:
                    bert_optimizer.step()
                elif train_iter == freeze_bert_for_niter:
                    print(
                        f'[Learner] train_iter={train_iter} reset Adam optimizer and start fine-tuning BERT'
                    )
                    other_optimizer = torch.optim.Adam(other_params, lr=0.001)

            self.psi_queue.put(self.current_psi)

            if 'clip_frac' in samples_info:
                summary_writer.add_scalar('sample_clip_frac',
                                          samples_info['clip_frac'],
                                          train_iter)

            # update sketch predictor
            if use_trainable_sketch_predictor:
                if 'cuda' in self.devices[1].type:
                    torch.cuda.set_device(self.devices[1])

                self.sketch_predictor_trainer.step(train_trajectories,
                                                   train_iter=train_iter)

            cum_examples += len(train_samples)

            self.try_update_model_to_actors(train_iter)

            if train_iter % save_every_niter == 0:
                print(
                    f'[Learner] train_iter={train_iter} avg. loss={cum_loss / cum_examples}, '
                    f'{cum_examples} examples ({cum_examples / (time.time() - t1)} examples/s)',
                    file=sys.stderr)
                cum_loss = cum_examples = 0.
                t1 = time.time()

                # log stats of the program cache
                program_cache_stat = self.shared_program_cache.stat()
                summary_writer.add_scalar(
                    'avg_num_programs_in_cache',
                    program_cache_stat['num_entries'] /
                    program_cache_stat['num_envs'], train_iter)
                summary_writer.add_scalar('num_programs_in_cache',
                                          program_cache_stat['num_entries'],
                                          train_iter)

            if save_program_cache_niter > 0 and train_iter % save_program_cache_niter == 0:
                program_cache_file = work_dir / 'log' / f'program_cache.iter{train_iter}.json'
                program_cache = self.shared_program_cache.all_programs()
                json.dump(program_cache,
                          program_cache_file.open('w'),
                          indent=2)
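Exemple #30 scores each training-data category by how well its gradient aligns with the gradient of a dev batch (reward = cosine similarity of the flattened gradients), then nudges the per-category sampling logits psi by dds_lr * reward * log_pr, a REINFORCE-style update in the spirit of differentiable data selection. A compressed sketch of just that update, with grad_dev and the per-category gradients already flattened as in the code above (all other names are illustrative):

import torch

def update_category_scores(psi, grad_dev, grad_by_category, log_pr_by_category, dds_lr=0.001):
    # psi: 1-D tensor of per-category logits that steer the data sampler.
    for idx, grad_cat in enumerate(grad_by_category):
        # cosine similarity between the dev gradient and this category's training gradient
        reward = torch.dot(grad_dev, grad_cat) / (grad_dev.norm() * grad_cat.norm())
        # categories whose gradients point the same way as the dev gradient are up-weighted
        psi[idx] = psi[idx] + dds_lr * reward * log_pr_by_category[idx]
    return psi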