Example #1
    def build_dataloader(file: str, task: int, source2idx: Callable, target2idx: Callable, batch_size: int,
                         firstline: bool,
                         collate: Callable) -> Tuple[DataLoader, int]:
        """Build an iterable DataLoader over a text file.

        @param file: path to the data file
        @param task: choose the respective task
        @param source2idx: source tokenizer
        @param target2idx: target tokenizer
        @param batch_size: batch size
        @param firstline: whether the first line is a header
        @param collate: collate function for sequence conversion
        @return: the DataLoader and the number of lines in the file
        """
        iterdata, num_lines = Tokenizer.prepare_iter(file, firstline=firstline,
                                                     task=task)
        dataset = IterDataset(iterdata, source2idx=source2idx, target2idx=target2idx,
                              num_lines=num_lines)
        dataloader = DataLoader(dataset, pin_memory=True, batch_size=batch_size, collate_fn=collate,
                                num_workers=8)
        return dataloader, num_lines
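
For orientation, here is a minimal usage sketch of `build_dataloader`. It is illustrative only: the file paths and batch size are placeholders, and the tokenizer wiring (`BPE.load`, `BPE.tokens2ids`, `BPE.collate_fn`, `PAD`) is borrowed from Example #5 of this collection.

    # Hypothetical wiring; paths and hyperparameters are placeholders.
    vocab_file = "/path/to/bert_level-bpe-vocab.txt"
    tokenizer = BPE.load(vocab_file)
    nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
    pad_id = tokenizer.token_to_id(PAD)
    collate_fn = BPE.collate_fn(pad_id, True)

    dataloader, num_lines = build_dataloader("/path/to/train.txt", task=2,
                                             source2idx=nl2ids, target2idx=tg2ids,
                                             batch_size=16, firstline=False,
                                             collate=collate_fn)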
Example #2
    def train(self):
        # A training result is returned to inform the calling code of the outcome:
        # matching threshold reached (success): 0; otherwise: 1
        # training_result = 1
        train_data, train_numlines = Tokenizer.prepare_iter(self.args.train_file, firstline=self.args.firstline, task=2)
        dev_data, dev_numlines = Tokenizer.prepare_iter(self.args.dev_file, firstline=self.args.firstline, task=2)
        test_data, test_numlines = Tokenizer.prepare_iter(self.args.test_file, firstline=self.args.firstline, task=2)

        saved_epoch = 0
        nepoch_no_imprv = 0
        epoch_start = time.time()
        max_epochs = self.args.max_epochs
        best_dev = -np.inf if self.args.metric == "bleu" else np.inf

        if self.args.tl:
            # 1. Load pre-trained model from previous model_dir
            print("INFO: - Load transfer learning models")
            self.load_transferlearning(epoch=-1)
            # 2. Update model_dir to the new one
            if self.args.timestamped_subdir:
                self.args.model_dir = os.path.abspath(os.path.join(self.args.model_dir, ".."))
                sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
            else:
                sub_folder = ''
            if not os.path.exists(os.path.join(self.args.model_dir, sub_folder)):
                os.mkdir(os.path.join(self.args.model_dir, sub_folder))
            self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
            # 3. Update logfile dir
            self.args.log_file = os.path.join(self.args.model_dir, self.args.log_file)
            with open(self.args.log_file, "w") as f:
                f.write("START TRAINING\n")
            # 4. save updated arguments and log file to the new folder
            print("INFO: - Save new argument file")
            SaveloadHP.save(self.args, os.path.join(self.args.model_dir, self.args.model_args))

            dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines, self.args.pred_dev_file)
            best_dev = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
            print("INFO: - Transfer learning performance")
            print("         - Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                  (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
            self.appendfile("\t- Transfer learning performance")
            self.appendfile("\t\t- Current Dev loss: %.4f; Current Dev bleu: %.4f; Current Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" %
                            (dev_loss, dev_bleu[0], dev_string_match, dev_speed))

            # print("INFO: - Save transfer learning models")
            # self.save_parameters(epoch=0)
            # assume the transferred model is the best one and save it in the main dir
            self.save_parameters(epoch=-1)
        else:
            with open(self.args.log_file, "w") as f:
                f.write("START TRAINING\n")

        print('Dev metric:', self.args.metric)
        for epoch in range(1, max_epochs + 1):
            print("Epoch: %s/%s" % (epoch, max_epochs))
            stime = time.time()
            train_loss = self.train_batch(train_data, train_numlines)
            print("BONUS: Training time of %.4f" % (time.time() - stime))
            # Save the model
            # print("INFO: - Frequently save models to checkpoint folders")
            # self.save_parameters(epoch=epoch)
            # set the first model as the best one and save to the main dir
            # evaluate on development data

            dev_loss, dev_bleu, dev_string_match, dev_speed = self.evaluate_batch(dev_data, dev_numlines,
                                                                                  self.args.pred_dev_file)

            dev_metric = dev_bleu[0] if self.args.metric == "bleu" else dev_loss
            cond = dev_metric > best_dev if self.args.metric == "bleu" else dev_loss < best_dev
            if cond:
                nepoch_no_imprv = 0
                saved_epoch = epoch
                best_dev = dev_metric
                print("UPDATES: - New improvement")
                print("         - Train loss: %.4f" % train_loss)
                print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                      (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
                self.appendfile("\t- New improvement at epoch %d:\n" % saved_epoch)
                self.appendfile("\t\t- Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)\n" %
                                (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
                print("INFO: - Save best models")
                self.save_parameters(epoch=-1)

                # if dev_string_match >= self.args.matching_threshold:
                #     # TODO: automatically load models to gcp
                #     training_result = 0
                #     break

            else:
                print("UPDATES: - No improvement")
                print("         - Train loss: %.4f" % train_loss)
                print("         - Dev loss: %.4f; Dev bleu: %.4f; Dev string match: %.4f; Dev speed: %.2f(tokens/s)" %
                      (dev_loss, dev_bleu[0], dev_string_match, dev_speed))
                nepoch_no_imprv += 1
                # Decay learning_rate if no improvement
                if self.args.decay_rate > 0:
                    self.lr_decay(epoch)

                if nepoch_no_imprv >= self.args.patience:
                    # Load the current best models
                    print("INFO: - Load best models")
                    self.load_parameters(epoch=-1)

                    test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines,
                                                                                              self.args.pred_test_file)
                    print("SUMMARY: - Early stopping after %d epochs without improvement" % nepoch_no_imprv)
                    print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
                    print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" %
                          (test_loss, test_bleu[0], test_string_match, test_speed))

                    self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
                    self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
                    self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test speed: %.2f(tokens/s)\n" %
                                    (test_loss, test_bleu[0], test_speed))
                    return test_bleu[0]

            epoch_finish, epoch_remain = Timer.timeEst2(epoch_start, epoch / max_epochs)
            print("INFO: - Elapsed time after %d epochs: %s" % (epoch, epoch_finish))
            print("\t- Estimated time for the remaining %d epochs: %s\n" % (max_epochs - epoch, epoch_remain))

        # print("INFO: - Save best models")
        # self.save_parameters(epoch=-1)
        print("INFO: - Load best models")
        self.load_parameters(epoch=-1)

        test_loss, test_bleu, test_string_match, test_speed = self.evaluate_batch(test_data, test_numlines,
                                                                                  self.args.pred_test_file)
        print("SUMMARY: - Completed %d epochs" % max_epochs)
        print("         - Dev metric (%s): %.4f" % (self.args.metric, best_dev))
        print("         - Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)" %
              (test_loss, test_bleu[0], test_string_match, test_speed))
        self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
        self.appendfile("\t- Testing the best model at epoch %d:\n" % saved_epoch)
        self.appendfile("\t\t- Test loss: %.4f; Test bleu: %.4f; Test string match: %.4f; Test speed: %.2f(tokens/s)\n" %
                        (test_loss, test_bleu[0], test_string_match, test_speed))
        return test_bleu[0]
Example #3
    def train(self):
        train_data, train_numlines = Tokenizer.prepare_iter(
            self.args.train_file, firstline=self.args.firstline, task=2)
        dev_data, dev_numlines = Tokenizer.prepare_iter(
            self.args.dev_file, firstline=self.args.firstline, task=2)
        test_data, test_numlines = Tokenizer.prepare_iter(
            self.args.test_file, firstline=self.args.firstline, task=2)

        saved_epoch = 0
        nepoch_no_imprv = 0
        epoch_start = time.time()
        max_epochs = self.args.max_epochs
        # best_dev = -np.inf if self.args.metric == "f1" else np.inf
        best_dev = np.inf if self.args.metric == "loss" else -np.inf

        with open(self.args.log_file, "w") as f:
            f.write("START TRAINING\n")
        if self.args.tl:
            # 1. Load pre-trained model from previous model_dir
            print("INFO: - Load transfer learning models")
            self.load_transferlearning(epoch=-1)
            # 2. Update model_dir to the new one
            if self.args.timestamped_subdir:
                self.args.model_dir = os.path.abspath(
                    os.path.join(self.args.model_dir, ".."))
                sub_folder = datetime.now().isoformat(
                    sep='-',
                    timespec='minutes').replace(":", "-").replace("-", "_")
            else:
                sub_folder = ''
            if not os.path.exists(os.path.join(self.args.model_dir,
                                               sub_folder)):
                os.mkdir(os.path.join(self.args.model_dir, sub_folder))
            self.args.model_dir = os.path.join(self.args.model_dir, sub_folder)
            # 3. Update logfile dir
            self.args.log_file = os.path.join(self.args.model_dir,
                                              self.args.log_file)

            # 4. save updated arguments and log file to the new folder
            print("INFO: - Save new argument file")
            SaveloadHP.save(
                self.args,
                os.path.join(self.args.model_dir, self.args.model_args))

            dev_loss, dev_metrics, dev_speed = self.evaluate_batch(
                dev_data, dev_numlines)
            # best_dev = dev_metrics[2] if self.args.metric == "f1" else dev_loss
            best_dev = dev_loss if self.args.metric == "loss" else dev_metrics[2]
            print("INFO: - Transfer learning performance")
            print(
                "         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            print(
                "         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                % (dev_metrics[3], dev_metrics[4]))
            self.appendfile("\t- Transfer learning performance")
            self.appendfile(
                "\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                 dev_speed))
            self.appendfile(
                "\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f" %
                (dev_metrics[3], dev_metrics[4]))

            # print("INFO: - Save transfer learning models")
            # self.save_parameters(epoch=0)
            # assume the transferred model is the best one and save it in the main dir
            self.save_parameters(epoch=-1)

        for epoch in range(1, max_epochs + 1):
            print("Epoch: %s/%s" % (epoch, max_epochs))
            stime = time.time()
            train_loss = self.train_batch(train_data, train_numlines)
            print("BONUS: Training time of %.4f" % (time.time() - stime))
            # Save the model
            # print("INFO: - Frequently save models to checkpoint folders")
            # self.save_parameters(epoch=epoch)
            # evaluate on development data
            dev_loss, dev_metrics, dev_speed = self.evaluate_batch(
                dev_data, dev_numlines)
            # dev_metric = dev_metrics[2] if self.args.metric == "f1" else dev_loss
            dev_metric = dev_loss if self.args.metric == "loss" else dev_metrics[2]
            # cond = dev_metric > best_dev if self.args.metric == "f1" else dev_loss < best_dev
            cond = dev_loss < best_dev if self.args.metric == "loss" else dev_metric > best_dev
            if cond:
                nepoch_no_imprv = 0
                saved_epoch = epoch
                best_dev = dev_metric
                print("UPDATES: - New improvement")
                print("         - Train loss: %.4f" % train_loss)
                print(
                    "         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                    "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                    (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                     dev_speed))
                print(
                    "         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                    % (dev_metrics[3], dev_metrics[4]))
                self.appendfile("\t- New improvement at epoch %d:\n" %
                                saved_epoch)
                self.appendfile(
                    "\t\t- Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                    "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)\n" %
                    (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                     dev_speed))
                self.appendfile(
                    "\t\t- Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                    % (dev_metrics[3], dev_metrics[4]))
                print("INFO: - Save best models")
                self.save_parameters(epoch=-1)

            else:
                print("UPDATES: - No improvement")
                print("         - Train loss: %.4f" % train_loss)
                print(
                    "         - Current Dev loss: %.4f; Current Dev P: %.4f; Current Dev R: %.4f; "
                    "Current Dev F1: %.4f; Dev speed: %.2f(tokens/s)" %
                    (dev_loss, dev_metrics[0], dev_metrics[1], dev_metrics[2],
                     dev_speed))
                print(
                    "         - Current Dev sep_acc: %.4f; Current Dev full_acc: %.4f"
                    % (dev_metrics[3], dev_metrics[4]))
                nepoch_no_imprv += 1
                # Decay learning_rate if no improvement
                if self.args.decay_rate > 0:
                    self.lr_decay(epoch)

                if nepoch_no_imprv >= self.args.patience:
                    # Load the current best models
                    print("INFO: - Load best models")
                    self.load_parameters(epoch=-1)

                    test_loss, test_metrics, test_speed = self.evaluate_batch(
                        test_data, test_numlines)
                    print(
                        "SUMMARY: - Early stopping after %d epochs without improvement"
                        % nepoch_no_imprv)
                    print("         - Dev metric (%s): %.4f" %
                          (self.args.metric, best_dev))
                    print(
                        "         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                        "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
                        (test_loss, test_metrics[0], test_metrics[1],
                         test_metrics[2], test_speed))
                    print(
                        "         - Test sep_acc: %.4f; Test full_acc: %.4f"
                        % (test_metrics[3], test_metrics[4]))

                    self.appendfile("STOP TRAINING at epoch %s/%s\n" %
                                    (epoch, max_epochs))
                    self.appendfile(
                        "\t- Testing the best model at epoch %d:\n" %
                        saved_epoch)
                    self.appendfile(
                        "\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                        "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                        (test_loss, test_metrics[0], test_metrics[1],
                         test_metrics[2], test_speed))
                    self.appendfile(
                        "\t\t- Test sep_acc: %.4f; Test full_acc: %.4f"
                        % (test_metrics[3], test_metrics[4]))
                    return test_metrics

            epoch_finish, epoch_remain = Timer.timeEst2(
                epoch_start, epoch / max_epochs)
            print("INFO: - Elapsed time after %d epochs: %s" %
                  (epoch, epoch_finish))
            print("\t- Estimated time for the remaining %d epochs: %s\n" %
                  (max_epochs - epoch, epoch_remain))

        print("INFO: - Load best models")
        self.load_parameters(epoch=-1)

        test_loss, test_metrics, test_speed = self.evaluate_batch(
            test_data, test_numlines)
        print("SUMMARY: - Completed %d epochs" % max_epochs)
        print("         - Dev metric (%s): %.4f" %
              (self.args.metric, best_dev))
        print("         - Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
              "Test F1: %.4f; Test speed: %.2f(tokens/s)" %
              (test_loss, test_metrics[0], test_metrics[1], test_metrics[2],
               test_speed))
        print(
            "         - Test sep_acc: %.4f; Test full_acc: %.4f"
            % (test_metrics[3], test_metrics[4]))
        self.appendfile("STOP TRAINING at epoch %s/%s\n" % (epoch, max_epochs))
        self.appendfile("\t- Testing the best model at epoch %d:\n" %
                        saved_epoch)
        self.appendfile("\t\t- Test loss: %.4f; Test P: %.4f; Test R: %.4f; "
                        "Test F1: %.4f; Test speed: %.2f(tokens/s)\n" %
                        (test_loss, test_metrics[0], test_metrics[1],
                         test_metrics[2], test_speed))
        self.appendfile(
            "\t\t- Test sep_acc: %.4f; Test full_acc: %.4f" %
            (test_metrics[3], test_metrics[4]))
        return test_metrics
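
Both train() variants above share the same early-stopping bookkeeping; only the direction of the metric comparison differs (higher is better for bleu/f1, lower for loss). Distilled into a standalone sketch (illustrative, not part of the original code):

    def improved(metric_name: str, current: float, best: float) -> bool:
        """Direction-aware comparison driving the patience counter."""
        return current < best if metric_name == "loss" else current > best

    # e.g. best_dev starts at np.inf for "loss" and -np.inf otherwise, and
    # nepoch_no_imprv is reset to 0 whenever improved(...) returns True.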
Example #4
    # The opening of this call is cut off in the original snippet; judging by the
    # `config=config` argument below, it is presumably (an assumption)
    # `config = AutoConfig.from_pretrained(model_name, ...)`:
    config = AutoConfig.from_pretrained(model_name,
                                        num_labels=num_labels,
                                        id2label=tokenizer.i2tw,
                                        label2id=tokenizer.tw2i)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, from_tf=bool(".ckpt" in model_name), config=config)

    # data = JSON.read(args.train_data_file)
    # train_dataset = MapDataset(data, source2idx=source2idx, target2idx=source2idx)
    # # train_sampler = RandomSampler(train_dataset)
    # train_sampler = SequentialSampler(train_dataset)
    # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, pin_memory=True,
    #                               batch_size=16, collate_fn=collate_fn)

    data_file = "/media/data/review_response/Dev.json"
    iterdata, num_lines = Tokenizer.prepare_iter(data_file,
                                                 firstline=True,
                                                 task=1)

    train_iterdataset = IterDataset(iterdata,
                                    source2idx=source2idx,
                                    target2idx=lb2ids,
                                    num_lines=num_lines)
    # bpe=True, special_tokens_func=build_inputs_with_special_tokens)

    # avg_scores = {'f': 0., 'p': 0., 'r': 0.}
    # rouge_lf = []
    # rouge = Rouge()
    # for line in iterdata(0, num_lines):
    #     scores = rouge.get_scores(line[0], line[1])[0]
    #     rouge_lf.append(scores['rouge-l']['f'])
    #     break
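
The commented-out block above sketches a ROUGE evaluation over the data iterator but breaks after the first pair. A completed version, under the same assumptions (the `rouge` package's `Rouge().get_scores(hyp, ref)` API, and `iterdata(0, num_lines)` yielding (source, target) pairs as in the loop above), might look like:

    # Average ROUGE-L F-score over all (source, target) pairs; illustrative only.
    from rouge import Rouge

    rouge = Rouge()
    rouge_lf = []
    for line in iterdata(0, num_lines):
        scores = rouge.get_scores(line[0], line[1])[0]
        rouge_lf.append(scores['rouge-l']['f'])
    avg_rouge_lf = sum(rouge_lf) / max(len(rouge_lf), 1)
    print("Average ROUGE-L F: %.4f" % avg_rouge_lf)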
Example #5
        # NOTE: the matching `if` branch is cut off in the original snippet; these
        # lines are its tail (a word-level Tokenizer path, judging by sw2i/tw2i).
        sw_size = len(tokenizer.sw2i)
        tw_size = len(tokenizer.tw2i)
        collate_fn = Tokenizer.collate_fn(pad_id, True)
    else:
        vocab_file = "/media/data/review_response/tokens/bert_level-bpe-vocab.txt"
        tokenizer = BPE.load(vocab_file)
        tokenizer.add_tokens(sys_tokens)
        nl2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
        tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)

        pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) is not None else tokenizer.token_to_id(PAD)
        sw_size = tokenizer.get_vocab_size()
        tw_size = tokenizer.get_vocab_size()
        collate_fn = BPE.collate_fn(pad_id, True)

    train_data, num_lines = Tokenizer.prepare_iter(filename, firstline=False, task=1)
    train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=lb2ids, num_lines=num_lines)
    train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8, collate_fn=collate_fn)

    for i, batch in enumerate(train_dataloader):
        # inputs, outputs = batch[0], batch[1]
        nl_tensor, lb_tensor = batch
        nl_len_tensor = (nl_tensor != pad_id).sum(dim=1)
        break

    use_selfatt = True
    if use_selfatt:
        # use the maximum length 5 times larger than input length
        nlemb_HPs = [sw_size, 50, None, 0.5, True, True, 1000]
        # nn_mode, ninp, nhid, nlayers, nhead, dropout, activation, norm, his_mask
        enc_HPs = ["self_attention", 50, 200, 6, 10, 0.5, "relu", None, False]
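
The positional hyperparameter lists above are easy to misread. A purely illustrative wrapper (the field names come from the comment above; the namedtuple is not part of the original API) makes the encoder settings self-documenting:

    from collections import namedtuple

    EncHPs = namedtuple("EncHPs", ["nn_mode", "ninp", "nhid", "nlayers", "nhead",
                                   "dropout", "activation", "norm", "his_mask"])
    enc_hps = EncHPs(*enc_HPs)
    assert enc_hps.nn_mode == "self_attention" and enc_hps.nhead == 10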