Code example #1
class NewsClassifier(nn.Module):
    def __init__(self, args):
        super(NewsClassifier, self).__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.PRE_TRAINED_MODEL_NAME = "bert-base-uncased"
        self.EPOCHS = args.max_epochs
        self.df = None
        self.tokenizer = None
        self.df_train = None
        self.df_val = None
        self.df_test = None
        self.train_data_loader = None
        self.val_data_loader = None
        self.test_data_loader = None
        self.optimizer = None
        self.total_steps = None
        self.scheduler = None
        self.loss_fn = None
        self.BATCH_SIZE = 16
        self.MAX_LEN = 160
        self.NUM_SAMPLES_COUNT = args.num_samples
        n_classes = len(class_names)
        self.VOCAB_FILE_URL = args.vocab_file
        self.VOCAB_FILE = "bert_base_uncased_vocab.txt"

        self.drop = nn.Dropout(p=0.2)
        self.bert = BertModel.from_pretrained(self.PRE_TRAINED_MODEL_NAME)
        for param in self.bert.parameters():
            param.requires_grad = False
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 512)
        self.out = nn.Linear(512, n_classes)

    def forward(self, input_ids, attention_mask):
        """
        :param input_ids: Input sentences from the batch
        :param attention_mask: Attention mask returned by the encoder

        :return: output - label for the input text
        """
        pooled_output = self.bert(input_ids=input_ids,
                                  attention_mask=attention_mask).pooler_output
        output = F.relu(self.fc1(pooled_output))
        output = self.drop(output)
        output = self.out(output)
        return output

    @staticmethod
    def process_label(rating):
        rating = int(rating)
        return rating - 1

    def create_data_loader(self, df, tokenizer, max_len, batch_size):
        """
        :param df: DataFrame input
        :param tokenizer: Bert tokenizer
        :param max_len: maximum length of the input sentence
        :param batch_size: Input batch size

        :return: output - Corresponding data loader for the given input
        """
        ds = AGNewsDataset(
            reviews=df.description.to_numpy(),
            targets=df.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=max_len,
        )

        return DataLoader(ds, batch_size=batch_size, num_workers=4)

    def prepare_data(self):
        """
        Creates train, valid and test dataloaders from the csv data
        """
        td.AG_NEWS(root="data", split=("train", "test"))
        extracted_files = os.listdir("data/AG_NEWS")

        train_csv_path = None
        for fname in extracted_files:
            if fname.endswith("train.csv"):
                train_csv_path = os.path.join(os.getcwd(), "data/AG_NEWS",
                                              fname)

        self.df = pd.read_csv(train_csv_path)

        self.df.columns = ["label", "title", "description"]
        self.df = self.df.sample(frac=1)
        self.df = self.df.iloc[:self.NUM_SAMPLES_COUNT]

        self.df["label"] = self.df.label.apply(self.process_label)

        if not os.path.isfile(self.VOCAB_FILE):
            filePointer = requests.get(self.VOCAB_FILE_URL,
                                       allow_redirects=True)
            if filePointer.ok:
                with open(self.VOCAB_FILE, "wb") as f:
                    f.write(filePointer.content)
            else:
                raise RuntimeError("Error in fetching the vocab file")

        self.tokenizer = BertTokenizer(self.VOCAB_FILE)

        RANDOM_SEED = 42
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        self.df_train, self.df_test = train_test_split(
            self.df,
            test_size=0.1,
            random_state=RANDOM_SEED,
            stratify=self.df["label"])
        self.df_val, self.df_test = train_test_split(
            self.df_test,
            test_size=0.5,
            random_state=RANDOM_SEED,
            stratify=self.df_test["label"])

        self.train_data_loader = self.create_data_loader(
            self.df_train, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)
        self.val_data_loader = self.create_data_loader(self.df_val,
                                                       self.tokenizer,
                                                       self.MAX_LEN,
                                                       self.BATCH_SIZE)
        self.test_data_loader = self.create_data_loader(
            self.df_test, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)

    def setOptimizer(self):
        """
        Sets the optimizer and scheduler functions
        """
        self.optimizer = AdamW(self.parameters(), lr=1e-3, correct_bias=False)
        self.total_steps = len(self.train_data_loader) * self.EPOCHS

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=self.total_steps)

        self.loss_fn = nn.CrossEntropyLoss().to(self.device)

    def startTraining(self, model):
        """
        Initialzes the Traning step with the model initialized

        :param model: Instance of the NewsClassifier class
        """
        history = defaultdict(list)
        best_accuracy = 0

        for epoch in range(self.EPOCHS):

            print(f"Epoch {epoch + 1}/{self.EPOCHS}")

            train_acc, train_loss = self.train_epoch(model)

            print(f"Train loss {train_loss} accuracy {train_acc}")

            val_acc, val_loss = self.eval_model(model, self.val_data_loader)
            print(f"Val   loss {val_loss} accuracy {val_acc}")

            history["train_acc"].append(train_acc)
            history["train_loss"].append(train_loss)
            history["val_acc"].append(val_acc)
            history["val_loss"].append(val_loss)

            if val_acc > best_accuracy:
                torch.save(model.state_dict(), "best_model_state.bin")
                best_accuracy = val_acc

    def train_epoch(self, model):
        """
        Training process happens and accuracy is returned as output

        :param model: Instance of the NewsClassifier class

        :result: output - Accuracy of the model after training
        """

        model = model.train()
        losses = []
        correct_predictions = 0

        for data in tqdm(self.train_data_loader):
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            _, preds = torch.max(outputs, dim=1)
            loss = self.loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        return (
            correct_predictions.double() / len(self.train_data_loader) /
            self.BATCH_SIZE,
            np.mean(losses),
        )

    def eval_model(self, model, data_loader):
        """
        Validation process happens and validation / test accuracy is returned as output

        :param model: Instance of the NewsClassifier class
        :param data_loader: Data loader for either test / validation dataset

        :result: output - Accuracy of the model after testing
        """
        model = model.eval()

        losses = []
        correct_predictions = 0

        with torch.no_grad():
            for d in data_loader:
                input_ids = d["input_ids"].to(self.device)
                attention_mask = d["attention_mask"].to(self.device)
                targets = d["targets"].to(self.device)

                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)

                loss = self.loss_fn(outputs, targets)
                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())

        return correct_predictions.double() / len(
            data_loader) / self.BATCH_SIZE, np.mean(losses)

    def get_predictions(self, model, data_loader):
        """
        Prediction after the training step is over

        :param model: Instance of the NewsClassifier class
        :param data_loader: Data loader for either test / validation dataset

        :result: output - Returns prediction results,
                          prediction probablities and corresponding values
        """
        model = model.eval()

        review_texts = []
        predictions = []
        prediction_probs = []
        real_values = []

        with torch.no_grad():
            for d in data_loader:
                texts = d["review_text"]
                input_ids = d["input_ids"].to(self.device)
                attention_mask = d["attention_mask"].to(self.device)
                targets = d["targets"].to(self.device)

                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)

                probs = F.softmax(outputs, dim=1)

                review_texts.extend(texts)
                predictions.extend(preds)
                prediction_probs.extend(probs)
                real_values.extend(targets)

        predictions = torch.stack(predictions).cpu()
        prediction_probs = torch.stack(prediction_probs).cpu()
        real_values = torch.stack(real_values).cpu()
        return review_texts, predictions, prediction_probs, real_values
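A minimal usage sketch for the class above, assuming the surrounding script defines `class_names`, `AGNewsDataset` and the imports used by the class (torch, transformers, pandas, etc.); the CLI flag names mirror the attributes read in `__init__` and are otherwise hypothetical.

# Hypothetical driver for NewsClassifier (a sketch, not part of the original script).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train the AG News BERT classifier")
    parser.add_argument("--max_epochs", type=int, default=2)
    parser.add_argument("--num_samples", type=int, default=2000)
    parser.add_argument("--vocab_file", type=str, required=True,
                        help="URL of the bert-base-uncased vocab file")
    args = parser.parse_args()

    model = NewsClassifier(args)
    model = model.to(model.device)

    model.prepare_data()        # download AG_NEWS and build the three dataloaders
    model.setOptimizer()        # AdamW + linear warmup schedule + CrossEntropyLoss
    model.startTraining(model)

    test_acc, test_loss = model.eval_model(model, model.test_data_loader)
    print(f"Test loss {test_loss} accuracy {test_acc}")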
Code example #2
def train(args, train_dataset, model, tokenizer, teacher=None):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            if teacher is not None:
                teacher.eval()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type != "distilbert":
                inputs[
                    "token_type_ids"] = None if args.model_type == "xlm" else batch[
                        2]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
            outputs = model(**inputs)
            loss, start_logits_stu, end_logits_stu = outputs

            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs[
                        "token_type_ids"] = None if args.teacher_type == "xlm" else batch[
                            2]
                with torch.no_grad():
                    start_logits_tea, end_logits_tea = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
                assert start_logits_tea.size() == start_logits_stu.size()
                assert end_logits_tea.size() == end_logits_stu.size()

                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = loss_fct(
                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
                    F.softmax(start_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2)
                loss_end = loss_fct(
                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
                    F.softmax(end_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2)
                loss_ce = (loss_start + loss_end) / 2.0

                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
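The distillation term in the loop above can be exercised in isolation. The following stand-alone sketch uses random logits in place of real student/teacher outputs and arbitrary values for `temperature`, `alpha_ce` and `alpha_squad`; it only illustrates the temperature-scaled KL term, not the full training step.

# Stand-alone sketch of the temperature-scaled KL distillation term (illustrative values only).
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len = 4, 384
temperature, alpha_ce, alpha_squad = 2.0, 0.5, 0.5

start_logits_stu = torch.randn(batch_size, seq_len)   # student start logits
start_logits_tea = torch.randn(batch_size, seq_len)   # teacher start logits

# KLDivLoss expects log-probabilities as input and probabilities as target.
loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = loss_fct(
    F.log_softmax(start_logits_stu / temperature, dim=-1),
    F.softmax(start_logits_tea / temperature, dim=-1),
) * (temperature ** 2)

# The loop above computes the same term for the end logits, averages the two,
# and mixes the result with the supervised QA loss.
squad_loss = torch.tensor(1.0)  # placeholder for the model's own loss
total_loss = alpha_ce * loss_start + alpha_squad * squad_loss
print(total_loss.item())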
Code example #3
def main():
    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    args = parse_args()
    distributed_args = accelerate.DistributedDataParallelKwargs(
        find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[distributed_args])
    device = accelerator.device
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        filename=f'xmc_{args.dataset}_{args.mode}_{args.log}.log',
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    ch = logging.StreamHandler(sys.stdout)
    logger.addHandler(ch)
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()

    logger.info(sent_trans.__file__)

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Load pretrained model and tokenizer
    if args.model_name_or_path == 'bert-base-uncased' or args.model_name_or_path == 'sentence-transformers/paraphrase-mpnet-base-v2':
        query_encoder = build_encoder(
            args.model_name_or_path,
            args.max_label_length,
            args.pooling_mode,
            args.proj_emb_dim,
        )
    else:
        query_encoder = sent_trans.SentenceTransformer(args.model_name_or_path)

    tokenizer = query_encoder._first_module().tokenizer

    block_encoder = query_encoder

    model = DualEncoderModel(query_encoder, block_encoder, args.mode)
    model = model.to(device)

    # the whole label set
    data_path = os.path.join(os.path.abspath(os.getcwd()), 'dataset',
                             args.dataset)
    all_labels = pd.read_json(os.path.join(data_path, 'lbl.json'), lines=True)
    label_list = list(all_labels.title)
    label_ids = list(all_labels.uid)
    label_data = SimpleDataset(label_list, transform=tokenizer.encode)

    # label dataloader for searching
    sampler = SequentialSampler(label_data)
    label_padding_func = lambda x: padding_util(x, tokenizer.pad_token_id, 64)
    label_dataloader = DataLoader(label_data,
                                  sampler=sampler,
                                  batch_size=16,
                                  collate_fn=label_padding_func)

    # label dataloader for regularization
    reg_sampler = RandomSampler(label_data)
    reg_dataloader = DataLoader(label_data,
                                sampler=reg_sampler,
                                batch_size=4,
                                collate_fn=label_padding_func)

    if args.mode == 'ict':
        train_data = ICTXMCDataset(tokenizer=tokenizer, dataset=args.dataset)
    elif args.mode == 'self-train':
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode)
    elif args.mode == 'finetune-pair':
        train_path = os.path.join(data_path, 'trn.json')
        pos_pair = []
        with open(train_path) as fp:
            for i, line in enumerate(fp):
                inst = json.loads(line.strip())
                inst_id = inst['uid']
                for ind in inst['target_ind']:
                    pos_pair.append((inst_id, ind, i))
        dataset_size = len(pos_pair)
        indices = list(range(dataset_size))
        split = int(np.floor(args.ratio * dataset_size))
        np.random.shuffle(indices)
        train_indices = indices[:split]
        torch.distributed.broadcast_object_list(train_indices,
                                                src=0,
                                                group=None)
        sample_pairs = [pos_pair[i] for i in train_indices]
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode,
                                sample_pairs=sample_pairs)
    elif args.mode == 'finetune-label':
        label_index = []
        label_path = os.path.join(data_path, 'label_index.json')
        with open(label_path) as fp:
            for line in fp:
                label_index.append(json.loads(line.strip()))
        np.random.shuffle(label_index)
        sample_size = int(np.floor(args.ratio * len(label_index)))
        sample_label = label_index[:sample_size]
        torch.distributed.broadcast_object_list(sample_label,
                                                src=0,
                                                group=None)
        sample_pairs = []
        for i, label in enumerate(sample_label):
            ind = label['ind']
            for inst_id in label['instance']:
                sample_pairs.append((inst_id, ind, i))
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode,
                                sample_pairs=sample_pairs)

    train_sampler = RandomSampler(train_data)
    padding_func = lambda x: ICT_batchify(x, tokenizer.pad_token_id, 64, 288)
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        sampler=train_sampler,
        batch_size=args.per_device_train_batch_size,
        num_workers=4,
        pin_memory=False,
        collate_fn=padding_func)

    try:
        accelerator.print("load cache")
        all_instances = torch.load(
            os.path.join(data_path, 'all_passages_with_titles.json.cache.pt'))
        test_data = SimpleDataset(all_instances.values())
    except:
        all_instances = {}
        test_path = os.path.join(data_path, 'tst.json')
        if args.mode == 'ict':
            train_path = os.path.join(data_path, 'trn.json')
            train_instances = {}
            valid_passage_ids = train_data.valid_passage_ids
            with open(train_path) as fp:
                for line in fp:
                    inst = json.loads(line.strip())
                    train_instances[
                        inst['uid']] = inst['title'] + '\t' + inst['content']
            for inst_id in valid_passage_ids:
                all_instances[inst_id] = train_instances[inst_id]
        test_ids = []
        with open(test_path) as fp:
            for line in fp:
                inst = json.loads(line.strip())
                all_instances[
                    inst['uid']] = inst['title'] + '\t' + inst['content']
                test_ids.append(inst['uid'])
        simple_transform = lambda x: tokenizer.encode(
            x, max_length=288, truncation=True)
        test_data = SimpleDataset(list(all_instances.values()),
                                  transform=simple_transform)
        inst_num = len(test_data)

    sampler = SequentialSampler(test_data)
    sent_padding_func = lambda x: padding_util(x, tokenizer.pad_token_id, 288)
    instance_dataloader = DataLoader(test_data,
                                     sampler=sampler,
                                     batch_size=128,
                                     collate_fn=sent_padding_func)

    # prepare pairs
    reader = csv.reader(open(os.path.join(data_path, 'all_pairs.txt'),
                             encoding="utf-8"),
                        delimiter=" ")
    qrels = {}
    for id, row in enumerate(reader):
        query_id, corpus_id, score = row[0], row[1], int(row[2])
        if query_id not in qrels:
            qrels[query_id] = {corpus_id: score}
        else:
            qrels[query_id][corpus_id] = score

    logging.info("| |ICT_dataset|={} pairs.".format(len(train_data)))

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, label_dataloader, reg_dataloader, instance_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, label_dataloader, reg_dataloader,
        instance_dataloader)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    # args.max_train_steps = 100000
    args.num_train_epochs = math.ceil(args.max_train_steps /
                                      num_update_steps_per_epoch)
    args.num_warmup_steps = int(0.1 * args.max_train_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_data)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Learning Rate = {args.learning_rate}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    from torch.cuda.amp import autocast
    scaler = torch.cuda.amp.GradScaler()
    cluster_result = eval_and_cluster(args, logger, completed_steps,
                                      accelerator.unwrap_model(model),
                                      label_dataloader, label_ids,
                                      instance_dataloader, inst_num, test_ids,
                                      qrels, accelerator)
    reg_iter = iter(reg_dataloader)
    trial_name = f"dim-{args.proj_emb_dim}-bs-{args.per_device_train_batch_size}-{args.dataset}-{args.log}-{args.mode}"
    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t for t in batch)
            label_tokens, inst_tokens, indices = batch
            if args.mode == 'ict':
                try:
                    reg_data = next(reg_iter)
                except StopIteration:
                    reg_iter = iter(reg_dataloader)
                    reg_data = next(reg_iter)

            if cluster_result is not None:
                pseudo_labels = cluster_result[indices]
            else:
                pseudo_labels = indices
            with autocast():
                if args.mode == 'ict':
                    label_emb, inst_emb, inst_emb_aug, reg_emb = model(
                        label_tokens, inst_tokens, reg_data)
                    loss, stats_dict = loss_function_reg(
                        label_emb, inst_emb, inst_emb_aug, reg_emb,
                        pseudo_labels, accelerator)
                else:
                    label_emb, inst_emb = model(label_tokens,
                                                inst_tokens,
                                                reg_data=None)
                    loss, stats_dict = loss_function(label_emb, inst_emb,
                                                     pseudo_labels,
                                                     accelerator)
                loss = loss / args.gradient_accumulation_steps

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                scaler.step(optimizer)
                scaler.update()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps % args.logging_steps == 0:
                if args.mode == 'ict':
                    logger.info(
                        "| Epoch [{:4d}/{:4d}] Step [{:8d}/{:8d}] Total Loss {:.6e}  Contrast Loss {:.6e}  Reg Loss {:.6e}"
                        .format(
                            epoch,
                            args.num_train_epochs,
                            completed_steps,
                            args.max_train_steps,
                            stats_dict["loss"].item(),
                            stats_dict["contrast_loss"].item(),
                            stats_dict["reg_loss"].item(),
                        ))
                else:
                    logger.info(
                        "| Epoch [{:4d}/{:4d}] Step [{:8d}/{:8d}] Total Loss {:.6e}"
                        .format(
                            epoch,
                            args.num_train_epochs,
                            completed_steps,
                            args.max_train_steps,
                            stats_dict["loss"].item(),
                        ))
            if completed_steps % args.eval_steps == 0:
                cluster_result = eval_and_cluster(
                    args, logger, completed_steps,
                    accelerator.unwrap_model(model), label_dataloader,
                    label_ids, instance_dataloader, inst_num, test_ids, qrels,
                    accelerator)
                unwrapped_model = accelerator.unwrap_model(model)

                unwrapped_model.label_encoder.save(
                    f"{args.output_dir}/{trial_name}/label_encoder")
                unwrapped_model.instance_encoder.save(
                    f"{args.output_dir}/{trial_name}/instance_encoder")

            if completed_steps >= args.max_train_steps:
                break
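The autocast/GradScaler pattern with gradient accumulation used in the loop above, reduced to a toy sketch; the model, data and `accumulation_steps` here are illustrative. The sketch unscales and clips only on the iterations where the optimizer actually steps, which matches the documented usage of `GradScaler.unscale_`.

# Toy sketch of mixed-precision training with gradient accumulation (illustrative sizes).
import torch
import torch.nn as nn
from torch.cuda.amp import GradScaler, autocast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(32, 4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scaler = GradScaler(enabled=torch.cuda.is_available())
loss_fn = nn.CrossEntropyLoss()
accumulation_steps = 4

data = [(torch.randn(8, 32), torch.randint(0, 4, (8,))) for _ in range(16)]
for step, (x, y) in enumerate(data):
    x, y = x.to(device), y.to(device)
    with autocast(enabled=torch.cuda.is_available()):
        loss = loss_fn(model(x), y) / accumulation_steps
    scaler.scale(loss).backward()
    if (step + 1) % accumulation_steps == 0 or step == len(data) - 1:
        # Unscale once per optimizer step, then clip, step and update the scale.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()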
Code example #4
File: run_glue.py  Project: rsvp-ai/segatron_aaai
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(
                os.path.join(args.model_name_or_path, 'scheduler.pt')):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.output_dir) and not args.overwrite_output_dir:
        # set global_step to global_step of last saved checkpoint from model path
        global_step = int(args.output_dir.split('-')[-1].split('/')[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.input_type == 'para_sent':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'para_pos': batch[4],
                    'sent_pos': batch[5]
                }
            elif args.input_type == 'para_sent_token':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'para_pos': batch[4],
                    'sent_pos': batch[5],
                    'token_pos': batch[6]
                }
            elif args.input_type == 'sent_token':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'token_pos': batch[5],
                    'sent_pos': batch[4]
                }
            elif args.input_type == 'para_token':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'token_pos': batch[5],
                    'sent_pos': batch[4]
                }

            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = 'eval_{}'.format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs['learning_rate'] = learning_rate_scalar
                    logs['loss'] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{'step': global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, 'optimizer.pt'))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, 'scheduler.pt'))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.save_steps < 0 and args.local_rank in [-1, 0]:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model, 'module'
            ) else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)

            torch.save(optimizer.state_dict(),
                       os.path.join(output_dir, 'optimizer.pt'))
            torch.save(scheduler.state_dict(),
                       os.path.join(output_dir, 'scheduler.pt'))
            logger.info("Saving optimizer and scheduler states to %s",
                        output_dir)
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
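The resume path above relies on `optimizer.pt` and `scheduler.pt` being written alongside each checkpoint. Below is a small sketch of that save/restore round trip with a toy model; the paths and sizes are illustrative, not taken from the script.

# Sketch of the optimizer/scheduler checkpointing round trip (toy model, temporary directory).
import os
import tempfile

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(16, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=10,
                                            num_training_steps=100)

output_dir = tempfile.mkdtemp()
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

# Later, before resuming training, restore both states as the function above does.
optimizer.load_state_dict(
    torch.load(os.path.join(output_dir, "optimizer.pt")))
scheduler.load_state_dict(
    torch.load(os.path.join(output_dir, "scheduler.pt")))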
Code example #5
def main():
    # Parse the arguments
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForSeq2SeqLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        assert (args.target_lang is not None and args.source_lang
                is not None), "mBart requires --target_lang and --source_lang"
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
                args.target_lang]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(
                args.target_lang)

    if model.config.decoder_start_token_id is None:
        raise ValueError(
            "Make sure that `config.decoder_start_token_id` is correctly defined"
        )

    prefix = args.source_prefix if args.source_prefix is not None else ""

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names

    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
    if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        if args.source_lang is not None:
            tokenizer.src_lang = args.source_lang
        if args.target_lang is not None:
            tokenizer.tgt_lang = args.target_lang

    # Get the language codes for input/target.
    source_lang = args.source_lang.split("_")[0]
    target_lang = args.target_lang.split("_")[0]

    padding = "max_length" if args.pad_to_max_length else False

    # Temporarily set max_target_length for training.
    max_target_length = args.max_target_length
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        inputs = [ex[source_lang] for ex in examples["translation"]]
        targets = [ex[target_lang] for ex in examples["translation"]]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs,
                                 max_length=args.max_source_length,
                                 padding=padding,
                                 truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets,
                               max_length=max_target_length,
                               padding=padding,
                               truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and args.ignore_pad_token_for_loss:
            labels["input_ids"] = [[
                (l if l != tokenizer.pad_token_id else -100) for l in label
            ] for label in labels["input_ids"]]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForSeq2Seq` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if accelerator.use_fp16 else None,
        )

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
    # be shorter in a multi-process setup)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    metric = load_metric("sacrebleu")

    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()

        if args.val_max_target_length is None:
            args.val_max_target_length = args.max_target_length

        gen_kwargs = {
            "max_length":
            args.val_max_target_length
            if args is not None else config.max_length,
            "num_beams":
            args.num_beams,
        }
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                generated_tokens = accelerator.unwrap_model(model).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    **gen_kwargs,
                )

                generated_tokens = accelerator.pad_across_processes(
                    generated_tokens, dim=1, pad_index=tokenizer.pad_token_id)
                labels = batch["labels"]
                if not args.pad_to_max_length:
                    # If we did not pad to max length, we need to pad the labels too
                    labels = accelerator.pad_across_processes(
                        batch["labels"],
                        dim=1,
                        pad_index=tokenizer.pad_token_id)

                generated_tokens = accelerator.gather(
                    generated_tokens).cpu().numpy()
                labels = accelerator.gather(labels).cpu().numpy()

                if args.ignore_pad_token_for_loss:
                    # Replace -100 in the labels as we can't decode them.
                    labels = np.where(labels != -100, labels,
                                      tokenizer.pad_token_id)

                decoded_preds = tokenizer.batch_decode(
                    generated_tokens, skip_special_tokens=True)
                decoded_labels = tokenizer.batch_decode(
                    labels, skip_special_tokens=True)

                decoded_preds, decoded_labels = postprocess_text(
                    decoded_preds, decoded_labels)

                metric.add_batch(predictions=decoded_preds,
                                 references=decoded_labels)
        eval_metric = metric.compute()
        logger.info({"bleu": eval_metric["score"]})

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
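The `postprocess_text` helper above wraps each label in its own list because sacrebleu scores every prediction against a list of references. A minimal sketch of that convention, assuming the `datasets` and `sacrebleu` packages are installed:

from datasets import load_metric

metric = load_metric("sacrebleu")
preds = ["the cat sat on the mat"]
labels = ["the cat sat on the mat"]
# strip the predictions and wrap each label in a list (one reference list per prediction)
preds, labels = [p.strip() for p in preds], [[l.strip()] for l in labels]
metric.add_batch(predictions=preds, references=labels)
print(metric.compute()["score"])  # 100.0 for an exact match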
Code Example #6
File: run_squad.py  Project: ripplesaround/DD4CLinQA
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler_total = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    subset_quantity = args.div_subset

    # notice: difficulty partitioning
    curriculum_sets_temp = []

    # done: how to ensure the curriculum subsets actually get sampled
    diff_eval_result = Difficulty_Evaluation(args, train_dataset)
    for i, subset in enumerate(diff_eval_result):
        gate = int(
            (len(train_dataset) / args.train_batch_size) / (subset_quantity))
        print("第", i, "个 num:", len(subset), " 阈值 ", gate)
        random.shuffle(subset)
        # if the subset is too small, don't sample from it
        if len(subset) > gate:
            # subset = list(subset)
            # decide how many examples to sample from each subset
            curriculum_sets_temp.append(subset[0:int(gate / subset_quantity)])
        # elif(len(subset) <= int(gate/subset_quantity)):
        #     for i in range(subset_quantity):
        #         curriculum_sets_temp.append(subset)
        else:
            curriculum_sets_temp.append(subset)
        # curriculum_sets_temp.append(subset)

    # variant without sampling:
    # diff_eval_result = Difficulty_Evaluation(args, train_dataset)
    # for _ in range(int(args.num_train_epochs)):
    #     for i, subset in enumerate(diff_eval_result):
    #         random.shuffle(subset)
    #         curriculum_sets_temp.append(subset)

    # random partitioning variant:
    # curriculum_sets_temp = Difficulty_Evaluation_Randomly(args,train_dataset)

    # first add the full training set, once per epoch
    curriculum_sets = []
    total_train_dataloader = DataLoader(train_dataset,
                                        sampler=train_sampler_total,
                                        batch_size=args.train_batch_size)
    for i in range(int(args.num_train_epochs)):
        curriculum_sets.append(total_train_dataloader)

    # then add the curriculum tasks
    # notice: order of the curriculum tasks
    curriculum_sets += curriculum_sets_temp

    # curriculum-learning (CL) stage training

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(curriculum_sets[0]) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            curriculum_sets[0]
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    # notice: add L2 regularization (weight decay)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon,
                      weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(curriculum_sets[0]))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(curriculum_sets[0]) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(curriculum_sets[0]) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        # epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
        epochs_trained,
        int(len(curriculum_sets)),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    current_stage = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(curriculum_sets[current_stage],
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            # print("batch_size",batch[0].shape)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert", "bart",
                    "longformer"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            # # notice: add a KL loss or the WGAN Wasserstein term
            # pa = 0.0001
            # for i in range(args.train_batch_size):
            #     loss += ((pa)*
            #              ((cal_diff(x=outputs.hidden_states[0], y=outputs.hidden_states[-1], norm="line",criterion="kl")+
            #               cal_diff(x=outputs.hidden_states[-1], y=outputs.hidden_states[0], norm="line", criterion="kl")
            #               )/2)
            #              )

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        current_stage += 1

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
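The training function above steps through `curriculum_sets` one dataloader per outer iteration: full-dataset warm-up stages first, then the difficulty-ordered subsets. A minimal, self-contained sketch of that staging pattern (the toy dataset and subset boundaries are invented for illustration):

import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

# toy dataset; pretend the indices are already sorted from easy to hard
full_dataset = TensorDataset(torch.randn(100, 8), torch.randint(0, 2, (100,)))
easy, medium, hard = range(0, 30), range(30, 60), range(60, 100)

# warm-up stage on everything, then one stage per difficulty subset
stages = [DataLoader(full_dataset, batch_size=16, shuffle=True)]
stages += [DataLoader(Subset(full_dataset, list(idx)), batch_size=16)
           for idx in (easy, medium, hard)]

for loader in stages:
    for batch in loader:
        pass  # one optimization step per batch would go here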
Code Example #7
class Experiment(object):
    def __init__(self,
                 config,
                 model,
                 tokenizer,
                 total_samples=None,
                 label_names=None,
                 results=None,
                 run_name=None):
        self.config = config
        self.model = model
        self.tokenizer = tokenizer

        self.global_step = 0

        self.optimizer_state_dict = None
        self.scheduler_state_dict = None

        self.total_samples = total_samples

        self.label_names = label_names

        self.results = results
        self.run_name = run_name

        util.set_seed(config)

    __prfs_names = ['precision', 'recall', 'f1', 'support']
    __report_metrics = [
        'acc', 'macro_f1', 'micro_f1', 'macro_auc', 'avg_precision'
    ]

    def after_eval_cb(self, eval_name, result, pred_label_ids, preds,
                      extra_log):

        row = OrderedDict(step=self.global_step,
                          eval_name=eval_name,
                          run_name=self.run_name)

        row.update(extra_log)

        for key in self.__report_metrics:
            if key in result:
                row[key] = result[key]

        prfs = result['prfs']

        for metric_idx, metric_name in enumerate(self.__prfs_names):
            for label_idx, label_name in enumerate(self.label_names):
                col_name = f"{label_name}_{metric_name}"
                row[col_name] = result['prfs'][metric_idx][label_idx]

        if self.config.seeds:
            row['seed'] = self.config.seed

        if self.results is None:
            logger.warning("Creating new results DataFrame")
            self.results = pd.DataFrame(row, columns=row.keys(), index=[0])
        else:
            logger.debug("Adding row: %s", row)
            self.results = self.results.append(row, ignore_index=True)

        if self.config.get('out_file', None):
            self.results.to_csv(self.config.out_file, index=False)
            # results = self.results
            # key = self.run_name
            # if key not in results:
            #     results[key] = {}
            # if eval_name not in results[key]:
            #     results[key][eval_name] = {}
            # results[key][eval_name][self.global_step] = result
            # with open(self.config.out_file, 'w') as f:
            #     json.dump(results, f, indent=4, cls=util.ExtendedJSONEncoder)

    def after_logging(self, result):
        pass

    def train(self,
              train_dataloader,
              valid_dataloader=None,
              test_dataloader=None,
              should_continue=False):
        """ Train the model """
        tb_writer = SummaryWriter()

        train_epochs = self.config.train_epochs

        if self.config.max_steps > 0:
            train_steps = self.config.max_steps
            train_epochs = self.config.max_steps // (
                len(train_dataloader) // self.config.grad_acc_steps) + 1
        else:
            train_steps = len(
                train_dataloader) // self.config.grad_acc_steps * train_epochs

        if self.total_samples and should_continue:
            steps_total = self.total_samples // self.config.train_bs // self.config.grad_acc_steps * train_epochs
        else:
            steps_total = train_steps

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.config.weight_decay,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.config.lr,
            eps=self.config.adam_eps,
        )

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=steps_total)

        # self.scheduler = get_constant_schedule(self.optimizer)

        if should_continue and self.global_step > 0:
            logger.info("loading saved optimizer and scheduler states")
            assert (self.optimizer_state_dict)
            assert (self.scheduler_state_dict)
            self.optimizer.load_state_dict(self.optimizer_state_dict)
            self.scheduler.load_state_dict(self.scheduler_state_dict)
        else:
            logger.info("Using fresh optimizer and scheduler")

        if self.config.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level=self.config.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if self.config.n_gpu > 1 and not isinstance(self.model,
                                                    torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d (%d)", len(train_dataloader.dataset),
                    len(train_dataloader))
        logger.info("  Num Epochs = %d", train_epochs)
        logger.info("  Batch size = %d", self.config.train_bs)
        logger.info("  Learning rate = %e", self.config.lr)
        logger.info("  Loss label weights = %s",
                    self.config.loss_label_weights)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.config.train_bs * self.config.grad_acc_steps)
        logger.info("  Gradient Accumulation steps = %d",
                    self.config.grad_acc_steps)
        logger.info("  Total optimization steps = %d", train_steps)

        if not should_continue:
            self.global_step = 0

        epochs_trained = 0
        steps_trained_in_current_epoch = 0

        # # Check if continuing training from a checkpoint
        # if os.path.exists(self.config.model_path):
        #     if self.config.should_continue:
        #         step_str = self.config.model_path.split("-")[-1].split("/")[0]

        #         if step_str:
        #             # set self.global_step to gobal_step of last saved checkpoint from model path
        #             self.global_step = int(step_str)
        #             epochs_trained = self.global_step // (len(train_dataloader) //
        #                                                   self.config.grad_acc_steps)
        #             steps_trained_in_current_epoch = self.global_step % (
        #                 len(train_dataloader) // self.config.grad_acc_steps)

        #             logger.info(
        #                 "  Continuing training from checkpoint, will skip to saved self.global_step")
        #             logger.info(
        #                 "  Continuing training from epoch %d", epochs_trained)
        #             logger.info(
        #                 "  Continuing training from global step %d", self.global_step)
        #             logger.info("  Will skip the first %d steps in the first epoch",
        #                         steps_trained_in_current_epoch)

        train_loss = 0.0
        self.model.zero_grad()
        train_iterator = trange(
            epochs_trained,
            int(train_epochs),
            desc="Epoch",
        )
        util.set_seed(self.config)  # Added here for reproducibility

        self.model.train()

        if self.config.train_head_only:
            for param in self.model.roberta.embeddings.parameters():
                param.requires_grad = False
            logger.info("Training only head")
            # for param in self.model.__getattr__(self.config.model_type).roberta.parameters():
            #     param.requires_grad = False

        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                self.model.train()

                inputs = self.__inputs_from_batch(batch)
                outputs = self.model(**inputs)

                # model outputs are always tuple in transformers (see doc)
                loss = outputs[0]

                if self.config.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if self.config.grad_acc_steps > 1:
                    loss = loss / self.config.grad_acc_steps

                if self.config.fp16:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                batch_loss = loss.item()
                train_loss += batch_loss

                if (step + 1) % self.config.grad_acc_steps == 0:
                    if self.config.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(self.optimizer),
                            self.config.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.config.max_grad_norm)

                    self.optimizer.step()
                    self.scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    self.global_step += 1

                    if self.config.logging_steps > 0 and self.global_step % self.config.logging_steps == 0:
                        logs = {}
                        if valid_dataloader:
                            result_valid, * \
                                _ = self.evaluate(
                                    'valid', valid_dataloader, backtrans=(test_dataloader is None))
                            logs.update({
                                f"valid_{k}": v
                                for k, v in result_valid.items()
                            })

                        if test_dataloader:
                            test_dataloader = test_dataloader if isinstance(
                                test_dataloader, dict) else {
                                    'test': test_dataloader
                                }
                            for eval_name, dataloader_or_tuple in test_dataloader.items(
                            ):
                                if isinstance(dataloader_or_tuple, tuple):
                                    dataloader, kwargs = dataloader_or_tuple
                                else:
                                    dataloader = dataloader_or_tuple
                                    kwargs = {}

                                result_test, * \
                                    _ = self.evaluate(
                                        eval_name, dataloader, **kwargs)
                                logs.update({
                                    f"{eval_name}_{k}": v
                                    for k, v in result_test.items()
                                })

                        learning_rate_scalar = self.scheduler.get_last_lr()[0]
                        logger.info("Learning rate: %f (at step %d)",
                                    learning_rate_scalar, step)
                        logs["learning_rate"] = learning_rate_scalar
                        logs["train_loss"] = train_loss

                        self.after_logging(logs)

                        logger.info("Batch loss: %f", batch_loss)

                        # for key, value in logs.items():
                        #     tb_writer.add_scalar(key, value, self.global_step)

                    if self.config.save_steps > 0 and self.global_step % self.config.save_steps == 0:
                        # Save model checkpoint
                        self.save_checkpoint()

                if self.config.max_steps > 0 and self.global_step > self.config.max_steps:
                    epoch_iterator.close()
                    break
            if self.config.max_steps > 0 and self.global_step > self.config.max_steps:
                train_iterator.close()
                break

        if self.config.train_head_only:
            logger.info("Training only head")
            # for param in self.model.__getattr__(self.config.model_type).parameters():
            #     param.requires_grad = True

            for param in self.model.roberta.embeddings.parameters():
                param.requires_grad = False

        tb_writer.close()
        self.optimizer_state_dict = self.optimizer.state_dict()
        self.scheduler_state_dict = self.scheduler.state_dict()

        avg_train_loss = train_loss / self.global_step

        logger.info("Learning rate now: %s", self.scheduler.get_last_lr())
        logger.info("***** Done training *****")
        return self.global_step, avg_train_loss

    def save_model(self, model_path):
        if not os.path.exists(model_path):
            os.makedirs(model_path)

        logger.info("Saving model to %s", model_path)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`

        model_to_save = (self.model.module
                         if hasattr(self.model, "module") else self.model)
        model_to_save.save_pretrained(model_path)
        self.tokenizer.save_pretrained(model_path)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.config.as_dict(),
                   os.path.join(model_path, "training_config.bin"))

    def save_checkpoint(self):
        output_dir = os.path.join(self.config.output_model_path,
                                  "checkpoint-{}".format(self.global_step))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = (
            self.model.module if hasattr(self.model, "module") else self.model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        torch.save(self.config.as_dict(),
                   os.path.join(output_dir, "training_self.config.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)

        torch.save(self.optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
        torch.save(self.scheduler.state_dict(),
                   os.path.join(output_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", output_dir)

    def predict(self, dataloader):
        self.model.eval()

        preds = None

        for batch in tqdm(dataloader, desc="Predicting"):
            batch = tuple(t.to(self.config.device) for t in batch)

            input_ids, attention_mask, _ = batch

            with torch.no_grad():
                inputs = {
                    "input_ids": input_ids,
                    "attention_mask": attention_mask
                }
                # if config.model_type != "distilbert":
                #    inputs["token_type_ids"] = (
                #        batch[2] if config.model_type in [
                #            "bert", "xlnet", "albert"] else None
                #    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                outputs = self.model(**inputs)
                logits = outputs[0]

            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)

        return preds

    def logits_to_label_ids(self, logits):
        if not self.config.multi_label:
            label_ids = np.argmax(logits, axis=1)
        else:
            label_ids = torch.sigmoid(torch.from_numpy(logits)).numpy() > 0.5
        return label_ids

    def evaluate(self,
                 eval_name,
                 dataloader,
                 mc_dropout=False,
                 skip_cb=False,
                 pred_label_ids_func=None,
                 backtrans=True,
                 extra_log={}):
        dropout_ps = {}

        def set_dropout_to_train(m):
            if type(m) == nn.Dropout:
                logger.info("setting dropout into train mode (%s)", str(m))
                dropout_ps[m] = m.p  # remember the original p so it can be restored later
                m.p = 0.5
                m.train()

        def reset_dropout_to_eval(m):
            if type(m) == nn.Dropout:
                p = dropout_ps[m]
                logger.info("reseting dropout into eval mode (%s) p=%d",
                            str(m), p)
                m.p = p
                m.eval()

        # Eval!
        logger.info("***** Running evaluation %s*****", eval_name)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", self.config.eval_bs)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        true_label_ids = None

        self.model.eval()

        if mc_dropout:
            self.model.apply(set_dropout_to_train)

        for batch in tqdm(dataloader, desc="Evaluating"):

            with torch.no_grad():
                inputs = self.__inputs_from_batch(batch)
                labels = inputs['labels']

                outputs = self.model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                true_label_ids = labels.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                true_label_ids = np.append(true_label_ids,
                                           labels.detach().cpu().numpy(),
                                           axis=0)

        if mc_dropout:
            self.model.apply(reset_dropout_to_eval)

        eval_loss = eval_loss / nb_eval_steps

        if self.config.test_backtrans_langs and backtrans:
            logger.info('Using test augmentation...')
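            # Back-translation test-time augmentation: predictions are assumed
            # to be stacked as the original examples followed by one
            # back-translated copy per language, so splitting into equal groups
            # and averaging yields one ensemble prediction per original example.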
            groups = np.split(preds, len(self.config.test_backtrans_langs) + 1)
            #preds = sum(groups)

            preds = np.mean(groups, axis=0)
            #preds = np.maximum.reduce(groups)
            true_label_ids = true_label_ids[:preds.shape[0]]

        label_idxs = list(range(len(self.label_names)))

        if self.config.soft_label:
            true_label_ids = np.argmax(true_label_ids, axis=1)

        pred_label_ids = self.logits_to_label_ids(preds)

        if pred_label_ids_func:
            pred_label_ids = pred_label_ids_func(pred_label_ids)

        # print(out_label_ids)
        # print(max_preds)
        # print(out_label_ids.shape, max_preds.shape)

        result = {
            'acc':
            accuracy_score(true_label_ids, pred_label_ids),
            'macro_f1':
            f1_score(true_label_ids, pred_label_ids, average='macro'),
            'micro_f1':
            f1_score(true_label_ids, pred_label_ids, average='micro'),
            'prfs':
            precision_recall_fscore_support(true_label_ids,
                                            pred_label_ids,
                                            labels=label_idxs)
        }

        if not self.config.multi_label:
            result['cm'] = confusion_matrix(true_label_ids,
                                            pred_label_ids).ravel()

        if self.config.num_labels == 2:
            result['macro_auc'] = roc_auc_score(true_label_ids,
                                                pred_label_ids,
                                                average='macro')
            result['avg_precision'] = average_precision_score(
                true_label_ids, pred_label_ids)

        logger.info("***** Eval results {} *****".format(eval_name))

        try:
            logger.info(
                "\n %s",
                classification_report(
                    true_label_ids,
                    pred_label_ids,
                    labels=label_idxs,
                    target_names=self.label_names,
                ))

            result['report'] = classification_report(
                true_label_ids,
                pred_label_ids,
                labels=label_idxs,
                target_names=self.label_names,
                output_dict=True)
        except ValueError as e:
            print(e)
            pass

        logger.info("\n Accuracy = %f", result['acc'])

        if self.config.num_labels == 2:
            logger.info("\n MacroAUC = %f", result['macro_auc'])
            logger.info("\n AUPRC = %f", result['avg_precision'])

        logger.info("***** Done evaluation *****")

        if not skip_cb:
            self.after_eval_cb(eval_name, result, pred_label_ids, preds,
                               extra_log)
        return result, pred_label_ids, preds

    def __inputs_from_batch(self, batch, labels=True):
        batch = tuple(t.to(self.config.device) for t in batch)
        input_ids, attention_mask, label_ids, *rest = batch

        if rest:
            extra_features = rest[0]
        else:
            extra_features = None

        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "extra_features": extra_features
        }
        if labels:
            inputs["labels"] = label_ids

        # if self.config.model_type != "distilbert":
        #    inputs["token_type_ids"] = (
        #        batch[2] if self.config.model_type in [
        #            "bert", "xlnet", "albert"] else None
        #    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

        #         outputs = model(b_input_ids, token_type_ids=None,
        #                         attention_mask=b_input_mask, labels=b_labels)

        return inputs

    def interpret(self, dataloader, df, label_names=None):

        dataset = dataloader.dataset
        sampler = SequentialSampler(dataset)

        # We need a sequential dataloader with bs=1
        dataloader = DataLoader(dataset,
                                sampler=sampler,
                                batch_size=1,
                                num_workers=4)

        logger.info("***** Running interpretation *****")
        logger.info("  Num examples = %d", len(dataset))

        # preds = None
        losses = None
        pred_labels = []

        self.model.eval()

        for batch in tqdm(dataloader, desc="Interpretation"):

            with torch.no_grad():
                inputs = self.__inputs_from_batch(batch)
                # if config.model_type != "distilbert":
                #    inputs["token_type_ids"] = (
                #        batch[2] if config.model_type in [
                #            "bert", "xlnet", "albert"] else None
                #    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                outputs = self.model(**inputs)
                batch_loss, logits = outputs[:2]

                if self.config.n_gpu > 1:
                    batch_loss = batch_loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                batch_loss = batch_loss.detach().cpu().view(1)

                pred_label_ids = self.logits_to_label_ids(
                    logits.detach().cpu())
                pred_label_id = pred_label_ids[0]
                if label_names:
                    pred_labels.append(label_names[pred_label_id])
                else:
                    pred_labels.append(pred_label_id)

            if losses is None:
                # preds = logits.detach().cpu().numpy()
                losses = batch_loss
            else:
                # preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                losses = torch.cat((losses, batch_loss), dim=0)

        top_values, top_indices = torch.topk(losses, 100)
        top_indices = top_indices.numpy()
        top_pred_labels = [pred_labels[top_index] for top_index in top_indices]

        top_df = df.iloc[top_indices]
        top_df = top_df.assign(loss=top_values.numpy(),
                               pred_label=top_pred_labels)

        return top_df
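The `mc_dropout` path in `evaluate` above keeps `nn.Dropout` modules in train mode while the rest of the model stays in eval mode, so repeated forward passes produce different samples. A minimal sketch of that trick on a toy model (purely illustrative, not part of the project):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Dropout(p=0.5), nn.Linear(8, 2))
model.eval()
# re-enable only the dropout layers
model.apply(lambda m: m.train() if isinstance(m, nn.Dropout) else None)

x = torch.randn(1, 4)
with torch.no_grad():
    samples = torch.stack([model(x) for _ in range(20)])
print(samples.mean(dim=0), samples.std(dim=0))  # predictive mean and spread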
Code Example #8
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False)

batch_size = 3

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

epochs = 5

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs)


def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')
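A short usage sketch for `f1_score_func` above; the logits and labels are invented, and the imports are repeated so the sketch stands alone:

import numpy as np
from sklearn.metrics import f1_score

preds = np.array([[2.0, -1.0],   # logits for 3 samples, 2 classes
                  [0.1, 0.3],
                  [1.5, 0.2]])
labels = np.array([0, 1, 1])
print(f1_score_func(preds, labels))  # weighted F1 over the argmax-ed predictions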

Code Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--teacher_model",
                        default=None,
                        type=str,
                        help="The teacher model dir.")
    parser.add_argument("--student_model",
                        default=None,
                        type=str,
                        help="The student model dir.")
    parser.add_argument("--task_name",
                        default="SST-2",
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay',
                        '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay')
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )

    # added arguments
    parser.add_argument('--aug_train', action='store_true')
    parser.add_argument('--eval_step', type=float, default=0.1)
    parser.add_argument('--pred_distill', action='store_true')
    parser.add_argument('--data_url', type=str, default="")
    parser.add_argument('--temperature', type=float, default=1.)

    args = parser.parse_args()
    logger.info('The args: {}'.format(args))

    # intermediate distillation default parameters
    default_params = {
        "cola": {
            "num_train_epochs": 50,
            "max_seq_length": 64
        },
        "mnli": {
            "num_train_epochs": 5,
            "max_seq_length": 128
        },
        "mrpc": {
            "num_train_epochs": 20,
            "max_seq_length": 128
        },
        "sst-2": {
            "num_train_epochs": 10,
            "max_seq_length": 64
        },
        "sts-b": {
            "num_train_epochs": 20,
            "max_seq_length": 128
        },
        "qqp": {
            "num_train_epochs": 5,
            "max_seq_length": 128
        },
        "qnli": {
            "num_train_epochs": 10,
            "max_seq_length": 128
        },
        "rte": {
            "num_train_epochs": 20,
            "max_seq_length": 128
        }
    }

    acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
    corr_tasks = ["sts-b"]
    mcc_tasks = ["cola"]

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    # Prepare seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare task settings
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name in default_params:
        args.max_seq_length = default_params[task_name]["max_seq_length"]

    if not args.pred_distill and not args.do_eval:
        if task_name in default_params:
            args.num_train_epochs = default_params[task_name][
                "num_train_epochs"]

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.student_model,
                                              do_lower_case=args.do_lower_case)
    student_config = BertConfig.from_pretrained(args.student_model,
                                                num_labels=num_labels,
                                                finetuning_task=args.task_name)

    if not args.do_eval:
        if args.gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                .format(args.gradient_accumulation_steps))

        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

        train_data, _ = get_tensor_data(args, task_name, tokenizer, False,
                                        args.aug_train)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = int(
            len(train_dataloader) /
            args.gradient_accumulation_steps) * args.num_train_epochs

    eval_data, eval_labels = get_tensor_data(args, task_name, tokenizer, True,
                                             False)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    if not args.do_eval:
        teacher_config = BertConfig.from_pretrained(
            args.teacher_model,
            num_labels=num_labels,
            finetuning_task=args.task_name)
        teacher_model = TinyBertForSequenceClassification.from_pretrained(
            args.teacher_model, config=teacher_config)
        teacher_model.to(device)

    student_model = TinyBertForSequenceClassification.from_pretrained(
        args.student_model, config=student_config)
    student_model.to(device)
    if args.do_eval:
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_data))
        logger.info("  Batch size = %d", args.eval_batch_size)

        student_model.eval()
        result = do_eval(student_model, task_name, eval_dataloader, device,
                         output_mode, eval_labels, num_labels)
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
    else:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        if n_gpu > 1:
            student_model = torch.nn.DataParallel(student_model)
            teacher_model = torch.nn.DataParallel(teacher_model)
        # Prepare optimizer
        param_optimizer = list(student_model.named_parameters())
        size = 0
        for n, p in student_model.named_parameters():
            logger.info('n: {}'.format(n))
            size += p.nelement()

        logger.info('Total parameters: {}'.format(size))
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(num_train_optimization_steps *
                                 args.warmup_proportion),
            num_training_steps=num_train_optimization_steps)
        if not args.pred_distill:
            scheduler = get_constant_schedule(optimizer)

        # Prepare loss functions
        loss_mse = MSELoss()

        def soft_cross_entropy(predicts, targets):
            student_likelihood = torch.nn.functional.log_softmax(predicts,
                                                                 dim=-1)
            targets_prob = torch.nn.functional.softmax(targets, dim=-1)
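            # soft-target distillation loss: negative teacher-probability-weighted
            # student log-likelihood, averaged over all elements of the batch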
            return (-targets_prob * student_likelihood).mean()

        # Train and evaluate
        global_step = 0
        best_dev_acc = 0.0
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0.
            tr_att_loss = 0.
            tr_rep_loss = 0.
            tr_cls_loss = 0.

            student_model.train()
            nb_tr_examples, nb_tr_steps = 0, 0

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", ascii=True)):
                batch = tuple(t.to(device) for t in batch)

                input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
                if input_ids.size()[0] != args.train_batch_size:
                    continue

                att_loss = 0.
                rep_loss = 0.
                cls_loss = 0.

                student_logits, student_atts, student_reps = student_model(
                    input_ids, segment_ids, input_mask, is_student=True)

                with torch.no_grad():
                    teacher_logits, teacher_atts, teacher_reps = teacher_model(
                        input_ids, segment_ids, input_mask)

                if not args.pred_distill:
                    teacher_layer_num = len(teacher_atts)
                    student_layer_num = len(student_atts)
                    # print("teacher_layer_num:",teacher_layer_num)
                    # print("student_layer_num:",student_layer_num)
                    # print("teacher_reps num:",len(teacher_reps))

                    assert teacher_layer_num % student_layer_num == 0
                    layers_per_block = int(teacher_layer_num /
                                           student_layer_num)
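                    # match each student attention layer to the last teacher
                    # attention layer in its block of `layers_per_block` layers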
                    new_teacher_atts = [
                        teacher_atts[i * layers_per_block + layers_per_block -
                                     1] for i in range(student_layer_num)
                    ]

                    for student_att, teacher_att in zip(
                            student_atts, new_teacher_atts):
                        student_att = torch.where(
                            student_att <= -1e2,
                            torch.zeros_like(student_att).to(device),
                            student_att)
                        teacher_att = torch.where(
                            teacher_att <= -1e2,
                            torch.zeros_like(teacher_att).to(device),
                            teacher_att)

                        tmp_loss = loss_mse(student_att, teacher_att)
                        att_loss += tmp_loss

                    new_teacher_reps = [
                        teacher_reps[i * layers_per_block]
                        for i in range(student_layer_num + 1)
                    ]
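                    # Example: with a 12-layer teacher and a 4-layer student,
                    # layers_per_block = 3, so attentions are matched against
                    # teacher layers [2, 5, 8, 11] and hidden states against
                    # teacher outputs [0, 3, 6, 9, 12] (index 0 being the
                    # embedding output).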
                    new_student_reps = student_reps
                    for student_rep, teacher_rep in zip(
                            new_student_reps, new_teacher_reps):
                        tmp_loss = loss_mse(student_rep, teacher_rep)
                        rep_loss += tmp_loss

                    loss = rep_loss + att_loss
                    tr_att_loss += att_loss.item()
                    tr_rep_loss += rep_loss.item()
                else:
                    if output_mode == "classification":
                        cls_loss = soft_cross_entropy(
                            student_logits / args.temperature,
                            teacher_logits / args.temperature)
                    elif output_mode == "regression":
                        loss_mse = MSELoss()
                        cls_loss = loss_mse(student_logits.view(-1),
                                            label_ids.view(-1))

                    loss = cls_loss
                    tr_cls_loss += cls_loss.item()

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += label_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                if (global_step + 1) % int(
                        args.eval_step * num_train_optimization_steps) == 0:
                    logger.info("***** Running evaluation *****")
                    logger.info("  Epoch = {} iter {} step".format(
                        epoch_, global_step))
                    logger.info("  Num examples = %d", len(eval_data))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    student_model.eval()

                    loss = tr_loss / (step + 1)
                    cls_loss = tr_cls_loss / (step + 1)
                    att_loss = tr_att_loss / (step + 1)
                    rep_loss = tr_rep_loss / (step + 1)

                    result = {}
                    if args.pred_distill:
                        result = do_eval(student_model, task_name,
                                         eval_dataloader, device, output_mode,
                                         eval_labels, num_labels)
                    result['global_step'] = global_step
                    result['cls_loss'] = cls_loss
                    result['att_loss'] = att_loss
                    result['rep_loss'] = rep_loss
                    result['loss'] = loss

                    result_to_file(result, output_eval_file)

                    if not args.pred_distill:
                        save_model = True
                    else:
                        save_model = False

                        if task_name in acc_tasks and result[
                                'acc'] > best_dev_acc:
                            best_dev_acc = result['acc']
                            save_model = True

                        if task_name in corr_tasks and result[
                                'corr'] > best_dev_acc:
                            best_dev_acc = result['corr']
                            save_model = True

                        if task_name in mcc_tasks and result[
                                'mcc'] > best_dev_acc:
                            best_dev_acc = result['mcc']
                            save_model = True

                    if save_model:
                        logger.info("***** Save model *****")

                        model_to_save = student_model.module if hasattr(
                            student_model, 'module') else student_model

                        model_name = "pytorch_model.bin"
                        # if not args.pred_distill:
                        #     model_name = "step_{}_{}".format(global_step, "pytorch_model.bin")
                        output_model_file = os.path.join(
                            args.output_dir, model_name)
                        output_config_file = os.path.join(
                            args.output_dir, "config.json")

                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

                        # Test mnli-mm
                        if args.pred_distill and task_name == "mnli":
                            task_name = "mnli-mm"
                            if not os.path.exists(args.output_dir + '-MM'):
                                os.makedirs(args.output_dir + '-MM')

                            eval_data, eval_labels = get_tensor_data(
                                args, task_name, tokenizer, True, False)

                            eval_sampler = SequentialSampler(eval_data)
                            eval_dataloader = DataLoader(
                                eval_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)
                            logger.info("***** Running mm evaluation *****")
                            logger.info("  Num examples = %d", len(eval_data))
                            logger.info("  Batch size = %d",
                                        args.eval_batch_size)

                            result = do_eval(student_model, task_name,
                                             eval_dataloader, device,
                                             output_mode, eval_labels,
                                             num_labels)

                            result['global_step'] = global_step

                            tmp_output_eval_file = os.path.join(
                                args.output_dir + '-MM', "eval_results.txt")
                            result_to_file(result, tmp_output_eval_file)

                            task_name = 'mnli'

                    student_model.train()
Code Example #10
0
def train(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    set_seed(args)
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = Config.from_pretrained(args.pretrained_model_path,
                                        num_labels=args.num_labels)
        config = add_args_to_config(args, config)  ##add customized args
        tokenizer = Tokenizer.from_pretrained(args.pretrained_model_path,
                                              do_lower_case=args.do_lower_case)
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        ###adv training
        fgm = FGM(model)
        transform = Transform(tokenizer, args)
        train_data = Corpus(args, "train.csv", transform)
        ###get the weighted sample with the weight [0.9,0.2,0.5]
        # weight = [0.9,0.2,0.5]
        # weight_sequence = []
        # for i in range(len(train_data)):
        #     data = train_data[i]
        #     label =data.get('label').item()
        #     weight_sequence.append(weight[label]) ###add the weight of this label
        dev_data = Corpus(args, 'dev.csv', transform)
        dev_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                batch_size=args.eval_batch_size,
                                sampler=dev_sampler)

        # Run prediction for full data
        eval_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)

        train_sampler = RandomSampler(train_data)
        # weight_sampler = WeightedRandomSampler(weights=weight_sequence,num_samples=args.epochs*len(train_data), replacement=True)
        test_sampler = SubsetRandomSampler(
            np.random.randint(low=0,
                              high=(len(train_data)),
                              size=len(dev_data)))
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True)

        test_loader = DataLoader(train_data,
                                 batch_size=args.eval_batch_size,
                                 sampler=test_sampler)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.batch_size)
        logger.info("  Num steps = %d", args.epochs)
        logger.info("  Early Stopping dev_loss = %f", args.dev_loss)
        bar = tqdm(range(len(train_loader) * args.epochs),
                   total=len(train_loader) * args.epochs)
        train_loader = cycle(train_loader)
        ##get optimizer
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': args.weight_decay
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0  # no decay for biases and LayerNorm parameters
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=0,
                                         t_total=len(bar))
        steps = 0
        # dev_labels = dev_data.get_feature("label")
        # dev_labels = [i.item() for i in dev_labels]# get gold label
        total_train_loss = 0
        for step in bar:
            model.train()
            data_batch = next(train_loader)
            for k, v in data_batch.items():
                data_batch[k] = v.to(device)
            loss = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss = loss.mean()
            loss.backward()
            ###adv training
            fgm.attack()
            loss_adv = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss_adv = loss_adv.mean()
            loss_adv.backward()  # backward pass: accumulate the adversarial gradients on top of the normal gradients
            fgm.restore()  # restore the original embedding parameters
            ###adv training
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            ##setting bar
            steps += 1
            total_train_loss += loss.item()
            bar.set_description("training loss {}".format(loss.item()))
            if steps % args.eval_steps == 0:
                logits, loss, dev_labels = do_inference(
                    model, dev_loader, device)
                test_logits, test_loss, test_labels = do_inference(
                    model, test_loader, device)
                inference_labels = logits.argmax(axis=1)
                test_inference_labels = test_logits.argmax(axis=1)
                f1 = f1_score(dev_labels,
                              inference_labels,
                              labels=[0, 1, 2],
                              average="macro")
                test_f1 = f1_score(test_labels,
                                   test_inference_labels,
                                   labels=[0, 1, 2],
                                   average="macro")
                # acc = accuracy_score(dev_labels, inference_labels)
                logger.info("=========eval report =========")
                logger.info("step : %s ", str(steps))
                logger.info("average_train loss: %s" %
                            (str(total_train_loss / steps)))
                logger.info("subset train loss: %s" % (str(test_loss)))
                logger.info("subset train f1 score: %s", str(test_f1))
                logger.info("eval loss: %s", str(loss))
                logger.info("eval f1 score: %s", str(f1))
                output_eval_file = os.path.join(args.out_dir,
                                                "eval_records.txt")
                with open(output_eval_file, "a") as writer:
                    if steps == args.eval_steps:
                        writer.write("\n%s\n" % (args.memo))
                    writer.write("=========eval report =========\n")
                    writer.write("step : %s \n" % (str(steps)))
                    writer.write("average_train loss: %s\n" %
                                 (str(total_train_loss / steps)))
                    writer.write("subset train loss: %s\n" % (str(test_loss)))
                    writer.write("subset f1 score: %s\n" % (str(test_f1)))
                    writer.write("eval loss: %s\n" % (str(loss)))
                    writer.write("eval f1 score: %s\n" % (str(f1)))
                    writer.write('\n')
                if f1 > best_f1:
                    logger.info("we get a best dev f1 %s saving model....",
                                str(f1))
                    output_path = os.path.join(args.out_dir,
                                               "pytorch_model.bin")
                    if hasattr(model, 'module'):
                        logger.info("model has module")
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    torch.save(model_to_save.state_dict(), output_path)
                    logger.info("model saved")
                    best_f1 = f1
        save_config(args)
        logger.info("args saved")
        ##load the final model
        args.to_resume_model = True
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        dev_logits, loss, dev_labels = do_inference(
            model, dev_loader, device)  ##do the inference for dev set
        pub_data = Corpus(args, 'test.csv', transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        # logits, loss, dev_labels = do_inference(model, dev_loader, device)
        test_logits, _, _ = do_inference(model, pub_loader, device)
        return dev_logits, dev_labels, test_logits
    else:
        logger.info("the model %s is not registered", args.model_name)
        return
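
The snippet above calls an `FGM` helper (`fgm.attack()` / `fgm.restore()`) that is not defined in this excerpt. Below is a minimal sketch of the fast gradient method class those calls suggest; the epsilon value and the `emb_name` substring used to select embedding parameters are assumptions, not taken from the original project.

import torch


class FGM:
    """Fast Gradient Method: add an adversarial perturbation to the embedding
    weights, run a second forward/backward pass, then restore the weights."""

    def __init__(self, model, epsilon=1.0, emb_name="embedding"):
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name  # substring identifying embedding parameters
        self.backup = {}

    def attack(self):
        # perturb embedding weights along the (normalized) gradient direction
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name and param.grad is not None:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    param.data.add_(self.epsilon * param.grad / norm)

    def restore(self):
        # put the original embedding weights back after the adversarial pass
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}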
Code Example #11
0
File: run_simMix.py Project: thunlp/MixADA
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    global extracted_grads
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    if args.mix_option == 1:
        logger.info("Random Mixup")
    else:
        logger.info("No Mixup")

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    processor = processors[args.task_name]()
    attacker = get_attacker(args.attacker)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    ## Add Mixup in Batch
    epoch = 0
    for _ in train_iterator:
        epoch += 1

        if epoch > 1 and args.iterative:
            ## augment the current train dataset with a new batch of adversarial examples generated by the current model
            orig_data = load_custom_dataset(os.path.join(
                args.data_dir, "train.tsv"),
                                            all_data=True,
                                            number=args.num_adv)
            clsf = ModelClassifier(tokenizer, model, args)
            attack_eval = OpenAttack.attack_evals.DefaultAttackEval(
                attacker, clsf, progress_bar=True)
            adv_egs = attack_eval.eval(orig_data,
                                       visualize=False,
                                       return_examples=True)
            adv_examples = processor._create_examples(adv_egs, "adv_train")
            logger.info(
                "Epoch: {}, Number of adversarial examples added to training: {}"
                .format(epoch, len(adv_examples)))
            adv_dataset = convert_examples_dataset(args, adv_examples,
                                                   tokenizer)
            train_dataset = ConcatDataset([train_dataset, adv_dataset])

            ## start training on augmented data (we will shuffle the training data)
            # train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=True)

            logger.info("Current Num examples = %d", len(train_dataset))

        epoch_iterator = train_dataloader
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            ## normal training
            ## for now, just ignore token type ids
            input_ids = batch[0]  #(bsz, len)
            attention_mask = batch[1]
            batch_size = input_ids.size(0)
            length = input_ids.size(1)
            labels = batch[3]  #(bsz,)
            logits, outputs = model(input_ids,
                                    attention_mask)  #(bsz, num_labels)
            # x_embeddings = outputs[2] # (bsz, len, dim)
            # x_embeddings.register_hook(save_grad("x_emb"))
            # logger.info("#outputs 1: " + str(len(outputs[-1])))
            L_ori = nn.CrossEntropyLoss()(logits.view(-1, args.num_labels),
                                          labels.view(-1))

            ## RandomMix
            if args.mix_option == 1:
                idx = torch.randperm(batch_size)
                input_ids_2 = input_ids[idx]
                labels_2 = labels[idx]
                attention_mask_2 = attention_mask[idx]
                ## convert the labels to one-hot
                labels = torch.zeros(batch_size,
                                     args.num_labels).to(args.device).scatter_(
                                         1, labels.view(-1, 1), 1)
                labels_2 = torch.zeros(batch_size, args.num_labels).to(
                    args.device).scatter_(1, labels_2.view(-1, 1), 1)

                l = np.random.beta(args.alpha, args.alpha)
                # l = max(l, 1-l) ## not needed when only using labeled examples
                mixed_labels = l * labels + (1 - l) * labels_2
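                # e.g. with l = 0.7, one-hot pairs [1, 0, 0] and [0, 1, 0]
                # mix into the soft target [0.7, 0.3, 0.0]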

                mix_layer = np.random.choice(args.mix_layers_set, 1)[0]
                mix_layer = mix_layer - 1

                logits, outputs = model(input_ids, attention_mask, input_ids_2,
                                        attention_mask_2, l, mix_layer)
                probs = torch.softmax(logits, dim=1)  #(bsz, num_labels)
                L_mix = F.kl_div(probs.log(), mixed_labels, None, None,
                                 'batchmean')

                loss = L_ori + L_mix

            else:
                loss = L_ori

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            tr_loss += loss.item()

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    # print(json.dumps({**logs, **{"step": global_step}}))

                    logging.info("Global Step: " + str(global_step))
                    logging.info("Loss: " + str(loss_scalar))

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    ## save the final epoch only
    if args.local_rank in [-1, 0]:
        # Save model checkpoint
        output_dir = os.path.join(args.output_dir, "final-checkpoint")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)

        torch.save(optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(),
                   os.path.join(output_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", output_dir)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Code Example #12
0
VALID_DATA_LOADER = create_dataloader(df=VALID, max_len=MAX_LEN, bs=BS)


"""Calling Model and sending to CUDA"""

device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
model = Classifier()
criterion = torch.nn.CrossEntropyLoss()  # assumed loss function; not specified in this excerpt
criterion.to(device)
model.to(device)

"""Otimizador e Scheduler"""

optimizer = AdamW(
    model.parameters(),
    lr=float(config['model']['learning_rate']),
    correct_bias=False
)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config['model']['num_warmup_steps'],
    num_training_steps=config['model']['num_epochs'] * config['model']['batch_size']
)
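
# note: get_linear_schedule_with_warmup expects num_training_steps to be the
# total number of optimizer updates (typically num_epochs * batches per epoch)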

"""Training Loop"""

EPOCHS = config['model']['num_epochs']

with open("logger.txt", "w") as f:
    f.write("")  # create/clear the training log file
Code Example #13
0
File: trainBERT.py Project: mumeblossom/PTD
def train(args, train_dataset, model, tokenizer):
    """ Train the model """

    # train_batch_size
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # train_sampler
    train_sampler = RandomSampler(train_dataset)
    # train_dataloader
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        # max_steps is not used in practice, so the total number of steps to run is t_total
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=range(args.n_gpu))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps,
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    best_acc = 0.0
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to the global_step of the last saved checkpoint from the model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=False)
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=False)
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss
                    print('\n')
                    print(json.dumps({**logs, **{"step": global_step}}))
                    print('\n')

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    # save a checkpoint whenever global_step is a multiple of save_steps
                    if args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        recent_acc = evaluate(args, model, tokenizer)
                    else:
                        recent_acc = best_acc  # no evaluation requested; keep the previous best

                    output_dir = os.path.join(
                        args.output_dir,
                        'checkpoint_{}'.format(args.task_name))
                    best_dir = os.path.join(args.output_dir,
                                            'best_{}'.format(args.task_name))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    if not os.path.exists(best_dir):
                        os.makedirs(best_dir)

                    is_best = recent_acc > best_acc
                    best_acc = max(recent_acc, best_acc)

                    logger.info('Recent EVAL ACC: {} BEST EVAL ACC: {}'.format(
                        recent_acc, best_acc))

                    if is_best:
                        model_to_save = (
                            model.module if hasattr(model, "module") else model
                        )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(best_dir)
                        tokenizer.save_pretrained(best_dir)
                        torch.save(args,
                                   os.path.join(best_dir, "training_args.bin"))

                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
Code Example #14
0
File: train.py Project: allenai/scruples
def train_lm(
        data_dir: str,
        model_dir: str,
        dataset: str,
        baseline: str,
        hyper_params: Dict[str, Any],
        loss_type: str,
        compute_train_batch_size: int,
        predict_batch_size: int,
        gpu_ids: Optional[List[int]],
        logger: Optional[logging.Logger] = None
) -> None:
    """Fine-tune a pre-trained LM baseline on a scruples dataset.

    Fine-tune ``baseline`` on ``dataset``, writing all results and
    artifacts to ``model_dir``. Return the best calibrated xentropy achieved on
    dev after any epoch.

    Parameters
    ----------
    data_dir : str
        The path to the directory containing the dataset.
    model_dir : str
        The path to the directory in which to save results.
    dataset : str
        The dataset to use when fine-tuning ``baseline``. Must be either
        "resource" or "corpus".
    baseline : str
        The pre-trained LM to fine-tune. Should be one of the keys for
        ``scruples.baselines.$dataset.FINE_TUNE_LM_BASELINES`` where
        ``$dataset`` corresponds to the ``dataset`` argument to this
        function.
    hyper_params : Dict[str, Any]
        The dictionary of hyper-parameters for the model.
    loss_type : str
        The type of loss to use. Should be one of ``"xentropy-hard"``,
        ``"xentropy-soft"``, ``"xentropy-full"`` or
        ``"dirichlet-multinomial"``.
    compute_train_batch_size : int
        The largest batch size that will fit on the hardware during
        training. Gradient accumulation will be used to make sure the
        actual size of the batch on the hardware respects this limit.
    predict_batch_size : int
        The number of instances to use in a predicting batch.
    gpu_ids : Optional[List[int]]
        A list of IDs for GPUs to use.
    logger : Optional[logging.Logger], optional (default=None)
        The logger to use when logging messages. If ``None``, then no
        messages will be logged.

    Returns
    -------
    float
        The best calibrated xentropy on dev achieved after any epoch.
    bool
        ``True`` if the training loss diverged, ``False`` otherwise.
    """
    gc.collect()
    # collect any garbage to make sure old torch objects are cleaned up (and
    # their memory is freed from the GPU). Otherwise, old tensors can hang
    # around on the GPU, causing CUDA out-of-memory errors.

    if loss_type not in settings.LOSS_TYPES:
        raise ValueError(
            f'Unrecognized loss type: {loss_type}. Please use one of'
            f' "xentropy-hard", "xentropy-soft", "xentropy-full" or'
            f' "dirichlet-multinomial".')

    # Step 1: Manage and construct paths.

    if logger is not None:
        logger.info('Creating the model directory.')

    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    tensorboard_dir = os.path.join(model_dir, 'tensorboard')
    os.makedirs(model_dir)
    os.makedirs(checkpoints_dir)
    os.makedirs(tensorboard_dir)

    config_file_path = os.path.join(model_dir, 'config.json')
    log_file_path = os.path.join(model_dir, 'log.txt')
    best_checkpoint_path = os.path.join(
        checkpoints_dir, 'best.checkpoint.pkl')
    last_checkpoint_path = os.path.join(
        checkpoints_dir, 'last.checkpoint.pkl')

    # Step 2: Setup the log file.

    if logger is not None:
        logger.info('Configuring log files.')

    log_file_handler = logging.FileHandler(log_file_path)
    log_file_handler.setLevel(logging.DEBUG)
    log_file_handler.setFormatter(logging.Formatter(settings.LOG_FORMAT))
    logging.root.addHandler(log_file_handler)

    # Step 3: Record the script's arguments.

    if logger is not None:
        logger.info(f'Writing arguments to {config_file_path}.')

    with open(config_file_path, 'w') as config_file:
        json.dump({
            'data_dir': data_dir,
            'model_dir': model_dir,
            'dataset': dataset,
            'baseline': baseline,
            'hyper_params': hyper_params,
            'loss_type': loss_type,
            'compute_train_batch_size': compute_train_batch_size,
            'predict_batch_size': predict_batch_size,
            'gpu_ids': gpu_ids
        }, config_file)

    # Step 4: Configure GPUs.

    if gpu_ids:
        if logger is not None:
            logger.info(
                f'Configuring environment to use {len(gpu_ids)} GPUs:'
                f' {", ".join(str(gpu_id) for gpu_id in gpu_ids)}.')

        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, gpu_ids))

        if not torch.cuda.is_available():
            raise EnvironmentError('CUDA must be available to use GPUs.')

        device = torch.device('cuda')
    else:
        if logger is not None:
            logger.info('Configuring environment to use CPU.')

        device = torch.device('cpu')

    # Step 5: Fetch the baseline information and training loop parameters.

    if logger is not None:
        logger.info('Retrieving baseline and related parameters.')

    if dataset == 'resource':
        Model, baseline_config, _, make_transform =\
            resource.FINE_TUNE_LM_BASELINES[baseline]
    elif dataset == 'corpus':
        Model, baseline_config, _, make_transform =\
            corpus.FINE_TUNE_LM_BASELINES[baseline]
    else:
        raise ValueError(
            f'dataset must be either "resource" or "corpus", not'
            f' {dataset}.')

    n_epochs = hyper_params['n_epochs']
    train_batch_size = hyper_params['train_batch_size']
    n_gradient_accumulation = math.ceil(
        train_batch_size / (compute_train_batch_size * len(gpu_ids)))
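    # e.g. train_batch_size=32, compute_train_batch_size=8 and 2 GPUs give
    # math.ceil(32 / 16) = 2 accumulation steps per optimizer update; note that
    # this expression assumes gpu_ids is a non-empty list.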

    # Step 6: Load the dataset.

    if logger is not None:
        logger.info(f'Loading the dataset from {data_dir}.')

    featurize = make_transform(**baseline_config['transform'])
    if dataset == 'resource':
        Dataset = ScruplesResourceDataset
        labelize = None
        labelize_scores = lambda scores: np.array(scores).astype(float)
    elif dataset == 'corpus':
        Dataset = ScruplesCorpusDataset
        labelize = lambda s: getattr(Label, s).index
        labelize_scores = lambda scores: np.array([
            score
            for _, score in sorted(
                    scores.items(),
                    key=lambda t: labelize(t[0]))
        ]).astype(float)
    else:
        raise ValueError(
            f'dataset must be either "resource" or "corpus", not'
            f' {dataset}.')

    train = Dataset(
        data_dir=data_dir,
        split='train',
        transform=featurize,
        label_transform=labelize,
        label_scores_transform=labelize_scores)
    dev = Dataset(
        data_dir=data_dir,
        split='dev',
        transform=featurize,
        label_transform=labelize,
        label_scores_transform=labelize_scores)

    train_loader = DataLoader(
        dataset=train,
        batch_size=train_batch_size // n_gradient_accumulation,
        shuffle=True,
        num_workers=len(gpu_ids),
        pin_memory=bool(gpu_ids))
    dev_loader = DataLoader(
        dataset=dev,
        batch_size=predict_batch_size,
        shuffle=False,
        num_workers=len(gpu_ids),
        pin_memory=bool(gpu_ids))

    # Step 7: Create the model, optimizer, and loss.

    if logger is not None:
        logger.info('Initializing the model.')

    model = Model(**baseline_config['model'])
    model.to(device)

    n_optimization_steps = n_epochs * math.ceil(len(train) / train_batch_size)
    parameter_groups = [
        {
            'params': [
                param
                for name, param in model.named_parameters()
                if 'bias' in name
                or 'LayerNorm.bias' in name
                or 'LayerNorm.weight' in name
            ],
            'weight_decay': 0
        },
        {
            'params': [
                param
                for name, param in model.named_parameters()
                if 'bias' not in name
                and 'LayerNorm.bias' not in name
                and 'LayerNorm.weight' not in name
            ],
            'weight_decay': hyper_params['weight_decay']
        }
    ]
    optimizer = AdamW(parameter_groups, lr=hyper_params['lr'])

    if loss_type == 'xentropy-hard':
        loss = torch.nn.CrossEntropyLoss()
    elif loss_type == 'xentropy-soft':
        loss = SoftCrossEntropyLoss()
    elif loss_type == 'xentropy-full':
        loss = SoftCrossEntropyLoss()
    elif loss_type == 'dirichlet-multinomial':
        loss = DirichletMultinomialLoss()

    xentropy = SoftCrossEntropyLoss()

    scheduler = WarmupLinearSchedule(
        optimizer=optimizer,
        warmup_steps=int(
            hyper_params['warmup_proportion']
            * n_optimization_steps
        ),
        t_total=n_optimization_steps)

    # add data parallelism support
    model = torch.nn.DataParallel(model)

    # Step 8: Run training.

    n_train_batches_per_epoch = math.ceil(len(train) / train_batch_size)
    n_dev_batch_per_epoch = math.ceil(len(dev) / predict_batch_size)

    writer = tensorboardX.SummaryWriter(log_dir=tensorboard_dir)

    best_dev_calibrated_xentropy = math.inf
    for epoch in range(n_epochs):
        # set the model to training mode
        model.train()

        # run training for the epoch
        epoch_train_loss = 0
        epoch_train_xentropy = 0
        for i, (_, features, labels, label_scores) in tqdm.tqdm(
                enumerate(train_loader),
                total=n_gradient_accumulation * n_train_batches_per_epoch,
                **settings.TQDM_KWARGS
        ):
            # move the data onto the device
            features = {k: v.to(device) for k, v in features.items()}

            # create the targets
            if loss_type == 'xentropy-hard':
                targets = labels
            elif loss_type == 'xentropy-soft':
                targets = label_scores / torch.unsqueeze(
                    torch.sum(label_scores, dim=-1), dim=-1)
            elif loss_type == 'xentropy-full':
                targets = label_scores
            elif loss_type == 'dirichlet-multinomial':
                targets = label_scores
            # create the soft labels
            soft_labels = label_scores / torch.unsqueeze(
                torch.sum(label_scores, dim=-1), dim=-1)

            # move the targets and soft labels to the device
            targets = targets.to(device)
            soft_labels = soft_labels.to(device)

            # make predictions
            logits = model(**features)[0]

            batch_loss = loss(logits, targets)
            batch_xentropy = xentropy(logits, soft_labels)

            # update training statistics
            epoch_train_loss = (
                batch_loss.item() + i * epoch_train_loss
            ) / (i + 1)
            epoch_train_xentropy = (
                batch_xentropy.item() + i * epoch_train_xentropy
            ) / (i + 1)

            # update the network
            batch_loss.backward()

            if (i + 1) % n_gradient_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()

                scheduler.step()

            # write training statistics to tensorboard

            step = n_train_batches_per_epoch * epoch + (
                (i + 1) // n_gradient_accumulation)
            if step % 100 == 0 and (i + 1) % n_gradient_accumulation == 0:
                writer.add_scalar('train/loss', epoch_train_loss, step)
                writer.add_scalar('train/xentropy', epoch_train_xentropy, step)

        # run evaluation
        with torch.no_grad():
            # set the model to evaluation mode
            model.eval()

            # run validation for the epoch
            epoch_dev_loss = 0
            epoch_dev_soft_labels = []
            epoch_dev_logits = []
            for i, (_, features, labels, label_scores) in tqdm.tqdm(
                    enumerate(dev_loader),
                    total=n_dev_batch_per_epoch,
                    **settings.TQDM_KWARGS):
                # move the data onto the device
                features = {k: v.to(device) for k, v in features.items()}

                # create the targets
                if loss_type == 'xentropy-hard':
                    targets = labels
                elif loss_type == 'xentropy-soft':
                    targets = label_scores / torch.unsqueeze(
                        torch.sum(label_scores, dim=-1), dim=-1)
                elif loss_type == 'xentropy-full':
                    targets = label_scores
                elif loss_type == 'dirichlet-multinomial':
                    targets = label_scores

                # move the targets to the device
                targets = targets.to(device)

                # make predictions
                logits = model(**features)[0]

                batch_loss = loss(logits, targets)

                # update validation statistics
                epoch_dev_loss = (
                    batch_loss.item() + i * epoch_dev_loss
                ) / (i + 1)
                epoch_dev_soft_labels.extend(
                    (
                        label_scores
                        / torch.unsqueeze(torch.sum(label_scores, dim=-1), dim=-1)
                    ).cpu().numpy().tolist()
                )
                epoch_dev_logits.extend(logits.cpu().numpy().tolist())

            # compute validation statistics
            epoch_dev_soft_labels = np.array(epoch_dev_soft_labels)
            epoch_dev_logits = np.array(epoch_dev_logits)

            calibration_factor = utils.calibration_factor(
                logits=epoch_dev_logits,
                targets=epoch_dev_soft_labels)

            epoch_dev_xentropy = utils.xentropy(
                y_true=epoch_dev_soft_labels,
                y_pred=softmax(epoch_dev_logits, axis=-1))
            epoch_dev_calibrated_xentropy = utils.xentropy(
                y_true=epoch_dev_soft_labels,
                y_pred=softmax(epoch_dev_logits / calibration_factor, axis=-1))

            # write validation statistics to tensorboard
            writer.add_scalar('dev/loss', epoch_dev_loss, step)
            writer.add_scalar('dev/xentropy', epoch_dev_xentropy, step)
            writer.add_scalar(
                'dev/calibrated-xentropy', epoch_dev_calibrated_xentropy, step)

            if logger is not None:
                logger.info(
                    f'\n\n'
                    f'  epoch {epoch}:\n'
                    f'    train loss              : {epoch_train_loss:.4f}\n'
                    f'    train xentropy          : {epoch_train_xentropy:.4f}\n'
                    f'    dev loss                : {epoch_dev_loss:.4f}\n'
                    f'    dev xentropy            : {epoch_dev_xentropy:.4f}\n'
                    f'    dev calibrated xentropy : {epoch_dev_calibrated_xentropy:.4f}\n'
                    f'    calibration factor      : {calibration_factor:.4f}\n')

        # update checkpoints

        torch.save(
            {
                'epoch': epoch,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'calibration_factor': calibration_factor
            },
            last_checkpoint_path)

        # update the current best model
        if epoch_dev_calibrated_xentropy < best_dev_calibrated_xentropy:
            shutil.copyfile(last_checkpoint_path, best_checkpoint_path)
            best_dev_calibrated_xentropy = epoch_dev_calibrated_xentropy

        # exit early if the training loss has diverged
        if math.isnan(epoch_train_loss):
            logger.info('Training loss has diverged. Exiting early.')

            return best_dev_calibrated_xentropy, True

    logger.info(
        f'Training complete. Best dev calibrated xentropy was'
        f' {best_dev_calibrated_xentropy:.4f}.')

    return best_dev_calibrated_xentropy, False
Code Example #15
0
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """ Train the model """
    # freeze the bert layers to preserve pre-trained embeddings:
    # model.module.bert.weight.requires_grad_(False)
    # model.module.bert.bias.requires_grad_(False)
    for name, param in model.bert.named_parameters():
        if name.startswith('embeddings') or name.startswith('encoder'):
            param.requires_grad = False

    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to the global_step of the last saved checkpoint from the model path
        try:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()

            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don't use segment_ids

            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args,
                                              model,
                                              tokenizer,
                                              labels,
                                              pad_token_label_id,
                                              mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
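The function above freezes part of model.bert but still hands every named parameter to AdamW. A minimal alternative sketch (build_optimizer is an illustrative name, not part of the example) that keeps only trainable parameters in the decay/no-decay groups:

from torch.optim import AdamW

def build_optimizer(model, weight_decay, lr, eps):
    # Skip frozen parameters so the optimizer carries no state for tensors
    # that never receive gradients; the grouping mirrors the example above.
    no_decay = ("bias", "LayerNorm.weight")
    trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    grouped = [
        {"params": [p for n, p in trainable if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in trainable if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    return AdamW(grouped, lr=lr, eps=eps)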
コード例 #16
0
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer, logger) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
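A standalone illustration of the collate helper above: pad_sequence stacks variable-length 1-D tensors into a (batch, max_len) matrix, filling the tail with the padding id (0 here stands in for tokenizer.pad_token_id).

import torch
from torch.nn.utils.rnn import pad_sequence

examples = [torch.tensor([101, 7592, 102]), torch.tensor([101, 2088, 999, 102])]
batch = pad_sequence(examples, batch_first=True, padding_value=0)
print(batch)        # the shorter sequence is right-padded with 0
print(batch.shape)  # torch.Size([2, 4])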
コード例 #17
0
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)
model.train()
print('initialized bert model')

train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

print('starting training\n')
for epoch in range(3):
    print('EPOCH', epoch + 1)
    batch_num = 1
    for batch in train_loader:
        print('  batch', batch_num)
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
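The snippet stops after training. Below is a minimal evaluation sketch under the same assumptions (a test_dataset whose items expose 'input_ids', 'attention_mask' and 'labels', as in the training loop); it is not part of the original example.

import torch
from torch.utils.data import DataLoader

model.eval()
test_loader = DataLoader(test_dataset, batch_size=5)
correct = total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        logits = model(input_ids, attention_mask=attention_mask)[0]
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)
print('test accuracy:', correct / total)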
コード例 #18
0
    def train_model(self, batches):
        self.model = self.new_model()
        self.model = self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(),
                               lr=self.params.learning_rate)

        train_batches, val_batches = train_test_split(
            batches,
            shuffle=True,
            random_state=self.params.random_state,
            test_size=.1)

        self.loss = self.create_loss_functions()
        for epoch in range(self.params.num_epochs):

            loss_val = 0
            self.model.train()

            for batch in train_batches:
                X_ids = torch.tensor(batch["inputs"]).to(self.device)
                X_att = torch.tensor(batch["attentions"]).to(self.device)
                if len([
                        x for task_label in self.task_labels
                        for x in batch["masks"][task_label]
                ]) == 0:
                    continue

                logits, _ = self.model(X_ids, attn=X_att)
                class_loss = dict()
                weighted_sum = 0
                for task_label in self.task_labels:
                    masked_logits = logits[task_label][batch["masks"]
                                                       [task_label]]
                    masked_labels = [
                        batch["labels"][task_label][x]
                        for x in batch["masks"][task_label]
                    ]
                    if self.multi_task or self.ensemble:
                        masked_labels = torch.tensor(masked_labels).type(
                            "torch.LongTensor").to(self.device)
                    else:
                        masked_labels = torch.tensor(masked_labels).to(
                            self.device)

                    if len(batch["masks"][task_label]) > 0:
                        ## list of loss values for each batch instance
                        class_loss[task_label] = self.loss[task_label](
                            masked_logits, masked_labels)

                        ## using a column of the data as the weight for loss value of each instance
                        # Batch["weight"] shows the instance weight (based on its certainty), class_weight shows the class weight for positive and negative labels
                        # batch["weights"][batch_i] *
                        """
                        class_loss[task_label] = sum([ batch_loss[mask_i] * self.class_weight[task_label][masked_labels[mask_i]]
                                                      for mask_i, batch_i in enumerate(batch["masks"][task_label])])
                        weighted_sum += sum([self.class_weight[task_label][label] for label in masked_labels])
                        """
                total_loss = sum(class_loss.values())  # / weighted_sum
                loss_val += total_loss.item()
                self.optimizer.zero_grad()  # clear gradients left over from the previous batch
                total_loss.backward()
                self.optimizer.step()

            print("Epoch", epoch, "-", "Loss", round(loss_val, 3))
            if val_batches:
                val_results = self.predict(val_batches, self.model)
                print("Validation:")
                print(self.report_results(val_results))
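A toy, self-contained illustration of the per-task loss accumulation used in train_model above (task names, shapes and values are made up): each task head contributes a loss over the instances it has labels for, and the task losses are summed before a single backward pass.

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
logits = {"task_a": torch.randn(3, 2, requires_grad=True),
          "task_b": torch.randn(2, 2, requires_grad=True)}
labels = {"task_a": torch.tensor([0, 1, 1]), "task_b": torch.tensor([1, 0])}
class_loss = {task: loss_fn(logits[task], labels[task]) for task in logits}
total_loss = sum(class_loss.values())
total_loss.backward()  # gradients reach every task head in one pass
print({task: round(loss.item(), 3) for task, loss in class_loss.items()})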
コード例 #19
0
    def train(self,
              train_dataloader,
              valid_dataloader=None,
              test_dataloader=None,
              should_continue=False):
        """ Train the model """
        tb_writer = SummaryWriter()

        train_epochs = self.config.train_epochs

        if self.config.max_steps > 0:
            train_steps = self.config.max_steps
            train_epochs = self.config.max_steps // (
                len(train_dataloader) // self.config.grad_acc_steps) + 1
        else:
            train_steps = len(
                train_dataloader) // self.config.grad_acc_steps * train_epochs

        if self.total_samples and should_continue:
            steps_total = self.total_samples // self.config.train_bs // self.config.grad_acc_steps * train_epochs
        else:
            steps_total = train_steps

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.config.weight_decay,
            },
            {
                "params": [
                    p for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]

        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.config.lr,
            eps=self.config.adam_eps,
        )

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.warmup_steps,
            num_training_steps=steps_total)

        # self.scheduler = get_constant_schedule(self.optimizer)

        if should_continue and self.global_step > 0:
            logger.info("loading saved optimizer and scheduler states")
            assert (self.optimizer_state_dict)
            assert (self.scheduler_state_dict)
            self.optimizer.load_state_dict(self.optimizer_state_dict)
            self.scheduler.load_state_dict(self.scheduler_state_dict)
        else:
            logger.info("Using fresh optimizer and scheduler")

        if self.config.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            self.model, self.optimizer = amp.initialize(
                self.model,
                self.optimizer,
                opt_level=self.config.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if self.config.n_gpu > 1 and not isinstance(self.model,
                                                    torch.nn.DataParallel):
            self.model = torch.nn.DataParallel(self.model)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d (%d)", len(train_dataloader.dataset),
                    len(train_dataloader))
        logger.info("  Num Epochs = %d", train_epochs)
        logger.info("  Batch size = %d", self.config.train_bs)
        logger.info("  Learning rate = %e", self.config.lr)
        logger.info("  Loss label weights = %s",
                    self.config.loss_label_weights)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.config.train_bs * self.config.grad_acc_steps)
        logger.info("  Gradient Accumulation steps = %d",
                    self.config.grad_acc_steps)
        logger.info("  Total optimization steps = %d", train_steps)

        if not should_continue:
            self.global_step = 0

        epochs_trained = 0
        steps_trained_in_current_epoch = 0

        # # Check if continuing training from a checkpoint
        # if os.path.exists(self.config.model_path):
        #     if self.config.should_continue:
        #         step_str = self.config.model_path.split("-")[-1].split("/")[0]

        #         if step_str:
        #             # set self.global_step to the global_step of the last saved checkpoint from the model path
        #             self.global_step = int(step_str)
        #             epochs_trained = self.global_step // (len(train_dataloader) //
        #                                                   self.config.grad_acc_steps)
        #             steps_trained_in_current_epoch = self.global_step % (
        #                 len(train_dataloader) // self.config.grad_acc_steps)

        #             logger.info(
        #                 "  Continuing training from checkpoint, will skip to saved self.global_step")
        #             logger.info(
        #                 "  Continuing training from epoch %d", epochs_trained)
        #             logger.info(
        #                 "  Continuing training from global step %d", self.global_step)
        #             logger.info("  Will skip the first %d steps in the first epoch",
        #                         steps_trained_in_current_epoch)

        train_loss = 0.0
        self.model.zero_grad()
        train_iterator = trange(
            epochs_trained,
            int(train_epochs),
            desc="Epoch",
        )
        util.set_seed(self.config)  # Added here for reproducibility

        self.model.train()

        if self.config.train_head_only:
            for param in self.model.roberta.embeddings.parameters():
                param.requires_grad = False
            logger.info("Training only head")
            # for param in self.model.__getattr__(self.config.model_type).roberta.parameters():
            #     param.requires_grad = False

        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                self.model.train()

                inputs = self.__inputs_from_batch(batch)
                outputs = self.model(**inputs)

                # model outputs are always tuple in transformers (see doc)
                loss = outputs[0]

                if self.config.n_gpu > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training
                if self.config.grad_acc_steps > 1:
                    loss = loss / self.config.grad_acc_steps

                if self.config.fp16:
                    with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                batch_loss = loss.item()
                train_loss += batch_loss

                if (step + 1) % self.config.grad_acc_steps == 0:
                    if self.config.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(self.optimizer),
                            self.config.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.config.max_grad_norm)

                    self.optimizer.step()
                    self.scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    self.global_step += 1

                    if self.config.logging_steps > 0 and self.global_step % self.config.logging_steps == 0:
                        logs = {}
                        if valid_dataloader:
                            result_valid, *_ = self.evaluate(
                                'valid', valid_dataloader,
                                backtrans=(test_dataloader is None))
                            logs.update({
                                f"valid_{k}": v
                                for k, v in result_valid.items()
                            })

                        if test_dataloader:
                            test_dataloader = test_dataloader if isinstance(
                                test_dataloader, dict) else {
                                    'test': test_dataloader
                                }
                            for eval_name, dataloader_or_tuple in test_dataloader.items(
                            ):
                                if isinstance(dataloader_or_tuple, tuple):
                                    dataloader, kwargs = dataloader_or_tuple
                                else:
                                    dataloader = dataloader_or_tuple
                                    kwargs = {}

                                result_test, *_ = self.evaluate(
                                    eval_name, dataloader, **kwargs)
                                logs.update({
                                    f"{eval_name}_{k}": v
                                    for k, v in result_test.items()
                                })

                        learning_rate_scalar = self.scheduler.get_last_lr()[0]
                        logger.info("Learning rate: %f (at step %d)",
                                    learning_rate_scalar, step)
                        logs["learning_rate"] = learning_rate_scalar
                        logs["train_loss"] = train_loss

                        self.after_logging(logs)

                        logger.info("Batch loss: %f", batch_loss)

                        # for key, value in logs.items():
                        #     tb_writer.add_scalar(key, value, self.global_step)

                    if self.config.save_steps > 0 and self.global_step % self.config.save_steps == 0:
                        # Save model checkpoint
                        self.save_checkpoint()

                if self.config.max_steps > 0 and self.global_step > self.config.max_steps:
                    epoch_iterator.close()
                    break
            if self.config.max_steps > 0 and self.global_step > self.config.max_steps:
                train_iterator.close()
                break

        if self.config.train_head_only:
            logger.info("Training only head")
            # for param in self.model.__getattr__(self.config.model_type).parameters():
            #     param.requires_grad = True

            for param in self.model.roberta.embeddings.parameters():
                param.requires_grad = False

        tb_writer.close()
        self.optimizer_state_dict = self.optimizer.state_dict()
        self.scheduler_state_dict = self.scheduler.state_dict()

        avg_train_loss = train_loss / self.global_step

        logger.info("Learning rate now: %s", self.scheduler.get_last_lr())
        logger.info("***** Done training *****")
        return self.global_step, avg_train_loss
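A small stand-alone helper mirroring the step arithmetic at the top of train (the function name is chosen here for illustration): with gradient accumulation, one optimizer step consumes grad_acc_steps mini-batches, and a positive max_steps caps training regardless of the epoch count.

def total_optimization_steps(num_batches, grad_acc_steps, train_epochs, max_steps=0):
    if max_steps > 0:
        return max_steps
    return num_batches // grad_acc_steps * train_epochs

print(total_optimization_steps(num_batches=1000, grad_acc_steps=4, train_epochs=3))  # 750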
コード例 #20
0
class ToxicityClassifier():
    def __init__(self, data, annotators, params, task_labels=["toxic"]):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.device = torch.device('cuda')
        self.data = data
        self.annotators = annotators

        self.multi_label, self.multi_task, self.ensemble, self.single, self.log_reg = False, False, False, False, False
        setattr(self, params.task, True)

        if self.single or self.log_reg:
            self.task_labels = task_labels
        else:
            self.task_labels = annotators

        self.majority_vote()
        self.uncertainty()
        print("Data shape after majority voting", self.data.shape)

        # Setting the parameters
        self.params = params
        print([(k, v) for k, v in self.params.__dict__.items()])

    def majority_vote(self):
        self.data["toxic"] = (self.data[self.annotators].sum(axis=1) / \
                              self.data[self.annotators].count(axis=1) >= 0.5).astype(int)
        print(sum(self.data["toxic"]))

    def uncertainty(self):
        # s * (n - s) / n^2, i.e. the variance of the annotators' binary votes
        votes = self.data[self.annotators].sum(axis=1)
        counts = self.data[self.annotators].count(axis=1)
        self.data["uncertainty"] = votes * (counts - votes) / (counts * counts)

    def CV(self):
        if self.ensemble:
            ensemble_results = pd.DataFrame()
            for annotator in self.annotators:
                print("Training model for annotator", annotator)
                self.task_labels = ["toxic"]
                scores, results = self._CV(
                    self.data.rename(columns={
                        annotator: "toxic",
                        "toxic": "_toxic"
                    }))
                ensemble_results[annotator + "_pred"] = results["toxic_pred"]
                ensemble_results[annotator + "_label"] = results["toxic_label"]
                ensemble_results[annotator +
                                 "_masked_pred"] = results["toxic_masked_pred"]
                ensemble_results[
                    annotator +
                    "_masked_label"] = results["toxic_masked_label"]
            self.task_labels = self.annotators
            scores = self.report_results(ensemble_results)
            return scores, ensemble_results
        else:
            return self._CV(self.data)

    def masks(self, df):
        df = df.replace(0, 1)
        df = df.replace(np.nan, 0)
        new_labels = LabelEncoder().fit_transform(
            [''.join(str(l) for l in row) for i, row in df.iterrows()])
        return new_labels

    def _CV(self, data):
        if self.params.stratified:
            kfold = StratifiedKFold(n_splits=self.params.num_folds,
                                    shuffle=True,
                                    random_state=self.params.random_state)
        else:
            kfold = KFold(n_splits=self.params.num_folds,
                          shuffle=True,
                          random_state=self.params.random_state)

        results = pd.DataFrame()
        i = 1
        for train_idx, test_idx in kfold.split(
                np.zeros(self.data.shape[0]),
                self.masks(self.data[self.annotators])):
            print("Fold #", i)

            train = data.loc[train_idx].reset_index()
            test = data.loc[test_idx].reset_index()
            """
            if i == 1:
              test.to_csv(os.path.join(self.params.source_dir, "results", "GHC", "test_file.csv"), index=False)
            else:
              test.to_csv(os.path.join(self.params.source_dir, "results", "GHC", "test_file.csv"), index=False, header=False, mode="a")
            """

            train_batches = self.get_batches(train)
            test_batches = self.get_batches(test)

            self.train_model(train_batches)
            if self.params.predict == "label":
                # testing on the validation set
                fold_result = self.predict(test_batches)
                print("Test:")
                print(self.report_results(fold_result))

                fold_result["fold"] = pd.Series([i for id in test_idx])
                results = pd.concat([results, fold_result])
                i += 1
            elif self.params.predict == "mc":
                certainty_results = self.mc_predict(test_batches)
                fold_result = self.predict(test_batches)
                fold_result["fold"] = pd.Series([i for id in test_idx])
                fold_result = fold_result.join(certainty_results)
                results = pd.concat([results, fold_result])

        scores = self.report_results(results)
        print(scores)
        return scores, results

    def new_model(self):
        if self.multi_task:
            return ClassifierBert(self.device, tasks=self.annotators)
        elif self.multi_label:
            return ClassifierBert(self.device, labels=len(self.annotators))
        elif self.log_reg:
            return ClassifierBert(self.device,
                                  labels=1,
                                  tasks=self.task_labels)
        else:
            return ClassifierBert(self.device)

    def create_loss_functions(self):
        losses = dict()
        # self.class_weight = dict()

        for task_label in self.task_labels:
            _labels = [int(x) for x in self.data[task_label].dropna().tolist()]
            weight = compute_class_weight(class_weight='balanced',
                                          classes=np.unique(_labels),
                                          y=_labels)
            if len(weight) == 1:
                weight = [0.01, 1]
            weight = torch.tensor(weight, dtype=torch.float32).to(self.device)

            if self.multi_label:
                losses[task_label] = nn.BCEWithLogitsLoss(
                    reduction="sum")  # , pos_weight=class_weight)
            elif self.log_reg:
                losses[task_label] = nn.MSELoss()
            else:
                losses[task_label] = nn.CrossEntropyLoss(weight=weight)

        return losses

    def train_model(self, batches):
        self.model = self.new_model()
        self.model = self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(),
                               lr=self.params.learning_rate)

        train_batches, val_batches = train_test_split(
            batches,
            shuffle=True,
            random_state=self.params.random_state,
            test_size=.1)

        self.loss = self.create_loss_functions()
        for epoch in range(self.params.num_epochs):

            loss_val = 0
            self.model.train()

            for batch in train_batches:
                X_ids = torch.tensor(batch["inputs"]).to(self.device)
                X_att = torch.tensor(batch["attentions"]).to(self.device)
                if len([
                        x for task_label in self.task_labels
                        for x in batch["masks"][task_label]
                ]) == 0:
                    continue

                logits, _ = self.model(X_ids, attn=X_att)
                class_loss = dict()
                weighted_sum = 0
                for task_label in self.task_labels:
                    masked_logits = logits[task_label][batch["masks"]
                                                       [task_label]]
                    masked_labels = [
                        batch["labels"][task_label][x]
                        for x in batch["masks"][task_label]
                    ]
                    if self.multi_task or self.ensemble:
                        masked_labels = torch.tensor(masked_labels).type(
                            "torch.LongTensor").to(self.device)
                    else:
                        masked_labels = torch.tensor(masked_labels).to(
                            self.device)

                    if len(batch["masks"][task_label]) > 0:
                        ## list of loss values for each batch instance
                        class_loss[task_label] = self.loss[task_label](
                            masked_logits, masked_labels)

                        ## using a column of the data as the weight for loss value of each instance
                        # Batch["weight"] shows the instance weight (based on its certainty), class_weight shows the class weight for positive and negative labels
                        # batch["weights"][batch_i] *
                        """
                        class_loss[task_label] = sum([ batch_loss[mask_i] * self.class_weight[task_label][masked_labels[mask_i]]
                                                      for mask_i, batch_i in enumerate(batch["masks"][task_label])])
                        weighted_sum += sum([self.class_weight[task_label][label] for label in masked_labels])
                        """
                total_loss = sum(class_loss.values())  # / weighted_sum
                loss_val += total_loss.item()
                self.optimizer.zero_grad()  # clear gradients left over from the previous batch
                total_loss.backward()
                self.optimizer.step()

            print("Epoch", epoch, "-", "Loss", round(loss_val, 3))
            if val_batches:
                val_results = self.predict(val_batches, self.model)
                print("Validation:")
                print(self.report_results(val_results))

    def predict(self, batches, model=None):
        self.model.eval()
        results = defaultdict(list)

        for batch in batches:

            X_ids = torch.tensor(batch["inputs"]).to(self.device)
            X_att = torch.tensor(batch["attentions"]).to(self.device)

            logits, predictions = self.model(X_ids, attn=X_att)

            for task_label in self.task_labels:
                masked_labels = [
                    x if x in batch["masks"][task_label] else np.nan
                    for x in batch["labels"][task_label]
                ]
                masked_predictions = [
                    x if x in batch["masks"][task_label] else np.nan
                    for x in predictions[task_label]
                ]

                results[task_label + "_masked_pred"].extend(masked_predictions)
                results[task_label + "_masked_label"].extend(masked_labels)
                results[task_label + "_pred"].extend(predictions[task_label])
                results[task_label + "_label"].extend(
                    batch["labels"][task_label])

                if self.params.task == "single":
                    results[task_label + "_logit"].extend(
                        softmax(logits[task_label].cpu().detach().numpy(),
                                axis=1)[:, 1])

        return pd.DataFrame.from_dict(results)

    def mc_predict(self, batches, model=None):
        results = defaultdict(list)
        soft = nn.Softmax(dim=1)
        num_samples = sum([batch["batch_len"] for batch in batches])
        dropout_predictions = np.empty((0, num_samples, 1))

        for task_label in self.task_labels:
            for mc_pass in range(self.params.mc_passes):
                self.model.eval()
                self.enable_dropout(self.model)
                mc_predictions = np.empty((0, 1))

                for batch in batches:
                    X_ids = torch.tensor(batch["inputs"]).to(self.device)
                    X_att = torch.tensor(batch["attentions"]).to(self.device)
                    logits, predictions = self.model(X_ids, attn=X_att)

                    predictions = np.array(predictions[task_label])
                    mc_predictions = np.vstack(
                        (mc_predictions, predictions[:, np.newaxis]))

                dropout_predictions = np.vstack(
                    (dropout_predictions, mc_predictions[np.newaxis, :]))
            results[task_label + "_mean"] = list(
                np.squeeze(np.mean(dropout_predictions, axis=0)))
            results[task_label + "_variance"] = list(
                np.squeeze(np.var(dropout_predictions, axis=0)))

        return pd.DataFrame.from_dict(results)

    def enable_dropout(self, model):
        for m in model.modules():
            if m.__class__.__name__.startswith('Dropout'):
                m.train()

    def report_results(self, results):
        if self.log_reg:
            label_col = self.task_labels[0] + "_label"
            pred_col = self.task_labels[0] + "_pred"
            r2 = r2_score(results[label_col], results[pred_col])
            scores = {"r2": round(r2, 4)}
            return scores
        if len(self.task_labels) > 1:
            label_cols = [col + "_label" for col in self.annotators]
            pred_cols = [col + "_pred" for col in self.annotators]

            masked_label_cols = [
                col + "_masked_label" for col in self.annotators
            ]
            masked_pred_cols = [
                col + "_masked_pred" for col in self.annotators
            ]

            toxic_label = results[label_cols].sum(
                axis=1) / results[label_cols].count(axis=1) >= 0.5
            toxic_pred = results[pred_cols].sum(
                axis=1) / results[pred_cols].count(axis=1) >= 0.5

            masked_toxic_label = results[masked_label_cols].sum(
                axis=1) / results[masked_label_cols].count(axis=1) >= 0.5
            masked_toxic_pred = results[masked_pred_cols].sum(
                axis=1) / results[masked_pred_cols].count(axis=1) >= 0.5

            print("Accuracy of the majority vote (after masking):")

            result_cat = masked_toxic_label.map({
                True: "T",
                False: "F"
            }) + masked_toxic_pred.map({
                True: "T",
                False: "F"
            })
            result_cat = result_cat.map({
                "TT": "TP",
                "FF": "TN",
                "TF": "FN",
                "FT": "FP"
            })
            true_results = result_cat.isin(["TP", "TN"])

            counts = Counter(result_cat)
            a = Counter(true_results)[True] / results.shape[0]
            p = counts["TP"] / max((counts["TP"] + counts["FP"]), 1)
            r = counts["TP"] / max((counts["TP"] + counts["FN"]), 1)
            try:
                f = 2 * p * r / (p + r)
            except Exception:
                f = 0
            print({
                "A": round(a, 4),
                "P": round(p, 4),
                "R": round(r, 4),
                "F1": round(f, 4)
            })

            print("Accuracy of the majority vote (using all annotator heads):")
        else:
            toxic_label = results["toxic_label"] == 1
            toxic_pred = results["toxic_pred"] == 1
            print("Accuracy of single label")

        result_cat = toxic_label.map({
            True: "T",
            False: "F"
        }) + toxic_pred.map({
            True: "T",
            False: "F"
        })
        result_cat = result_cat.map({
            "TT": "TP",
            "FF": "TN",
            "TF": "FN",
            "FT": "FP"
        })
        true_results = result_cat.isin(["TP", "TN"])

        counts = Counter(result_cat)
        a = Counter(true_results)[True] / results.shape[0]
        p = counts["TP"] / max((counts["TP"] + counts["FP"]), 1)
        r = counts["TP"] / max((counts["TP"] + counts["FN"]), 1)
        try:
            f = 2 * p * r / (p + r)
        except Exception:
            f = 0

        scores = {
            "A": round(a, 4),
            "P": round(p, 4),
            "R": round(r, 4),
            "F1": round(f, 4)
        }
        return scores

    def get_batches(self, data):
        if isinstance(self.params.sort_by, str):
            data = data.sort_values(by=[self.params.sort_by],
                                    ascending=False).reset_index()
        batches = list()

        for s in range(0, len(data), self.params.batch_size):
            e = s + self.params.batch_size if s + self.params.batch_size < len(
                data) else len(data)
            data_info = self.batch_to_info(data["text"].tolist()[s:e])

            anno_batch = dict()
            mask_batch = dict()
            for task_label in self.task_labels:
                anno_batch[task_label] = data[task_label].tolist()[s:e]
                mask_batch[task_label] = [i for i, h in enumerate(anno_batch[task_label]) \
                                          if not math.isnan(h)]
            data_info["labels"] = anno_batch
            data_info["masks"] = mask_batch

            # data_info["majority_vote"] = data["toxic"].tolist()[s: e]
            data_info["batch_len"] = e - s
            if isinstance(self.params.batch_weight, str):
                data_info["weights"] = data[
                    self.params.batch_weight].tolist()[s:e]
            else:
                data_info["weights"] = [1 for i in range(e - s)]
            batches.append(data_info)
        return batches

    def batch_to_info(self, batch):
        batch_info = dict()
        if isinstance(self.params.max_len, int):
            tokens = self.tokenizer(batch,
                                    padding="max_length",
                                    max_length=self.params.max_len,
                                    truncation=True)
        else:
            tokens = self.tokenizer(batch, padding=True, truncation=True)
        batch_info["inputs"] = tokens["input_ids"]
        batch_info["attentions"] = tokens["attention_mask"]
        return batch_info
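A toy numpy illustration (values made up) of the aggregation performed in mc_predict above: predictions from several stochastic forward passes are stacked along a new leading axis, then reduced to a per-sample mean and variance as an uncertainty estimate.

import numpy as np

num_passes, num_samples = 5, 4
rng = np.random.default_rng(0)
dropout_predictions = rng.integers(0, 2, size=(num_passes, num_samples, 1)).astype(float)
mean = np.squeeze(np.mean(dropout_predictions, axis=0))     # average prediction per sample
variance = np.squeeze(np.var(dropout_predictions, axis=0))  # disagreement across passes
print(mean)
print(variance)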
コード例 #21
0
                                 hidden_size=768,
                                 drop_rate=0.1)
model_name = 'bert_encoder_on_fewrel'

# set optimizer
batch_size = 32
train_epoch = 10

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

framework.train_encoder_epoch(model,
                              model_name,
                              optimizer=optimizer,
                              batch_size=batch_size,
                              train_epoch=train_epoch,
                              learning_rate=2e-5,
                              warmup=True)
コード例 #22
0
class MAML:

    def __init__(self, device, **kwargs):
        self.inner_lr = kwargs.get('inner_lr')
        self.meta_lr = kwargs.get('meta_lr')
        self.write_prob = kwargs.get('write_prob')
        self.replay_rate = kwargs.get('replay_rate')
        self.replay_every = kwargs.get('replay_every')
        self.device = device

        self.pn = TransformerClsModel(model_name=kwargs.get('model'),
                                      n_classes=1,
                                      max_length=kwargs.get('max_length'),
                                      device=device)

        logger.info('Loaded {} as PN'.format(self.pn.__class__.__name__))

        meta_params = [p for p in self.pn.parameters() if p.requires_grad]
        self.meta_optimizer = AdamW(meta_params, lr=self.meta_lr)

        self.memory = ReplayMemory(write_prob=self.write_prob, tuple_size=3)
        self.loss_fn = nn.BCEWithLogitsLoss()

        inner_params = [p for p in self.pn.parameters() if p.requires_grad]
        self.inner_optimizer = optim.SGD(inner_params, lr=self.inner_lr)

    def save_model(self, model_path):
        checkpoint = self.pn.state_dict()
        torch.save(checkpoint, model_path)

    def load_model(self, model_path):
        checkpoint = torch.load(model_path)
        self.pn.load_state_dict(checkpoint)

    def evaluate(self, dataloader, updates, mini_batch_size):

        self.pn.train()

        support_set = []
        for _ in range(updates):
            text, label, candidates = self.memory.read_batch(batch_size=mini_batch_size)
            support_set.append((text, label, candidates))

        with higher.innerloop_ctx(self.pn, self.inner_optimizer,
                                  copy_initial_weights=False,
                                  track_higher_grads=False) as (fpn, diffopt):

            # Inner loop
            task_predictions, task_labels = [], []
            support_loss = []
            for text, label, candidates in support_set:
                replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text,
                                                                                                         label,
                                                                                                         candidates)

                input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                output = fpn(input_dict)
                targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                loss = self.loss_fn(output, targets)

                diffopt.step(loss)
                pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)
                support_loss.append(loss.item())
                task_predictions.extend(pred.tolist())
                task_labels.extend(true_labels.tolist())

            acc = models.utils.calculate_accuracy(task_predictions, task_labels)

            logger.info('Support set metrics: Loss = {:.4f}, accuracy = {:.4f}'.format(np.mean(support_loss), acc))

            all_losses, all_predictions, all_labels = [], [], []

            for text, label, candidates in dataloader:
                replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text,
                                                                                                         label,
                                                                                                         candidates)
                with torch.no_grad():

                    input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                    loss = self.loss_fn(output, targets)

                loss = loss.item()
                pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)
                all_losses.append(loss)
                all_predictions.extend(pred.tolist())
                all_labels.extend(true_labels.tolist())

        acc = models.utils.calculate_accuracy(all_predictions, all_labels)
        logger.info('Test metrics: Loss = {:.4f}, accuracy = {:.4f}'.format(np.mean(all_losses), acc))

        return acc

    def training(self, train_datasets, **kwargs):
        updates = kwargs.get('updates')
        mini_batch_size = kwargs.get('mini_batch_size')

        if self.replay_rate != 0:
            replay_batch_freq = self.replay_every // mini_batch_size
            replay_freq = int(math.ceil((replay_batch_freq + 1) / (updates + 1)))
            replay_steps = int(self.replay_every * self.replay_rate / mini_batch_size)
        else:
            replay_freq = 0
            replay_steps = 0
        logger.info('Replay frequency: {}'.format(replay_freq))
        logger.info('Replay steps: {}'.format(replay_steps))

        concat_dataset = data.ConcatDataset(train_datasets)
        train_dataloader = iter(data.DataLoader(concat_dataset, batch_size=mini_batch_size, shuffle=False,
                                                collate_fn=datasets.utils.rel_encode))

        episode_id = 0
        while True:

            self.inner_optimizer.zero_grad()
            support_loss, support_acc = [], []

            with higher.innerloop_ctx(self.pn, self.inner_optimizer,
                                      copy_initial_weights=False,
                                      track_higher_grads=False) as (fpn, diffopt):

                # Inner loop
                support_set = []
                task_predictions, task_labels = [], []
                for _ in range(updates):
                    try:
                        text, label, candidates = next(train_dataloader)
                        support_set.append((text, label, candidates))
                    except StopIteration:
                        logger.info('Terminating training as all the data is seen')
                        return

                for text, label, candidates in support_set:
                    replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text,
                                                                                                             label,
                                                                                                             candidates)

                    input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                    loss = self.loss_fn(output, targets)

                    diffopt.step(loss)
                    pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)
                    support_loss.append(loss.item())
                    task_predictions.extend(pred.tolist())
                    task_labels.extend(true_labels.tolist())
                    self.memory.write_batch(text, label, candidates)

                acc = models.utils.calculate_accuracy(task_predictions, task_labels)

                logger.info('Episode {} support set: Loss = {:.4f}, accuracy = {:.4f}'.format(episode_id + 1,
                                                                                              np.mean(support_loss),
                                                                                              acc))

                # Outer loop
                query_loss, query_acc = [], []
                query_set = []

                if self.replay_rate != 0 and (episode_id + 1) % replay_freq == 0:
                    for _ in range(replay_steps):
                        text, label, candidates = self.memory.read_batch(batch_size=mini_batch_size)
                        query_set.append((text, label, candidates))
                else:
                    try:
                        text, label, candidates = next(train_dataloader)
                        query_set.append((text, label, candidates))
                        self.memory.write_batch(text, label, candidates)
                    except StopIteration:
                        logger.info('Terminating training as all the data is seen')
                        return

                for text, label, candidates in query_set:
                    replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text,
                                                                                                             label,
                                                                                                             candidates)

                    input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                    loss = self.loss_fn(output, targets)

                    query_loss.append(loss.item())
                    pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)

                    acc = models.utils.calculate_accuracy(pred.tolist(), true_labels.tolist())
                    query_acc.append(acc)

                    # PN meta gradients
                    pn_params = [p for p in fpn.parameters() if p.requires_grad]
                    meta_pn_grads = torch.autograd.grad(loss, pn_params)
                    pn_params = [p for p in self.pn.parameters() if p.requires_grad]
                    for param, meta_grad in zip(pn_params, meta_pn_grads):
                        if param.grad is not None:
                            param.grad += meta_grad.detach()
                        else:
                            param.grad = meta_grad.detach()

                # Meta optimizer step
                self.meta_optimizer.step()
                self.meta_optimizer.zero_grad()

                logger.info('Episode {} query set: Loss = {:.4f}, accuracy = {:.4f}'.format(episode_id + 1,
                                                                                            np.mean(query_loss),
                                                                                            np.mean(query_acc)))

                episode_id += 1

    def testing(self, test_dataset, **kwargs):
        updates = kwargs.get('updates')
        mini_batch_size = kwargs.get('mini_batch_size')
        test_dataloader = data.DataLoader(test_dataset, batch_size=mini_batch_size, shuffle=False,
                                          collate_fn=datasets.utils.rel_encode)
        acc = self.evaluate(dataloader=test_dataloader, updates=updates, mini_batch_size=mini_batch_size)
        logger.info('Overall test metrics: Accuracy = {:.4f}'.format(acc))
        return acc
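
The trainer above pairs `higher.innerloop_ctx` with a hand-copied first-order meta-gradient (`torch.autograd.grad` plus `param.grad +=`). Stripped of the relation-ranking specifics, the same inner/outer-loop pattern reduces to the minimal sketch below; the toy model, data, and names are purely illustrative, and it uses full second-order gradients through `higher` rather than the manual first-order copy used above.

import torch
import torch.nn as nn
import torch.optim as optim
import higher

# Toy setup: a linear model standing in for the transformer classifier.
model = nn.Linear(4, 1)
inner_optimizer = optim.SGD(model.parameters(), lr=1e-2)
meta_optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

support_x, support_y = torch.randn(8, 4), torch.randn(8, 1)
query_x, query_y = torch.randn(8, 4), torch.randn(8, 1)

meta_optimizer.zero_grad()
with higher.innerloop_ctx(model, inner_optimizer,
                          copy_initial_weights=False) as (fmodel, diffopt):
    # Inner loop: adapt a functional copy of the model on the support set.
    for _ in range(3):
        diffopt.step(loss_fn(fmodel(support_x), support_y))

    # Outer loop: the query loss of the adapted copy backpropagates
    # through the inner updates into the original parameters.
    loss_fn(fmodel(query_x), query_y).backward()

meta_optimizer.step()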
コード例 #23
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)

    ## other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument("--max_seq_length", default=256, type=int)
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=False, type=boolean_string)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--num_train_epochs", default=10, type=float)
    parser.add_argument("--warmup_proprotion", default=0.1, type=float)
    parser.add_argument("--use_weight", default=1, type=int)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=2020)
    parser.add_argument("--fp16", default=False)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--logging_steps", default=500, type=int)
    parser.add_argument("--clean",
                        default=False,
                        type=boolean_string,
                        help="clean the output dir")

    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)

    args = parser.parse_args()

    device = torch.device("cuda")
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)

    logger.info(f"device: {device} n_gpu: {n_gpu}")

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H')
    # tmp_dir = args.output_dir + '/' +str(now_time) + '_ernie'
    # if not os.path.exists(tmp_dir):
    #     os.makedirs(tmp_dir)
    # args.output_dir = tmp_dir
    if args.clean and args.do_train:
        # logger.info("清理")
        if os.path.exists(args.output_dir):

            def del_file(path):
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print(c_path)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('please remove the files of the output dir and data.conf')
                exit(-1)

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if not os.path.exists(os.path.join(args.output_dir, "eval")):
        os.makedirs(os.path.join(args.output_dir, "eval"))

    writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"),
                           comment="Linear")

    processor = NerProcessor()
    label_list = get_labels(r"./data/labels.txt")
    num_labels = len(label_list)
    args.label_list = label_list

    if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")):
        with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f:
            label2id = pickle.load(f)
    else:
        label2id = {l: i for i, l in enumerate(label_list)}
        with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f:
            pickle.dump(label2id, f)

    id2label = {value: key for key, value in label2id.items()}

    # Prepare optimizer and schedule (linear warmup and decay)

    if args.do_train:
        tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels)
        model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path,
                                                config=config,
                                                need_birnn=args.need_birnn,
                                                rnn_dim=args.rnn_dim)

        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        train_examples, train_features, train_data = get_Dataset(args,
                                                                 processor,
                                                                 tokenizer,
                                                                 label_list,
                                                                 mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(args,
                                                                  processor,
                                                                  tokenizer,
                                                                  label_list,
                                                                  mode="eval")

        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Total optimization steps = %d", t_total)

        model.train()
        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        best_f1 = 0.0
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, bbox, bbox_pos_id, bbox_num = batch
                outputs = model(input_ids, bbox, bbox_pos_id, bbox_num,
                                label_ids, segment_ids, input_mask)
                loss = outputs

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        tr_loss_avg = (tr_loss -
                                       logging_loss) / args.logging_steps
                        writer.add_scalar("Train/loss", tr_loss_avg,
                                          global_step)
                        logging_loss = tr_loss

            if args.do_eval:
                all_ori_tokens_eval = [f.ori_tokens for f in eval_features]
                overall, by_type = evaluate(args, eval_data, model, id2label,
                                            all_ori_tokens_eval)

                # add eval result to tensorboard
                f1_score = overall.fscore
                writer.add_scalar("Eval/precision", overall.prec, ep)
                writer.add_scalar("Eval/recall", overall.rec, ep)
                writer.add_scalar("Eval/f1_score", overall.fscore, ep)

                # save the best performs model
                if f1_score > best_f1:
                    logger.info(
                        f"----------the best f1 is {f1_score}---------")
                    best_f1 = f1_score
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)

                    # Good practice: save your training arguments together with the trained model
                    torch.save(
                        args, os.path.join(args.output_dir,
                                           'training_args.bin'))

            # logger.info(f'epoch {ep}, train loss: {tr_loss}')
        # writer.add_graph(model)
        writer.close()

        # model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        # model_to_save.save_pretrained(args.output_dir)
        # tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        # torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    if args.do_test:
        # model = BertForTokenClassification.from_pretrained(args.output_dir)
        # model.to(device)
        label_map = {i: label for i, label in enumerate(label_list)}

        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        #args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
        model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir,
                                                need_birnn=args.need_birnn,
                                                rnn_dim=args.rnn_dim)
        model.to(device)

        test_examples, test_features, test_data = get_Dataset(args,
                                                              processor,
                                                              tokenizer,
                                                              label_list,
                                                              mode="test")

        logger.info("***** Running test *****")
        logger.info(f" Num examples = {len(test_examples)}")
        logger.info(f" Batch size = {args.eval_batch_size}")

        all_ori_tokens = [f.ori_tokens for f in test_features]
        all_ori_labels = [e.label for e in test_examples]
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()

        pred_labels = []

        for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox,
                  bbox_pos_id, bbox_num) in enumerate(
                      tqdm(test_dataloader, desc="Predicting")):

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            bbox = bbox.to(device)
            bbox_pos_id = bbox_pos_id.to(device)
            bbox_num = bbox_num.to(device)

            with torch.no_grad():
                logits = model.predict(input_ids, segment_ids, input_mask,
                                       bbox, bbox_pos_id, bbox_num)
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()

            for l in logits:

                pred_label = []
                for idx in l:
                    pred_label.append(id2label[idx])
                pred_labels.append(pred_label)

        assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels)
        print(len(pred_labels))
        with open(os.path.join(args.output_dir, "token_labels_.txt"),
                  "w",
                  encoding="utf-8") as f:
            for ori_tokens, ori_labels, prel in zip(all_ori_tokens,
                                                    all_ori_labels,
                                                    pred_labels):
                for ot, ol, pl in zip(ori_tokens, ori_labels, prel):
                    if ot in ["[CLS]", "[SEP]"]:
                        f.write("\n")
                        continue
                    else:
                        f.write(f"{ot} {ol} {pl}\n")
                f.write("\n")
コード例 #24
0
def train(args, processor, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    train_dataset = load_and_cache_examples(args,
                                            processor,
                                            tokenizer,
                                            evaluate=False)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }

            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, processor, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()

    return global_step, tr_loss / global_step
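
The loop above only steps the optimizer every `gradient_accumulation_steps` mini-batches. Reduced to its essentials with a toy model and random batches (names illustrative), the ordering of backward, clip, step, and zero_grad is:

import torch
import torch.nn as nn

model = nn.Linear(8, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
accum_steps, max_grad_norm = 4, 1.0

batches = [(torch.randn(4, 8), torch.randn(4, 1)) for _ in range(16)]

model.zero_grad()
for step, (x, y) in enumerate(batches):
    loss = loss_fn(model(x), y) / accum_steps  # scale so the accumulated sum matches one full batch
    loss.backward()                            # gradients accumulate across mini-batches
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        model.zero_grad()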
コード例 #25
0
ファイル: pretrained.py プロジェクト: ErikEkstedt/TurnGPT
    def configure_optimizers(self):
        return AdamW(
            self.model.parameters(), lr=self.hparams.learning_rate, correct_bias=True
        )
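
`configure_optimizers` is the PyTorch Lightning hook that TurnGPT appears to implement here. A minimal sketch of the kind of LightningModule it would sit in is shown below, assuming `pytorch_lightning` and the `transformers` AdamW with its `correct_bias` flag; the toy model and hyperparameter are illustrative.

import pytorch_lightning as pl
import torch.nn as nn
import torch.nn.functional as F
from transformers import AdamW


class TinyLM(pl.LightningModule):
    def __init__(self, learning_rate=1e-4):
        super().__init__()
        self.save_hyperparameters()          # exposes self.hparams.learning_rate
        self.model = nn.Linear(16, 16)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return F.mse_loss(self.model(x), y)

    def configure_optimizers(self):
        # Lightning calls this once and drives optimizer.step() itself.
        return AdamW(self.model.parameters(),
                     lr=self.hparams.learning_rate,
                     correct_bias=True)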
コード例 #26
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            # print("loss: "+ str(loss))
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1
                ) % args.gradient_accumulation_steps == 0 and not args.tpu:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.tpu:
                args.xla_model.optimizer_step(optimizer, barrier=True)
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
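
This example gates fp16 behind NVIDIA apex. For comparison only (not what the snippet uses), the same mixed-precision pattern with PyTorch's built-in `torch.cuda.amp` reduces to the following sketch, with a toy model and illustrative values; it degrades to a no-op on CPU.

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(8, 2).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
loss_fn = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler(enabled=device.type == "cuda")

x = torch.randn(4, 8, device=device)
y = torch.randint(0, 2, (4,), device=device)

optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=device.type == "cuda"):
    loss = loss_fn(model(x), y)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)                       # so clip_grad_norm_ sees unscaled gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()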
コード例 #27
0
ファイル: run_ner.py プロジェクト: SH-NLP/shin_bert
def train(args, model, train_dataset, dev_dataset=None, test_dataset=None):
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(t_total * args.warmup_proportion),
        num_training_steps=t_total)

    # if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
    #         os.path.join(args.model_name_or_path, "scheduler.pt")
    # ):
    #     # Load optimizer and scheduler states
    #     optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    #     scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total train batch size = %d", args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Logging steps = %d", args.logging_steps)
    logger.info("  Save steps = %d", args.save_steps)

    global_step = 0
    tr_loss = 0.0

    model.zero_grad()
    mb = master_bar(range(int(args.num_train_epochs)))
    for epoch in mb:
        epoch_iterator = progress_bar(train_dataloader, parent=mb)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type not in ["distilkobert", "xlm-roberta"]:
                inputs["token_type_ids"] = batch[
                    2]  # Distilkobert, XLM-Roberta don't use segment_ids
            outputs = model(**inputs)

            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0 or (
                    len(train_dataloader) <= args.gradient_accumulation_steps
                    and (step + 1) == len(train_dataloader)):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.evaluate_test_during_training:
                        evaluate(args, model, test_dataset, "test",
                                 global_step)
                    else:
                        evaluate(args, model, dev_dataset, "dev", global_step)

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info(
                        "Saving model checkpoint to {}".format(output_dir))

                    if args.save_optimizer:
                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   os.path.join(output_dir, "scheduler.pt"))
                        logger.info(
                            "Saving optimizer and scheduler states to {}".
                            format(output_dir))

            if args.max_steps > 0 and global_step > args.max_steps:
                break

        mb.write("Epoch {} done".format(epoch + 1))

        if args.max_steps > 0 and global_step > args.max_steps:
            break

    return global_step, tr_loss / global_step
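
The commented-out block near the top of this example hints at how training would resume from the `optimizer.pt` and `scheduler.pt` files it saves. A small sketch of that round trip (toy model; paths and values illustrative, the `transformers` scheduler assumed available):

import os
import torch
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup

output_dir = "checkpoint-100"
os.makedirs(output_dir, exist_ok=True)

model = nn.Linear(8, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10,
                                            num_training_steps=100)

# Save, mirroring what the example writes at each checkpoint.
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

# Resume: rebuild the same objects, then load their states.
optimizer.load_state_dict(torch.load(os.path.join(output_dir, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(output_dir, "scheduler.pt")))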
コード例 #28
0
ファイル: model064.py プロジェクト: kurupical/riiid
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "category"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="train_0",
            load_feature=not is_debug,
            save_feature=not is_debug)

        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300"
        ]]
        print(df.head(10))

        print("data preprocess")

        train_idx = []
        val_idx = []
        np.random.seed(0)
        for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
            if np.random.random() < 0.01:
                # all val
                val_idx.extend(w_df.index.tolist())
            else:
                train_num = int(len(w_df) * 0.95)
                train_idx.extend(w_df[:train_num].index.tolist())
                val_idx.extend(w_df[train_num:].index.tolist())
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=pd.DataFrame())
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])
    if not load_pickle or is_debug:
        df["is_val"] = 0
        df["is_val"].loc[val_idx] = 1
        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model064", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model064/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model064/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model064/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model064/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    for item in tqdm(dataloader_val):
        x = item["x"].to(device).long()
        target_id = item["target_id"].to(device).long()
        part = item["part"].to(device).long()
        label = item["label"].to(device).float()
        elapsed_time = item["elapsed_time"].to(device).long()
        duration_previous_content = item["duration_previous_content"].to(
            device).long()

        output = model(x, target_id, part, elapsed_time,
                       duration_previous_content)

        preds.extend(torch.nn.Sigmoid()(
            output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
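
The validation pass above collects the last-position logits, squashes them with a sigmoid, and scores them with `roc_auc_score`. In isolation, with random toy tensors and illustrative shapes, that step is just:

import torch
from sklearn.metrics import roc_auc_score

# Pretend batch of per-step logits and labels: (batch, seq_len); only the last step is scored.
logits = torch.randn(32, 100)
labels = torch.randint(0, 2, (32, 100)).float()

preds = torch.sigmoid(logits[:, -1]).view(-1).cpu().numpy()
targets = labels[:, -1].view(-1).cpu().numpy()

# Assumes both classes appear among the last-step labels.
print("AUC: {:.4f}".format(roc_auc_score(targets, preds)))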
コード例 #29
0
    def train(self, train_dataset, output_dir, show_running_loss=True):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        tokenizer = self.tokenizer
        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter()
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"])

        t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in model.named_parameters() if not any(
                nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]},
            {"params": [p for n, p in model.named_parameters() if any(
                nd in n for nd in no_decay)], "weight_decay": 0.0}
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args["warmup_steps"], t_total=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

            model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch")

        for _ in train_iterator:
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)
                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]
                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                        # Log metrics
                        # Only evaluate when single GPU otherwise metrics may not average well
                        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss)/args["logging_steps"], global_step)
                        logging_loss = tr_loss

                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                        # Save model checkpoint
                        output_dir = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)

                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(model, "module") else model
                        model_to_save.save_pretrained(output_dir)

        return global_step, tr_loss / global_step
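
The `model.module if hasattr(model, "module") else model` idiom that recurs in these examples simply unwraps a DataParallel/DistributedDataParallel container before `save_pretrained`. A compact sketch, using a randomly initialized toy BERT config and an illustrative directory name:

import os
import torch
from transformers import BertConfig, BertModel

config = BertConfig(hidden_size=64, num_hidden_layers=1,
                    num_attention_heads=2, intermediate_size=128)
model = BertModel(config)                      # randomly initialized, nothing downloaded
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

output_dir = "checkpoint-demo"
os.makedirs(output_dir, exist_ok=True)

# Unwrap the (possible) parallel wrapper before saving.
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_dir)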
Code Example #30
def train(args):
    random.seed(12345)
    np.random.seed(12345)
    torch.manual_seed(12345)
    if args.distributed:
        torch.cuda.manual_seed_all(12345)

    if args.distributed:
        assert args.bsize % args.nranks == 0, (args.bsize, args.nranks)
        assert args.accumsteps == 1
        args.bsize = args.bsize // args.nranks

        print("Using args.bsize =", args.bsize,
              "(per process) and args.accumsteps =", args.accumsteps)

    if args.lazy:
        reader = LazyBatcher(args, (0 if args.rank == -1 else args.rank),
                             args.nranks)
    else:
        reader = EagerBatcher(args, (0 if args.rank == -1 else args.rank),
                              args.nranks)

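    # Let rank 0 download/cache the pretrained weights first; all other ranks
    # wait at this barrier and reuse the cache afterwards.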
    if args.rank not in [-1, 0]:
        torch.distributed.barrier()

    colbert = ColBERT.from_pretrained('bert-base-uncased',
                                      query_maxlen=args.query_maxlen,
                                      doc_maxlen=args.doc_maxlen,
                                      dim=args.dim,
                                      similarity_metric=args.similarity,
                                      mask_punctuation=args.mask_punctuation)

    if args.checkpoint is not None:
        assert args.resume_optimizer is False, "TODO: This would mean reload optimizer too."
        print_message(
            f"#> Starting from checkpoint {args.checkpoint} -- but NOT the optimizer!"
        )

        checkpoint = torch.load(args.checkpoint, map_location='cpu')

        try:
            colbert.load_state_dict(checkpoint['model_state_dict'])
        except RuntimeError:
            print_message("[WARNING] Loading checkpoint with strict=False")
            colbert.load_state_dict(checkpoint['model_state_dict'],
                                    strict=False)

    if args.rank == 0:
        torch.distributed.barrier()

    colbert = colbert.to(DEVICE)
    colbert.train()

    if args.distributed:
        colbert = torch.nn.parallel.DistributedDataParallel(
            colbert,
            device_ids=[args.rank],
            output_device=args.rank,
            find_unused_parameters=True)

    optimizer = AdamW(filter(lambda p: p.requires_grad, colbert.parameters()),
                      lr=args.lr,
                      eps=1e-8)
    optimizer.zero_grad()

    amp = MixedPrecisionManager(args.amp)
    criterion = nn.CrossEntropyLoss()
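    # Each (query, positive, negative) triple is scored as a 2-way classification;
    # index 0 (the positive passage) is always the correct class, hence all-zero labels.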
    labels = torch.zeros(args.bsize, dtype=torch.long, device=DEVICE)

    start_time = time.time()
    train_loss = 0.0

    start_batch_idx = 0

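    # When resuming, fast-forward the reader to the batch recorded in the
    # checkpoint so training continues where it left off.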
    if args.resume:
        assert args.checkpoint is not None
        start_batch_idx = checkpoint['batch']

        reader.skip_to_batch(start_batch_idx, checkpoint['arguments']['bsize'])

    for batch_idx, BatchSteps in zip(range(start_batch_idx, args.maxsteps),
                                     reader):
        this_batch_loss = 0.0

        for queries, passages in BatchSteps:
            with amp.context():
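                # The model returns scores for all positives followed by all negatives;
                # view(2, -1).permute(1, 0) pairs them so each row is (positive, negative).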
                scores = colbert(queries, passages).view(2, -1).permute(1, 0)
                loss = criterion(scores, labels[:scores.size(0)])
                loss = loss / args.accumsteps

            if args.rank < 1:
                print_progress(scores)

            amp.backward(loss)

            train_loss += loss.item()
            this_batch_loss += loss.item()

        amp.step(colbert, optimizer)

        if args.rank < 1:
            avg_loss = train_loss / (batch_idx + 1)

            num_examples_seen = (batch_idx -
                                 start_batch_idx) * args.bsize * args.nranks
            elapsed = float(time.time() - start_time)

            log_to_mlflow = (batch_idx % 20 == 0)
            Run.log_metric('train/avg_loss',
                           avg_loss,
                           step=batch_idx,
                           log_to_mlflow=log_to_mlflow)
            Run.log_metric('train/batch_loss',
                           this_batch_loss,
                           step=batch_idx,
                           log_to_mlflow=log_to_mlflow)
            Run.log_metric('train/examples',
                           num_examples_seen,
                           step=batch_idx,
                           log_to_mlflow=log_to_mlflow)
            Run.log_metric('train/throughput',
                           num_examples_seen / elapsed,
                           step=batch_idx,
                           log_to_mlflow=log_to_mlflow)

            print_message(batch_idx, avg_loss)
            manage_checkpoints(args, colbert, optimizer, batch_idx + 1)
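
The view(2, -1).permute(1, 0) reshape together with the all-zero label vector implements a pairwise softmax loss: each query's positive and negative passage scores are compared, and class index 0 (the positive) is always correct. A self-contained illustration with dummy tensors (the batch size and random scores are assumptions for demonstration, not part of the script):

import torch
import torch.nn as nn

B = 4  # per-process batch size (illustrative)
# Model scores arrive ordered as [pos_1 .. pos_B, neg_1 .. neg_B]
scores = torch.randn(2 * B)
# Row i becomes (positive_i, negative_i)
pairs = scores.view(2, -1).permute(1, 0)
# The positive passage sits at index 0 of every pair
labels = torch.zeros(B, dtype=torch.long)
loss = nn.CrossEntropyLoss()(pairs, labels)
print(loss.item())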