Example #1
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of cadicate answer, shorter than this will be padded.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    # j indexes the current batch; "index" walks the whole
                    # group and is used to fetch the example's tag.
                    results.append((examples[index][-2], logits[j].cpu().numpy()))
                    index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, 'w') as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
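Note: these snippets call a batch_loader helper that is defined in the surrounding UER-py codebase but not shown on this page (Example #5 defines its own variant inline). A minimal sketch inferred from the call sites in Examples #1, #2, and #8 (the NER script in Example #7 uses a three-tuple variant):

def batch_loader(batch_size, src, tgt, seg, soft_tgt=None):
    # Slice the tensors into consecutive batches and yield the
    # remainder as a final short batch; soft_tgt is optional.
    instances_num = src.size(0)
    for i in range(instances_num // batch_size):
        lo, hi = i * batch_size, (i + 1) * batch_size
        yield (src[lo:hi], tgt[lo:hi], seg[lo:hi],
               soft_tgt[lo:hi] if soft_tgt is not None else None)
    if instances_num % batch_size != 0:
        lo = instances_num // batch_size * batch_size
        yield (src[lo:], tgt[lo:], seg[lo:],
               soft_tgt[lo:] if soft_tgt is not None else None)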
Example #2
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--train_answer_path",
                        type=str,
                        required=True,
                        help="Path of the answers for trainset.")
    parser.add_argument("--dev_answer_path",
                        type=str,
                        required=True,
                        help="Path of the answers for devset.")

    parser.add_argument(
        "--max_choices_num",
        default=10,
        type=int,
        help=
        "The maximum number of candidate answers; instances with fewer answers are padded to this length."
    )

    args = parser.parse_args()

    args.labels_num = args.max_choices_num
    if args.output_model_path is None:
        args.output_model_path = "./models/chid_model.bin"

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path, args.train_answer_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0., 0., 0.

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch)
            total_loss += loss.item()

            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

        result = evaluate(
            args, read_dataset(args, args.dev_path, args.dev_answer_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["first", "last", "max", "mean"], \
                                              default="first", help="Pooling Type.")
    parser.add_argument("--whitening_size",
                        type=int,
                        default=None,
                        help="Output vector size after whitening.")

    tokenizer_opts(parser)

    args = parser.parse_args()
    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build feature extractor model.
    model = FeatureExtractor(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model.eval()
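Example #3 ends before its extraction loop. For reference, a minimal sketch of what the --pooling choices typically compute over encoder output (assumed shapes; not the UER-py FeatureExtractor implementation):

import torch

def pool(hidden, seg, pooling_type="first"):
    # hidden: [batch, seq_len, hidden_size]; seg: [batch, seq_len], 0 marks padding.
    mask = (seg > 0).unsqueeze(-1).float()
    if pooling_type == "first":
        return hidden[:, 0, :]
    if pooling_type == "last":
        last = mask.squeeze(-1).sum(dim=1).long() - 1  # index of last real token
        return hidden[torch.arange(hidden.size(0)), last]
    if pooling_type == "max":
        return (hidden + (mask - 1.0) * 1e9).max(dim=1).values
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # mean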
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build machine reading comprehension model.
    model = MachineReadingComprehension(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    batch_size = args.batch_size
    print("Batch size: ", batch_size)
    trainset, _ = read_dataset(args, args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)

    src = torch.LongTensor([sample[0] for sample in trainset])
    seg = torch.LongTensor([sample[1] for sample in trainset])
    start_position = torch.LongTensor([sample[2] for sample in trainset])
    end_position = torch.LongTensor([sample[3] for sample in trainset])

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss = 0.0
    result = 0.0
    best_result = 0.0

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            loss = train(args, model, optimizer, scheduler, src_batch,
                         seg_batch, start_position_batch, end_position_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, *read_dataset(args, args.dev_path))
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, *read_dataset(args, args.test_path))
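The --doc_stride option in Example #4 controls how read_dataset splits a long context into overlapping windows, so an answer near a chunk boundary still appears intact in some chunk. A minimal illustration of stride-based chunking (a sketch; not UER-py's read_dataset):

def split_with_stride(tokens, max_len, doc_stride):
    # Consecutive windows start doc_stride tokens apart, so adjacent
    # chunks overlap by max_len - doc_stride tokens.
    chunks = []
    start = 0
    while True:
        chunks.append(tokens[start:start + max_len])
        if start + max_len >= len(tokens):
            break
        start += doc_stride
    return chunks

With max_len=384 and doc_stride=128, a 600-token document yields windows starting at positions 0, 128, and 256.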
Example #5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/classifier_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default="./models/google_vocab.txt",
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Evaluation options.
    parser.add_argument("--mean_reciprocal_rank",
                        action="store_true",
                        help="Evaluation metrics for DBQA dataset.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    labels_set = set()
    columns = {}
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            try:
                line = line.strip().split("\t")
                if line_id == 0:
                    for i, column_name in enumerate(line):
                        columns[column_name] = i
                    continue
                label = int(line[columns["label"]])
                labels_set.add(label)
            except:
                pass
    args.labels_num = len(labels_set)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build bert model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path),
                              strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    # Build classification model.
    model = BertClassifier(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size *
                                        batch_size:]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Build tokenizer.
    tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
            for line_id, line in enumerate(f):
                if line_id == 0:
                    continue
                try:
                    line = line.strip().split('\t')
                    if len(line) == 2:
                        label = int(line[columns["label"]])
                        text = line[columns["text_a"]]
                        tokens = [
                            vocab.get(t) for t in tokenizer.tokenize(text)
                        ]
                        tokens = [CLS_ID] + tokens
                        mask = [1] * len(tokens)
                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask))
                    elif len(line) == 3:  # For sentence pair input.
                        label = int(line[columns["label"]])
                        text_a, text_b = line[columns["text_a"]], line[
                            columns["text_b"]]

                        tokens_a = [
                            vocab.get(t) for t in tokenizer.tokenize(text_a)
                        ]
                        tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                        tokens_b = [
                            vocab.get(t) for t in tokenizer.tokenize(text_b)
                        ]
                        tokens_b = tokens_b + [SEP_ID]

                        tokens = tokens_a + tokens_b
                        mask = [1] * len(tokens_a) + [2] * len(tokens_b)

                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask))
                    elif len(line) == 4:  # For dbqa input.
                        qid = int(line[columns["qid"]])
                        label = int(line[columns["label"]])
                        text_a, text_b = line[columns["text_a"]], line[
                            columns["text_b"]]

                        tokens_a = [
                            vocab.get(t) for t in tokenizer.tokenize(text_a)
                        ]
                        tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                        tokens_b = [
                            vocab.get(t) for t in tokenizer.tokenize(text_b)
                        ]
                        tokens_b = tokens_b + [SEP_ID]

                        tokens = tokens_a + tokens_b
                        mask = [1] * len(tokens_a) + [2] * len(tokens_b)

                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask, qid))
                    else:
                        pass

                except:
                    pass
        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        batch_size = args.batch_size
        instances_num = input_ids.size()[0]
        if is_test:
            print("The number of evaluation instances: ", instances_num)

        correct = 0
        # Confusion matrix.
        confusion = torch.zeros(args.labels_num,
                                args.labels_num,
                                dtype=torch.long)

        model.eval()

        if not args.mean_reciprocal_rank:
            for i, (input_ids_batch, label_ids_batch,
                    mask_ids_batch) in enumerate(
                        batch_loader(batch_size, input_ids, label_ids,
                                     mask_ids)):
                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                with torch.no_grad():
                    loss, logits = model(input_ids_batch, label_ids_batch,
                                         mask_ids_batch)
                logits = nn.Softmax(dim=1)(logits)
                pred = torch.argmax(logits, dim=1)
                gold = label_ids_batch
                for j in range(pred.size()[0]):
                    confusion[pred[j], gold[j]] += 1
                correct += torch.sum(pred == gold).item()

            if is_test:
                print("Confusion matrix:")
                print(confusion)
                print("Report precision, recall, and f1:")
            for i in range(confusion.size()[0]):
                p = confusion[i, i].item() / confusion[i, :].sum().item()
                r = confusion[i, i].item() / confusion[:, i].sum().item()
                f1 = 2 * p * r / (p + r)
                if is_test:
                    print("Label {}: {:.3f}, {:.3f}, {:.3f}".format(
                        i, p, r, f1))
            print("Acc. (Correct/Total): {:.4f} ({}/{}) ".format(
                correct / len(dataset), correct, len(dataset)))
            return correct / len(dataset)
        else:
            for i, (input_ids_batch, label_ids_batch,
                    mask_ids_batch) in enumerate(
                        batch_loader(batch_size, input_ids, label_ids,
                                     mask_ids)):
                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                with torch.no_grad():
                    loss, logits = model(input_ids_batch, label_ids_batch,
                                         mask_ids_batch)
                logits = nn.Softmax(dim=1)(logits)
                if i == 0:
                    logits_all = logits
                if i >= 1:
                    logits_all = torch.cat((logits_all, logits), 0)

            order = -1
            gold = []
            for i in range(len(dataset)):
                qid = dataset[i][3]
                label = dataset[i][1]
                if qid == order:
                    j += 1
                    if label == 1:
                        gold.append((qid, j))
                else:
                    order = qid
                    j = 0
                    if label == 1:
                        gold.append((qid, j))

            label_order = []
            order = -1
            for i in range(len(gold)):
                if gold[i][0] == order:
                    templist.append(gold[i][1])
                elif gold[i][0] != order:
                    order = gold[i][0]
                    if i > 0:
                        label_order.append(templist)
                    templist = []
                    templist.append(gold[i][1])
            label_order.append(templist)

            order = -1
            score_list = []
            for i in range(len(logits_all)):
                score = float(logits_all[i][1])
                qid = int(dataset[i][3])
                if qid == order:
                    templist.append(score)
                else:
                    order = qid
                    if i > 0:
                        score_list.append(templist)
                    templist = []
                    templist.append(score)
            score_list.append(templist)

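            # Mean reciprocal rank over questions: MRR = (1/|Q|) * sum_q 1/rank_q,
            # where rank_q is the rank of the best-placed correct answer for
            # question q (a question contributes 0 if no correct answer is found).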
            rank = []
            pred = []
            for i in range(len(score_list)):
                if len(label_order[i]) == 1:
                    if label_order[i][0] < len(score_list[i]):
                        true_score = score_list[i][label_order[i][0]]
                        score_list[i].sort(reverse=True)
                        for j in range(len(score_list[i])):
                            if score_list[i][j] == true_score:
                                rank.append(1 / (j + 1))
                    else:
                        rank.append(0)

                else:
                    true_rank = len(score_list[i])
                    for k in range(len(label_order[i])):
                        if label_order[i][k] < len(score_list[i]):
                            true_score = score_list[i][label_order[i][k]]
                            temp = sorted(score_list[i], reverse=True)
                            for j in range(len(temp)):
                                if temp[j] == true_score:
                                    if j < true_rank:
                                        true_rank = j
                    if true_rank < len(score_list[i]):
                        rank.append(1 / (true_rank + 1))
                    else:
                        rank.append(0)
            MRR = sum(rank) / len(rank)
            print(MRR)
            return MRR

    # Training phase.
    print("Start training.")
    trainset = read_dataset(args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    input_ids = torch.LongTensor([example[0] for example in trainset])
    label_ids = torch.LongTensor([example[1] for example in trainset])
    mask_ids = torch.LongTensor([example[2] for example in trainset])

    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup,
                         t_total=train_steps)

    total_loss = 0.
    result = 0.0
    best_result = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)

            loss, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.
            loss.backward()
            optimizer.step()
        result = evaluate(args, False)
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the multiple choice model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=64,
                        help="Sequence length.")
    parser.add_argument(
        "--max_choices_num",
        default=10,
        type=int,
        help=
        "The maximum number of candidate answers; instances with fewer answers are padded to this length."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="char",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    # j indexes the current batch; "index" walks the whole
                    # group and is used to fetch the example's tag.
                    results.append(
                        (examples[index][-2], logits[j].cpu().numpy()))
                    index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, 'w') as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
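The manual grouping loop above (also in Example #1) can be written more compactly with collections.defaultdict; an equivalent sketch:

from collections import defaultdict

dataset_by_group = defaultdict(list)
for example in dataset:
    dataset_by_group[example[-1]].append(example)  # example[-1] is the group key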
Example #7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--label2id_path",
                        type=str,
                        required=True,
                        help="Path of the label2id file.")

    args = parser.parse_args()

    # Load the hyperparameters of the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    args.begin_ids = []

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)
        for label in l2i:
            if label.startswith("B"):
                args.begin_ids.append(l2i[label])

    args.l2i = l2i

    args.labels_num = len(l2i)

    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    instances = read_dataset(args, args.train_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    tgt = torch.LongTensor([ins[1] for ins in instances])
    seg = torch.LongTensor([ins[2] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size
    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, f1, best_f1 = 0.0, 0.0, 0.0

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, tgt,
                                                     seg)):
            loss = train(args, model, optimizer, scheduler, src_batch,
                         tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        f1 = evaluate(args, read_dataset(args, args.dev_path))
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(
                torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
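Example #7 expects --label2id_path to point at a JSON object mapping tag names to integer ids. A hypothetical file and the fields the loading block derives from it (the file contents here are an assumption):

# label2id.json (hypothetical):
#   {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}
# After the loading block:
#   l2i gains "[PAD]" with the next free id -> {"O": 0, ..., "[PAD]": 5}
#   args.begin_ids == [1, 3]   # ids of labels starting with "B"
#   args.labels_num == 6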
Example #8
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path", default=None, type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path", type=str, required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str,
                        help="Path of the testset.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128,
                        help="Sequence length.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer on Chinese corpus."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             )

    # Optimizer options.
    parser.add_argument("--soft_targets", action='store_true',
                        help="Train model with logits.")
    parser.add_argument("--soft_alpha", type=float, default=0.5,
                        help="Weight of the soft targets loss.")
    parser.add_argument("--learning_rate", type=float, default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup", type=float, default=0.1,
                        help="Warm up value.")
    parser.add_argument("--fp16", action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
    parser.add_argument("--fp16_opt_level", choices=["O0", "O1", "O2", "O3" ], default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5,
                        help="Dropout.")
    parser.add_argument("--epochs_num", type=int, default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7,
                        help="Random seed.")
    
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels. 
    args.labels_num = count_labels_num(args.train_path)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model.
    model = Classifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)
    
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])
    if args.soft_targets:
        soft_tgt = torch.FloatTensor([example[3] for example in trainset])
    else:
        soft_tgt = None

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)
    
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0., 0., 0.

    print("Start training.")
    
    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(batch_loader(batch_size, src, tgt, seg, soft_tgt)):
            loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.
        # Save a checkpoint after every epoch, in addition to the best model kept below.
        import os
        base_dir = args.output_model_path.split('/')[0]
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        result = evaluate(args, read_dataset(args, args.dev_path))
        save_model(model, base_dir + "/epoch_" + str(epoch) + ".bin")
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path), True)
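Example #8's train_model is not shown on this page. When --soft_targets is set, one common way to blend the teacher logits with the hard labels, weighted by --soft_alpha, looks like the sketch below (an assumption; UER-py's exact loss may differ):

import torch.nn as nn

def mixed_loss(logits, tgt, soft_tgt, soft_alpha=0.5):
    # Distillation-style term against teacher logits, blended with the
    # usual cross-entropy against the hard labels.
    hard = nn.CrossEntropyLoss()(logits, tgt)
    soft = nn.MSELoss()(logits, soft_tgt)
    return soft_alpha * soft + (1.0 - soft_alpha) * hard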
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--dataset_path",
                        type=str,
                        default="dataset.pt",
                        help="Path of the preprocessed dataset.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--pretrained_model_path",
                        type=str,
                        default=None,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        type=str,
                        required=True,
                        help="Path of the output model.")
    parser.add_argument("--config_path",
                        type=str,
                        default="models/bert_base_config.json",
                        help="Config file of model hyper-parameters.")

    # Training and saving options.
    parser.add_argument("--total_steps",
                        type=int,
                        default=100000,
                        help="Total training steps.")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=10000,
                        help="Specific steps to save model checkpoint.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--accumulation_steps",
                        type=int,
                        default=1,
                        help="Specific steps to accumulate gradient.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help=
        "Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps]."
    )
    parser.add_argument("--instances_buffer_size",
                        type=int,
                        default=25600,
                        help="The buffer size of instances in memory.")

    # Model options.
    parser.add_argument("--dropout",
                        type=float,
                        default=0.1,
                        help="Dropout value.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--embedding",
                        choices=["bert", "word", "gpt"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", "synt", \
                                                   "rcnn", "crnn", "gpt", "gpt2", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--target",
                        choices=["bert", "lm", "cls", "mlm", "bilm", "albert"],
                        default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--tie_weights",
                        action="store_true",
                        help="Tie the word embedding and softmax weights.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--has_lmtarget_bias",
                        action="store_true",
                        help="Add bias on output_layer for lm target.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Masking options.
    parser.add_argument("--span_masking",
                        action="store_true",
                        help="Span masking.")
    parser.add_argument(
        "--span_geo_prob",
        type=float,
        default=0.2,
        help="Hyperparameter of geometric distribution for span masking.")
    parser.add_argument("--span_max_length",
                        type=int,
                        default=10,
                        help="Max length for span masking.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Initial learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")
    parser.add_argument("--beta1",
                        type=float,
                        default=0.9,
                        help="Beta1 for Adam optimizer.")
    parser.add_argument("--beta2",
                        type=float,
                        default=0.999,
                        help="Beta2 for Adam optimizer.")
    parser.add_argument(
        "--fp16",
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        "--fp16_opt_level",
        choices=["O0", "O1", "O2", "O3"],
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
        "See details at https://nvidia.github.io/apex/amp.html")

    # GPU options.
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="Total number of processes (GPUs) for training.")
    parser.add_argument(
        "--gpu_ranks",
        default=[],
        nargs='+',
        type=int,
        help="List of ranks of each process."
        " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU."
    )
    parser.add_argument("--master_ip",
                        default="tcp://localhost:12345",
                        type=str,
                        help="IP-Port of master for training.")
    parser.add_argument("--backend",
                        choices=["nccl", "gloo"],
                        default="nccl",
                        type=str,
                        help="Distributed backend.")

    args = parser.parse_args()

    # Load hyper-parameters from config file.
    if args.config_path:
        args = load_hyperparam(args)

    ranks_num = len(args.gpu_ranks)

    if args.world_size > 1:
        # Multiprocessing distributed mode.
        assert torch.cuda.is_available(), "No available GPUs."
        assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit."
        assert ranks_num <= torch.cuda.device_count(), "Started processes exceed the available GPUs."
        args.dist_train = True
        args.ranks_num = ranks_num
        print("Using distributed mode for training.")
    elif args.world_size == 1 and ranks_num == 1:
        # Single GPU mode.
        assert torch.cuda.is_available(), "No available GPUs."
        args.gpu_id = args.gpu_ranks[0]
        assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device."
        args.dist_train = False
        args.single_gpu = True
        print("Using GPU %d for training." % args.gpu_id)
    else:
        # CPU mode.
        assert ranks_num == 0, "GPUs are specified, please check the arguments."
        args.dist_train = False
        args.single_gpu = False
        print("Using CPU mode for training.")

    trainer.train_and_validate(args)
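For reference, flag combinations that satisfy the three branches above (illustrative values):

# Distributed, one node with 4 GPUs:  --world_size 4 --gpu_ranks 0 1 2 3
# Single GPU (device 0):              --world_size 1 --gpu_ranks 0
# CPU only:                           --world_size 1  (omit --gpu_ranks)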
Example #10
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/tagger_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default="./models/google_vocab.txt",
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path",
                        type=str,
                        required=True,
                        help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/google_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch_size.")
    parser.add_argument("--seq_length",
                        default=128,
                        type=int,
                        help="Sequence length.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Find tagging labels.
    labels_map = {"NULL": 0, "O": 1}  # ID for padding and non-entity.
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            if line_id == 0:
                continue
            line = line.strip().split()
            if len(line) != 2:
                continue
            if line[1] not in labels_map:
                labels_map[line[1]] = len(labels_map)

    print("Labels: ", labels_map)
    print("Label Num: ", len(labels_map))
    args.labels_num = len(labels_map)

    # Create the bad and good label-transition pairs.
    args.bad_pairs = []
    args.good_pairs = []
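    # These pairs encode BIO transition constraints for the CRF layer:
    # `bad_pairs` collects transitions that should be forbidden
    # (O -> I-X, B-X -> I-Y, I-X -> I-Y) and `good_pairs` (B-X -> I-X)
    # transitions that should be encouraged; the corresponding weights are
    # presumably applied inside CCKSTagger's CRF.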
    for key1, value1 in labels_map.items():
        key1 = key1.strip().split('-')
        if len(key1) < 1 or len(key1) > 2:
            print("Error label: ", key1)
            exit()
        for key2, value2 in labels_map.items():
            key2 = key2.strip().split('-')
            if len(key2) == 1:
                continue
            if len(key1) == 1 and len(key2) == 2:
                if key2[0] == 'I':
                    args.bad_pairs.append([value1, value2])
                continue
            # p(B-X -> I-Y) = 0
            if key1[1] != key2[1] and key1[0] == 'B' and key2[0] == 'I':
                args.bad_pairs.append([value1, value2])
            # p(I-X -> I-Y) = 0
            if key1[1] != key2[1] and key1[0] == 'I' and key2[0] == 'I':
                args.bad_pairs.append([value1, value2])
            # p(B-X -> I-X) = 10
            if key1[1] == key2[1] and key1[0] == 'B' and key2[0] == 'I':
                args.good_pairs.append([value1, value2])

    print("Bad pairs: ", args.bad_pairs)
    print("Good pairs: ", args.good_pairs)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build bert model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path),
                              strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    # Some other parameters
    args.lstm_hidden = args.hidden_size
    args.lstm_layers = 2
    args.lstm_dropout = 0.1
    args.use_cuda = torch.cuda.is_available()

    # Build sequence labeling model.
    model = CCKSTagger(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)
        model = model.module
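        # Note: taking `.module` immediately unwraps the DataParallel
        # wrapper, so training effectively runs on a single GPU; the raw
        # module is kept so that `model.crf(...)` and `model.loss(...)`
        # below stay directly accessible.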

    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
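        # Yield full batches first, then one final smaller batch holding any
        # remaining instances.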
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size *
                                        batch_size:, :]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
            tokens, labels = [], []
            for line_id, line in enumerate(f):
                if line_id == 0:
                    continue
                line = line.strip().split()
                if len(line) != 2:
                    if len(labels) == 0:
                        continue
                    assert len(tokens) == len(labels)
                    tokens = [vocab.get(t) for t in tokens]
                    labels = [labels_map[l] for l in labels]
                    mask = [1] * len(tokens)
                    if len(tokens) > args.seq_length:
                        tokens = tokens[:args.seq_length]
                        labels = labels[:args.seq_length]
                        mask = mask[:args.seq_length]
                    while len(tokens) < args.seq_length:
                        tokens.append(0)
                        labels.append(0)
                        mask.append(0)
                    dataset.append([tokens, labels, mask])

                    tokens, labels = [], []
                    continue
                tokens.append(line[0])
                labels.append(line[1])

        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        instances_num = input_ids.size(0)
        batch_size = args.batch_size

        if is_test:
            print("Batch size: ", batch_size)
            print("The number of test instances:", instances_num)

        correct = 0
        gold_entities_num = 0
        pred_entities_num = 0

        confusion = torch.zeros(len(labels_map),
                                len(labels_map),
                                dtype=torch.long)

        model.eval()

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            # loss, _, pred, gold = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            feats = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            path_score, best_path = model.crf(feats, mask_ids_batch.byte())
            pred = best_path.contiguous().view(-1)
            gold = label_ids_batch.contiguous().view(-1)
            """ if i == 0:
                print('pred', pred)
                print('gold', gold) """

            # Gold.
            for j in range(gold.size()[0]):
                if (j > 0 and gold[j - 1].item() <= 1
                        and gold[j].item() > 1) or (j == 0
                                                    and gold[j].item() > 1):
                    gold_entities_num += 1

            # Predict.
            for j in range(pred.size()[0]):
                if (j > 0 and pred[j - 1].item() <= 1 and pred[j].item() > 1
                        and gold[j].item() != 0) or (j == 0
                                                     and pred[j].item() > 1):
                    pred_entities_num += 1

            pred_entities_pos = []
            gold_entities_pos = []
            start, end = 0, 0
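            # The span scans below rely on Python's for-else: when the inner
            # loop never breaks, the entity extends to the end of the batch.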

            # Correct.
            for j in range(gold.size()[0]):
                if (j > 0 and gold[j - 1].item() <= 1
                        and gold[j].item() > 1) or (j == 0
                                                    and gold[j].item() > 1):
                    start = j
                    for k in range(j, gold.size()[0]):
                        if gold[k].item() <= 1:
                            end = k - 1
                            break
                    else:
                        end = gold.size()[0] - 1
                    gold_entities_pos.append((start, end))

            # Predict.
            for j in range(pred.size()[0]):
                if (j > 0 and pred[j - 1].item() <= 1
                        and pred[j].item() > 1) or (j == 0
                                                    and pred[j].item() > 1):
                    start = j
                    for k in range(j, pred.size()[0]):
                        if pred[k].item() <= 1:
                            end = k - 1
                            break
                    else:
                        end = pred.size()[0] - 1
                    pred_entities_pos.append((start, end))

            for entity in pred_entities_pos:
                if entity not in gold_entities_pos:
                    continue
                for j in range(entity[0], entity[1] + 1):
                    if gold[j].item() != pred[j].item():
                        break
                else:
                    correct += 1

        print("Report precision, recall, and f1:")
        p = correct / pred_entities_num
        r = correct / gold_entities_num
        f1 = 2 * p * r / (p + r)
        print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1))

        return f1

    # Training phase.
    print("Start training.")
    instances = read_dataset(args.train_path)

    input_ids = torch.LongTensor([ins[0] for ins in instances])
    label_ids = torch.LongTensor([ins[1] for ins in instances])
    mask_ids = torch.LongTensor([ins[2] for ins in instances])

    instances_num = input_ids.size(0)
    batch_size = args.batch_size
    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup,
                         t_total=train_steps)

    total_loss = 0.
    f1 = 0.0
    best_f1 = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            """ loss, _, _, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i+1, total_loss / args.report_steps))
                total_loss = 0. """
            """ print("mask1:", mask_ids_batch)
            print("label1:", label_ids_batch) """
            feats = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            """ print("feats:", feats) """
            loss = model.loss(feats, mask_ids_batch, label_ids_batch)
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Loss: {:.3f}".format(
                    epoch, i + 1, loss))

            loss.backward()
            optimizer.step()

        f1 = evaluate(args, False)
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)

    # Evaluation phase.
    print("Start evaluation.")
    """ if torch.cuda.device_count() > 1:
        model.module.load_state_dict(torch.load(args.output_model_path))
    else:
        model.load_state_dict(torch.load(args.output_model_path)) """
    model.load_state_dict(torch.load(args.output_model_path))

    evaluate(args, True)
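
For intuition, the span-level scores computed in evaluate() reduce to a set intersection over (start, end) spans (the code above additionally requires per-token label equality inside a matched span). A minimal standalone sketch with hypothetical names:

def span_f1(pred_spans, gold_spans):
    # Spans are (start, end) tuples; a prediction counts only when it
    # exactly matches a gold span.
    correct = len(set(pred_spans) & set(gold_spans))
    if correct == 0:
        return 0.0
    p = correct / len(pred_spans)
    r = correct / len(gold_spans)
    return 2 * p * r / (p + r)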
    """
Exemple #11
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/tagger_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--output_encoder",
                        default="./luke-models/",
                        type=str,
                        help="Path of the output luke model.")
    parser.add_argument("--suffix_file_encoder",
                        default="encoder",
                        type=str,
                        help="output file suffix luke model.")
    parser.add_argument("--vocab_path",
                        default="./models/google_vocab.txt",
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path",
                        type=str,
                        required=True,
                        help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/google_config.json",
                        type=str,
                        help="Path of the config file.")
    parser.add_argument("--output_file_prefix",
                        type=str,
                        required=True,
                        help="Prefix for file output.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=2,
                        help="Batch_size.")
    parser.add_argument("--seq_length",
                        default=256,
                        type=int,
                        help="Sequence length.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=5,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=2,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # kg
    parser.add_argument("--kg_name", required=True, help="KG name or path")
    parser.add_argument("--use_kg",
                        action='store_true',
                        help="Enable the use of KG.")
    parser.add_argument("--dry_run",
                        action='store_true',
                        help="Dry run to test the implementation.")
    parser.add_argument(
        "--voting_choicer",
        action='store_true',
        help="Enable the Voting choicer to select the entity type.")
    parser.add_argument("--eval_kg_tag",
                        action='store_true',
                        help="Enable to include [ENT] tag in evaluation.")
    parser.add_argument("--use_subword_tag",
                        action='store_true',
                        help="Enable to use separate tag for subword splits.")
    parser.add_argument("--debug", action='store_true', help="Enable debug.")
    parser.add_argument("--reverse_order",
                        action='store_true',
                        help="Reverse the feature selection order.")
    parser.add_argument("--max_entities",
                        default=2,
                        type=int,
                        help="Number of KG features.")
    parser.add_argument("--eval_range_with_types",
                        action='store_true',
                        help="Enable to eval range with types.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    labels_map = {"[PAD]": 0, "[ENT]": 1, "[X]": 2, "[CLS]": 3, "[SEP]": 4}
    begin_ids = []

    # Find tagging labels
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            if line_id == 0:
                continue
            labels = line.strip().split("\t")[0].split()
            for l in labels:
                if l not in labels_map:
                    if l.startswith("B") or l.startswith("S"):
                        begin_ids.append(len(labels_map))
                    labels_map[l] = len(labels_map)

    idx_to_label = {labels_map[key]: key for key in labels_map}

    print(begin_ids)
    print("Labels: ", labels_map)
    args.labels_num = len(labels_map)

    # Build knowledge graph.
    if args.kg_name == 'none':
        kg_file = []
    else:
        kg_file = args.kg_name

    # Load Luke model.
    model_archive = ModelArchive.load(args.pretrained_model_path)
    tokenizer = model_archive.tokenizer

    # Handling space character in roberta tokenizer
    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}
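    # `bytes_to_unicode` is RoBERTa's byte-level BPE alphabet (raw bytes
    # mapped to printable unicode characters); the inverted table is used
    # later to decode tokenized text back into UTF-8 when writing
    # predictions.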

    # Load the pretrained model
    encoder = LukeModel(model_archive.config)
    encoder.load_state_dict(model_archive.state_dict, strict=False)

    # Build sequence labeling model.
    model = LukeTagger(args, encoder)
    kg = KnowledgeGraph(kg_file=kg_file, tokenizer=tokenizer)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids, pos_ids,
                     vm_ids, tag_ids, segment_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            pos_ids_batch = pos_ids[i * batch_size:(i + 1) * batch_size, :]
            vm_ids_batch = vm_ids[i * batch_size:(i + 1) * batch_size, :, :]
            tag_ids_batch = tag_ids[i * batch_size:(i + 1) * batch_size, :]
            segment_ids_batch = segment_ids[i * batch_size:(i + 1) *
                                            batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, tag_ids_batch, segment_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size *
                                        batch_size:, :]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            pos_ids_batch = pos_ids[instances_num // batch_size *
                                    batch_size:, :]
            vm_ids_batch = vm_ids[instances_num // batch_size *
                                  batch_size:, :, :]
            tag_ids_batch = tag_ids[instances_num // batch_size *
                                    batch_size:, :]
            segment_ids_batch = segment_ids[instances_num // batch_size *
                                            batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch, pos_ids_batch, vm_ids_batch, tag_ids_batch, segment_ids_batch

    # Read dataset.
    def read_dataset(path):
        dataset = []
        count = 0
        with open(path, mode="r", encoding="utf8") as f:
            f.readline()
            tokens, labels = [], []
            for line_id, line in enumerate(f):
                fields = line.strip().split("\t")
                if len(fields) == 2:
                    labels, tokens = fields
                elif len(fields) == 3:
                    labels, tokens, cls = fields
                else:
                    print(f"Data at line {line_id} is not in the accepted format; ignored.")
                    continue

                tokens, pos, vm, tag = \
                    kg.add_knowledge_with_vm([tokens], [labels],
                                             use_kg=args.use_kg,
                                             max_length=args.seq_length,
                                             max_entities=args.max_entities,
                                             reverse_order=args.reverse_order)
                tokens = tokens[0]
                pos = pos[0]
                vm = vm[0].astype("bool")
                tag = tag[0]

                non_pad_tokens = [
                    tok for tok in tokens if tok != tokenizer.pad_token
                ]
                num_tokens = len(non_pad_tokens)
                num_pad = len(tokens) - num_tokens

                labels = [config.CLS_TOKEN] + labels.split(" ") + [config.SEP_TOKEN]
                new_labels = []
                j = 0
                joiner = '-'
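                # Realign the original labels with the KG-expanded token
                # sequence: tag == 0 marks original tokens, tag == 1 marks
                # injected entity tokens, and tag == 2 marks subword pieces.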
                for i in range(len(tokens)):
                    if tag[i] == 0 and tokens[i] != tokenizer.pad_token:
                        cur_type = labels[j]
                        new_labels.append(cur_type)
                        if cur_type != 'O':
                            joiner = cur_type[1]
                            prev_label = cur_type[2:]
                        else:
                            prev_label = cur_type
                        j += 1
                    elif tag[i] == 1 and tokens[i] != tokenizer.pad_token:
                        # Injected entity token.
                        new_labels.append('[ENT]')
                    elif tag[i] == 2:
                        if prev_label == 'O':
                            new_labels.append('O')
                        else:
                            if args.use_subword_tag:
                                new_labels.append('[X]')
                            else:
                                new_labels.append(f'I{joiner}' + prev_label)
                    else:
                        new_labels.append(PAD_TOKEN)

                new_labels = [labels_map[l] for l in new_labels]

                mask = [1] * (num_tokens) + [0] * num_pad
                word_segment_ids = [0] * (len(tokens))

                tokens = tokenizer.convert_tokens_to_ids(tokens)
                assert len(tokens) == len(new_labels), \
                    "Token and label lengths do not match."

                dataset.append(
                    [tokens, new_labels, mask, pos, vm, tag, word_segment_ids])

                # In dry-run mode, stop after 100 instances.
                if args.dry_run:
                    count += 1
                    if count == 100:
                        break

        return dataset

    # Evaluation function.
    def evaluate(args, is_test, final=False):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])
        pos_ids = torch.LongTensor([sample[3] for sample in dataset])
        vm_ids = torch.BoolTensor([sample[4] for sample in dataset])
        tag_ids = torch.LongTensor([sample[5] for sample in dataset])
        segment_ids = torch.LongTensor([sample[6] for sample in dataset])

        instances_num = input_ids.size(0)
        batch_size = args.batch_size

        if is_test:
            print("Batch size: ", batch_size)
            print("The number of test instances:", instances_num)

        correct = 0
        correct_with_type = 0
        gold_entities_num = 0
        pred_entities_num = 0

        confusion = torch.zeros(len(labels_map),
                                len(labels_map),
                                dtype=torch.long)

        model.eval()

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch,
                pos_ids_batch, vm_ids_batch, tag_ids_batch,
                segment_ids_batch) in enumerate(
                    batch_loader(batch_size, input_ids, label_ids, mask_ids,
                                 pos_ids, vm_ids, tag_ids, segment_ids)):

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            pos_ids_batch = pos_ids_batch.to(device)
            tag_ids_batch = tag_ids_batch.to(device)
            vm_ids_batch = vm_ids_batch.long().to(device)
            segment_ids_batch = segment_ids_batch.long().to(device)

            loss, _, pred, gold, _ = model(input_ids_batch,
                                           segment_ids_batch,
                                           mask_ids_batch,
                                           label_ids_batch,
                                           pos_ids_batch,
                                           vm_ids_batch,
                                           use_kg=args.use_kg)

            if final:
                with open(f'{args.output_file_prefix}_predictions.txt', 'a') as p, \
                        open(f'{args.output_file_prefix}_gold.txt', 'a') as g, \
                        open(f'{args.output_file_prefix}_text.txt', 'a') as t:
                    predicted_labels = [
                        idx_to_label.get(key) for key in pred.tolist()
                    ]
                    gold_labels = [
                        idx_to_label.get(key) for key in gold.tolist()
                    ]

                    num_tokens = len(predicted_labels)
                    mask_ids_batch = mask_ids_batch.view(-1, num_tokens)
                    masks = mask_ids_batch.tolist()[0]
                    input_ids_batch = input_ids_batch.view(-1, num_tokens)
                    tokens = input_ids_batch.tolist()[0]

                    for start_idx in range(0, num_tokens, args.seq_length):
                        pred_sample = predicted_labels[start_idx:start_idx +
                                                       args.seq_length]
                        gold_sample = gold_labels[start_idx:start_idx +
                                                  args.seq_length]
                        mask = masks[start_idx:start_idx + args.seq_length]
                        num_labels = sum(mask)

                        token_sample = tokens[start_idx:start_idx +
                                              args.seq_length]
                        token_sample = token_sample[:num_labels]
                        text = ''.join(
                            tokenizer.convert_ids_to_tokens(token_sample))
                        text = bytearray([byte_decoder[c]
                                          for c in text]).decode('utf-8')

                        p.write(' '.join(pred_sample[:num_labels]) + '\n')
                        g.write(' '.join(gold_sample[:num_labels]) + '\n')
                        t.write(text + '\n')

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    gold_entities_num += 1

            for j in range(pred.size()[0]):
                if pred[j].item() in begin_ids and gold[j].item() != labels_map["[PAD]"]:
                    pred_entities_num += 1

            pred_entities_pos = []
            pred_entities_pos_with_type = []
            gold_entities_pos = []
            gold_entities_pos_with_type = []
            start, end = 0, 0

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    start = j
                    for k in range(j + 1, gold.size()[0]):
                        if (gold[k].item() == labels_map['[X]']
                                or gold[k].item() == labels_map['[ENT]']):
                            continue
                        if (gold[k].item() == labels_map["[PAD]"]
                                or gold[k].item() == labels_map["O"]
                                or gold[k].item() in begin_ids):
                            end = k - 1
                            break
                    else:
                        end = gold.size()[0] - 1
                    if args.eval_range_with_types:
                        ent_type_gold = idx_to_label.get(gold[start].item())
                        ent_type_gold = ent_type_gold.replace('_NOKG', '')
                        gold_entities_pos_with_type.append(
                            (start, end, ent_type_gold))

                    gold_entities_pos.append((start, end))

            for j in range(pred.size()[0]):
                if (pred[j].item() in begin_ids
                        and gold[j].item() != labels_map["[PAD]"]
                        and gold[j].item() != labels_map["[ENT]"]
                        and gold[j].item() != labels_map["[X]"]):
                    start = j
                    for k in range(j + 1, pred.size()[0]):

                        if (pred[k].item() == labels_map['[X]']
                                or gold[k].item() == labels_map['[ENT]']):
                            continue
                        if (pred[k].item() == labels_map["[PAD]"]
                                or pred[k].item() == labels_map["O"]
                                or pred[k].item() in begin_ids):
                            end = k - 1
                            break
                    else:
                        end = pred.size()[0] - 1

                    if args.eval_range_with_types:
                        # Get all the labels in the range
                        if start == end:
                            entity_types = [
                                idx_to_label.get(l.item())
                                for l in [pred[start]]
                            ]
                        else:
                            entity_types = [
                                idx_to_label.get(l.item())
                                for l in pred[start:end]
                            ]

                        # Run voting choicer
                        final_entity_type = voting_choicer(entity_types)
                        final_entity_type = final_entity_type.replace(
                            '_NOKG', '')

                        if final:
                            logger.info(
                                f'Predicted: {" ".join(entity_types)}, Selected: {final_entity_type}'
                            )
                        if args.voting_choicer:
                            # Convert back to label id and add in the tuple
                            pred_entities_pos_with_type.append(
                                (start, end, final_entity_type))
                        else:
                            # Use the first prediction
                            ent_type_pred = idx_to_label.get(
                                pred[start].item())
                            ent_type_pred = ent_type_pred.replace('_NOKG', '')
                            pred_entities_pos_with_type.append(
                                (start, end, ent_type_pred))

                    pred_entities_pos.append((start, end))

            for entity in pred_entities_pos:
                if entity in gold_entities_pos:
                    correct += 1
            if args.eval_range_with_types:
                for entity in pred_entities_pos_with_type:
                    if entity in gold_entities_pos_with_type:
                        correct_with_type += 1

        try:
            print("Report precision, recall, and f1:")
            p = correct / pred_entities_num
            r = correct / gold_entities_num
            f1 = 2 * p * r / (p + r)
            print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1))

            if args.eval_range_with_types:
                try:
                    print(
                        "Report accuracy with type, precision, recall, and f1:"
                    )
                    p_with_type = correct_with_type / pred_entities_num
                    r_with_type = correct_with_type / gold_entities_num
                    f1_with_type = 2 * p_with_type * r_with_type / (
                        p_with_type + r_with_type)
                    print("{:.3f}, {:.3f}, {:.3f}".format(
                        p_with_type, r_with_type, f1_with_type))
                except ZeroDivisionError:
                    pass
            return f1
        except ZeroDivisionError:
            return 0

    # Training phase.
    print("Start training.")
    instances = read_dataset(args.train_path)

    input_ids = torch.LongTensor([ins[0] for ins in instances])
    label_ids = torch.LongTensor([ins[1] for ins in instances])
    mask_ids = torch.LongTensor([ins[2] for ins in instances])
    pos_ids = torch.LongTensor([ins[3] for ins in instances])
    vm_ids = torch.BoolTensor([ins[4] for ins in instances])
    tag_ids = torch.LongTensor([ins[5] for ins in instances])
    segment_ids = torch.LongTensor([ins[6] for ins in instances])

    instances_num = input_ids.size(0)
    batch_size = args.batch_size
    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    train_batcher = Batcher(batch_size, input_ids, label_ids, mask_ids,
                            pos_ids, vm_ids, tag_ids, segment_ids)

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup,
                         t_total=train_steps)

    total_loss = 0.
    f1 = 0.0
    best_f1 = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch,
                pos_ids_batch, vm_ids_batch, tag_ids_batch,
                segment_ids_batch) in enumerate(train_batcher):
            model.zero_grad()

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            pos_ids_batch = pos_ids_batch.to(device)
            tag_ids_batch = tag_ids_batch.to(device)
            vm_ids_batch = vm_ids_batch.long().to(device)
            segment_ids_batch = segment_ids_batch.long().to(device)

            loss, _, _, _, _ = model(input_ids_batch,
                                     segment_ids_batch,
                                     mask_ids_batch,
                                     label_ids_batch,
                                     pos_ids_batch,
                                     vm_ids_batch,
                                     use_kg=args.use_kg)

            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

            loss.backward()
            optimizer.step()

        # Evaluation phase.
        print("Start evaluate on dev dataset.")
        f1 = evaluate(args, False)
        print("Start evaluation on test dataset.")
        evaluate(args, True)

        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)
            save_encoder(args, encoder, suffix=args.suffix_file_encoder)

    # Evaluation phase.
    print("Final evaluation on test dataset.")

    if torch.cuda.device_count() > 1:
        model.module.load_state_dict(torch.load(args.output_model_path))
    else:
        model.load_state_dict(torch.load(args.output_model_path))

    evaluate(args, True, final=True)
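
The voting_choicer helper is not shown in this example; a plausible minimal sketch (an assumption — the actual implementation may differ) that majority-votes the entity type over the BIO tags inside a predicted span:

from collections import Counter

def voting_choicer(entity_types):
    # Strip BIO prefixes ("B-PER" / "I-PER" -> "PER"), ignore special and
    # non-entity tags, and return the most frequent type as a B- tag.
    votes = Counter(t.split('-', 1)[1] for t in entity_types
                    if '-' in t and not t.startswith('['))
    if not votes:
        return 'O'
    return 'B-' + votes.most_common(1)[0][0]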
Exemple #12
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path",
                        type=str,
                        required=True,
                        help="Path of the label2id file.")
    parser.add_argument(
        "--crf_target",
        action="store_true",
        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i

    args.labels_num = len(l2i)

    # Load tokenizer.
    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, pred = model(src_batch, None, seg_batch)

            # Compute the true sequence length of each instance in the batch
            # (position of the last non-zero segment id, plus one).
            seq_length_batch = []
            for seg in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg) - 1, -1, -1):
                    if seg[j] != 0:
                        break
                seq_length_batch.append(j + 1)
            pred = pred.cpu().numpy().tolist()
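            # `pred` is flattened across the batch; slice it back into
            # per-instance chunks of `seq_length` and keep only each
            # instance's true length.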
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j:j +
                                     seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
Exemple #13
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--dataset_path_list",
                        default=[],
                        nargs='+',
                        type=str,
                        help="Dataset path list.")
    parser.add_argument("--output_model_path",
                        default="./models/multitask_classifier_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    # Optimizer options.
    parser.add_argument("--soft_targets",
                        action='store_true',
                        help="Train model with logits.")
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")
    parser.add_argument(
        "--fp16",
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit."
    )
    parser.add_argument(
        "--fp16_opt_level",
        choices=["O0", "O1", "O2", "O3"],
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num_list = [
        count_labels_num(os.path.join(path, "train.tsv"))
        for path in args.dataset_path_list
    ]

    args.datasets_num = len(args.dataset_path_list)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build multi-task classification model.
    model = MultitaskClassifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)
    args.model = model

    # Training phase.
    dataset_list = [
        read_dataset(args, os.path.join(path, "train.tsv"))
        for path in args.dataset_path_list
    ]
    packed_dataset_list = [
        pack_dataset(dataset, i, args.batch_size)
        for i, dataset in enumerate(dataset_list)
    ]

    packed_dataset_all = []
    for packed_dataset in packed_dataset_list:
        packed_dataset_all += packed_dataset

    random.shuffle(packed_dataset_all)
    instances_num = sum([len(dataset) for dataset in dataset_list])
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    total_loss, result, best_result = 0., 0., 0.

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (dataset_id, src_batch, tgt_batch,
                seg_batch) in enumerate(packed_dataset_all):
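            # Route the batch to the classification head of the dataset it
            # came from (unwrapping `.module` when DataParallel is active).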
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch, None)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

        for dataset_id, path in enumerate(args.dataset_path_list):
            args.labels_num = args.labels_num_list[dataset_id]
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            result = evaluate(
                args, read_dataset(args, os.path.join(path, "dev.tsv")))

    save_model(model, args.output_model_path)
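
pack_dataset() is assumed here to pre-batch each dataset and tag every batch with its dataset id so that batches from different tasks can be shuffled together; a minimal sketch under that assumption (not the original implementation):

import torch

def pack_dataset(dataset, dataset_id, batch_size):
    # Each instance is assumed to be a (src, tgt, seg) triple of id lists.
    packed = []
    for i in range(0, len(dataset), batch_size):
        chunk = dataset[i:i + batch_size]
        src = torch.LongTensor([ins[0] for ins in chunk])
        tgt = torch.LongTensor([ins[1] for ins in chunk])
        seg = torch.LongTensor([ins[2] for ins in chunk])
        packed.append((dataset_id, src, tgt, seg))
    return packed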
Exemple #14
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--dataset_path",
                        type=str,
                        default="dataset.pt",
                        help="Path of the preprocessed dataset.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--pretrained_model_path",
                        type=str,
                        default=None,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        type=str,
                        required=True,
                        help="Path of the output model.")
    parser.add_argument("--config_path",
                        type=str,
                        default=None,
                        help="Config file of model hyper-parameters.")

    # Training and saving options.
    parser.add_argument("--total_steps",
                        type=int,
                        default=100000,
                        help="Total training steps.")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=10000,
                        help="Specific steps to save model checkpoint.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--accumulation_steps",
                        type=int,
                        default=1,
                        help="Specific steps to accumulate gradient.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help=
        "Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps]."
    )
    parser.add_argument("--instances_buffer_size",
                        type=int,
                        default=25600,
                        help="The buffer size of instances in memory.")

    # Model options.
    parser.add_argument("--emb_size",
                        type=int,
                        default=768,
                        help="Embedding dimension.")
    parser.add_argument("--hidden_size",
                        type=int,
                        default=768,
                        help="Hidden state dimension.")
    parser.add_argument("--feedforward_size",
                        type=int,
                        default=3072,
                        help="Feed forward layer dimension.")
    parser.add_argument("--kernel_size",
                        type=int,
                        default=3,
                        help="Kernel size for CNN.")
    parser.add_argument("--block_size",
                        type=int,
                        default=2,
                        help="Block size for CNN.")
    parser.add_argument("--heads_num",
                        type=int,
                        default=12,
                        help="The number of heads in multi-head attention.")
    parser.add_argument("--layers_num",
                        type=int,
                        default=12,
                        help="The number of encoder layers.")
    parser.add_argument("--dropout",
                        type=float,
                        default=0.1,
                        help="Dropout value.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--target",
                        choices=["bert", "lm", "cls", "mlm", "bilm"],
                        default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--labels_num",
                        type=int,
                        default=2,
                        help="Specific to classification target.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Initial learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # GPU options.
    parser.add_argument("--world_size",
                        type=int,
                        default=1,
                        help="Total number of processes (GPUs) for training.")
    parser.add_argument(
        "--gpu_ranks",
        default=[],
        nargs='+',
        type=int,
        help="List of ranks of each process."
        " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU."
    )
    parser.add_argument("--master_ip",
                        default="tcp://localhost:12345",
                        type=str,
                        help="IP-Port of master for training.")
    parser.add_argument("--backend",
                        choices=["nccl", "gloo"],
                        default="nccl",
                        type=str,
                        help="Distributed backend.")

    args = parser.parse_args()

    # Load hyper-parameters from config file.
    if args.config_path:
        load_hyperparam(args)

    ranks_num = len(args.gpu_ranks)

    if args.world_size > 1:
        # Multiprocessing distributed mode.
        assert torch.cuda.is_available(), "No available GPUs."
        assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit."
        assert ranks_num <= torch.cuda.device_count(), \
            "Started processes exceed the available GPUs."
        args.dist_train = True
        args.ranks_num = ranks_num
        print("Using distributed mode for training.")
    elif args.world_size == 1 and ranks_num == 1:
        # Single GPU mode.
        assert torch.cuda.is_available(), "No available GPUs."
        args.gpu_id = args.gpu_ranks[0]
        assert args.gpu_id < torch.cuda.device_count(), \
            "Invalid specified GPU device."
        args.dist_train = False
        args.single_gpu = True
        print("Using single GPU:%d for training." % args.gpu_id)
    else:
        # CPU mode.
        assert ranks_num == 0, "GPUs are specified, please check the arguments."
        args.dist_train = False
        args.single_gpu = False
        print("Using CPU mode for training.")

    trainer.train_and_validate(args)
Example #15
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/QA_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default="./models/google_vocab.txt",
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/google_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=100,
                        help="Sequence length.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="char",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=3e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    args.target = "bert"
    bert_model = build_model(args)
    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        bert_model.load_state_dict(torch.load(args.pretrained_model_path),
                                   strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(bert_model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    # Build QA model.
    model = BertQuestionAnswering(args, bert_model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, mask_ids, start_positions,
                     end_positions):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            start_positions_batch = start_positions[i * batch_size:(i + 1) *
                                                    batch_size]
            end_positions_batch = end_positions[i * batch_size:(i + 1) *
                                                batch_size]
            yield input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            start_positions_batch = start_positions[instances_num //
                                                    batch_size * batch_size:]
            end_positions_batch = end_positions[instances_num // batch_size *
                                                batch_size:]
            yield input_ids_batch, mask_ids_batch, start_positions_batch, end_positions_batch
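
    # Illustration with assumed toy numbers: given 10 instances and
    # batch_size 4, batch_loader yields batches of 4, 4 and finally the 2
    # leftover instances, so the remainder batch is never dropped.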

    # Build tokenizer.
    tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Read examples.
    def read_examples(path):
        examples = []
        with open(path, 'r', encoding='utf-8') as fp:
            all_dict = json.loads(fp.read())
            v1 = all_dict["data"]
            for i in range(len(v1)):
                data_dict = v1[i]
                v2 = data_dict["paragraphs"]

                for j in range(len(v2)):
                    para_dict = v2[j]
                    context = para_dict["context"]
                    v3 = para_dict["qas"]

                    for m in range(len(v3)):
                        qas_dict = v3[m]
                        question = qas_dict["question"]
                        question_id = qas_dict["id"]
                        v4 = qas_dict["answers"]

                        answers = []
                        start_positions = []
                        end_positions = []

                        for n in range(len(v4)):
                            ans_dict = v4[n]
                            answer = ans_dict["text"]
                            start_position = ans_dict["answer_start"]
                            end_position = start_position + len(answer)

                            answers.append(answer)
                            start_positions.append(start_position)
                            end_positions.append(end_position)

                        examples.append(
                            (context, question, question_id, start_positions,
                             end_positions, answers))

        return examples

    def convert_examples_to_dataset(examples, args):
        dataset = []
        print("The number of questions in the dataset", len(examples))
        for i in range(len(examples)):
            context = examples[i][0]
            question = examples[i][1]
            q_len = len(question)
            question_id = examples[i][2]

            start_positions_true = examples[i][3][0]  # TODO: to be revised (only the first gold answer is used).
            end_positions_true = examples[i][4][0]

            answers = examples[i][5]
            max_context_length = args.seq_length - q_len - 3
            # Divide the context into overlapping spans (sliding window).
            _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
                "DocSpan", ["start", "length"])
            doc_spans = []
            start_offset = 0
            while start_offset < len(context):
                length = len(context) - start_offset
                if length > max_context_length:
                    length = max_context_length
                doc_spans.append(_DocSpan(start=start_offset, length=length))
                if start_offset + length == len(context):
                    break
                start_offset += min(length, args.doc_stride)
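
            # Worked example with hypothetical sizes: len(context)=300,
            # max_context_length=120 and doc_stride=100 produce spans
            # starting at offsets 0, 100 and 200 (step = min(length,
            # doc_stride)), i.e. overlapping windows covering the context.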

            for (doc_span_index, doc_span) in enumerate(doc_spans):
                doc_span_start = doc_span.start
                span_context = context[doc_span_start:doc_span_start +
                                       doc_span.length]
                # Convert the gold start/end positions to token positions in
                # this doc_span; the [CLS] question [SEP] prefix occupies
                # q_len + 2 slots.
                start_positions = start_positions_true - doc_span_start + q_len + 2
                end_positions = end_positions_true - doc_span_start + q_len + 2
                # Skip spans that do not contain the answer.
                if (start_positions < q_len + 2
                        or start_positions > doc_span.length + q_len + 2
                        or end_positions < q_len + 2
                        or end_positions > doc_span.length + q_len + 2):
                    continue

                tokens_a = [vocab.get(t) for t in tokenizer.tokenize(question)]
                tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                tokens_b = [
                    vocab.get(t) for t in tokenizer.tokenize(span_context)
                ]
                tokens_b = tokens_b + [SEP_ID]
                tokens = tokens_a + tokens_b
                mask = [1] * len(tokens_a) + [2] * len(tokens_b)

                while len(tokens) < args.seq_length:
                    tokens.append(0)
                    mask.append(0)

                dataset.append(
                    (tokens, mask, start_positions, end_positions, answers,
                     question_id, q_len, doc_span_index, doc_span_start))
        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        # some calculation functions
        def mixed_segmentation(in_str, rm_punc=False):
            in_str = str(in_str).lower().strip()
            segs_out = []
            temp_str = ""
            sp_char = [
                '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',',
                '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·',
                '、', '「', '」', '(', ')', '-', '~', '『', '』'
            ]
            for char in in_str:
                if rm_punc and char in sp_char:
                    continue
                if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
                    if temp_str != "":
                        ss = nltk.word_tokenize(temp_str)
                        segs_out.extend(ss)
                        temp_str = ""
                    segs_out.append(char)
                else:
                    temp_str += char

            # Flush the remaining non-Chinese buffer.
            if temp_str != "":
                ss = nltk.word_tokenize(temp_str)
                segs_out.extend(ss)

            return segs_out

        # remove punctuation
        def remove_punctuation(in_str):
            in_str = str(in_str).lower().strip()
            sp_char = [
                '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',',
                '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·',
                '、', '「', '」', '(', ')', '-', '~', '『', '』'
            ]
            out_segs = []
            for char in in_str:
                if char in sp_char:
                    continue
                else:
                    out_segs.append(char)
            return ''.join(out_segs)

        # find longest common string
        def find_lcs(s1, s2):
            m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
            mmax = 0
            p = 0
            for i in range(len(s1)):
                for j in range(len(s2)):
                    if s1[i] == s2[j]:
                        m[i + 1][j + 1] = m[i][j] + 1
                        if m[i + 1][j + 1] > mmax:
                            mmax = m[i + 1][j + 1]
                            p = i + 1
            return s1[p - mmax:p], mmax
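
        # Toy check (not part of the original script):
        # find_lcs(list("abcd"), list("zbcd")) returns (['b', 'c', 'd'], 3),
        # the longest common substring and its length, computed with an
        # O(len(s1) * len(s2)) dynamic-programming table.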

        def calc_f1_score(answers, prediction):
            f1_scores = []
            for i in range(len(answers)):
                ans = answers[i]
                ans_segs = mixed_segmentation(ans, rm_punc=True)
                prediction_segs = mixed_segmentation(prediction, rm_punc=True)
                lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
                if lcs_len == 0:
                    f1_scores.append(0)
                else:
                    precision = 1.0 * lcs_len / len(prediction_segs)
                    recall = 1.0 * lcs_len / len(ans_segs)
                    f1 = (2 * precision * recall) / (precision + recall)
                    f1_scores.append(f1)
            return max(f1_scores)
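
        # Hedged worked example: an answer of 4 tokens, a prediction of 5
        # tokens and a longest common span of 3 tokens give precision 3/5,
        # recall 3/4, and F1 = 2 * 0.6 * 0.75 / (0.6 + 0.75) = 2/3.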

        def calc_em_score(answers, prediction):
            em = 0
            for i in range(len(answers)):
                ans = answers[i]
                ans_ = remove_punctuation(ans)
                prediction_ = remove_punctuation(prediction)
                if ans_ == prediction_:
                    em = 1
                    break
            return em

        def is_max_score(score_list):
            score_max = -100
            index_max = 0
            best_start_prediction = 0
            best_end_prediction = 0
            for i in range(len(score_list)):
                if score_max <= score_list[i][3]:
                    score_max = score_list[i][3]
                    index_max = score_list[i][0]
                    best_start_prediction = score_list[i][1]
                    best_end_prediction = score_list[i][2]
            return index_max, best_start_prediction, best_end_prediction

        path = args.test_path if is_test else args.dev_path
        examples = read_examples(path)
        dataset = convert_examples_to_dataset(examples, args)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        mask_ids = torch.LongTensor([sample[1] for sample in dataset])
        start_positions = torch.LongTensor([sample[2] for sample in dataset])
        end_positions = torch.LongTensor([sample[3] for sample in dataset])

        batch_size = args.batch_size
        instances_num = input_ids.size()[0]

        if is_test:
            print("The number of evaluation instances: ", instances_num)
        model.eval()
        start_logits_all = []
        end_logits_all = []
        start_pred_all = []
        end_pred_all = []
        for i, (input_ids_batch, mask_ids_batch, start_positions_batch,
                end_positions_batch) in enumerate(
                    batch_loader(batch_size, input_ids, mask_ids,
                                 start_positions, end_positions)):
            model.zero_grad()
            input_ids_batch = input_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            start_positions_batch = start_positions_batch.to(device)
            end_positions_batch = end_positions_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(input_ids_batch,
                                                       mask_ids_batch,
                                                       start_positions_batch,
                                                       end_positions_batch)

            start_logits = nn.Softmax(dim=1)(start_logits)
            end_logits = nn.Softmax(dim=1)(end_logits)

            start_pred = torch.argmax(start_logits, dim=1)
            end_pred = torch.argmax(end_logits, dim=1)

            start_pred = start_pred.cpu().numpy().tolist()
            end_pred = end_pred.cpu().numpy().tolist()

            start_logits = start_logits.cpu().numpy().tolist()
            end_logits = end_logits.cpu().numpy().tolist()

            start_logits_max = []
            end_logits_max = []
            for j in range(len(start_pred)):
                start_logits_max.append(start_logits[j][start_pred[j]])
                end_logits_max.append(end_logits[j][end_pred[j]])

            start_logits_all += start_logits_max
            end_logits_all += end_logits_max
            start_pred_all += start_pred
            end_pred_all += end_pred

        assert len(start_pred_all) == len(dataset)
        assert len(start_logits_all) == len(dataset)

        # Cluster predictions by question id and choose the best answer among doc_spans.
        order = -1
        pred_list = []
        templist = []
        for i in range(len(dataset)):
            qid = dataset[i][5]
            q_len = dataset[i][6]
            span_index = dataset[i][7]
            doc_span_start = dataset[i][8]

            score1 = float(start_logits_all[i])
            score2 = float(end_logits_all[i])
            score = (score1 + score2) / 2

            pre_start_pred = start_pred_all[i] + doc_span_start - q_len - 2
            pre_end_pred = end_pred_all[i] + doc_span_start - q_len - 2

            if qid == order:
                templist.append(
                    (span_index, pre_start_pred, pre_end_pred, score))
            else:
                order = qid
                if i > 0:
                    span_index_max, best_start_prediction, best_end_prediction = is_max_score(
                        templist)
                    pred_list.append((span_index_max, best_start_prediction,
                                      best_end_prediction))
                templist = []
                templist.append(
                    (span_index, pre_start_pred, pre_end_pred, score))
        span_index_max, best_start_prediction, best_end_prediction = is_max_score(
            templist)
        pred_list.append(
            (span_index_max, best_start_prediction, best_end_prediction))

        assert len(pred_list) == len(examples)

        # Score predictions with F1 and EM.
        f1 = 0
        em = 0
        total_count = len(examples)
        skip_count = 0
        for i in range(len(examples)):
            question_id = examples[i][2]
            answers = examples[i][5]
            span_index = pred_list[i][0]
            start_prediction = pred_list[i][1]
            end_prediction = pred_list[i][2]

            # Skip invalid predictions where the end precedes the start.
            if end_prediction <= start_prediction:
                skip_count += 1
                continue

            prediction = examples[i][0][start_prediction:end_prediction]

            f1 += calc_f1_score(answers, prediction)
            em += calc_em_score(answers, prediction)

        f1_score = 100.0 * f1 / total_count
        em_score = 100.0 * em / total_count
        avg = (f1_score + em_score) * 0.5
        print("Avg: {:.4f},F1:{:.4f},EM:{:.4f},Total:{},Skip:{}".format(
            avg, f1_score, em_score, total_count, skip_count))
        return avg

    # Training phase
    print("Start training.")
    batch_size = args.batch_size
    print("Batch size: ", batch_size)
    examples = read_examples(args.train_path)
    trainset = convert_examples_to_dataset(examples, args)
    random.shuffle(trainset)
    instances_num = len(trainset)

    input_ids = torch.LongTensor([sample[0] for sample in trainset])
    mask_ids = torch.LongTensor([sample[1] for sample in trainset])
    start_positions = torch.LongTensor([sample[2] for sample in trainset])
    end_positions = torch.LongTensor([sample[3] for sample in trainset])

    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup,
                         t_total=train_steps)
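
    # Biases and LayerNorm parameters (named gamma/beta in this codebase)
    # are excluded from weight decay above, the usual convention for
    # BERT-style optimizers.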

    total_loss = 0.
    result = 0.0
    best_result = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()

        for i, (input_ids_batch, mask_ids_batch, start_positions_batch,
                end_positions_batch) in enumerate(
                    batch_loader(batch_size, input_ids, mask_ids,
                                 start_positions, end_positions)):
            model.zero_grad()
            input_ids_batch = input_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            start_positions_batch = start_positions_batch.to(device)
            end_positions_batch = end_positions_batch.to(device)

            loss, _, _ = model(input_ids_batch, mask_ids_batch,
                               start_positions_batch, end_positions_batch)
            if torch.cuda.device_count() > 1:
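                # DataParallel gathers one loss value per GPU; average them
                # into a scalar before calling backward().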
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.
            loss.backward()
            optimizer.step()
        result = evaluate(args, False)
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
Example #16
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:

        start_prob_all, end_prob_all = [], []

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch,
                                                       start_position_batch,
                                                       end_position_batch)

            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)

            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])
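
        # get_answers presumably aggregates the collected probabilities into
        # one best (start, end) span per question across doc spans
        # (behaviour inferred from how its output is used below).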

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]

            prediction = examples[i][0][start_pred_pos:end_pred_pos + 1]
            output[question_id] = prediction

        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
Example #17
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/cmrc_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=100,
                        help="Sequence length.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=3e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")
    parser.add_argument(
        "--fp16",
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        "--fp16_opt_level",
        choices=["O0", "O1", "O2", "O3"],
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build machine reading comprehension model.
    model = MachineReadingComprehension(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    batch_size = args.batch_size
    print("Batch size: ", batch_size)
    trainset, _ = read_dataset(args, args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)

    src = torch.LongTensor([sample[0] for sample in trainset])
    seg = torch.LongTensor([sample[1] for sample in trainset])
    start_position = torch.LongTensor([sample[2] for sample in trainset])
    end_position = torch.LongTensor([sample[3] for sample in trainset])

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
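
    # After amp.initialize, forward/backward passes run in mixed precision;
    # the opt_level (O1 by default) controls which ops are cast to fp16.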

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss = 0.
    result = 0.0
    best_result = 0.0

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            loss = train(args, model, optimizer, scheduler, src_batch,
                         seg_batch, start_position_batch, end_position_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

        result = evaluate(args, *read_dataset(args, args.dev_path))
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, *read_dataset(args, args.test_path))
Example #18
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer on Chinese corpus."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             )

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")
    
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()
            
            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
Example #19
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    tokenizer_opts(parser)

    parser.add_argument("--soft_targets",
                        action='store_true',
                        help="Train model with logits.")
    parser.add_argument("--soft_alpha",
                        type=float,
                        default=0.5,
                        help="Weight of the soft targets loss.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num = count_labels_num(args.train_path)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model.
    model = Classifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])
    if args.soft_targets:
        soft_tgt = torch.FloatTensor([example[3] for example in trainset])
    else:
        soft_tgt = None
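
    # With --soft_targets, field 3 of each training example is assumed to
    # hold a teacher probability distribution, which train_model blends
    # with the hard labels weighted by soft_alpha.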

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0.0, 0.0, 0.0

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(
                batch_loader(batch_size, src, tgt, seg, soft_tgt)):
            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch, soft_tgt_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, read_dataset(args, args.dev_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(
                torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path), True)
Example #20
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the classfier model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=512,
                        help="Sequence length.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:

        start_prob_all, end_prob_all = [], []

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch,
                                                       start_position_batch,
                                                       end_position_batch)

            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)

            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]

            prediction = examples[i][0][start_pred_pos:end_pred_pos]
            output[question_id] = prediction

        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
Example #21
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--dataset_path", type=str, default="dataset.pt",
                        help="Path of the preprocessed dataset.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--tgt_vocab_path", default=None, type=str,
                        help="Path of the target vocabulary file.")
    parser.add_argument("--tgt_spm_model_path", default=None, type=str,
                        help="Path of the target sentence piece model.")
    parser.add_argument("--pretrained_model_path", type=str, default=None,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", type=str, required=True,
                        help="Path of the output model.")
    parser.add_argument("--config_path", type=str, default="models/bert/base_config.json",
                        help="Config file of model hyper-parameters.")

    # Training and saving options.
    parser.add_argument("--total_steps", type=int, default=100000,
                        help="Total training steps.")
    parser.add_argument("--save_checkpoint_steps", type=int, default=10000,
                        help="Specific steps to save model checkpoint.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--accumulation_steps", type=int, default=1,
                        help="Specific steps to accumulate gradient.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps].")
    parser.add_argument("--instances_buffer_size", type=int, default=25600,
                        help="The buffer size of instances in memory.")
    parser.add_argument("--labels_num", type=int, required=False,
                        help="Number of prediction labels.")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Preprocess options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space", "xlmroberta"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             "Original XLM-RoBERTa uses xlmroberta tokenizer."
                             )
    parser.add_argument("--tgt_tokenizer", choices=["bert", "char", "space", "xlmroberta"], default="bert",
                        help="Specify the tokenizer for target side.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--tgt_embedding", choices=["word", "word_pos", "word_pos_seg", "word_sinusoidalpos"], default="word_pos_seg",
                        help="Target embedding type.")
    parser.add_argument("--decoder", choices=["transformer"], default="transformer", help="Decoder type.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--target", choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm", "gsg", "bart"], default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--tie_weights", action="store_true",
                        help="Tie the word embedding and softmax weights.")
    parser.add_argument("--has_lmtarget_bias", action="store_true",
                        help="Add bias on output_layer for lm target.")
    parser.add_argument("--deep_init", action="store_true",
                        help="initialize bert model similar to gpt2 model."
                             "scales initialization of projection layers by a "
                             "factor of 1/sqrt(2N). Necessary to train bert "
                             "models larger than BERT-Large.")

    # Masking options.
    parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.")
    parser.add_argument("--span_masking", action="store_true", help="Span masking.")
    parser.add_argument("--span_geo_prob", type=float, default=0.2,
                        help="Hyperparameter of geometric distribution for span masking.")
    parser.add_argument("--span_max_length", type=int, default=10,
                        help="Max length for span masking.")

    # Optimizer options.
    optimization_opts(parser)

    # GPU options.
    parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.")
    parser.add_argument("--gpu_ranks", default=[], nargs='+', type=int, help="List of ranks of each process."
                        " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU.")
    parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str, help="IP-Port of master for training.")
    parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str, help="Distributed backend.")

    args = parser.parse_args()

    if args.target == "cls":
        assert args.labels_num is not None, "The cls target requires --labels_num to be specified."

    # Load hyper-parameters from config file.
    if args.config_path:
        args = load_hyperparam(args)

    ranks_num = len(args.gpu_ranks)

    if args.world_size > 1:
        # Multiprocessing distributed mode.
        assert torch.cuda.is_available(), "No available GPUs."
        assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit."
        assert ranks_num <= torch.cuda.device_count(), "Started processes exceeds the available GPUs."
        args.dist_train = True
        args.ranks_num = ranks_num
        print("Using distributed mode for training.")
    elif args.world_size == 1 and ranks_num == 1:
        # Single GPU mode.
        assert torch.cuda.is_available(), "No available GPUs."
        args.gpu_id = args.gpu_ranks[0]
        assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device."
        args.dist_train = False
        args.single_gpu = True
        print("Using GPU %d for training." % args.gpu_id)
    else:
        # CPU mode.
        assert ranks_num == 0, "GPUs are specified; please check the arguments."
        args.dist_train = False
        args.single_gpu = False
        print("Using CPU mode for training.")

    trainer.train_and_validate(args)
Exemple #22
0
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path", default=None, type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")
    parser.add_argument("--train_features_path", type=str, required=True,
                        help="Path of the train features for stacking.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128,
                        help="Sequence length.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer on Chinese corpus."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             )

    # Optimizer options.
    parser.add_argument("--soft_targets", action='store_true',
                        help="Train model with logits.")
    parser.add_argument("--soft_alpha", type=float, default=0.5,
                        help="Weight of the soft targets loss.")
    parser.add_argument("--learning_rate", type=float, default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup", type=float, default=0.1,
                        help="Warm up value.")
    parser.add_argument("--fp16", action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
    parser.add_argument("--fp16_opt_level", choices=["O0", "O1", "O2", "O3" ], default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5,
                        help="Dropout.")
    parser.add_argument("--epochs_num", type=int, default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7,
                        help="Random seed.")

    # Cross validation options.
    parser.add_argument("--folds_num", type=int, default=5,
                        help="The number of folds for cross validation.")
    
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels. 
    args.labels_num = count_labels_num(args.train_path)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
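    # e.g. "--tokenizer char" resolves to CharTokenizer through the globals() lookup above.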

    # Training phase.
    dataset = read_dataset(args, args.train_path)
    instances_num = len(dataset)
    batch_size = args.batch_size
    instances_num_per_fold = instances_num // args.folds_num + 1

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1
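    # Worked example (illustrative numbers): 10,000 instances, 3 epochs and
    # batch size 32 give train_steps = int(10000 * 3 / 32) + 1 = 938.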

    train_features = []

    total_loss, result = 0., 0.
    acc, macro_f1 = 0., 0.

    for fold_id in range(args.folds_num):
        # Build classification model.
        model = Classifier(args)

        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(args.device)
        load_or_initialize_parameters(args, model)
        optimizer, scheduler = build_optimizer(args, model)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level = args.fp16_opt_level)
            args.amp = amp
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
        args.model = model

        trainset = dataset[0:fold_id*instances_num_per_fold] + dataset[(fold_id+1)*instances_num_per_fold:]
        random.shuffle(trainset)
        
        train_src = torch.LongTensor([example[0] for example in trainset])
        train_tgt = torch.LongTensor([example[1] for example in trainset])
        train_seg = torch.LongTensor([example[2] for example in trainset])

        if args.soft_targets:
            train_soft_tgt = torch.FloatTensor([example[3] for example in trainset])
        else:
            train_soft_tgt = None

        devset = dataset[fold_id*instances_num_per_fold:(fold_id+1)*instances_num_per_fold]

        dev_src = torch.LongTensor([example[0] for example in devset])
        dev_tgt = torch.LongTensor([example[1] for example in devset])
        dev_seg = torch.LongTensor([example[2] for example in devset])
        dev_soft_tgt = None

        for epoch in range(1, args.epochs_num+1):
            model.train()
            for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(batch_loader(batch_size, train_src, train_tgt, train_seg, train_soft_tgt)):    
                loss = train_model(args, model, optimizer, scheduler, src_batch, tgt_batch, seg_batch, soft_tgt_batch)
                total_loss += loss.item()
                if (i + 1) % args.report_steps == 0:
                    print("Fold id: {}, Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(fold_id, epoch, i+1, total_loss / args.report_steps))
                    total_loss = 0.

        model.eval()
        for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(batch_loader(batch_size, dev_src, dev_tgt, dev_seg, dev_soft_tgt)):
            src_batch = src_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            train_features.extend(prob)

        output_model_name = ".".join(args.output_model_path.split(".")[:-1])
        output_model_suffix = args.output_model_path.split(".")[-1]
        save_model(model, output_model_name+"-fold_"+str(fold_id)+"."+output_model_suffix)
        result = evaluate(args, devset)
        acc += result[0]/args.folds_num
        f1 = []
        confusion = result[1]
        # Guard the per-class precision/recall against empty rows or columns in
        # the confusion matrix, as the stacking example later in this document does.
        eps = 1e-9
        for i in range(confusion.size()[0]):
            p = confusion[i, i].item() / (confusion[i, :].sum().item() + eps)
            r = confusion[i, i].item() / (confusion[:, i].sum().item() + eps)
            f1.append(2 * p * r / (p + r + eps))

        macro_f1 += sum(f1) / len(f1) / args.folds_num
        
    train_features = np.array(train_features)
    np.save(args.train_features_path, train_features)
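    # The saved out-of-fold probabilities are meta-features for a second-level
    # stacking model; a hypothetical consumer might do:
    #   features = np.load("train_features.npy")
    #   meta_model.fit(features, labels)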
    print("Acc. : {:.4f}".format(acc))
    print("Marco F1 : {:.4f}".format(marco_f1))
Exemple #23
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument(
        "--max_choices_num",
        default=4,
        type=int,
        help="The maximum number of candidate answers; instances with fewer "
             "candidates are padded to this number."
    )

    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer. "
        "Original Google BERT uses the bert tokenizer on Chinese corpora. "
        "Char tokenizer segments sentences into characters. "
        "Space tokenizer segments sentences into words according to spaces.")

    args = parser.parse_args()
    args.labels_num = args.max_choices_num

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0.0, 0.0, 0.0

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch)
            total_loss += loss.item()

            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, read_dataset(args, args.dev_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
Exemple #24
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--dataset_path_list",
                        default=[],
                        nargs='+',
                        type=str,
                        help="Dataset path list.")
    parser.add_argument("--output_model_path",
                        default="models/multitask_classifier_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--config_path",
                        default="models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer. "
        "Original Google BERT uses the bert tokenizer on Chinese corpora. "
        "Char tokenizer segments sentences into characters. "
        "Space tokenizer segments sentences into words according to spaces.")

    # Optimizer options.
    optimization_opts(parser)

    # Training options.
    training_opts(parser)

    args = parser.parse_args()

    args.soft_targets = False

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num_list = [
        count_labels_num(os.path.join(path, "train.tsv"))
        for path in args.dataset_path_list
    ]

    args.datasets_num = len(args.dataset_path_list)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build multi-task classification model.
    model = MultitaskClassifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)
    args.model = model

    # Training phase.
    dataset_list = [
        read_dataset(args, os.path.join(path, "train.tsv"))
        for path in args.dataset_path_list
    ]
    packed_dataset_list = [
        pack_dataset(dataset, i, args.batch_size)
        for i, dataset in enumerate(dataset_list)
    ]
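    # pack_dataset is assumed to batch each dataset and tag every batch with
    # its dataset id, so batches from different tasks can be shuffled together
    # and dispatched to the right classification head below.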

    packed_dataset_all = []
    for packed_dataset in packed_dataset_list:
        packed_dataset_all += packed_dataset

    random.shuffle(packed_dataset_all)
    instances_num = sum([len(dataset) for dataset in dataset_list])
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    total_loss, result, best_result = 0.0, 0.0, 0.0

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (dataset_id, src_batch, tgt_batch,
                seg_batch) in enumerate(packed_dataset_all):
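            # DataParallel wraps the model, so the active task head is switched
            # through .module when the wrapper is present.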
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch, None)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        for dataset_id, path in enumerate(args.dataset_path_list):
            args.labels_num = args.labels_num_list[dataset_id]
            if hasattr(model, "module"):
                model.module.change_dataset(dataset_id)
            else:
                model.change_dataset(dataset_id)
            result = evaluate(
                args, read_dataset(args, os.path.join(path, "dev.tsv")))

    save_model(model, args.output_model_path)
Exemple #25
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="models/classifier_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--config_path",
                        default="models/bert/base_config.json",
                        type=str,
                        help="Path of the config file.")
    parser.add_argument("--train_features_path",
                        type=str,
                        required=True,
                        help="Path of the train features for stacking.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Tokenizer options.
    tokenizer_opts(parser)

    # Optimization options.
    optimization_opts(parser)
    parser.add_argument("--soft_targets",
                        action='store_true',
                        help="Train model with logits.")
    parser.add_argument("--soft_alpha",
                        type=float,
                        default=0.5,
                        help="Weight of the soft targets loss.")

    # Training options.
    training_opts(parser)

    # Cross validation options.
    parser.add_argument("--folds_num",
                        type=int,
                        default=5,
                        help="The number of folds for cross validation.")

    adv_opts(parser)

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Get logger.
    args.logger = get_logger(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num = count_labels_num(args.train_path)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Training phase.
    dataset = read_dataset(args, args.train_path)
    instances_num = len(dataset)
    batch_size = args.batch_size
    instances_num_per_fold = instances_num // args.folds_num + 1

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    train_features = []

    total_loss, result = 0.0, 0.0
    acc, macro_f1 = 0.0, 0.0

    for fold_id in range(args.folds_num):
        # Build classification model.
        model = Classifier(args)

        args.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(args.device)
        load_or_initialize_parameters(args, model)
        optimizer, scheduler = build_optimizer(args, model)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)
            args.amp = amp
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
        args.model = model

        if args.use_adv:
            args.adv_method = str2adv[args.adv_type](model)
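            # str2adv is assumed to map an adversarial-training name to its
            # implementation (FGM/PGD-style input perturbations).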

        trainset = dataset[0:fold_id * instances_num_per_fold] + dataset[
            (fold_id + 1) * instances_num_per_fold:]

        devset = dataset[fold_id * instances_num_per_fold:(fold_id + 1) *
                         instances_num_per_fold]

        dev_src = torch.LongTensor([example[0] for example in devset])
        dev_tgt = torch.LongTensor([example[1] for example in devset])
        dev_seg = torch.LongTensor([example[2] for example in devset])
        dev_soft_tgt = None

        for epoch in range(1, args.epochs_num + 1):
            random.shuffle(trainset)

            train_src = torch.LongTensor([example[0] for example in trainset])
            train_tgt = torch.LongTensor([example[1] for example in trainset])
            train_seg = torch.LongTensor([example[2] for example in trainset])

            if args.soft_targets:
                train_soft_tgt = torch.FloatTensor(
                    [example[3] for example in trainset])
            else:
                train_soft_tgt = None

            model.train()
            for i, (src_batch, tgt_batch, seg_batch,
                    soft_tgt_batch) in enumerate(
                        batch_loader(batch_size, train_src, train_tgt,
                                     train_seg, train_soft_tgt)):
                loss = train_model(args, model, optimizer, scheduler,
                                   src_batch, tgt_batch, seg_batch,
                                   soft_tgt_batch)
                total_loss += loss.item()
                if (i + 1) % args.report_steps == 0:
                    args.logger.info(
                        "Fold id: {}, Epoch id: {}, Training steps: {}, Avg loss: {:.3f}"
                        .format(fold_id, epoch, i + 1,
                                total_loss / args.report_steps))
                    total_loss = 0.0

        model.eval()
        for i, (src_batch, tgt_batch, seg_batch, soft_tgt_batch) in enumerate(
                batch_loader(batch_size, dev_src, dev_tgt, dev_seg,
                             dev_soft_tgt)):
            src_batch = src_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            train_features.extend(prob)

        output_model_name = ".".join(args.output_model_path.split(".")[:-1])
        output_model_suffix = args.output_model_path.split(".")[-1]
        save_model(
            model, output_model_name + "-fold_" + str(fold_id) + "." +
            output_model_suffix)
        result = evaluate(args, devset)
        acc += result[0] / args.folds_num
        f1 = []
        confusion = result[1]
        eps = 1e-9
        for i in range(confusion.size()[0]):
            p = confusion[i, i].item() / (confusion[i, :].sum().item() + eps)
            r = confusion[i, i].item() / (confusion[:, i].sum().item() + eps)
            f1.append(2 * p * r / (p + r + eps))

        macro_f1 += sum(f1) / len(f1) / args.folds_num

    train_features = np.array(train_features)
    np.save(args.train_features_path, train_features)
    args.logger.info("Acc. : {:.4f}".format(acc))
    args.logger.info("Marco F1 : {:.4f}".format(marco_f1))
Exemple #26
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    tokenizer_opts(parser)

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    args.labels_num = count_labels_num(args.train_path)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model.
    model = SiameseClassifier(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    # Get logger.
    args.logger = init_logger(args)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path)
    instances_num = len(trainset)
    batch_size = args.batch_size

    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    args.logger.info("Batch size: {}".format(batch_size))
    args.logger.info(
        "The number of training instances: {}".format(instances_num))

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        args.logger.info("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0.0, 0.0, 0.0

    args.logger.info("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        random.shuffle(trainset)
        src_a = torch.LongTensor([example[0][0] for example in trainset])
        src_b = torch.LongTensor([example[0][1] for example in trainset])
        tgt = torch.LongTensor([example[1] for example in trainset])
        seg_a = torch.LongTensor([example[2][0] for example in trainset])
        seg_b = torch.LongTensor([example[2][1] for example in trainset])

        model.train()
        for i, (src_batch, tgt_batch, seg_batch) in enumerate(
                batch_loader(batch_size, (src_a, src_b), tgt, (seg_a, seg_b))):
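            # batch_loader is assumed to accept tuple inputs here, yielding
            # paired (src_a, src_b) and (seg_a, seg_b) batches for the two
            # siamese encoder branches.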
            loss = train_model(args, model, optimizer, scheduler, src_batch,
                               tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                args.logger.info(
                    "Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                    format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.0

        result = evaluate(args, read_dataset(args, args.dev_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        args.logger.info("Test set evaluation.")
        if torch.cuda.device_count() > 1:
            args.model.module.load_state_dict(
                torch.load(args.output_model_path))
        else:
            args.model.load_state_dict(torch.load(args.output_model_path))
        evaluate(args, read_dataset(args, args.test_path))
Exemple #27
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument(
        "--max_choices_num",
        default=4,
        type=int,
        help="The maximum number of candidate answers; instances with fewer "
             "candidates are padded to this number."
    )

    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer. "
        "Original Google BERT uses the bert tokenizer on Chinese corpora. "
        "Char tokenizer segments sentences into characters. "
        "Space tokenizer segments sentences into words according to spaces.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path) as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question["id"])

    index = 0
    with open(args.prediction_path, "w") as f:
        for i, (src_batch, _, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    output = {}
                    output["id"] = question_ids[index]
                    index += 1
                    output["label"] = int(pred[j])
                    f.write(json.dumps(output))
                    f.write("\n")
Exemple #28
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/tagger_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--output_encoder",
                        default="./luke-models/",
                        type=str,
                        help="Path of the output luke model.")
    parser.add_argument("--suffix_file_encoder",
                        default="encoder",
                        type=str,
                        help="output file suffix luke model.")
    parser.add_argument("--vocab_path",
                        default="./models/google_vocab.txt",
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path",
                        type=str,
                        required=True,
                        help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/google_config.json",
                        type=str,
                        help="Path of the config file.")
    parser.add_argument("--output_file_prefix",
                        type=str,
                        required=True,
                        help="Prefix for file output.")
    parser.add_argument("--log_file", default='app.log')

    # Model options.
    parser.add_argument("--seq_length",
                        default=256,
                        type=int,
                        help="Sequence length.")
    parser.add_argument("--classifier",
                        choices=["mlp", "lstm", "lstm_crf", "lstm_ncrf"],
                        default="mlp",
                        help="Classifier type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument('--freeze_encoder_weights',
                        action='store_true',
                        help="Freeze the encoder weights.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=0,
                        help="Number of epochs.")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=2,
                        help="Number of steps to accumulate the gradient.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=200,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=35, help="Random seed.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch_size.")
    parser.add_argument("--num_train_steps",
                        type=int,
                        default=20000,
                        help="Max steps to be trained.")
    parser.add_argument("--patience",
                        type=int,
                        default=8000,
                        help="Specific steps to wait until stops training.")

    # Optimizer options.
    parser.add_argument("--learning_rate", default=1e-5, type=float)
    parser.add_argument("--lr_schedule",
                        default="warmup_linear",
                        type=str,
                        choices=["warmup_linear", "warmup_constant"])
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--max_grad_norm", default=0.0, type=float)
    parser.add_argument("--adam_b1", default=0.9, type=float)
    parser.add_argument("--adam_b2", default=0.999, type=float)
    parser.add_argument("--adam_eps", default=1e-8, type=float)
    parser.add_argument("--adam_correct_bias", action='store_true')
    parser.add_argument("--warmup_proportion", default=0.006, type=float)
    parser.add_argument("--freeze_proportions", default=0.0, type=float)
    parser.add_argument("--wandb",
                        action='store_true',
                        help="Enable wandb logging")

    # kg
    parser.add_argument("--kg_name", type=str, help="KG name or path")
    parser.add_argument("--use_kg",
                        action='store_true',
                        help="Enable the use of KG.")
    parser.add_argument("--padding",
                        action='store_true',
                        help="Enable padding.")
    parser.add_argument(
        "--truncate",
        action='store_true',
        help="Enable truncation if length is more than seq length.")
    parser.add_argument("--shuffle",
                        action='store_true',
                        help="Enable shuffling during training.")
    parser.add_argument("--dry_run",
                        action='store_true',
                        help="Dry run to test the implementation.")
    parser.add_argument(
        "--voting_choicer",
        action='store_true',
        help="Enable the Voting choicer to select the entity type.")
    parser.add_argument("--eval_kg_tag",
                        action='store_true',
                        help="Enable to include [ENT] tag in evaluation.")
    parser.add_argument("--use_subword_tag",
                        action='store_true',
                        help="Enable to use separate tag for subword splits.")
    parser.add_argument("--debug", action='store_true', help="Enable debug.")
    parser.add_argument("--reverse_order",
                        action='store_true',
                        help="Reverse the feature selection order.")
    parser.add_argument("--max_entities",
                        default=2,
                        type=int,
                        help="Number of KG features.")
    parser.add_argument("--eval_range_with_types",
                        action='store_true',
                        help="Enable to eval range with types.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    logging.basicConfig(filename=args.log_file, filemode='w', format=fmt)

    labels_map = {"[PAD]": 0, "[ENT]": 1, "[X]": 2, "[CLS]": 3, "[SEP]": 4}
    begin_ids = []

    # Find tagging labels
    for file in (args.train_path, args.dev_path, args.test_path):
        with open(file, mode="r", encoding="utf-8") as f:
            for line_id, line in enumerate(f):
                if line_id == 0:
                    continue
                labels = line.strip().split("\t")[0].split()
                for l in labels:
                    if l not in labels_map:
                        # Register the label first so that begin_ids records
                        # the id of the B-/S- tag itself, not the id of the
                        # derived I- tag created below.
                        labels_map[l] = len(labels_map)
                        if l.startswith("B") or l.startswith("S"):
                            begin_ids.append(labels_map[l])
                            # Make sure the matching I- tag exists as well.
                            infix = l[1]
                            tag = l[2:]
                            inner_tag = f'I{infix}{tag}'
                            if inner_tag not in labels_map:
                                labels_map[inner_tag] = len(labels_map)
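
    # Example (illustrative): a tagset containing "B-PER" adds "B-PER" and a
    # derived "I-PER" to labels_map, and begin_ids records the id of "B-PER".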

    idx_to_label = {labels_map[key]: key for key in labels_map}

    print("Begin ids: ", begin_ids)
    print("Labels: ", labels_map)
    args.labels_num = len(labels_map)

    # Build knowledge graph.
    if args.kg_name == 'none':
        kg_file = []
    else:
        kg_file = args.kg_name

    # Load Luke model.
    model_archive = ModelArchive.load(args.pretrained_model_path)
    tokenizer = model_archive.tokenizer

    # Handle the space character in the RoBERTa tokenizer.
    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    # Load the pretrained model
    encoder = LukeModel(model_archive.config)
    encoder.load_state_dict(model_archive.state_dict, strict=False)

    kg = KnowledgeGraph(kg_file=kg_file, tokenizer=tokenizer)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device

    # Build sequence labeling model.
    classifiers = {
        "mlp": LukeTaggerMLP,
        "lstm": LukeTaggerLSTM,
        "lstm_crf": LukeTaggerLSTMCRF,
        "lstm_ncrf": LukeTaggerLSTMNCRF
    }
    logger.info(f'The selected classifier is: {classifiers[args.classifier]}')
    model = classifiers[args.classifier](args, encoder)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model = model.to(device)

    # Read dataset.
    def read_dataset(path):
        dataset = []
        count = 0
        with open(path, mode="r", encoding="utf8") as f:
            f.readline()  # Skip the header line.
            tokens, labels = [], []
            for line_id, line in enumerate(f):
                fields = line.strip().split("\t")
                if len(fields) == 2:
                    labels, tokens = fields
                elif len(fields) == 3:
                    labels, tokens, cls = fields
                else:
                    print(
                        f'Line {line_id} is not in the accepted format; skipped.'
                    )
                    continue

                tokens, pos, vm, tag = kg.add_knowledge_with_vm(
                    args, [tokens], [labels])
                tokens = tokens[0]
                pos = pos[0]
                vm = vm[0].astype("bool")
                tag = tag[0]

                non_pad_tokens = [
                    tok for tok in tokens if tok != tokenizer.pad_token
                ]
                num_tokens = len(non_pad_tokens)
                num_pad = len(tokens) - num_tokens

                labels = [config.CLS_TOKEN
                          ] + labels.split(" ") + [config.SEP_TOKEN]
                new_labels = []
                j = 0
                joiner = '-'
                for i in range(len(tokens)):
                    if tag[i] == 0 and tokens[i] != tokenizer.pad_token:
                        cur_type = labels[j]
                        if cur_type != 'O':
                            try:
                                joiner = cur_type[1]
                                prev_label = cur_type[2:]
                            except IndexError:
                                logger.info(
                                    f'The label:{cur_type} is converted to O')
                                prev_label = 'O'
                                j += 1
                                new_labels.append('O')
                                continue
                        else:
                            prev_label = cur_type

                        new_labels.append(cur_type)
                        j += 1

                    elif tag[i] == 1 and tokens[
                            i] != tokenizer.pad_token:  # an entity token injected from the KG
                        new_labels.append('[ENT]')
                    elif tag[i] == 2:
                        if prev_label == 'O':
                            new_labels.append('O')
                        else:
                            if args.use_subword_tag:
                                new_labels.append('[X]')
                            else:
                                new_labels.append(f'I{joiner}' + prev_label)
                    else:
                        new_labels.append(PAD_TOKEN)

                new_labels = [labels_map[l] for l in new_labels]

                # Sequences are expected to reach this point without padding.
                if num_pad != 0:
                    print(num_pad)
                    exit()
                mask = [1] * (num_tokens) + [0] * num_pad
                word_segment_ids = [0] * (len(tokens))

                tokens = tokenizer.convert_tokens_to_ids(tokens)
                assert len(tokens) == len(new_labels), \
                    "The number of tokens and labels does not match."

                dataset.append(
                    [tokens, new_labels, mask, pos, vm, tag, word_segment_ids])

                # Stop early when a dry run is requested.
                if args.dry_run:
                    count += 1
                    if count == 100:
                        break

        return dataset
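
    # Each returned instance is [tokens, new_labels, mask, pos, vm, tag,
    # word_segment_ids], matching the unpacking performed by Batcher below.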

    # Evaluation function.
    def evaluate(args, is_test, final=False):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        instances_num = len(dataset)
        batch_size = args.batch_size

        if is_test:
            logger.info(f"Batch size: {batch_size}")
            logger.info(f"The number of test instances: {instances_num}")

        true_labels_all = []
        predicted_labels_all = []
        confusion = torch.zeros(len(labels_map),
                                len(labels_map),
                                dtype=torch.long)
        model.eval()

        test_batcher = Batcher(args,
                               dataset,
                               token_pad=tokenizer.pad_token_id,
                               label_pad=labels_map[PAD_TOKEN])

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch,
                pos_ids_batch, vm_ids_batch,
                segment_ids_batch) in enumerate(test_batcher):

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            pos_ids_batch = pos_ids_batch.to(device)
            vm_ids_batch = vm_ids_batch.long().to(device)
            segment_ids_batch = segment_ids_batch.long().to(device)

            pred, logits, scores = model(input_ids_batch,
                                         segment_ids_batch,
                                         mask_ids_batch,
                                         label_ids_batch,
                                         pos_ids_batch,
                                         vm_ids_batch,
                                         use_kg=args.use_kg)

            for pred_sample, gold_sample, mask in zip(pred, label_ids_batch,
                                                      mask_ids_batch):

                pred_labels = [
                    idx_to_label.get(key) for key in pred_sample.tolist()
                ]
                gold_labels = [
                    idx_to_label.get(key) for key in gold_sample.tolist()
                ]

                num_labels = sum(mask)

                # Exclude the [CLS] and [SEP] tokens.
                pred_labels = pred_labels[1:num_labels - 1]
                true_labels = gold_labels[1:num_labels - 1]

                pred_labels = [p.replace('_NOKG', '') for p in pred_labels]
                true_labels = [t.replace('_NOKG', '') for t in true_labels]

                true_labels, pred_labels = filter_kg_labels(
                    true_labels, pred_labels)

                pred_labels = [p.replace('_', '-') for p in pred_labels]
                true_labels = [t.replace('_', '-') for t in true_labels]

                biluo_tags_predicted = get_bio(pred_labels)
                biluo_tags_true = get_bio(true_labels)

                if len(biluo_tags_predicted) != len(biluo_tags_true):
                    logger.error(
                        'The number of predicted labels does not match the number of true labels.'
                    )
                    exit()

                predicted_labels_all.append(biluo_tags_predicted)
                true_labels_all.append(biluo_tags_true)

        if final:
            with open(f'{args.output_file_prefix}_predictions.txt', 'a') as p, \
                    open(f'{args.output_file_prefix}_gold.txt', 'a') as g:
                p.write('\n'.join([' '.join(l) for l in predicted_labels_all]))
                g.write('\n'.join([' '.join(l) for l in true_labels_all]))

        return dict(
            f1=seqeval.metrics.f1_score(true_labels_all, predicted_labels_all),
            precision=seqeval.metrics.precision_score(true_labels_all,
                                                      predicted_labels_all),
            recall=seqeval.metrics.recall_score(true_labels_all,
                                                predicted_labels_all),
            f1_span=f1_score_span(true_labels_all, predicted_labels_all),
            precision_span=precision_score_span(true_labels_all,
                                                predicted_labels_all),
            recall_span=recall_score_span(true_labels_all,
                                          predicted_labels_all),
        )

    # Training phase.
    logger.info("Start training.")
    instances = read_dataset(args.train_path)

    instances_num = len(instances)
    batch_size = args.batch_size

    if args.epochs_num:
        args.num_train_steps = int(
            instances_num * args.epochs_num / batch_size) + 1
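    # When --epochs_num is set it overrides --num_train_steps using the same
    # steps-per-epoch formula as the fine-tuning examples above.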

    unfreeze_steps = 0
    model_frozen = False
    if args.freeze_proportions != 0.0:
        unfreeze_steps = int(
            args.num_train_steps * args.freeze_proportions) + 1
        logger.info(
            f'Two phase training is enabled with model unfreeze at:{unfreeze_steps}'
        )
        # freeze the model
        model.freeze()
        model_frozen = True
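    # Example (illustrative): with --num_train_steps 20000 and
    # --freeze_proportions 0.25, the encoder stays frozen for the first
    # int(20000 * 0.25) + 1 = 5001 steps.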

    logger.info(f"Batch size:{batch_size}")
    logger.info(f"The number of training instances:{instances_num}")

    train_batcher = Batcher(args,
                            instances,
                            token_pad=tokenizer.pad_token_id,
                            label_pad=labels_map[PAD_TOKEN])

    optimizer = create_optimizer(args, model)
    scheduler = create_scheduler(args, optimizer)
    total_loss = 0.
    best_f1 = 0.0


    def maybe_no_sync(step):
        if (hasattr(model, "no_sync")
                and (step + 1) % args.gradient_accumulation_steps != 0):
            return model.no_sync()
        else:
            return contextlib.ExitStack()
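    # With gradient accumulation, DistributedDataParallel's no_sync() skips
    # the gradient all-reduce on intermediate backward passes; models without
    # no_sync() fall back to a no-op ExitStack.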

    # Logging to Weights & Biases requires your own account.
    if args.wandb:
        import wandb
        wandb.init(project="kbert_pretrain")
        print(f'new args: {args}')
    else:
        wandb = None

    global_steps = 0
    early_stop_steps = 0
    epoch = 0

    with tqdm(total=args.num_train_steps) as pbar:
        while True:
            model.train()
            for step, (input_ids_batch, label_ids_batch, mask_ids_batch,
                       pos_ids_batch, vm_ids_batch,
                       segment_ids_batch) in enumerate(train_batcher):

                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                pos_ids_batch = pos_ids_batch.to(device)
                vm_ids_batch = vm_ids_batch.long().to(device)
                segment_ids_batch = segment_ids_batch.long().to(device)

                loss, logits = model.score(input_ids_batch,
                                           segment_ids_batch,
                                           mask_ids_batch,
                                           label_ids_batch,
                                           pos_ids_batch,
                                           vm_ids_batch,
                                           use_kg=args.use_kg)
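                # model.score returns (loss, logits); vm_ids (the K-BERT
                # visible matrix) and pos_ids (soft positions) are the
                # knowledge-injection inputs, enabled when use_kg is set.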

                if torch.cuda.device_count() > 1:
                    loss = torch.mean(loss)

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                with maybe_no_sync(step):
                    loss.backward()

                total_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.max_grad_norm != 0.0:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    optimizer.zero_grad()

                    pbar.set_description("epoch: %d loss: %.7f" %
                                         (epoch, loss.item()))
                    pbar.update()
                    global_steps += 1

                    if global_steps % args.report_steps == 0:
                        logger.info("Epoch id: {}, Global steps: {}, Avg loss: "
                                    "{:.10f}".format(
                                        epoch, global_steps,
                                        total_loss / args.report_steps))

                        # Evaluation phase.
                        logger.info("Start evaluate on dev dataset.")
                        results = evaluate(args, False)
                        logger.info(results)

                        logger.info("Start evaluation on test dataset.")
                        results_test = evaluate(args, True)
                        logger.info(results_test)

                        avg_loss = total_loss / args.report_steps

                        if args.wandb:
                            # Log training loss and dev/test F1 at each
                            # report interval.
                            wandb.log({
                                "steps": global_steps,
                                "train_loss": avg_loss,
                                "valid_f1": results['f1'],
                                "test_f1": results_test['f1'],
                                "learning_rate": args.learning_rate,
                                "batch_size": args.batch_size,
                                "lr_schedule": args.lr_schedule,
                                "weight_decay": args.weight_decay,
                                "max_grad_norm": args.max_grad_norm,
                            })

                        if results['f1'] > best_f1:
                            best_f1 = results['f1']
                            early_stop_steps = 0
                            save_model(model, args.output_model_path)
                            save_encoder(args,
                                         encoder,
                                         suffix=args.suffix_file_encoder)
                        else:
                            early_stop_steps += args.report_steps

                        # Switch the model back to training mode after evaluation.
                        model.train()
                        total_loss = 0.

                if model_frozen and global_steps >= unfreeze_steps:
                    # unfreeze the model and start training
                    logger.info('The encoder is unfrozen for training.')
                    model.unfreeze()
                    model_frozen = False

                if global_steps >= args.num_train_steps:
                    # Training completed
                    logger.info('The training is completed!')
                    break

                if early_stop_steps >= args.patience:
                    # Early stopping
                    logger.info('The early stopping is triggered!')
                    break

            if model_frozen and global_steps >= unfreeze_steps:
                # unfreeze the model and start training
                logger.info('The encoder is unfrozen for training.')
                model.unfreeze()
                model_frozen = False

            if global_steps >= args.num_train_steps:
                # Training completed
                logger.info('The training is completed!')
                break

            if early_stop_steps >= args.patience:
                # Early stopping
                logger.info('The early stopping is triggered!')
                break

            epoch += 1

        # Evaluation phase.
        logger.info("Final evaluation on test dataset.")
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(torch.load(args.output_model_path))
        else:
            model.load_state_dict(torch.load(args.output_model_path))
        results_final = evaluate(args, True, final=True)
        logger.info(results_final)
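The script above uses a two-phase schedule: the encoder stays frozen for the first `freeze_proportions` fraction of the run and is unfrozen once `global_steps` crosses `unfreeze_steps`. A minimal sketch of what `model.freeze()` / `model.unfreeze()` plausibly do, assuming they simply toggle `requires_grad` on the encoder parameters (the method names come from the script; the `encoder` attribute is an assumption):

def freeze(model):
    # Hypothetical helper: exclude encoder parameters from gradient updates.
    for p in model.encoder.parameters():
        p.requires_grad = False

def unfreeze(model):
    # Hypothetical helper: resume training the encoder parameters.
    for p in model.encoder.parameters():
        p.requires_grad = True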
Exemple #29
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/ner_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch_size.")
    parser.add_argument("--seq_length",
                        default=128,
                        type=int,
                        help="Sequence length.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    labels_map = {"[PAD]": 0}
    begin_ids = []

    # Find tagging labels; ids of B-/S- tags are collected in begin_ids for
    # entity span detection.
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            if line_id == 0:
                continue
            labels = line.strip().split("\t")[1].split()
            for l in labels:
                if l not in labels_map:
                    if l.startswith("B") or l.startswith("S"):
                        begin_ids.append(len(labels_map))
                    labels_map[l] = len(labels_map)

    print("Labels: ", labels_map)
    args.labels_num = len(labels_map)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build bert model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path),
                              strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    # Build sequence labeling model.
    model = BertTagger(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
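        # Emit one final short batch for any remainder instances, so nothing
        # is dropped.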
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size *
                                        batch_size:, :]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
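            # Skip the header row.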
            f.readline()
            for line in f:
                tokens, labels = line.strip().split("\t")
                tokens = [vocab.get(t) for t in tokens.split(" ")]
                labels = [labels_map[l] for l in labels.split(" ")]
                mask = [1] * len(tokens)
                if len(tokens) > args.seq_length:
                    tokens = tokens[:args.seq_length]
                    labels = labels[:args.seq_length]
                    mask = mask[:args.seq_length]
                while len(tokens) < args.seq_length:
                    tokens.append(0)
                    labels.append(0)
                    mask.append(0)
                dataset.append([tokens, labels, mask])

        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        instances_num = input_ids.size(0)
        batch_size = args.batch_size

        if is_test:
            print("Batch size: ", batch_size)
            print("The number of test instances:", instances_num)

        correct = 0
        gold_entities_num = 0
        pred_entities_num = 0

        model.eval()

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            loss, _, pred, gold = model(input_ids_batch, label_ids_batch,
                                        mask_ids_batch)

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    gold_entities_num += 1

            for j in range(pred.size()[0]):
                if (pred[j].item() in begin_ids
                        and gold[j].item() != labels_map["[PAD]"]):
                    pred_entities_num += 1

            pred_entities_pos = []
            gold_entities_pos = []
            start, end = 0, 0

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    start = j
                    for k in range(j + 1, gold.size()[0]):
                        if (gold[k].item() == labels_map["[PAD]"]
                                or gold[k].item() == labels_map["O"]
                                or gold[k].item() in begin_ids):
                            end = k - 1
                            break
                    else:
                        end = gold.size()[0] - 1
                    gold_entities_pos.append((start, end))

            for j in range(pred.size()[0]):
                if (pred[j].item() in begin_ids
                        and gold[j].item() != labels_map["[PAD]"]):
                    start = j
                    for k in range(j + 1, pred.size()[0]):
                        if (pred[k].item() == labels_map["[PAD]"]
                                or pred[k].item() == labels_map["O"]
                                or pred[k].item() in begin_ids):
                            end = k - 1
                            break
                    else:
                        end = pred.size()[0] - 1
                    pred_entities_pos.append((start, end))

            # A predicted span counts as correct only when it matches a gold
            # span exactly and every token label inside agrees; the for/else
            # reaches "correct += 1" only if the inner loop never breaks.
            for entity in pred_entities_pos:
                if entity not in gold_entities_pos:
                    continue
                for j in range(entity[0], entity[1] + 1):
                    if gold[j].item() != pred[j].item():
                        break
                else:
                    correct += 1

        print("Report precision, recall, and f1:")
        p = correct / pred_entities_num
        r = correct / gold_entities_num
        f1 = 2 * p * r / (p + r)
        print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1))

        return f1

    # Training phase.
    print("Start training.")
    instances = read_dataset(args.train_path)

    input_ids = torch.LongTensor([ins[0] for ins in instances])
    label_ids = torch.LongTensor([ins[1] for ins in instances])
    mask_ids = torch.LongTensor([ins[2] for ins in instances])

    instances_num = input_ids.size(0)
    batch_size = args.batch_size
    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    # Exempt biases and LayerNorm parameters ("gamma"/"beta" in this BERT
    # port) from weight decay, following the standard BERT fine-tuning recipe.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.01,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=train_steps * args.warmup,
                                     t_total=train_steps)
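    # WarmupLinearSchedule (from pytorch-transformers) raises the LR linearly
    # over the first warmup_steps, then decays it linearly to zero at t_total.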

    total_loss = 0.
    f1 = 0.0
    best_f1 = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)

            loss, _, _, _ = model(input_ids_batch, label_ids_batch,
                                  mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

            loss.backward()
            optimizer.step()
            scheduler.step()

        f1 = evaluate(args, False)
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
Exemple #30
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the classfier model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--test_features_path",
                        default=None,
                        type=str,
                        help="Path of the test features for stacking.")
    parser.add_argument("--config_path",
                        default="models/bert/base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Inference options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--labels_num",
                        type=int,
                        required=True,
                        help="Number of prediction labels.")

    # Tokenizer options.
    tokenizer_opts(parser)

    # Output options.
    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob",
                        action="store_true",
                        help="Write probabilities to output file.")

    # Cross validation options.
    parser.add_argument("--folds_num",
                        type=int,
                        default=5,
                        help="The number of folds for cross validation.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Soft targets are not used at inference time.
    args.soft_targets, args.soft_alpha = False, False

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    test_features = [[] for _ in range(args.folds_num)]
    for fold_id in range(args.folds_num):
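        # Rebuild the fold-specific checkpoint name from --load_model_path:
        # "model.bin" -> "model-fold_0.bin", "model-fold_1.bin", ...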
        load_model_name = ".".join(args.load_model_path.split(".")[:-1])
        load_model_suffix = args.load_model_path.split(".")[-1]

        model = Classifier(args)
        model = load_model(
            model, load_model_name + "-fold_" + str(fold_id) + "." +
            load_model_suffix)

        # For simplicity, we use DataParallel wrapper to use multiple GPUs.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        if torch.cuda.device_count() > 1:
            print("{} GPUs are available. Let's use them.".format(
                torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        model.eval()
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            test_features[fold_id].extend(prob)

    test_features = np.array(test_features)
    test_features = np.mean(test_features, axis=0)
    np.save(args.test_features_path, test_features)
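The saved array holds the class probabilities of each fold model averaged over `args.folds_num` folds, i.e. the feature matrix a stacking meta-learner would consume. A minimal sketch of turning it into direct predictions, assuming the features were saved to a hypothetical `test_features.npy` (the actual path is whatever `--test_features_path` was set to):

import numpy as np

probs = np.load("test_features.npy")  # shape: (instances_num, labels_num)
preds = probs.argmax(axis=1)          # label with the highest averaged probability
print(preds[:10])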