Example #1
import argparse

from pydatagrand.configs.base import config
from pydatagrand.common.tools import init_logger


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--folds', type=int, default=5)
    parser.add_argument('--seed', default=42, type=int)
    parser.add_argument('--do_aug', action='store_true')
    parser.add_argument('--data_name', default='datagrand', type=str)
    args = parser.parse_args()
    init_logger(log_file=config['log_dir'] / 'prepare_fold_data.log')
    make_folds(args)
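
make_folds itself is not shown in this example. As a hypothetical sketch only (make_folds_sketch and its signature are illustrative, not the repo's actual function), a seeded k-fold split driven by the --folds and --seed arguments could look like this:

import random

def make_folds_sketch(examples, n_folds=5, seed=42):
    """Shuffle the examples with a fixed seed and deal them into n_folds buckets."""
    random.seed(seed)
    shuffled = list(examples)
    random.shuffle(shuffled)
    folds = [[] for _ in range(n_folds)]
    for position, example in enumerate(shuffled):
        folds[position % n_folds].append(example)
    return folds
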
import os
import random
import json
import collections
import numpy as np
from pydatagrand.common.tools import save_json
from pydatagrand.configs.base import config
from pydatagrand.configs.bert_config import bert_base_config
from pydatagrand.common.tools import logger, init_logger
from argparse import ArgumentParser
from pydatagrand.io.vocabulary import Vocabulary
from pydatagrand.common.tools import seed_everything

MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])
init_logger(log_file=config['log_dir'] / ("pregenerate_training_data.log"))


# n-gram masking algorithm:
# 15% of masks use a unigram, 20% a bigram, 30% a trigram, 20% a 4-gram, 15% a 5-gram
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_list):
    """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
    with several refactors to clean it up and remove a lot of unnecessary variables."""
    cand_indices = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indices.append(i)

    num_to_mask = min(max_predictions_per_seq,
                      max(1, int(round(len(tokens) * masked_lm_prob))))
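
The example stops after computing num_to_mask, so the actual masking loop is not shown. As a rough illustration only (sample_ngram_spans and its variable names are not from the repo), the n-gram step described in the comment above could sample span lengths 1-5 with the stated probabilities and mask whole spans until num_to_mask positions are covered:

# Hypothetical sketch of the n-gram span selection, not the repo's exact code.
ngram_lengths = [1, 2, 3, 4, 5]
ngram_probs = [0.15, 0.20, 0.30, 0.20, 0.15]  # unigram .. 5-gram, per the comment above

def sample_ngram_spans(cand_indices, num_to_mask):
    """Pick spans of candidate positions to mask until num_to_mask positions are covered."""
    covered = set()
    starts = list(cand_indices)
    random.shuffle(starts)
    for start in starts:
        if len(covered) >= num_to_mask:
            break
        n = np.random.choice(ngram_lengths, p=ngram_probs)
        span = [i for i in range(start, start + n)
                if i in cand_indices and i not in covered]
        if not span or len(covered) + len(span) > num_to_mask:
            continue
        covered.update(span)
    return sorted(covered)
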
def main():
    parser = ArgumentParser()
    parser.add_argument("--arch", default='bert_lstm_span', type=str)
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--save_best", action='store_true')
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument('--soft_label', action='store_true')
    parser.add_argument('--data_name', default='datagrand', type=str)
    parser.add_argument('--optimizer',
                        default='adam',
                        type=str,
                        choices=['adam', 'lookahead'])
    parser.add_argument('--markup',
                        default='bios',
                        type=str,
                        choices=['bio', 'bios'])
    parser.add_argument('--checkpoint', default=900000, type=int)
    parser.add_argument('--fold', default=0, type=int)
    parser.add_argument("--epochs", default=50.0, type=int)
    parser.add_argument("--resume_path", default='', type=str)
    parser.add_argument("--mode", default='max', type=str)
    parser.add_argument("--monitor", default='valid_f1', type=str)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--sorted",
                        default=1,
                        type=int,
                        help='1 : True  0:False ')
    parser.add_argument("--n_gpu",
                        type=str,
                        default='0',
                        help='"0,1,.." or "0" or "" ')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--train_batch_size", default=24, type=int)
    parser.add_argument('--eval_batch_size', default=48, type=int)
    parser.add_argument("--train_max_seq_len", default=128, type=int)
    parser.add_argument("--eval_max_seq_len", default=512, type=int)
    parser.add_argument('--loss_scale', type=float, default=0)
    parser.add_argument("--warmup_proportion", default=0.1, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--grad_clip", default=5.0, type=float)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument("--no_cuda", action='store_true')
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('--fp16_opt_level', type=str, default='O1')
    args = parser.parse_args()

    args.pretrain_model = config['checkpoint_dir'] / f'lm-checkpoint-{args.checkpoint}'
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.arch = args.arch + f"_{args.markup}_fold_{args.fold}"
    if args.optimizer == 'lookahead':
        args.arch += "_lah"
    args.model_path = config['checkpoint_dir'] / args.arch
    args.model_path.mkdir(exist_ok=True)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, config['checkpoint_dir'] / 'training_args.bin')
    seed_everything(args.seed)
    init_logger(log_file=config['log_dir'] / f"{args.arch}.log")
    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        run_train(args)

    if args.do_test:
        run_test(args)
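
seed_everything is imported from pydatagrand.common.tools, but its body does not appear in these examples. A generic sketch of what such a helper usually does (this is an assumption about the utility, not the repo's verified code) is to fix every relevant seed so runs are reproducible:

import os
import random

import numpy as np
import torch

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and GPU) for reproducible runs."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
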
Example #4
from collections import namedtuple
from tempfile import TemporaryDirectory
from pydatagrand.common.tools import logger, init_logger
from pydatagrand.configs.base import config
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from pydatagrand.common.tools import AverageMeter
from pydatagrand.train.metrics import LMAccuracy
from pydatagrand.model.pytorch_transformers.modeling_bert import BertForMaskedLM, BertConfig
from pydatagrand.model.pytorch_transformers.file_utils import CONFIG_NAME
from pydatagrand.model.pytorch_transformers.tokenization_bert import BertTokenizer
from pydatagrand.model.pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
from pydatagrand.common.tools import seed_everything

InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids")
init_logger(log_file=config['log_dir'] / ("train_bert_model.log"))


def convert_example_to_features(example, tokenizer, max_seq_length):
    tokens = example["tokens"]
    segment_ids = example["segment_ids"]
    masked_lm_positions = example["masked_lm_positions"]
    masked_lm_labels = example["masked_lm_labels"]

    assert len(tokens) == len(segment_ids) <= max_seq_length  # The preprocessed data should be already truncated
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
    input_array = np.zeros(max_seq_length, dtype=np.int64)
    input_array[:len(input_ids)] = input_ids
    mask_array = np.zeros(max_seq_length, dtype=bool)
    mask_array[:len(input_ids)] = 1