def __init__(self,
                 ds_path,
                 model_name,
                 base_model="bert-base-uncased",
                 do_lower_case=True,
                 num_epochs=4):
        self.path = ds_path
        self.num_epochs = num_epochs
        self.save_path = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                      str(model_name))
        self.model_name = model_name
        self.batch_size = 32
        self.max_seq_len = 64
        self.masked_lm_prob = 0.15
        self.max_predictions_per_seq = 20
        self.max_token = 30000

        bert_model = base_model

        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
        self.model = BertForMaskedLM.from_pretrained(bert_model,
                                                     cache_dir=cache_dir)
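        # Swap the 2-way token type (segment) embedding for a 5-way one;
        # 768 matches the hidden size of the bert-base models.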
        self.model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            5, 768)
        self.model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)

        self.tokenizer = BertTokenizer.from_pretrained(
            bert_model, do_lower_case=do_lower_case)
Example #2
 def load(cls, model_fqdn, weights_path=None, **model_kwargs):
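     # "distributed_-1" mirrors the cache sub-directory that the pytorch-pretrained-bert
     # example scripts use when local_rank == -1 (i.e. no distributed training).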
     cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), "distributed_-1")
     tokenizer = BertTokenizer.from_pretrained(model_fqdn, do_lower_case="uncase" in model_fqdn)
     model = BertForMaskedLM.from_pretrained(model_fqdn, cache_dir=cache_dir)
     if weights_path is not None:
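         # strict=False tolerates missing or unexpected keys in the checkpoint.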
         model.load_state_dict(torch.load(weights_path), strict=False)
     return cls(model, tokenizer, **model_kwargs)
Example #3
    def __init__(self,
                 model_name,
                 do_lower_case=True,
                 base_model="bert-base-uncased",
                 use_untuned=False,
                 use_stop=False):
        self.model_name = model_name
        bert_model = base_model
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_model, do_lower_case=do_lower_case)

        if use_untuned:
            cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
            self.model = BertForMaskedLM.from_pretrained(bert_model,
                                                         cache_dir=cache_dir)
            self.model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
                5, 768)
            self.model.bert.embeddings.token_type_embeddings.weight.data.normal_(
                mean=0.0, std=0.02)
        else:
            weights_path = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                        model_name)
            self.model = torch.load(weights_path)
        self.model.cuda()

        self.MAX_LEN = 10
        self.__segment_proc_flag = True
        if use_stop:
            self.__stop_words = set(stopwords.words('english'))
        else:
            self.__stop_words = set()
Example #4
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> masked_index = 8
        >>> tokenized_text[masked_index] = '[MASK]'
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
        >>> model.eval()
        # Predict all tokens
        >>> with torch.no_grad():
        ...     predictions = model(tokens_tensor, segments_tensors)
        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        >>> predicted_token
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
Example #5
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
Example #6
    def __init__(self, label_list, device):
        self._label_list = label_list

        self._tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                                        do_lower_case=True)

        self._model = BertForMaskedLM.from_pretrained(BERT_MODEL)
        if len(self._label_list) != 2:
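            # The pre-trained checkpoint ships with only 2 token type (segment) embeddings;
            # resize them, presumably so each label can be encoded as its own segment id
            # (768 is the bert-base hidden size).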
            self._model.bert.embeddings.token_type_embeddings = \
                nn.Embedding(len(label_list), 768)
            self._model.bert.embeddings.token_type_embeddings.weight.data.\
                normal_(mean=0.0, std=0.02)

        self._device = device
        self._model.to(self._device)

        self._optimizer = None

        self._dataset = {}
        self._data_loader = {}
Example #7
    def __init__(self,
                 featQty,
                 headFeatQty,
                 useTfIdfTransform=False,
                 useWordFeatures=True,
                 useBERT=False,
                 useHeadBERT=False,
                 bertModelPath=None,
                 torch_device='cuda',
                 bertModelType='bert-base-uncased'):

        self.useWordFeatures = useWordFeatures
        if self.useWordFeatures:
            self.featQty = featQty
            self.countVect = CountVectorizer(ngram_range=(1, 1))
            self.tfidf = TfidfTransformer() if useTfIdfTransform else None

            self.headFeatQty = headFeatQty
            self.headCountVect = CountVectorizer(ngram_range=(1, 1))
            self.headTfidf = TfidfTransformer() if useTfIdfTransform else None

        self.useBERT = useBERT
        self.useHeadBERT = useHeadBERT
        if useBERT or useHeadBERT:
            self.torch_device = torch.device(torch_device)
            if bertModelPath is not None:
                print('Loading fine-tuned model from file:', bertModelPath)
                self.bertModelWrapper = BertForPreTraining.from_pretrained(
                    bertModelType)
                self.bertModelWrapper.load_state_dict(
                    torch.load(bertModelPath))
            else:
                print('Loading standard pre-trained model')
                self.bertModelWrapper = BertForMaskedLM.from_pretrained(
                    bertModelType)

            self.bertModelWrapper.eval()
            self.bertModelWrapper.to(torch_device)
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                bertModelType, do_lower_case=True)
Example #8
def load_model(bison_args, device, data_handler, output_model_file=None):
    """
    Load a model.

    :param bison_args: instance of :py:class:BisonArguments
    :param device: the device to move the model to
    :param data_handler: the dataset handler, an instance of :py:class:BitextHandler or a subclass
    :param output_model_file: the location of the model to load
    :return: the loaded model
    """

    model_state_dict = None
    if output_model_file is not None:
        model_state_dict = torch.load(output_model_file)

    if bison_args.bert_model == 'bert-vanilla':
        # randomly initialises BERT weights instead of using a pre-trained model
        model = BertForMaskedLM(BertConfig.from_default_settings())
    else:
        model = BertForMaskedLM.from_pretrained(bison_args.bert_model,
                                                state_dict=model_state_dict)
    model.to(device)
    return model
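
A minimal usage sketch for `load_model` (not part of the original snippet): it substitutes a plain `argparse.Namespace` for the `BisonArguments` instance, since only `bert_model` is read here, and passes `None` for the unused `data_handler`.

from argparse import Namespace

import torch

# Hypothetical stand-in for a BisonArguments instance; load_model only reads .bert_model.
bison_args = Namespace(bert_model="bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained masked LM and move it to the device.
model = load_model(bison_args, device, data_handler=None)

# To resume from a checkpoint saved with torch.save(model.state_dict(), "model.bin"):
# model = load_model(bison_args, device, None, output_model_file="model.bin")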
Example #9
import plotly.offline as offline
import numpy as np
from pytorch_pretrained_bert.tokenization import load_vocab, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import argparse
from tqdm import tqdm, trange
import os

base_path = os.path.dirname(os.path.abspath(__file__))

tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path),
                          do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()

vocab = load_vocab(vocab_file='{}/data/vocab.txt'.format(base_path))
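# Build the inverse vocabulary (id -> token) to map predicted ids back to tokens.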
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    tokens = tokenizer.tokenize(sentence)

    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")

    tokens_length = len(tokens)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--data_dir",
                        default="/work01/ryuto/data/NTC_processed",
                        type=str)
    parser.add_argument(
        "--bert_model",
        default="/home/ryuto/data/jap_BERT/",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--vocab",
        default="/home/ryuto/data/NTC_Matsu_original/wordIndex.txt",
        type=str)

    # model parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Set this flag if you are using an uncased model. (If Japanese model, set false)"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )

    # Hyper parameter
    parser.add_argument('--seed', type=int, default=2020)
    parser.add_argument('--insert_max', type=int, default=10)
    parser.add_argument('--insert_min', type=int, default=3)
    parser.add_argument('--target_max', type=int, default=3)
    parser.add_argument('--target_min', type=int, default=1)
    parser.add_argument('--iteration', type=int, default=3)
    parser.add_argument('--data_ratio', type=float, default=100)

    args = parser.parse_args()

    # Seed
    random.seed(args.seed)

    # vocab & tokenizer
    vocab = set_vocab(args.vocab)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Extract predicates
    predicates = extract_predicates(vocab=vocab, data_dir=args.data_dir)
    random.shuffle(predicates)

    # model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)
    model.eval()

    counter = 0
    data_size = int(len(predicates) * args.data_ratio / 100)

    with open(args.output_file, "w", encoding='utf-8') as writer:
        for predicate in tqdm(predicates[:data_size]):
            for case in CASES:
                for _ in range(args.iteration):
                    # insert MASK and case
                    n_target = random.randint(args.target_min, args.target_max)
                    text_a = [CLS] + [MASK] * n_target + [case, predicate, SEP]
                    tokens = tokenizer.tokenize(" ".join(text_a))
                    mask_ids = [
                        idx for idx, token in enumerate(tokens)
                        if token == MASK
                    ]
                    trg_id = mask_ids[-1]
                    black_list = [predicate]

                    # predict MASK
                    tokens = prediction(model=model,
                                        seq_length=args.max_seq_length,
                                        device=device,
                                        tokenizer=tokenizer,
                                        tokens=tokens,
                                        mask_ids=mask_ids,
                                        black_list=black_list,
                                        how_select="sample")

                    # insert MASK
                    n_insert = random.randint(args.insert_min, args.insert_max)
                    tokens = tokens[:trg_id +
                                    2] + [MASK] * n_insert + tokens[trg_id +
                                                                    2:]
                    mask_ids2 = [
                        idx for idx, token in enumerate(tokens)
                        if token == MASK
                    ]

                    # predict MASK
                    tokens = prediction(model=model,
                                        seq_length=args.max_seq_length,
                                        device=device,
                                        tokenizer=tokenizer,
                                        tokens=tokens,
                                        mask_ids=mask_ids2,
                                        black_list=black_list,
                                        how_select="argmax")

                    target = tokens[mask_ids[0]:mask_ids[-1] + 2]
                    chunk = tokens[mask_ids2[0]:mask_ids2[-1] + 1]
                    prd = tokens[mask_ids2[-1] + 1:len(tokens) - 1]

                    target_tokens, target_ids = convert_bert_predicts_to_ids(
                        target, vocab)
                    chunk_tokens, chunk_ids = convert_bert_predicts_to_ids(
                        chunk, vocab)
                    predicate_tokens, predicate_ids = convert_bert_predicts_to_ids(
                        prd, vocab)

                    concat_surfs = target_tokens + chunk_tokens + predicate_tokens
                    concat_ids = target_ids + chunk_ids + predicate_ids
                    p_id = len(concat_surfs) - 1
                    labels = [3] * len(concat_surfs)
                    labels[len(target_tokens) - 2] = CASES[case]
                    instance = {
                        "tokens": concat_ids,
                        "surfaces": concat_surfs,
                        "pas": [{
                            "p_id": p_id,
                            "args": labels
                        }]
                    }

                    print(json.dumps(instance), file=writer)

                    if counter < 5:
                        counter += 1
                        logger.info("{} + {} = {} {} {}".format(
                            predicate, case, "".join(target_tokens),
                            "".join(chunk_tokens), "".join(predicate_tokens)))
Example #11
def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Multi-GPU training
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Avoid overwriting an existing model
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer(vocab_file=args.vocab_file)

    train_examples = None
    num_train_optimization_steps = None
    vocab_list = []
    # Read the BERT vocabulary
    with open(args.vocab_file, 'r', encoding='utf-8') as fr:
        for line in fr:
            vocab_list.append(line.strip("\n"))

    if args.do_train:
        train_examples = []
        for _ in range(args.dupe_factor):  # dynamic masking: each duplicate gets fresh masks
            print("create_training_instances.started...")
            instances = create_examples(
                data_path=args.pretrain_train_path,
                max_seq_length=args.max_seq_length,
                masked_lm_prob=args.masked_lm_prob,
                max_predictions_per_seq=args.max_predictions_per_seq,
                vocab_list=vocab_list,
                tokenizer=tokenizer)
            train_examples += instances
            print("create_training_instances.ended...")

        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )
    # word_embeddings+Transformer+MLM
    model = BertForMaskedLM(
        config=BertConfig.from_json_file(args.bert_config_json))
    if args.init_model != '':
        # Warm-start from an existing checkpoint instead of training from scratch.
        print('args.init_model', args.init_model)
        # from_pretrained is a classmethod that returns a new model; assign the result,
        # otherwise the randomly initialised weights above would be kept.
        model = BertForMaskedLM.from_pretrained(args.init_model)
    else:
        # Train from scratch.
        print("Training from scratch")

    # Mixed-precision training
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    """冻结word_embedding参数,大字典时可能需要"""
    if args.frozen:
        for para in model.bert.embeddings.word_embeddings.parameters():
            para.requires_grad = False
        param_optimizer = list(
            filter(lambda p: p[1].requires_grad, model.named_parameters()))
        # model.bert.embeddings.word_embeddings.cpu()  # move the embedding onto the CPU
    # Prepare optimizer
    else:
        param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
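    # Biases and LayerNorm weights are excluded from weight decay below.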
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        # Mixed-precision training
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    best_loss = 100000

    # Load the training data
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      args.max_seq_length,
                                                      tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        nb_tr_steps = 0  # total number of training steps
        for e in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples = 0
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # masked_lm_loss
                loss = model(input_ids, segment_ids, input_mask, label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                if nb_tr_steps > 0 and nb_tr_steps % 100 == 0:
                    logger.info(
                        "===================== -epoch %d -train_step %d -train_loss %.4f\n"
                        % (e, nb_tr_steps, tr_loss / nb_tr_steps))
                if e > 0 and e % args.save_epochs == 0 and not args.do_eval:
                    # Save a trained model, configuration and tokenizer
                    # Only save the model itself
                    model_to_save = model.module if hasattr(
                        model, 'module') else model

                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)

            if nb_tr_steps > 0 and nb_tr_steps % 2000 == 0 and args.do_eval:
                eval_examples = create_examples(
                    data_path=args.pretrain_dev_path,
                    max_seq_length=args.max_seq_length,
                    masked_lm_prob=args.masked_lm_prob,
                    max_predictions_per_seq=args.max_predictions_per_seq,
                    vocab_list=vocab_list)
                eval_features = convert_examples_to_features(
                    eval_examples, args.max_seq_length, tokenizer)
                all_input_ids = torch.tensor(
                    [f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)
                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=args.eval_batch_size)

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                for input_ids, input_mask, segment_ids, label_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        loss = model(input_ids, segment_ids, input_mask,
                                     label_ids)

                    eval_loss += loss.item()
                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
                if eval_loss < best_loss:
                    # Save a trained model, configuration and tokenizer
                    # Only save the model itself
                    model_to_save = model.module if hasattr(
                        model, 'module') else model

                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    best_loss = eval_loss
                logger.info(
                    "============================ -epoch %d -train_loss %.4f -eval_loss %.4f\n"
                    % (e, tr_loss / nb_tr_steps, eval_loss))
Example #12
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters

    parser.add_argument("--eval_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The evaluation data dir.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")

    parser.add_argument(
        "--output_SR_file",
        default=None,
        type=str,
        required=True,
        help="The output directory of writing substitution selection.")
    parser.add_argument("--word_embeddings",
                        default=None,
                        type=str,
                        required=True,
                        help="The path of word embeddings")
    parser.add_argument("--word_frequency",
                        default=None,
                        type=str,
                        required=True,
                        help="The path of word frequency.")
    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        "--max_seq_length",
        default=250,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")

    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--num_selections",
                        default=20,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_eval_epochs",
                        default=1,
                        type=int,
                        help="Total number of training epochs to perform.")

    parser.add_argument(
        "--prob_mask",
        default=0.5,
        type=float,
        help="Proportion of the masked words in first sentence. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--ppdb",
                        default="./ppdb-2.0-tldr",
                        type=str,
                        required=True,
                        help="The path of word frequency.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("At least `do_eval` must be True.")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model,
                                            output_attentions=True)

    model.to(device)

    ranker = Ranker()

    ranker.read_features(args.word_embeddings, args.word_frequency, args.ppdb)

    #one_sent = "John composed these verses."
    output_sr_file = open(args.output_SR_file, "w")

    one_sent = "alessandro mazzola -lrb- born 8 november , 1942 -rrb- is a former italian football player ."

    simple_sent = simplified_sentence(one_sent,
                                      model,
                                      tokenizer,
                                      ranker,
                                      args.max_seq_length,
                                      threshold=0.5,
                                      num_selections=args.num_selections)

    print(simple_sent)

    with open(args.eval_dir, "r") as reader:
        while True:
            one_sent = reader.readline()
            one_sent = one_sent.strip()

            if one_sent == "":
                break

            #output_sr_file.write(one_sent)
            #output_sr_file.write(' ||| ')

            print(one_sent)

            simple_sent = simplified_sentence(
                one_sent,
                model,
                tokenizer,
                ranker,
                args.max_seq_length,
                threshold=0.5,
                num_selections=args.num_selections)

            #simple_sent = "---------"

            output_sr_file.write(simple_sent)

            print(simple_sent)
            output_sr_file.write('\n')

    output_sr_file.close()
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default="/home/ryuto/data/jap_BERT/",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--vocab",
        default="/home/ryuto/data/NTC_Matsu_original/wordIndex.txt",
        type=str)

    # model parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Set this flag if you are using an uncased model. (If Japanese model, set false)"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )

    # Data Augmentation Option
    parser.add_argument('--data_ratio',
                        type=float,
                        default=100,
                        help="full size = 100 (default=100)")
    parser.add_argument("--token_strategy",
                        dest='how_select',
                        default="argmax",
                        type=str,
                        help="Choose from 'argmax' or 'sample'")
    parser.add_argument(
        '--predicate',
        action='store_true',
        help="If True, target word is replaced even if it is predicate.")

    # Hyper parameter
    parser.add_argument('--seed', type=int, default=2020)
    parser.add_argument('--replace_max', type=int, default=5)
    parser.add_argument('--replace_min', type=int, default=3)
    parser.add_argument('--n_sample', type=int, default=3)

    args = parser.parse_args()

    # Seed
    random.seed(args.seed)

    # vocab & tokenizer
    vocab = set_vocab(args.vocab)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Create MASK instances
    instances = create_masked_instances(args)

    # Create dataset
    features = convert_instances_to_features(instances=instances,
                                             seq_length=args.max_seq_length,
                                             tokenizer=tokenizer)
    # model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)
    model.eval()

    with open(args.output_file, "w", encoding='utf-8') as writer:
        for feature in tqdm(features):
            feature.send_to_device(device)
            instance = prediction(model=model,
                                  feature=feature,
                                  tokenizer=tokenizer,
                                  how_select=args.how_select)
            instance = convert_bert_predicts_to_ids(instance=instance,
                                                    vocab=vocab)
            print(json.dumps(instance), file=writer)
Example #14
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model_or_config_file",
        default=None,
        type=str,
        required=True,
        help=
        "Directory containing pre-trained BERT model or path of configuration file (if no pre-training)."
    )
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--num_gpus",
        type=int,
        default=-1,
        help="Num GPUs to use for training (0 for none, -1 for all available)")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    # Check whether bert_model_or_config_file is a file or directory
    if os.path.isdir(args.bert_model_or_config_file):
        pretrained = True
        targets = [WEIGHTS_NAME, CONFIG_NAME, "tokenizer.pkl"]
        for t in targets:
            path = os.path.join(args.bert_model_or_config_file, t)
            if not os.path.exists(path):
                msg = "File '{}' not found".format(path)
                raise ValueError(msg)
        fp = os.path.join(args.bert_model_or_config_file, CONFIG_NAME)
        config = BertConfig(fp)
    else:
        pretrained = False
        config = BertConfig(args.bert_model_or_config_file)

    # What GPUs do we use?
    if args.num_gpus == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
        device_ids = None
    else:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and args.num_gpus > 0 else "cpu")
        n_gpu = args.num_gpus
        if n_gpu > 1:
            device_ids = list(range(n_gpu))
    if args.local_rank != -1:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # Check some other args
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    # Seed RNGs
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare output directory
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Make tokenizer
    if pretrained:
        fp = os.path.join(args.bert_model_or_config_file, "tokenizer.pkl")
        with open(fp, "rb") as f:
            tokenizer = pickle.load(f)
    else:
        training_data = [
            line.strip() for line in open(args.train_file).readlines()
        ]
        tokenizer = CuneiformCharTokenizer(training_data=training_data)
        tokenizer.trim_vocab(config.min_freq)
        # Adapt vocab size in config
        config.vocab_size = len(tokenizer.vocab)
    print("Size of vocab: {}".format(len(tokenizer.vocab)))

    # Get training data
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    if pretrained:
        model = BertForMaskedLM.from_pretrained(args.bert_model_or_config_file)
    else:
        model = BertForMaskedLM(config)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    # Prepare training log
    output_log_file = os.path.join(args.output_dir, "training_log.txt")
    with open(output_log_file, "w") as f:
        f.write("Steps\tTrainLoss\n")

    # Start training
    global_step = 0
    total_tr_steps = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            avg_loss = tr_loss / nb_tr_examples

            # Update training log
            total_tr_steps += nb_tr_steps
            log_data = [str(total_tr_steps), "{:.5f}".format(avg_loss)]
            with open(output_log_file, "a") as f:
                f.write("\t".join(log_data) + "\n")

            # Save model
            logger.info("** ** * Saving model ** ** * ")
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model itself
            output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            with open(output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())
            fn = os.path.join(args.output_dir, "tokenizer.pkl")
            with open(fn, "wb") as f:
                pickle.dump(tokenizer, f)
Example #15
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--discriminative_finetuning',
                        action='store_true',
                        help='Whether to use discriminative fine-tuning')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    #############################################################################
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        if args.discriminative_finetuning:
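            # Discriminative fine-tuning as in ULMFiT: lower transformer layers
            # get geometrically smaller learning rates (args.learning_rate / 2.6**k),
            # and only the top layers keep the full learning rate.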
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5', \
            'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.01, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.01, 'lr': args.learning_rate},

                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)], \
                'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**5},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**4},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**3},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6**2},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)], \
                'weight_decay': 0.0, 'lr': args.learning_rate/2.6},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)], \
                'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch

                logits = model(input_ids, segment_ids, input_mask)
                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [
                    logits[i].view(-1, model.module.config.vocab_size)
                    for i in range(len(logits))
                ]
                loss = loss_fct(logits, lm_label_ids.view(-1))

                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)
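
# A minimal reload sketch (not part of the original script), assuming the same
# pytorch_pretrained_bert version: from_pretrained() also accepts a directory
# that contains the weights, config and vocabulary written above, so the
# fine-tuned masked LM can be restored later. "finetuned_mlm_dir" is a
# hypothetical stand-in for args.output_dir.
from pytorch_pretrained_bert.modeling import BertForMaskedLM
from pytorch_pretrained_bert.tokenization import BertTokenizer

finetuned_mlm_dir = "finetuned_mlm_dir"
reloaded_model = BertForMaskedLM.from_pretrained(finetuned_mlm_dir)
reloaded_tokenizer = BertTokenizer.from_pretrained(finetuned_mlm_dir,
                                                   do_lower_case=True)
reloaded_model.eval()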
Example No. 16
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters

    parser.add_argument("--eval_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The evaluation data dir.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")

    parser.add_argument(
        "--output_SR_file",
        default=None,
        type=str,
        required=True,
        help="The output directory of writing substitution selection.")
    parser.add_argument("--word_embeddings",
                        default=None,
                        type=str,
                        required=True,
                        help="The path of word embeddings")
    parser.add_argument("--word_frequency",
                        default=None,
                        type=str,
                        required=True,
                        help="The path of word frequency.")
    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")

    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--num_selections",
                        default=10,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_eval_epochs",
                        default=1,
                        type=int,
                        help="Total number of training epochs to perform.")

    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("At least `do_eval` must be True.")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForMaskedLM.from_pretrained(args.bert_model,
                                            cache_dir=cache_dir)
    if args.fp16:
        model.half()
    model.to(device)

    output_sr_file = open(args.output_SR_file, "a+")

    print("Loading embeddings ...")

    wordVecPath = args.word_embeddings
    #wordVecPath = "/media/qiang/ee63f41d-4004-44fe-bcfd-522df9f2eee8/glove.840B.300d.txt"

    fasttext_dico, fasttext_emb = getWordmap(wordVecPath)

    stopword = set(stopwords.words('english'))
    word_count_path = args.word_frequency
    #word_count_path = "word_frequency_wiki.txt"
    word_count = getWordCount(word_count_path)

    ps = PorterStemmer()

    SS = []
    substitution_words = []
    source_words = []

    num_selection = args.num_selections

    bre_i = 0

    window_context = 11

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):

        fileName = args.eval_dir.split('/')[-1][:-4]
        if fileName == 'lex.mturk':
            eval_examples, mask_words, mask_labels = read_eval_dataset(
                args.eval_dir)
        else:
            eval_examples, mask_words, mask_labels = read_eval_index_dataset(
                args.eval_dir)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        #logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()

        eval_size = len(eval_examples)

        for i in range(eval_size):

            print('Sentence {} rankings: '.format(i))
            #output_sr_file.write(str(i))
            #output_sr_file.write(' sentence: ')
            #output_sr_file.write('\n')
            tokens, words, position = convert_sentence_to_token(
                eval_examples[i], args.max_seq_length, tokenizer)

            assert len(words) == len(position)

            mask_index = words.index(mask_words[i])

            mask_context = extract_context(words, mask_index, window_context)

            len_tokens = len(tokens)

            mask_position = position[mask_index]

            if isinstance(mask_position, list):
                feature = convert_whole_word_to_feature(
                    tokens, mask_position, args.max_seq_length, tokenizer)
            else:
                feature = convert_token_to_feature(tokens, mask_position,
                                                   args.max_seq_length,
                                                   tokenizer)

            tokens_tensor = torch.tensor([feature.input_ids])

            token_type_ids = torch.tensor([feature.input_type_ids])

            attention_mask = torch.tensor([feature.input_mask])

            tokens_tensor = tokens_tensor.to('cuda')
            token_type_ids = token_type_ids.to('cuda')
            attention_mask = attention_mask.to('cuda')

            # Predict all tokens
            with torch.no_grad():
                prediction_scores = model(tokens_tensor, token_type_ids,
                                          attention_mask)

            if isinstance(mask_position, list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(20)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(20)
                #print(predicted_top[0].cpu().numpy())
            pre_tokens = tokenizer.convert_ids_to_tokens(
                predicted_top[1].cpu().numpy())
            #print(pre_tokens)
            #print(predicted_top[0].cpu().numpy())

            #break
            ss = substitution_selection(mask_words[i], pre_tokens,
                                        predicted_top[0].cpu().numpy(), ps,
                                        num_selection)

            print('ssss------')
            print(ss)

            SS.append(ss)
            #break

            #print(mask_words[i], ":", ss)
            source_words.append(mask_words[i])

            #pre_word = substitution_ranking2(mask_words[i], ss, fasttext_dico, fasttext_emb,word_count)
            pre_word = substitution_ranking(mask_words[i], mask_context, ss,
                                            fasttext_dico, fasttext_emb,
                                            word_count, tokenizer, model,
                                            mask_labels[i])

            substitution_words.append(pre_word)

            #if(bre_i==5):
            #    break
            #bre_i += 1

        potential, precision, recall, F_score = evaulation_SS_scores(
            SS, mask_labels)
        print("The score of evaluation for substitution selection")
        output_sr_file.write(str(args.num_selections))
        output_sr_file.write('\t')
        output_sr_file.write(str(precision))
        output_sr_file.write('\t')
        output_sr_file.write(str(recall))
        output_sr_file.write('\t')
        output_sr_file.write(str(F_score))
        output_sr_file.write('\t')
        print(potential, precision, recall, F_score)

        precision, accuracy, changed_proportion = evaulation_pipeline_scores(
            substitution_words, source_words, mask_labels)
        print("The score of evaluation for full LS pipeline")
        print(precision, accuracy, changed_proportion)
        output_sr_file.write(str(precision))
        output_sr_file.write('\t')
        output_sr_file.write(str(accuracy))
        output_sr_file.write('\n')
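
# Hedged sketch (substitution_selection is imported from elsewhere in this
# project and is not shown here; this simplified stand-in only illustrates the
# usual filtering idea): keep the highest-ranked BERT predictions that are not
# subword pieces, are not the complex word itself, and do not share its stem.
def simple_substitution_selection(source_word, pre_tokens, pre_scores, ps, num_selection):
    # pre_scores is accepted to mirror the call above but is unused in this sketch
    selected = []
    source_stem = ps.stem(source_word)
    for token in pre_tokens:
        if token.startswith("##") or token.lower() == source_word.lower():
            continue
        if ps.stem(token) == source_stem:
            continue
        selected.append(token)
        if len(selected) == num_selection:
            break
    return selected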
Example No. 17
0
from pytorch_pretrained_bert.modeling import BertForMaskedLM
from pytorch_pretrained_bert import BertTokenizer
import torch

bert_model = 'bert-large-uncased'
model = BertForMaskedLM.from_pretrained(bert_model)
tokenizer = BertTokenizer.from_pretrained(bert_model)

question = 'who invented the telephone'  # "the telephone was invented by whom"
tokenized_question = tokenizer.tokenize(question)

masked_index = 0
tokenized_question[masked_index] = '[MASK]'
question_ids = tokenizer.convert_tokens_to_ids(tokenized_question)
combined_ids = question_ids
segments_ids = [0] * len(question_ids)

tokens_tensor = torch.tensor([combined_ids])
segments_tensor = torch.tensor([segments_ids])

model.eval()
predictions = model(tokens_tensor, segments_tensor)  # 1 x len(combined_ids) x vocab size
predicted_index = torch.topk(predictions[0, masked_index], 20)[1].tolist()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print(predicted_token)
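
# Hedged follow-up to the snippet above (not in the original): the scores
# returned by the masked LM are unnormalised logits, so a softmax over the
# vocabulary dimension turns the top-k predictions into probabilities.
import torch.nn.functional as F

probs = F.softmax(predictions[0, masked_index], dim=-1)
top_probs, top_ids = torch.topk(probs, 20)
for token, prob in zip(tokenizer.convert_ids_to_tokens(top_ids.tolist()),
                       top_probs.tolist()):
    print("{}\t{:.4f}".format(token, prob))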
Example No. 18
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    ## Other parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    parser.add_argument("--case_file", type=str)
    parser.add_argument("--json_file", type=str)
    parser.add_argument("--base2index", type=str)
    parser.add_argument("--n_best", type=int, default=5)
    parser.add_argument("--n_sample", type=int, default=10)
    parser.add_argument("--sampling_prob", type=float, default=0.5)
    parser.add_argument("--fill_mode",
                        type=str,
                        default=None,
                        help="Choose from 'best_n', 'best_n_surface', ...")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    if args.fill_mode.startswith("predicate"):
        examples, sent_options, instance_options = read_examples_and_mask_pred(
            args.input_file, args.json_file)
    else:
        examples, sent_options, instance_options = read_examples_and_mask(
            args.input_file, args.case_file)

    # add
    # split_sentence_dir = "/work01/ryuto/data/NTC_BERT_split"
    # split_sentence_file = os.path.join(split_sentence_dir, os.path.basename(args.output_file))
    # if os.path.exists(split_sentence_file):
    #     os.remove(split_sentence_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer)

    # features = convert_examples_to_features(
    #     examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    all_subwords = [feature.tokens for feature in features]

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    model.eval()

    config = FillerConfig(subword_vocab=tokenizer.vocab,
                          pre_trained_vocab_file=args.base2index,
                          json_file=args.json_file,
                          sent_options=sent_options,
                          instance_options=instance_options,
                          all_subwords=all_subwords)

    if args.fill_mode == "n_best":
        filler = BestNTokenFiller(config, n_best=args.n_best, mode="json")
    elif args.fill_mode == "n_best_surface":
        filler = BestNTokenFiller(config, n_best=args.n_best, mode="surface")
    elif args.fill_mode == "multi_sampling":
        filler = MultiTokenFiller(config,
                                  n_sample=args.n_sample,
                                  prob=args.sampling_prob,
                                  mode="json")
    elif args.fill_mode == "multi_sampling_surface":
        filler = MultiTokenFiller(config,
                                  n_sample=args.n_sample,
                                  prob=args.sampling_prob,
                                  mode="surface")
    elif args.fill_mode == "predicate":
        filler = BestNTokenPredicateFiller(config,
                                           n_best=args.n_best,
                                           mode="json")
    elif args.fill_mode == "predicate_surface":
        filler = BestNTokenPredicateFiller(config,
                                           n_best=args.n_best,
                                           mode="surface")
    elif args.fill_mode == "random":
        filler = RandomNTokenFiller(config, n_sample=args.n_best, mode="json")
    elif args.fill_mode == "random_surface":
        filler = RandomNTokenFiller(config,
                                    n_sample=args.n_best,
                                    mode="surface")
    elif args.fill_mode == "sampling":
        filler = SamplingTokenFiller(config, n_sample=args.n_best, mode="json")
    elif args.fill_mode == "sampling_surface":
        filler = SamplingTokenFiller(config,
                                     n_sample=args.n_best,
                                     mode="surface")
    else:
        raise ValueError("Unsupported Value: {}".format(args.fill_mode))

    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            prediction = model(input_ids,
                               token_type_ids=None,
                               attention_mask=input_mask)

            for scores in prediction:
                instances = filler(scores)
                if instances is not None:
                    print("\n".join(instances), file=writer)

        instances = filler.pop()
        if instances:
            print("\n".join(instances), file=writer)
    with open(args.output_file + ".distribution", "w") as fo:
        json.dump(filler.predict_token_distribution, fo)
Example No. 19
0
def attack(fuzz_val,
           top_k_words,
           qrs,
           wts,
           sample_index,
           text_ls,
           true_label,
           predictor,
           stop_words_set,
           word2idx,
           idx2word,
           cos_sim,
           word_embedding,
           sim_predictor=None,
           import_score_threshold=-1.,
           sim_score_threshold=0.5,
           sim_score_window=15,
           synonym_num=50,
           batch_size=32):
    rows = []
    nlp = spacy.load('en_core_web_sm')
    masked_lang_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    changed_with = []
    doc = nlp(' '.join(text_ls))
    text = []
    for sent in doc.sents:
        for token in sent:
            text.append(token.text)
    tok_text = []
    for item in text:
        ap = item.find("'")
        if ap >= 0:
            tok_text.append(item[0:ap])
            tok_text.append("'")
            tok_text.append(item[ap + 1:len(item)])
        else:
            tok_text.append(item)
    text = []
    for item in tok_text:
        if len(item) > 0:
            text.append(item)

    text_ls = text[:]

    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0, [], []
    else:

        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)
        # get importance score
        leave_1_texts = [
            text_ls[:ii] + ['<oov>'] + text_ls[min(ii + 1, len_text):]
            for ii in range(len_text)
        ]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = (
            orig_prob - leave_1_probs[:, orig_label] +
            (leave_1_probs_argmax != orig_label).float() *
            (leave_1_probs.max(dim=-1)[0] - torch.index_select(
                orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy()

        # get words to perturb ranked by importance score for word in words_perturb
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores),
                                 key=lambda x: x[1],
                                 reverse=True):
            try:
                if score > import_score_threshold and text_ls[
                        idx] not in stop_words_set and len(text_ls[idx]) > 2:
                    words_perturb.append((idx, score))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls,
                      len(leave_1_texts))
        #return '', 0, orig_label, orig_label, 0, [], words_perturb
        # find synonyms
        words_perturb_idx = [
            word2idx[word] for idx, word in words_perturb if word in word2idx
        ]
        #synonym_words, synonym_values, synonyms_dict = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, synonym_num, -1.0)
        # start replacing and attacking
        text_prime = text_ls[:]
        sims = []
        text_cache = text_prime[:]
        num_changed = 0
        for idx, score in words_perturb:
            #print(text_ls[idx])
            text_range_min, text_range_max = calc_window(idx, 3, 10, len_text)

            sliced_text = text_prime[text_range_min:text_range_max]
            #print(sliced_text)
            new_index = idx - text_range_min
            #print(sliced_text[new_index])
            masked_idx = new_index

            tokens, words, position = gen.convert_sentence_to_token(
                ' '.join(sliced_text), 1000, tokenizer)
            assert len(words) == len(position)

            len_tokens = len(tokens)

            mask_position = position[masked_idx]

            if isinstance(mask_position, list):
                feature = gen.convert_whole_word_to_feature(
                    tokens, mask_position, 1000, tokenizer)
            else:
                feature = gen.convert_token_to_feature(tokens, mask_position,
                                                       1000, tokenizer)

            tokens_tensor = torch.tensor([feature.input_ids])
            token_type_ids = torch.tensor([feature.input_type_ids])
            attention_mask = torch.tensor([feature.input_mask])
            tokens_tensor = tokens_tensor.to('cuda')
            token_type_ids = token_type_ids.to('cuda')
            attention_mask = attention_mask.to('cuda')
            #new_probs = predictor(new_texts, batch_size=batch_size)
            masked_lang_model.to('cuda')
            masked_lang_model.eval()
            ps = PorterStemmer()

            with torch.no_grad():
                prediction_scores = masked_lang_model(tokens_tensor,
                                                      token_type_ids,
                                                      attention_mask)

            if isinstance(mask_position, list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(50)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(50)

            pre_tokens = tokenizer.convert_ids_to_tokens(
                predicted_top[1].cpu().numpy())
            synonyms_initial = gen.substitution_generation(
                words[masked_idx], pre_tokens, predicted_top[0].cpu().numpy(),
                ps, 50)
            new_texts = []
            avg = []
            synonyms = []
            assert words[masked_idx] == text_ls[idx]
            #print(synonyms)
            for candidate_word in synonyms_initial:
                if candidate_word in word_embedding and words[
                        masked_idx] in word_embedding:
                    candidate_similarity = calc_similarity(
                        word_embedding[words[masked_idx]],
                        word_embedding[candidate_word])
                    avg.append(candidate_similarity)
                    #print(words[masked_idx], candidate_similarity, candidate_word)
                    if candidate_similarity >= 0.2:
                        new_texts.append(text_prime[:idx] + [candidate_word] +
                                         text_prime[min(idx + 1, len_text):])
                        synonyms.append(candidate_word)
                else:
                    new_texts.append(text_prime[:idx] + [candidate_word] +
                                     text_prime[min(idx + 1, len_text):])
                    synonyms.append(candidate_word)
            #print(len(new_texts))
            if len(new_texts) == 0:
                continue

            text_range_min, text_range_max = calc_window(
                idx, half_sim_score_window, sim_score_window, len_text)
            semantic_sims = \
            sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                       list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]
            sims.append(np.sum(semantic_sims) / len(semantic_sims))

            new_probs_mask = np.ones(
                len(new_texts)
            )  #(orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [
                criteria.get_pos(new_text[max(idx - 4, 0):idx +
                                          5])[min(4, idx)]
                if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                for new_text in new_texts
            ]
            pos_mask = np.array(
                criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask
            new_vals = semantic_sims * new_probs_mask
            index = []
            mini = 2
            for i in range(len(new_vals)):
                if new_vals[i] > 0:
                    index.append((new_vals[i], i))
            if len(index) == 0:
                continue
            new_texts1 = [new_texts[ind] for val, ind in index]
            #print(len(new_texts1))
            num_queries += len(new_texts1)
            if num_queries > qrs:
                return '', 0, orig_label, orig_label, 0, [], []
            new_probs = predictor(new_texts1, batch_size=batch_size)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            pr = (orig_label != torch.argmax(new_probs,
                                             dim=-1)).data.cpu().numpy()
            if np.sum(pr) > 0:
                text_prime[idx] = synonyms[index[pr.argmax(
                )][1]]  #synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label]
                new_label_prob_min, new_label_prob_argmin = torch.min(
                    new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[index[new_label_prob_argmin][1]]
                    num_changed += 1
            text_cache = text_prime[:]

            if fuzz.token_set_ratio(' '.join(text_ls),
                                    ' '.join(text_cache)) < fuzz_val:
                return ' '.join(
                    text_prime), num_changed, orig_label, torch.argmax(
                        predictor([text_prime
                                   ])), num_queries, words_perturb, sims
        return ' '.join(text_prime), num_changed, orig_label, torch.argmax(
            predictor([text_prime])), num_queries, words_perturb, sims
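
# Hedged helper sketches: calc_window and calc_similarity are referenced above
# but not defined in this snippet. These stand-ins only illustrate plausible
# behaviour, not the authors' implementations: calc_window clamps a context
# window around idx to the text boundaries, and calc_similarity is a cosine
# similarity between two embedding vectors.
import numpy as np

def calc_window(idx, half_window, window, len_text):
    if half_window <= idx and idx + half_window < len_text:
        return idx - half_window, idx + half_window + 1
    if idx < half_window:
        return 0, min(window, len_text)
    return max(0, len_text - window), len_text

def calc_similarity(vec_a, vec_b):
    vec_a = np.asarray(vec_a, dtype=float)
    vec_b = np.asarray(vec_b, dtype=float)
    return float(np.dot(vec_a, vec_b) /
                 (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))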
Example No. 20
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    
    parser.add_argument("--eval_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The evaluation data dir.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")

    parser.add_argument("--output_SR_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory of writing substitution selection.")
    parser.add_argument("--word_embeddings",
                        default=None,
                        type=str,
                        required=True,
                        help="The path of word embeddings")
    parser.add_argument("--word_frequency",
                        default=None,
                        type=str,
                        required=True,
                        help="The path of word frequency.")
    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument("--max_seq_length",
                        default=250,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")

    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--num_selections",
                        default=20,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_eval_epochs",
                        default=1,
                        type=int,
                        help="Total number of training epochs to perform.")

    parser.add_argument("--prob_mask",
                        default=0.5,
                        type=float,
                        help="Proportion of the masked words in first sentence. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--ppdb",
                        default="./ppdb-2.0-tldr",
                        type=str,
                        required=True,
                        help="The path of word frequency.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()



    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if  not args.do_eval:
        raise ValueError("At least `do_eval` must be True.")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)


    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model,output_attentions=True)
    
    model.to(device)

    output_sr_file = open(args.output_SR_file,"a+")

    print("Loading embeddings ...")
    
    wordVecPath = args.word_embeddings
    #wordVecPath = "/media/qiang/ee63f41d-4004-44fe-bcfd-522df9f2eee8/glove.840B.300d.txt"

    fasttext_dico, fasttext_emb = getWordmap(wordVecPath)

    #stopword = set(stopwords.words('english'))
    word_count_path = args.word_frequency
    #word_count_path = "word_frequency_wiki.txt"
    word_count = getWordCount(word_count_path)

    ps = PorterStemmer()

    print("loading PPDB ...")
    ppdb_path = args.ppdb
    ppdb_model = Ppdb(ppdb_path)

    CGBERT = []

    substitution_words = []
   
    num_selection = args.num_selections

    window_context = 11

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        
     
        fileName = args.eval_dir.split('/')[-1][:-4]
        if fileName=='lex.mturk':
            eval_examples, mask_words, mask_labels = read_eval_dataset(args.eval_dir)
        else:
            eval_examples, mask_words, mask_labels = read_eval_index_dataset(args.eval_dir)

       
        eval_size = len(eval_examples)
        print("***** Running evaluation *****")
        print("  Num examples = %d", eval_size)
        #logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()

        for i in range(eval_size):

            print('Sentence {} rankings: '.format(i))
            #output_sr_file.write(str(i))
            #output_sr_file.write(' sentence: ')
            #output_sr_file.write('\n')
            print(eval_examples[i])
            print(mask_words[i])
            
            tokens, words, position = convert_sentence_to_token(eval_examples[i], args.max_seq_length, tokenizer)

            assert len(words)==len(position)

            mask_index = words.index(mask_words[i])

            mask_context = extract_context(words,mask_index,window_context)

            len_tokens = len(tokens)

            mask_position = position[mask_index]
 
            if isinstance(mask_position,list):
                feature = convert_whole_word_to_feature(tokens, mask_position, args.max_seq_length, tokenizer, args.prob_mask)
            else:
                feature = convert_token_to_feature(tokens, mask_position, args.max_seq_length, tokenizer, args.prob_mask)

            tokens_tensor = torch.tensor([feature.input_ids])

            token_type_ids = torch.tensor([feature.input_type_ids])

            attention_mask = torch.tensor([feature.input_mask])

            tokens_tensor = tokens_tensor.to('cuda')
            token_type_ids = token_type_ids.to('cuda')
            attention_mask = attention_mask.to('cuda')

            # Predict all tokens
            with torch.no_grad():
                all_attentions,prediction_scores = model(tokens_tensor, token_type_ids,attention_mask)

            if isinstance(mask_position,list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(80)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(80)
                #print(predicted_top[0].cpu().numpy())
            pre_tokens = tokenizer.convert_ids_to_tokens(predicted_top[1].cpu().numpy())
            
            #print(predicted_top[0].cpu().numpy())

            sentence = eval_examples[i].lower()
            words = word_tokenize(sentence)

            words_tag = nltk.pos_tag(words)

            complex_word_index = words.index(mask_words[i])

            complex_word_tag = words_tag[complex_word_index][1]

   

            complex_word_tag = preprocess_tag(complex_word_tag)
            
            cgPPDB = ppdb_model.predict(mask_words[i],complex_word_tag)

            cgBERT = BERT_candidate_generation(mask_words[i], pre_tokens, predicted_top[0].cpu().numpy(), ps, args.num_selections)

            print(cgBERT)
            
            CGBERT.append(cgBERT)
          
            pre_word = substitution_ranking(mask_words[i], mask_context, cgBERT, fasttext_dico, fasttext_emb,word_count,cgPPDB,tokenizer,model,mask_labels[i])


            substitution_words.append(pre_word)

        
        potential,precision,recall,F_score=evaulation_SS_scores(CGBERT, mask_labels)
        print("The score of evaluation for BERT candidate generation")
        print(potential,precision,recall,F_score)

        output_sr_file.write(str(args.num_selections))
        output_sr_file.write('\t')
        output_sr_file.write(str(potential))
        output_sr_file.write('\t')
        output_sr_file.write(str(precision))
        output_sr_file.write('\t')
        output_sr_file.write(str(recall))
        output_sr_file.write('\t')
        output_sr_file.write(str(F_score))
        output_sr_file.write('\t')
        

        precision,accuracy,changed_proportion=evaulation_pipeline_scores(substitution_words, mask_words, mask_labels)
       	print("The score of evaluation for full LS pipeline")
        print(precision,accuracy,changed_proportion)
        output_sr_file.write(str(precision))
        output_sr_file.write('\t')
        output_sr_file.write(str(accuracy))
        output_sr_file.write('\t')
        output_sr_file.write(str(changed_proportion))
        output_sr_file.write('\n')

        output_sr_file.close()
Example No. 21
0
def main():
    save_every_epoch = False

    args, train_dataloader, t_total, device, n_gpu = load_data()
    print("**********************************************************")
    print(args)
    # Prepare model
    model = BertForMaskedLM.from_pretrained(
        args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0

    model.train()

    save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
    if not os.path.exists(save_model_dir):
        os.mkdir(save_model_dir)

    for e in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss, avg_loss, avg_acc = 0, 0, 0.
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            avg_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1
            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0

        if save_every_epoch:
            save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(
                e + 1) + modified
            save_model_path = os.path.join(save_model_dir, save_model_name)
            torch.save(model, save_model_path)
        else:
            if (e + 1) % 10 == 0:
                save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(
                    e + 1) + modified
                save_model_path = os.path.join(save_model_dir, save_model_name)
                torch.save(model, save_model_path)
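
# Hedged follow-up (not part of the original script): because the whole model
# object is pickled with torch.save() above, it can be restored later with
# torch.load(), as long as BertForMaskedLM (and DataParallel, if it was used)
# is importable in the loading process. The checkpoint path below is hypothetical.
import torch

restored_model = torch.load("path/to/BertForMaskedLM_checkpoint", map_location="cpu")
restored_model.eval()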
Example No. 22
0
import numpy as np
from pytorch_pretrained_bert.tokenization import load_vocab, BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import argparse
from tqdm import tqdm, trange
import os
import re

base_path = os.path.dirname(os.path.abspath(__file__))

tokenizer = BertTokenizer(vocab_file='{}/data/vocab.txt'.format(base_path),
                          do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('checkpoint/')
model.to(device)
model.eval()

vocab = load_vocab('{}/data/vocab.txt'.format(base_path))
inv_vocab = {v: k for k, v in vocab.items()}


def getMI(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens.insert(0, "[CLS]")
    tokens.append("[SEP]")
    tokens_length = len(tokens)
    result = []
    for i, token in enumerate(tokens):
        # tokens preprocessing
Example No. 23
0
def run_aug(args, save_every_epoch=False):
    processors = {
        # you can add your processor here
        "TREC": AugProcessor,
        "stsa.fine": AugProcessor,
        "stsa.binary": AugProcessor,
        "mpqa": AugProcessor,
        "rt-polarity": AugProcessor,
        "subj": AugProcessor,
        "squad":SquadProcessor,
    }

    task_name = args.task_name
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    args.data_dir = os.path.join(args.data_dir, task_name)
    args.output_dir = os.path.join(args.output_dir, task_name)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)
    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    #dev_examples = processor.get_dev_examples(args.data_dir)
    #train_examples.extend(dev_examples)
    num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs)

    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)

    if task_name == 'stsa.fine':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(5, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)
    elif task_name == 'TREC':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(6, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=0.02)

    model.cuda()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,lr=args.learning_rate,
                         warmup=args.warmup_proportion,t_total=t_total)

    global_step = 0
    if task_name = "squad":
        train_features = convert_examples_to_features_squad(
            train_examples, label_list, args.max_seq_length, tokenizer)
Example No. 24
0
def main():
    parser = create_parser()
    args = parser.parse_args()

    if path.exists(args.output_file):
        raise FileExistsError("'{}' already exists.".format(args.output_file))

    # tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if args.what_replace == "word":
        with open(args.input_file) as fi:
            instance = json.loads(next(fi))
            keys = instance.keys()
            logger.info(keys)
            assert "original_surfaces" in keys

        if args.juman:
            from pyknp import Juman
            juman = Juman()
            vocab = set_vocab(args.vocab)
        else:
            juman = None
            vocab = None
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    elif args.what_replace == "vec":
        if not args.train_matrix:
            raise ValueError("Please enter 'train_matrix'")
        if not args.map:
            raise ValueError("Please enter 'map'")
        model = BertModel.from_pretrained(args.bert_model)
    else:
        raise ValueError("Unsupported value: '{}'".format(args.what_replace))
    model.to(device)
    model.eval()

    logger.info(args)

    # batch generator
    if args.how_predict == "single":
        batch_generator = batch_generator_with_single(args.input_file,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.batch_size, device,
                                                      args.data_limit)
    elif args.how_predict == "multi":
        batch_generator = batch_generator_with_multi(args.input_file,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     args.batch_size, device,
                                                     args.data_limit)
    else:
        raise ValueError("Unsupported value: '{}'".format(args.how_predict))

    # tmp_file = create_tmp_file()
    tmp_file = ".tmp." + path.basename(args.input_file) + "." + path.basename(
        args.output_file)
    logger.info("Tmp file: '{}'".format(tmp_file))
    logger.info("Start BERT prediction")

    len_file = length_file(args.input_file)
    total = (1 + len_file // args.batch_size) * (1 if args.how_predict
                                                 == "multi" else 20)
    logger.info("wc -l '{}' = {}".format(args.input_file, len_file))

    # predict
    fo = open(tmp_file, "w")
    for batch_inputs, batch_att_mask, batch_target_ids, batch_position in tqdm(
            batch_generator, total=total):
        predict = model(batch_inputs,
                        token_type_ids=None,
                        attention_mask=batch_att_mask)
        if args.what_replace == "vec":
            predict = predict[0][-1]  # use the last layer of encoded_layers
        target_predict = predict[batch_target_ids].cpu()

        assert len(target_predict) == len(batch_position)

        if args.what_replace == "word":
            converter = predict_to_words(target_predict, batch_position, juman,
                                         vocab, tokenizer, args.how_select)
        elif args.what_replace == "vec":
            converter = predict_to_vectors(target_predict, batch_position)
        else:
            raise ValueError("Unsupported value: '{}'".format(
                args.what_replace))

        for line in converter:
            print(json.dumps(line), file=fo)
    fo.close()

    logger.info("Start replace")
    tmp_f = open(tmp_file)

    # replace
    if args.what_replace == "vec":
        train_vec_file = h5py.File(args.train_matrix, "r")
        fo = h5py.File(args.output_file, 'w')
        with open(args.map) as fi:
            mapping_pseudo_to_train: dict = json.load(fi)

        for instance in tqdm(read_instance(args.input_file)):
            unique_id = str(instance["unique_id"])
            mask_ids = [
                idx for idx, token in enumerate(instance["surfaces"])
                if token == MASK
            ]
            train_vec = train_vec_file.get(
                mapping_pseudo_to_train[str(unique_id)])[()]
            assert len(train_vec) == len(instance["tokens"])

            for mask_idx in mask_ids:
                tmp_line = json.loads(next(tmp_f))
                assert unique_id == str(tmp_line["unique_id"])
                assert instance["sentence id"] == tmp_line["sentence id"]
                assert instance["file name"] == tmp_line["file name"]
                assert mask_idx == tmp_line["mask idx"]
                train_vec[mask_idx] = np.array(tmp_line["predict"])

            fo.create_dataset(unique_id, data=train_vec)

        train_vec_file.close()
        fo.close()

    elif args.what_replace == "word":
        fo = open(args.output_file, 'w')
        for n, instance in tqdm(enumerate(read_instance(args.input_file))):
            unique_id = str(instance["unique_id"])
            mask_ids = [
                idx for idx, token in enumerate(instance["surfaces"])
                if token == MASK
            ]
            new_instance = copy.deepcopy(instance)
            tokens = new_instance["tokens"]
            surfaces = new_instance["surfaces"]

            for mask_idx in mask_ids:
                tmp_line = json.loads(next(tmp_f))
                assert unique_id == str(tmp_line["unique_id"])
                assert instance["sentence id"] == tmp_line["sentence id"]
                assert instance["file name"] == tmp_line["file name"]
                assert mask_idx == tmp_line["mask idx"]
                token_idx, surf = tmp_line["predict"]
                tokens[mask_idx] = token_idx
                surfaces[mask_idx] = surf

            new_instance["tokens"] = tokens
            new_instance["surfaces"] = surfaces
            new_instance["mask_ids"] = mask_ids
            assert len([
                idx for idx, token in enumerate(new_instance["surfaces"])
                if token == MASK
            ]) == 0

            print(json.dumps(new_instance), file=fo)
            if n < 10:
                logger.debug("".join(instance["surfaces"]))
                logger.debug("".join(new_instance["original_surfaces"]))
                logger.debug("".join(new_instance["surfaces"]))
        fo.close()
    else:
        raise ValueError("Unsupported value: '{}'".format(args.what_replace))

    tmp_f.close()
    logger.info("delete: {}".format(tmp_file))
    remove(tmp_file)
    logger.info("done")
Ejemplo n.º 25
0
def main():
    parser = create_parser()
    args = parser.parse_args()
    logger.info(args)

    random.seed(args.seed)

    with open(ARGUMENT_RATE) as fi:
        argument_w = {
            line.split()[0]: float(line.rstrip("\n").split()[-1])
            for line in fi
        }
    test_create_mask_indices(argument_w)

    if path.exists(args.out_file):
        raise FileExistsError("Already exists: {}".format(args.out_file))
    if args.where_mask not in WHERE_MASK:
        raise ValueError("Unsupported mode = '{}'\nChoose from: {}".format(
            args.where_mask, WHERE_MASK))
    if args.which_arg not in WHICH_ARG:
        raise ValueError("Unsupported mode = '{}'\nChoose from: {}".format(
            args.which_arg, WHICH_ARG))

    logger.info("Where to mask: '{}'".format(args.where_mask))
    logger.info("Whether to mask the argument: '{}'".format(args.which_arg))
    logger.info("Random rate: {}".format(args.random_rate))
    logger.info("Minus: {}".format(args.minus))
    logger.info("How select tokens: {}".format(args.how_select))
    logger.info("How many tokens to predict at once: {}".format(args.how_many))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info("device: {}".format(device))
    logger.info("BERT model: {}".format(args.bert_model))
    logger.debug("Loading BERT model...")
    max_seq_length = 128
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False)
    black_list = []

    logger.debug("sort by length of tokens")
    instances = [instance for instance in tqdm(read_file(args.in_file))]
    sorted_instances = sorted(instances, key=lambda x: len(x["surfaces"]))
    logger.debug("sort is done")

    fo = open(args.out_file, "w")
    logger.debug("Start to fill the mask")
    for instance in tqdm(sorted_instances[10000:15000]):
        for pas in instance["pas"]:
            if len(set(pas["args"])) == 1:
                continue
            if "zero" not in pas["types"]:
                continue

            predict_sents = []
            mask_indices = create_mask_indices(instance=instance,
                                               pas=pas,
                                               where_mask=args.where_mask,
                                               which_arg=args.which_arg,
                                               random_rate=args.random_rate,
                                               minus=args.minus,
                                               argument_w=argument_w)
            if not mask_indices:
                continue

            original_tokens = copy.deepcopy(instance["surfaces"])
            masked_tokens = [
                MASK if idx in mask_indices else surf
                for idx, surf in enumerate(instance["surfaces"])
            ]
            feature = InputFeatures(tokens=masked_tokens,
                                    tokenizer=tokenizer,
                                    max_seq_length=max_seq_length)

            if feature.len > max_seq_length:
                continue

            if args.how_select == "beam":
                output_sents, output_tokens = prediction_with_beam_search(
                    device=device,
                    model=model,
                    feature=feature,
                    tokenizer=tokenizer,
                    black_list=black_list,
                    k=args.topk)
                for sent in output_sents:
                    predict_sents.append(sent[1:feature.len - 1])

            else:
                if args.how_many == "single":
                    predict_tokens = prediction_single(
                        device=device,
                        model=model,
                        feature=feature,
                        tokenizer=tokenizer,
                        how_select=args.how_select,
                        black_list=black_list)
                elif args.how_many == "multi":
                    predict_tokens = prediction_multi(
                        device=device,
                        model=model,
                        feature=feature,
                        tokenizer=tokenizer,
                        how_select=args.how_select,
                        black_list=black_list)
                else:
                    raise ValueError("Unsupported value: {}".format(
                        args.how_many))

                assert len(predict_tokens) == len(feature.token_mask_ids)
                # tokens = feature.tokens
                # for idx, p_token in zip(feature.token_mask_ids, predict_tokens):
                #     tokens[idx] = p_token

                filled_tokens = copy.deepcopy(masked_tokens)
                for idx, p_token in zip(sorted(list(mask_indices)),
                                        predict_tokens):
                    filled_tokens[idx] = p_token
                predict_sents.append(filled_tokens)

            print("{}: {}".format(instance["file name"],
                                  instance["sentence id"]),
                  file=fo)
            for idx, tokens in enumerate(
                [original_tokens, masked_tokens, *predict_sents]):
                case_ids = [(c_id, case)
                            for c_id, case in enumerate(pas["args"])
                            if case != 3]
                tokens[pas["p_id"]] = add_color(tokens[pas["p_id"]],
                                                "underline")
                for c_id, case in case_ids:
                    tokens[c_id] = add_color(tokens[c_id], CASE_COLOR[case])
                print("{} :{}".format(idx, " ".join(tokens)), file=fo)
            print("\n", file=fo)
    fo.close()
    logger.info("done")
Ejemplo n.º 26
0
def run_aug(args, save_every_epoch=False):
    processors = {
        # you can add your processor here
        "TREC": AugProcessor,
        "stsa.fine": AugProcessor,
        "stsa.binary": AugProcessor,
        "mpqa": AugProcessor,
        "rt-polarity": AugProcessor,
        "subj": AugProcessor,
    }

    task_name = args.task_name
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    args.data_dir = os.path.join(args.data_dir, task_name)
    args.output_dir = os.path.join(args.output_dir, task_name)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)
    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    #dev_examples = processor.get_dev_examples(args.data_dir)
    #train_examples.extend(dev_examples)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    # Prepare model
    model = BertForMaskedLM.from_pretrained(
        args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)

    if task_name == 'stsa.fine':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            5, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)
    elif task_name == 'TREC':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            6, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)

    model.cuda()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    all_init_ids = torch.tensor([f.init_ids for f in train_features],
                                dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_masked_lm_labels = torch.tensor(
        [f.masked_lm_labels for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask,
                               all_segment_ids, all_masked_lm_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
    if not os.path.exists(save_model_dir):
        os.mkdir(save_model_dir)
    MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]

    origin_train_path = os.path.join(args.output_dir, "train_origin.tsv")
    save_train_path = os.path.join(args.output_dir, "train.tsv")
    shutil.copy(origin_train_path, save_train_path)
    best_test_acc = train_text_classifier.train("aug_data")
    print("before augment best acc:{}".format(best_test_acc))

    for e in trange(int(args.num_train_epochs), desc="Epoch"):
        avg_loss = 0.

        for step, batch in enumerate(train_dataloader):
            model.train()
            batch = tuple(t.cuda() for t in batch)
            _, input_ids, input_mask, segment_ids, masked_ids = batch
            loss = model(input_ids, segment_ids, input_mask, masked_ids)
            loss.backward()
            avg_loss += loss.item()
            optimizer.step()
            model.zero_grad()
            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0
        torch.cuda.empty_cache()
        shutil.copy(origin_train_path, save_train_path)
        save_train_file = open(save_train_path, 'a')
        tsv_writer = csv.writer(save_train_file, delimiter='\t')
        #tsv_writer.writerow(['sentence', 'label'])
        for step, batch in enumerate(train_dataloader):
            model.eval()
            batch = tuple(t.cuda() for t in batch)
            init_ids, _, input_mask, segment_ids, _ = batch
            input_lens = [sum(mask).item() for mask in input_mask]
            #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens])
            masked_idx = np.squeeze(
                [np.random.randint(0, l, max(l // 7, 2)) for l in input_lens])
            for ids, idx in zip(init_ids, masked_idx):
                ids[idx] = MASK_id
            predictions = model(init_ids, segment_ids, input_mask)
            for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions,
                                            segment_ids):
                #pred = torch.argsort(pred)[:,-e-1][idx]
                '''
                pred = torch.argsort(preds)[:,-1][idx]
                ids[idx] = pred
                new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
                new_str = rev_wordpiece(new_str)
                tsv_writer.writerow([new_str, seg[0].item()])
                '''
                pred = torch.argsort(preds)[:, -2][idx]
                ids[idx] = pred
                new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy())
                new_str = rev_wordpiece(new_str)
                tsv_writer.writerow([new_str, seg[0].item()])
            torch.cuda.empty_cache()
        predictions = predictions.detach().cpu()
        torch.cuda.empty_cache()
        bak_train_path = os.path.join(args.output_dir,
                                      "train_epoch_{}.tsv".format(e))
        shutil.copy(save_train_path, bak_train_path)
        best_test_acc = train_text_classifier.train("aug_data")
        print("epoch {} augment best acc:{}".format(e, best_test_acc))
        if save_every_epoch:
            save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(
                e + 1)
            save_model_path = os.path.join(save_model_dir, save_model_name)
            torch.save(model, save_model_path)
        else:
            if (e + 1) % 10 == 0:
                save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(
                    e + 1)
                save_model_path = os.path.join(save_model_dir, save_model_name)
                torch.save(model, save_model_path)
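For clarity, a small illustrative sketch of the substitution idiom used in the augmentation loop above: at a masked position the top-ranked prediction usually reproduces the original word, so torch.argsort(preds)[:, -2] takes the second-ranked id instead. second_best_token is a hypothetical helper, not part of the example:

import torch

def second_best_token(pred_scores, position, tokenizer):
    # pred_scores: (seq_len, vocab_size) prediction scores for one sentence
    ranked = torch.argsort(pred_scores[position])  # ascending by score
    second_best_id = ranked[-2].item()             # skip the top-1 candidate
    return tokenizer.convert_ids_to_tokens([second_best_id])[0]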
Ejemplo n.º 27
0
def run_aug(args, save_every_epoch=False):
    processors = {
        # you can add your processor here
        "TREC": AugProcessor,
        "stsa.fine": AugProcessor,
        "stsa.binary": AugProcessor,
        "mpqa": AugProcessor,
        "rt-polarity": AugProcessor,
        "subj": AugProcessor,
    }

    task_name = args.task_name
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    args.data_dir = os.path.join(args.data_dir, task_name)
    args.output_dir = os.path.join(args.output_dir, task_name)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.makedirs(args.output_dir, exist_ok=True)
    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    train_examples = processor.get_train_examples(args.data_dir)
    #dev_examples = processor.get_dev_examples(args.data_dir)
    #train_examples.extend(dev_examples)
    num_train_steps = int(
        len(train_examples) / args.train_batch_size * args.num_train_epochs)

    # Prepare model
    model = BertForMaskedLM.from_pretrained(
        args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE)

    if task_name == 'stsa.fine':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            5, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)
    elif task_name == 'TREC':
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            6, 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)

    model.cuda()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    all_init_ids = torch.tensor([f.init_ids for f in train_features],
                                dtype=torch.long)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_masked_lm_labels = torch.tensor(
        [f.masked_lm_labels for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask,
                               all_segment_ids, all_masked_lm_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    model.train()

    save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
    if not os.path.exists(save_model_dir):
        os.mkdir(save_model_dir)
    for e in trange(int(args.num_train_epochs), desc="Epoch"):
        avg_loss = 0.

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.cuda() for t in batch)
            _, input_ids, input_mask, segment_ids, masked_ids = batch
            loss = model(input_ids, segment_ids, input_mask, masked_ids)
            loss.backward()
            avg_loss += loss.item()
            optimizer.step()
            model.zero_grad()
            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0
        if save_every_epoch:
            save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(
                e + 1)
            save_model_path = os.path.join(save_model_dir, save_model_name)
            torch.save(model, save_model_path)
        else:
            if (e + 1) % 10 == 0:
                save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(
                    e + 1)
                save_model_path = os.path.join(save_model_dir, save_model_name)
                torch.save(model, save_model_path)
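Both run_aug variants persist checkpoints with torch.save(model, path), which pickles the whole module. A minimal sketch of the alternative, saving only the state_dict, which is more portable across code changes; save_mlm is a hypothetical helper, not part of the original scripts:

import torch

def save_mlm(model, path, state_dict_only=True):
    if state_dict_only:
        # reload later with model.load_state_dict(torch.load(path))
        torch.save(model.state_dict(), path)
    else:
        # what the examples above do; ties the file to the exact class definition
        torch.save(model, path)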
Ejemplo n.º 28
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_model",
        default="/home/ryuto/data/jawiki-kurohashi-bert",
        type=str,
        help=
        "Path to a directory containing a BERT model, or the name of a BERT model. "
        "Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )

    # BERT model parameters
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Set this flag if you are using an uncased model. (If using a Japanese model, do not set this flag.)"
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument("--language",
                        type=str,
                        default="ja",
                        help="Choose from 'ja' or 'en' (default='ja').")

    # Data Augmentation Option
    parser.add_argument(
        "--how_select",
        dest='how_select',
        default="argmax",
        type=str,
        help="Choose from 'argmax' or 'sample' or 'beam'. (default='argmax')")
    parser.add_argument(
        "--how_many",
        default='multi',
        type=str,
        help="Choose from 'single' or 'multi'. (default='multi')")
    parser.add_argument('--topk', type=int, default=5, help="for beam search")

    # Hyper parameter
    parser.add_argument('--seed', type=int, default=2020)

    args = parser.parse_args()
    logger.info(args)

    # Seed
    random.seed(args.seed)
    logger.info("Seed: {}".format(args.seed))

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info("device: {}".format(device))
    logger.info("language: {}".format(args.language))
    logger.info("BERT model: {}".format(args.bert_model))
    logger.debug("Loading BERT model...")
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    logger.debug("Sending BERT model to device...")
    model.to(device)
    model.eval()

    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Input the sentence
    logger.info("How select tokens: {}".format(args.how_select))
    logger.info("How many tokens to predict at once: {}".format(args.how_many))
    print("Mask token is 'M'.")
    while True:
        text = input("Sentence: ")
        if text == "q":
            break
        black_list = input(
            "Black list of tokens (separator is ','): ").replace(
                " ", "").replace(" ", "").split(",")

        # Input feature
        feature = InputFeatures(text=text,
                                tokenizer=tokenizer,
                                max_seq_length=args.max_seq_length,
                                language=args.language)
        logger.debug(feature.tokens)
        if len(feature.token_mask_ids) == 0:
            print("Not found mask token (mask token is 'M').")
            continue

        if args.how_select == "beam":
            output_sents, output_tokens = prediction_with_beam_search(
                device=device,
                model=model,
                feature=feature,
                tokenizer=tokenizer,
                black_list=black_list,
                k=args.topk)
            for sent in output_sents:
                print(" ".join(sent[1:feature.len - 1]))

        else:
            if args.how_many == "single":
                predict_tokens = prediction_single(device=device,
                                                   model=model,
                                                   feature=feature,
                                                   tokenizer=tokenizer,
                                                   how_select=args.how_select,
                                                   black_list=black_list)
            elif args.how_many == "multi":
                predict_tokens = prediction_multi(device=device,
                                                  model=model,
                                                  feature=feature,
                                                  tokenizer=tokenizer,
                                                  how_select=args.how_select,
                                                  black_list=black_list)
            else:
                raise ValueError("Unsupported value: {}".format(args.how_many))

            assert len(predict_tokens) == len(feature.token_mask_ids)
            # tokens = feature.tokens
            # for idx, p_token in zip(feature.token_mask_ids, predict_tokens):
            #     tokens[idx] = p_token
            # print(" ".join(tokens[1:feature.len - 1]))

            filled_tokens = copy.deepcopy(feature.original_tokens)
            for idx, p_token in zip(feature.original_token_mask_ids,
                                    predict_tokens):
                filled_tokens[idx] = p_token
            print(" ".join(filled_tokens))
Ejemplo n.º 29
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--hybrid_attention",
                        action='store_true',
                        help="Whether to use hybrid attention")
    parser.add_argument("--continue_training",
                        action='store_true',
                        help="Continue training from a checkpoint")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.continue_training:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    if args.hybrid_attention:
        max_seq_length = args.max_seq_length
        attention_mask = torch.ones(12,
                                    max_seq_length,
                                    max_seq_length,
                                    dtype=torch.long)
        # left attention
        attention_mask[:2, :, :] = torch.tril(
            torch.ones(max_seq_length, max_seq_length, dtype=torch.long))
        # right attention
        attention_mask[2:4, :, :] = torch.triu(
            torch.ones(max_seq_length, max_seq_length, dtype=torch.long))
        # local attention, window size = 3
        attention_mask[4:6, :, :] = torch.triu(
            torch.tril(
                torch.ones(max_seq_length, max_seq_length, dtype=torch.long),
                1), -1)
        attention_mask = torch.cat(
            [attention_mask.unsqueeze(0) for _ in range(8)])
        attention_mask = attention_mask.to(device)
    else:
        attention_mask = None

    global_step = 0
    epoch_start = 0
    if args.do_train:
        if args.continue_training:
            # if checkpoint file exists, find the last checkpoint
            if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
                all_cp = os.listdir(args.output_dir)
                steps = [
                    int(re.search(r'_\d+', cp).group()[1:]) for cp in all_cp
                    if re.search(r'_\d+', cp)
                ]
                if len(steps) == 0:
                    raise ValueError(
                        "No existing checkpoint. Please do not use --continue_training."
                    )
                max_step = max(steps)
                # load checkpoint
                checkpoint = torch.load(
                    os.path.join(args.output_dir,
                                 'checkpoints_' + str(max_step) + '.pt'))
                logger.info("***** Loading checkpoint *****")
                logger.info("  Num steps = %d", checkpoint['global_step'])
                logger.info("  Num epoch = %d", checkpoint['epoch'])
                logger.info("  Loss = %d, %d", checkpoint['loss'],
                            checkpoint['loss_now'])
                model.module.load_state_dict(checkpoint['model'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                global_step = checkpoint['global_step']
                epoch_start = checkpoint['epoch']
                del checkpoint
            else:
                raise ValueError(
                    "No existing checkpoint. Please do not use --continue_training."
                )

        writer = SummaryWriter(log_dir=os.environ['HOME'])
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        tr_loss_1000 = 0
        for ep in trange(epoch_start, int(args.num_train_epochs),
                         desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                loss = model(input_ids,
                             segment_ids,
                             input_mask,
                             lm_label_ids,
                             hybrid_mask=attention_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                tr_loss_1000 += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                # log the training loss for every 1000 steps
                if global_step % 1000 == 999:
                    writer.add_scalar('data/loss', tr_loss_1000 / 1000,
                                      global_step)
                    logger.info("training steps: %s", global_step)
                    logger.info("training loss per 1000: %s",
                                tr_loss_1000 / 1000)
                    tr_loss_1000 = 0
                # save the checkpoint for every 10000 steps
                if global_step % 10000 == 0:
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model itself
                    output_file = os.path.join(
                        args.output_dir,
                        "checkpoints_" + str(global_step) + ".pt")
                    checkpoint = {
                        'model': model_to_save.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'epoch': ep,
                        'global_step': global_step,
                        'loss': tr_loss / nb_tr_steps,
                        'loss_now': tr_loss_1000
                    }
                    if args.do_train:
                        torch.save(checkpoint, output_file)
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model itself
            output_model_file = os.path.join(args.output_dir,
                                             "pytorch_model.bin_" + str(ep))
            if args.do_train:
                torch.save(model_to_save.state_dict(), output_model_file)
            logger.info("training loss: %s", tr_loss / nb_tr_steps)

        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
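The fp16 branch above rescales the learning rate by hand with warmup_linear (BertAdam handles this internally in the fp32 path). As far as I can tell from the old pytorch-pretrained-bert helper, the schedule is linear warmup followed by linear decay; a sketch under that assumption (warmup_linear_sketch is illustrative, not the library function):

def warmup_linear_sketch(progress, warmup=0.1):
    # progress = global_step / num_train_optimization_steps, in [0, 1]
    if progress < warmup:
        return progress / warmup   # linear warmup
    return 1.0 - progress          # linear decay to zero

# usage, mirroring the loop above:
# lr_this_step = args.learning_rate * warmup_linear_sketch(
#     global_step / num_train_optimization_steps, args.warmup_proportion)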
Ejemplo n.º 30
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--input_model_file",
        default=None,
        type=str,
        required=False,
        help="The input model file to load if not training, if any.")
    parser.add_argument(
        "--input_lm_model_file",
        default=None,
        type=str,
        required=False,
        help="The input model file to load for masked lm, if any.")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_mask_eval",
                        action='store_true',
                        help="Whether to run mask eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--mask_eval_outfile",
                        default="mask_eval_out.json",
                        type=str,
                        help="Where to store mask eval results (large).")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst": SstProcessor,
        "lmrd": LmrdProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        "sst": 2,
        "lmrd": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_mask_eval:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_mask_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank),
        num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        get_train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        train_features = next(get_train_features)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info(
            "  Num inputs = %d",
            len(train_features))  # can differ from Num examples: over-length sequences are dropped
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
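        # Shard the data across processes with DistributedSampler when running
        # distributed; otherwise shuffle with RandomSampler.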
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
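                # When accumulating gradients over several mini-batches, scale the
                # loss so the summed gradients match a single larger batch.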
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # FusedAdam does not apply the warmup schedule itself, so
                        # modify the learning rate with the special warm up BERT uses.
                        # BertAdam (the non-fp16 path) already handles this internally.
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / t_total, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Evaluate against the dev set after every epoch when --do_eval is set,
            # to track training progress.
            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank() == 0):
                eval_model(processor, args.data_dir, label_list,
                           args.max_seq_length, args.eval_batch_size,
                           tokenizer, device, model, tr_loss, nb_tr_steps,
                           args.do_train, global_step, args.output_dir, epoch)

    # Save a trained model
    model_to_save = model.module if hasattr(
        model, 'module') else model  # unwrap DataParallel/DDP; save only the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
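    # Only the state_dict is written, so the weights can later be reloaded with
    # from_pretrained(..., state_dict=...) as done below.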
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have previously fine-tuned
    if not args.do_train and args.input_model_file is not None:
        logger.info("Loading previously trained model from %s" %
                    args.input_model_file)
        model_state_dict = torch.load(args.input_model_file)
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=model_state_dict,
            num_labels=num_labels)
        model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_model(processor, args.data_dir, label_list, args.max_seq_length,
                   args.eval_batch_size, tokenizer, device, model, tr_loss,
                   nb_tr_steps, args.do_train, global_step, args.output_dir,
                   int(args.num_train_epochs))

    if args.do_mask_eval and (args.local_rank == -1
                              or torch.distributed.get_rank() == 0):

        # Load a trained model that you have previously fine-tuned for LM.
        if args.input_lm_model_file is not None:
            logger.info("Loading previously trained LM model from %s" %
                        args.input_lm_model_file)
            model_state_dict = torch.load(args.input_lm_model_file)
            maskedLMModel = BertForMaskedLM.from_pretrained(
                args.bert_model, state_dict=model_state_dict)
        else:
            maskedLMModel = BertForMaskedLM.from_pretrained(args.bert_model)
        maskedLMModel.to(device)

        mask_eval_model(processor, args.data_dir, label_list,
                        args.max_seq_length, args.eval_batch_size, tokenizer,
                        device, model, maskedLMModel, args.mask_eval_outfile)