Example #1
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file)."
    )
    # parser.add_argument("--model_type", default="bert", type=str,
    #                     help="The model architecture to be fine-tuned.")
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint for weights initialization.")

    # Whether to train with masked language modeling (MLM)
    parser.add_argument(
        "--mlm",
        action='store_true',
        help=
        "Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability",
        type=float,
        default=0.15,
        help="Ratio of tokens to mask for masked language modeling loss")

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path"
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)"
    )
    # Not entirely clear what this does
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )
    # ====== Training ======
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    # Validation
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    # ====== Training options ======
    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    # ====== GPU options ======
    # parser.add_argument("--n_gpu", default=1, type=int,
    #                     help="The number of GPUs to use for training")
    # tp = lambda x:list(map(int, x.split('.')))
    # parser.add_argument("--device_ids", default=None, type=tp,
    #                    help="The device IDs to use for training")
    # Weight decay
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        '--save_total_limit',
        type=int,
        default=None,
        help=
        'Limit the total number of checkpoints; older checkpoints in output_dir are deleted (no deletion by default)'
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    # ======= Distributed training =========
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    # if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
    #     raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
    #                      "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Barrier to make sure only the first process in distributed training download model & vocab

    # config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None)
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    jp_tokenizer = JumanTokenizer()

    # if args.block_size <= 0:
    #  args.block_size = bert_tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
    # args.block_size = min(args.block_size, bert_tokenizer.max_len_single_sentence)
    model = BertForMaskedLM.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    # ====== Freeze part of the BERT parameters =======
    # - Keep only the last encoder layer and the pooler unfrozen
    bert_last_layer = copy.deepcopy(model.bert.encoder.layer[-1])
    bert_pooler = copy.deepcopy(model.bert.pooler)
    # - Freeze all of BERT
    for param in model.bert.parameters():
        param.requires_grad = False
    # - Swap the unfrozen copies back in
    model.bert.encoder.layer[-1] = bert_last_layer
    model.bert.pooler = bert_pooler
    # =====================================

    assert all([
        param.requires_grad
        for param in model.bert.encoder.layer[-1].parameters()
    ])

    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier(
            )  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args,
                                                bert_tokenizer,
                                                jp_tokenizer,
                                                evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model,
                                     bert_tokenizer, jp_tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        bert_tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForMaskedLM.from_pretrained(args.output_dir)
        bert_tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

        # ====== Freeze part of the BERT parameters =======
        # - Keep only the last encoder layer and the pooler unfrozen
        bert_last_layer = copy.deepcopy(model.bert.encoder.layer[-1])
        bert_pooler = copy.deepcopy(model.bert.pooler)
        # - Freeze all of BERT
        for param in model.bert.parameters():
            param.requires_grad = False
        # - Swap the unfrozen copies back in
        model.bert.encoder.layer[-1] = bert_last_layer
        model.bert.pooler = bert_pooler
        # =====================================

        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            model = BertForMaskedLM.from_pretrained(checkpoint)

            # ====== Freeze part of the BERT parameters =======
            # - Keep only the last encoder layer and the pooler unfrozen
            bert_last_layer = copy.deepcopy(model.bert.encoder.layer[-1])
            bert_pooler = copy.deepcopy(model.bert.pooler)
            # - Freeze all of BERT
            for param in model.bert.parameters():
                param.requires_grad = False
            # - Swap the unfrozen copies back in
            model.bert.encoder.layer[-1] = bert_last_layer
            model.bert.pooler = bert_pooler
            # =====================================
            model.to(args.device)
            result = evaluate(args,
                              model,
                              bert_tokenizer,
                              jp_tokenizer,
                              prefix=prefix)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
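
Example #1 calls a set_seed(args) helper that is not shown. A minimal sketch consistent with how it is used above (this follows the usual pattern from the transformers example scripts, so treat it as an assumption rather than the author's exact code):

import random
import numpy as np
import torch

def set_seed(args):
    # Seed Python, NumPy and PyTorch (all GPUs) so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)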
Example #2
 def __init__(self):
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
     self.model = BertForMaskedLM.from_pretrained('bert-base-chinese')
     self.model.eval()
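
Example #2 only shows the constructor. A hypothetical way such a wrapper is typically used for masked prediction with the Chinese model (the sentence and the standalone loading are illustrative, not from the original class):

import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForMaskedLM.from_pretrained('bert-base-chinese')
model.eval()

text = '今天天气很[MASK]'
inputs = tokenizer(text, return_tensors='pt')
mask_pos = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
with torch.no_grad():
    logits = model(**inputs)[0]
# Top-5 candidate tokens for the masked position.
top_ids = logits[0, mask_pos[0]].topk(5).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top_ids))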
Example #3
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if args.LM == 'Bert':
    from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM

    config = BertConfig(vocab_size=28996,
                        max_position_embeddings=512,
                        num_attention_heads=12,
                        num_hidden_layers=12,
                        #type_vocab_size=2, default is 2
                        )
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    #model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.

elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM

    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture
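
A plausible follow-up to either branch above, using the count_parameters helper defined at the top of the example (the print call itself is illustrative):

print('%s: %d trainable parameters' % (args.LM, count_parameters(model)))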
Example #4
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pip install gradio
"""

import gradio as gr
import operator
import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained(
    "shibing624/macbert4csc-base-chinese")
model = BertForMaskedLM.from_pretrained("shibing624/macbert4csc-base-chinese")


def ai_text(text):
    with torch.no_grad():
        outputs = model(**tokenizer([text], padding=True, return_tensors='pt'))

    def get_errors(corrected_text, origin_text):
        sub_details = []
        for i, ori_char in enumerate(origin_text):
            if ori_char in [' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤']:
                # add unk word
                corrected_text = corrected_text[:i] + ori_char + corrected_text[
                    i:]
                continue
            if i >= len(corrected_text):
                continue
            if ori_char != corrected_text[i]:
Example #5
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        "Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
    )
    parser.add_argument("--model_type", default="bert", choices=["bert"])
    parser.add_argument("--model_name", default="bert-base-uncased", type=str)
    parser.add_argument(
        "--dump_checkpoint",
        default="serialization_dir/tf_bert-base-uncased_0247911.pth",
        type=str)
    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

    if args.model_type == "bert":
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = "bert"
    else:
        raise ValueError('args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}

    for w in ["word_embeddings", "position_embeddings"]:
        compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[
            f"{prefix}.embeddings.{w}.weight"]
    for w in ["weight", "bias"]:
        compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[
            f"{prefix}.embeddings.LayerNorm.{w}"]

    std_idx = 0
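    # The fragment stops here; the loop that copies the encoder layers is cut
    # off. An illustrative sanity check that could be added at this point (an
    # assumption, not part of the original script):
    assert torch.equal(
        compressed_sd["distilbert.embeddings.word_embeddings.weight"],
        state_dict[f"{prefix}.embeddings.word_embeddings.weight"])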
Example #6
@description:

refer https://github.com/voidful/BertGenerate/blob/master/Bert_Generate.ipynb
update bert_lstm model to seq2seq
"""

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from pycorrector import config

input_text = "[CLS] I go to school by bus [SEP] "
target_text = "我搭校车上学"
modelpath = config.bert_model_dir
tokenizer = BertTokenizer.from_pretrained(modelpath)
model = BertForMaskedLM.from_pretrained(modelpath)

# cuda
# model.to('cuda')


def get_example_pair(input_text, target_text):
    example_pair = dict()

    for i in range(0, len(target_text) + 1):
        tokenized_text = tokenizer.tokenize(input_text)
        tokenized_text.extend(target_text[:i])
        tokenized_text.append('[MASK]')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens]).to('cpu')
                                 dtype=torch.long)
    all_input_attention = torch.tensor(
        [input_attention for input_attention in input_attention],
        dtype=torch.long)
    all_input_maskLM = torch.tensor(
        [input_maskLM for input_maskLM in input_maskLM], dtype=torch.long)

    full_dataset = TensorDataset(all_input_id, all_input_pos,
                                 all_input_attention, all_input_maskLM)
    return full_dataset


# load model
token_type_size = 13
config = BertConfig.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.bert.embeddings.token_type_embeddings = nn.Embedding(
    token_type_size, config.hidden_size)
nlp = spacy.load(
    "model/spacy/en_core_web_md-2.3.1/en_core_web_md/en_core_web_md-2.3.1")

semantics_list = [
    "about such concepts as absurdity it knew nothing .",
]
ori_syntactic_list = ["he had no idea about such terms ."]
syntactic_list_in_dict, part_maskLM_embedding_list_in_dict = data_preprocess.extrapolate_syntactic(
    ori_syntactic_list, nlp)
token_embedding_id_list, segment_embedding_list, attention_embedding_list, maskLM_embedding_list = data_preprocess.get_embedding(
    semantics_list, ori_syntactic_list, syntactic_list_in_dict,
    part_maskLM_embedding_list_in_dict)
pos_embedding_list = data_preprocess_pos.get_pos_embedding(
def download_bert_model():
    return BertForMaskedLM.from_pretrained("bert-base-cased")
Example #9
 def __init__(self, model_name_or_path: str):
     super(BertPretrain, self).__init__()
     self.bert_model = BertForMaskedLM.from_pretrained(model_name_or_path)
Example #10
 def __init__(self, args):
     super().__init__()
     self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.ntxloss = NTXentLoss(temperature=args.temperature)
     self.args = args
Example #11
df_lineage = pd.read_csv(lineage[1], sep='\t', names=["Repo"])[:20000]

print("CSVs loaded")

docstrings_avg_vec = np.load(docstrings_vecs[1], allow_pickle=True)

config = BertConfig.from_json_file(model_path[1] + '/config.json')
config.output_hidden_states = True

print("Tokenizer and model initialized")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

device = torch.device('cpu')

model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config)
model.load_state_dict(
    torch.load(model_path[1] + "/pytorch_model.bin", map_location=device))
model.eval()

# Initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(docstrings_avg_vec)
index.createIndex({'post': 2}, print_progress=True)

print("Index made")


# Routes:
@app.route('/hello')
def hello_world():
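
The Flask routes in Example #11 are cut off. A sketch of how the HNSW index built above is usually queried with nmslib's knnQuery (the function name and k value are illustrative, not from the original route):

def nearest_docstrings(query_vec, k=10):
    # Returns (id, cosine distance) pairs for the k closest docstring vectors.
    ids, distances = index.knnQuery(query_vec, k=k)
    return list(zip(ids.tolist(), distances.tolist()))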
Example #12
 def __init__(self, args):
     super().__init__()
     self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self.bceloss = nn.BCEWithLogitsLoss()
     self.args = args
Example #13
    def __init__(self,
                 mlm_path: str = "bert-base-uncased",
                 k: int = 50,
                 threshold_pred_score: float = 0.3,
                 max_length: int = 512,
                 batch_size: int = 32,
                 replace_rate: float = 1.0,
                 insert_rate: float = 0.0,
                 device: Optional[torch.device] = None,
                 sentence_encoder=None,
                 filter_words: List[str] = None):
        """
        BAE: BERT-based Adversarial Examples for Text Classification. Siddhant Garg, Goutham Ramakrishnan. EMNLP 2020. 
        `[pdf] <https://arxiv.org/abs/2004.01970>`__
        `[code] <https://github.com/QData/TextAttack/blob/master/textattack/attack_recipes/bae_garg_2019.py>`__

        This script is adapted from <https://github.com/LinyangLee/BERT-Attack> given the high similarity between the two attack methods.
        
        This attacker supports the 4 attack methods (BAE-R, BAE-I, BAE-R/I, BAE-R+I) in the paper. 

        Args:
            mlm_path: The path to the masked language model. **Default:** 'bert-base-uncased'
            k: The k most important words / sub-words to substitute for. **Default:** 50
            threshold_pred_score: Threshold used in substitute module. **Default:** 0.3
            max_length: The maximum length of an input sentence for bert. **Default:** 512
            batch_size: The size of a batch of input sentences for bert. **Default:** 32
            replace_rate: Replace rate.
            insert_rate: Insert rate.
            device: A computing device for bert.
            sentence_encoder: A sentence encoder to calculate the semantic similarity of two sentences. Default: :py:class:`.UniversalSentenceEncoder`
            filter_words: A list of words that will be preserved in the attack procedure.


        :Data Requirements: :py:data:`.TProcess.NLTKPerceptronPosTagger`
        :Classifier Capacity:
            * get_pred
            * get_prob
        :Language: english

        """

        if sentence_encoder is None:
            self.encoder = UniversalSentenceEncoder()
        else:
            self.encoder = sentence_encoder

        self.tokenizer_mlm = BertTokenizerFast.from_pretrained(
            mlm_path, do_lower_case=True)
        if device is not None:
            self.device = device
        else:
            self.device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")

        config_atk = BertConfig.from_pretrained(mlm_path)
        self.mlm_model = BertForMaskedLM.from_pretrained(mlm_path,
                                                         config=config_atk).to(
                                                             self.device)
        self.k = k
        self.threshold_pred_score = threshold_pred_score
        self.max_length = max_length
        self.batch_size = batch_size

        self.replace_rate = replace_rate
        self.insert_rate = insert_rate
        if self.replace_rate == 1.0 and self.insert_rate == 0.0:
            self.sub_mode = 0  # only using replacement
        elif self.replace_rate == 0.0 and self.insert_rate == 1.0:
            self.sub_mode = 1  # only using insertion
        elif self.replace_rate + self.insert_rate == 1.0:
            self.sub_mode = 2  # replacement OR insertion for each token / subword
        elif self.replace_rate == 1.0 and self.insert_rate == 1.0:
            self.sub_mode = 3  # first replacement AND then insertion for each token / subword
        else:
            raise NotImplementedError()

        self.__lang_tag = TAG_English
        if filter_words is None:
            filter_words = get_default_filter_words(self.__lang_tag)
        self.filter_words = set(filter_words)
        check_language([self.encoder], self.__lang_tag)
Example #14
def bert_example():
	# NOTE [info] >> Refer to example codes in the comment of forward() of each BERT class in https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_bert.py

	pretrained_model_name = 'bert-base-uncased'
	tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

	input_ids = torch.tensor(tokenizer.encode('Hello, my dog is cute', add_special_tokens=True)).unsqueeze(0)  # Batch size 1.

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# The bare Bert Model transformer outputting raw hidden-states without any specific head on top.
		model = BertModel.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple.
		print('{} processed.'.format(BertModel.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model with two heads on top as done during the pre-training: a 'masked language modeling' head and a 'next sentence prediction (classification)' head.
		model = BertForPreTraining.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		prediction_scores, seq_relationship_scores = outputs[:2]
		print('{} processed.'.format(BertForPreTraining.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model with a 'language modeling' head on top.
		model = BertForMaskedLM.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids, masked_lm_labels=input_ids)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		loss, prediction_scores = outputs[:2]
		print('{} processed.'.format(BertForMaskedLM.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model with a 'next sentence prediction (classification)' head on top.
		model = BertForNextSentencePrediction.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		seq_relationship_scores = outputs[0]
		print('{} processed.'.format(BertForNextSentencePrediction.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
		model = BertForSequenceClassification.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1.

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids, labels=labels)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		loss, logits = outputs[:2]
		print('{} processed.'.format(BertForSequenceClassification.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
		model = BertForMultipleChoice.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		choices = ['Hello, my dog is cute', 'Hello, my cat is amazing']
		input_ids0 = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices.
		labels = torch.tensor(1).unsqueeze(0)  # Batch size 1.

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids0, labels=labels)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		loss, classification_scores = outputs[:2]
		print('{} processed.'.format(BertForMultipleChoice.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
		model = BertForTokenClassification.from_pretrained(pretrained_model_name)
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1.

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			outputs = model(input_ids, labels=labels)
		print('End inferring: {} secs.'.format(time.time() - start_time))

		loss, scores = outputs[:2]
		print('{} processed.'.format(BertForTokenClassification.__name__))

	if True:
		print('Start loading a model...')
		start_time = time.time()
		# Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute 'span start logits' and 'span end logits').
		model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
		print('End loading a model: {} secs.'.format(time.time() - start_time))

		question, text = 'Who was Jim Henson?', 'Jim Henson was a nice puppet'
		encoding = tokenizer.encode_plus(question, text)
		input_ids0, token_type_ids = encoding['input_ids'], encoding['token_type_ids']

		print('Start inferring...')
		start_time = time.time()
		model.eval()
		with torch.no_grad():
			start_scores, end_scores = model(torch.tensor([input_ids0]), token_type_ids=torch.tensor([token_type_ids]))
		print('End inferring: {} secs.'.format(time.time() - start_time))

		all_tokens = tokenizer.convert_ids_to_tokens(input_ids0)
		answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])

		assert answer == 'a nice puppet'
		print('{} processed.'.format(BertForQuestionAnswering.__name__))
Example #15
def main():
    parser = setup_parser()
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("At least `do_eval` must be True.")

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None

    # Prepare model. Load pre-trained model weights.
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                                                   'distributed_{}'.format(args.local_rank))
    model = BertForMaskedLM.from_pretrained(args.bert_model, cache_dir=cache_dir)
    if args.fp16:
        model.half()
    model.to(device)

    # output_sr_file = open(args.output_SR_file, "a+")

    # Load fastText word embeddings
    print("Loading embeddings ...")
    wordVecPath = args.word_embeddings
    # wordVecPath = "./fastText/crawl-300d-2M-subword.vec"
    fasttext_dico, fasttext_emb = getWordmap(wordVecPath)

    # Load word frequency
    word_count_path = args.word_frequency
    with open(word_count_path, 'rb') as f:
        word_count = pickle.load(f)
    # with open('../word_frequency/counter_Tokens.p', 'rb') as f:
    #     word_count = pickle.load(f)

    stopword_list1 = set(stopwords.words('english'))
    with open(args.stopwords, "r") as f:
        stopword_list2 = set(eval(f.read()))
    stopword_list = stopword_list1.union(stopword_list2)

    ps = PorterStemmer()

    SS = []
    substitution_words = []
    source_words = []

    num_selection = args.num_selections

    window_context = 11

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Read dataset
        origin, sent_id, eval_examples, mask_words, CHV_selections, CHV_substitutions, CHV_sim_scores = read_df(args.eval_dir)
        print(sent_id)
        print(eval_examples)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        # logger.info("  Batch size = %d", args.eval_batch_size)

        # Put the model in "evaluation" mode, meaning feed-forward operation.
        model.eval()

        eval_size = len(eval_examples)

        for i in tqdm(range(eval_size)):
            substitution_df = []

            # print(f'{origin[i]}, sentence {sent_id[i]}: ')
            print(f'sentence {sent_id[i]}:\n{eval_examples[i]}')

            tokens, words, position = convert_sentence_to_token(
                sentence=eval_examples[i], tokenizer=tokenizer, seq_length=128)
            print("tokens: ", tokens)
            print("words: ", words)
            print("position: ", position)

            assert len(words) == len(position)

            # len_tokens = len(tokens)
            # print("len_tokens: ", len_tokens)
            try:
                mask_index = words.index(mask_words[i].lower()) #use lower case if do_lower_case == True
            except ValueError:
                print(f'"{mask_words[i]}" is not in list of words')
                try:
                    mask_index = words.index(mask_words[i].lower() + "'s")
                except ValueError:
                    print(f'"{mask_words[i]}" + "\'s" is also not in list of words\nThis sentence will be skipped.\n')
                    continue

            mask_position = position[mask_index]
            mask_context = extract_context(words, mask_index, window_context)
            # print("mask_index: ", mask_index)
            # print("mask_position: ", mask_position)
            # print("mask_context: ", mask_context)

            if isinstance(mask_position, list):
                feature = convert_whole_word_to_feature(tokens_a=tokens,
                                                        mask_position=mask_position,
                                                        seq_length=args.max_seq_length,
                                                        tokenizer=tokenizer)
            else:
                feature = convert_token_to_feature(tokens_a=tokens,
                                                   mask_position=mask_position,
                                                   seq_length=args.max_seq_length,
                                                   tokenizer=tokenizer)

            print("feature.tokens: ", feature.tokens)
            # print("feature.input_ids: ", feature.input_ids)
            # print("feature.input_type_ids: ", feature.input_type_ids)
            # print("feature.input_mask: ", feature.input_mask)

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([feature.input_ids])
            segments_tensor = torch.tensor([feature.input_type_ids])
            attention_mask = torch.tensor([feature.input_mask])

            # If we have a GPU, put the tensors on cuda
            tokens_tensor = tokens_tensor.to('cuda')
            segments_tensor = segments_tensor.to('cuda')
            attention_mask = attention_mask.to('cuda')

            # Predict all tokens
            with torch.no_grad():
                output = model(tokens_tensor,
                               token_type_ids=segments_tensor,
                               attention_mask=attention_mask)
                prediction_scores = output[0]
                # print("predictions: ", prediction_scores)

            if isinstance(mask_position, list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(40)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(40)

            pre_tokens = tokenizer.convert_ids_to_tokens(predicted_top[1].cpu().numpy())
            print("pre_tokens: ", pre_tokens)
            pre_prob_values = predicted_top[0].cpu().numpy()
            print("pre_prob_values: ", pre_prob_values)

            ss = substitution_selection(source_word=mask_words[i],
                                        pre_tokens=pre_tokens,
                                        pre_scores=pre_prob_values,
                                        stopwords=stopword_list,
                                        ps=ps,
                                        num_selection=num_selection)

            # SS.append(ss)
            # source_words.append(mask_words[i])

            pre_word = substitution_ranking(source_word=mask_words[i],
                                            source_context=mask_context,
                                            substitution_selection=ss,
                                            fasttext_dico=fasttext_dico,
                                            fasttext_emb=fasttext_emb,
                                            word_count=word_count,
                                            tokenizer=tokenizer,
                                            maskedLM=model)

            MLM_sim_score = fuzzy_match(mask_words[i], pre_word)

            print('---------------------------------------')
            print("Sentence: ", eval_examples[i])
            print("Source word: ", mask_words[i])
            print("Substitution selection: ", ss)
            print("Model substitution: ", pre_word)
            print("Model sim score: ", MLM_sim_score)
            print("CHV substitution: ", CHV_substitutions[i])
            print("CHV sim score: ", CHV_sim_scores[i])
            print(" ")

            # substitution_words.append(pre_word)

            substitution_df.append(OrderedDict({"origin": origin[i],
                                                "sent_id": sent_id[i],
                                                "sentence": eval_examples[i],
                                                "source_term": mask_words[i],
                                                "CHV_selection": CHV_selections[i],
                                                "CHV_substitution": CHV_substitutions[i],
                                                "CHV_sim_score": CHV_sim_scores[i],
                                                "MLM_selection": ss,
                                                "MLM_substitution": pre_word,
                                                "MLM_sim_score": MLM_sim_score
                                                }))

            save_output(args.output_path, substitution_df)
Example #16
masked_index = 1
tokenized_text[masked_index] = '[MASK]'
# masked_index = 12
# tokenized_text[masked_index] = '[SEP]'
print(tokenized_text)
# ['テレビ', 'で', '[MASK]', 'の', '試合', 'を', '見る', '。']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# [571, 12, 4, 5, 608, 11, 2867, 8]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
# tensor([[ 571,   12,    4,    5,  608,   11, 2867,    8]])

# Load pre-trained model
model = BertForMaskedLM.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking')
model.eval()

# Predict
with torch.no_grad():
    outputs = model(tokens_tensor)
    predictions = outputs[0][0, masked_index].topk(10)  # take the top 10 predicted tokens

# Show results
for i, index_t in enumerate(predictions.indices):
    index = index_t.item()
    token = tokenizer.convert_ids_to_tokens([index])[0]
    print(i, token)
Example #17
PRETRAINED_MODEL_NAME_OR_PATH = os.environ.get("PRETRAINED_MODEL_NAME_OR_PATH")
logging.info(f"PRETRAINED_MODEL_NAME_OR_PATH = {PRETRAINED_MODEL_NAME_OR_PATH}")
MASK_ID = 103
try:
    cuda = torch.cuda.is_available()
    if cuda:
        torch.cuda.set_device(0)  # single gpu
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logger.info(f"masked_lm is set to run on {device}")

    # init model
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
    model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
    model.eval()
    if cuda:
        model.cuda()

    logger.info("masked_lm model is ready")
except Exception as e:
    sentry_sdk.capture_exception(e)
    logger.exception(e)
    raise e

app = Flask(__name__)
health = HealthCheck(app, "/healthcheck")
logging.getLogger("werkzeug").setLevel("WARNING")

Example #18
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
logging.basicConfig(level=logging.INFO)  # OPTIONAL

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model.eval()

# model.to('cuda')  # if you have gpu


#https://stackoverflow.com/questions/54978443/predicting-missing-words-in-a-sentence-natural-language-processing-model
def predict_masked_sent(text, top_k=5):
    # Tokenize input
    text = "[CLS] %s [SEP]" % text
    tokenized_text = tokenizer.tokenize(text)
    #print(tokenizer.lang2id)
    masked_index = tokenized_text.index("[MASK]")
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    # tokens_tensor = tokens_tensor.to('cuda')    # if you have gpu

    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)
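    # The original snippet is cut off here; an illustrative way to finish the
    # function (an assumption, not the original code) is to decode and print
    # the top-k candidates:
    for i, pred_idx in enumerate(top_k_indices):
        predicted_token = tokenizer.convert_ids_to_tokens([pred_idx.item()])[0]
        token_weight = top_k_weights[i].item()
        print("[MASK]: '%s' \t weight: %.4f" % (predicted_token, token_weight))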
Example #19
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input training data file (a text file).",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Optional parameters
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--do_evaluate",
        type=bool,
        default=False,
        help="Run model evaluation on out-of-sample data.",
    )
    parser.add_argument("--do_train", type=bool, default=False, help="Run training.")
    parser.add_argument(
        "--do_overwrite_output_dir",
        type=bool,
        default=False,
        help="Whether to overwrite the output dir.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-cased",
        type=str,
        help="The model checkpoint to initialize the encoder and decoder's weights with.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        help="The decoder architecture to be fine-tuned.",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--to_cpu", default=False, type=bool, help="Whether to force training on CPU."
    )
    parser.add_argument(
        "--num_train_epochs",
        default=10,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for eval.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=4,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--input_block_size",
        default=256,
        type=int,
        help="Max seq length for input",
    )
    parser.add_argument(
        "--output_block_size",
        default=64,
        type=int,
        help="Max seq length for output",
    )

    parser.add_argument(
        "--trained_checkpoints",
        default="",
        type=str,
        help="trained_checkpoints",
    )

    parser.add_argument(
        "--decoding_type",
        default="pnt",
        type=str,
        help="",
    )

    parser.add_argument(
        "--encoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--decoder_lr",
        default=5e-4,
        type=float,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--encoder_warmup",
        default=10,
        type=int,
        help="encoder's learning rate",
    )

    parser.add_argument(
        "--decoder_warmup",
        default=100,
        type=int,
        help="encoder's learning rate",
    )

    parser.add_argument("--seed", default=42, type=int)
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.do_overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
                args.output_dir
            )
        )

    # Set up training device
    if args.to_cpu or not torch.cuda.is_available():
        args.device = torch.device("cpu")
        args.n_gpu = 0
    else:
        args.device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        print(args.n_gpu)

    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    #config = BertConfig.from_pretrained(args.model_name_or_path)
    #config.num_hidden_layers=3
    #config.is_decoder=True
    #decoder_model = BertForMaskedLM(config)
    decoder_model = BertForMaskedLM.from_pretrained(r'/data/zhuoyu/semantic_parsing/models')
    model = Model2Model.from_pretrained(
        args.model_name_or_path, decoder_model=decoder_model
    )
    #model = Model2Model.from_pretrained(
    #    args.model_name_or_path, decoder_model=None
    #)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        0,
        args.device,
        args.n_gpu,
        False,
        False,
    )

    logger.info("Training/evaluation parameters %s", args)

    # Train the model
    model.to(args.device)
    if args.do_train:
        global_step, tr_loss = train(args, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)

        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))

    # Evaluate the model
    results = {}
    if args.do_evaluate:
        checkpoints = [args.trained_checkpoints]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            encoder_checkpoint = os.path.join(checkpoint, "encoder")
            decoder_checkpoint = os.path.join(checkpoint, "decoder")
            #model = PreTrainedEncoderDecoder.from_pretrained(
            #    encoder_checkpoint, decoder_checkpoint
            #)
            #model = Model2Model.from_pretrained(encoder_checkpoint)
            #model.to(args.device)
            results = "placeholder"

            evaluate(args,model,tokenizer,"test")

    return results
Example #20
args = parser.parse_args()
print("Reconstruction. step = ", args.step)
if args.type_of_model == 'albert':
  tokenizer = AlbertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
  config = AlbertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
  config.output_hidden_states = True
  model = AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path = None,
    config = config,
    state_dict = torch.load(os.path.join(
      args.path_to_pytorch_models, args.type_of_model, 'pytorch_model_' + args.step + '.bin')))
elif args.type_of_model == 'bert':
  tokenizer = BertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
  config = BertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
  config.output_hidden_states = True
  model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path = None,
    config = config,
    state_dict = torch.load(os.path.join(
      args.path_to_pytorch_models, args.type_of_model, 'pytorch_model_' + args.step + '.bin')))
else:
  raise NotImplementedError("The given model type %s is not supported" % args.type_of_model)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval().to(device)

tag = {}
with open(os.path.join(args.data, 'ontonotes/const/pos/labels.txt')) as f:
  while True:
    pos = f.readline().rstrip()
    if pos == "": break
    tag[pos] = np.asarray([0, 0])
Example #21
 def _get_masked_language_model(self):
     """
     Initializes the BertForMaskedLM transformer
     """
     self.mlm = BertForMaskedLM.from_pretrained(self.model)
     self.mlm.eval()
Example #22
def main():

    # load the args from yaml file
    with open("bert_finetune.yaml") as file:
        args = yaml.load(file, Loader=yaml.FullLoader)

    print("Printing Arguments...")
    for key in args:
        print("- " + str(key) + ": " + str(args[key]))

    print("\nDevice used...")
    if torch.cuda.is_available():
        print("Using GPU")
    else:
        print("GPU not seen by Torch, please check again. Exiting for now...")
        exit()

    # load the model
    print("\nLoading the model...")
    model = BertForMaskedLM.from_pretrained(args["model_name"])
    # Freeze the first 21 encoder layers (indices 0-20), leaving the remaining
    # layers trainable. freeze_bert_fn is not defined in this snippet; see the
    # hedged sketch after this example.
    model = freeze_bert_fn(model, [i for i in range(0, 21)])
    print(model.config)
    print("Number of parameters: " + str(model.num_parameters()))

    # invoke the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(args["model_name"])

    # load the dataset
    print("\nLoading the dataset")
    t0 = time.time()

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args["train_data_file"],
        block_size=512,
    )
    val_dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args["val_data_file"],
        block_size=512,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=args["mlm_probability"])

    print("Time taken: " + str(time.time() - t0))

    # training
    training_args = TrainingArguments(
        output_dir=args["save_model_directory"],
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        logging_steps=args["logging_steps"],
        evaluation_strategy="steps",
        eval_steps=args["eval_steps"],
        num_train_epochs=args["num_train_epochs"],
        per_gpu_train_batch_size=args["batch_size"],
        per_gpu_eval_batch_size=args["batch_size"],
        save_steps=args["save_steps"],
        disable_tqdm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    trainer.save_model(args["save_model_directory"])
Beispiel #23
0
import torch
from transformers import BertForMaskedLM, BertTokenizer
from torch.nn import CrossEntropyLoss
import math
from sentence_transformers import SentenceTransformer
from transformers import XLNetModel, XLNetTokenizer
import bert_preprocess
import numpy as np

st_model = SentenceTransformer('bert-base-nli-mean-tokens')



bertmodelname = 'bert-large-uncased-whole-word-masking'
tokenizer = BertTokenizer.from_pretrained(bertmodelname)
model = BertForMaskedLM.from_pretrained(bertmodelname)


"""
bertmodelname = 'bert-large-uncased-whole-word-masking'
tokenizer = BertTokenizer.from_pretrained(bertmodelname)
bertsavedmodelname = "pytorch_model3.bin"
model_state_dict = torch.load(bertsavedmodelname)
model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path=bertmodelname, state_dict=model_state_dict)
"""


#text = '[CLS] I want to [MASK] the car because it is cheap . [SEP]'

def predict_missing(text):
    # The original example is truncated here; the body below is a hedged
    # reconstruction of the usual masked-token prediction loop, not the
    # original author's code.
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    mask_positions = [i for i, tok in enumerate(tokenized_text) if tok == '[MASK]']
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        predictions = model(tokens_tensor)[0]  # (1, seq_len, vocab_size)
    # Return the most likely token for every [MASK] position.
    return [
        tokenizer.convert_ids_to_tokens([torch.argmax(predictions[0, pos]).item()])[0]
        for pos in mask_positions
    ]
Beispiel #24
0
def main():
    """
    Modify substitute probabilities based on lexical similarity with target.
    """
    parser = argparse.ArgumentParser(
        description='Modify substitute probabilities based on lexical similarity with target.')
    parser.add_argument(
        '--model_name', type=str, required=True,
        help='HuggingFace model name or path')
    parser.add_argument(
        '--subs_path', type=str, required=True,
        help='Path to the pickle file containing substitute lists (output by substitutes.py).')
    parser.add_argument(
        '--targets_path', type=str, required=True,
        help='Path to the csv file containing target word forms.')
    parser.add_argument(
        '--output_path', type=str, required=True,
        help='Output path for pickle containing substitutes with lexical similarity values.')
    parser.add_argument(
        '--batch_size', type=int, default=64,
        help='The batch size per device (GPU core / CPU).')
    parser.add_argument(
        '--ignore_decoder_bias', action='store_true',
        help="Whether to ignore the decoder's bias vector during masked word prediction")
    parser.add_argument(
        '--normalise_embeddings', action='store_true',
        help="Whether to ignore the decoder's bias vector during masked word prediction")
    parser.add_argument(
        '--local_rank', type=int, default=-1,
        help='For distributed training.')
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.info(__file__.upper())
    start_time = time.time()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        n_gpu = 1

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank,
        device,
        n_gpu,
        bool(args.local_rank != -1)
    )

    # Set seeds across modules
    set_seed(42, n_gpu)

    # Load target forms
    target_forms = []
    with open(args.targets_path, 'r', encoding='utf-8') as f_in:
        for line in f_in.readlines():
            line = line.strip()
            forms = line.split(',')[1:]
            target_forms.extend(forms)
    print('=' * 80)
    print('targets:', target_forms)
    print('=' * 80)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # Load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name, never_split=target_forms, use_fast=False)
    model = BertForMaskedLM.from_pretrained(args.model_name, output_hidden_states=True)

    if args.ignore_decoder_bias:
        logger.warning('Ignoring bias vector for masked word prediction.')
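        # With the bias removed, masked-token scores come only from the dot
        # product between the hidden state and the output embeddings.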
        model.cls.predictions.decoder.bias = None

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    # Store vocabulary indices of target words
    targets_ids = [tokenizer.encode(t, add_special_tokens=False) for t in target_forms]
    assert len(target_forms) == len(targets_ids)
    words_added = []
    for t, t_id in zip(target_forms, targets_ids):
        if tokenizer.do_lower_case:
            t = t.lower()
        if t in tokenizer.added_tokens_encoder:
            continue
        if len(t_id) > 1 or (len(t_id) == 1 and t_id[0] == tokenizer.unk_token_id):
            if tokenizer.add_tokens([t]):
                model.resize_token_embeddings(len(tokenizer))
                words_added.append(t)
            else:
                logger.error('Word not properly added to tokenizer: %s %s', t, tokenizer.tokenize(t))

    # check if correctly added
    for t, t_id in zip(target_forms, targets_ids):
        if len(t_id) != 1:
            print(t, t_id)
    logger.warning("\nTarget words added to the vocabulary: {}.\n".format(', '.join(words_added)))

    # assert len(t_id) == 1  # because of never_split list
    # if t_id[0] == tokenizer.unk_token_id:
    #     if tokenizer.add_tokens([t]):
    #         model.resize_token_embeddings(len(tokenizer))
    #         words_added.append(t)
    #     else:
    #         logger.error('Word not properly added to tokenizer:', t, tokenizer.tokenize(t))


    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    with open(args.subs_path, 'rb') as f_in:
        substitutes_raw = pickle.load(f_in)

    substitutes_new = {
        w: [{'candidates': [], 'logp': [], 'dot_products': []} for _ in substitutes_raw[w]]
        for w in substitutes_raw
    }

    def collate(batch):
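        # Stack input_ids, attention_mask and the target embeddings into batch
        # tensors; keep the remaining per-item fields as plain Python lists.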
        return [
            {'input_ids': torch.cat([item[0]['input_ids'].unsqueeze(0) for item in batch], dim=0),
             'attention_mask': torch.cat([item[0]['attention_mask'].unsqueeze(0) for item in batch], dim=0)},
            [item[1] for item in batch],  # target
            [item[2] for item in batch],  # occurrence_idx
            [item[3] for item in batch],  # candidate_token
            torch.cat([torch.as_tensor(item[4]).unsqueeze(0) for item in batch], dim=0),  # embedding
            [item[5] for item in batch],  # logp
            [item[6] for item in batch]   # position
        ]

    dataset = SubstitutesDataset(substitutes_raw, tokenizer, args.normalise_embeddings)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate)
    iterator = tqdm(dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])

    for step, batch in enumerate(iterator):
        model.eval()

        inputs, tgt, occurrence_idxs, candidate_tokens, tgt_embedding, logps, positions = batch
        inputs['input_ids'] = inputs['input_ids'].to(device)
        inputs['attention_mask'] = inputs['attention_mask'].to(device)
        tgt_embedding = tgt_embedding.to(device)
        bsz = inputs['input_ids'].shape[0]

        with torch.no_grad():
            outputs = model(**inputs)  # n_sentences, max_sent_len, vocab_size

            hidden_states = outputs[1]
            last_layer = hidden_states[-1][np.arange(bsz), positions, :]  # (bsz, hdims)
            if args.normalise_embeddings:
                last_layer = normalize(last_layer, p=2)

            dot_products = torch.sum(tgt_embedding * last_layer, dim=1)  # (bsz)
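            # The dot product above compares the stored target embedding with the
            # candidate's contextual embedding; it is the lexical-similarity value
            # stored alongside each substitute below.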

            if args.normalise_embeddings:
                assert all([d <= 1.01 for d in dot_products]), 'Dot product should not exceed 1 if vectors are normalised.'

            for b_id in np.arange(bsz):
                tgt_lemma = tgt[b_id]
                occurrence_idx = occurrence_idxs[b_id]

                substitutes_new[tgt_lemma][occurrence_idx]['candidates'].append(candidate_tokens[b_id])
                substitutes_new[tgt_lemma][occurrence_idx]['logp'].append(logps[b_id])
                substitutes_new[tgt_lemma][occurrence_idx]['dot_products'].append(dot_products[b_id].item())

    iterator.close()

    with open(args.output_path, 'wb') as f_out:
        pickle.dump(substitutes_new, f_out)

    logger.warning("--- %s seconds ---" % (time.time() - start_time))
Beispiel #25
0
    # model.save_pretrained('./ernie-1.0')
    # tokenizer.save_pretrained('./ernie-1.0')
    # tf_model.save_pretrained("./ernie-1.0")
    import torch
    from transformers import BertTokenizer, BertForMaskedLM

    tokenizer = BertTokenizer.from_pretrained('nghuyong/ernie-1.0')

    input_tx = "[CLS] [MASK] [MASK] [MASK] 是黑龙江的省会城市[SEP]"
    tokenized_text = tokenizer.tokenize(input_tx)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([[0] * len(tokenized_text)])

    model = BertForMaskedLM.from_pretrained('nghuyong/ernie-1.0')
    model.eval()

    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

    predicted_index = [
        torch.argmax(predictions[0, i]).item()
        for i in range(0, (len(tokenized_text) - 1))
    ]
    predicted_token = [
        tokenizer.convert_ids_to_tokens([predicted_index[x]])[0]
        for x in range(1, (len(tokenized_text) - 1))
    ]
    return input_padded, index_list, len(clean_text)


def complete_missing_word(pred_binary, index_list, len_list):
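    # Re-insert a 0 prediction at each index that was filtered out earlier so the
    # CWI prediction list realigns with the original token positions.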
    list_cwi_predictions = list(pred_binary[0][:len_list])
    for i in index_list:
        list_cwi_predictions.insert(i, 0)
    return list_cwi_predictions


# Second part: candidate generation and selection using BERT
# Load the BERT model for masked language modeling

bert_model = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model)
model = BertForMaskedLM.from_pretrained(bert_model)
model.eval()

zipf_frequency('stop', 'en')
zipf_frequency('thwart', 'en')


# Now the function to get the candidates out of BERT (MLM):
def get_bert_candidates(input_text,
                        list_cwi_predictions,
                        numb_predictions_displayed=10):
    list_candidates_bert = []
    for word, pred in zip(input_text.split(), list_cwi_predictions):
        if (pred and
            (pos_tag([word])[0][1] in ['NNS', 'NN', 'VBP', 'RB', 'VBG', 'VBD'])
            ) or (zipf_frequency(word, 'en')) < 3.1:
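            # NOTE: the example is truncated here. The lines below are a hedged
            # sketch of the usual continuation (mask the word, query BERT, keep
            # the top predictions); this reconstruction is an assumption, not the
            # original author's code.
            replace_word_mask = input_text.replace(word, '[MASK]')
            text = '[CLS] ' + replace_word_mask + ' [SEP] ' + input_text + ' [SEP]'
            tokenized_text = tokenizer.tokenize(text)
            masked_index = [i for i, x in enumerate(tokenized_text) if x == '[MASK]'][0]
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            segments_ids = [0] * len(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])
            with torch.no_grad():
                outputs = model(tokens_tensor, token_type_ids=segments_tensors)
                predictions = outputs[0][0][masked_index]
            predicted_ids = torch.argsort(predictions, descending=True)[:numb_predictions_displayed]
            predicted_tokens = tokenizer.convert_ids_to_tokens(list(predicted_ids))
            list_candidates_bert.append((word, predicted_tokens))
    return list_candidates_bert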
    def __init__(self):
        super(MaskedLM, self).__init__()

        self.bert_layer = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased', output_hidden_states=True)

        self.dropout = nn.Dropout(0.1)
    def __init__(self, model_name_or_path: str = 'bert-base-cased') -> None:
        self._tokenizer: PreTrainedTokenizer = BertTokenizer.from_pretrained(
            model_name_or_path)
        self._model = BertForMaskedLM.from_pretrained(model_name_or_path)
        self._STOPWORDS: List[str] = stopwords.words('english')
    return res


def get_pronounce_dist_between_sentences(sent1, sent2):
    pron1 = pronounce_sentence(sent1)
    pron2 = pronounce_sentence(sent2)
    return LevenshteinDist(pron1, pron2, 1, 1, confusion, 1)[-1][-1]


if LOAD_MODEL:  # or 'BertForMaskedLM' not in locals():
    from transformers import BertForMaskedLM, BertTokenizer, BertTokenizerFast
    import torch, math
    device = torch.device('cuda')
    #     bertMaskedLM = BertForMaskedLM.from_pretrained(
    #         '/home/akiralll/PycharmProjects/bert_mlm/distilbert-base-uncased-train_wiki_articles_lm-train_youtube/')
    bertMaskedLM = BertForMaskedLM.from_pretrained('distilbert-base-uncased')
    tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-uncased')
    bertMaskedLM.to(device)


def make_features_for_candidate(orig_text: list,
                                candidate_text: list,
                                l_context_index,
                                r_context_index,
                                client_vocab=None):
    orig_text_str = ''.join(orig_text)
    candidate_text_str = ''.join(candidate_text)
    orig_text_pronounce = pronounce_sentence(orig_text)
    cand_text_pronounce = pronounce_sentence(candidate_text)

    features = [
Beispiel #30
0
def main():

    MODEL_CACHE = './model/bert-base-chinese'
    WORD_2_VECTOR_MODEL_DIR = './model/merge_sgns_bigram_char300.txt'

    WORD_FREQ_DICT = './dict/modern_chinese_word_freq.txt'

    EVAL_FILE_PATH = './dataset/annotation_data.csv'
    BERT_RES_PATH = './data/bert_ss_res.csv'
    # ERNIE_RES_PATH = './data/ernie_output.csv'
    VECTOR_RES_PATH = './data/vector_ss_res.csv'
    DICT_RES_PATH = './data/dict_ss_res.csv'
    HOWNET_RES_PATH = './data/hownet_ss_res.csv'
    HYBRID_RES_PATH = './data/hybrid_ss_res.csv'

    SUBSTITUTION_NUM = 10

    word_2_vector_model_dir = WORD_2_VECTOR_MODEL_DIR
    model_cache = MODEL_CACHE

    word_freq_dict = WORD_FREQ_DICT

    eval_file_path = EVAL_FILE_PATH

    bert_res_path = BERT_RES_PATH
    # ernie_res_path = ERNIE_RES_PATH
    vector_res_path = VECTOR_RES_PATH
    dict_res_path = DICT_RES_PATH
    hownet_res_path = HOWNET_RES_PATH
    hybrid_res_path = HYBRID_RES_PATH

    substitution_num = SUBSTITUTION_NUM

    print('loading models...')
    tokenizer = BertTokenizer.from_pretrained(model_cache)
    model = BertForMaskedLM.from_pretrained(model_cache)
    # OpenHowNet.download()
    hownet = OpenHowNet.HowNetDict(use_sim=True)
    model.to('cuda')
    model.eval()
    print('loading embeddings...')
    model_word2vector = gensim.models.KeyedVectors.load_word2vec_format(
        word_2_vector_model_dir, binary=False)
    print('loading files...')
    word_freq_dict = read_dict(word_freq_dict)

    bert_res = read_ss_result(bert_res_path)
    vector_res = read_ss_result(vector_res_path)
    dict_res = read_ss_result(dict_res_path)
    hownet_res = read_ss_result(hownet_res_path)
    hybrid_res = read_ss_result(hybrid_res_path)

    row_lines, source_sentences, source_words = read_dataset(eval_file_path)

    for row_line, source_sentence, source_word, bert_subs, vector_subs, dict_subs, hownet_subs, hybrid_subs in zip(
            row_lines, source_sentences, source_words, bert_res, vector_res,
            dict_res, hownet_res, hybrid_res):
        # Running everything may take a long time; it is recommended to comment out blocks and run only the tests you need.
        if bert_subs[0] != 'NULL':
            bert_pre_word, bert_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, hownet,
                source_sentence, source_word, bert_subs, word_freq_dict,
                substitution_num)
        else:
            bert_pre_word = 'NULL'
            bert_ss_sorted = ['NULL']
        if vector_subs[0] != 'NULL':
            vector_pre_word, vector_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, hownet,
                source_sentence, source_word, vector_subs, word_freq_dict,
                substitution_num)
        else:
            vector_pre_word = 'NULL'
            vector_ss_sorted = ['NULL']
        if dict_subs[0] != 'NULL':
            dict_pre_word, dict_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, hownet,
                source_sentence, source_word, dict_subs, word_freq_dict,
                substitution_num)
        else:
            dict_pre_word = 'NULL'
            dict_ss_sorted = ['NULL']
        if hownet_subs[0] != 'NULL':
            hownet_pre_word, hownet_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, hownet,
                source_sentence, source_word, hownet_subs, word_freq_dict,
                substitution_num)
        else:
            hownet_pre_word = 'NULL'
            hownet_ss_sorted = ['NULL']
        if hybrid_subs[0] != 'NULL':
            hybrid_pre_word, hybrid_ss_sorted = substitute_ranking(
                row_line, model_word2vector, model, tokenizer, hownet,
                source_sentence, source_word, hybrid_subs, word_freq_dict,
                substitution_num)
        else:
            hybrid_pre_word = 'NULL'
            hybrid_ss_sorted = ['NULL']

        save_result(row_line, bert_pre_word, bert_ss_sorted,
                    './test/data/nochnum/bert_sr_res_no_chnum.csv')
        save_result(row_line, vector_pre_word, vector_ss_sorted,
                    './test/data/nochnum/vector_sr_res_no_chnum.csv')
        save_result(row_line, dict_pre_word, dict_ss_sorted,
                    './test/data/nochnum/dict_sr_res_no_chnum.csv')
        save_result(row_line, hownet_pre_word, hownet_ss_sorted,
                    './test/data/nochnum/hownet_sr_res_no_chnum.csv')
        save_result(row_line, hybrid_pre_word, hybrid_ss_sorted,
                    './test/data/nochnum/hybrid_sr_res_no_chnum.csv')