def main():
    NUM_TRAIN_DATA = 150000
    NUM_TEST_DATA = 5000
    MODEL_DIR = './electra_chinese_base'
    MAX_LEN = 512
    BATCH_SIZE = 8 * 2  # per-GPU batch size * number of GPUs
    LR = 1e-5
    NUM_LABELS = 33
    EPOCHS = 4

    # read data
    content, target = read_data('../../corpus/ettoday_2017.json')

    # train dataloader
    examples = DataProcessor().get_train_examples(
        content[:NUM_TRAIN_DATA], target[:NUM_TRAIN_DATA])
    train_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN, tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    train_loader = DataLoader(
        train_dataset, shuffle=True, batch_size=BATCH_SIZE)

    # test dataloader
    examples = DataProcessor().get_test_examples(
        content[NUM_TRAIN_DATA:NUM_TRAIN_DATA + NUM_TEST_DATA], target[NUM_TRAIN_DATA:NUM_TRAIN_DATA + NUM_TEST_DATA])
    test_dataset = convert_examples_to_features(
        examples, max_length=MAX_LEN, tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(
        test_dataset, shuffle=False, batch_size=BATCH_SIZE)

    # start training and callback for eval
    # train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, epochs=EPOCHS, eval_callback=evaluate, test_loader=train_loader)
    train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, lr=LR,
          epochs=EPOCHS, eval_callback=evaluate, test_loader=test_loader)


def init_electra():
    electra_max_len = 512
    electra_path = "electra_base_turkish_cased_discriminator/"
    electra_model_name = "dbmdz-electra-base-turkish-cased-discriminator_seqlen512_bacth64_epochs15/"
    electra_tokenizer = ElectraTokenizerFast.from_pretrained(
        electra_path, do_lower_case=False)
    electra_model_class = Model(electra_max_len, electra_path,
                                electra_model_name, electra_tokenizer,
                                "electra")
    print("2. ELECTRA LOADED")
    return electra_model_class
Example #3
def getTokenizer(model_name):
    if 'roberta' in model_name:
        return RobertaTokenizerFast.from_pretrained(model_name,
                                                    add_prefix_space=False)
    elif model_name.startswith('bert'):
        return BertTokenizerFast.from_pretrained(model_name,
                                                 add_prefix_space=False)
    elif 'bart' in model_name:
        return RobertaTokenizerFast.from_pretrained(
            'roberta-large', add_prefix_space=False
        )  #check https://github.com/huggingface/transformers/blob/68e19f1c228c92d5d800533f558faff24b57127a/src/transformers/tokenization_bart.py#L27
    elif 'electra' in model_name:
        return ElectraTokenizerFast.from_pretrained(model_name,
                                                    add_prefix_space=False)
    else:
        return AutoTokenizer.from_pretrained(model_name,
                                             add_prefix_space=False)
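
# Illustrative calls (model ids are examples from the HuggingFace hub, not part of the original snippet):
#   getTokenizer('roberta-base')                       -> RobertaTokenizerFast
#   getTokenizer('google/electra-base-discriminator')  -> ElectraTokenizerFast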


def read_data(file):
    df = pd.read_json(file)
    df = shuffle(df)
    content = (df['title'] + ' ' + df['content']).to_list()
    target = df['category'].to_list()
    return content, target


if __name__ == '__main__':
    import os

    import pandas as pd
    from sklearn.utils import shuffle  # assumed source of the shuffle() used in read_data()

    NUM_TEST_DATA = 50016
    MODEL_DIR = './electra_chinese_base'
    MAX_LEN = 512
    BATCH_SIZE = 16 * 2  # per-GPU batch size * number of GPUs (CUDA_VISIBLE_DEVICES lists 2)
    NUM_LABELS = 33

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    content, target = read_data('../../corpus/ettoday_2017.json')
    examples = DataProcessor().get_test_examples(content[:NUM_TEST_DATA],
                                                 target[:NUM_TEST_DATA])
    test_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset,
                             shuffle=False,
                             batch_size=BATCH_SIZE)
    evaluate(test_loader, MODEL_DIR, 'step_18749.ckpt', NUM_LABELS)
Example #5
    c.max_length = 128
elif c.size == "base":
    c.lr = 1e-4
    c.layer_lr_decay = 0.8
    c.max_length = 512
elif c.size == "large":
    c.lr = 5e-5
    c.layer_lr_decay = 0.9
    c.max_length = 512
else:
    raise ValueError(f"Invalid size {c.size}")
if c.pretrained_checkpoint is None:
    c.max_length = 512  # All public models are the "++" versions, which use max_length 512

# huggingface/transformers
hf_tokenizer = ElectraTokenizerFast.from_pretrained(
    f"google/electra-{c.size}-discriminator")
electra_config = ElectraConfig.from_pretrained(
    f"google/electra-{c.size}-discriminator")

# wsc
if c.wsc_trick:
    from _utils.wsc_trick import *  # importing spacy model takes time

# logging
# the light logging callback below only logs the final score, to avoid exceeding the API access limit
if c.logger == "neptune":
    import neptune
    from fastai.callback.neptune import NeptuneCallback

    class LightNeptuneCallback(NeptuneCallback):
        def after_batch(self):
            pass  # minimal assumed body: skip per-batch logging so only final scores are sent

from data_prepocessing import add_token_positions, add_end_idx, read_squad
from transformers import AutoTokenizer, ElectraForQuestionAnswering, ElectraConfig, AdamW, ElectraTokenizerFast
from torch.utils.data import DataLoader
from types import SimpleNamespace
import argparse
import torch
import itertools
import os
import json
import subprocess
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

tokenizer = ElectraTokenizerFast.from_pretrained('deepset/electra-base-squad2')
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
EPOCHS = 3
BATCH_SIZE = 16
RESULTS_FOLDER = 'results'
LOSS_FILE = 'losses.json'
IMAGE_FOLDER = 'img'

# Building the parser syntax
parser = argparse.ArgumentParser(
    description='Load and preprocess the custom SQuAD data')

# Configuration parameters
parser.add_argument('-lr',
Example #7
    def __init__(self) -> None:
        self.lists = {}

        # M-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
            'bert-base-multilingual-cased')
        self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
            'bert-base-multilingual-cased').eval()
        self.lists["M-BERT"] = {
            "Tokenizer": self.bert_multilingual_tokenizer,
            "Model": self.bert_multilingual_model
        }
        print("====================================")
        print("[BERT] Google Multilingual BERT loaded")
        print("====================================")

        # KR-BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
            'snunlp/KR-Medium')
        self.krbert_model = BertForMaskedLM.from_pretrained(
            'snunlp/KR-Medium').eval()
        self.lists["KR-Medium"] = {
            "Tokenizer": self.krbert_tokenizer,
            "Model": self.krbert_model
        }
        print("====================================")
        print("[BERT] KR-BERT loaded")
        print("====================================")

        # BERT
        from transformers import BertTokenizerFast, BertForMaskedLM
        self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/bert-kor-base')
        self.bert_kor_model = BertForMaskedLM.from_pretrained(
            'kykim/bert-kor-base').eval()
        self.lists["bert-kor-base"] = {
            "Tokenizer": self.bert_kor_tokenizer,
            "Model": self.bert_kor_model
        }
        print("====================================")
        print("[BERT] BERT-kor-base loaded")
        print("====================================")

        # ALBERT
        from transformers import AlbertForMaskedLM
        self.albert_tokenizer = BertTokenizerFast.from_pretrained(
            'kykim/albert-kor-base')
        self.albert_model = AlbertForMaskedLM.from_pretrained(
            'kykim/albert-kor-base').eval()
        self.lists["albert-kor-base"] = {
            "Tokenizer": self.albert_tokenizer,
            "Model": self.albert_model
        }
        print("====================================")
        print("[BERT] ALBERT-kor-base loaded")
        print("====================================")

        # XLM-Roberta
        from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
        self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
            'xlm-roberta-base')
        self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
            'xlm-roberta-base').eval()
        self.lists["xlm-roberta-base"] = {
            "Tokenizer": self.xlmroberta_tokenizer,
            "Model": self.xlmroberta_model
        }
        print("====================================")
        print("[BERT] XLM-Roberta-kor loaded")
        print("====================================")

        # bertshared-kor-base (encoder-decoder)
        from transformers import BertTokenizerFast, EncoderDecoderModel
        self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
            "kykim/bertshared-kor-base")
        self.bertshared_model = EncoderDecoderModel.from_pretrained(
            "kykim/bertshared-kor-base")
        self.lists["bertshared-kor-base"] = {
            "Tokenizer": self.tokenizer_bertshared,
            "Model": self.bertshared_model
        }
        print("====================================")
        print("[Seq2seq + BERT] bertshared-kor-base loaded")
        print("====================================")

        # gpt3-kor-small_based_on_gpt2
        from transformers import BertTokenizerFast, GPT2LMHeadModel
        self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
            "kykim/gpt3-kor-small_based_on_gpt2")
        self.lists["gpt3-kor-small_based_on_gpt2"] = {
            "Tokenizer": self.tokenizer_gpt3,
            "Model": self.model_gpt3
        }
        print("====================================")
        print("[GPT3] gpt3-small-based-on-gpt2 loaded")
        print("====================================")

        # electra-base-kor
        from transformers import ElectraTokenizerFast, ElectraModel
        self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
            "kykim/electra-kor-base")
        self.electra_model = ElectraModel.from_pretrained(
            "kykim/electra-kor-base")
        self.lists["electra-kor-base"] = {
            "Tokenizer": self.tokenizer_electra,
            "Model": self.electra_model
        }
        print("====================================")
        print("[ELECTRA] electra-kor-base loaded")
        print("====================================")

        # koelectra QA
        from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
        self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
            "monologg/koelectra-base-v3-finetuned-korquad")
        self.lists["electra-kor-QA"] = {
            "Tokenizer": self.electra_tokenizer_QA,
            "Model": self.electra_model_QA
        }
        print("====================================")
        print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
        print("====================================")


# Setting of different sizes
i = ['small', 'base', 'large'].index(c.size)
c.mask_prob = [0.15, 0.15, 0.25][i]
c.lr = [5e-4, 2e-4, 2e-4][i]
c.bs = [128, 256, 2048][i]
c.steps = [10**6, 766*1000, 400*1000][i]
c.max_length = [128, 512, 512][i]
generator_size_divisor = [4, 3, 4][i]
disc_config = ElectraConfig.from_pretrained(f'google/electra-{c.size}-discriminator')
gen_config = ElectraConfig.from_pretrained(f'google/electra-{c.size}-generator')
# note that the public electra-small model is actually small++ and doesn't scale down the generator size
gen_config.hidden_size = int(disc_config.hidden_size/generator_size_divisor)
gen_config.num_attention_heads = int(disc_config.num_attention_heads/generator_size_divisor)
gen_config.intermediate_size = int(disc_config.intermediate_size/generator_size_divisor)
hf_tokenizer = ElectraTokenizerFast.from_pretrained(f"google/electra-{c.size}-generator")

# Path to data
Path('./datasets').mkdir(exist_ok=True)
Path('./checkpoints/pretrain').mkdir(exist_ok=True, parents=True)
if c.size in ['small', 'base']:
  wiki_cache_dir = Path("./datasets/wikipedia/20200501.en/1.0.0")
  book_cache_dir = Path("./datasets/bookcorpus/plain_text/1.0.0")
  wbdl_cache_dir = Path("./datasets/wikibook_dl")
  wbdl_cache_dir.mkdir(exist_ok=True)

# Print info
print(f"process id: {os.getpid()}")
print(c)
print(hparam_update)
Example #9
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, f"The args {remaining_strings} could not be parsed."

    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    if train_args.eager == "true":
        tf.config.experimental_run_functions_eagerly(True)

    tokenizer = ElectraTokenizerFast.from_pretrained("bert-base-uncased")

    gen_config = ElectraConfig.from_pretrained(f"google/electra-{model_args.model_size}-generator")
    dis_config = ElectraConfig.from_pretrained(
        f"google/electra-{model_args.model_size}-discriminator"
    )

    gen = TFElectraForMaskedLM(config=gen_config)
    dis = TFElectraForPreTraining(config=dis_config)
    optimizer = get_adamw_optimizer(train_args)

    # Tie the weights
    if model_args.electra_tie_weights == "true":
        gen.electra.embeddings = dis.electra.embeddings

    loaded_optimizer_weights = None
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
        dis_ckpt, gen_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(checkpoint_path)
        if hvd.rank() == 0:
            dis.load_weights(dis_ckpt)
            gen.load_weights(gen_ckpt)
            loaded_optimizer_weights = np.load(optimizer_ckpt, allow_pickle=True)

    start_time = time.perf_counter()

    if hvd.rank() == 0:
        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            TqdmLoggingHandler(),
        ]
        summary_writer = None  # Only create a writer if we make it through a successful step
        logging.basicConfig(level=level, format=format, handlers=handlers)
        wandb_run_name = None

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        if log_args.run_name is None:
            metadata = (
                f"electra-{hvd.size()}gpus"
                f"-{train_args.per_gpu_batch_size * hvd.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.total_steps}steps"
            )
            run_name = (
                f"{current_time}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
            )
        else:
            run_name = log_args.run_name

    logger.info(f"Training with dataset at {path_args.train_dir}")
    logger.info(f"Validating with dataset at {path_args.val_dir}")

    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir, "*.tfrecord*")
    validation_glob = os.path.join(path_args.filesystem_prefix, path_args.val_dir, "*.tfrecord*")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)
    logger.info(
        f"Number of train files {len(train_filenames)}, number of validation files {len(validation_filenames)}"
    )

    tf_train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
        max_seq_length=data_args.max_seq_length,
    )

    tf_train_dataset = tf_train_dataset.prefetch(buffer_size=8)

    if hvd.rank() == 0:
        tf_val_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
            max_seq_length=data_args.max_seq_length,
        )
        tf_val_dataset = tf_val_dataset.prefetch(buffer_size=8)

    wandb_run_name = None

    step = 1
    for batch in tf_train_dataset:
        learning_rate = optimizer.learning_rate(step=tf.constant(step, dtype=tf.float32))
        ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        train_result = train_step(
            optimizer=optimizer,
            gen=gen,
            dis=dis,
            ids=ids,
            attention_mask=attention_mask,
            mask_token_id=tokenizer.mask_token_id,
        )

        if step == 1:
            # Horovod broadcast
            if hvd.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            hvd.broadcast_variables(gen.variables, root_rank=0)
            hvd.broadcast_variables(dis.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)
            step = optimizer.get_weights()[0]

        is_final_step = step >= train_args.total_steps
        if hvd.rank() == 0:
            do_log = step % log_args.log_frequency == 0
            do_checkpoint = (step > 1) and (
                (step % log_args.checkpoint_frequency == 0) or is_final_step
            )
            do_validation = step % log_args.validation_frequency == 0

            if do_log:
                elapsed_time = time.perf_counter() - start_time  # Off for first log
                it_s = log_args.log_frequency / elapsed_time
                start_time = time.perf_counter()
                description = f"Step {step} -- gen_loss: {train_result.gen_loss:.3f}, dis_loss: {train_result.dis_loss:.3f}, gen_acc: {train_result.gen_acc:.3f}, dis_acc: {train_result.dis_acc:.3f}, it/s: {it_s:.3f}\n"
                logger.info(description)

            if do_validation:
                for batch in tf_val_dataset.take(1):
                    val_ids = batch["input_ids"]
                    val_attention_mask = batch["attention_mask"]
                    val_result = val_step(
                        gen=gen,
                        dis=dis,
                        ids=val_ids,
                        attention_mask=val_attention_mask,
                        mask_token_id=tokenizer.mask_token_id,
                    )
                    log_example(
                        tokenizer,
                        val_ids,
                        val_result.masked_ids,
                        val_result.corruption_mask,
                        val_result.gen_ids,
                        val_result.dis_preds,
                    )
                    description = f"VALIDATION, Step {step} -- val_gen_loss: {val_result.gen_loss:.3f}, val_dis_loss: {val_result.dis_loss:.3f}, val_gen_acc: {val_result.gen_acc:.3f}, val_dis_acc: {val_result.dis_acc:.3f}\n"
                    logger.info(description)

            train_metrics = {
                "learning_rate": learning_rate,
                "train/loss": train_result.loss,
                "train/gen_loss": train_result.gen_loss,
                "train/dis_loss": train_result.dis_loss,
                "train/gen_acc": train_result.gen_acc,
                "train/dis_acc": train_result.dis_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_result.loss,
                    "val/gen_loss": val_result.gen_loss,
                    "val/dis_loss": val_result.dis_loss,
                    "val/gen_acc": val_result.gen_acc,
                    "val/dis_acc": val_result.dis_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_log:
                all_metrics = {"it_s": it_s, **all_metrics}

            if is_wandb_available():
                if wandb_run_name is None:
                    config = {
                        **asdict(model_args),
                        **asdict(data_args),
                        **asdict(train_args),
                        **asdict(log_args),
                        **asdict(path_args),
                        "global_batch_size": train_args.per_gpu_batch_size * hvd.size(),
                        "n_gpus": hvd.size(),
                    }
                    wandb.init(config=config, project="electra")
                    wandb.run.save()
                    wandb_run_name = wandb.run.name
                wandb.log({"step": step, **all_metrics})

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix, path_args.log_dir, run_name)
                )
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    **asdict(path_args),
                    "global_batch_size": train_args.per_gpu_batch_size * hvd.size(),
                    "n_gpus": hvd.size(),
                }

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=step)

            if do_checkpoint:
                dis_model_ckpt = os.path.join(
                    path_args.filesystem_prefix,
                    path_args.checkpoint_dir,
                    f"{run_name}-step{step}-discriminator.ckpt",
                )
                gen_model_ckpt = os.path.join(
                    path_args.filesystem_prefix,
                    path_args.checkpoint_dir,
                    f"{run_name}-step{step}-generator.ckpt",
                )
                optimizer_ckpt = os.path.join(
                    path_args.filesystem_prefix,
                    path_args.checkpoint_dir,
                    f"{run_name}-step{step}-optimizer.npy",
                )
                logger.info(
                    f"Saving discriminator model at {dis_model_ckpt}, generator model at {gen_model_ckpt}, optimizer at {optimizer_ckpt}"
                )
                dis.save_weights(dis_model_ckpt)
                gen.save_weights(gen_model_ckpt)
                np.save(optimizer_ckpt, optimizer.get_weights())

        step += 1
        if is_final_step:
            break
Example #10
    def load(cls,
             pretrained_model_name_or_path,
             revision=None,
             tokenizer_class=None,
             use_fast=True,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        model config or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False). Fast variants are loaded for all tokenizer classes handled below except EmbeddingTokenizer,
            which falls back to the Python implementation.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        kwargs["revision"] = revision

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(
                pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if "AlbertTokenizer" in tokenizer_class:
            if use_fast:
                ret = AlbertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "XLMRobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLMRobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = RobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "XLNetTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLNetTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "CamembertTokenizer" in tokenizer_class:
            if use_fast:
                ret = CamembertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRContextEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
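
# A minimal usage sketch, assuming this `load` is the classmethod on FARM's `Tokenizer`
# (import path farm.modeling.tokenization); the model name is only an example.
from farm.modeling.tokenization import Tokenizer

# infer the tokenizer class from the model config on the HuggingFace hub
tokenizer = Tokenizer.load("bert-base-uncased")
# or pin the class and force the slow (Python) implementation
tokenizer = Tokenizer.load("bert-base-uncased", tokenizer_class="BertTokenizer", use_fast=False)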
Example #11
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = ElectraForSequenceClassification.from_pretrained(
    'models/ELECTRA_last_line')
tokenizer = ElectraTokenizerFast.from_pretrained(
    'google/electra-small-discriminator')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=128,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json',
    data_files={'test': 'dataset_last_line/quanta_test.json'},
    field='questions')['test']
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize,
                                batched=True,
                                batch_size=len(test_dataset))
test_dataset.set_format('torch',


def load_model_tokenizer(path):
    return ElectraForQuestionAnswering.from_pretrained(path), \
           ElectraTokenizerFast.from_pretrained(path)
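
# Illustrative call (the checkpoint path is an example, borrowed from the QA snippet above):
#   model, tokenizer = load_model_tokenizer('deepset/electra-base-squad2')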
Example #13
from transformers import ElectraConfig, ElectraTokenizerFast, TFElectraForMaskedLM, create_optimizer
from transformers import DataCollatorForLanguageModeling, TFTrainer, TFTrainingArguments, LineByLineTextDataset
#from tokenizers import Tokenizer
#from tokenizers.models import WordPiece
from transformers.modeling_tf_utils import TFMaskedLanguageModelingLoss
import deco
from deco.sources import Dataset
import sys
from functools import partial
import tensorflow as tf

tokenizer = ElectraTokenizerFast("/data/pubmed/model/vocab.txt")
#tokens = tokenizer.tokenize("hello world")
#res = tokenizer(tokens, is_split_into_words=True)
#print(res)

tokenizer_func = partial(tokenizer, max_length=128, truncation=True, padding='max_length', \
                return_token_type_ids=True, return_attention_mask=True)
#res = tokenizer_func("Hi. What's up.")
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
#ds = Dataset.from_lines("abstracts/*.tsv").map(str.strip) \
#    .where(lambda a: a).map(tokenizer.tokenize).whole_word_mask().top(10)
#for item in ds:
#    print(item)
#sys.exit()
ds = Dataset.from_lines("abstracts/*.tsv").map(str.strip) \
    .where(lambda a: a).map(tokenizer_func).batch(32).map(data_collator)