# Example #1
    def test_integration(self):
        """End-to-end check that verbosity setters gate a real sub-logger."""
        original_level = logging.get_verbosity()

        bart_logger = logging.get_logger(
            "transformers.models.bart.tokenization_bart")
        message = "Testing 1, 2, 3"

        # Warnings are only expected to pass through when the ambient level
        # was not already raised (e.g. by `pytest --log-level-all`).
        if not original_level > logging.WARNING:
            with CaptureLogger(bart_logger) as captured:
                bart_logger.warning(message)
            self.assertEqual(captured.out, message + "\n")

        # Raising verbosity to ERROR silences all `transformers.*` loggers.
        logging.set_verbosity_error()
        with CaptureLogger(bart_logger) as captured:
            bart_logger.warning(message)
        self.assertEqual(captured.out, "")

        # Dropping back to WARNING lets the message through again.
        logging.set_verbosity_warning()
        with CaptureLogger(bart_logger) as captured:
            bart_logger.warning(message)
        self.assertEqual(captured.out, message + "\n")

        # Leave the global verbosity exactly as we found it.
        logging.set_verbosity(original_level)
    def __init__(self, model_folder: str, to_device, verbose=False):
        """Load a local question-answering model and its tokenizer.

        model_folder: folder holding the pretrained files (offline only).
        to_device: torch device (or device string) the model is moved to.
        verbose: when False, transformers logging is capped at WARNING.
        """
        self.model_folder = model_folder
        self.device = to_device

        # Keep transformers quiet unless verbose output was requested.
        if not verbose:
            logging.set_verbosity_warning()

        # load model strictly from local files — no hub download
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_folder, local_files_only=True)
        qa_model = AutoModelForQuestionAnswering.from_pretrained(
            model_folder, local_files_only=True)
        self.model = qa_model.to(to_device)
# Example #3
    def __init__(self, model_folder: str, to_device, verbose=False):
        """Load a local sequence-classification model and its tokenizer.

        model_folder: folder holding the pretrained files (offline only).
        to_device: torch device (or device string) the model is moved to.
        verbose: when False, transformers logging is capped at WARNING.
        """
        self.model_folder = model_folder
        self.device = to_device

        # Keep transformers quiet unless verbose output was requested.
        if not verbose:
            logging.set_verbosity_warning()

        # load model strictly from local files — no hub download
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_folder, local_files_only=True)
        classifier = AutoModelForSequenceClassification.from_pretrained(
            model_folder, local_files_only=True)
        self.model = classifier.to(to_device)

        # token budget per forward pass
        self.context_window = 1024
# Example #4
    def __init__(self, model_folder: str, to_device, verbose=False):
        """Load a local LED generation model and its tokenizer.

        Args:
            model_folder: folder holding the pretrained LED model and
                tokenizer files (loaded with local_files_only=True).
            to_device: torch device (or device string) the model is moved to.
            verbose: when False, transformers logging is capped at WARNING.
        """
        self.model_folder = model_folder
        self.device = to_device

        # Keep transformers quiet unless verbose output was requested.
        if not verbose:
            logging.set_verbosity_warning()

        # load model strictly from local files — no hub download
        self.tokenizer = LEDTokenizer.from_pretrained(model_folder,
                                                      local_files_only=True)
        self.model = LEDForConditionalGeneration.from_pretrained(
            model_folder, local_files_only=True).to(to_device)

        # model config
        # NOTE(review): only this one generation setting is visible here;
        # the original file may configure more — confirm before relying on it.
        self.model.config.no_repeat_ngram_size = 3
# Example #5
    def test_set_level(self):
        """Each set_verbosity_* helper must be reflected by the root logger."""
        logger = logging.get_logger()

        # Capture the ambient level (default WARNING) so it can be restored.
        original_level = logging.get_verbosity()

        # Apply every verbosity preset in turn and check the effective level
        # of the root transformers logger tracks get_verbosity().
        verbosity_setters = (
            logging.set_verbosity_error,
            logging.set_verbosity_warning,
            logging.set_verbosity_info,
            logging.set_verbosity_debug,
        )
        for apply_level in verbosity_setters:
            apply_level()
            self.assertEqual(logger.getEffectiveLevel(), logging.get_verbosity())

        # restore to the original level
        logging.set_verbosity(original_level)
from collections import OrderedDict
from os.path import basename, dirname

import fairseq
import torch
from fairseq import hub_utils
from fairseq.data.dictionary import Dictionary

from transformers import WEIGHTS_NAME, logging
from transformers.configuration_fsmt import FSMTConfig
from transformers.modeling_fsmt import FSMTForConditionalGeneration
from transformers.tokenization_fsmt import VOCAB_FILES_NAMES
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE


# Cap transformers logging at WARNING for this conversion script.
logging.set_verbosity_warning()

# Indentation used when dumping generated JSON files.
json_indent = 2

# based on the results of a search on a range of `num_beams`, `length_penalty` and `early_stopping`
# values against wmt19 test data to obtain the best BLEU scores, we will use the following defaults:
#
# * `num_beams`: 5 (higher scores better, but requires more memory/is slower, can be adjusted by users)
# * `early_stopping`: `False` consistently scored better
# * `length_penalty` varied, so will assign the best one depending on the model
# NOTE(review): this dict continues beyond the visible chunk.
best_score_hparams = {
    # fairseq:
    "wmt19-ru-en": {"length_penalty": 1.1},
    "wmt19-en-ru": {"length_penalty": 1.15},
    "wmt19-en-de": {"length_penalty": 1.0},
    "wmt19-de-en": {"length_penalty": 1.1},
# Example #7
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

from typing import Tuple, Dict
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

from datasets import load_dataset
from transformers import (TFAutoModelForSequenceClassification, AutoTokenizer,
                          logging as transformers_logging)

from callbacks import HFModelCheckPoint

# Cap transformers logging at WARNING to keep training output readable.
transformers_logging.set_verbosity_warning()
# Local model/checkpoint paths and the dataset columns used for training.
MODEL_PATH = r"D:\Models\NLP\longformer\tf-longformer-base-4096"
DATA_COLS = ['input_ids', 'attention_mask', 'label']
SAVE_PATH = r"D:\Fine-tuned Models\NLP\longformer\tf-longformer-base-4096"


def prep_data(fpath: str,
              tpath: str,
              text_col: str = 'text',
              label_col: str = 'label',
              seq_len: int = 500,
              *args,
              **kwargs) -> Tuple[tf.data.Dataset, int]:
    dataset = load_dataset('csv',
                           data_files=fpath,
                           split='train',
    AutoModel,
    AutoTokenizer,
    ProgressCallback,
)

from transformers.integrations import TensorBoardCallback, WandbCallback
from transformers.trainer_callback import PrinterCallback

from joint_nlu_models import *
from sklearn.metrics import classification_report
from preprocessing.conll_loader import ConLLLoader, intent_labels_list, slot_labels_list

from joint_metrics import running_metrics, joint_classification_report, exact_match

# configure loggers: quiet transformers, verbose application logging
trans_log.set_verbosity_warning()
logging.basicConfig( level=logging.INFO)

# disable huggingface tokenizer-parallelism warning and wandb reporting
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

# Suppress Python warnings unless the interpreter was started with -W.
# NOTE(review): `sys`, `warnings`, `os` and `trans_log` are assumed to be
# imported earlier in the full file — not visible in this chunk.
if not sys.warnoptions:
    warnings.simplefilter("ignore")



# disable wandb for memory efficiency.
#  this does not seem to work.
# TODO: file an issue on the huggingface/transformers GitHub repo.