Example #1
from typing import Dict

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification


def get_model(
        base_path: str,
        num_classes: int = 2,
        class_map: Dict[str, str] = {},
        freeze_base: bool = True,
        show_summary: bool = False) -> TFAutoModelForSequenceClassification:

    if len(class_map) > 0:
        rev_class_map: Dict[str, str] = {v: k for k, v in class_map.items()}
        model = TFAutoModelForSequenceClassification.from_pretrained(
            base_path,
            num_labels=num_classes,
            id2label=class_map,
            label2id=rev_class_map)
    else:
        model = TFAutoModelForSequenceClassification.from_pretrained(
            base_path, num_labels=num_classes)

    if freeze_base:
        model.layers[0].trainable = False

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    # Use from_logits=True since Hugging Face models output raw logits
    # (no final softmax/sigmoid layer is added)
    model.compile(optimizer,
                  loss=tf.keras.losses.CategoricalCrossentropy(
                      from_logits=True) if num_classes > 2 else
                  tf.keras.losses.BinaryCrossentropy(from_logits=True))

    if show_summary:
        model.summary()
    return model
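A brief usage sketch for the helper above; the checkpoint name and label names below are placeholders, not part of the original snippet:

# Hypothetical call: a binary sentiment head on a DistilBERT checkpoint
clf_model = get_model(
    "distilbert-base-uncased",
    num_classes=2,
    class_map={"0": "negative", "1": "positive"},
    freeze_base=True,
    show_summary=True,
)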
Example #2
    async def train(self, sources: Sources):
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.tokenizer_name
            if self.parent.config.tokenizer_name else
            self.parent.config.model_name_or_path,
            cache_dir=self.parent.config.cache_dir,
        )
        with self.parent.config.strategy.scope():
            self.model = TFAutoModelForSequenceClassification.from_pretrained(
                self.parent.config.model_name_or_path,
                from_pt=self.parent.config.from_pt,
                config=self.config,
                cache_dir=self.parent.config.cache_dir,
            )
        train_features = await self._preprocess_data(sources)
        train_dataset = await self.example_features_to_dataset(train_features)
        trainer = TFTrainer(
            model=self.model,
            args=self.parent.config,
            train_dataset=train_dataset,
        )
        trainer.train()
        self.logger.info("Saving model to %s", self.parent.config.output_dir)
        trainer.save_model()
        self.tokenizer.save_pretrained(self.parent.config.output_dir)
Example #3
    async def accuracy(self, sources: Sources):
        if not os.path.isfile(
                os.path.join(self.parent.config.output_dir, "tf_model.h5")):
            raise ModelNotTrained("Train model before assessing for accuracy.")

        config = self.parent.config._asdict()
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir)
        eval_features = await self._preprocess_data(sources)
        eval_dataset = await self.example_features_to_dataset(eval_features)

        def compute_metrics(p: EvalPrediction) -> Dict:
            preds = self.np.argmax(p.predictions, axis=1)
            return classification_compute_metrics(preds, p.label_ids)

        with self.parent.config.strategy.scope():
            self.model = TFAutoModelForSequenceClassification.from_pretrained(
                config["directory"])
        trainer = TFTrainer(
            model=self.model,
            args=self.parent.config,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )
        result = trainer.evaluate()
        return Accuracy(result["eval_acc"])
Example #4
    def __init__(self):
        self.model = TFAutoModelForSequenceClassification.from_pretrained(
            'C:/Users/TwitterCovid-19/ANACONDA_WORKBENCH/SENTIMENT ANALYSIS/CT-BERT_FineTuned'
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            'digitalepidemiologylab/covid-twitter-bert-v2',
            add_special_tokens=True)
        print("CT-BERT model and tokenizer initialized")
Example #5
    def __init__(self, model_path="./pretrained_models/sentiment_analysis/", threshold=0.5):
        # If the score of a comment is below this threshold, it is considered neutral
        self.threshold = threshold

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

        self.nlp = pipeline('sentiment-analysis', model=self.model, tokenizer=self.tokenizer)
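A hypothetical helper showing how the stored threshold could gate the pipeline output; this method is not part of the original snippet:

    def classify(self, comment):
        result = self.nlp(comment)[0]  # e.g. {'label': 'POSITIVE', 'score': 0.98}
        if result['score'] < self.threshold:
            return 'NEUTRAL'  # low-confidence scores count as neutral
        return result['label']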
Example #6
    def __init__(self, extractor, config, *args, **kwargs):
        super(TFBERTMaxP_Class, self).__init__(*args, **kwargs)
        self.extractor = extractor

        # TODO hidden prob missing below?
        if config["pretrained"] == "electra-base-msmarco":
            self.bert = TFAutoModelForSequenceClassification.from_pretrained("Capreolus/electra-base-msmarco")
            dropout, fc = self.bert.classifier.dropout, self.bert.classifier.out_proj
            self.bert.classifier = TFElectraRelevanceHead(dropout, fc)
        elif config["pretrained"] == "bert-base-msmarco":
            self.bert = TFAutoModelForSequenceClassification.from_pretrained("Capreolus/bert-base-msmarco")
        else:
            self.bert = TFAutoModelForSequenceClassification.from_pretrained(
                config["pretrained"], hidden_dropout_prob=config["hidden_dropout_prob"]
            )

        self.config = config
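For context, a sketch of how the class above might be instantiated; the config keys are taken from the constructor, while the values and the `extractor` object are assumptions:

# Hypothetical instantiation (values are placeholders):
config = {"pretrained": "bert-base-uncased", "hidden_dropout_prob": 0.1}
model = TFBERTMaxP_Class(extractor, config)  # `extractor` comes from the surrounding framework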
Example #7
    def _load_local_model(self, model_path):
        try:
            self._tokenizer = AutoTokenizer.from_pretrained(model_path +
                                                            '/tokenizer')
        # Old models didn't use to have a tokenizer folder
        except OSError:
            self._tokenizer = AutoTokenizer.from_pretrained(model_path)
        self._model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path, from_pt=False)
Example #8
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine",
                                                       use_fast=True)
        self.classifier = TFAutoModelForSequenceClassification.from_pretrained(
            "tblard/tf-allocine")

        self.sentiment_analyzer = pipeline('sentiment-analysis',
                                           model=self.classifier,
                                           tokenizer=self.tokenizer)
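A usage sketch, assuming `analyzer` is an instance of the class above; tblard/tf-allocine is a French sentiment model, so a French review makes a natural smoke test:

result = analyzer.sentiment_analyzer("Un film magnifique, je le recommande !")
print(result)  # e.g. [{'label': 'POSITIVE', 'score': 0.99}]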
Example #9
    def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
        do_lower_case = False
        if 'uncased' in model_name.lower():
            do_lower_case = True
        tokenizer_kwargs.update({'do_lower_case': do_lower_case})

        self._tokenizer = None
        self._model = None

        self._tokenizer = AutoTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)

        temporary_path = self._get_temporary_path()
        make_dir(temporary_path)

        # TensorFlow model
        try:
            self._model = TFAutoModelForSequenceClassification.from_pretrained(
                model_name, from_pt=False)

        # PyTorch model
        except TypeError:
            try:
                self._model = TFAutoModelForSequenceClassification.from_pretrained(
                    model_name, from_pt=True)

            # Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name
            except OSError:
                model = AutoModel.from_pretrained(model_name)
                model.save_pretrained(temporary_path)
                self._model = TFAutoModelForSequenceClassification.from_pretrained(
                    temporary_path, from_pt=True)

        # Reset base model if the number of labels does not match
        if self._model.config.num_labels != model_kwargs['num_labels']:
            self._model.config.__dict__.update(model_kwargs)
            getattr(self._model,
                    self._get_model_family()).save_pretrained(temporary_path)
            self._model = TFAutoModelForSequenceClassification.from_pretrained(
                temporary_path, from_pt=False)

        remove_dir(temporary_path)
        assert self._tokenizer and self._model
Example #10
    def test_sequence_classification_model_from_pretrained(self):
        # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForSequenceClassification)
Example #11
    def initialize(self):
        """
        Initializes everything, takes a bit of time
        """
        cache_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "storage", "hfcache")

        # Load pretrained model and tokenizer
        model = TFAutoModelForSequenceClassification.from_pretrained("tblard/tf-allocine", cache_dir=cache_path)
        tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine", cache_dir=cache_path)

        self.nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
Example #12
    def test_sequence_classification_model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        for model_name in ["bert-base-uncased"]:
            config = AutoConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, BertConfig)

            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, TFBertForSequenceClassification)
Example #13
def get_sentiment_model(
        model_name="distilbert-base-uncased-finetuned-sst-2-english",
        cache_dir="transformers_models/"):
    if not (isinstance(model_name, str) and isinstance(cache_dir, str)):
        raise SentimentModelInputTypeError()

    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              cache_dir=cache_dir,
                                              local_files_only=False)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name, cache_dir=cache_dir, local_files_only=False)
    return model, tokenizer
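The returned pair plugs directly into a transformers pipeline; a brief sketch:

from transformers import pipeline

model, tokenizer = get_sentiment_model()
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
print(classifier('The documentation was clear and the setup was painless.'))
# e.g. [{'label': 'POSITIVE', 'score': 0.99}]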
Example #14
    async def predict(
        self, sources: SourcesContext
    ) -> AsyncIterator[Tuple[Record, Any, float]]:
        if not os.path.isfile(
            os.path.join(self.parent.config.output_dir, "tf_model.h5")
        ):
            raise ModelNotTrained("Train model before prediction.")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.parent.config.output_dir
        )

        with self.parent.config.strategy.scope():
            self.model = TFAutoModelForSequenceClassification.from_pretrained(
                self.parent.config.output_dir
            )
        trainer = TFTrainer(model=self.model, args=self.parent.config)
        async for record in sources.with_features(self.features):
            to_predict = record.features(self.features)
            eval_example = [
                InputExample(
                    0,
                    to_predict[self.features[0]],
                    None,
                    self.parent.config.label_list[0],
                )
            ]
            eval_features = glue_convert_examples_to_features(
                eval_example,
                self.tokenizer,
                self.parent.config.max_seq_length,
                self.parent.config.task_name,
                self.parent.config.label_list,
            )
            eval_dataset = await self.example_features_to_dataset(
                eval_features
            )

            all_prob = trainer.predict(eval_dataset).predictions
            max_prob_idx = all_prob.argmax(axis=-1)
            self.logger.debug(
                "Predicted probability of {} for {}: {}".format(
                    self.parent.config.predict.name, to_predict, all_prob[0],
                )
            )
            record.predicted(
                self.parent.config.predict.name,
                self.parent.config.label_list[max_prob_idx[0]],
                all_prob[0][max_prob_idx[0]],
            )
            yield record
Example #15
def Sentiment(text):
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
    from transformers import pipeline
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = pipeline('sentiment-analysis',
                          model=tf_model,
                          tokenizer=tokenizer)
    results = classifier(text)
    for result in results:
        print(
            f"label: {result['label']}, with score: {round(result['score'], 4)}"
        )
    return results
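For example:

results = Sentiment("I absolutely loved this movie!")
# prints, e.g.: label: POSITIVE, with score: 0.9999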
Example #16
    def compile(self, lr, freeze_pretrained):

        loss = 'binary_crossentropy'

        # Wrap up model + compile with optimizer and loss function
        # self.model = Model(inputs=word_inputs, outputs=[outputs])
        # self.model = TFBertForSequenceClassification.from_pretrained(Configuration['model']['uri'], num_labels=4271)

        # Import the needed model(Bert, Roberta or DistilBert) with output_hidden_states=True
        if "longformer" in Configuration["model"]["uri"]:
            transformer_model = TFLongformerForSequenceClassification.from_pretrained(
                Configuration['model']['uri'], num_labels=4271)

        else:
            transformer_model = TFAutoModelForSequenceClassification.from_pretrained(
                Configuration['model']['uri'], num_labels=4271)

        if freeze_pretrained:
            # model.layers is typically [TFBertMainLayer, Dropout, Dense];
            # freezing layers[0] freezes the pretrained transformer body
            transformer_model.layers[0].trainable = False

        input_ids = tf.keras.Input(
            shape=(Configuration['sampling']['max_sequence_size'], ),
            dtype='int32')
        attention_mask = tf.keras.Input(
            shape=(Configuration['sampling']['max_sequence_size'], ),
            dtype='int32')

        transformer = transformer_model([input_ids, attention_mask],
                                        training=True)
        logits = transformer[0]  # first output: the classification head's raw logits

        output = tf.keras.activations.sigmoid(logits)
        self.model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                                           outputs=output)

        self.model.compile(optimizer=Adam(learning_rate=lr), loss=loss)
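A quick smoke test can confirm the compiled graph's shapes; this sketch assumes an instance `clf` of the class above and uses placeholder arrays sized to the configured sequence length:

# Hypothetical smoke test (placeholder inputs, not part of the original class):
import numpy as np

seq_len = Configuration['sampling']['max_sequence_size']
dummy_ids = np.zeros((2, seq_len), dtype='int32')
dummy_mask = np.ones((2, seq_len), dtype='int32')
scores = clf.model.predict([dummy_ids, dummy_mask])  # shape (2, 4271), sigmoid scores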
Example #17
  def __init__(self):
    """Possible states are
    1. "await" (awaiting response)
    2. "proceed" (proceed with the conversation) - used to give the bot control over the conversation"""
    self._state = "await"

    """Possible flags are
    1. "Exec" (task executed)
    2. "notExec" (task not executed)"""
    self._FLAG = None
    self._bert_base_case_mrpc_tokenizer=AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    self._bert_base_case_mrpc_model=TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
    self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self._gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=self._gpt2_tokenizer.eos_token_id)
    self.bert_large_uncased_whole_word_masking_finetuned_squad_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    self.bert_large_uncased_whole_word_masking_finetuned_squad_model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    self._DialoGP_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    self._DialoGP_model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")



    self._conversation_started = False
    self._conversation_ended = True
Example #18
def paraphrase(sequence_0, sequence_1):
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
    import tensorflow as tf
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased-finetuned-mrpc")
    classes = ["not paraphrase", "is paraphrase"]
    # sequence_0 = "The company HuggingFace is based in New York City"
    # sequence_1 = "Apples are especially bad for your health"
    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
    paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")  # sequence_2 paraphrases sequence_0
    not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")
    paraphrase_classification_logits = model(paraphrase)[0]
    not_paraphrase_classification_logits = model(not_paraphrase)[0]
    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits,
                                       axis=1).numpy()[0]
    # not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
    # Should be paraphrase
    for i in range(len(classes)):
        print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
    # Should not be paraphrase
    # for i in range(len(classes)):
    #     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
    return
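For example, using the sentences from the commented-out defaults:

paraphrase("The company HuggingFace is based in New York City",
           "Apples are especially bad for your health")
# prints, e.g.:
# not paraphrase: 6%
# is paraphrase: 94%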
Example #19
    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        # Load model and tokenizer
        model = TFAutoModelForSequenceClassification.from_pretrained(
            args.model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

        # Load dataset
        train_dataset, test_dataset = load_dataset("imdb",
                                                   split=["train", "test"])

        # Preprocess train dataset
        train_dataset = train_dataset.map(lambda e: tokenizer(
            e["text"], truncation=True, padding="max_length"),
                                          batched=True)
        train_dataset.set_format(
            type="tensorflow",
            columns=["input_ids", "attention_mask", "label"])

        train_features = {
Example #20
from nltk.corpus import stopwords
from data_utils import data_utils
import os
from transformers import TFBertForQuestionAnswering, BertTokenizer
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf
from rake_nltk import Rake
import column_types
import json
from clauses import Clause
from conditionmaps import conditions
from nltk.stem import WordNetLemmatizer 

qa_model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
rerank_model = TFAutoModelForSequenceClassification.from_pretrained('bert-large-cased-whole-word-masking')
rerank_tokenizer = AutoTokenizer.from_pretrained('bert-large-cased-whole-word-masking')
lemmatizer = WordNetLemmatizer()
lem = lemmatizer.lemmatize
stop_words = set(stopwords.words('english'))


def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    if phrases:
        r = Rake()
        if isinstance(doc, (list, tuple)):
            r.extract_keywords_from_sentences(doc)
        else:
            r.extract_keywords_from_text(doc)
        if return_scores:
            return [(b, a) for a, b in r.get_ranked_phrases_with_scores()]
Example #21
    def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
        do_lower_case = False
        if 'uncased' in model_name.lower():
            do_lower_case = True
        tokenizer_kwargs.update({'do_lower_case': do_lower_case})

        self._tokenizer = AutoTokenizer.from_pretrained(
            model_name, **tokenizer_kwargs)
        self._config = AutoConfig.from_pretrained(model_name)

        temporary_path = self._get_temporary_path()
        make_dir(temporary_path)

        # TensorFlow model
        try:
            self._model = TFAutoModelForSequenceClassification.from_pretrained(
                model_name, from_pt=False)

        # PyTorch model
        except TypeError:
            try:
                self._model = \
                    TFAutoModelForSequenceClassification.from_pretrained(
                        model_name,
                        from_pt=True
                    )

            # Loading a TF model from a PyTorch checkpoint is not supported
            # when using a model identifier name
            except OSError:
                model = AutoModel.from_pretrained(model_name)
                model.save_pretrained(temporary_path)
                self._model = \
                    TFAutoModelForSequenceClassification.from_pretrained(
                        temporary_path,
                        from_pt=True
                    )

        # Clean the model's last layer if the provided properties are different
        clean_last_layer = False
        for key, value in model_kwargs.items():
            if not hasattr(self._model.config, key):
                clean_last_layer = True
                break

            if getattr(self._model.config, key) != value:
                clean_last_layer = True
                break

        if clean_last_layer:
            try:
                getattr(
                    self._model,
                    self._get_model_family()).save_pretrained(temporary_path)
                self._model = self._model.__class__.from_pretrained(
                    temporary_path, from_pt=False, **model_kwargs)

            # The model is itself the main layer
            except AttributeError:
                # TensorFlow model
                try:
                    self._model = self._model.__class__.from_pretrained(
                        model_name, from_pt=False, **model_kwargs)

                # PyTorch Model
                except (OSError, TypeError):
                    model = AutoModel.from_pretrained(model_name)
                    model.save_pretrained(temporary_path)
                    self._model = self._model.__class__.from_pretrained(
                        temporary_path, from_pt=True, **model_kwargs)

        remove_dir(temporary_path)
        assert self._tokenizer and self._model
Example #22
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, GlueDataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    try:
        num_labels = glue_tasks_num_labels["mnli" if data_args.task_name == "mnli-mm" else data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    train_dataset = (
        get_tfds(
            task_name=data_args.task_name,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            data_dir=data_args.data_dir,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        get_tfds(
            task_name=data_args.task_name,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            mode=Split.dev,
            data_dir=data_args.data_dir,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")

            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

            results.update(result)

    return results
Example #23
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    train_dataset, eval_dataset, test_ds, label2id = get_tfds(
        train_file=data_args.train_file,
        eval_file=data_args.dev_file,
        test_file=data_args.test_file,
        tokenizer=tokenizer,
        label_column_id=data_args.label_column_id,
        max_seq_length=data_args.max_seq_length,
    )

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=len(label2id),
        label2id=label2id,
        id2label={id: label
                  for label, id in label2id.items()},
        finetuning_task="text-classification",
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)

        return {"acc": (preds == p.label_ids).mean()}

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        trainer.log_metrics("eval", result)
        trainer.save_metrics("eval", result)
        results.update(result)

    return results
Example #24
def main():
    parser = HfArgumentParser((GeneralArguments, ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        op_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        op_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
            os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Global setup
    labels = op_args._labels()
    output_mode = op_args.output_mode

    if output_mode == 'classification':
        label2id = {label: i for i, label in enumerate(labels)}
        id2label = {v: k for k, v in label2id.items()}
    else:
        num_labels = 1

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if output_mode == 'classification':
        config = AutoConfig.from_pretrained(
            model_args.config_name if model_args.config_name else model_args.model_name_or_path,
            label2id=label2id,
            id2label=id2label,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
    else:
        config = AutoConfig.from_pretrained(
            model_args.config_name if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        YourDataset(data_args, op_args=op_args, tokenizer=tokenizer, mode=Split.train)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        YourDataset(data_args, op_args=op_args, tokenizer=tokenizer, mode=Split.dev)
        if training_args.do_eval
        else None
    )
    test_dataset = (
        YourDataset(data_args, op_args=op_args, tokenizer=tokenizer, mode=Split.test)
        if training_args.do_predict
        else None
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=mode_compute_metrics(op_args.output_mode),
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Convert PT to TF
    if op_args.convert_to_tf:
        logger.info("***** Convert PT to TF {} *****".format(training_args.output_dir + 'pytorch_model.bin'))
        tf_model = TFAutoModelForSequenceClassification.from_pretrained(training_args.output_dir, from_pt=True)
        tf_model.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]

        for eval_dataset in eval_datasets:
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")

        test_datasets = [test_dataset]
        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions

            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results
Example #25

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(
    range(104 * 3))
full_eval_dataset = tokenized_datasets["test"]

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from datetime import datetime
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased",
                                                             num_labels=2)

tf_eval_dataset = small_eval_dataset.remove_columns(
    ["text"]).with_format("tensorflow")

eval_features = {
    x: tf_eval_dataset[x].to_tensor()
    for x in tokenizer.model_input_names
}

eval_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (eval_features, tf_eval_dataset["label"]))
eval_tf_dataset = eval_tf_dataset.batch(8)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
Example #26
    def __call__(self, text_inputs):
        raw_outputs = self.pipeline(text_inputs)
        outputs = []
        for output in raw_outputs:
            score = output["score"]
            if output["label"] == "POSITIVE":
                outputs.append([1 - score, score])
            else:
                outputs.append([score, 1 - score])
        return np.array(outputs)


# Create the model: a French sentiment analysis model.
# see https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
model = TFAutoModelForSequenceClassification.from_pretrained("tblard/tf-allocine")
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")
pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(pipeline)

# Create the recipe: PWWS uses a WordNet transformation.
recipe = PWWSRen2019.build(model_wrapper)
# WordNet defaults to english. Set the default language to French ('fra')
#
# See
# "Building a free French wordnet from multilingual resources",
# E. L. R. A. (ELRA) (ed.),
# Proceedings of the Sixth International Language Resources and Evaluation (LREC’08).

recipe.transformation.language = "fra"
Example #27
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if not (training_args.do_train or training_args.do_eval
            or training_args.do_predict):
        exit(
            "Must specify at least one of --do_train, --do_eval or --do_predict!"
        )
    # endregion

    # region Checkpoints
    checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        checkpoint = get_last_checkpoint(training_args.output_dir)
        if checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Dataset and labels
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee
    # that only one local process can concurrently download the dataset.
    datasets = load_dataset("glue",
                            data_args.task_name,
                            cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    is_regression = data_args.task_name == "stsb"
    if not is_regression:
        label_list = datasets["train"].features["label"].names
        num_labels = len(label_list)
    else:
        num_labels = 1

    if data_args.predict_file is not None:
        logger.info("Preparing user-supplied file for predictions...")

        data_files = {"data": data_args.predict_file}

        for key in data_files.keys():
            logger.info(f"Loading a local file for {key}: {data_files[key]}")

        if data_args.predict_file.endswith(".csv"):
            # Loading a dataset from local csv files
            user_dataset = load_dataset("csv",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
            user_dataset = load_dataset("json",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        needed_keys = task_to_keys[data_args.task_name]
        for key in needed_keys:
            assert key in user_dataset[
                "data"].features, f"Your supplied predict_file is missing the {key} key!"
        datasets["user_data"] = user_dataset["data"]
    # endregion

    # region Load model config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    non_label_column_names = [
        name for name in datasets["train"].column_names if name != "label"
    ]

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if config.label2id != PretrainedConfig(
            num_labels=num_labels).label2id and not is_regression:
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: int(label_name_to_id[label_list[i]])
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.",
            )
            label_to_id = {label: i for i, label in enumerate(label_list)}
    if label_to_id is not None:
        config.label2id = label_to_id
        config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        config.label2id = {l: i for i, l in enumerate(label_list)}
        config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    # endregion

    # region Metric function
    metric = load_metric("glue", data_args.task_name)

    def compute_metrics(preds, label_ids):
        preds = preds["logits"]
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        result = metric.compute(predictions=preds, references=label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result

    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Optimizer, loss and compilation
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )
        if is_regression:
            loss_fn = tf.keras.losses.MeanSquaredError()
            metrics = []
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
        # endregion

        # region Convert data to a tf.data.Dataset
        tf_data = dict()
        if isinstance(
                training_args.strategy,
                tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
            logger.info(
                "Padding all batches to max length because argument was set or we're on TPU."
            )
            dataset_mode = "constant_batch"
        else:
            dataset_mode = "variable_batch"
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_eval_samples,
            "validation_matched": data_args.max_eval_samples,
            "validation_mismatched": data_args.max_eval_samples,
            "test": data_args.max_predict_samples,
            "test_matched": data_args.max_predict_samples,
            "test_mismatched": data_args.max_predict_samples,
            "user_data": None,
        }
        for key in datasets.keys():
            if key == "train" or key.startswith("validation"):
                assert "label" in datasets[
                    key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size
                drop_remainder = True  # Saves us worrying about scaling gradients for the last batch
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size
                drop_remainder = False
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))
            data = convert_dataset_for_tensorflow(
                dataset,
                non_label_column_names,
                batch_size=batch_size,
                dataset_mode=dataset_mode,
                drop_remainder=drop_remainder,
                shuffle=shuffle,
            )
            tf_data[key] = data
        # endregion

        # region Training and validation
        if training_args.do_train:
            callbacks = [
                SavePretrainedCallback(output_dir=training_args.output_dir)
            ]
            if training_args.do_eval and not data_args.task_name == "mnli":
                # Do both evaluation and training in the Keras fit loop, unless the task is MNLI
                # because MNLI has two validation sets
                validation_data = tf_data["validation"]
            else:
                validation_data = None
            model.fit(
                tf_data["train"],
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        # endregion

        # region Evaluation
        if training_args.do_eval:
            # We normally do validation as part of the Keras fit loop, but we run it independently
            # if there was no fit() step (because we didn't train the model) or if the task is MNLI,
            # because MNLI has a separate validation-mismatched validation set
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            if data_args.task_name == "mnli":
                tasks = ["mnli", "mnli-mm"]
                tf_datasets = [
                    tf_data["validation_matched"],
                    tf_data["validation_mismatched"]
                ]
                raw_datasets = [
                    datasets["validation_matched"],
                    datasets["validation_mismatched"]
                ]
            else:
                tasks = [data_args.task_name]
                tf_datasets = [tf_data["validation"]]
                raw_datasets = [datasets["validation"]]

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                eval_predictions = model.predict(tf_dataset)
                eval_metrics = compute_metrics(eval_predictions,
                                               raw_dataset["label"])
                print(f"Evaluation metrics ({task}):")
                print(eval_metrics)

        # endregion

        # region Prediction
        if training_args.do_predict or data_args.predict_file:
            logger.info("*** Predict ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            tasks = []
            tf_datasets = []
            raw_datasets = []
            if training_args.do_predict:
                if data_args.task_name == "mnli":
                    tasks.extend(["mnli", "mnli-mm"])
                    tf_datasets.extend(
                        [tf_data["test_matched"], tf_data["test_mismatched"]])
                    raw_datasets.extend([
                        datasets["test_matched"], datasets["test_mismatched"]
                    ])
                else:
                    tasks.append(data_args.task_name)
                    tf_datasets.append(tf_data["test"])
                    raw_datasets.append(datasets["test"])
            if data_args.predict_file:
                tasks.append("user_data")
                tf_datasets.append(tf_data["user_data"])
                raw_datasets.append(datasets["user_data"])

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets,
                                                     tasks):
                test_predictions = model.predict(tf_dataset)
                if "label" in raw_dataset:
                    test_metrics = compute_metrics(test_predictions,
                                                   raw_dataset["label"])
                    print(f"Test metrics ({task}):")
                    print(test_metrics)

                if is_regression:
                    predictions_to_write = np.squeeze(
                        test_predictions["logits"])
                else:
                    predictions_to_write = np.argmax(
                        test_predictions["logits"], axis=1)

                output_predict_file = os.path.join(
                    training_args.output_dir, f"predict_results_{task}.txt")
                with open(output_predict_file, "w") as writer:
                    logger.info(
                        f"***** Writing prediction results for {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions_to_write):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = model.config.id2label[item]
                            writer.write(f"{index}\t{item}\n")
Example #28
def save_tf_model_from_transformers():
    model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
    call_fn = tf.function(model.call)  # renamed from `callable` to avoid shadowing the built-in
    concrete_function = call_fn.get_concrete_function([tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="input_ids"),
                                                       tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="attention_mask")])
    model.save('saved_model/distilbert/1', signatures=concrete_function)
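
# A usage sketch (an assumption, not part of the original listing): load the
# exported SavedModel and call its serving signature directly. The checkpoint
# name and `MAX_SEQ_LEN` mirror the export above; the output key may be
# "logits" or "output_0" depending on the transformers version.
def run_saved_model_inference(text="a genuinely great movie"):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
    serving_fn = tf.saved_model.load("saved_model/distilbert/1").signatures["serving_default"]
    enc = tokenizer(text, padding="max_length", max_length=MAX_SEQ_LEN,
                    truncation=True, return_tensors="tf")
    outputs = serving_fn(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
    print(outputs)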
Example #29
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_text_classification", model_args, data_args, framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )

    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)

    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Loading data
    # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally
    # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two
    # columns are provided. Note that the term 'sentence' can be misleading, as these fields often contain more
    # than a single grammatical sentence when the task requires it.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
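    # For example (an assumed illustration, not from the original script), a CSV
    # accepted here might look like:
    #   sentence1,sentence2,label
    #   "The weather is nice.","It is sunny outside.",entailment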
    data_files = {"train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file}
    data_files = {key: file for key, file in data_files.items() if file is not None}

    for key in data_files.keys():
        logger.info(f"Loading a local file for {key}: {data_files[key]}")

    if data_args.input_file_extension == "csv":
        # Loading a dataset from local csv files
        datasets = load_dataset(
            "csv",
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Loading a dataset from local json files
        datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Label preprocessing
    # If you've passed us a training set, we try to infer your labels from it
    if "train" in datasets:
        # By default we assume that if your label column looks like a float then you're doing regression,
        # and if not then you're doing classification. This is something you may want to change!
        is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)
    # If you haven't passed a training set, we read label info from the saved model (this happens later)
    else:
        num_labels = None
        label_list = None
        is_regression = None
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path
    if num_labels is not None:
        config = AutoConfig.from_pretrained(
            config_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = AutoConfig.from_pretrained(
            config_path,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    column_names = {col for cols in datasets.column_names.values() for col in cols}
    non_label_column_names = [name for name in column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    elif "sentence1" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", None
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Ensure that our labels match the model's, if it has some pre-specified
    if "train" in datasets:
        if not is_regression and config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
            label_name_to_id = config.label2id
            if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
                label_to_id = label_name_to_id  # Use the model's labels
            else:
                logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the dataset: "
                    f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels:"
                    f" {list(sorted(label_list))}.\nIgnoring the model labels as a result."
                )
                label_to_id = {v: i for i, v in enumerate(label_list)}
        elif not is_regression:
            label_to_id = {v: i for i, v in enumerate(label_list)}
        else:
            label_to_id = None
        # Now we've established our label2id, let's overwrite the model config with it.
        config.label2id = label_to_id
        if config.label2id is not None:
            config.id2label = {idx: label for label, idx in label_to_id.items()}
        else:
            config.id2label = None
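        # e.g. with label_to_id == {"negative": 0, "positive": 1} (an illustrative
        # mapping), config.id2label becomes {0: "negative", 1: "positive"}.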
    else:
        label_to_id = config.label2id  # Just load the data from the model

    if "validation" in datasets and config.label2id is not None:
        validation_label_list = datasets["validation"].unique("label")
        for val_label in validation_label_list:
            assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!"

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, max_length=max_seq_length, truncation=True)

        # Map labels to IDs
        if config.label2id is not None and "label" in examples:
            result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
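    # After mapping, each split gains tokenizer outputs such as `input_ids` and
    # `attention_mask` (plus `token_type_ids` for some models), with `label`
    # converted to integer IDs where applicable.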

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
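    # `DataCollatorWithPadding` pads each batch dynamically to its longest member,
    # while `DefaultDataCollator` only stacks tensors and so expects all sequences
    # to already share a length.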
    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        # Set seed before initializing model
        set_seed(training_args.seed)
        #
        # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Optimizer, loss and compilation
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )
        if is_regression:
            loss_fn = tf.keras.losses.MeanSquaredError()
            metrics = []
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
        # endregion

        # region Convert data to a tf.data.Dataset

        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_val_samples,
            "test": data_args.max_test_samples,
        }
        for key in ("train", "validation", "test"):
            if key not in datasets:
                tf_data[key] = None
                continue
            if key in ("train", "validation"):
                assert "label" in datasets[key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size
                drop_remainder = True  # Saves us worrying about scaling gradients for the last batch
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size
                drop_remainder = False
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))
            data = dataset.to_tf_dataset(
                columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
                shuffle=shuffle,
                batch_size=batch_size,
                collate_fn=data_collator,
                drop_remainder=drop_remainder,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label" if "label" in dataset.column_names else None,
            )
            tf_data[key] = data
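            # With `label_cols` set, `to_tf_dataset` yields `(features, labels)`
            # tuples, which is the structure `fit()`/`evaluate()` expect.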
        # endregion

        # region Training and validation
        if tf_data["train"] is not None:
            callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
            model.fit(
                tf_data["train"],
                validation_data=tf_data["validation"],
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        elif tf_data["validation"] is not None:
            # If there's a validation dataset but no training set, just evaluate the metrics
            logger.info("Computing metrics on validation data...")
            if is_regression:
                loss = model.evaluate(tf_data["validation"])
                logger.info(f"Loss: {loss:.5f}")
            else:
                loss, accuracy = model.evaluate(tf_data["validation"])
                logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%")
        # endregion

        # region Prediction
        if tf_data["test"] is not None:
            logger.info("Doing predictions on test dataset...")
            predictions = model.predict(tf_data["test"])["logits"]
            predicted_class = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
            output_test_file = os.path.join(training_args.output_dir, "test_results.txt")
            with open(output_test_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predicted_class):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        item = config.id2label[item]
                        writer.write(f"{index}\t{item}\n")
            logger.info(f"Wrote predictions to {output_test_file}!")
        # endregion

    # region Prediction losses
    # This section is outside the scope() because it's very quick to compute, but behaves badly inside it
    if "test" in datasets and "label" in datasets["test"].features:
        print("Computing prediction loss on test labels...")
        labels = datasets["test"]["label"]
        loss = float(loss_fn(labels, predictions).numpy())
        print(f"Test loss: {loss:.4f}")
Example #30
def main():

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--model_name_or_path", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)
    parser.add_argument("--do_train", type=bool, default=True)
    parser.add_argument("--do_eval", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir",
                        type=str,
                        default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir",
                        type=str,
                        default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus",
                        type=str,
                        default=os.environ["SM_NUM_GPUS"])

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    logger.info(args)
    #
    # Preprocessing
    #

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="longest"),
        batched=True)
    train_dataset.set_format(type="tensorflow",
                             columns=["input_ids", "attention_mask", "label"])

    train_features = {
        x: train_dataset[x].to_tensor(default_value=0,
                                      shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_features, train_dataset["label"])).batch(args.train_batch_size)

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"),
        batched=True)
    test_dataset.set_format(type="tensorflow",
                            columns=["input_ids", "attention_mask", "label"])

    test_features = {
        x: test_dataset[x].to_tensor(default_value=0,
                                     shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_test_dataset = tf.data.Dataset.from_tensor_slices(
        (test_features, test_dataset["label"])).batch(args.eval_batch_size)

    #
    # Training
    #

    # Load model
    # The implementation follows Horovod's Keras integration, used together with SageMaker:
    # https://horovod.readthedocs.io/en/stable/keras.html

    # adjust optimizer
    # https://sagemaker.readthedocs.io/en/stable/api/training/sdp_versions/smd_data_parallel_tensorflow.html#smdistributed.dataparallel.tensorflow.DistributedOptimizer
    learning_rate = args.learning_rate * hvd.size()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    optimizer = hvd.DistributedOptimizer(optimizer)

    # define the model, loss and metrics
    model = TFAutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

    # Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics,
                  experimental_run_tf_function=False)

    # callbacks
    # https://horovod.readthedocs.io/en/stable/api.html#horovod.tensorflow.keras.callbacks.BroadcastGlobalVariablesCallback
    BroadcastGlobalVariablesCallback = hvd.callbacks.BroadcastGlobalVariablesCallback

    callbacks = [
        # broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        BroadcastGlobalVariablesCallback(0),
    ]
    # save checkpoints only on worker 0 to prevent other workers from corrupting them.
    # if hvd.rank() == 0:
    #     callbacks.append(tf.keras.callbacks.ModelCheckpoint("./checkpoint-{epoch}.h5"))

    # Training
    if args.do_train:
        logger.info("*** Train ***")
        start_time = time.time()
        # The per-device batch size is multiplied across devices, so the effective
        # global batch size is train_batch_size * n_gpus (e.g. 8 * 4 = 32); see
        # https://github.com/horovod/horovod/issues/1617
        # `tf_train_dataset` is already batched, and Keras raises an error when
        # `batch_size` is passed alongside a tf.data.Dataset, so it is omitted here.
        train_results = model.fit(
            tf_train_dataset,
            epochs=args.epochs,
            # steps_per_epoch=500 // hvd.size(),
            callbacks=callbacks,
            verbose=1 if hvd.rank() == 0 else 0,
        )
        train_runtime = {f"train_runtime": round(time.time() - start_time, 4)}
        logger.info(f"train_runtime = {train_runtime}\n")

        output_train_file = os.path.join(args.output_data_dir,
                                         "train_results.txt")

        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            logger.info(train_results)
            for key, value in train_results.history.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))
            writer.write(f"train_runtime = {train_runtime}\n")

    # Evaluation
    if args.do_eval:
        logger.info("*** Evaluate ***")
        # `tf_test_dataset` is already batched, so `batch_size` is not passed here.
        result = model.evaluate(tf_test_dataset, return_dict=True)

        output_eval_file = os.path.join(args.output_data_dir,
                                        "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info(result)
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

    # save only on worker 0 to prevent other workers from corrupting the files.
    if hvd.rank() == 0:
        model.save_pretrained(args.model_dir)
        tokenizer.save_pretrained(args.model_dir)
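
# This example assumes Horovod was initialized at import time elsewhere in the
# original script; a typical setup sketch (an assumption, not from the original):
#
#   import horovod.tensorflow.keras as hvd
#   hvd.init()
#   gpus = tf.config.list_physical_devices("GPU")
#   if gpus:
#       tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")
#
# launched with, e.g.: horovodrun -np 4 python train.py --epochs 3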