Example #1
 def test_TFDistilBertForTokenClassification(self):
     from transformers import DistilBertConfig, TFDistilBertForTokenClassification
     keras.backend.clear_session()
     # pretrained_weights = 'distilbert-base-uncased'
     tokenizer_file = 'distilbert_distilbert-base-uncased.pickle'
     tokenizer = self._get_tokenzier(tokenizer_file)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     config = DistilBertConfig()
     model = TFDistilBertForTokenClassification(config)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
Example #2
 def test_TFDistilBertForTokenClassification(self):
     from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
     pretrained_weights = 'distilbert-base-uncased'
     tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFDistilBertForTokenClassification.from_pretrained(
         pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
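The two tests above depend on helpers from the keras2onnx test harness (`_get_tokenzier`, `_prepare_inputs`, `run_onnx_runtime`, `self.model_files`) that are not shown here. A rough standalone sketch of the same conversion and check might look like the following; it assumes keras2onnx and onnxruntime are installed, a transformers version whose tokenizers are directly callable, and that the exported graph keeps the tokenizer's keys (`input_ids`, `attention_mask`) as its input names.

import numpy as np
import onnxruntime
import keras2onnx
from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')

text = "Hello, my dog is cute"
inputs = tokenizer(text, return_tensors='tf')
tf_logits = model(dict(inputs))[0]  # reference output from the TF model

# Convert to ONNX and run the same inputs through onnxruntime.
onnx_model = keras2onnx.convert_keras(model, model.name)
sess = onnxruntime.InferenceSession(onnx_model.SerializeToString())
feed = {k: v.numpy() for k, v in inputs.items()}  # assumes the graph inputs share these names
onnx_logits = sess.run(None, feed)[0]

print(np.allclose(tf_logits.numpy(), onnx_logits, atol=1e-4))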
Example #3
# val_labels = val_tags

import tensorflow as tf

train_encodings.pop(
    "offset_mapping")  # we don't want to pass this to the model
val_encodings.pop("offset_mapping")

train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels))

# Use TensorFlow to train and evaluate
from transformers import TFDistilBertForTokenClassification
model = TFDistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(unique_tags))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer,
              loss=model.compute_loss,
              metrics=["accuracy"])  # can also use any keras loss fn
history = model.fit(train_dataset.shuffle(100).batch(16),
                    epochs=3)  # batching is set via .batch(); Keras rejects batch_size alongside a tf.data.Dataset
# model.save("E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\models\model_files\\")
# import tensorflow as tf
# model = tf.keras.models.load_model("E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\models\model_files\\")
# Evaluate the model on the test data using `evaluate`
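# A hedged sketch (not part of the original): Hugging Face TF models are usually saved and
# reloaded with save_pretrained/from_pretrained rather than tf.keras save/load, and
# `evaluate` expects a batched dataset. The path and batch size below are illustrative.
# model.save_pretrained("models/distilbert_wnut")
# model = TFDistilBertForTokenClassification.from_pretrained("models/distilbert_wnut")
# results = model.evaluate(val_dataset.batch(16))  # returns [loss, accuracy] given the compile() above
# print(results)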

model_config = {
    'model': "TFDistilBertForTokenClassification_W_NUT",
Example #4
    def use(self):
        if self.model_type == "classification":
            train_texts, train_labels = self.read_split(f"{self.path}/train")

            train_texts, val_texts, train_labels, val_labels = train_test_split(
                train_texts, train_labels, test_size=0.2)
            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')
            train_encodings = tokenizer(train_texts,
                                        truncation=True,
                                        padding=True)
            val_encodings = tokenizer(val_texts, truncation=True, padding=True)

            train_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(train_encodings), train_labels))
            val_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(val_encodings), val_labels))

            model = TFDistilBertForSequenceClassification.from_pretrained(
                "distilbert-base-uncased")

        if self.model_type == "token_classification":
            texts, tags = self.read_wnut(self.path)

            train_texts, val_texts, train_tags, val_tags = train_test_split(
                texts, tags, test_size=.2)

            unique_tags = set(tag for doc in tags for tag in doc)
            tag2id = {tag: id for id, tag in enumerate(unique_tags)}

            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-cased')
            train_encodings = tokenizer(train_texts,
                                        is_split_into_words=True,
                                        return_offsets_mapping=True,
                                        padding=True,
                                        truncation=True)
            val_encodings = tokenizer(val_texts,
                                      is_split_into_words=True,
                                      return_offsets_mapping=True,
                                      padding=True,
                                      truncation=True)

            train_labels = self.encode_tags(train_tags, train_encodings,
                                            tag2id)
            val_labels = self.encode_tags(val_tags, val_encodings, tag2id)

            train_encodings.pop("offset_mapping")
            val_encodings.pop("offset_mapping")

            train_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(train_encodings), train_labels))
            val_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(val_encodings), val_labels))

            model = TFDistilBertForTokenClassification.from_pretrained(
                'distilbert-base-cased', num_labels=len(unique_tags))

        if self.model_type == "q+a":
            train_contexts, train_questions, train_answers = self.read_squad(
                f"{self.path}/train-v2.0.json")
            val_contexts, val_questions, val_answers = self.read_squad(
                f"{self.path}/dev-v2.0.json")

            self.add_end_idx(train_answers, train_contexts)
            self.add_end_idx(val_answers, val_contexts)

            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')

            train_encodings = tokenizer(train_contexts,
                                        train_questions,
                                        truncation=True,
                                        padding=True)
            val_encodings = tokenizer(val_contexts,
                                      val_questions,
                                      truncation=True,
                                      padding=True)

            self.add_token_positions(train_encodings, train_answers)
            self.add_token_positions(val_encodings, val_answers)

            train_dataset = tf.data.Dataset.from_tensor_slices(({
                key: train_encodings[key]
                for key in ['input_ids', 'attention_mask']
            }, {
                key: train_encodings[key]
                for key in ['start_positions', 'end_positions']
            }))
            val_dataset = tf.data.Dataset.from_tensor_slices(({
                key: val_encodings[key]
                for key in ['input_ids', 'attention_mask']
            }, {
                key: val_encodings[key]
                for key in ['start_positions', 'end_positions']
            }))

            model = TFDistilBertForQuestionAnswering.from_pretrained(
                "distilbert-base-uncased")

            train_dataset = train_dataset.map(
                lambda x, y: (x, (y['start_positions'], y['end_positions'])))
            val_dataset = val_dataset.map(  # keep the validation targets in the same tuple form
                lambda x, y: (x, (y['start_positions'], y['end_positions'])))

            model.distilbert.return_dict = False

        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        model.compile(optimizer=optimizer, loss=model.compute_loss)
        model.fit(train_dataset.shuffle(1000).batch(self.batch_size),
                  validation_data=val_dataset.batch(self.batch_size),
                  epochs=self.epochs)
        try:
            os.mkdir(f"{self.save}")
            model.save_pretrained(self.save)
        except OSError:
            model.save_pretrained(self.save)
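`self.encode_tags`, called in the token-classification branch above, is not defined in this excerpt. A sketch in the spirit of the Hugging Face token-classification tutorial, written here as a plain function and assuming numpy plus a fast tokenizer that returns `offset_mapping`, could look like this:

import numpy as np

def encode_tags(tags, encodings, tag2id):
    # Map string tags to ids, then align them with the tokenizer's sub-word offsets.
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings["offset_mapping"]):
        arr_offset = np.array(doc_offset)
        # -100 is ignored by the loss; only the first sub-token of each word keeps its label.
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels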
Example #5
# print(pd.DataFrame(nlp(sentence))) # gives out tokens and labels

sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

## DistilBERT tokenizer and token classification
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForTokenClassification, TokenClassificationPipeline

nlp = TokenClassificationPipeline(model=TFAutoModelForTokenClassification.from_pretrained(
    'distilbert-base-cased'), tokenizer=AutoTokenizer.from_pretrained('distilbert-base-cased'),
    framework='tf')
print(pd.DataFrame(nlp(sentence)))
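# Note: 'distilbert-base-cased' ships without a fine-tuned token-classification head, so the
# head is randomly initialised and the labels printed above are not meaningful until the
# model has been fine-tuned on a labelled NER dataset.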


from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
import tensorflow as tf
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
print(model(input_ids))
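# A hedged sketch (not in the original): convert the raw logits to per-token label ids.
# As above, the classification head is freshly initialised, so the ids are illustrative only.
logits = model(input_ids)[0]               # shape (batch, seq_len, num_labels)
pred_label_ids = tf.argmax(logits, axis=-1)
print(pred_label_ids)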

import numpy as np
from transformers import AutoTokenizer, pipeline, TFDistilBertModel
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
features = pipe('any text data or list of text data', pad_to_max_length=True)
features = np.squeeze(features)
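# For a single input string, the squeezed features are assumed to have shape
# (sequence_length, hidden_size), with hidden_size = 768 for distilbert-base-cased.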


## Sentence classification