def test_TFDistilBertForQuestionAnswering(self):
    from transformers import DistilBertConfig, TFDistilBertForQuestionAnswering
    keras.backend.clear_session()
    # pretrained_weights = 'distilbert-base-uncased'
    tokenizer_file = 'distilbert_distilbert-base-uncased.pickle'
    tokenizer = self._get_tokenzier(tokenizer_file)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    config = DistilBertConfig()
    model = TFDistilBertForQuestionAnswering(config)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
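This test method assumes `keras`, `keras2onnx`, and the suite's `run_onnx_runtime` utility are imported at module level. The helper methods `_get_tokenzier` (spelling kept as at the call site) and `_prepare_inputs` are not shown in the excerpt; a minimal sketch of what they might look like on the same test class, where the pickle layout, sample text, and padding length are assumptions:

def _get_tokenzier(self, tokenizer_file):
    import pickle
    # Load a DistilBertTokenizer that was pickled to disk beforehand,
    # so the test runs without network access.
    with open(tokenizer_file, 'rb') as handle:
        return pickle.load(handle)

def _prepare_inputs(self, tokenizer, batch_size=3):
    import numpy as np
    # Tokenize one fixed sentence and replicate it across the batch.
    text = "Here is some text to encode Hello World"
    encoded = tokenizer.encode_plus(text, max_length=16,
                                    padding='max_length', truncation=True)
    input_ids = np.array([encoded['input_ids']] * batch_size, dtype=np.int32)
    attention_mask = np.array([encoded['attention_mask']] * batch_size,
                              dtype=np.int32)
    inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
    # keras2onnx feeds the same named tensors to onnxruntime.
    inputs_onnx = dict(inputs)
    return text, inputs, inputs_onnx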
Example #2
def test_TFDistilBertModel(self):
    from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
    name = 'distilbert-base-uncased-distilled-squad'
    tokenizer = DistilBertTokenizer.from_pretrained(name)
    model = TFDistilBertForQuestionAnswering.from_pretrained(name)
    question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
    input_dict = tokenizer(question, text, return_tensors='tf')
    spec, input_dict = self.spec_and_pad(input_dict)
    outputs = ["start_logits", "end_logits"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs, rtol=1e-5)
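`spec_and_pad` and `run_test` are harness methods from the surrounding test suite and are not shown here. A standalone equivalent using the public tf2onnx API might look like this (a sketch; opset 13 is an assumption):

import tensorflow as tf
import tf2onnx
from transformers import TFDistilBertForQuestionAnswering

name = 'distilbert-base-uncased-distilled-squad'
model = TFDistilBertForQuestionAnswering.from_pretrained(name)

# Dynamic batch and sequence dimensions; names must match the model inputs.
spec = (
    tf.TensorSpec((None, None), tf.int32, name="input_ids"),
    tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
)
onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature=spec, opset=13)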
def test_TFDistilBertForQuestionAnswering(self):
    from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
    pretrained_weights = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFDistilBertForQuestionAnswering.from_pretrained(
        pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))
Example #4
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering


def inference(question, context):
    # Load the fine-tuned tokenizer and model from local directories.
    tokenizer = DistilBertTokenizer.from_pretrained("./models/tokenizer/")
    model = TFDistilBertForQuestionAnswering.from_pretrained("./models/")
    input_dict = tokenizer.encode_plus(question,
                                       context,
                                       padding='max_length',
                                       max_length=128,
                                       return_tensors='tf')
    # Recent transformers versions return a model-output object rather than
    # a bare (start_scores, end_scores) tuple, so read the logits explicitly.
    output = model(input_dict)
    start_scores, end_scores = output.start_logits, output.end_logits
    # Free the model reference once the logits are extracted.
    del model
    all_tokens = tokenizer.convert_ids_to_tokens(
        input_dict["input_ids"].numpy()[0])
    # Join the token span between the most likely start and end positions.
    answer = ' '.join(
        all_tokens[tf.math.argmax(start_scores, 1)[0]:
                   tf.math.argmax(end_scores, 1)[0] + 1])
    return answer
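A hypothetical call, assuming a fine-tuned model was saved under ./models/ and its tokenizer under ./models/tokenizer/ beforehand:

question = "Who was Jim Henson?"
context = "Jim Henson was a nice puppet."
print(inference(question, context))  # e.g. "a nice puppet"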
Example #5
    def use(self):
        if self.model_type == "classification":
            train_texts, train_labels = self.read_split(f"{self.path}/train")

            train_texts, val_texts, train_labels, val_labels = train_test_split(
                train_texts, train_labels, test_size=0.2)
            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')
            train_encodings = tokenizer(train_texts,
                                        truncation=True,
                                        padding=True)
            val_encodings = tokenizer(val_texts, truncation=True, padding=True)

            train_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(train_encodings), train_labels))
            val_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(val_encodings), val_labels))

            model = TFDistilBertForSequenceClassification.from_pretrained(
                "distilbert-base-uncased")

        if self.model_type == "token_classification":
            texts, tags = self.read_wnut(self.path)

            train_texts, val_texts, train_tags, val_tags = train_test_split(
                texts, tags, test_size=.2)

            unique_tags = set(tag for doc in tags for tag in doc)
            tag2id = {tag: id for id, tag in enumerate(unique_tags)}

            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-cased')
            train_encodings = tokenizer(train_texts,
                                        is_split_into_words=True,
                                        return_offsets_mapping=True,
                                        padding=True,
                                        truncation=True)
            val_encodings = tokenizer(val_texts,
                                      is_split_into_words=True,
                                      return_offsets_mapping=True,
                                      padding=True,
                                      truncation=True)

            train_labels = self.encode_tags(train_tags, train_encodings,
                                            tag2id)
            val_labels = self.encode_tags(val_tags, val_encodings, tag2id)

            train_encodings.pop("offset_mapping")
            val_encodings.pop("offset_mapping")

            train_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(train_encodings), train_labels))
            val_dataset = tf.data.Dataset.from_tensor_slices(
                (dict(val_encodings), val_labels))

            model = TFDistilBertForTokenClassification.from_pretrained(
                'distilbert-base-cased', num_labels=len(unique_tags))

        if self.model_type == "q+a":
            train_contexts, train_questions, train_answers = self.read_squad(
                f"{self.path}/train-v2.0.json")
            val_contexts, val_questions, val_answers = self.read_squad(
                f"{self.path}/dev-v2.0.json")

            self.add_end_idx(train_answers, train_contexts)
            self.add_end_idx(val_answers, val_contexts)

            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')

            train_encodings = tokenizer(train_contexts,
                                        train_questions,
                                        truncation=True,
                                        padding=True)
            val_encodings = tokenizer(val_contexts,
                                      val_questions,
                                      truncation=True,
                                      padding=True)

            self.add_token_positions(train_encodings, train_answers)
            self.add_token_positions(val_encodings, val_answers)

            train_dataset = tf.data.Dataset.from_tensor_slices(({
                key: train_encodings[key]
                for key in ['input_ids', 'attention_mask']
            }, {
                key: train_encodings[key]
                for key in ['start_positions', 'end_positions']
            }))
            val_dataset = tf.data.Dataset.from_tensor_slices(({
                key: val_encodings[key]
                for key in ['input_ids', 'attention_mask']
            }, {
                key: val_encodings[key]
                for key in ['start_positions', 'end_positions']
            }))

            model = TFDistilBertForQuestionAnswering.from_pretrained(
                "distilbert-base-uncased")

            # Keras expects the labels as a tuple; map the validation set
            # too, otherwise fit() fails on its dict-shaped labels.
            train_dataset = train_dataset.map(
                lambda x, y: (x, (y['start_positions'], y['end_positions'])))
            val_dataset = val_dataset.map(
                lambda x, y: (x, (y['start_positions'], y['end_positions'])))

            model.distilbert.return_dict = False

        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        model.compile(optimizer=optimizer, loss=model.compute_loss)
        # Batch the validation set too; batch_size must not be passed to
        # fit() when the data already comes from tf.data datasets.
        model.fit(train_dataset.shuffle(1000).batch(self.batch_size),
                  validation_data=val_dataset.batch(self.batch_size),
                  epochs=self.epochs)
        os.makedirs(self.save, exist_ok=True)
        model.save_pretrained(self.save)
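`read_split`, `read_wnut`, `read_squad`, `add_end_idx`, `add_token_positions`, and `encode_tags` are helper methods defined elsewhere in the class, and the method assumes `os`, `tensorflow as tf`, `sklearn.model_selection.train_test_split`, and the transformers classes are imported. As one example, the token-classification branch calls `self.encode_tags`; the Hugging Face custom-datasets tutorial that this code closely follows defines it roughly as:

def encode_tags(self, tags, encodings, tag2id):
    import numpy as np
    # Map word-level tags to token-level labels using the offset mapping:
    # only the first sub-token of each word gets a real label, the rest -100.
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # A token starts a word when its offset begins at 0 with nonzero length.
        doc_enc_labels[(arr_offset[:, 0] == 0)
                       & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels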
Example #6
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

train_dataset = tf.data.Dataset.from_tensor_slices((
    {key: train_encodings[key] for key in ['input_ids', 'attention_mask']},
    {key: train_encodings[key] for key in ['start_positions', 'end_positions']}
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    {key: val_encodings[key] for key in ['input_ids', 'attention_mask']},
    {key: val_encodings[key] for key in ['start_positions', 'end_positions']}
))

# PyTorch equivalent, kept for reference:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# model.to(device)
# model.train()
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# optim = AdamW(model.parameters(), lr=5e-5)
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# Keras will expect a tuple when dealing with labels
train_dataset = train_dataset.map(lambda x, y: (x, (y['start_positions'], y['end_positions'])))

# Keras will assign a separate loss to each output and add them together, so we use the
# standard CE loss instead of the built-in model.compute_loss, which expects a dict of
# outputs and averages the two terms. Note that this makes the loss 2x that of TFTrainer,
# since the two terms are added instead of averaged.
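The comments above describe the loss setup, but the fragment stops short of it. Following the Hugging Face custom-datasets tutorial this code mirrors, the training section would continue roughly as:

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.distilbert.return_dict = False  # emit tuples so Keras can match losses

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=loss)
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3)

The lines that follow are a separate script that converts the SQuAD-distilled model to TensorFlow Lite.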
import tensorflow as tf
from transformers import TFDistilBertForQuestionAnswering

model = TFDistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad')

# Fix the input to a static [1, 384] int32 tensor so the TFLite converter
# sees concrete shapes (_set_inputs is a private Keras API).
input_spec = tf.TensorSpec([1, 384], tf.int32)
model._set_inputs(input_spec, training=False)

print(model.inputs)
print(model.outputs)

converter = tf.lite.TFLiteConverter.from_keras_model(model)
# Allow fallback to full TensorFlow ops for anything the TFLite builtins lack.
converter.target_spec.supported_ops = [tf.lite.OpsSet.SELECT_TF_OPS]

tflite_model = converter.convert()

open("distilbert-squad-384.tflite", "wb").write(tflite_model)
Example #8
def main():
    """
    Main function
    """

    # Parse cmd line arguments
    args = nlp_parser.parse_arguments()

    source = ""
    subject = ""
    context = ""
    question = ""
    answer = ""
    squadid = ""

    if args:
        # Collapse the nested presence/truthiness checks into dict lookups.
        source = args.get("text") or ""
        subject = args.get("subject") or ""
        context = args.get("context") or ""
        question = args.get("question") or ""
        answer = args.get("answer") or ""
        squadid = args.get("squadid") or ""
        if question:
            clean_question = nlp.clean(question)
    else:
        sys.exit("Parser didn't return args correctly")

    # Set up the question, either from a specified SQuAD record
    # or from cmd line arguments.
    # If no question details are provided, a random
    # SQuAD example will be chosen.

    if question:
        if source:
            with open(source, "r") as text_file_handle:
                context = text_file_handle.read()
        else:
            print("No text provided, searching SQuAD dev-2.0 dataset")
            squad_data = nlp.import_squad_data()
            squad_records = squad_data.loc[squad_data["clean_question"] ==
                                           clean_question]
            if squad_records.empty:
                sys.exit(
                    "Question not found in SQuAD data, please provide context using `--text`."
                )
            subject = squad_records["subject"].iloc[0]
            context = squad_records["context"].iloc[0]
            question = squad_records["question"].iloc[0]
            answer = squad_records["answer"]
    else:
        squad_data = nlp.import_squad_data()

        if squadid:
            source = args["squadid"]
            squad_records = squad_data.loc[squad_data["id"] == source]
        else:
            if subject:
                print(
                    "Picking a question at random on the subject: ",
                    subject,
                )
                squad_records = squad_data.loc[squad_data["subject"] ==
                                               subject]
            else:
                print(
                    "No SQuAD ID or question provided, picking one at random!")
                squad_records = squad_data

        if squad_records.empty:
            sys.exit(
                "No questions found in SQuAD data, please provide valid ID or subject."
            )

        # Pick one record from the filtered set (random when several match;
        # the empty case was ruled out above).
        n_records = len(squad_records.index)
        i_record = random.randint(0, n_records - 1)
        source = squad_records["id"].iloc[i_record]
        subject = squad_records["subject"].iloc[i_record]
        context = squad_records["context"].iloc[i_record]
        question = squad_records["question"].iloc[i_record]
        answer = squad_records["answer"].iloc[i_record]

    # DistilBERT question answering using a pre-trained model.
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                                    return_token_type_ids=True)

    model = TFDistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased-distilled-squad")

    encoding = tokenizer.encode_plus(question,
                                     context,
                                     max_length=512,
                                     truncation=True)

    input_ids, attention_mask = (
        encoding["input_ids"],
        encoding["attention_mask"],
    )
    model_output = model(np.array([input_ids]),
                         attention_mask=np.array([attention_mask]))
    start_scores = model_output.start_logits
    end_scores = model_output.end_logits

    # The answer is the token span from the highest-scoring start position
    # to the highest-scoring end position, inclusive.
    answer_ids = input_ids[np.argmax(start_scores):np.argmax(end_scores) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids,
                                                    skip_special_tokens=True)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)

    # Display results
    print("\nDistilBERT question answering example.")
    print("======================================")
    print("Reading from: ", subject, source)
    print("\nContext: ", context)
    print("--")
    print("Question: ", question)
    print("Answer: ", answer_tokens_to_string)
    print("Reference Answers: ", answer)