Example #1
0
 def load_context(self, context):
     """Load the tokenizer and classifier from the artifacts in *context*.

     Reads the ``"tokenizer_dir"`` and ``"model_dir"`` entries of
     ``context.artifacts``, stores the loaded objects on ``self.tokenizer``
     and ``self.model``, and switches the model to evaluation mode.
     (The original docstring indicates this hook is used for MLflow loading.)
     """
     tokenizer_dir = context.artifacts["tokenizer_dir"]
     # The config handed to the tokenizer is built from the
     # tokenizer_config.json file that sits inside the tokenizer directory.
     tokenizer_config = ConvBertConfig.from_pretrained(
         os.path.join(tokenizer_dir, "tokenizer_config.json"))
     self.tokenizer = ConvBertTokenizerFast.from_pretrained(
         tokenizer_dir, config=tokenizer_config)

     model_dir = context.artifacts["model_dir"]
     self.model = ConvBertForSequenceClassification.from_pretrained(
         model_dir, return_dict=True)
     # Evaluation mode: the model is only used for prediction from here on.
     self.model.eval()
from transformers import ConvBertForSequenceClassification, ConvBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

# Both the tokenizer and the sequence classifier start from the same
# 'YituTech/conv-bert-base' checkpoint.
tokenizer = ConvBertTokenizerFast.from_pretrained('YituTech/conv-bert-base')
model = ConvBertForSequenceClassification.from_pretrained(
    'YituTech/conv-bert-base',
)

import random


def tokenize(batch):
    """Run the module-level ``tokenizer`` over the ``'text'`` entry of *batch*.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=128``; special tokens are added and the attention mask is
    requested. Returns whatever the tokenizer call returns.
    """
    encoding_options = {
        'truncation': True,
        'max_length': 128,
        'add_special_tokens': True,
        'padding': 'max_length',
        'return_attention_mask': True,
    }
    return tokenizer(batch['text'], **encoding_options)


# Load the 'train' split from the JSON file, passing field='questions' to the
# loader.
train_dataset = load_dataset(
    'json',
    field='questions',
    data_files={'train': 'dataset_last_line/quanta_train.json'},
)['train']

# Add a 'label' column holding a one-element list: 0 when the example's
# difficulty is 'School', 1 for anything else.
train_dataset = train_dataset.map(
    lambda example: {'label': [1 if example['difficulty'] != 'School' else 0]}
)

# Tokenize in batched mode with the batch size set to the full dataset
# length, i.e. the whole split goes through `tokenize` as one batch.
train_dataset = train_dataset.map(
    tokenize,
    batch_size=len(train_dataset),
    batched=True,
)
train_dataset.set_format('torch',
from transformers import ConvBertForSequenceClassification, ConvBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# The tokenizer comes from the base checkpoint; the classifier weights come
# from the local 'models/ConvBERT_full_question' directory.
tokenizer = ConvBertTokenizerFast.from_pretrained('YituTech/conv-bert-base')
model = ConvBertForSequenceClassification.from_pretrained(
    'models/ConvBERT_full_question',
)

def tokenize(batch):
    """Encode ``batch['text']`` with the module-level ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=256``; special tokens are added and the attention mask is
    requested. Returns whatever the tokenizer call returns.
    """
    texts = batch['text']
    return tokenizer(
        texts,
        max_length=256,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
    )

# Load the 'test' split from the JSON file, passing field='questions' to the
# loader.
test_dataset = load_dataset(
    'json',
    field='questions',
    data_files={'test': 'dataset_full_question/quanta_test.json'},
)['test']

# Add a 'label' column holding a one-element list: 0 when the example's
# difficulty is 'School', 1 for anything else.
test_dataset = test_dataset.map(
    lambda example: {'label': [1 if example['difficulty'] != 'School' else 0]}
)

# Tokenize in batched mode with the batch size set to the full dataset
# length, i.e. the whole split goes through `tokenize` as one batch.
test_dataset = test_dataset.map(
    tokenize,
    batch_size=len(test_dataset),
    batched=True,
)

# Select the 'torch' format for the three columns listed here.
test_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)



def compute_metrics(pred):
    labels = pred.label_ids
    # print(labels)
    preds = pred.predictions.argmax(-1)
    # print(preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
Example #4
0
from transformers import ConvBertForSequenceClassification, ConvBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# The tokenizer comes from the base checkpoint; the classifier weights come
# from the local 'models/ConvBERT_last_line' directory.
tokenizer = ConvBertTokenizerFast.from_pretrained('YituTech/conv-bert-base')
model = ConvBertForSequenceClassification.from_pretrained(
    'models/ConvBERT_last_line',
)


def tokenize(batch):
    """Encode ``batch['text']`` with the module-level ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=128``; special tokens are added and the attention mask is
    requested. Returns whatever the tokenizer call returns.
    """
    fixed_length = 128
    return tokenizer(
        batch['text'],
        add_special_tokens=True,
        return_attention_mask=True,
        truncation=True,
        padding='max_length',
        max_length=fixed_length,
    )


# Load the 'test' split from the JSON file, passing field='questions' to the
# loader.
test_dataset = load_dataset(
    'json',
    field='questions',
    data_files={'test': 'dataset_last_line/quanta_test.json'},
)['test']

# Add a 'label' column holding a one-element list: 0 when the example's
# difficulty is 'School', 1 for anything else.
test_dataset = test_dataset.map(
    lambda example: {'label': [1 if example['difficulty'] != 'School' else 0]}
)

# Tokenize in batched mode with the batch size set to the full dataset
# length, i.e. the whole split goes through `tokenize` as one batch.
test_dataset = test_dataset.map(
    tokenize,
    batch_size=len(test_dataset),
    batched=True,
)

# Select the 'torch' format for the three columns listed here.
test_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)