def load_context(self, context):
    """Load the tokenizer and model artifacts when the MLflow model is loaded."""
    self.tokenizer = ConvBertTokenizerFast.from_pretrained(
        context.artifacts["tokenizer_dir"],
        config=ConvBertConfig.from_pretrained(
            os.path.join(context.artifacts["tokenizer_dir"],
                         "tokenizer_config.json")),
    )
    self.model = ConvBertForSequenceClassification.from_pretrained(
        context.artifacts["model_dir"], return_dict=True)
    self.model.eval()  # Put model in evaluation mode.
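# A minimal sketch of the companion predict() method, assuming this class
# subclasses mlflow.pyfunc.PythonModel and receives a pandas DataFrame with a
# 'text' column; the column name, max_length, and post-processing are
# assumptions for illustration, not taken from the original source.
def predict(self, context, model_input):
    encodings = self.tokenizer(list(model_input['text']), truncation=True,
                               padding='max_length', max_length=128,
                               return_tensors='pt')
    with torch.no_grad():  # No gradients needed at inference time.
        logits = self.model(**encodings).logits
    return logits.argmax(dim=-1).numpy()  # Predicted class per input row.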
from transformers import ConvBertForSequenceClassification, ConvBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
import random

model = ConvBertForSequenceClassification.from_pretrained(
    'YituTech/conv-bert-base')
tokenizer = ConvBertTokenizerFast.from_pretrained('YituTech/conv-bert-base')


def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=128,
                     add_special_tokens=True, padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json', data_files={'train': 'dataset_last_line/quanta_train.json'},
    field='questions')['train']
# Binary label: 0 for 'School' difficulty, 1 otherwise.
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize, batched=True,
                                  batch_size=len(train_dataset))
train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])
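# Trainer and TrainingArguments are imported above but their setup is not
# shown in this snippet; a minimal sketch of how training might proceed, with
# all hyperparameters (epochs, batch size, logging interval) being assumptions:
training_args = TrainingArguments(
    output_dir='models/ConvBERT_last_line',  # Assumed; matches the eval script's load path.
    num_train_epochs=3,                      # Assumed hyperparameters.
    per_device_train_batch_size=16,
    logging_steps=50,
)
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset)
trainer.train()
trainer.save_model('models/ConvBERT_last_line')  # Persist for later evaluation.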
from transformers import ConvBertForSequenceClassification, ConvBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = ConvBertForSequenceClassification.from_pretrained(
    'models/ConvBERT_full_question')
tokenizer = ConvBertTokenizerFast.from_pretrained('YituTech/conv-bert-base')


def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=256,
                     add_special_tokens=True, padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json', data_files={'test': 'dataset_full_question/quanta_test.json'},
    field='questions')['test']
# Binary label: 0 for 'School' difficulty, 1 otherwise.
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize, batched=True,
                                batch_size=len(test_dataset))
test_dataset.set_format('torch',
                        columns=['input_ids', 'attention_mask', 'label'])


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
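# A minimal sketch of running the evaluation through the Trainer API with the
# compute_metrics function above; the output_dir and eval batch size are
# assumptions for illustration:
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./eval_output',
                           per_device_eval_batch_size=32),
    compute_metrics=compute_metrics,
)
print(trainer.evaluate(eval_dataset=test_dataset))  # Reports accuracy, f1, precision, recall.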
from transformers import ConvBertForSequenceClassification, ConvBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = ConvBertForSequenceClassification.from_pretrained(
    'models/ConvBERT_last_line')
tokenizer = ConvBertTokenizerFast.from_pretrained('YituTech/conv-bert-base')


def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, max_length=128,
                     add_special_tokens=True, padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json', data_files={'test': 'dataset_last_line/quanta_test.json'},
    field='questions')['test']
# Binary label: 0 for 'School' difficulty, 1 otherwise.
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize, batched=True,
                                batch_size=len(test_dataset))
test_dataset.set_format('torch',
                        columns=['input_ids', 'attention_mask', 'label'])
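# Alternatively, a plain PyTorch evaluation loop over the formatted dataset,
# without the Trainer API; a sketch assuming CPU inference and a batch size
# of 32 (both assumptions):
from torch.utils.data import DataLoader

model.eval()  # Disable dropout for deterministic inference.
loader = DataLoader(test_dataset, batch_size=32)
all_preds, all_labels = [], []
with torch.no_grad():  # Disable gradient tracking for inference.
    for batch in loader:
        logits = model(input_ids=batch['input_ids'],
                       attention_mask=batch['attention_mask'],
                       return_dict=True).logits
        all_preds.extend(logits.argmax(-1).tolist())
        # Labels were stored as one-element lists, so drop the extra dim.
        all_labels.extend(batch['label'].squeeze(-1).tolist())
print(accuracy_score(all_labels, all_preds))  # Overall test accuracy.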