def test_processor_saving_loading(caplog):
    """Check that a processor yields the same dataset after save and reload.

    A ``TextClassificationProcessor`` is built on the ``bert-base-cased``
    tokenizer, the sample file is converted to a dataset, the processor is
    saved to disk and loaded back, and the same file is converted again.
    The tensor names and every tensor of both datasets must match.

    Args:
        caplog: pytest log-capture fixture; its level is raised to
            ``CRITICAL`` so that only critical records are captured.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)

    lang_model = "bert-base-cased"
    sample_file = "samples/doc_class/train-sample.tsv"
    save_dir = "testsave/processor"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        dev_filename=None,
        test_filename=None,
        dev_split=0.1,
        columns=["text", "label", "unused"],
        label_list=["OTHER", "OFFENSE"],
        metrics=["f1_macro"],
    )

    # Reference dataset, produced by the freshly constructed processor.
    dicts = processor.file_to_dicts(file=sample_file)
    data, tensor_names = processor.dataset_from_dicts(dicts)

    # Round-trip the processor through disk.
    processor.save(save_dir)
    processor = processor.load_from_dir(save_dir)

    # Same input, now converted by the reloaded processor.
    dicts = processor.file_to_dicts(file=sample_file)
    data_loaded, tensor_names_loaded = processor.dataset_from_dicts(dicts)

    assert tensor_names == tensor_names_loaded
    # Compare the counts first: zip() would otherwise silently ignore any
    # surplus tensors on either side.
    assert len(data.tensors) == len(data_loaded.tensors)
    for original, reloaded in zip(data.tensors, data_loaded.tensors):
        # torch.equal is shape-strict, so tensors that merely broadcast
        # against each other are not reported as equal.
        assert torch.equal(original, reloaded)
# Define the processor and create dictionaries.
label_list = ['positief', 'neutraal', 'negatief']
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=30,
    data_dir="/content/drive/MyDrive/Files/",
    label_list=label_list,
    metric="f1_macro",
    label_column_name="sentiment",
    delimiter=";",
    quote_char='"',
    columns=["text", "sentiment"],
    dev_filename=None,
    dev_split=0.10,
)
dicts = processor.file_to_dicts(file='/content/drive/MyDrive/Files/sentiment_v5.csv')

# Define the train and test samples.
# The boundary is derived from the row count (integer arithmetic, no float
# rounding) so the two parts always form a clean 80/20 partition: no row is
# dropped or duplicated when the CSV grows or shrinks.
split = pd.read_csv(r'/content/drive/MyDrive/Files/sentiment_v5.csv', delimiter=";")
n_train = len(split) * 4 // 5
train = split[:n_train]  # first 80%
test = split[n_train:]  # remaining 20%
train.to_csv(r'/content/drive/MyDrive/Files/train.tsv', index=False, header=True, sep=";")
test.to_csv(r'/content/drive/MyDrive/Files/test.tsv', index=False, header=True, sep=";")

# Define the data silo.
# NOTE(review): presumably the silo picks up train.tsv / test.tsv from
# data_dir, which is why they are written before this line - confirm.
data_silo = DataSilo(processor=processor, batch_size=32)

# Create the adaptive model.
# Pretrained language model as basis.
language_model = LanguageModel.load("wietsedv/bert-base-dutch-cased", language='Dutch')
# n_added_tokens = 5