コード例 #1
0
def test_processor_saving_loading(caplog):
    """Check that a processor yields the same dataset after save and reload.

    A ``TextClassificationProcessor`` turns the sample file into a dataset,
    is saved to ``testsave/processor``, is loaded back from that directory,
    and then converts the same file again. The tensor names and every tensor
    of both datasets must be equal.
    """
    # Only CRITICAL records are let through while the test runs.
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)

    sample_file = "samples/doc_class/train-sample.tsv"
    checkpoint_dir = "testsave/processor"

    bert_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="bert-base-cased",
        do_lower_case=False,
    )

    original = TextClassificationProcessor(
        tokenizer=bert_tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        dev_filename=None,
        test_filename=None,
        dev_split=0.1,
        columns=["text", "label", "unused"],
        label_list=["OTHER", "OFFENSE"],
        metrics=["f1_macro"],
    )

    # Dataset produced by the freshly constructed processor.
    data, tensor_names = original.dataset_from_dicts(
        original.file_to_dicts(file=sample_file)
    )

    # Round-trip the processor through the save directory.
    original.save(checkpoint_dir)
    reloaded = original.load_from_dir(checkpoint_dir)

    # Dataset produced by the reloaded processor from the same input file.
    data_loaded, tensor_names_loaded = reloaded.dataset_from_dicts(
        reloaded.file_to_dicts(file=sample_file)
    )

    assert tensor_names == tensor_names_loaded
    # Compare tensors pairwise by position; every element must be equal.
    for position, expected in enumerate(data.tensors):
        assert torch.all(torch.eq(expected, data_loaded.tensors[position]))
コード例 #2
0
# Define the processor and create dictionaries.
label_list = ['positief', 'neutraal', 'negatief']

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=30,
                                        data_dir="/content/drive/MyDrive/Files/",
                                        label_list=label_list,
                                        metric="f1_macro",
                                        label_column_name="sentiment",
                                        delimiter=";",
                                        quote_char='"',
                                        columns=["text", "sentiment"],
                                        dev_filename=None,
                                        dev_split=0.10)

dicts = processor.file_to_dicts(file='/content/drive/MyDrive/Files/sentiment_v5.csv')

# Define test and train samples: the first 80% of the rows go to training and
# the remaining rows (20%) go to testing. The boundary is derived from the row
# count instead of being hard-coded, so the two parts never overlap and never
# leave rows out when the CSV has a different number of rows.
split = pd.read_csv(r'/content/drive/MyDrive/Files/sentiment_v5.csv', delimiter=";")
n_train_rows = len(split) * 4 // 5  # integer arithmetic avoids float rounding
train = split.iloc[:n_train_rows]  # 80%
test = split.iloc[n_train_rows:]  # 20%

# Both parts are written into the processor's data_dir with the same ";"
# separator the processor is configured with.
# NOTE(review): the processor above is not given train_filename/test_filename,
# so it presumably relies on library defaults matching these names -- confirm.
train.to_csv(r'/content/drive/MyDrive/Files/train.tsv', index=False, header=True, sep=";")
test.to_csv(r'/content/drive/MyDrive/Files/test.tsv', index=False, header=True, sep=";")

# Define the data silo.
data_silo = DataSilo(processor=processor,
                     batch_size=32)

# Create the adaptive model.
# Pretrained language model as basis.
language_model = LanguageModel.load("wietsedv/bert-base-dutch-cased", language='Dutch')  # n_added_tokens = 5