def test_save_load_pretrained_additional_features(self): processor = LayoutLMv2Processor(feature_extractor=self.get_feature_extractor(), tokenizer=self.get_tokenizer()) processor.save_pretrained(self.tmpdirname) # slow tokenizer tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30) processor = LayoutLMv2Processor.from_pretrained( self.tmpdirname, use_fast=False, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertIsInstance(processor.tokenizer, LayoutLMv2Tokenizer) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor) # fast tokenizer tokenizer_add_kwargs = self.get_rust_tokenizer(bos_token="(BOS)", eos_token="(EOS)") feature_extractor_add_kwargs = self.get_feature_extractor(do_resize=False, size=30) processor = LayoutLMv2Processor.from_pretrained( self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_resize=False, size=30 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) self.assertIsInstance(processor.tokenizer, LayoutLMv2TokenizerFast) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
def test_save_load_pretrained_default(self): feature_extractor = self.get_feature_extractor() tokenizers = self.get_tokenizers() for tokenizer in tokenizers: processor = LayoutLMv2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) processor.save_pretrained(self.tmpdirname) processor = LayoutLMv2Processor.from_pretrained(self.tmpdirname) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) self.assertIsInstance(processor.tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)) self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) self.assertIsInstance(processor.feature_extractor, LayoutLMv2FeatureExtractor)
def test_overflowing_tokens(self): # In the case of overflowing tokens, test that we still have 1-to-1 mapping between the images and input_ids (sequences that are too long are broken down into multiple sequences). from datasets import load_dataset # set up datasets = load_dataset("nielsr/funsd") processor = LayoutLMv2Processor.from_pretrained( "microsoft/layoutlmv2-base-uncased", revision="no_ocr") def preprocess_data(examples): images = [ Image.open(path).convert("RGB") for path in examples["image_path"] ] words = examples["words"] boxes = examples["bboxes"] word_labels = examples["ner_tags"] encoded_inputs = processor( images, words, boxes=boxes, word_labels=word_labels, padding="max_length", truncation=True, return_overflowing_tokens=True, stride=50, return_offsets_mapping=True, return_tensors="pt", ) return encoded_inputs train_data = preprocess_data(datasets["train"]) self.assertEqual(len(train_data["image"]), len(train_data["input_ids"]))
if filename.split(".")[-1] in ["png", "jpeg", "jpg"]: return "image" elif filename.split(".")[-1] in ["pdf"]: return "pdf" else: return "UNSUPPORTED file type" #-------------simple_invoice_extraction---------------------------------------- #all defined outside the api to not reload exery time import numpy as np from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification from datasets import load_dataset from PIL import Image, ImageDraw, ImageFont processor = LayoutLMv2Processor.from_pretrained( "microsoft/layoutlmv2-base-uncased") model_simple_invoice = LayoutLMv2ForTokenClassification.from_pretrained( "Theivaprakasham/layoutlmv2-finetuned-sroie_mod") dataset = load_dataset("darentang/generated", split="test") labels = dataset.features['ner_tags'].feature.names print('labels', labels) id2label = {v: k for v, k in enumerate(labels)} label2color = { 'b-abn': "blue", 'b-biller': "blue", 'b-biller_address': "black", 'b-biller_post_code': "green", 'b-due_date': "orange", 'b-gst': 'red', 'b-invoice_date': 'red',