def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    self.use_radam = getattr(self.hparams, "use_radam", False)
    self.cnnt5_only = getattr(self.hparams, "cnnt5_only", False)
    self.hparams.tgt_seq_len = getattr(self.hparams, "tgt_seq_len",
                                       self.hparams.seq_len)
    if not self.cnnt5_only:
        if not self.hparams.t5_only:
            print("Initializing LayoutLM...")
            self.encoder = LayoutLMModel.from_pretrained(
                self.hparams.layoutlm_str)
            if self.hparams.freeze_layoutlm:
                for param in tqdm(self.encoder.parameters(),
                                  desc="Freezing LayoutLM...",
                                  leave=True):
                    param.requires_grad = False
        print("Initializing T5...")
        self.t5 = T5ForConditionalGeneration.from_pretrained(
            self.hparams.t5_str)
        self.use_llm_emb = getattr(self.hparams, "llm_emb", False)
        if self.use_llm_emb:
            print("Initializing layoutlm embeddings")
            self.llm_emb = LayoutLMEmbeddings(
                LayoutLMModel.from_pretrained(
                    self.hparams.layoutlm_str).config)
    if not self.hparams.no_image:
        print("Using images, CNNT5 small initialized.")
        self.cnnt5 = CNNT5({
            "t5": "t5-small",
            "pre_train": False,
            "initial_ckpt": "models/wikipedia_pre_train_continue-epoch=1-val_exact_match=0.58-val_f1=0.98.ckpt",
            "seq_len": self.hparams.seq_len,
            "tgt_seq_len": self.hparams.tgt_seq_len,
        })
        if self.cnnt5_only:
            print("Fine-tuning CNNT5.")
        else:
            for param in tqdm(
                    self.cnnt5.parameters(),
                    desc="Freezing CNNT5 as an image Embedding extractor...",
                    leave=True):
                param.requires_grad = False
            self.adapt_cnnt5_features = nn.Linear(512, 768)
    if self.hparams.t5_only:
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.t5_str)
    elif self.cnnt5_only:
        self.tokenizer = self.cnnt5.tokenizer
    else:
        self.tokenizer = LayoutLMTokenizer.from_pretrained(
            self.hparams.layoutlm_str)
    self.detokenizer = T5Tokenizer.from_pretrained(self.hparams.t5_str)
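# Hedged usage sketch (not from the source): how a module with the __init__
# above might be constructed. The class name `DocModel` and all hparam values
# are placeholders; only the attribute names are taken from the code itself.
from argparse import Namespace

hparams = Namespace(
    layoutlm_str="microsoft/layoutlm-base-uncased",
    t5_str="t5-base",
    t5_only=False,
    cnnt5_only=False,
    freeze_layoutlm=True,
    llm_emb=False,
    no_image=True,  # skip the CNNT5 image branch entirely
    seq_len=512,
)
model = DocModel(hparams)  # `DocModel`: hypothetical name for the class defining __init__ above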
def _test_TFLayoutLM(self, size, large=False):
    from transformers import LayoutLMTokenizer, TFLayoutLMModel
    tokenizer = LayoutLMTokenizer.from_pretrained(size)
    model = TFLayoutLMModel.from_pretrained(size)
    words = ["Hello", "world"]
    normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
    token_boxes = []
    for word, box in zip(words, normalized_word_boxes):
        word_tokens = tokenizer.tokenize(word)
        token_boxes.extend([box] * len(word_tokens))
    # add bounding boxes of cls + sep tokens
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
    input_dict = tokenizer(' '.join(words), return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec,
                  outputs=outputs, large=large)
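# Note that token_boxes is computed but never passed to the model in the test
# above. A hedged sketch (an assumption, not part of the original test) of how
# those boxes would typically be wired in, via LayoutLM's `bbox` input, so the
# model sees one normalized box per token:
import tensorflow as tf

bbox = tf.constant([token_boxes])  # shape (1, seq_len, 4), incl. cls/sep boxes
outputs = model(input_dict["input_ids"], bbox=bbox,
                attention_mask=input_dict["attention_mask"])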
def __init__(self, image_path, model_path, config_path, num_labels=13, args=None):
    super(LayoutLM, self).__init__()
    self.image = openImage(image_path)
    self.args = args
    self.tokenizer = LayoutLMTokenizer.from_pretrained(
        "microsoft/layoutlm-base-uncased")
    config = LayoutLMConfig.from_pretrained(config_path)
    self.model = LayoutLMForTokenClassification.from_pretrained(
        model_path, config=config)
    self.model.to(device)
    self.input_ids = None
    self.attention_mask = None
    self.token_type_ids = None
    self.bboxes = None
    self.token_actual_boxes = None
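# Hedged usage sketch: the paths below are placeholders, not from the source;
# only the constructor signature comes from the code above.
lm = LayoutLM(
    image_path="page.png",
    model_path="checkpoints/layoutlm-token-cls",
    config_path="checkpoints/layoutlm-token-cls",
    num_labels=13,
)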
def get_tokenizer(self, **kwargs):
    return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    filename=os.path.join(args.output_dir, "train.log"),
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.addHandler(logging.StreamHandler())
if not args.test_only:
    if args.load_pretrain:
        model = LayoutLMForMaskedLM.from_pretrained(args.layoutlm_model,
                                                    return_dict=True)
        tokenizer = LayoutLMTokenizer.from_pretrained(args.layoutlm_model)
        print('Loading pre-trained model from', args.layoutlm_model)
    else:
        config = LayoutLMConfig.from_pretrained(args.model_name_or_path,
                                                return_dict=True)
        if args.bert_model is not None:
            tokenizer = AutoTokenizer.from_pretrained(args.bert_model)
            config.vocab_size = tokenizer.vocab_size
        model = LayoutLMForMaskedLM(config)
        if args.bert_model is None:
            tokenizer = LayoutLMTokenizer.from_pretrained(args.layoutlm_model,
                                                          do_lower_case=True)
        else:
            bert = BertModel.from_pretrained(args.bert_model)
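# Hedged sketch of the argparse flags the setup block above reads. The
# defaults are assumptions; only the flag names come from the code itself.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", required=True)
parser.add_argument("--layoutlm_model", default="microsoft/layoutlm-base-uncased")
parser.add_argument("--model_name_or_path", default="microsoft/layoutlm-base-uncased")
parser.add_argument("--bert_model", default=None)
parser.add_argument("--load_pretrain", action="store_true")
parser.add_argument("--test_only", action="store_true")
args = parser.parse_args()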
        # (fragment: the enclosing function header and OCR loop header are cut off above)
        x, y, w, h = tuple(row)  # the row comes in (left, top, width, height) format
        actual_box = [x, y, x + w, y + h]  # we turn it into (left, top, left+width, top+height) to get the actual box
        actual_boxes.append(actual_box)

    # normalize the bounding boxes
    boxes = []
    for box in actual_boxes:
        boxes.append(normalize_box(box, width, height))
    # add as extra columns
    assert len(words) == len(boxes)
    example['words'] = words
    example['bbox'] = boxes
    return example


tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")


def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    words = example['words']
    normalized_word_boxes = example['bbox']
    assert len(words) == len(normalized_word_boxes)
    token_boxes = []
    for word, box in zip(words, normalized_word_boxes):
        word_tokens = tokenizer.tokenize(word)
        token_boxes.extend([box] * len(word_tokens))
    # Truncation of token_boxes, leaving room for the cls and sep tokens
    special_tokens_count = 2
    if len(token_boxes) > max_seq_length - special_tokens_count:
        token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]
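    # Hedged sketch of how encode_example typically continues (an assumption;
    # the original snippet is cut off after the truncation step). Following the
    # standard LayoutLM preprocessing pattern: add the cls/sep boxes, tokenize
    # with padding, then pad token_boxes to max_seq_length with pad_token_box.
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
    encoding = tokenizer(' '.join(words), padding='max_length',
                         max_length=max_seq_length, truncation=True)
    input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
    padding_length = max_seq_length - len(input_ids)
    token_boxes += [pad_token_box] * padding_length
    encoding['bbox'] = token_boxes
    return encoding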