def main(test_dir_pattern, files_per_chunk=50):
    """Run NER inference over every file matching a glob pattern.

    Files are processed in chunks of ``files_per_chunk`` documents. Each
    document is split into sentences, the sentences are run through the
    BERT encoder and the classification layer, and the candidate spans
    returned by ``LabelEncoding.extract_candidates`` are decoded back to
    text with the tokenizer.

    Args:
        test_dir_pattern: Glob pattern selecting the document files.
        files_per_chunk: Number of files loaded into one dataset at a time.

    Returns:
        A ``defaultdict(list)`` mapping each ``document_id`` to the list of
        decoded candidate strings found in that document. Documents without
        any candidate do not appear as keys.
    """
    # Local import so the block does not depend on a module-level `torch`.
    import torch

    test_files = glob.glob(test_dir_pattern)
    # NOTE(review): if AutoTokenizer is the Hugging Face class it has to be
    # built with AutoTokenizer.from_pretrained(...); left unchanged because
    # its origin is not visible from here - confirm.
    tokenizer = AutoTokenizer(config.CHECKPOINT)
    ner_model = NERModel()
    ner_model.load(config.MODEL_CHECKPOINT)
    bert = ner_model.bert.eval().cuda()
    last_layer = ner_model.last_layer.eval().cuda()

    mapping = {}  # running document index -> document_id
    submission_mapping = defaultdict(list)
    doc_index = 0

    for chunk_start in range(0, len(test_files), files_per_chunk):
        chunk_files = test_files[chunk_start:chunk_start + files_per_chunk]

        test_examples = []
        for path in chunk_files:
            doc = Document.read_from_path(path)
            mapping[doc_index] = doc.document_id
            test_examples.extend(
                TestExample(doc_index, sentence)
                for sentence in doc.get_all_sentences()
            )
            doc_index += 1

        dataset = NERTestDataset(test_examples, tokenizer)
        data_loader = DataLoader(dataset, batch_size=config.TEST_BATCH_SIZE)

        for batch in data_loader:
            # Pure inference: skip building the autograd graph.
            with torch.no_grad():
                encoded = bert(
                    input_ids=batch['input_ids'].to('cuda'),
                    attention_mask=batch['attention_mask'].to('cuda'),
                    token_type_ids=batch['token_type_ids'].to('cuda'),
                )
                scores = last_layer(encoded['last_hidden_state'])

            # Take the per-token best score and its label from the SAME
            # tensor. Calling max() on the argmax result would yield label
            # indices, and indexing the (values, indices) pair by sentence
            # number fails for any batch larger than two.
            # NOTE(review): these are the raw outputs of last_layer; apply a
            # softmax first if that layer emits logits - confirm.
            probabilities = scores.max(dim=-1).values
            predictions = scores.argmax(dim=-1)

            for j, sentence_prediction in enumerate(predictions):
                # Number of attended tokens. argmin() would return 0 for a
                # sentence without padding (mask of all ones).
                last_valid = int(batch['attention_mask'][j].sum().item())
                candidates = LabelEncoding.extract_candidates(
                    sentence_prediction,
                    probabilities[j],
                    last_valid,
                )
                if len(candidates) == 0:
                    continue

                document_id = mapping[batch['id'][j].item()]
                ids = batch['input_ids'][j]
                for start_index, end_index in candidates:
                    submission_mapping[document_id].append(
                        tokenizer.decode(ids[start_index:end_index]))

    return submission_mapping
def filter_samples(samples, tokenizer: "AutoTokenizer", vocab, template):
    """Drop samples that cannot be evaluated and report why.

    A sample is kept only if all of the following hold:

    * it contains both ``"obj_label"`` and ``"sub_label"``;
    * when no ``template`` is given, the whitespace-split word count of its
      joined ``"masked_sentences"`` does not exceed
      ``tokenizer.model_max_length``;
    * ``obj_label`` encodes to exactly one token id and decoding that id
      gives back ``obj_label`` unchanged;
    * if it carries ``"judgments"``, the "yes" votes are at least as many as
      the other votes.

    Args:
        samples: Iterable of sample mappings.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``,
            ``decode(ids)`` and ``model_max_length``.
        vocab: Unused; kept so existing callers keep working.
        template: Relation template. When falsy, the sentence-length check
            on ``"masked_sentences"`` is applied.

    Returns:
        A ``(kept_samples, msg)`` tuple, where ``msg`` is a log with one line
        per excluded sample followed by the total number of exclusions.
        Samples skipped for negative judgments are neither logged nor
        counted.
    """
    # Collect log lines and join once instead of growing a string with +=.
    log_lines = []
    kept = []
    excluded_count = 0

    for sample in samples:
        if "obj_label" not in sample or "sub_label" not in sample:
            log_lines.append(
                "\tEXCLUDED since 'obj_label' not sample or 'sub_label' not in sample: {}\n".format(
                    sample))
            excluded_count += 1
            continue

        obj_label = sample["obj_label"]
        obj_label_ids = tokenizer.encode(obj_label, add_special_tokens=False)

        # Only single-token labels can be reconstructed; everything else is
        # treated as not being in the vocabulary.
        # TODO: Find good solution for comparing two models
        if obj_label_ids and len(obj_label_ids) == 1:
            reconstructed_word = tokenizer.decode(obj_label_ids)
        else:
            reconstructed_word = None

        # Without a template the sample's own sentences are used, so their
        # length has to fit the model. This check takes precedence over the
        # vocabulary check below.
        if not template:
            masked_sentences = sample["masked_sentences"]
            word_count = len(" ".join(masked_sentences).split())
            if word_count > tokenizer.model_max_length:
                log_lines.append(
                    "\tEXCLUDED for exeeding max sentence length: {}\n".format(
                        masked_sentences))
                excluded_count += 1
                continue

        # The per-model vocabulary check was removed here, since there are
        # no longer several models with different vocabularies to reconcile.
        if not reconstructed_word or reconstructed_word != obj_label:
            log_lines.append(
                "\tEXCLUDED object label {} not in model vocabulary\n".format(
                    obj_label))
            excluded_count += 1
            continue

        if "judgments" in sample:
            # only for Google-RE
            votes = [entry["judgment"] == "yes" for entry in sample["judgments"]]
            num_yes = sum(votes)
            num_no = len(votes) - num_yes
            if num_no > num_yes:
                # Skip negative evidence (intentionally not counted).
                continue

        kept.append(sample)

    log_lines.append("samples exluded : {}\n".format(excluded_count))
    return kept, "".join(log_lines)