Ejemplo n.º 1
0
def main(test_dir_pattern, files_per_chunk=50):
    """Run the NER model over test documents and collect decoded candidates.

    Files matching ``test_dir_pattern`` are processed in chunks of
    ``files_per_chunk`` documents. Every sentence of every document is
    wrapped in a ``TestExample``, batched through ``ner_model.bert`` and
    ``ner_model.last_layer`` on the GPU, and the spans returned by
    ``LabelEncoding.extract_candidates`` are decoded back to text.

    Args:
        test_dir_pattern: glob pattern selecting the test document files.
        files_per_chunk: number of documents loaded into one dataset at a
            time (must be >= 1; ``range`` raises ``ValueError`` for 0).

    Returns:
        A ``defaultdict(list)`` mapping ``doc.document_id`` to the list of
        decoded candidate strings found in that document. Documents with no
        candidates have no entry.
    """
    # Local import so the block does not depend on how the file imports torch.
    import torch

    test_files = glob.glob(test_dir_pattern)
    # NOTE(review): if AutoTokenizer is the Hugging Face class it cannot be
    # instantiated directly and needs AutoTokenizer.from_pretrained(...);
    # left unchanged because the import is not visible here — confirm.
    tokenizer = AutoTokenizer(config.CHECKPOINT)
    ner_model = NERModel()
    ner_model.load(config.MODEL_CHECKPOINT)

    bert = ner_model.bert.eval().cuda()
    last_layer = ner_model.last_layer.eval().cuda()

    # doc_index -> document_id; the dataset only carries the integer index.
    mapping = {}
    submission_mapping = defaultdict(list)
    doc_index = 0

    for chunk_start in range(0, len(test_files), files_per_chunk):
        chunk_files = test_files[chunk_start:chunk_start + files_per_chunk]
        test_examples = []
        for path in chunk_files:
            doc = Document.read_from_path(path)
            mapping[doc_index] = doc.document_id
            test_examples.extend(
                TestExample(doc_index, sentence)
                for sentence in doc.get_all_sentences()
            )
            doc_index += 1

        dataset = NERTestDataset(test_examples, tokenizer)
        data_loader = DataLoader(dataset, batch_size=config.TEST_BATCH_SIZE)

        for batch in data_loader:
            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                encoded = bert(
                    input_ids=batch['input_ids'].to('cuda'),
                    attention_mask=batch['attention_mask'].to('cuda'),
                    token_type_ids=batch['token_type_ids'].to('cuda'),
                )
                token_scores = last_layer(encoded['last_hidden_state'])

            # Take the best score and its class index over the last (class)
            # axis in one call. The previous code took the max *after*
            # argmax, which reduced over the wrong axis and returned a tuple.
            # NOTE(review): these are raw last_layer outputs; whether they
            # are probabilities depends on last_layer — confirm.
            probabilities, predictions = token_scores.max(dim=-1)

            for j, sentence_prediction in enumerate(predictions):
                # Number of attended tokens. Equals the index of the first
                # padding position for a right-padded mask (assumed — TODO
                # confirm), and unlike argmin() stays correct when the
                # sentence has no padding at all.
                last_valid = int(batch['attention_mask'][j].sum().item())
                candidates = LabelEncoding.extract_candidates(
                    sentence_prediction,
                    probabilities[j],
                    last_valid,
                )
                if len(candidates) == 0:
                    continue

                id_ = batch['id'][j].item()
                ids = batch['input_ids'][j]
                for start_index, end_index in candidates:
                    submission_mapping[mapping[id_]].append(
                        tokenizer.decode(ids[start_index:end_index]))

    return submission_mapping
def filter_samples(samples, tokenizer: AutoTokenizer, vocab, template):
    """Drop samples that cannot be evaluated and report why.

    A sample is kept only if all of the following hold:

    * it has both ``"obj_label"`` and ``"sub_label"`` keys;
    * when ``template`` is empty/None, the whitespace-split word count of
      its joined ``"masked_sentences"`` does not exceed
      ``tokenizer.model_max_length``;
    * ``obj_label`` encodes to exactly one token id and decoding that id
      gives back ``obj_label`` unchanged;
    * if it has ``"judgments"``, the non-"yes" judgments do not outnumber
      the "yes" ones (such samples are skipped silently and not counted as
      excluded).

    Args:
        samples: iterable of sample dicts.
        tokenizer: object providing ``encode(text, add_special_tokens=...)``,
            ``decode(ids)`` and ``model_max_length``.
        vocab: unused; kept for interface compatibility.
        template: when truthy, the sentence-length check is skipped.

    Returns:
        ``(new_samples, msg)`` — the kept samples in input order, and a log
        string with one line per excluded sample followed by a summary line.
    """
    kept = []
    log_lines = []  # joined once at the end instead of repeated str +=
    n_excluded = 0
    # Only samples evaluated on their own masked sentences are length-checked.
    check_length = not template

    for sample in samples:
        if "obj_label" not in sample or "sub_label" not in sample:
            log_lines.append(
                "\tEXCLUDED since 'obj_label' not sample or 'sub_label' not in sample: {}\n".format(
                    sample))
            n_excluded += 1
            continue

        obj_label = sample["obj_label"]
        obj_label_ids = tokenizer.encode(obj_label, add_special_tokens=False)

        # Multi-token (or empty) labels are never accepted, so only
        # single-token labels need the decode round trip.
        # TODO: Find good solution for comparing two models
        if obj_label_ids and len(obj_label_ids) == 1:
            reconstructed_word = tokenizer.decode(obj_label_ids)
        else:
            reconstructed_word = None

        if check_length:
            masked_sentences = sample["masked_sentences"]
            n_words = len(" ".join(masked_sentences).split())
            if n_words > tokenizer.model_max_length:
                log_lines.append(
                    "\tEXCLUDED for exeeding max sentence length: {}\n".format(
                        masked_sentences))
                n_excluded += 1
                continue

        # The vocabulary-membership check across several models was removed
        # because only a single model/vocabulary is in use.

        if not reconstructed_word or reconstructed_word != obj_label:
            log_lines.append(
                "\tEXCLUDED object label {} not in model vocabulary\n".format(
                    obj_label))
            n_excluded += 1
            continue

        if "judgments" in sample:
            # only for Google-RE: skip samples with majority negative evidence
            votes = [x["judgment"] == "yes" for x in sample["judgments"]]
            num_yes = sum(votes)
            num_no = len(votes) - num_yes
            if num_no > num_yes:
                continue

        kept.append(sample)

    log_lines.append("samples exluded  : {}\n".format(n_excluded))
    return kept, "".join(log_lines)