Example #1
def __init__(self,
             abstracts: List[str],
             texts: List[str],
             tokenizer: AutoTokenizer,
             max_tokens=128,
             sep_word='|') -> None:
    # Store the paired abstracts/texts and the tokenizer used to encode them.
    self.abstracts = abstracts
    self.texts = texts
    self.tokenizer = tokenizer
    # encode() returns a list of token ids for the separator word
    # (including the tokenizer's special tokens unless add_special_tokens=False is passed).
    self.sep_token = tokenizer.encode(sep_word)
    self.max_tokens = max_tokens
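A quick, self-contained check of the separator encoding used by this constructor; the model name below is only a placeholder and is not part of the original snippet:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# encode() adds the tokenizer's special tokens by default, so the stored
# separator is a short list of ids rather than a single id.
with_specials = tokenizer.encode("|")
without_specials = tokenizer.encode("|", add_special_tokens=False)
print(with_specials, without_specials)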
Example #2
def __init__(self, tokenizer: AutoTokenizer, path_json: str, max_length: int):
    self._essay_text = []
    self._essay_grade = []
    with open(path_json, "r") as file:
        json_list = json.load(file)

    for example in json_list:
        # Prepend the filename to the essay body, then encode and truncate it.
        text = example["filename"] + " " + example["text"]
        inputs_ids = torch.tensor(tokenizer.encode(text, max_length=max_length, truncation=True))
        grade = torch.tensor(float(example["grade"]))

        self._essay_text.append(inputs_ids)
        self._essay_grade.append(grade)
    # Right-pad all encoded essays with 0 so they stack into a single 2-D tensor.
    self._essay_text = pad_sequence(self._essay_text, batch_first=True, padding_value=0)
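A small, self-contained sketch of the record layout this constructor expects and of the padding step; the records and model name below are made up for illustration:

import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

# Each record needs the keys read in the loop above.
records = [
    {"filename": "essay_001.txt", "text": "First essay body ...", "grade": 7.5},
    {"filename": "essay_002.txt", "text": "Second essay body, somewhat longer ...", "grade": 9.0},
]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded = [
    torch.tensor(tokenizer.encode(r["filename"] + " " + r["text"], max_length=128, truncation=True))
    for r in records
]

# pad_sequence right-pads the shorter encodings with 0 so they stack into one tensor.
padded = pad_sequence(encoded, batch_first=True, padding_value=0)
print(padded.shape)  # (number of records, longest encoded length)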
Example #3
def filter_samples(samples, tokenizer: AutoTokenizer, vocab, template):
    """Drop samples whose obj_label is not a single, exactly reconstructable token,
    plus over-long sentences and Google-RE samples with mostly negative judgments."""
    msg = ""
    new_samples = []
    samples_excluded = 0

    for sample in samples:
        excluded = False
        if "obj_label" in sample and "sub_label" in sample:
            obj_label_ids = tokenizer.encode(sample["obj_label"],
                                             add_special_tokens=False)
            if obj_label_ids:
                # reconstructed_word = " ".join(
                #     [vocab[x] for x in obj_label_ids]
                # ).strip()
                reconstructed_word = tokenizer.decode(obj_label_ids)

                if len(obj_label_ids) > 1:
                    reconstructed_word = None
                # TODO: Find good solution for comparing two models
            else:
                reconstructed_word = None

            if not template or len(template) == 0:
                masked_sentences = sample["masked_sentences"]
                text = " ".join(masked_sentences)
                if len(text.split()) > tokenizer.model_max_length:
                    msg += "\tEXCLUDED for exeeding max sentence length: {}\n".format(
                        masked_sentences)
                    samples_exluded += 1
                    excluded = True

            # # MAKE SURE THAT obj_label IS IN VOCABULARIES
            # (Removed as we do not have multiple different models that require different vocabularies)

            if excluded:
                pass
            elif obj_label_ids is None:
                msg += "\tEXCLUDED object label {} not in model vocabulary\n".format(
                    sample["obj_label"])
                samples_excluded += 1
            elif not reconstructed_word or reconstructed_word != sample[
                    "obj_label"]:
                msg += "\tEXCLUDED object label {} not in model vocabulary\n".format(
                    sample["obj_label"])
                samples_excluded += 1
            elif "judgments" in sample:
                # only for Google-RE
                num_no = 0
                num_yes = 0
                for x in sample["judgments"]:
                    if x["judgment"] == "yes":
                        num_yes += 1
                    else:
                        num_no += 1
                if num_no > num_yes:
                    # SKIP NEGATIVE EVIDENCE
                    pass
                else:
                    new_samples.append(sample)
            else:
                new_samples.append(sample)
        else:
            msg += "\tEXCLUDED since 'obj_label' not sample or 'sub_label' not in sample: {}\n".format(
                sample)
            samples_exluded += 1
    msg += "samples exluded  : {}\n".format(samples_exluded)
    return new_samples, msg
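A minimal call sketch for the function above, assuming it is in scope and that samples follow a LAMA-style layout with "sub_label", "obj_label", and "masked_sentences" keys; the records and model name are made up for illustration:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

samples = [
    {"sub_label": "Paris", "obj_label": "france",
     "masked_sentences": ["Paris is the capital of [MASK]."]},
    {"sub_label": "missing object label"},  # excluded: no "obj_label" key
]

# vocab is only used by the commented-out reconstruction path, so None is fine here.
kept, log = filter_samples(samples, tokenizer, vocab=None, template="[X] is the capital of [Y].")
print(len(kept))
print(log)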
Example #4
def tokenize_text(text: str, tokenizer: AutoTokenizer, **kwargs) -> Tensor:
    # Encode the text (truncated to the model's limit) and return the ids as a torch long tensor.
    return tensor(tokenizer.encode(text, truncation=True, **kwargs), dtype=long)
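A short usage sketch for the helper above; it assumes `Tensor`, `tensor`, and `long` come from torch and `AutoTokenizer` from transformers, and the model name is only a placeholder:

from torch import Tensor, long, tensor
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tokenize_text("A short example sentence.", tokenizer, max_length=32)
print(ids.dtype)   # torch.int64, i.e. torch.long
print(ids.shape)   # 1-D tensor of token ids, including special tokens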