def __init__(self, abstracts: List[str], texts: List[str], tokenizer: AutoTokenizer, max_tokens=128, sep_word='|') -> None:
    """Store aligned abstracts/texts plus the tokenizer state used later.

    Args:
        abstracts: target summaries, one per entry in ``texts``.
        texts: source documents aligned with ``abstracts``.
        tokenizer: HuggingFace tokenizer used for all encoding.
        max_tokens: token budget to apply when sequences are encoded later.
        sep_word: literal separator word whose token ids are cached.
    """
    self.abstracts = abstracts
    self.texts = texts
    self.tokenizer = tokenizer
    # Encode only the separator word itself. Without add_special_tokens=False
    # the result would also carry the model's special markers (e.g. [CLS]/[SEP]),
    # which must not be spliced in between joined segments. This matches how
    # filter_samples in this file encodes bare words.
    # NOTE(review): behavior change vs. original (which kept special tokens) —
    # confirm no caller relied on them being present.
    self.sep_token = tokenizer.encode(sep_word, add_special_tokens=False)
    self.max_tokens = max_tokens
def __init__(self, tokenizer: AutoTokenizer, path_json: str, max_length: int):
    """Load essays from *path_json*, tokenize them, and pad into one batch.

    Each JSON record contributes ``"<filename> <text>"`` encoded (truncated to
    *max_length*) into ``self._essay_text`` and its float ``"grade"`` into
    ``self._essay_grade``.
    """
    self._essay_text = []
    self._essay_grade = []
    with open(path_json, "r") as handle:
        records = json.load(handle)
    for record in records:
        combined = f"{record['filename']} {record['text']}"
        token_ids = tokenizer.encode(combined, max_length=max_length, truncation=True)
        self._essay_text.append(torch.tensor(token_ids))
        self._essay_grade.append(torch.tensor(float(record["grade"])))
    # Right-pad every encoded essay with 0 so the texts form one 2-D tensor.
    self._essay_text = pad_sequence(self._essay_text, batch_first=True, padding_value=0)
def filter_samples(samples, tokenizer: AutoTokenizer, vocab, template):
    """Filter LAMA-style samples down to those the tokenizer can evaluate.

    A sample is kept only if it has both ``obj_label`` and ``sub_label``, its
    object label encodes to exactly one token that decodes back to the same
    string, its masked sentences fit the model length (when no *template* is
    given), and — for Google-RE style data — positive judgments are not
    outnumbered by negative ones.

    Args:
        samples: iterable of sample dicts.
        tokenizer: HuggingFace tokenizer used to encode/decode labels.
        vocab: unused; retained for interface compatibility with callers.
        template: prompt template; when empty/falsy the raw masked sentences
            are length-checked instead.

    Returns:
        ``(new_samples, msg)`` — the kept samples and a human-readable log.
        All log strings are kept byte-identical to the original (including
        the historical typos "exeeding"/"exluded") for log compatibility.
    """
    msg = ""
    new_samples = []
    samples_exluded = 0
    for sample in samples:
        # Guard: both labels must be present.
        if "obj_label" not in sample or "sub_label" not in sample:
            msg += "\tEXCLUDED since 'obj_label' not sample or 'sub_label' not in sample: {}\n".format(
                sample)
            samples_exluded += 1
            continue
        obj_label_ids = tokenizer.encode(sample["obj_label"], add_special_tokens=False)
        # Only a single-token label round-trips unambiguously; empty or
        # multi-token encodings are treated as out-of-vocabulary.
        # TODO: Find good solution for comparing two models
        if obj_label_ids and len(obj_label_ids) == 1:
            reconstructed_word = tokenizer.decode(obj_label_ids)
        else:
            reconstructed_word = None
        if not template or len(template) == 0:
            masked_sentences = sample["masked_sentences"]
            text = " ".join(masked_sentences)
            # Whitespace word count as a cheap proxy for the token budget.
            if len(text.split()) > tokenizer.model_max_length:
                msg += "\tEXCLUDED for exeeding max sentence length: {}\n".format(
                    masked_sentences)
                samples_exluded += 1
                continue
        # MAKE SURE THAT obj_label IS IN THE MODEL VOCABULARY.
        if not reconstructed_word or reconstructed_word != sample["obj_label"]:
            msg += "\tEXCLUDED object label {} not in model vocabulary\n".format(
                sample["obj_label"])
            samples_exluded += 1
            continue
        if "judgments" in sample:  # only for Google-RE
            num_yes = sum(1 for x in sample["judgments"] if x["judgment"] == "yes")
            num_no = len(sample["judgments"]) - num_yes
            if num_no > num_yes:
                # SKIP NEGATIVE EVIDENCE (silently: not counted as excluded,
                # matching the original behavior).
                continue
        new_samples.append(sample)
    msg += "samples exluded : {}\n".format(samples_exluded)
    return new_samples, msg
def tokenize_text(text: str, tokenizer: AutoTokenizer, **kwargs) -> Tensor:
    """Encode *text* with *tokenizer* (truncation on) as a long-dtype tensor.

    Extra keyword arguments are forwarded verbatim to ``tokenizer.encode``
    (e.g. ``max_length``).
    """
    token_ids = tokenizer.encode(text, truncation=True, **kwargs)
    return tensor(token_ids, dtype=longt)