def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    input_path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer()
    with open(input_path, "r", encoding="utf-8") as input_file:
        for idx, line in enumerate(input_file):
            # Skip the TSV header row.
            if idx == 0:
                continue
            values = line.split("\t")
            input1: str = normalizer.process(values[1].strip())
            input2: str = normalizer.process(values[2].strip())
            relatedness: float = float(values[3].strip())
            entailment: str = values[4].strip()
            yield self.create_example(input1, input2, relatedness, entailment)
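# Minimal usage sketch, assuming this `read` method lives on a task reader
# class. The class name and data directory below are hypothetical; only the
# read(data_path, split) signature is taken from the code above.
#
# reader = SickReader()  # hypothetical reader class
# for example in reader.read("data", "train"):
#     print(example)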
def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    # Non-train splits are read from the "out-" prefixed (out-of-domain) files.
    split = split if split == "train" else f"out-{split}"
    path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer()
    with open(path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # Each line is a whitespace-tokenized sentence followed by its label.
            words = line.split()
            label = words[-1]
            text = " ".join(words[:-1])
            # Re-attach Polish past-tense clitics ("-em", "-śmy", "-m") that
            # the tokenizer split off from the preceding verb.
            text = text.replace(" em ", "em ").replace(" śmy ", "śmy ").replace(" m ", "m ")
            text = normalizer.process(text)
            yield DataExample(text, label)
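# Standalone illustration of the clitic re-attachment performed above
# (no project imports needed; the sentence is made up for the example):
sample = "czekał em na was i widzieli śmy wszystko"
sample = sample.replace(" em ", "em ").replace(" śmy ", "śmy ").replace(" m ", "m ")
assert sample == "czekałem na was i widzieliśmy wszystko"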
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text1 = normalizer.process(row["sentence_A"].strip())
    text2 = normalizer.process(row["sentence_B"].strip())
    # Sentence pairs are passed as a two-element list; test rows may lack a target.
    label = row["entailment_judgment"].strip() if has_target else None
    return DataExample([text1, text2], label)
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text = normalizer.process(row["sentence"].strip())
    # Debug check: print any sentence that literally contains a label string,
    # which would indicate label leakage in the input text.
    for label in self.labels:
        if label in text:
            print(text)
    return DataExample(text, row["target"].strip() if has_target else None)
def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    # Training files are named "training_*"; dev and test keep the split name.
    split_name = "training" if split == "train" else split
    file_pattern = "{}_set_clean_only_{}.txt"
    text_path = os.path.join(data_path, self._spec.task_path(),
                             file_pattern.format(split_name, "text"))
    tags_path = os.path.join(data_path, self._spec.task_path(),
                             file_pattern.format(split_name, "tags"))
    normalizer = TextNormalizer(detokenize=False)
    with open(text_path, "r", encoding="utf-8") as text_file, \
            open(tags_path, "r", encoding="utf-8") as tags_file:
        text_lines = text_file.readlines()
        tags_lines = tags_file.readlines()
        # Texts and tags are parallel files: one label per line of text.
        assert len(text_lines) == len(tags_lines)
        for text_line, tag_line in zip(text_lines, tags_lines):
            text = normalizer.process(text_line.strip())
            # Replace the anonymized handle with a generic user-mention token.
            text = text.replace("@anonymized_account", "@ użytkownik")
            label = tag_line.strip()
            yield DataExample(text, label)
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text1 = normalizer.process(row["sentence_A"].strip())
    text2 = normalizer.process(row["sentence_B"].strip())
    if has_target:
        # Rescale the 0-5 relatedness score to the [0, 1] range and
        # format it with five decimal places.
        score = float(row["relatedness_score"])
        score = "%.5f" % (score / 5.0)
    else:
        score = None
    return DataExample([text1, text2], score)
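# Worked example of the rescaling above: a raw relatedness score of 4.2
# on the 0-5 scale maps to the string "0.84000".
assert "%.5f" % (4.2 / 5.0) == "0.84000"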
def read_simple(self, data_path: str, split: str, separator: str = " ",
                label_first: bool = True,
                normalize: bool = True) -> Iterable[DataExample]:
    label_idx = 0 if label_first else 1
    text_idx = 1 if label_first else 0
    input_path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer() if normalize else None
    with open(input_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # Split on the first separator only, so the text may itself
            # contain the separator character.
            values = line.split(sep=separator, maxsplit=1)
            # Strip the label too: when the label comes last, it would
            # otherwise keep the trailing newline.
            label = values[label_idx].strip()
            text = values[text_idx].strip()
            if normalizer is not None:
                text = normalizer.process(text)
            yield DataExample(text, label)
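# Hypothetical usage sketch for read_simple; the reader instance and the
# data layout are assumptions, only the signature comes from the code above.
# A line like "positive Świetny film!" parses with the defaults
# (separator=" ", label_first=True), while "Świetny film!\tpositive" would
# need separator="\t" and label_first=False:
#
# for example in reader.read_simple("data", "train", separator="\t",
#                                   label_first=False):
#     ...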
def normalizer(self) -> TextNormalizer:
    return TextNormalizer(detokenize=False)
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text = normalizer.process(row["sentence"].strip())
    # Replace the anonymized Twitter handle with a generic user-mention token.
    text = text.replace("@anonymized_account", "@ użytkownik")
    return DataExample(text, row["target"].strip() if has_target else None)
def normalizer(self) -> TextNormalizer:
    return TextNormalizer()
def normalizer(self) -> TextNormalizer:
    # English-language variant that keeps tokenized text as-is.
    return TextNormalizer(detokenize=False, lang="en")