def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    input_path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer()
    with open(input_path, "r", encoding="utf-8") as input_file:
        for idx, line in enumerate(input_file):
            # Skip the TSV header row.
            if idx == 0:
                continue
            values = line.split("\t")
            input1: str = normalizer.process(values[1].strip())
            input2: str = normalizer.process(values[2].strip())
            relatedness: float = float(values[3].strip())
            entailment: str = values[4].strip()
            yield self.create_example(input1, input2, relatedness, entailment)
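# Minimal usage sketch, assuming this `read` method lives on a task reader
# class. The class name and data directory below are hypothetical; only the
# read(data_path, split) signature is taken from the code above.
#
# reader = SickReader()  # hypothetical reader class
# for example in reader.read("data", "train"):
#     print(example)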
def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    # Non-train splits are read from the "out-" prefixed (out-of-domain) files.
    split = split if split == "train" else f"out-{split}"
    path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer()
    with open(path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # Each line is a whitespace-tokenized sentence followed by its label.
            words = line.split()
            label = words[-1]
            text = " ".join(words[:-1])
            # Re-attach Polish past-tense clitics ("-em", "-śmy", "-m") that
            # the tokenizer split off from the preceding verb.
            text = text.replace(" em ", "em ").replace(" śmy ", "śmy ").replace(" m ", "m ")
            text = normalizer.process(text)
            yield DataExample(text, label)
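# Standalone illustration of the clitic re-attachment performed above
# (no project imports needed; the sentence is made up for the example):
sample = "czekał em na was i widzieli śmy wszystko"
sample = sample.replace(" em ", "em ").replace(" śmy ", "śmy ").replace(" m ", "m ")
assert sample == "czekałem na was i widzieliśmy wszystko"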
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text1 = normalizer.process(row["sentence_A"].strip())
    text2 = normalizer.process(row["sentence_B"].strip())
    # Sentence pairs are passed as a two-element list; test rows may lack a target.
    label = row["entailment_judgment"].strip() if has_target else None
    return DataExample([text1, text2], label)
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text = normalizer.process(row["sentence"].strip())
    # Debug check: print any sentence that literally contains a label string,
    # which would indicate label leakage in the input text.
    for label in self.labels:
        if label in text:
            print(text)
    return DataExample(text, row["target"].strip() if has_target else None)
def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    # Training files are named "training_*"; dev and test keep the split name.
    split_name = "training" if split == "train" else split
    file_pattern = "{}_set_clean_only_{}.txt"
    text_path = os.path.join(data_path, self._spec.task_path(),
                             file_pattern.format(split_name, "text"))
    tags_path = os.path.join(data_path, self._spec.task_path(),
                             file_pattern.format(split_name, "tags"))
    normalizer = TextNormalizer(detokenize=False)
    with open(text_path, "r", encoding="utf-8") as text_file, \
            open(tags_path, "r", encoding="utf-8") as tags_file:
        text_lines = text_file.readlines()
        tags_lines = tags_file.readlines()
        # Texts and tags are parallel files: one label per line of text.
        assert len(text_lines) == len(tags_lines)
        for text_line, tag_line in zip(text_lines, tags_lines):
            text = normalizer.process(text_line.strip())
            # Replace the anonymized handle with a generic user-mention token.
            text = text.replace("@anonymized_account", "@ użytkownik")
            label = tag_line.strip()
            yield DataExample(text, label)
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text1 = normalizer.process(row["sentence_A"].strip())
    text2 = normalizer.process(row["sentence_B"].strip())
    if has_target:
        # Rescale the 0-5 relatedness score to the [0, 1] range and
        # format it with five decimal places.
        score = float(row["relatedness_score"])
        score = "%.5f" % (score / 5.0)
    else:
        score = None
    return DataExample([text1, text2], score)
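# Worked example of the rescaling above: a raw relatedness score of 4.2
# on the 0-5 scale maps to the string "0.84000".
assert "%.5f" % (4.2 / 5.0) == "0.84000"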
def read_simple(self, data_path: str, split: str, separator: str = " ",
                label_first: bool = True,
                normalize: bool = True) -> Iterable[DataExample]:
    label_idx = 0 if label_first else 1
    text_idx = 1 if label_first else 0
    input_path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer() if normalize else None
    with open(input_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            # Split on the first separator only, so the text may itself
            # contain the separator character.
            values = line.split(sep=separator, maxsplit=1)
            # Strip the label too: when the label comes last, it would
            # otherwise keep the trailing newline.
            label = values[label_idx].strip()
            text = values[text_idx].strip()
            if normalizer is not None:
                text = normalizer.process(text)
            yield DataExample(text, label)
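# Hypothetical usage sketch for read_simple; the reader instance and the
# data layout are assumptions, only the signature comes from the code above.
# A line like "positive Świetny film!" parses with the defaults
# (separator=" ", label_first=True), while "Świetny film!\tpositive" would
# need separator="\t" and label_first=False:
#
# for example in reader.read_simple("data", "train", separator="\t",
#                                   label_first=False):
#     ...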
def normalizer(self) -> TextNormalizer:
    return TextNormalizer(detokenize=False)
def create_example(self, row: Dict, normalizer: TextNormalizer,
                   has_target: bool) -> DataExample:
    text = normalizer.process(row["sentence"].strip())
    # Replace the anonymized Twitter handle with a generic user-mention token.
    text = text.replace("@anonymized_account", "@ użytkownik")
    return DataExample(text, row["target"].strip() if has_target else None)
def normalizer(self) -> TextNormalizer:
    return TextNormalizer()
def normalizer(self) -> TextNormalizer:
    # English-language variant that keeps tokenized text as-is.
    return TextNormalizer(detokenize=False, lang="en")