Example #1
0
 def get_train_examples(self):
     """Build training examples from the train input file and its answer file."""
     input_lines = read_file_lines(self.train_path, strip_lines=True)
     answer_lines = read_file_lines(self.path_dict["train_ans"], strip_lines=True)
     return self._create_examples(lines=input_lines, ans_lines=answer_lines, set_type="train")
Example #2
0
 def get_val_examples(self):
     """Build validation examples from the val input file and its answer file."""
     input_lines = read_file_lines(self.val_path, strip_lines=True)
     answer_lines = read_file_lines(self.path_dict["val_ans"], strip_lines=True)
     return self._create_examples(lines=input_lines, ans_lines=answer_lines, set_type="val")
Example #3
0
 def _get_examples(self, phase):
     """Load English then non-English examples for *phase* and concatenate them."""
     examples = []
     for source_key, is_eng in (("eng", True), ("other", False)):
         examples += self._create_examples(
             lines=read_file_lines(self.path_dict[phase][source_key]),
             is_english=is_eng,
             set_type=phase,
         )
     return examples
Example #4
0
 def get_val_examples(self):
     """Return validation examples: English examples first, then the other language."""
     examples = []
     for source_key, is_eng in (("eng", True), ("other", False)):
         examples += self._create_examples(
             lines=read_file_lines(self.path_dict[source_key]),
             is_english=is_eng,
             set_type="val",
         )
     return examples
Example #5
0
def download_mutual_plus_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download the MuTual-plus dialogue dataset and write a task config.

    Each phase (train/dev/test) is distributed as thousands of small text
    files on GitHub; they are downloaded individually, merged into a single
    ``<phase>.jsonl`` file, and the per-file directory is deleted afterwards.

    Args:
        task_name: Name recorded in the task config.
        task_data_path: Directory to place the downloaded/merged data in.
        task_config_path: Path the JSON task config is written to.
    """
    num_files = {"train": 7088, "dev": 886, "test": 886}
    os.makedirs(task_data_path, exist_ok=True)
    for phase in num_files:
        # os.path.join instead of string concatenation for portable paths.
        os.makedirs(os.path.join(task_data_path, phase), exist_ok=True)
    for phase, count in num_files.items():
        examples = []
        for i in range(count):
            # Remote files are 1-indexed: train_1.txt .. train_7088.txt, etc.
            file_name = f"{phase}_{i + 1}.txt"
            download_utils.download_file(
                "https://raw.githubusercontent.com/Nealcly/MuTual/"
                f"master/data/mutual_plus/{phase}/{file_name}",
                os.path.join(task_data_path, phase, file_name),
            )
            for line in py_io.read_file_lines(os.path.join(task_data_path, phase, file_name)):
                examples.append(line)
        py_io.write_jsonl(examples, os.path.join(task_data_path, phase + ".jsonl"))
        # The per-example files are no longer needed once merged into one jsonl.
        shutil.rmtree(os.path.join(task_data_path, phase))

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
Example #6
0
 def _create_examples(cls, data_path, set_type):
     """Parse a blank-line-separated token/POS file into Examples.

     Each non-empty line holds a tab-separated token and POS tag (for the
     test split the POS column may be absent, in which case pos is None);
     a blank line ends the current sentence and emits one Example.

     Args:
         data_path: Path to the token/POS file.
         set_type: Split name ("train"/"val"/"test"); used in example guids.

     Returns:
         List of Example objects, one per sentence.
     """
     curr_token_list, curr_pos_list = [], []
     data_lines = read_file_lines(data_path, "r", encoding="utf-8")
     examples = []
     idx = 0
     for data_line in data_lines:
         data_line = data_line.strip()
         if data_line:
             if set_type == "test":
                 # Test files may omit the POS column entirely.
                 line_tokens = data_line.split("\t")
                 if len(line_tokens) == 2:
                     token, pos = line_tokens
                 else:
                     token, pos = data_line, None
             else:
                 token, pos = data_line.split("\t")
             curr_token_list.append(token)
             curr_pos_list.append(pos)
         else:
             examples.append(
                 Example(
                     guid="%s-%s" % (set_type, idx),
                     tokens=curr_token_list,
                     pos_list=curr_pos_list,
                 ))
             idx += 1
             curr_token_list, curr_pos_list = [], []
     # Flush a trailing sentence when the file doesn't end with a blank line.
     if curr_token_list:
         examples.append(
             # BUG FIX: guid was previously "%s-%s" % (idx, idx), producing
             # guids like "3-3" instead of the "<set_type>-<idx>" form used
             # in the loop above.
             Example(guid="%s-%s" % (set_type, idx),
                     tokens=curr_token_list,
                     pos_list=curr_pos_list))
     return examples
Example #7
0
 def get_val_examples(self):
     """Pair each validation JSON line with its stripped label line."""
     val_lines = read_json_lines(self.val_path)
     label_lines = read_file_lines(self.path_dict["val_labels"], strip_lines=True)
     return self._create_examples(lines=zip(val_lines, label_lines), set_type="val")
Example #8
0
 def _create_examples(cls, data_path, idx_path, set_type):
     """Parse paired token/POS and sentence-index files into Examples.

     ``data_path`` holds one tab-separated token/POS pair per line (the POS
     column may be absent for the test split); ``idx_path`` holds, line
     aligned, the sentence index each token belongs to. Blank lines delimit
     sentences in both files simultaneously.

     Args:
         data_path: Path to the token/POS file.
         idx_path: Path to the line-aligned sentence-index file.
         set_type: Split name ("train"/"val"/"test"); used in example guids.

     Returns:
         List of Example objects, one per sentence.
     """
     curr_token_list, curr_pos_list, idx_ls = [], [], []
     data_lines = read_file_lines(data_path, "r", encoding="utf-8")
     idx_lines = read_file_lines(idx_path, "r", encoding="utf-8")
     examples = []
     for data_line, idx_line in zip_equal(data_lines, idx_lines):
         data_line, idx_line = data_line.strip(), idx_line.strip()
         # Both files must have blank lines in exactly the same positions.
         assert bool(data_line) == bool(idx_line)
         if data_line:
             if set_type == "test":
                 # Test files may omit the POS column entirely.
                 line_tokens = data_line.split("\t")
                 if len(line_tokens) == 2:
                     token, pos = line_tokens
                 else:
                     token, pos = data_line, None
             else:
                 token, pos = data_line.split("\t")
             curr_token_list.append(token)
             curr_pos_list.append(pos)
             idx_ls.append(int(idx_line))
         else:
             # Every token in a sentence must carry the same sentence index.
             idx = get_all_same(idx_ls)
             examples.append(
                 Example(
                     guid="%s-%s" % (set_type, idx),
                     tokens=curr_token_list,
                     pos_list=curr_pos_list,
                 ))
             curr_token_list, curr_pos_list, idx_ls = [], [], []
     # Flush a trailing sentence when the file doesn't end with a blank line.
     if curr_token_list:
         idx = get_all_same(idx_ls)
         examples.append(
             # BUG FIX: guid was previously "%s-%s" % (idx, idx) here,
             # instead of the "<set_type>-<idx>" form used in the loop above.
             Example(
                 guid="%s-%s" % (set_type, idx),
                 tokens=curr_token_list,
                 pos_list=curr_pos_list,
             ))
     return examples
Example #9
0
 def _read_labels(cls, path):
     """Read one integer label per line from *path*."""
     return [int(raw.strip()) for raw in read_file_lines(path)]
Example #10
0
 def get_test_examples(self):
     """Create test-split examples from the raw test file."""
     test_lines = read_file_lines(self.test_path)
     return self._create_examples(lines=test_lines, set_type="test")
Example #11
0
 def get_val_examples(self):
     """Create validation-split examples from the raw val file."""
     val_lines = read_file_lines(self.val_path)
     return self._create_examples(lines=val_lines, set_type="val")
Example #12
0
 def get_train_examples(self):
     """Create train-split examples from the raw train file."""
     train_lines = read_file_lines(self.train_path)
     return self._create_examples(lines=train_lines, set_type="train")
Example #13
0
 def get_test_examples(self):
     """Create test examples; the test split ships without answer lines."""
     test_lines = read_file_lines(self.test_path, strip_lines=True)
     return self._create_examples(lines=test_lines, ans_lines=None, set_type="test")
Example #14
0
def download_tatoeba_data_and_write_config(task_data_base_path: str,
                                           task_config_base_path: str):
    """Download Tatoeba retrieval data (via LASER) and write per-language configs.

    For every non-English language in the XTREME Tatoeba subset this pulls
    the paired English/other-language sentence files out of the LASER repo,
    sorts the English side alphabetically (following the XTREME recipe, see
    inline note below), and keeps a labels file mapping each sorted English
    sentence back to its original line index for internal scoring.

    Args:
        task_data_base_path: Root directory for the per-task data directories.
        task_config_base_path: Directory the per-task JSON configs go in.
    """
    tatoeba_temp_path = py_io.create_dir(task_data_base_path, "tatoeba_temp")
    download_utils.download_and_unzip(
        "https://github.com/facebookresearch/LASER/archive/master.zip",
        tatoeba_temp_path,
    )
    # ISO 639-3 -> ISO 639-1 codes for the languages in the Tatoeba subset.
    languages_dict = {
        "afr": "af",
        "ara": "ar",
        "bul": "bg",
        "ben": "bn",
        "deu": "de",
        "ell": "el",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "pes": "fa",
        "fin": "fi",
        "fra": "fr",
        "heb": "he",
        "hin": "hi",
        "hun": "hu",
        "ind": "id",
        "ita": "it",
        "jpn": "ja",
        "jav": "jv",
        "kat": "ka",
        "kaz": "kk",
        "kor": "ko",
        "mal": "ml",
        "mar": "mr",
        "nld": "nl",
        "por": "pt",
        "rus": "ru",
        "swh": "sw",
        "tam": "ta",
        "tel": "te",
        "tha": "th",
        "tgl": "tl",
        "tur": "tr",
        "urd": "ur",
        "vie": "vi",
        "cmn": "zh",
        "eng": "en",
    }
    raw_base_path = os.path.join(tatoeba_temp_path, "LASER-master", "data",
                                 "tatoeba", "v1")
    for full_lang, lang in languages_dict.items():
        task_name = f"tatoeba_{lang}"
        # English is the pivot side of every pair; there is no en-en task.
        if lang == "en":
            continue
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        eng_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.eng")
        other_src = os.path.join(raw_base_path,
                                 f"tatoeba.{full_lang}-eng.{full_lang}")
        eng_out = os.path.join(task_data_path, f"{lang}-en.en")
        other_out = os.path.join(task_data_path, f"{lang}-en.{lang}")
        labels_out = os.path.join(task_data_path, f"{lang}-en.labels")
        tgts = [line.strip() for line in py_io.read_file_lines(eng_src)]
        # The non-English side is kept in its original order; just move it.
        os.rename(src=other_src, dst=other_out)
        # Pair each English sentence with its original line index; the index
        # becomes the retrieval label after the alphabetical sort below.
        data = zip(tgts, range(len(tgts)))

        # Tatoeba is a retrieval dataset where you have a set of sentences in English and another
        # set in another language, and you need to match them. It also doesn't have training
        # data, so it's pretty much evaluation only. However, the dataset is distributed with the
        # sentences in order, i.e. the retrieval pairing is the sentence order.
        #
        # The XTREME authors intentionally scramble the order by sorting one of the two
        # sets alphabetically. We're following their recipe, but also retaining the labels for
        # internal scoring.
        with py_io.get_lock(eng_out):
            with py_io.get_lock(labels_out):
                if os.path.exists(eng_out) and os.path.exists(labels_out):
                    logger.info('Skip writing to %s since it already exists.',
                                eng_out)
                    logger.info('Skip writing to %s since it already exists.',
                                labels_out)
                else:
                    # BUG FIX: write UTF-8 explicitly so non-ASCII sentences
                    # don't depend on the platform default encoding.
                    with open(eng_out, "w", encoding="utf-8") as ftgt, \
                            open(labels_out, "w", encoding="utf-8") as flabels:
                        for t, i in sorted(data, key=lambda x: x[0]):
                            ftgt.write(f"{t}\n")
                            flabels.write(f"{i}\n")
        py_io.write_json(
            data={
                "task": "tatoeba",
                "paths": {
                    "eng": eng_out,
                    "other": other_out,
                    "labels_path": labels_out
                },
                "kwargs": {
                    "language": lang
                },
                "name": task_name,
            },
            path=os.path.join(task_config_base_path,
                              f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tatoeba_temp_path)