Ejemplo n.º 1
0
def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    """Download XNLI, split it by language, and write one task config per language.

    The XNLI-1.0 archive is unpacked into a temporary ``xnli_temp`` directory
    created under ``task_data_base_path``. The rows of its dev and test JSONL
    files are grouped by their ``"language"`` field. For every language present
    in the dev data, an ``xnli_<lang>`` data directory containing ``val.jsonl``
    and ``test.jsonl`` is written, along with an ``xnli_<lang>_config.json``
    file in ``task_config_base_path``.

    Args:
        task_data_base_path: Directory under which the per-language data
            directories (and the temporary download directory) are created.
        task_config_base_path: Directory in which the per-language config
            JSON files are written.
    """
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    try:
        download_utils.download_and_unzip(
            "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip", xnli_temp_path,
        )
        full_val_data = py_io.read_jsonl(
            os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl")
        )
        val_data = datastructures.group_by(full_val_data, key_func=lambda elem: elem["language"])
        full_test_data = py_io.read_jsonl(
            os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl")
        )
        test_data = datastructures.group_by(full_test_data, lambda elem: elem["language"])
        # The set of languages is taken from the dev split; the test split is
        # indexed with the same keys.
        for lang in sorted(val_data):
            task_name = f"xnli_{lang}"
            task_data_path = py_io.create_dir(task_data_base_path, task_name)
            val_path = os.path.join(task_data_path, "val.jsonl")
            test_path = os.path.join(task_data_path, "test.jsonl")
            py_io.write_jsonl(data=val_data[lang], path=val_path)
            py_io.write_jsonl(data=test_data[lang], path=test_path)
            py_io.write_json(
                data={
                    "task": "xnli",
                    "paths": {"val": val_path, "test": test_path},
                    "name": task_name,
                    "kwargs": {"language": lang},
                },
                path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            )
    finally:
        # Always remove the temporary download directory, even when the
        # download, parsing, or writing above fails part-way through.
        shutil.rmtree(xnli_temp_path)
Ejemplo n.º 2
0
def download_fever_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download FEVER NLI, attach dev labels, and write the task config.

    The FEVER NLI archive is unpacked into ``task_data_path``. Dev labels are
    recovered by matching each dev row's ``"cid"`` against the ``"id"`` of the
    original FEVER dev set, which is downloaded temporarily. The resulting
    ``train.jsonl``, ``val.jsonl`` and ``test.jsonl`` end up directly under
    ``task_data_path``, and the extracted ``nli_fever`` directory is removed.

    Args:
        task_name: Value stored under both ``"task"`` and ``"name"`` in the config.
        task_data_path: Directory that receives the data files (created if missing).
        task_config_path: Path of the JSON config file to write.
    """
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        ("https://www.dropbox.com/s/hylbuaovqwo2zav/nli_fever.zip?dl=1"), task_data_path,
    )
    # The FEVER NLI dev set ships without labels, so the original FEVER dev
    # set is fetched as well and its labels are joined in by example ID.
    fever_dev_tmp = os.path.join(task_data_path, "fever-dev-temp.jsonl")
    download_utils.download_file(
        "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl", fever_dev_tmp,
    )
    labels_by_id = {}
    for record in py_io.read_jsonl(fever_dev_tmp):
        if "id" not in record:
            logging.warning("FEVER dev dataset is missing ID.")
        elif "label" not in record:
            logging.warning("FEVER dev dataset is missing label.")
        else:
            labels_by_id[record["id"]] = record["label"]
    os.remove(fever_dev_tmp)

    extracted_dir = os.path.join(task_data_path, "nli_fever")
    nli_dev_path = os.path.join(extracted_dir, "dev_fitems.jsonl")
    val_path = os.path.join(task_data_path, "val.jsonl")

    labeled_dev = []
    for record in py_io.read_jsonl(nli_dev_path):
        if "cid" not in record:
            logging.warning("Data in {} is missing CID.".format(nli_dev_path))
            continue
        # IDs in the original dev set are looked up as integers.
        cid = int(record["cid"])
        if cid not in labels_by_id:
            logging.warning("Could not match CID {} to dev data.".format(record["cid"]))
            continue
        record["label"] = labels_by_id[cid]
        labeled_dev.append(record)
    py_io.write_jsonl(labeled_dev, val_path)
    os.remove(nli_dev_path)

    # Move the remaining splits out of the extracted folder before deleting it.
    moved_paths = {}
    for phase in ["train", "test"]:
        moved_paths[phase] = os.path.join(task_data_path, f"{phase}.jsonl")
        os.rename(os.path.join(extracted_dir, f"{phase}_fitems.jsonl"), moved_paths[phase])
    shutil.rmtree(extracted_dir)

    config = {
        "task": task_name,
        "paths": {
            "train": moved_paths["train"],
            "val": val_path,
            "test": moved_paths["test"],
        },
        "name": task_name,
    }
    py_io.write_json(data=config, path=task_config_path)
Ejemplo n.º 3
0
Archivo: zlog.py Proyecto: HonoMi/jiant
def load_log(fol_path):
    """Load every ``zlog`` file found under ``fol_path`` into a dict.

    Args:
        fol_path: Folder passed to ``filesystem.find_files_with_ext`` to
            locate files with the ``"zlog"`` extension.

    Returns:
        A dict mapping each file's path relative to ``fol_path`` (with a
        trailing ``.zlog`` removed) to the result of ``py_io.read_jsonl``
        for that file.
    """
    base_dir = os.path.abspath(fol_path)
    log_data = {}
    for path in filesystem.find_files_with_ext(fol_path, "zlog"):
        # Use relpath rather than str.replace: replace() would also delete
        # later occurrences of the folder prefix (or of ".zlog") that happen
        # to appear inside the remainder of the path, corrupting the key.
        key = os.path.relpath(os.path.abspath(path), base_dir)
        if key.endswith(".zlog"):
            key = key[: -len(".zlog")]
        log_data[key] = py_io.read_jsonl(path)
    return log_data
Ejemplo n.º 4
0
    def read_examples(cls, path: str, set_type: str):
        """Read question-answering examples from a ``.gz`` or ``.jsonl`` file.

        The first row of the file must contain a ``"header"`` key and is
        skipped. Every following row provides a ``"context"`` string and a
        list of ``"qas"``; one ``Example`` is produced per entry in ``"qas"``.

        For the training phase only the first detected answer and its first
        character span are used. For other phases every span of every
        detected answer is collected into ``answers``. Character spans are
        treated as inclusive at both ends.

        NOTE: ``qas_id`` is built from the row index, so all questions that
        belong to the same row share the same id.

        Args:
            path: File to read; must end in ``.gz`` or ``.jsonl``.
            set_type: Phase identifier; compared against ``PHASE.TRAIN``.

        Returns:
            A list of ``Example`` objects.

        Raises:
            KeyError: If ``path`` has neither of the supported suffixes.
        """
        if path.endswith(".gz"):
            with gzip.open(path, "r") as gz_file:
                rows = [json.loads(raw_row) for raw_row in gz_file.readlines()]
        elif path.endswith(".jsonl"):
            rows = read_jsonl(path)
        else:
            raise KeyError(f"Unknown format: {path}")

        # The first row is a header row, not an example.
        assert "header" in rows[0]

        examples = []
        for row_idx, row in enumerate(rows[1:]):
            for qa in row["qas"]:
                if set_type == PHASE.TRAIN:
                    # Training keeps only the first answer (e.g. "Car" out of
                    # "Car", "Vehicle") ...
                    first_answer = qa["detected_answers"][0]
                    # ... and only its first occurrence in the context
                    # (e.g. in "Every car is a car.").
                    first_span = first_answer["char_spans"][0]
                    answer_text = row["context"][first_span[0]:first_span[1] + 1]
                    start_char = first_span[0]
                    answers = []
                else:
                    answers = [
                        {
                            "answer_start": span[0],
                            "text": row["context"][span[0]:span[1] + 1],
                        }
                        for detected in qa["detected_answers"]
                        for span in detected["char_spans"]
                    ]
                    answer_text = None
                    start_char = None
                examples.append(
                    Example(
                        qas_id=f"{set_type}-{row_idx}",
                        question_text=qa["question"],
                        context_text=row["context"],
                        answer_text=answer_text,
                        start_position_character=start_char,
                        title="",
                        is_impossible=False,
                        answers=answers,
                    )
                )
        return examples
Ejemplo n.º 5
0
 def _create_examples(cls, path, set_type):
     """Lazily yield one ``Example`` per row of the JSONL file at ``path``.

     Each row supplies ``"tokenized_text"`` and ``"masked_spans"``; every
     entry of ``"masked_spans"`` is unpacked as a ``(start, end)`` pair and
     wrapped in an ``ExclusiveSpan``. The guid is ``"<set_type>-<row index>"``.
     """
     for row_idx, record in enumerate(py_io.read_jsonl(path)):
         guid = "%s-%s" % (set_type, row_idx)
         tokens = record["tokenized_text"]
         spans = [ExclusiveSpan(lo, hi) for lo, hi in record["masked_spans"]]
         yield Example(guid=guid, tokenized_text=tokens, masked_spans=spans)
Ejemplo n.º 6
0
def default_get_test_labels(task: Task, possible_labels):
    """Return the labels of the task's test set.

    Whether this function works depends on the subclass implementation.
    Many subclasses share the same interface (even though the base class does
    not declare it explicitly), so this implementation works for them.

    The test file is read with ``read_jsonl`` and passed to
    ``task._create_examples`` with the set type ``"valid"``. If any resulting
    label is not one of ``possible_labels``, every label is replaced by the
    last entry of ``possible_labels``.
    """
    test_examples = task._create_examples(read_jsonl(task.test_path), "valid")
    labels = [example.label for example in test_examples]
    has_unknown_label = any(label not in possible_labels for label in labels)
    if has_unknown_label:
        # masked labels -- presumably the test set hides its gold labels;
        # fall back to a uniform placeholder label.
        labels = [possible_labels[-1]] * len(labels)
    return labels
Ejemplo n.º 7
0
 def read_examples(cls, path: str, set_type: str):
     """Read question-answering examples from the JSONL file at ``path``.

     Each entry provides a ``"text"`` context and a list of ``"qas"``. Every
     qa has a ``"question"`` and an ``"answer"`` whose ``"s"`` / ``"e"``
     values are used as slice bounds into the context (start inclusive, end
     exclusive). The ``qas_id`` is ``"<set_type>-<running example count>"``.

     Returns:
         A list of ``Example`` objects, one per qa.
     """
     examples = []
     for entry in read_jsonl(path):
         for qa in entry["qas"]:
             context = entry["text"]
             span = qa["answer"]
             start = span["s"]
             answer_text = context[start:span["e"]]
             gold_answer = {"answer_start": start, "text": answer_text}
             examples.append(
                 Example(
                     qas_id=f"{set_type}-{len(examples)}",
                     question_text=qa["question"],
                     context_text=context,
                     answer_text=answer_text,
                     start_position_character=start,
                     title="",
                     is_impossible=False,
                     answers=[gold_answer],
                 )
             )
     return examples
Ejemplo n.º 8
0
 def read_examples(cls, path: str, set_type: str):
     """Read question-answering examples from the JSONL file at ``path``.

     Each row provides ``"question"``, ``"context"``, ``"title"`` and an
     ``"answers"`` dict with parallel ``"answer_start"`` and ``"text"`` lists.

     For the training phase, one ``Example`` is emitted per answer (all of
     them sharing the row's ``qas_id``). For other phases a single
     ``Example`` per row is emitted with all answers collected in ``answers``.

     Args:
         path: JSONL file to read.
         set_type: Phase identifier; compared against ``PHASE.TRAIN``.

     Returns:
         A list of ``Example`` objects.
     """
     examples = []
     for row_idx, record in enumerate(read_jsonl(path)):
         answer_pairs = zip(record["answers"]["answer_start"], record["answers"]["text"])
         if set_type == PHASE.TRAIN:
             examples.extend(
                 Example(
                     qas_id=f"{set_type}-{row_idx}",
                     question_text=record["question"],
                     context_text=record["context"],
                     answer_text=text,
                     start_position_character=start,
                     title=record["title"],
                     is_impossible=False,
                     answers=[],
                 )
                 for start, text in answer_pairs
             )
             continue
         gold_answers = [
             {"answer_start": start, "text": text} for start, text in answer_pairs
         ]
         examples.append(
             Example(
                 qas_id=f"{set_type}-{row_idx}",
                 question_text=record["question"],
                 context_text=record["context"],
                 answer_text=None,
                 start_position_character=None,
                 title=record["title"],
                 is_impossible=False,
                 answers=gold_answers,
             )
         )
     return examples
Ejemplo n.º 9
0
 def get_test_examples(self):
     """Build the test examples from the file at ``self.test_path``.

     The file is read with ``read_jsonl`` and handed to
     ``self._create_examples`` together with ``self.language``.
     """
     test_lines = read_jsonl(self.test_path)
     return self._create_examples(lines=test_lines, set_type="test", language=self.language)
Ejemplo n.º 10
0
 def get_val_examples(self):
     """Build the validation examples from the file at ``self.val_path``.

     The file is read with ``read_jsonl`` and handed to
     ``self._create_examples`` together with ``self.language``.
     """
     val_lines = read_jsonl(self.val_path)
     return self._create_examples(lines=val_lines, set_type="val", language=self.language)
Ejemplo n.º 11
0
 def get_val_examples(self):
     """Build the validation examples from the file at ``self.path_dict['validation']``."""
     val_lines = read_jsonl(self.path_dict['validation'])
     return self._create_examples(lines=val_lines, set_type="val")
Ejemplo n.º 12
0
 def get_train_examples(self):
     """Build the training examples from the file at ``self.train_path``."""
     train_lines = read_jsonl(self.train_path)
     return self._create_examples(lines=train_lines, set_type="train")