Example #1
    def preprocess(self, re_preprocess: bool = False):
        """
        Preprocess the dataset.

        Args:
            re_preprocess (bool): If True, the dataset is preprocessed again even if
                preprocessed files already exist.
        """
        if not os.path.isdir(self.root_folder):
            os.makedirs(self.root_folder)
        is_preprocessed = len(set(os.listdir(self.root_folder)).intersection(set(self.file_names))) == len(self.file_names)
        if re_preprocess or not is_preprocessed:
            # check whether the raw files exist
            raw_folder = os.path.join(self.root_folder, "raw")
            if not os.path.isdir(raw_folder) or len(os.listdir(raw_folder)) == 0:
                if not os.path.isdir(raw_folder):
                    os.makedirs(raw_folder)
                print("\033[33mWarning: {0} is not initialized!\033[0m".format(self.dataset_name))
                print("You need to manually download the dataset {0} into the folder {1}".format(self.dataset_name, raw_folder))
                for url in self.urls:
                    print(url)
                exit(0)

            # preprocess raw data with a unified format
            results = self._preprocess()
            assert all([key in self.file_names for key in results.keys()]),\
                   "Preprocessed files should be included in {0}".format(self.file_names)
            for key in results.keys():
                if key.endswith(".jsonl"):
                    assert isinstance(results[key], list)
                    save_jsonl(results[key], self.root_folder, key)
                else:
                    assert isinstance(results[key], list)
                    save_text("\n".join(results[key]), self.root_folder, key)
            print("Datahub has been preprocessed successfully in {0} with {1}".format(self.root_folder, results.keys()))
Example #2
def test_chartokenizer():
    """Test the class `CharTokenizer`."""
    tokens = ["[UNK]", "[PAD]", "南", "京", "市", "长", "江", "大", "桥"]
    token_embeddings = [[0.0, 0.0], [0.0, 0.0], [0.9, 0.3], [0.7, 0.8],
                        [0.21, 0.78], [0.51, 0.82], [0.23, 0.91], [0.39, 0.61],
                        [0.98, 0.45]]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    torch.save(torch.tensor(token_embeddings),
               os.path.join(folder_name, "token_embeddings.checkpoints"))
    config = {"n_tokens": 9, "token_dim": 2}
    save_json(config, folder_name, "token_configs.json")
    tokenizer = CharTokenizer(folder_name)
    assert len(tokenizer) == 9
    config["pad_id"] = 1
    assert tokenizer.configs() == config
    assert tokenizer.unk_token == "[UNK]"
    assert tokenizer.pad_token == "[PAD]"
    assert tokenizer["南"] == 2
    assert tokenizer[3] == "京"
    assert tokenizer.tokenize("南京是好朋友") == ["南", "京", "是", "好", "朋", "友"]
    assert tokenizer.convert_tokens_to_ids(["南", "京", "是", "好", "朋",
                                            "友"]) == [2, 3, 0, 0, 0, 0]
    assert [[round(e, 2) for e in em]
            for em in tokenizer.embeddings().tolist()] == token_embeddings
    tmp_folder.cleanup()
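
The assertions above imply a plain character-to-id lookup with an [UNK] fallback for out-of-vocabulary characters. A hypothetical re-derivation of the convert_tokens_to_ids result (illustration only, not the CharTokenizer implementation):

token_to_id = {t: i for i, t in enumerate(
    ["[UNK]", "[PAD]", "南", "京", "市", "长", "江", "大", "桥"])}
ids = [token_to_id.get(ch, token_to_id["[UNK]"]) for ch in "南京是好朋友"]
print(ids)  # [2, 3, 0, 0, 0, 0] -- unknown characters map to the [UNK] id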
Example #3
    def save(self, folder: str):
        """
        Save tokens to a folder.

        Args:
            folder (str): The folder that the tokens will be saved into.
        """
        save_text("\n".join(self._to_tokens), folder, "tokens.txt")
        save_json(self.configs(), folder, "token_configs.json")
Example #4
    def save(self, folder: str, file_name: str = None):
        """
        Save labels to a folder.

        Args:
            folder (str): The folder that the labels will be saved into.
            file_name (str): The name of the file to save the labels to.
        """
        if file_name is None:
            file_name = self._file_name
        save_text("\n".join(self._to_tags), folder, file_name)
Example #5
def test_outseqlab():
    """Test the class `OutSeqlab`."""
    labels = ["O", "B-Example", "I-Example"]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(labels), folder_name, "labels")
    out_adapter = OutSeqlab(folder_name, "labels")
    assert len(out_adapter) == 3
    assert out_adapter.unk_id == 0
    assert out_adapter.unk_label == "O"
    assert out_adapter.convert_labels_to_ids(["O", "O", "B-Example"]) == [0, 0, 1]
    out_adapter.save(folder_name, "test")
    with open(os.path.join(folder_name, "test"), "r") as f_in:
        text = f_in.read()
    assert text == "\n".join(labels)
    tmp_folder.cleanup()
Example #6
def test_lexicon_distribution():
    """Test the function `lexicon_distribution`."""
    lexicons = [
        ("[PAD]", "SEP", "TEST"),
        ("南京", "LOC", "TEST"),
        ("南京市", "LOC", "TEST"),
        ("长江", "VIEW", "TEST"),
        ("长江大桥", "BUILDING", "TEST"),
        ("江大桥", "PER", "TEST"),
        ("大桥", "SEGMENTATION", "TEST")
    ]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    gazetteer = Gazetteer(folder_name)
    results = {
        "sentence": {
            "dist": [],
            "avg": 0, "max": 0, "min": 0
        },
        "token": {
            "dist": [],
            "avg": 0, "max": 0, "min": 0
        },
        "gECR": 0.
    }
    assert lexicon_distribution(gazetteer, []) == results
    dataset = [
        {"text": "南京市长江大桥", "spans": [{"text": "南京", "label": "ORG", "start": 0, "end": 1}]},
        {"text": "重庆长江大桥", "spans": [{"text": "重庆", "label": "ORG", "start": 0, "end": 1}]}
    ]
    results = {
        "sentence": {
            "dist": [(4, 1), (6, 1)],
            "avg": 5.0, "max": 6, "min": 4
        },
        "token": {
            "dist": [(0, 2), (1, 1), (2, 4), (3, 6)],
            "avg": 27/13, "max": 3, "min": 0
        },
        "gECR": 0.5
    }
    assert lexicon_distribution(gazetteer, dataset) == results
    tmp_folder.cleanup()
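
The "token" distribution asserted above can be re-derived by counting, for every character in the dataset, how many gazetteer lexicons cover it. A self-contained sketch under that assumption (not the lexicon_distribution implementation itself):

from collections import Counter

lexicons = ["南京", "南京市", "长江", "长江大桥", "江大桥", "大桥"]
coverage = []
for text in ["南京市长江大桥", "重庆长江大桥"]:
    counts = [0] * len(text)
    for i in range(len(text)):
        for lex in lexicons:
            if text.startswith(lex, i):          # lexicon matched at position i
                for j in range(i, i + len(lex)): # every character it covers
                    counts[j] += 1
    coverage.extend(counts)
print(sorted(Counter(coverage).items()))  # [(0, 2), (1, 1), (2, 4), (3, 6)]
print(sum(coverage) / len(coverage))      # 27/13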
Example #7
def test_incgn():
    """Test the class `InCGN`."""
    # test normal mode
    max_seq_len = 8
    dataset = [{
        "text":
        "abcdefghijk",
        "spans": [{
            "label": "Example",
            "start": 1,
            "end": 2,
            "text": "ex",
            "confidence": 1.0
        }]
    }, {
        "text": "a",
        "spans": []
    }]
    tokens = [
        "[UNK]", "[PAD]", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"
    ]
    labels = ["O", "B-Example", "I-Example"]
    lexicons = [("[PAD]", "SEP", "TEST"), ("ab", "LOC", "TEST"),
                ("cd", "LOC", "TEST"), ("de", "VIEW", "TEST"),
                ("ki", "BUILDING", "TEST"), ("op", "PER", "TEST"),
                ("fg", "SEGMENTATION", "TEST"),
                ("cde", "SEGMENTATION", "TEST")]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    save_text("\n".join(labels), folder_name, "labels.txt")
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name,
              "lexicons.txt")
    tokenizer = CharTokenizer(folder_name)
    gazetteer = Gazetteer(folder_name)
    out_adapter = OutSeqlab(folder_name, "labels.txt")
    in_adapter = InCGN(dataset, max_seq_len, tokenizer, out_adapter, gazetteer)
    assert in_adapter.transform_sample(dataset[0]) == {
        "text":
        "abcdefghijk",
        "input_ids": [2, 3, 4, 5, 6, 7, 8, 9],
        "output_ids": [0, 1, 2, 0, 0, 0, 0, 0],
        "lexicon_ids": [1, 2, 7, 3, 6],
        "relations": [[((0, True), (0, False)), ((0, True), (1, False)),
                       ((1, True), (2, False)), ((1, True), (3, False)),
                       ((2, True), (2, False)), ((2, True), (3, False)),
                       ((2, True), (4, False)), ((3, True), (3, False)),
                       ((3, True), (4, False)), ((4, True), (5, False)),
                       ((4, True), (6, False))],
                      [((0, False), (1, False)), ((1, False), (2, False)),
                       ((2, False), (3, False)), ((3, False), (4, False)),
                       ((4, False), (5, False)), ((5, False), (6, False)),
                       ((6, False), (7, False)), ((0, True), (2, False)),
                       ((1, True), (1, False)), ((1, True), (0, True)),
                       ((1, True), (4, False)), ((2, True), (1, False)),
                       ((2, True), (0, True)), ((2, True), (5, False)),
                       ((3, True), (2, False)), ((3, True), (5, False)),
                       ((4, True), (4, False)), ((4, True), (2, True)),
                       ((4, True), (3, True)), ((4, True), (7, False))],
                      [((0, False), (1, False)), ((1, False), (2, False)),
                       ((2, False), (3, False)), ((3, False), (4, False)),
                       ((4, False), (5, False)), ((5, False), (6, False)),
                       ((6, False), (7, False)), ((0, True), (0, False)),
                       ((0, True), (1, False)), ((1, True), (2, False)),
                       ((1, True), (3, False)), ((2, True), (2, False)),
                       ((2, True), (4, False)), ((3, True), (3, False)),
                       ((3, True), (4, False)), ((4, True), (5, False)),
                       ((4, True), (6, False))]],
        "length":
        8,
        "start":
        0
    }
    assert in_adapter.transform_sample(dataset[1]) == {
        "text": "a",
        "input_ids": [2],
        "output_ids": [0],
        "lexicon_ids": [],
        "relations": [[], [], []],
        "length": 1,
        "start": 0
    }
    assert len(in_adapter) == 2
    tmp_folder.cleanup()

    # test MRC mode
    max_seq_len = 8
    dataset = [{
        "text":
        "acbq",
        "spans": [{
            "label": "T1",
            "start": 1,
            "end": 1,
            "text": "c",
            "confidence": 1.0
        }, {
            "label": "T2",
            "start": 2,
            "end": 3,
            "text": "bq",
            "confidence": 1.0
        }]
    }]
    lexicons = [("[PAD]", "SEP", "TEST"), ("c", "LOC", "TEST"),
                ("bq", "LOC", "TEST")]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "q", "1", "2", "[CLS]", "[SEP]"]
    queries = {"T1": "q1", "T2": "q2"}
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name,
              "lexicons.txt")
    tokenizer = CharTokenizer(folder_name)
    gazetteer = Gazetteer(folder_name)
    out_adapter = OutMRC()
    in_adapter = InCGN(dataset,
                       max_seq_len,
                       tokenizer,
                       out_adapter,
                       gazetteer=gazetteer,
                       queries=queries)
    assert in_adapter.transform_sample(dataset[0]) == [{
        "type":
        "T1",
        "query":
        "q1",
        "text":
        "acbq",
        "input_ids": [8, 5, 6, 9, 2, 4, 3, 5],
        "output_ids": [0, 0, 0, 0, 0, 1, 0, 0],
        "length":
        8,
        "start":
        4,
        "lexicon_ids": [1, 2],
        "relations": [[((0, True), (5, False)), ((1, True), (6, False)),
                       ((1, True), (7, False))],
                      [((0, False), (1, False)), ((1, False), (2, False)),
                       ((2, False), (3, False)), ((3, False), (4, False)),
                       ((4, False), (5, False)), ((5, False), (6, False)),
                       ((6, False), (7, False)), ((0, True), (4, False)),
                       ((0, True), (6, False)), ((1, True), (5, False)),
                       ((1, True), (0, True))],
                      [((0, False), (1, False)), ((1, False), (2, False)),
                       ((2, False), (3, False)), ((3, False), (4, False)),
                       ((4, False), (5, False)), ((5, False), (6, False)),
                       ((6, False), (7, False)), ((0, True), (5, False)),
                       ((0, True), (5, False)), ((1, True), (6, False)),
                       ((1, True), (7, False))]]
    }, {
        "type":
        "T2",
        "query":
        "q2",
        "text":
        "acbq",
        "input_ids": [8, 5, 7, 9, 2, 4, 3, 5],
        "output_ids": [0, 0, 0, 0, 0, 0, 1, 2],
        "length":
        8,
        "start":
        4,
        "lexicon_ids": [1, 2],
        "relations": [[((0, True), (5, False)), ((1, True), (6, False)),
                       ((1, True), (7, False))],
                      [((0, False), (1, False)), ((1, False), (2, False)),
                       ((2, False), (3, False)), ((3, False), (4, False)),
                       ((4, False), (5, False)), ((5, False), (6, False)),
                       ((6, False), (7, False)), ((0, True), (4, False)),
                       ((0, True), (6, False)), ((1, True), (5, False)),
                       ((1, True), (0, True))],
                      [((0, False), (1, False)), ((1, False), (2, False)),
                       ((2, False), (3, False)), ((3, False), (4, False)),
                       ((4, False), (5, False)), ((5, False), (6, False)),
                       ((6, False), (7, False)), ((0, True), (5, False)),
                       ((0, True), (5, False)), ((1, True), (6, False)),
                       ((1, True), (7, False))]]
    }]
    assert len(in_adapter) == 2
    tmp_folder.cleanup()
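
The lexicon_ids value [1, 2, 7, 3, 6] in the first assertion can be reproduced by scanning every start position of the text (truncated to max_seq_len) and collecting the indices of the lexicons that match there. A hypothetical illustration, not the InCGN implementation:

lexicons = ["[PAD]", "ab", "cd", "de", "ki", "op", "fg", "cde"]
text = "abcdefghijk"[:8]  # max_seq_len == 8
matched = []
for i in range(len(text)):
    for idx, lex in enumerate(lexicons):
        if idx > 0 and text.startswith(lex, i):  # skip the [PAD] placeholder
            matched.append(idx)
print(matched)  # [1, 2, 7, 3, 6]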
Example #8
def test_gazetteer():
    """Test the class `Gazetteer`."""
    lexicons = [
        ("[PAD]", "SEP", "TEST"),
        ("南京", "LOC", "TEST"),
        ("南京市", "LOC", "TEST"),
        ("长江", "VIEW", "TEST"),
        ("长江大桥", "BUILDING", "TEST"),
        ("江大桥", "PER", "TEST"),
        ("大桥", "SEGMENTATION", "TEST")
    ]
    lexicon_embeddings = [
        [0.0, 0.0],
        [1.0, 0.1],
        [0.9, 0.3],
        [0.7, 0.8],
        [0.21, 0.78],
        [0.51, 0.82],
        [0.23, 0.91]
    ]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    torch.save(torch.tensor(lexicon_embeddings), os.path.join(folder_name, "lexicon_embeddings.checkpoints"))
    save_json({"n_lexicons": 7, "lexicon_dim": 2}, folder_name, "lexicon_configs.json")
    gazetteer = Gazetteer(folder_name)
    assert len(gazetteer) == 7
    assert gazetteer.configs() == {"n_lexicons": 7, "lexicon_dim": 2, "n_edge_types": 7}
    assert gazetteer.pad_token == "[PAD]"
    assert gazetteer.num_types == 6
    assert gazetteer["长江"] == 3
    assert gazetteer[4] == "长江大桥"
    assert gazetteer.search(["长", "江", "大", "桥"]) == ["长江", "长江大桥"]
    assert gazetteer.exist(["长", "江", "大", "桥"]) is True
    assert gazetteer.exist(["长", "江", "大"]) is False
    assert [[round(e, 2) for e in em] for em in gazetteer.embeddings().tolist()] == lexicon_embeddings
    assert gazetteer.freq("南京") == 0
    gazetteer.count_freq([{"text": "南京市长江大桥"}])
    assert gazetteer.freq("南京") == 1
    # update lexicons
    gazetteer.update(["{0}\tTEST\tTEST".format(lexicon) for lexicon in ["[PAD]", "重庆", "长江"]])
    assert len(gazetteer) == 3
    assert gazetteer.configs() == {"n_lexicons": 3, "lexicon_dim": 2, "n_edge_types": 2}
    assert gazetteer.pad_token == "[PAD]"
    assert gazetteer.num_types == 1
    assert gazetteer["长江"] == 2
    assert gazetteer["重庆市"] == 0
    assert gazetteer.embeddings() is None
    tmp_folder.cleanup()
    # test mask
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    sentence = "南京市长江大桥"
    gazetteer = Gazetteer(folder_name)
    tmp_folder.cleanup()
    gazetteer.mask(["南京市", "长江大桥"], True)
    assert gazetteer.search(list(sentence)) == ["南京"]
    assert gazetteer.search(list(sentence[3:])) == ["长江"]
    gazetteer.mask(["南京市", "长江大桥"], False)
    assert gazetteer.search(list(sentence)) == ["南京", "南京市"]
    assert gazetteer.search(list(sentence[3:])) == ["长江", "长江大桥"]
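
The search calls above return every (unmasked) lexicon whose characters form a prefix of the given character sequence. A minimal sketch of that prefix-matching behaviour (illustration only; the real Gazetteer presumably uses a trie):

def prefix_search(lexicons, chars):
    text = "".join(chars)
    return [lex for lex in lexicons if lex != "[PAD]" and text.startswith(lex)]

lexicons = ["[PAD]", "南京", "南京市", "长江", "长江大桥", "江大桥", "大桥"]
print(prefix_search(lexicons, list("南京市长江大桥")))  # ['南京', '南京市']
print(prefix_search(lexicons, list("长江大桥")))        # ['长江', '长江大桥']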
Example #9
def test_inseqlab():
    """Test the class `InSeqlab`."""
    # test normal mode
    max_seq_len = 8
    dataset = [
        {
            "text": "abcdefghijk",
            "spans": [
                {"label": "Example", "start": 1, "end": 2, "text": "ex", "confidence": 1.0}
            ]
        },
        {
            "text": "a",
            "spans": []
        }
    ]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
    labels = ["O", "B-Example", "I-Example"]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    save_text("\n".join(labels), folder_name, "labels.txt")
    tokenizer = CharTokenizer(folder_name)
    out_adapter = OutSeqlab(folder_name, "labels.txt")
    in_adapter = InSeqlab(dataset, max_seq_len, tokenizer, out_adapter)
    assert in_adapter.transform_sample(dataset[1]) == \
        {"text": "a", "input_ids": [2], "output_ids": [0], "length": 1, "start": 0}
    assert in_adapter.transform_sample(dataset[0]) == \
        {"text": "abcdefghijk", "input_ids": [2, 3, 4, 5, 6, 7, 8, 9], "output_ids": [0, 1, 2, 0, 0, 0, 0, 0], "length": 8, "start": 0}
    assert len(in_adapter) == 2
    tmp_folder.cleanup()

    # test MRC mode
    max_seq_len = 8
    dataset = [
        {
            "text": "acbq",
            "spans": [
                {"label": "T1", "start": 1, "end": 1, "text": "c", "confidence": 1.0},
                {"label": "T2", "start": 2, "end": 3, "text": "bq", "confidence": 1.0}
            ]
        }
    ]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "q", "1", "2", "[CLS]", "[SEP]"]
    queries = {"T1": "q1", "T2": "q2"}
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    tokenizer = CharTokenizer(folder_name)
    out_adapter = OutMRC()
    in_adapter = InSeqlab(dataset, max_seq_len, tokenizer, out_adapter, queries=queries)
    assert in_adapter.transform_sample(dataset[0]) == [
        {
            "type": "T1", "query": "q1", "text": "acbq",
            "input_ids": [8, 5, 6, 9, 2, 4, 3, 5], "output_ids": [0, 0, 0, 0, 0, 1, 0, 0], "length": 8, "start": 4
        },
        {
            "type": "T2", "query": "q2", "text": "acbq",
            "input_ids": [8, 5, 7, 9, 2, 4, 3, 5], "output_ids": [0, 0, 0, 0, 0, 0, 1, 2], "length": 8, "start": 4
        }
    ]
    assert len(in_adapter) == 2
    tmp_folder.cleanup()
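
In MRC mode, the expected input_ids above are consistent with packing [CLS] + query characters + [SEP] + text characters, with start pointing at the first text position. A hypothetical sketch of that packing (an assumption for illustration; pack_mrc_input is not part of the library):

def pack_mrc_input(query, text, token_to_id, unk_id=0):
    chars = ["[CLS]"] + list(query) + ["[SEP]"] + list(text)
    input_ids = [token_to_id.get(ch, unk_id) for ch in chars]
    start = len(query) + 2  # first position belonging to the original text
    return input_ids, start

token_to_id = {t: i for i, t in enumerate(
    ["[UNK]", "[PAD]", "a", "b", "c", "q", "1", "2", "[CLS]", "[SEP]"])}
print(pack_mrc_input("q1", "acbq", token_to_id))  # ([8, 5, 6, 9, 2, 4, 3, 5], 4)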