def preprocess(self, re_preprocess: bool = False):
    """
    Preprocess the dataset.

    Args:
        re_preprocess (bool): If True, preprocess the dataset again even if preprocessed files already exist.
    """
    if not os.path.isdir(self.root_folder):
        os.makedirs(self.root_folder)
    is_preprocessed = len(set(os.listdir(self.root_folder)).intersection(set(self.file_names))) == len(self.file_names)
    if re_preprocess or not is_preprocessed:
        # check whether raw files exist
        raw_folder = os.path.join(self.root_folder, "raw")
        if not os.path.isdir(raw_folder) or len(os.listdir(raw_folder)) == 0:
            if not os.path.isdir(raw_folder):
                os.makedirs(raw_folder)
            print("\033[33mWarning: {0} is not initialized!\033[0m".format(self.dataset_name))
            print("You need to manually download the dataset {0} into the folder {1}".format(self.dataset_name, raw_folder))
            for url in self.urls:
                print(url)
            exit(0)
        # preprocess raw data into a unified format
        results = self._preprocess()
        assert all([key in self.file_names for key in results.keys()]), \
            "Preprocessed files should be included in {0}".format(self.file_names)
        for key in results.keys():
            assert isinstance(results[key], list)
            if key.endswith(".jsonl"):
                save_jsonl(results[key], self.root_folder, key)
            else:
                save_text("\n".join(results[key]), self.root_folder, key)
        print("Datahub has been preprocessed successfully in {0} with {1}".format(self.root_folder, results.keys()))
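# A minimal sketch of how `preprocess` is meant to be used, assuming the surrounding
# class (referred to as `Datahub` here, after the success message above) provides
# `root_folder`, `dataset_name`, `file_names` and `urls`, and calls back into
# `_preprocess`.  The subclass name, raw file name, URL and label set below are
# hypothetical; only the contract is taken from `preprocess` itself: `_preprocess`
# returns a dict whose keys are listed in `self.file_names`, "*.jsonl" values are
# lists of JSON-serializable samples, and all other values are lists of lines.
class ToyDatahub(Datahub):
    dataset_name = "toy"
    file_names = ["train.jsonl", "labels.txt"]
    urls = ["https://example.org/toy.zip"]  # hypothetical download location

    def _preprocess(self):
        samples = []
        with open(os.path.join(self.root_folder, "raw", "train.txt"), "r") as f_in:
            for line in f_in:
                samples.append({"text": line.strip(), "spans": []})
        return {"train.jsonl": samples, "labels.txt": ["O", "B-Example", "I-Example"]}

# ToyDatahub(...).preprocess() would then write train.jsonl and labels.txt into root_folder.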
def test_chartokenizer():
    """Test the class `CharTokenizer`."""
    tokens = ["[UNK]", "[PAD]", "南", "京", "市", "长", "江", "大", "桥"]
    token_embeddings = [
        [0.0, 0.0], [0.0, 0.0], [0.9, 0.3], [0.7, 0.8], [0.21, 0.78],
        [0.51, 0.82], [0.23, 0.91], [0.39, 0.61], [0.98, 0.45]
    ]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    torch.save(torch.tensor(token_embeddings), os.path.join(folder_name, "token_embeddings.checkpoints"))
    config = {"n_tokens": 9, "token_dim": 2}
    save_json(config, folder_name, "token_configs.json")
    tokenizer = CharTokenizer(folder_name)
    assert len(tokenizer) == 9
    config["pad_id"] = 1
    assert tokenizer.configs() == config
    assert tokenizer.unk_token == "[UNK]"
    assert tokenizer.pad_token == "[PAD]"
    assert tokenizer["南"] == 2
    assert tokenizer[3] == "京"
    assert tokenizer.tokenize("南京是好朋友") == ["南", "京", "是", "好", "朋", "友"]
    assert tokenizer.convert_tokens_to_ids(["南", "京", "是", "好", "朋", "友"]) == [2, 3, 0, 0, 0, 0]
    assert [[round(e, 2) for e in em] for em in tokenizer.embeddings().tolist()] == token_embeddings
    tmp_folder.cleanup()
def save(self, folder: str):
    """
    Save tokens to a folder.

    Args:
        folder (str): The folder the tokens will be saved into.
    """
    save_text("\n".join(self._to_tokens), folder, "tokens.txt")
    save_json(self.configs(), folder, "token_configs.json")
def save(self, folder: str, file_name: str = None):
    """
    Save labels to a folder.

    Args:
        folder (str): The folder the labels will be saved into.
        file_name (str): The file name of the saved labels.
    """
    if file_name is None:
        file_name = self._file_name
    save_text("\n".join(self._to_tags), folder, file_name)
def test_outseqlab():
    """Test the class `OutSeqlab`."""
    labels = ["O", "B-Example", "I-Example"]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(labels), folder_name, "labels")
    out_adapter = OutSeqlab(folder_name, "labels")
    assert len(out_adapter) == 3
    assert out_adapter.unk_id == 0
    assert out_adapter.unk_label == "O"
    assert out_adapter.convert_labels_to_ids(["O", "O", "B-Example"]) == [0, 0, 1]
    out_adapter.save(folder_name, "test")
    with open(os.path.join(folder_name, "test"), "r") as f_in:
        text = f_in.read()
    assert text == "\n".join(labels)
    tmp_folder.cleanup()
def test_lexicon_distribution():
    """Test the function `lexicon_distribution`."""
    lexicons = [
        ("[PAD]", "SEP", "TEST"),
        ("南京", "LOC", "TEST"),
        ("南京市", "LOC", "TEST"),
        ("长江", "VIEW", "TEST"),
        ("长江大桥", "BUILDING", "TEST"),
        ("江大桥", "PER", "TEST"),
        ("大桥", "SEGMENTATION", "TEST")
    ]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    gazetteer = Gazetteer(folder_name)
    # an empty dataset yields empty distributions and a zero coverage ratio
    results = {
        "sentence": {"dist": [], "avg": 0, "max": 0, "min": 0},
        "token": {"dist": [], "avg": 0, "max": 0, "min": 0},
        "gECR": 0.
    }
    assert lexicon_distribution(gazetteer, []) == results
    dataset = [
        {"text": "南京市长江大桥", "spans": [{"text": "南京", "label": "ORG", "start": 0, "end": 1}]},
        {"text": "重庆长江大桥", "spans": [{"text": "重庆", "label": "ORG", "start": 0, "end": 1}]}
    ]
    # "重庆长江大桥" matches 4 lexicons and "南京市长江大桥" matches 6; per token the number
    # of covering lexicons ranges from 0 (重, 庆) to 3 (江, 大, 桥); only one of the two
    # gold spans ("南京") is itself a lexicon, hence gECR == 0.5
    results = {
        "sentence": {"dist": [(4, 1), (6, 1)], "avg": 5.0, "max": 6, "min": 4},
        "token": {"dist": [(0, 2), (1, 1), (2, 4), (3, 6)], "avg": 27 / 13, "max": 3, "min": 0},
        "gECR": 0.5
    }
    assert lexicon_distribution(gazetteer, dataset) == results
    tmp_folder.cleanup()
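# The per-sentence counts asserted above can be reproduced by scanning every start
# position of a sentence with `Gazetteer.search`, which (as `test_gazetteer` shows)
# returns the lexicons matching a prefix of the given character list.  This is only an
# illustrative sketch of where the numbers come from, not the implementation of
# `lexicon_distribution` itself; the helper name is hypothetical.
def count_sentence_matches(gazetteer, text):
    """Count how many lexicons match anywhere inside `text` (illustrative only)."""
    chars = list(text)
    return sum(len(gazetteer.search(chars[i:])) for i in range(len(chars)))

# count_sentence_matches(gazetteer, "南京市长江大桥") -> 6
# count_sentence_matches(gazetteer, "重庆长江大桥") -> 4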
def test_incgn():
    """Test the class `InCGN`."""
    # test normal mode
    max_seq_len = 8
    dataset = [{
        "text": "abcdefghijk",
        "spans": [{"label": "Example", "start": 1, "end": 2, "text": "ex", "confidence": 1.0}]
    }, {
        "text": "a",
        "spans": []
    }]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
    labels = ["O", "B-Example", "I-Example"]
    lexicons = [
        ("[PAD]", "SEP", "TEST"), ("ab", "LOC", "TEST"), ("cd", "LOC", "TEST"),
        ("de", "VIEW", "TEST"), ("ki", "BUILDING", "TEST"), ("op", "PER", "TEST"),
        ("fg", "SEGMENTATION", "TEST"), ("cde", "SEGMENTATION", "TEST")
    ]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    save_text("\n".join(labels), folder_name, "labels.txt")
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    tokenizer = CharTokenizer(folder_name)
    gazetteer = Gazetteer(folder_name)
    out_adapter = OutSeqlab(folder_name, "labels.txt")
    in_adapter = InCGN(dataset, max_seq_len, tokenizer, out_adapter, gazetteer)
    # relation edges connect nodes written as (index, is_lexicon): (i, True) is the
    # i-th matched lexicon in "lexicon_ids", (j, False) the j-th character of the input
    assert in_adapter.transform_sample(dataset[0]) == {
        "text": "abcdefghijk",
        "input_ids": [2, 3, 4, 5, 6, 7, 8, 9],
        "output_ids": [0, 1, 2, 0, 0, 0, 0, 0],
        "lexicon_ids": [1, 2, 7, 3, 6],
        "relations": [
            [((0, True), (0, False)), ((0, True), (1, False)), ((1, True), (2, False)),
             ((1, True), (3, False)), ((2, True), (2, False)), ((2, True), (3, False)),
             ((2, True), (4, False)), ((3, True), (3, False)), ((3, True), (4, False)),
             ((4, True), (5, False)), ((4, True), (6, False))],
            [((0, False), (1, False)), ((1, False), (2, False)), ((2, False), (3, False)),
             ((3, False), (4, False)), ((4, False), (5, False)), ((5, False), (6, False)),
             ((6, False), (7, False)), ((0, True), (2, False)), ((1, True), (1, False)),
             ((1, True), (0, True)), ((1, True), (4, False)), ((2, True), (1, False)),
             ((2, True), (0, True)), ((2, True), (5, False)), ((3, True), (2, False)),
             ((3, True), (5, False)), ((4, True), (4, False)), ((4, True), (2, True)),
             ((4, True), (3, True)), ((4, True), (7, False))],
            [((0, False), (1, False)), ((1, False), (2, False)), ((2, False), (3, False)),
             ((3, False), (4, False)), ((4, False), (5, False)), ((5, False), (6, False)),
             ((6, False), (7, False)), ((0, True), (0, False)), ((0, True), (1, False)),
             ((1, True), (2, False)), ((1, True), (3, False)), ((2, True), (2, False)),
             ((2, True), (4, False)), ((3, True), (3, False)), ((3, True), (4, False)),
             ((4, True), (5, False)), ((4, True), (6, False))]
        ],
        "length": 8,
        "start": 0
    }
    assert in_adapter.transform_sample(dataset[1]) == {
        "text": "a",
        "input_ids": [2],
        "output_ids": [0],
        "lexicon_ids": [],
        "relations": [[], [], []],
        "length": 1,
        "start": 0
    }
    assert len(in_adapter) == 2
    tmp_folder.cleanup()

    # test MRC mode
    max_seq_len = 8
    dataset = [{
        "text": "acbq",
        "spans": [
            {"label": "T1", "start": 1, "end": 1, "text": "c", "confidence": 1.0},
            {"label": "T2", "start": 2, "end": 3, "text": "bq", "confidence": 1.0}
        ]
    }]
    lexicons = [("[PAD]", "SEP", "TEST"), ("c", "LOC", "TEST"), ("bq", "LOC", "TEST")]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "q", "1", "2", "[CLS]", "[SEP]"]
    queries = {"T1": "q1", "T2": "q2"}
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    tokenizer = CharTokenizer(folder_name)
    gazetteer = Gazetteer(folder_name)
    out_adapter = OutMRC()
    in_adapter = InCGN(dataset, max_seq_len, tokenizer, out_adapter, gazetteer=gazetteer,
                       queries=queries)
    # in MRC mode the input is laid out as [CLS] + query + [SEP] + text,
    # so "start" points at the first text token
    assert in_adapter.transform_sample(dataset[0]) == [{
        "type": "T1",
        "query": "q1",
        "text": "acbq",
        "input_ids": [8, 5, 6, 9, 2, 4, 3, 5],
        "output_ids": [0, 0, 0, 0, 0, 1, 0, 0],
        "length": 8,
        "start": 4,
        "lexicon_ids": [1, 2],
        "relations": [
            [((0, True), (5, False)), ((1, True), (6, False)), ((1, True), (7, False))],
            [((0, False), (1, False)), ((1, False), (2, False)), ((2, False), (3, False)),
             ((3, False), (4, False)), ((4, False), (5, False)), ((5, False), (6, False)),
             ((6, False), (7, False)), ((0, True), (4, False)), ((0, True), (6, False)),
             ((1, True), (5, False)), ((1, True), (0, True))],
            [((0, False), (1, False)), ((1, False), (2, False)), ((2, False), (3, False)),
             ((3, False), (4, False)), ((4, False), (5, False)), ((5, False), (6, False)),
             ((6, False), (7, False)), ((0, True), (5, False)), ((0, True), (5, False)),
             ((1, True), (6, False)), ((1, True), (7, False))]
        ]
    }, {
        "type": "T2",
        "query": "q2",
        "text": "acbq",
        "input_ids": [8, 5, 7, 9, 2, 4, 3, 5],
        "output_ids": [0, 0, 0, 0, 0, 0, 1, 2],
        "length": 8,
        "start": 4,
        "lexicon_ids": [1, 2],
        "relations": [
            [((0, True), (5, False)), ((1, True), (6, False)), ((1, True), (7, False))],
            [((0, False), (1, False)), ((1, False), (2, False)), ((2, False), (3, False)),
             ((3, False), (4, False)), ((4, False), (5, False)), ((5, False), (6, False)),
             ((6, False), (7, False)), ((0, True), (4, False)), ((0, True), (6, False)),
             ((1, True), (5, False)), ((1, True), (0, True))],
            [((0, False), (1, False)), ((1, False), (2, False)), ((2, False), (3, False)),
             ((3, False), (4, False)), ((4, False), (5, False)), ((5, False), (6, False)),
             ((6, False), (7, False)), ((0, True), (5, False)), ((0, True), (5, False)),
             ((1, True), (6, False)), ((1, True), (7, False))]
        ]
    }]
    assert len(in_adapter) == 2
    tmp_folder.cleanup()
def test_gazetteer():
    """Test the class `Gazetteer`."""
    lexicons = [
        ("[PAD]", "SEP", "TEST"),
        ("南京", "LOC", "TEST"),
        ("南京市", "LOC", "TEST"),
        ("长江", "VIEW", "TEST"),
        ("长江大桥", "BUILDING", "TEST"),
        ("江大桥", "PER", "TEST"),
        ("大桥", "SEGMENTATION", "TEST")
    ]
    lexicon_embeddings = [
        [0.0, 0.0], [1.0, 0.1], [0.9, 0.3], [0.7, 0.8],
        [0.21, 0.78], [0.51, 0.82], [0.23, 0.91]
    ]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    torch.save(torch.tensor(lexicon_embeddings), os.path.join(folder_name, "lexicon_embeddings.checkpoints"))
    save_json({"n_lexicons": 7, "lexicon_dim": 2}, folder_name, "lexicon_configs.json")
    gazetteer = Gazetteer(folder_name)
    assert len(gazetteer) == 7
    assert gazetteer.configs() == {"n_lexicons": 7, "lexicon_dim": 2, "n_edge_types": 7}
    assert gazetteer.pad_token == "[PAD]"
    assert gazetteer.num_types == 6
    assert gazetteer["长江"] == 3
    assert gazetteer[4] == "长江大桥"
    assert gazetteer.search(["长", "江", "大", "桥"]) == ["长江", "长江大桥"]
    assert gazetteer.exist(["长", "江", "大", "桥"]) is True
    assert gazetteer.exist(["长", "江", "大"]) is False
    assert [[round(e, 2) for e in em] for em in gazetteer.embeddings().tolist()] == lexicon_embeddings
    assert gazetteer.freq("南京") == 0
    gazetteer.count_freq([{"text": "南京市长江大桥"}])
    assert gazetteer.freq("南京") == 1

    # update lexicons
    gazetteer.update(["{0}\tTEST\tTEST".format(lexicon) for lexicon in ["[PAD]", "重庆", "长江"]])
    assert len(gazetteer) == 3
    assert gazetteer.configs() == {"n_lexicons": 3, "lexicon_dim": 2, "n_edge_types": 2}
    assert gazetteer.pad_token == "[PAD]"
    assert gazetteer.num_types == 1
    assert gazetteer["长江"] == 2
    assert gazetteer["重庆市"] == 0
    assert gazetteer.embeddings() is None
    tmp_folder.cleanup()

    # test mask
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(["\t".join(lex) for lex in lexicons]), folder_name, "lexicons.txt")
    sentence = "南京市长江大桥"
    gazetteer = Gazetteer(folder_name)
    tmp_folder.cleanup()
    gazetteer.mask(["南京市", "长江大桥"], True)
    assert gazetteer.search(list(sentence)) == ["南京"]
    assert gazetteer.search(list(sentence[3:])) == ["长江"]
    gazetteer.mask(["南京市", "长江大桥"], False)
    assert gazetteer.search(list(sentence)) == ["南京", "南京市"]
    assert gazetteer.search(list(sentence[3:])) == ["长江", "长江大桥"]
def test_inseqlab():
    """Test the class `InSeqlab`."""
    # test normal mode
    max_seq_len = 8
    dataset = [
        {
            "text": "abcdefghijk",
            "spans": [
                {"label": "Example", "start": 1, "end": 2, "text": "ex", "confidence": 1.0}
            ]
        },
        {
            "text": "a",
            "spans": []
        }
    ]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
    labels = ["O", "B-Example", "I-Example"]
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    save_text("\n".join(labels), folder_name, "labels.txt")
    tokenizer = CharTokenizer(folder_name)
    out_adapter = OutSeqlab(folder_name, "labels.txt")
    in_adapter = InSeqlab(dataset, max_seq_len, tokenizer, out_adapter)
    assert in_adapter.transform_sample(dataset[1]) == \
        {"text": "a", "input_ids": [2], "output_ids": [0], "length": 1, "start": 0}
    # the first sample is truncated to max_seq_len tokens
    assert in_adapter.transform_sample(dataset[0]) == \
        {"text": "abcdefghijk", "input_ids": [2, 3, 4, 5, 6, 7, 8, 9],
         "output_ids": [0, 1, 2, 0, 0, 0, 0, 0], "length": 8, "start": 0}
    assert len(in_adapter) == 2
    tmp_folder.cleanup()

    # test MRC mode
    max_seq_len = 8
    dataset = [
        {
            "text": "acbq",
            "spans": [
                {"label": "T1", "start": 1, "end": 1, "text": "c", "confidence": 1.0},
                {"label": "T2", "start": 2, "end": 3, "text": "bq", "confidence": 1.0}
            ]
        }
    ]
    tokens = ["[UNK]", "[PAD]", "a", "b", "c", "q", "1", "2", "[CLS]", "[SEP]"]
    queries = {"T1": "q1", "T2": "q2"}
    tmp_folder = tempfile.TemporaryDirectory()
    folder_name = tmp_folder.name
    save_text("\n".join(tokens), folder_name, "tokens.txt")
    tokenizer = CharTokenizer(folder_name)
    out_adapter = OutMRC()
    in_adapter = InSeqlab(dataset, max_seq_len, tokenizer, out_adapter, queries=queries)
    # in MRC mode each sample is expanded into one instance per query type, laid out as
    # [CLS] + query + [SEP] + text, so "start" points at the first text token
    assert in_adapter.transform_sample(dataset[0]) == [
        {
            "type": "T1", "query": "q1", "text": "acbq",
            "input_ids": [8, 5, 6, 9, 2, 4, 3, 5],
            "output_ids": [0, 0, 0, 0, 0, 1, 0, 0],
            "length": 8, "start": 4
        },
        {
            "type": "T2", "query": "q2", "text": "acbq",
            "input_ids": [8, 5, 7, 9, 2, 4, 3, 5],
            "output_ids": [0, 0, 0, 0, 0, 0, 1, 2],
            "length": 8, "start": 4
        }
    ]
    assert len(in_adapter) == 2
    tmp_folder.cleanup()