def load(self, path, cut_long_sent=False):
    """Read a blank-line separated, tab-delimited file into a DataSet of raw sentences.

    Lines starting with ``#`` are ignored. Each remaining line is split on
    tabs and grouped into a sample; a line that starts with a newline closes
    the current sample.

    :param path: path of the file to read (opened as utf-8).
    :param cut_long_sent: when true, each joined sentence is passed through
        ``cut_long_sentence`` and every returned piece becomes an instance.
    :return: a DataSet whose instances carry a ``raw_sentence`` field.
    """
    samples = []
    with open(path, 'r', encoding='utf-8') as fin:
        current = []
        for raw in fin:
            if raw.startswith('#'):
                continue
            if raw.startswith('\n'):
                samples.append(current)
                current = []
            else:
                current.append(raw.split('\t'))
        if current:
            samples.append(current)

    ds = DataSet()
    for sample in samples:
        parsed = self.get_one(sample)
        if parsed is None:
            # get_one signalled that this sample should be dropped.
            continue
        joined = ' '.join(parsed)
        pieces = cut_long_sentence(joined) if cut_long_sent else [joined]
        for piece in pieces:
            ds.append(Instance(raw_sentence=piece))
    return ds
def load(self, path):
    """Read a blank-line separated, tab-delimited file into a DataSet named 'conll'.

    Lines starting with ``#`` are ignored. Every sample is handed to
    ``self.get_one`` and the first four items of its result are wrapped in
    field objects.

    :param path: path of the file to read (opened as utf-8).
    :return: a DataSet with ``word_seq``, ``pos_seq``, ``head_indices`` and
        ``head_labels`` fields per instance.
    """
    samples = []
    with open(path, 'r', encoding='utf-8') as fin:
        current = []
        for raw in fin:
            if raw.startswith('#'):
                continue
            if raw.startswith('\n'):
                samples.append(current)
                current = []
            else:
                current.append(raw.split('\t'))
        if current:
            samples.append(current)

    ds = DataSet(name='conll')
    for sample in samples:
        res = self.get_one(sample)
        fields = {
            'word_seq': TextField(res[0], is_target=False),
            'pos_seq': TextField(res[1], is_target=False),
            'head_indices': SeqLabelField(res[2], is_target=True),
            'head_labels': TextField(res[3], is_target=True),
        }
        ds.append(Instance(**fields))
    return ds
def test_add_field(self):
    """Add a second field that is both input and target, then print the dataset."""
    ds = DataSet({"x": [3, 4]})
    new_column = [['hello', 'world'], ['this', 'is', 'a', 'test']]
    ds.add_field('y', new_column, is_input=True, is_target=True)
    print(ds)
def load(self, path):
    """Read a blank-line separated, tab-delimited file into a DataSet.

    Lines starting with ``#`` are ignored. Every sample is handed to
    ``self.get_one``; samples for which it returns ``None`` are dropped.

    :param path: path of the file to read (opened as utf-8).
    :return: a DataSet with ``words``, ``pos_tags``, ``heads`` and ``labels``
        fields taken from items 0-3 of each parsed sample.
    """
    samples = []
    with open(path, 'r', encoding='utf-8') as fin:
        current = []
        for raw in fin:
            if raw.startswith('#'):
                continue
            if raw.startswith('\n'):
                samples.append(current)
                current = []
            else:
                current.append(raw.split('\t'))
        if current:
            samples.append(current)

    # Parse everything first, then keep only the samples that survived.
    parsed = [self.get_one(sample) for sample in samples]
    kept = [example for example in parsed if example is not None]

    ds = DataSet()
    for example in kept:
        ds.append(Instance(words=example[0],
                           pos_tags=example[1],
                           heads=example[2],
                           labels=example[3]))
    return ds
def test_save_load(self):
    """Save a DataSet to disk, load it back and check the round trip.

    The pickle file is removed in a ``finally`` clause so a failing
    assertion (or a failing load) does not leave it behind.
    """
    pkl_path = "./my_ds.pkl"
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    try:
        ds.save(pkl_path)
        self.assertTrue(os.path.exists(pkl_path))
        ds_1 = DataSet.load(pkl_path)
        # The reloaded dataset must hold as many instances as the saved one.
        self.assertEqual(len(ds_1), len(ds))
    finally:
        # Guarded so a failed save does not mask the real error.
        if os.path.exists(pkl_path):
            os.remove(pkl_path)
def test_init_assert(self):
    """Constructing a DataSet from invalid input must raise the expected error."""
    cases = (
        # Columns of different lengths.
        (AssertionError, {"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 100}),
        # A list whose items are plain lists.
        (AssertionError, [[1, 2, 3, 4]] * 10),
        # A float.
        (ValueError, 0.00001),
    )
    for expected_error, bad_input in cases:
        with self.assertRaises(expected_error):
            DataSet(bad_input)
def test_append(self):
    """Appending three identical instances yields three rows in each field."""
    dd = DataSet()
    appended = 0
    while appended < 3:
        dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
        appended += 1

    self.assertEqual(len(dd), 3)
    arrays = dd.field_arrays
    self.assertEqual(arrays["x"].content, [[1, 2, 3, 4]] * 3)
    self.assertEqual(arrays["y"].content, [[5, 6]] * 3)
def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
    """Read a one-token-per-line file into a DataSet of raw sentences.

    Sentences are separated by blank lines. Only the first
    whitespace-separated column of each line is used as the token; when a
    splitter is set, the token is additionally split on it and only the
    first part is kept. The tokens of a sentence are joined with spaces.

    A sentence that is not followed by a blank line (typically the last one
    in the file) is emitted as well instead of being silently dropped.

    :param filepath: path of the file to read (opened as utf-8, like the
        other loaders).
    :param in_word_splitter: separator inside a token; falls back to
        ``self.in_word_splitter`` when ``None``.
    :param cut_long_sent: when true, each sentence is passed through
        ``cut_long_sentence`` and every returned piece becomes an instance.
    :return: a DataSet whose instances carry a ``raw_sentence`` field.
    """
    if in_word_splitter is None:
        in_word_splitter = self.in_word_splitter
    dataset = DataSet()

    def _emit(tokens):
        # Turn the collected tokens into one or more instances.
        sentence = ' '.join(tokens)
        pieces = cut_long_sentence(sentence) if cut_long_sent else [sentence]
        for piece in pieces:
            dataset.append(Instance(raw_sentence=piece))

    with open(filepath, 'r', encoding='utf-8') as f:
        words = []
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: close the current sentence, ignore repeated blanks.
                if words:
                    _emit(words)
                    words = []
                continue
            token = line.split()[0]
            if in_word_splitter is not None:
                token = token.split(in_word_splitter)[0]
            words.append(token)
        # Flush the trailing sentence when the file has no final blank line.
        if words:
            _emit(words)
    return dataset
def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
    """Read a one-sentence-per-line file into a DataSet of raw sentences.

    Two input shapes are supported (tokens separated by tab or space)::

        这是 fastNLP , 一个 非常 good 的 包 .
        也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY

    When a splitter is set the second shape is assumed: every token is split
    on the splitter and only the first part is kept, e.g.
    ``"也/D".split('/')[0]``. Blank lines are skipped.

    :param filepath: path of the file to read (opened as utf-8, like the
        other loaders).
    :param in_word_splitter: separator inside a token; falls back to
        ``self.in_word_splitter`` when ``None``.
    :param cut_long_sent: when true, each sentence is passed through
        ``cut_long_sentence`` and every returned piece becomes an instance.
    :return: a DataSet whose instances carry a ``raw_sentence`` field.
    """
    if in_word_splitter is None:
        in_word_splitter = self.in_word_splitter
    dataset = DataSet()
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # After strip() a whitespace-only line is empty; skip it.
                continue
            if in_word_splitter is not None:
                line = ' '.join(part.split(in_word_splitter)[0]
                                for part in line.split())
            sents = cut_long_sentence(line) if cut_long_sent else [line]
            for sent in sents:
                dataset.append(Instance(raw_sentence=sent))
    return dataset
def convert(self, data):
    """Build a DataSet from (premise, hypothesis, label) triples.

    :param data: an iterable of examples, each of the form::

            [
                [premise_word_1, premise_word_2, ...],
                [hypothesis_word_1, hypothesis_word_2, ...],
                label
            ]

    :return: data_set: a DataSet whose instances hold the fields
        ``premise``, ``hypothesis``, ``premise_len``, ``hypothesis_len``
        and ``truth``. The two ``*_len`` fields are lists of ones with the
        same length as the corresponding word list.
    """
    data_set = DataSet()
    for premise, hypothesis, label in data:
        named_fields = (
            ("premise", TextField(premise, is_target=False)),
            ("hypothesis", TextField(hypothesis, is_target=False)),
            ("premise_len", TextField([1] * len(premise), is_target=False)),
            ("hypothesis_len", TextField([1] * len(hypothesis), is_target=False)),
            ("truth", LabelField(label, is_target=True)),
        )
        instance = Instance()
        for field_name, field in named_fields:
            instance.add_field(field_name, field)
        data_set.append(instance)
    return data_set
def test_drop(self):
    """Dropping instances whose ``y`` has fewer than three items leaves half of them."""
    columns = {
        "x": [[1, 2, 3, 4]] * 40,
        "y": [[5, 6], [7, 8, 9, 0]] * 20,
    }
    ds = DataSet(columns)

    def has_short_y(ins):
        return len(ins["y"]) < 3

    ds.drop(has_short_y)
    self.assertEqual(len(ds), 20)
def predict(self, content):
    """Run the loaded pipeline on one sentence or on a list of sentences.

    :param content: a ``str`` (single input) or a ``list`` of inputs; it is
        stored in the ``words`` field of a fresh DataSet.
    :return answer: for a ``str`` input, the first entry of the
        ``word_pos_output`` field; for a ``list`` input, the whole field
        content; ``None`` for any other input type.
    :raises ValueError: if no pipeline has been loaded yet.
    """
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Normalise the input to a list of sentences.
    is_single = isinstance(content, str)
    is_batch = isinstance(content, list)
    if is_single:
        sentence_list = [content]
    elif is_batch:
        sentence_list = content
    else:
        sentence_list = []

    # 2. Build the dataset.
    dataset = DataSet()
    dataset.add_field('words', sentence_list)

    # 3. Run the pipeline; it writes its result into the dataset.
    self.pipeline(dataset)
    output = dataset['word_pos_output'].content

    if is_single:
        return output[0]
    if is_batch:
        return output
    return None
def _load(self, path: str = None):
    """Read tab-separated entity records into a DataSet.

    Each non-blank line must have three or four tab-separated columns:
    query, entity, entity label and, optionally, a bracketed
    space-separated list of integer targets (e.g. ``[0 1 0 0 1]``). With
    three columns the targets default to five zeros.

    Blank lines are skipped. Lines with any other column count are logged
    as an error and skipped, instead of continuing with stale or undefined
    values.

    :param path: path of the file to read (opened as utf-8).
    :return: a DataSet with ``left_context``, ``right_context``,
        ``raw_entity``, ``raw_entity_label`` and ``target`` fields.
    """
    logging.info(path)
    ds = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Lines read from a file always carry their newline, so
                # test the stripped text to detect blank lines.
                continue
            splits = stripped.split('\t')
            if len(splits) == 4:
                raw_targets = [int(i) for i in
                               splits[3].strip().lstrip('[').rstrip(']').split(' ')]
            elif len(splits) == 3:
                raw_targets = [0, 0, 0, 0, 0]
            else:
                logging.error('data format error')
                continue

            raw_query, raw_entity, raw_entity_label = splits[0], splits[1], splits[2]

            # NOTE(review): if the entity does not occur in the query,
            # find() returns -1 and the slices below do not describe real
            # context. That behaviour is kept as-is; confirm whether such
            # records should be skipped.
            entity_start = raw_query.find(raw_entity)
            left_context = raw_query[0:entity_start]
            right_context = raw_query[entity_start + len(raw_entity):]
            # Empty contexts are replaced by a '-' placeholder.
            if left_context == '':
                left_context = '-'
            if right_context == '':
                right_context = '-'

            # Both contexts are non-empty at this point, so only the entity
            # and its label decide whether the record is kept.
            if raw_entity and raw_entity_label:
                ds.append(
                    Instance(left_context=tokenize(left_context),
                             right_context=tokenize(right_context),
                             raw_entity=tokenize(raw_entity),
                             raw_entity_label=entity_label_tokenize(raw_entity_label),
                             target=raw_targets))
    return ds
def _load(self, path):
    """Read samples via ``_read_conll`` and build a DataSet keyed by ``self.headers``.

    :param path: file path forwarded to ``_read_conll`` together with
        ``self.indexes`` and ``self.dropna``.
    :return: a DataSet with one instance per kept sample; each instance maps
        header ``h`` (position ``i`` in ``self.headers``) to ``data[i]``.
    :raises RuntimeError: if no instance was added.
    """
    ds = DataSet()
    for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
        # if data[0][0] == '#':
        #     data[0] = data[0][1:]
        #     data[1] = data[1][1:]
        # Drop the leading element of a column when it starts with 'NE-' or
        # contains 'TOKEN' (presumably a header row that leaked into the
        # sample -- TODO confirm). The two checks run one after the other,
        # so a column can lose up to two leading elements.
        for i in range(len(self.headers)):
            if data[i][0].startswith('NE-'):
                data[i] = data[i][1:]
            if 'TOKEN' in data[i][0]:
                data[i] = data[i][1:]
        # print(data)
        #data[1] = iob(list(data[1]))
        # Skip the whole sample if any column starts with '-DOCSTART-'.
        doc_start = False
        for i, h in enumerate(self.headers):
            field = data[i]
            # A column whose space-joined text starts with ' #' is not
            # checked for DOCSTART; this only moves on to the next column.
            if str(' '.join(list(field))).startswith(' #'):
                continue
            if str(field[0]).startswith('-DOCSTART-'):
                doc_start = True
                break
        if doc_start:
            continue
        ins = {h: data[i] for i, h in enumerate(self.headers)}
        ds.append(Instance(**ins))
    if len(ds) == 0:
        raise RuntimeError("No data found {}.".format(path))
    return ds
def predict(self, content):
    """Tag every token of the given sentences with the loaded pipeline.

    :param content: list of list of str. Each string is a token (word).
    :return answer: list of list of str, each item being ``"word/tag"``.
        If ``content`` is a ``str`` the first entry of the ``tag`` field is
        returned instead; for any other non-list input the result is
        ``None``.
    :raises ValueError: if no pipeline has been loaded, or if any sentence
        contains a non-string item.
    """
    if not hasattr(self, "pipeline"):
        raise ValueError("You have to load model first.")

    sentence_list = content
    # 1. Validate the input: every item of every sentence must be a string.
    for sentence in sentence_list:
        if not all(isinstance(obj, str) for obj in sentence):
            raise ValueError("Input must be list of list of string.")

    # 2. Build the dataset.
    dataset = DataSet()
    dataset.add_field("words", sentence_list)

    # 3. Run the pipeline; it writes the predicted tags into the dataset.
    self.pipeline(dataset)
    output = dataset.field_arrays["tag"].content

    if isinstance(content, str):
        return output[0]
    if isinstance(content, list):
        # Pair each word with its tag, sentence by sentence.
        return [[w + "/" + t for w, t in zip(words, tags)]
                for words, tags in zip(content, output)]
    return None
def predict(self, content):
    """Word segmentation interface.

    :param content: str or List[str]. For example, for "中文分词很重要!" the
        result is "中文 分词 很 重要 !". For a List[str] such as
        ["中文分词很重要!", ...] the result is ["中文 分词 很 重要 !", ...].
    :return: str or List[str], depending on the type of the input; ``None``
        for any other input type.
    :raises ValueError: if no pipeline has been loaded yet.
    """
    if not hasattr(self, 'pipeline'):
        raise ValueError("You have to load model first.")

    # 1. Normalise the input to a list of sentences.
    is_single = isinstance(content, str)
    is_batch = isinstance(content, list)
    if is_single:
        sentence_list = [content]
    elif is_batch:
        sentence_list = content
    else:
        sentence_list = []

    # 2. Build the dataset.
    dataset = DataSet()
    dataset.add_field('raw_sentence', sentence_list)

    # 3. Run the pipeline; it writes its result into the dataset.
    self.pipeline(dataset)
    output = dataset.get_field('output').content

    if is_single:
        return output[0]
    if is_batch:
        return output
    return None
def test(self):
    """A hand-built DataSet and ``create_dataset_from_lists`` yield the same type."""
    data = DataSet()
    for text in texts:
        data.append(Instance(text=TextField(text, is_target=False)))

    data_set = create_dataset_from_lists(texts, vocab, has_target=False)
    same_type = type(data) == type(data_set)
    self.assertTrue(same_type)
def test_get_item_error(self):
    """An out-of-range slice raises RuntimeError; an unknown field name raises KeyError."""

    def build():
        # A fresh ten-row dataset for each check.
        return DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})

    with self.assertRaises(RuntimeError):
        build()[40:]

    with self.assertRaises(KeyError):
        build()["kom"]
def test_get_field(self):
    """``get_field`` returns a FieldArray holding the column's content."""
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    expectations = (
        ("x", [[1, 2, 3, 4]] * 10),
        ("y", [[5, 6]] * 10),
    )
    for field_name, expected in expectations:
        ans = ds.get_field(field_name)
        self.assertTrue(isinstance(ans, FieldArray))
        self.assertEqual(ans.content, expected)
def load(self, path):
    """Read a conll-style file into a character-level DataSet.

    The returned DataSet contains the following fields:

        words: list of str, the characters of the sentence.
        tag: list of str, the per-word tags expanded with BMES prefixes;
            e.g. an original sequence ['VP', 'NN', 'NN', ..] may become
            ["S-VP", "B-NN", "M-NN", ..].

    The input is assumed to be in conll format: sentences are separated by
    a blank line and every line has 7 columns, i.e. ::

        1       编者按     编者按     NN      O       11      nmod:topic
        2       :       :       PU      O       11      punct
        3       7月      7月      NT      DATE    4       compound:nn
        4       12日     12日     NT      DATE    11      nmod:tmod
        5       ,       ,       PU      O       11      punct

        1       这       这       DT      O       3       det
        2       款       款       M       O       1       mark:clf
        3       飞行      飞行      NN      O       8       nsubj
        4       从       从       P       O       5       case
        5       外型      外型      NN      O       8       nmod:prep

    :param path: path of the file to read (opened as utf-8).
    :return: a DataSet with ``words`` and ``tag`` fields.
    :raises ValueError: if a zero-length word is encountered.
    """
    samples = []
    with open(path, 'r', encoding='utf-8') as fin:
        current = []
        for raw in fin:
            if raw.startswith('#'):
                continue
            if raw.startswith('\n'):
                samples.append(current)
                current = []
            else:
                current.append(raw.split('\t'))
        if current:
            samples.append(current)

    ds = DataSet()
    for sample in samples:
        res = self.get_one(sample)
        if res is None:
            continue
        char_seq = []
        pos_seq = []
        for word, tag in zip(res[0], res[1]):
            char_seq.extend(word)
            n_chars = len(word)
            if n_chars == 0:
                raise ValueError("Zero length of word detected.")
            if n_chars == 1:
                # Single-character word.
                pos_seq.append('S-{}'.format(tag))
            else:
                # Begin, zero or more Middle, then End.
                pos_seq.append('B-{}'.format(tag))
                pos_seq.extend('M-{}'.format(tag) for _ in range(n_chars - 2))
                pos_seq.append('E-{}'.format(tag))
        ds.append(Instance(words=char_seq, tag=pos_seq))
    return ds
def test_case_TokenizeDatasetLoader(self):
    """Both the loader and ``DataSet.read_tokenize`` produce non-empty data."""
    filepath = "./test/data_for_tests/cws_pku_utf_8"
    max_len = 32

    data = TokenizeDataSetLoader().load(filepath, max_seq_len=max_len)
    assert len(data) > 0

    data1 = DataSet()
    data1.read_tokenize(filepath, max_seq_len=max_len)
    assert len(data1) > 0

    print("pass TokenizeDataSetLoader test!")
def formatRowString(self, msg):
    """Replace ``self._dataset`` with a one-instance DataSet built from ``msg``.

    The stripped message is split into characters, stored under both
    ``chars`` and ``raw_chars``. When ``self._addTarget2Vocab`` is truthy
    the keys of ``self._target_vocab`` are added as the ``target`` field.

    :param msg: the raw input string.
    """
    chars = list(msg.strip())
    self._dataset = DataSet()

    fields = {'chars': chars, 'raw_chars': chars}
    if self._addTarget2Vocab:
        fields['target'] = list(dict(self._target_vocab))
    self._dataset.append(Instance(**fields))
def test_input_target(self):
    """``set_input``/``set_target`` flag existing fields and reject unknown names."""
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    ds.set_input("x")
    ds.set_target("y")

    self.assertTrue(ds.field_arrays["x"].is_input)
    self.assertTrue(ds.field_arrays["y"].is_target)

    for missing_name in ("xxx", "yyy"):
        with self.assertRaises(KeyError):
            ds.set_input(missing_name)
def test_add_append(self):
    """Fields added one by one keep their content; a wrong-length field is rejected."""
    dd = DataSet()
    dd.add_field("x", [[1, 2, 3]] * 10)
    dd.add_field("y", [[1, 2, 3, 4]] * 10)
    dd.add_field("z", [[5, 6]] * 10)

    self.assertEqual(len(dd), 10)
    expectations = (
        ("x", [[1, 2, 3]] * 10),
        ("y", [[1, 2, 3, 4]] * 10),
        ("z", [[5, 6]] * 10),
    )
    for field_name, expected in expectations:
        self.assertEqual(dd.field_arrays[field_name].content, expected)

    # 40 rows do not fit a dataset that already has 10.
    with self.assertRaises(RuntimeError):
        dd.add_field("??", [[1, 2]] * 40)
def convert(self, parsed_data):
    """Build a DataSet with one token list and three parallel label lists per sample.

    :param parsed_data: an iterable of samples; ``sample[0]`` is stored as
        ``token_list`` and ``sample[1]`` is a sequence of label groups from
        which items 0, 1 and 2 are collected into ``label0_list``,
        ``label1_list`` and ``label2_list``.
    :return: the populated DataSet.
    """
    dataset = DataSet()
    for sample in parsed_data:
        label0_list = [labels[0] for labels in sample[1]]
        label1_list = [labels[1] for labels in sample[1]]
        label2_list = [labels[2] for labels in sample[1]]
        instance = Instance(token_list=sample[0],
                            label0_list=label0_list,
                            label1_list=label1_list,
                            label2_list=label2_list)
        dataset.append(instance)
    return dataset
def test_delete_field(self):
    """Deleting one field removes it and leaves the other in place."""
    dd = DataSet()
    for field_name, values in (("x", [[1, 2, 3]] * 10), ("y", [[1, 2, 3, 4]] * 10)):
        dd.add_field(field_name, values)

    dd.delete_field("x")

    self.assertFalse("x" in dd.field_arrays)
    self.assertTrue("y" in dd.field_arrays)
def test_case_1(self):
    """Smoke test: train a sequence-labeling model on a tiny in-memory dataset."""
    args = {
        "epochs": 3,
        "batch_size": 2,
        "validate": False,
        "use_cuda": False,
        "pickle_path": "./save/",
        "save_best_dev": True,
        "model_name": "default_model_name.pkl",
        "loss": Loss("cross_entropy"),
        "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
        "vocab_size": 10,
        "word_emb_dim": 100,
        "rnn_hidden_units": 100,
        "num_classes": 5,
        "evaluator": SeqLabelEvaluator()
    }
    trainer = SeqLabelTrainer(**args)

    # Six five-character inputs, all labelled with the same target sequence.
    inputs = ('abcde', 'a@cde', 'ab#de', 'abc?e', 'abcd$', '!bcde')
    train_data = [[list(chars), list('a@cde')] for chars in inputs]

    vocab = {ch: idx for idx, ch in enumerate('abcde!@#$?')}
    label_vocab = {ch: idx for idx, ch in enumerate('a@cde')}

    data_set = DataSet()
    for text, label in train_data:
        x = TextField(text, False)
        x_len = LabelField(len(text), is_target=False)
        y = TextField(label, is_target=False)
        data_set.append(Instance(word_seq=x, truth=y, word_seq_origin_len=x_len))

    data_set.index_field("word_seq", vocab)
    data_set.index_field("truth", label_vocab)

    model = SeqLabeling(args)

    trainer.train(network=model, train_data=data_set, dev_data=data_set)
    # If this can run, everything is OK.

    os.system("rm -rf save")
    print("pickle path deleted")
def test_reader(self):
    """Smoke test: each reader returns a non-empty DataSet."""

    def check(ds):
        self.assertTrue(isinstance(ds, DataSet))
        self.assertTrue(len(ds) > 0)

    check(DataSet().read_naive("test/data_for_tests/tutorial_sample_dataset.csv"))
    check(DataSet().read_rawdata("test/data_for_tests/people_daily_raw.txt"))
    check(DataSet().read_pos("test/data_for_tests/people.txt"))
def convert(data):
    """Build a DataSet from parsed samples, prefixing every sequence with a root entry.

    :param data: an iterable of samples; items 0-3 of each sample are the
        words, the POS tags, the head indices (converted with ``int``) and
        the head tags.
    :return: a DataSet with ``words``, ``pos``, ``gold_heads``, ``arc_true``
        and ``tags`` fields. ``BOS`` is prepended to the three text
        sequences and ``0`` to the heads; ``gold_heads`` and ``arc_true``
        receive the same list.
    """
    dataset = DataSet()
    for sample in data:
        words = [BOS] + sample[0]
        pos_tags = [BOS] + sample[1]
        head_ids = [0] + [int(h) for h in sample[2]]
        relations = [BOS] + sample[3]
        instance = Instance(words=words,
                            pos=pos_tags,
                            gold_heads=head_ids,
                            arc_true=head_ids,
                            tags=relations)
        dataset.append(instance)
    return dataset
def convert(self, data):
    """Build a DataSet of field objects from parsed samples, padded with BOS/EOS.

    :param data: an iterable of samples; items 0-3 of each sample are the
        words, the POS tags, the head indices (converted with ``int``) and
        the head tags.
    :return: a DataSet with ``word_seq``, ``pos_seq``, ``gold_heads``,
        ``head_indices`` and ``head_labels`` fields. Text sequences are
        wrapped in ``BOS``/``EOS`` and the heads in ``0``/``0``;
        ``head_indices`` and ``head_labels`` are marked as targets.
    """
    dataset = DataSet()
    for sample in data:
        words = [BOS] + sample[0] + [EOS]
        pos_tags = [BOS] + sample[1] + [EOS]
        head_ids = [0] + [int(h) for h in sample[2]] + [0]
        relations = [BOS] + sample[3] + [EOS]
        instance = Instance(
            word_seq=TextField(words, is_target=False),
            pos_seq=TextField(pos_tags, is_target=False),
            gold_heads=SeqLabelField(head_ids, is_target=False),
            head_indices=SeqLabelField(head_ids, is_target=True),
            head_labels=TextField(relations, is_target=True),
        )
        dataset.append(instance)
    return dataset