def test_init(self):
    fields = {"x": [1, 2, 3], "y": [4, 5, 6]}
    ins = Instance(x=[1, 2, 3], y=[4, 5, 6])
    self.assertTrue(isinstance(ins.fields, dict))
    self.assertEqual(ins.fields, fields)

    ins = Instance(**fields)
    self.assertEqual(ins.fields, fields)
def convert_for_infer(self, data, vocabs):
    for word_seq in data:  # list of tokens
        x = TextField(word_seq, is_target=False)
        instance = Instance()
        instance.add_field("word_seq", x)
        self.append(instance)
    self.index_field("word_seq", vocabs["word_vocab"])
def test_list_of_numpy_to_tensor(self):
    ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                 [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    data_iterator = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)  # avoid shadowing built-in `iter`
    for x, y in data_iterator:
        print(x, y)
def formatRowString(self, msg):
    msg = msg.strip()
    tokenized_char = list(msg)
    self._dataset = DataSet()
    if self._addTarget2Vocab:
        ins = Instance(chars=tokenized_char,
                       raw_chars=tokenized_char,
                       target=list(dict(self._target_vocab).keys()))
    else:
        ins = Instance(chars=tokenized_char, raw_chars=tokenized_char)
    self._dataset.append(ins)
def test_list_of_list_to_tensor(self):
    ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                 [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    data_iterator = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in data_iterator:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def prepare_fake_dataset():
    # class A: 1000 points sampled around (-3, -3); class B: 1000 points around (3, 3)
    mean = np.array([-3, -3])
    cov = np.array([[1, 0], [0, 1]])
    class_A = np.random.multivariate_normal(mean, cov, size=(1000,))

    mean = np.array([3, 3])
    cov = np.array([[1, 0], [0, 1]])
    class_B = np.random.multivariate_normal(mean, cov, size=(1000,))

    data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
                       [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
    return data_set
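# A minimal usage sketch, mirroring the tests elsewhere in this file: mark which
# fields are model inputs and which are targets before batching.
fake_ds = prepare_fake_dataset()
fake_ds.set_input("x")
fake_ds.set_target("y")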
def convert(self, data): """Convert a 3D list to a DataSet object. :param data: A 3D tensor. Example:: [ [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ], [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ], ... ] :return: A DataSet object. """ data_set = DataSet() for example in data: p, h, l = example # list, list, str instance = Instance() instance.add_field("premise", p) instance.add_field("hypothesis", h) instance.add_field("truth", l) data_set.append(instance) data_set.apply(lambda ins: len(ins["premise"]), new_field_name="premise_len") data_set.apply(lambda ins: len(ins["hypothesis"]), new_field_name="hypothesis_len") data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len") data_set.set_target("truth") return data_set
def test(self):
    data = DataSet()
    for text, label in zip(texts, labels):
        x = TextField(text, is_target=False)
        y = LabelField(label, is_target=True)
        ins = Instance(text=x, label=y)
        data.append(ins)

    # use vocabulary to index data
    data.index_field("text", vocab)

    # define a naive sampler for the Batch class
    class SeqSampler:
        def __call__(self, dataset):
            return list(range(len(dataset)))

    # use Batch to iterate over the dataset
    data_iterator = Batch(data, 2, SeqSampler(), False)
    total_data = 0
    for batch_x, batch_y in data_iterator:
        total_data += batch_x["text"].size(0)
        self.assertTrue(batch_x["text"].size(0) == 2 or total_data == len(raw_texts))
        self.assertTrue(isinstance(batch_x, dict))
        self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
        self.assertTrue(isinstance(batch_y, dict))
        self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
def test_case_1(self):
    args = {
        "epochs": 3,
        "batch_size": 2,
        "validate": False,
        "use_cuda": False,
        "pickle_path": "./save/",
        "save_best_dev": True,
        "model_name": "default_model_name.pkl",
        "loss": Loss("cross_entropy"),
        "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
        "vocab_size": 10,
        "word_emb_dim": 100,
        "rnn_hidden_units": 100,
        "num_classes": 5,
        "evaluator": SeqLabelEvaluator()
    }
    trainer = SeqLabelTrainer(**args)

    train_data = [
        [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
        [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
    ]
    vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
    label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

    data_set = DataSet()
    for example in train_data:
        text, label = example[0], example[1]
        x = TextField(text, False)
        x_len = LabelField(len(text), is_target=False)
        y = TextField(label, is_target=False)
        ins = Instance(word_seq=x, truth=y, word_seq_origin_len=x_len)
        data_set.append(ins)

    data_set.index_field("word_seq", vocab)
    data_set.index_field("truth", label_vocab)

    model = SeqLabeling(args)

    trainer.train(network=model, train_data=data_set, dev_data=data_set)
    # If this can run, everything is OK.

    os.system("rm -rf save")
    print("pickle path deleted")
def __getitem__(self, idx):
    """Fetch Instance(s) at the `idx` position(s) in the dataset.

    Notice: This method returns a copy of the actual instance(s). Any change
    to the returned value would not modify the original instance(s) of the
    DataSet. If you want to make in-place changes to all Instances, use the
    `apply` method.

    :param idx: can be int or slice.
    :return: If `idx` is int, return an Instance object.
             If `idx` is slice, return a DataSet object.
    """
    if isinstance(idx, int):
        return Instance(**{name: self.field_arrays[name][idx] for name in self.field_arrays})
    elif isinstance(idx, slice):
        if idx.start is not None and (idx.start >= len(self) or idx.start <= -len(self)):
            raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self) - 1}")
        data_set = DataSet()
        for field in self.field_arrays.values():
            data_set.add_field(name=field.name,
                               fields=field.content[idx],
                               padding_val=field.padding_val,
                               is_input=field.is_input,
                               is_target=field.is_target)
        return data_set
    else:
        raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx)))
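# A minimal sketch of the indexing semantics above, using the DataSet/Instance
# constructors that appear in the tests elsewhere in this file:
demo_ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(4)])
single = demo_ds[0]     # int index -> a copied Instance
subset = demo_ds[1:3]   # slice -> a new DataSet holding instances 1 and 2
assert isinstance(subset, DataSet) and len(subset) == 2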
def load(self, path):
    datalist = []
    with open(path, 'r', encoding='utf-8') as f:
        sample = []
        for line in f:
            if line.startswith('\n'):   # blank line ends the current sample
                datalist.append(sample)
                sample = []
            elif line.startswith('#'):  # skip comment lines
                continue
            else:
                sample.append(line.split('\t'))
        if len(sample) > 0:
            datalist.append(sample)

    data = [self.get_one(sample) for sample in datalist]
    data_list = list(filter(lambda x: x is not None, data))

    ds = DataSet()
    for example in data_list:
        ds.append(Instance(words=example[0],
                           pos_tags=example[1],
                           heads=example[2],
                           labels=example[3]))
    return ds
def _load(self, path):
    ds = DataSet()
    for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna):
        # strip leading header tokens such as 'NE-...' or 'TOKEN' from each column
        for i in range(len(self.headers)):
            if data[i][0].startswith('NE-'):
                data[i] = data[i][1:]
            if 'TOKEN' in data[i][0]:
                data[i] = data[i][1:]
        # skip comment rows and -DOCSTART- document markers
        doc_start = False
        for i, h in enumerate(self.headers):
            field = data[i]
            if str(' '.join(list(field))).startswith(' #'):
                continue
            if str(field[0]).startswith('-DOCSTART-'):
                doc_start = True
                break
        if doc_start:
            continue
        ins = {h: data[i] for i, h in enumerate(self.headers)}
        ds.append(Instance(**ins))
    if len(ds) == 0:
        raise RuntimeError("No data found {}.".format(path))
    return ds
def _load(self, path: str = None):
    logging.info(path)
    ds = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            splits = line.strip().split('\t')
            if len(splits) == 4:
                raw_targets = [int(i) for i in splits[3].strip().lstrip('[').rstrip(']').split(' ')]
            elif len(splits) == 3:
                raw_targets = [0, 0, 0, 0, 0]
            else:
                logging.error('data format error')
                continue  # skip malformed lines instead of using undefined targets
            raw_query = splits[0]
            raw_entity = splits[1]
            left_context = raw_query[0:raw_query.find(raw_entity)]
            right_context = raw_query[raw_query.find(raw_entity) + len(raw_entity):]
            if left_context == '':
                left_context = '-'
            if right_context == '':
                right_context = '-'
            raw_entity_label = splits[2]
            if left_context and right_context and raw_entity and raw_entity_label:
                ds.append(Instance(left_context=tokenize(left_context),
                                   right_context=tokenize(right_context),
                                   raw_entity=tokenize(raw_entity),
                                   raw_entity_label=entity_label_tokenize(raw_entity_label),
                                   target=raw_targets))
    return ds
def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
    if in_word_splitter is None:
        in_word_splitter = self.in_word_splitter
    dataset = DataSet()
    with open(filepath, 'r') as f:
        words = []
        for line in f:
            line = line.strip()
            if len(line) == 0:  # blank line marks a sentence boundary
                if len(words) == 0:  # empty sentences are not accepted
                    continue
                line = ' '.join(words)
                if cut_long_sent:
                    sents = cut_long_sentence(line)
                else:
                    sents = [line]
                for sent in sents:
                    instance = Instance(raw_sentence=sent)
                    dataset.append(instance)
                words = []
            else:
                line = line.split()[0]
                if in_word_splitter is None:
                    words.append(line)
                else:
                    words.append(line.split(in_word_splitter)[0])
    return dataset
def test_append(self):
    dd = DataSet()
    for _ in range(3):
        dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
    self.assertEqual(len(dd), 3)
    self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
    self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
def load(self, filepath, in_word_splitter=None, cut_long_sent=False): """ 允许使用的情况有(默认以\t或空格作为seg) 这是 fastNLP , 一个 非常 good 的 包 . 和 也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY 如果splitter不为None则认为是第二种情况, 且我们会按splitter分割"也/D", 然后取第一部分. 例如"也/D".split('/')[0] :param filepath: :param in_word_splitter: :return: """ if in_word_splitter == None: in_word_splitter = self.in_word_splitter dataset = DataSet() with open(filepath, 'r') as f: for line in f: line = line.strip() if len(line.replace(' ', '')) == 0: # 不能接受空行 continue if not in_word_splitter is None: words = [] for part in line.split(): word = part.split(in_word_splitter)[0] words.append(word) line = ' '.join(words) if cut_long_sent: sents = cut_long_sentence(line) else: sents = [line] for sent in sents: instance = Instance(raw_sentence=sent) dataset.append(instance) return dataset
def load(self, path, cut_long_sent=False):
    datalist = []
    with open(path, 'r', encoding='utf-8') as f:
        sample = []
        for line in f:
            if line.startswith('\n'):
                datalist.append(sample)
                sample = []
            elif line.startswith('#'):
                continue
            else:
                sample.append(line.split('\t'))
        if len(sample) > 0:
            datalist.append(sample)

    ds = DataSet()
    for sample in datalist:
        res = self.get_one(sample)
        if res is None:
            continue
        line = ' '.join(res)
        if cut_long_sent:
            sents = cut_long_sentence(line)
        else:
            sents = [line]
        for raw_sentence in sents:
            ds.append(Instance(raw_sentence=raw_sentence))
    return ds
def load(self, path):
    datalist = []
    with open(path, 'r', encoding='utf-8') as f:
        sample = []
        for line in f:
            if line.startswith('\n'):
                datalist.append(sample)
                sample = []
            elif line.startswith('#'):
                continue
            else:
                sample.append(line.split('\t'))
        if len(sample) > 0:
            datalist.append(sample)

    ds = DataSet(name='conll')
    for sample in datalist:
        res = self.get_one(sample)
        ds.append(Instance(word_seq=TextField(res[0], is_target=False),
                           pos_seq=TextField(res[1], is_target=False),
                           head_indices=SeqLabelField(res[2], is_target=True),
                           head_labels=TextField(res[3], is_target=True)))
    return ds
def convert(self, data):
    data_set = DataSet()
    for item in data:
        sent_words = item[0]
        if self.pos is True and self.ner is True:
            instance = Instance(words=sent_words, pos_tags=item[1], ner=item[2])
        elif self.pos is True:
            instance = Instance(words=sent_words, pos_tags=item[1])
        elif self.ner is True:
            instance = Instance(words=sent_words, ner=item[1])
        else:
            instance = Instance(words=sent_words)
        data_set.append(instance)
    data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
    return data_set
def test(self):
    data = DataSet()
    for text in texts:
        x = TextField(text, is_target=False)
        ins = Instance(text=x)
        data.append(ins)
    data_set = create_dataset_from_lists(texts, vocab, has_target=False)
    self.assertTrue(type(data) == type(data_set))
def load(self, path): """ 返回的DataSet, 包含以下的field words:list of str, tag: list of str, 被加入了BMES tag, 比如原来的序列为['VP', 'NN', 'NN', ..],会被认为是["S-VP", "B-NN", "M-NN",..] 假定了输入为conll的格式,以空行隔开两个句子,每行共7列,即 :: 1 编者按 编者按 NN O 11 nmod:topic 2 : : PU O 11 punct 3 7月 7月 NT DATE 4 compound:nn 4 12日 12日 NT DATE 11 nmod:tmod 5 , , PU O 11 punct 1 这 这 DT O 3 det 2 款 款 M O 1 mark:clf 3 飞行 飞行 NN O 8 nsubj 4 从 从 P O 5 case 5 外型 外型 NN O 8 nmod:prep """ datalist = [] with open(path, 'r', encoding='utf-8') as f: sample = [] for line in f: if line.startswith('\n'): datalist.append(sample) sample = [] elif line.startswith('#'): continue else: sample.append(line.split('\t')) if len(sample) > 0: datalist.append(sample) ds = DataSet() for sample in datalist: # print(sample) res = self.get_one(sample) if res is None: continue char_seq = [] pos_seq = [] for word, tag in zip(res[0], res[1]): char_seq.extend(list(word)) if len(word) == 1: pos_seq.append('S-{}'.format(tag)) elif len(word) > 1: pos_seq.append('B-{}'.format(tag)) for _ in range(len(word) - 2): pos_seq.append('M-{}'.format(tag)) pos_seq.append('E-{}'.format(tag)) else: raise ValueError("Zero length of word detected.") ds.append(Instance(words=char_seq, tag=pos_seq)) return ds
def test_init_v1(self):
    ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
    self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
    self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4]] * 40)
    self.assertEqual(ds.field_arrays["y"].content, [[5, 6]] * 40)
def convert(self, data):
    data_set = DataSet()
    for item in data:
        sent_words, sent_pos_tag = item[0], item[1]
        data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
    data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
    data_set.set_target("tags")
    data_set.set_input("words")  # the field is named "words", not "sent_words"
    data_set.set_input("seq_len")
    return data_set
def convert_seq2seq_dataset(data):
    """Convert a list of data into a DataSet.

    :param data: list of list of strings, [num_examples, *]. ::

        [
            [ [word_11, word_12, ...], [label_11, label_12, ...] ],
            [ [word_21, word_22, ...], [label_21, label_22, ...] ],
            ...
        ]

    :return: a DataSet.
    """
    dataset = DataSet()
    for sample in data:
        word_seq, label_seq = sample[0], sample[1]
        ins = Instance()
        ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
            .add_field("label_seq", TextField(label_seq, is_target=True))
        dataset.append(ins)
    return dataset
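# Hedged usage sketch for convert_seq2seq_dataset(); the words and tag values
# here are illustrative only.
pairs = [
    [["I", "like", "apples"], ["O", "O", "B-FOOD"]],
]
seq2seq_ds = convert_seq2seq_dataset(pairs)
# Each instance carries "word_seq" (input) and "label_seq" (target) TextFields.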
def construct_dataset(sentences):
    """Construct a data set from a list of sentences.

    :param sentences: list of list of str
    :return dataset: a DataSet object
    """
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset
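# Usage sketch for construct_dataset():
sentences = [["this", "is", "fastNLP"], ["a", "second", "sentence"]]
raw_ds = construct_dataset(sentences)
assert len(raw_ds) == 2  # one Instance per sentence, stored under "raw_sentence"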
def convert_with_vocabs(self, data, vocabs):
    for example in data:
        word_seq, label_seq = example[0], example[1]  # list, list
        x = TextField(word_seq, is_target=False)
        x_len = LabelField(len(word_seq), is_target=False)
        y = TextField(label_seq, is_target=False)
        instance = Instance()
        instance.add_field("word_seq", x)
        instance.add_field("truth", y)
        instance.add_field("word_seq_origin_len", x_len)
        self.append(instance)
    self.index_field("word_seq", vocabs["word_vocab"])
    self.index_field("truth", vocabs["label_vocab"])
def convert(self, parsed_data):
    dataset = DataSet()
    for sample in parsed_data:
        label0_list = list(map(lambda labels: labels[0], sample[1]))
        label1_list = list(map(lambda labels: labels[1], sample[1]))
        label2_list = list(map(lambda labels: labels[2], sample[1]))
        dataset.append(Instance(token_list=sample[0],
                                label0_list=label0_list,
                                label1_list=label1_list,
                                label2_list=label2_list))
    return dataset
def convert(data):
    dataset = DataSet()
    for sample in data:
        # prepend BOS so token positions are 1-based; head index 0 denotes the root
        word_seq = [BOS] + sample['words']
        pos_seq = [BOS] + sample['pos_tags']
        heads = [0] + sample['heads']
        head_tags = [BOS] + sample['labels']
        dataset.append(Instance(raw_words=word_seq,
                                pos=pos_seq,
                                gold_heads=heads,
                                arc_true=heads,
                                tags=head_tags))
    return dataset
def convert(data):
    dataset = DataSet()
    for sample in data:
        word_seq = [BOS] + sample[0]
        pos_seq = [BOS] + sample[1]
        heads = [0] + list(map(int, sample[2]))
        head_tags = [BOS] + sample[3]
        dataset.append(Instance(words=word_seq,
                                pos=pos_seq,
                                gold_heads=heads,
                                arc_true=heads,
                                tags=head_tags))
    return dataset
def convert(self, data):
    dataset = DataSet()
    for sample in data:
        word_seq = [BOS] + sample[0] + [EOS]
        pos_seq = [BOS] + sample[1] + [EOS]
        heads = [0] + list(map(int, sample[2])) + [0]
        head_tags = [BOS] + sample[3] + [EOS]
        dataset.append(Instance(word_seq=TextField(word_seq, is_target=False),
                                pos_seq=TextField(pos_seq, is_target=False),
                                gold_heads=SeqLabelField(heads, is_target=False),
                                head_indices=SeqLabelField(heads, is_target=True),
                                head_labels=TextField(head_tags, is_target=True)))
    return dataset
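# Design note on the three convert() variants above: each prepends BOS (the last
# also appends EOS) so that token positions become 1-based and head index 0 can
# point at the artificial root used in dependency parsing. "gold_heads" and
# "head_indices" hold the same values; only the latter is marked as a target.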