def test_case_1(self):
    """Smoke test: train ``SeqLabeling`` end-to-end on a tiny in-memory dataset.

    Builds six 5-token examples, indexes them against hand-made vocabularies,
    runs ``SeqLabelTrainer.train`` with the same data as train and dev set,
    then removes the pickle directory the trainer created.
    """
    import shutil

    args = {
        "epochs": 3,
        "batch_size": 2,
        "validate": False,
        "use_cuda": False,
        "pickle_path": "./save/",
        "save_best_dev": True,
        "model_name": "default_model_name.pkl",
        "loss": Loss("cross_entropy"),
        "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
        "vocab_size": 10,
        "word_emb_dim": 100,
        "rnn_hidden_units": 100,
        "num_classes": 5,
        "evaluator": SeqLabelEvaluator()
    }
    trainer = SeqLabelTrainer(**args)

    train_data = [
        [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
        [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
    ]
    vocab = {
        'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4,
        '!': 5, '@': 6, '#': 7, '$': 8, '?': 9
    }
    label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

    data_set = DataSet()
    for example in train_data:
        text, label = example[0], example[1]
        # Explicit keyword instead of a bare positional boolean.
        x = TextField(text, is_target=False)
        x_len = LabelField(len(text), is_target=False)
        y = TextField(label, is_target=False)
        ins = Instance(word_seq=x, truth=y, word_seq_origin_len=x_len)
        data_set.append(ins)
    data_set.index_field("word_seq", vocab)
    data_set.index_field("truth", label_vocab)

    model = SeqLabeling(args)

    trainer.train(network=model, train_data=data_set, dev_data=data_set)
    # If this can run, everything is OK.

    # Portable cleanup: shutil.rmtree instead of shelling out to `rm -rf`
    # (os.system with a shell string is non-portable and unsafe).
    shutil.rmtree("save", ignore_errors=True)
    print("pickle path deleted")
def load(self, path):
    """Read a CoNLL-style file and return a ``DataSet`` of parsed samples.

    Samples are separated by blank lines; lines starting with ``#`` are
    comments; every other line is split on tabs. Each collected sample is
    decoded by ``self.get_one`` into (words, pos tags, heads, head labels).
    """
    samples = []
    current = []
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            if raw.startswith('\n'):
                # A blank line terminates the current sample
                # (an empty sample is flushed too, matching the file format).
                samples.append(current)
                current = []
            elif raw.startswith('#'):
                # Comment line — skip.
                continue
            else:
                current.append(raw.split('\t'))
    # Flush a trailing sample that was not followed by a blank line.
    if len(current) > 0:
        samples.append(current)

    dataset = DataSet(name='conll')
    for sample in samples:
        fields = self.get_one(sample)
        dataset.append(
            Instance(word_seq=TextField(fields[0], is_target=False),
                     pos_seq=TextField(fields[1], is_target=False),
                     head_indices=SeqLabelField(fields[2], is_target=True),
                     head_labels=TextField(fields[3], is_target=True)))
    return dataset
def convert(self, data):
    """Wrap each parsed sample with BOS/EOS sentinels and build a ``DataSet``.

    Heads are converted to ints and padded with 0 at both ends; the same
    padded head list feeds both ``gold_heads`` and ``head_indices``.
    """
    ds = DataSet()
    for sample in data:
        padded_words = [BOS] + sample[0] + [EOS]
        padded_tags = [BOS] + sample[1] + [EOS]
        padded_heads = [0] + [int(h) for h in sample[2]] + [0]
        padded_labels = [BOS] + sample[3] + [EOS]
        ds.append(
            Instance(word_seq=TextField(padded_words, is_target=False),
                     pos_seq=TextField(padded_tags, is_target=False),
                     gold_heads=SeqLabelField(padded_heads, is_target=False),
                     head_indices=SeqLabelField(padded_heads, is_target=True),
                     head_labels=TextField(padded_labels, is_target=True)))
    return ds
def convert_with_vocabs(self, data, vocabs):
    """Populate this dataset from (word list, label list) pairs.

    Adds ``word_seq``, ``truth`` and ``word_seq_origin_len`` fields per
    example, then indexes words and labels with the supplied vocabularies.
    """
    for example in data:
        tokens, tags = example[0], example[1]  # list, list
        ins = Instance()
        ins.add_field("word_seq", TextField(tokens, is_target=False))
        ins.add_field("truth", TextField(tags, is_target=False))
        ins.add_field("word_seq_origin_len",
                      LabelField(len(tokens), is_target=False))
        self.append(ins)
    self.index_field("word_seq", vocabs["word_vocab"])
    self.index_field("truth", vocabs["label_vocab"])
def test(self):
    """Iterate a small indexed ``DataSet`` via ``Batch`` and check the tensors."""
    dataset = DataSet()
    for text, label in zip(texts, labels):
        dataset.append(Instance(text=TextField(text, is_target=False),
                                label=LabelField(label, is_target=True)))

    # Map tokens to indices before batching.
    dataset.index_field("text", vocab)

    # Naive sampler: yields example indices in order.
    class SeqSampler:
        def __call__(self, ds):
            return list(range(len(ds)))

    # Iterate the dataset in batches of 2.
    iterator = Batch(dataset, 2, SeqSampler(), False)
    seen = 0
    for batch_x, batch_y in iterator:
        seen += batch_x["text"].size(0)
        # Every batch is full-sized except possibly the last.
        # NOTE(review): compares against module-level `raw_texts` while the
        # data was built from `texts` — presumably the same length; confirm.
        self.assertTrue(batch_x["text"].size(0) == 2 or seen == len(raw_texts))
        self.assertTrue(isinstance(batch_x, dict))
        self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
        self.assertTrue(isinstance(batch_y, dict))
        self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
def convert_to_dataset(self, data, vocab, label_vocab):
    """Build a ``DataSet`` from (words, label) examples and index its fields.

    :param data: list of examples; each example is ``(words, label)`` where
        ``words`` is a list of tokens and ``label`` is either a list
        (sequence labeling) or a single string (classification).
    :param vocab: dict mapping token (str) to index (int).
    :param label_vocab: dict mapping label (str) to index (int).
    :return: a DataSet object.
    :raises NotImplementedError: if ``words`` is not a list, or ``label``
        is neither a list nor a string.
    """
    has_word_seq = False
    has_label_seq = False
    has_label_str = False

    # Fill a DataSet with one Instance per example.
    dataset = DataSet()
    for example in data:
        tokens, label = example[0], example[1]
        ins = Instance()

        if isinstance(tokens, list):
            ins.add_field("word_seq", TextField(tokens, is_target=False))
            has_word_seq = True
        else:
            raise NotImplementedError("words is a {}".format(type(tokens)))

        if isinstance(label, list):
            ins.add_field("label_seq", TextField(label, is_target=True))
            has_label_seq = True
        elif isinstance(label, str):
            ins.add_field("label", LabelField(label, is_target=True))
            has_label_str = True
        else:
            raise NotImplementedError("label is a {}".format(type(label)))

        dataset.append(ins)

    # Convert strings to indices for whichever fields were actually used.
    if has_word_seq:
        dataset.index_field("word_seq", vocab)
    if has_label_seq:
        dataset.index_field("label_seq", label_vocab)
    if has_label_str:
        dataset.index_field("label", label_vocab)

    return dataset
def test(self):
    """``create_dataset_from_lists`` should yield the same type as a hand-built DataSet."""
    manual = DataSet()
    for text in texts:
        manual.append(Instance(text=TextField(text, is_target=False)))
    built = create_dataset_from_lists(texts, vocab, has_target=False)
    self.assertTrue(type(manual) == type(built))
def convert_for_infer(self, data, vocabs):
    """Fill this dataset with unlabeled word sequences for inference."""
    for tokens in data:  # each entry is a list of tokens
        ins = Instance()
        ins.add_field("word_seq", TextField(tokens, is_target=False))
        self.append(ins)
    self.index_field("word_seq", vocabs["word_vocab"])
def test_case_1(self):
    """Smoke test: run ``SeqLabelTester`` on a tiny in-memory dataset.

    Builds six 5-token examples, indexes them against hand-made
    vocabularies, evaluates a fresh ``SeqLabeling`` model, then removes
    the pickle directory the tester created.
    """
    import shutil

    model_args = {
        "vocab_size": 10,
        "word_emb_dim": 100,
        "rnn_hidden_units": 100,
        "num_classes": 5
    }
    valid_args = {"save_output": True, "validate_in_training": True,
                  "save_dev_input": True, "save_loss": True,
                  "batch_size": 2, "pickle_path": "./save/",
                  "use_cuda": False, "print_every_step": 1}

    train_data = [
        [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
        [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
        [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
    ]
    vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4,
             '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
    label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

    data_set = DataSet()
    for example in train_data:
        text, label = example[0], example[1]
        # Explicit keyword instead of a bare positional boolean.
        x = TextField(text, is_target=False)
        y = TextField(label, is_target=True)
        ins = Instance(word_seq=x, label_seq=y)
        data_set.append(ins)

    data_set.index_field("word_seq", vocab)
    data_set.index_field("label_seq", label_vocab)

    model = SeqLabeling(model_args)

    tester = SeqLabelTester(**valid_args)
    tester.test(network=model, dev_data=data_set)
    # If this can run, everything is OK.

    # Portable cleanup: shutil.rmtree instead of shelling out to `rm -rf`
    # (os.system with a shell string is non-portable and unsafe).
    shutil.rmtree("save", ignore_errors=True)
    print("pickle path deleted")
def convert_with_vocabs(self, data, vocabs):
    """Populate this dataset from (word list, label string) pairs.

    Adds ``word_seq`` and ``label`` fields per example, then indexes both
    with the supplied vocabularies.
    """
    for example in data:
        tokens, label = example[0], example[1]  # list, str
        ins = Instance()
        ins.add_field("word_seq", TextField(tokens, is_target=False))
        ins.add_field("label", LabelField(label, is_target=True))
        self.append(ins)
    self.index_field("word_seq", vocabs["word_vocab"])
    self.index_field("label", vocabs["label_vocab"])
def convert(self, data):
    """Convert lists of strings into Instances with Fields, growing the vocabs.

    :param data: 3-level lists. Entries are strings.
    """
    progress = ProgressBar(total=len(data))
    for example in data:
        tokens, tags = example[0], example[1]  # list, list
        # Grow the vocabularies as we encounter new tokens/labels.
        self.word_vocab.update(tokens)
        self.label_vocab.update(tags)
        ins = Instance()
        ins.add_field("word_seq", TextField(tokens, is_target=False))
        ins.add_field("truth", TextField(tags, is_target=False))
        ins.add_field("word_seq_origin_len",
                      LabelField(len(tokens), is_target=False))
        self.append(ins)
        progress.move()
    self.index_field("word_seq", self.word_vocab)
    self.index_field("truth", self.label_vocab)
def convert(self, data):
    """Fill this dataset from (word list, label string) pairs, growing the vocabs."""
    for example in data:
        tokens, label = example[0], example[1]  # list, str
        # Grow the vocabularies as we encounter new tokens/labels.
        self.word_vocab.update(tokens)
        self.label_vocab.update(label)
        ins = Instance()
        ins.add_field("word_seq", TextField(tokens, is_target=False))
        ins.add_field("label", LabelField(label, is_target=True))
        self.append(ins)
    self.index_field("word_seq", self.word_vocab)
    self.index_field("label", self.label_vocab)
def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab):
    """Create a DataSet instance that contains labels.

    :param str_lists: list of list of strings, [num_examples, 2, *]. ::

            [
                [[word_11, word_12, ...], [label_11, label_12, ...]],
                ...
            ]

    :param word_vocab: dict of (str: int), which means (word: index).
    :param label_vocab: dict of (str: int), which means (word: index).
    :return data_set: a DataSet instance.
    """
    ds = DataSet()
    for example in str_lists:
        words, tags = example[0], example[1]
        ds.append(Instance(word_seq=TextField(words, is_target=False),
                           label_seq=TextField(tags, is_target=True)))
    ds.index_field("word_seq", word_vocab)
    ds.index_field("label_seq", label_vocab)
    return ds
def create_unlabeled_dataset_from_lists(str_lists, word_vocab):
    """Create a DataSet instance that contains no labels.

    :param str_lists: list of list of strings, [num_examples, *]. ::

            [
                [word_11, word_12, ...],
                ...
            ]

    :param word_vocab: dict of (str: int), which means (word: index).
    :return data_set: a DataSet instance.
    """
    ds = DataSet()
    for tokens in str_lists:
        ds.append(Instance(word_seq=TextField(tokens, is_target=False)))
    ds.index_field("word_seq", word_vocab)
    return ds
""" texts = ["i am a cat", "this is a test of new batch", "haha"] labels = [0, 1, 0] # prepare vocabulary vocab = {} for text in texts: for tokens in text.split(): if tokens not in vocab: vocab[tokens] = len(vocab) print("vocabulary: ", vocab) # prepare input dataset data = DataSet() for text, label in zip(texts, labels): x = TextField(text.split(), False) y = LabelField(label, is_target=True) ins = Instance(text=x, label=y) data.append(ins) # use vocabulary to index data data.index_field("text", vocab) # define naive sampler for batch class class SeqSampler: def __call__(self, dataset): return list(range(len(dataset))) # use batch to iterate dataset data_iterator = Batch(data, 2, SeqSampler(), False) for epoch in range(1):