def load_embedding(emb_dim, emb_file, emb_type, vocab):
    """Load the pre-trained embedding and combine it with the given vocabulary.

    :param int emb_dim: the dimension of the embedding. Must match the pre-trained embedding.
    :param str emb_file: the pre-trained embedding file path.
    :param str emb_type: the pre-trained embedding format; only GloVe is supported for now.
    :param Vocabulary vocab: a mapping from word to index, either provided by the user
        or built from the pre-trained embedding.
    :return embedding_tensor: Tensor of shape (len(vocab), emb_dim)
            vocab: the input vocab, or a vocab built from the pre-trained embedding
    """
    pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
    if vocab is None:
        # build vocabulary from pre-trained embedding
        vocab = Vocabulary()
        for w in pretrain.keys():
            vocab.add(w)
    embedding_tensor = torch.randn(len(vocab), emb_dim)
    for w, v in pretrain.items():
        if len(v.shape) > 1 or emb_dim != v.shape[0]:
            raise ValueError(
                "Pre-trained embedding dim is {}; it does not match the required {}".format(v.shape, (emb_dim,)))
        if vocab.has_word(w):
            embedding_tensor[vocab[w]] = v
    return embedding_tensor, vocab
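# A minimal usage sketch for load_embedding above. This assumes the function is
# exposed as a static method on EmbedLoader (as the tests later in this section
# suggest) and that a GloVe-format file exists at the given path; the path itself
# is hypothetical.
vocab = Vocabulary()
vocab.update("the quick brown fox".split())
embedding_tensor, vocab = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="path/to/glove.6B.50d.txt",  # hypothetical path
    emb_type="glove",
    vocab=vocab,
)
# Words absent from the pre-trained file keep their torch.randn initialization.
assert embedding_tensor.shape == (len(vocab), 50)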
def test_vocab(self):
    import _pickle as pickle
    import os
    vocab = Vocabulary()
    filename = 'vocab'
    vocab.update(filename)
    vocab.update([filename, ['a'], [['b']], ['c']])
    idx = vocab[filename]
    before_pic = (vocab.to_word(idx), vocab[filename])

    with open(filename, 'wb') as f:
        pickle.dump(vocab, f)
    with open(filename, 'rb') as f:
        vocab = pickle.load(f)
    os.remove(filename)

    vocab.build_reverse_vocab()
    after_pic = (vocab.to_word(idx), vocab[filename])
    TRUE_DICT = {'vocab': 5, 'a': 6, 'b': 7, 'c': 8}
    TRUE_DICT.update(DEFAULT_WORD_TO_INDEX)
    TRUE_IDXDICT = {0: '<pad>', 1: '<unk>', 2: '<reserved-2>', 3: '<reserved-3>', 4: '<reserved-4>',
                    5: 'vocab', 6: 'a', 7: 'b', 8: 'c'}
    self.assertEqual(before_pic, after_pic)
    self.assertDictEqual(TRUE_DICT, vocab.word2idx)
    self.assertDictEqual(TRUE_IDXDICT, vocab.idx2word)
def test_len(self):
    vocab = Vocabulary(max_size=None, min_freq=None, unknown=None, padding=None)
    vocab.update(text)
    self.assertEqual(len(vocab), len(counter))
def test_encoding_type(self):
    # Check that an error is raised when the given tag_vocab does not match encoding_type.
    vocabs = {}
    import random
    from itertools import product
    for encoding_type in ['bio', 'bioes', 'bmeso']:
        vocab = Vocabulary(unknown=None, padding=None)
        for i in range(random.randint(10, 100)):
            label = str(random.randint(1, 10))
            for tag in encoding_type:
                if tag != 'o':
                    vocab.add_word(f'{tag}-{label}')
                else:
                    vocab.add_word('o')
        vocabs[encoding_type] = vocab
    for e1, e2 in product(['bio', 'bioes', 'bmeso'], ['bio', 'bioes', 'bmeso']):
        with self.subTest(e1=e1, e2=e2):
            if e1 == e2:
                metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2)
            else:
                s2 = set(e2)
                s2.update(set(e1))
                if s2 == set(e2):
                    continue
                with self.assertRaises(AssertionError):
                    metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2)
    for encoding_type in ['bio', 'bioes', 'bmeso']:
        with self.assertRaises(AssertionError):
            metric = SpanFPreRecMetric(vocabs[encoding_type], encoding_type='bmes')
    with self.assertWarns(Warning):
        vocab = Vocabulary(unknown=None, padding=None).add_word_lst(list('bmes'))
        metric = SpanFPreRecMetric(vocab, encoding_type='bmeso')
        vocab = Vocabulary().add_word_lst(list('bmes'))
        metric = SpanFPreRecMetric(vocab, encoding_type='bmeso')
def process(self, paths, **kwargs):
    data_info = DataBundle()
    for name in ['train', 'test', 'dev']:
        data_info.datasets[name] = self.load(paths[name])

    config = Config()
    vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences')
    vocab.build_vocab()
    word2id = vocab.word2idx
    char_dict = preprocess.get_char_dict(config.char_path)
    data_info.vocabs = vocab

    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

    for name, ds in data_info.datasets.items():
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
                                                config.max_sentences, is_train=name == 'train')[0],
                 new_field_name='doc_np')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
                                                config.max_sentences, is_train=name == 'train')[1],
                 new_field_name='char_index')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
                                                config.max_sentences, is_train=name == 'train')[2],
                 new_field_name='seq_len')
        ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences, is_train=name == 'train'),
                 new_field_name='speaker_ids_np')
        ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')

        ds.set_ignore_type('clusters')
        ds.set_padder('clusters', None)
        ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
        ds.set_target("clusters")

    # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
    # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)
    return data_info
def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl):
    """Load the pre-trained embedding and combine it with the given vocabulary.

    :param emb_dim: int, the dimension of the embedding. Must match the pre-trained embedding.
    :param emb_file: str, the pre-trained embedding file path.
    :param emb_type: str, the pre-trained embedding format; only GloVe is supported for now.
    :param vocab: Vocabulary, a mapping from word to index, either provided by the user
        or built from the pre-trained embedding.
    :param emb_pkl: str, the embedding pickle file.
    :return embedding_tensor: Tensor of shape (len(vocab), emb_dim)
            vocab: the input vocab, or a vocab built from the pre-trained embedding

    TODO: fragile code
    """
    # If the embedding pickle exists, load it and return.
    # if os.path.exists(emb_pkl):
    #     with open(emb_pkl, "rb") as f:
    #         embedding_tensor, vocab = _pickle.load(f)
    #     return embedding_tensor, vocab
    # Otherwise, load the pre-trained embedding.
    pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
    if vocab is None:
        # build vocabulary from pre-trained embedding
        vocab = Vocabulary()
        for w in pretrain.keys():
            vocab.update(w)
    embedding_tensor = torch.randn(len(vocab), emb_dim)
    for w, v in pretrain.items():
        if len(v.shape) > 1 or emb_dim != v.shape[0]:
            raise ValueError(
                'Pre-trained embedding dim is {}; it does not match the required {}'.format(v.shape, (emb_dim,)))
        if vocab.has_word(w):
            embedding_tensor[vocab[w]] = v
    # save and return the result
    # with open(emb_pkl, "wb") as f:
    #     _pickle.dump((embedding_tensor, vocab), f)
    return embedding_tensor, vocab
def test_index(self):
    vocab = Vocabulary(need_default=True, max_size=None, min_freq=None)
    vocab.update(text)
    res = [vocab[w] for w in set(text)]
    self.assertEqual(len(res), len(set(res)))

    res = [vocab.to_index(w) for w in set(text)]
    self.assertEqual(len(res), len(set(res)))
class SeqLabelDataSet(DataSet):
    def __init__(self, instances=None, load_func=POSDataSetLoader().load):
        super(SeqLabelDataSet, self).__init__(name="", instances=instances, load_func=load_func)
        self.word_vocab = Vocabulary()
        self.label_vocab = Vocabulary()

    def convert(self, data):
        """Convert lists of strings into Instances with Fields.

        :param data: 3-level lists. Entries are strings.
        """
        bar = ProgressBar(total=len(data))
        for example in data:
            word_seq, label_seq = example[0], example[1]  # list, list
            self.word_vocab.update(word_seq)
            self.label_vocab.update(label_seq)
            x = TextField(word_seq, is_target=False)
            x_len = LabelField(len(word_seq), is_target=False)
            y = TextField(label_seq, is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("truth", y)
            instance.add_field("word_seq_origin_len", x_len)
            self.append(instance)
            bar.move()
        self.index_field("word_seq", self.word_vocab)
        self.index_field("truth", self.label_vocab)
        # no need to index "word_seq_origin_len"

    def convert_with_vocabs(self, data, vocabs):
        for example in data:
            word_seq, label_seq = example[0], example[1]  # list, list
            x = TextField(word_seq, is_target=False)
            x_len = LabelField(len(word_seq), is_target=False)
            y = TextField(label_seq, is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("truth", y)
            instance.add_field("word_seq_origin_len", x_len)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
        self.index_field("truth", vocabs["label_vocab"])
        # no need to index "word_seq_origin_len"

    def convert_for_infer(self, data, vocabs):
        for word_seq in data:  # list
            x = TextField(word_seq, is_target=False)
            x_len = LabelField(len(word_seq), is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("word_seq_origin_len", x_len)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
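# A sketch of driving SeqLabelDataSet's convert paths (legacy field API as defined
# above; the toy examples are made up for illustration).
train_data = [(["I", "love", "NLP"], ["O", "O", "B"])]  # (word_seq, label_seq) pairs
train_set = SeqLabelDataSet()
train_set.convert(train_data)  # builds word/label vocabs, then indexes the fields
vocabs = {"word_vocab": train_set.word_vocab, "label_vocab": train_set.label_vocab}
infer_set = SeqLabelDataSet()
infer_set.convert_for_infer([["I", "love", "fastNLP"]], vocabs)  # reuse train vocabs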
def test_vocab(self):
    vocab = Vocabulary()
    word_list = "this is a word list".split()
    vocab.update(word_list)

    pred_dict = {"pred": torch.zeros(4, 3)}
    target_dict = {'target': torch.zeros(4)}
    metric = ConfusionMatrixMetric(vocab=vocab)
    metric(pred_dict=pred_dict, target_dict=target_dict)
    print(metric.get_metric())
def train_test():
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": train_args})

    # define dataset
    data_train = TokenizeDataSetLoader().load(cws_data_path)
    word_vocab = Vocabulary()
    label_vocab = Vocabulary()
    data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
    data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
    data_train.set_origin_len("word_seq")
    data_train.rename_field("label_seq", "truth").set_target(truth=False)
    train_args["vocab_size"] = len(word_vocab)
    train_args["num_classes"] = len(label_vocab)

    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model, data_train)

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")

    # Load test configuration
    test_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": test_args})
    test_args["evaluator"] = SeqLabelEvaluator()

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    data_train.set_target(truth=True)
    tester.test(model, data_train)
def construct_vocab(self, *datasets):
    """Build the vocabulary from the given DataSets.

    :param datasets: one or more DataSet objects used to build the vocabulary
    :return:
    """
    self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
    for dataset in datasets:
        assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
        dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
    self.vocab.build_vocab()
    if self.verbose:
        print("Vocabulary constructed, containing {} items.".format(len(self.vocab)))
def mock_cws():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。",
            "大王叫我来巡山。",
            "我党多年来致力于改善人民生活水平。"]

    word2id = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    word2id.update(word_list)
    save_pickle(word2id, "./mock/", "word2id.pkl")

    class2id = Vocabulary(need_default=False)
    label_list = ['B', 'M', 'E', 'S']
    class2id.update(label_list)
    save_pickle(class2id, "./mock/", "label2id.pkl")

    model_args = {"vocab_size": len(word2id), "word_emb_dim": 50, "rnn_hidden_units": 50,
                  "num_classes": len(class2id)}
    config_file = """
    [test_section]
    vocab_size = {}
    word_emb_dim = 50
    rnn_hidden_units = 50
    num_classes = {}
    """.format(len(word2id), len(class2id))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
def mock_pos_tag():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。",
            "大王叫我来巡山。",
            "我党多年来致力于改善人民生活水平。"]

    vocab = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    vocab.update(word_list)
    save_pickle(vocab, "./mock/", "word2id.pkl")

    idx2label = Vocabulary(need_default=False)
    label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv']
    idx2label.update(label_list)
    save_pickle(idx2label, "./mock/", "label2id.pkl")

    model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50,
                  "num_classes": len(idx2label)}
    config_file = """
    [test_section]
    vocab_size = {}
    word_emb_dim = 50
    rnn_hidden_units = 50
    num_classes = {}
    """.format(len(vocab), len(idx2label))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
class VocabProcessor(Processor):
    def __init__(self, field_name):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary()

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
            for ins in dataset:
                tokens = ins[self.field_name]
                self.vocab.update(tokens)

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab
class VocabProcessor(Processor):
    def __init__(self, field_name, min_freq=1, max_size=None):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab

    def get_vocab_size(self):
        return len(self.vocab)
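# Usage sketch for VocabProcessor (assumes a fastNLP-style DataSet constructed from
# a dict of columns; the field contents are illustrative).
ds = DataSet({"words": [["hello", "world"], ["hello", "fastNLP"]]})
proc = VocabProcessor(field_name="words", min_freq=1)
proc.process(ds)              # accumulate token counts from every dataset
vocab = proc.get_vocab()      # triggers build_vocab() before returning
print(proc.get_vocab_size())  # vocabulary size, including <pad>/<unk> defaults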
class TextClassifyDataSet(DataSet):
    def __init__(self, instances=None, load_func=ClassDataSetLoader().load):
        super(TextClassifyDataSet, self).__init__(name="", instances=instances, load_func=load_func)
        self.word_vocab = Vocabulary()
        self.label_vocab = Vocabulary(need_default=False)

    def convert(self, data):
        for example in data:
            word_seq, label = example[0], example[1]  # list, str
            self.word_vocab.update(word_seq)
            self.label_vocab.update(label)
            x = TextField(word_seq, is_target=False)
            y = LabelField(label, is_target=True)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("label", y)
            self.append(instance)
        self.index_field("word_seq", self.word_vocab)
        self.index_field("label", self.label_vocab)

    def convert_with_vocabs(self, data, vocabs):
        for example in data:
            word_seq, label = example[0], example[1]  # list, str
            x = TextField(word_seq, is_target=False)
            y = LabelField(label, is_target=True)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("label", y)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
        self.index_field("label", vocabs["label_vocab"])

    def convert_for_infer(self, data, vocabs):
        for word_seq in data:  # list
            x = TextField(word_seq, is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
def test_contains(self):
    vocab = Vocabulary(need_default=True, max_size=None, min_freq=None)
    vocab.update(text)
    self.assertTrue(text[-1] in vocab)
    self.assertFalse("~!@#" in vocab)
    self.assertEqual(text[-1] in vocab, vocab.has_word(text[-1]))
    self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#"))
def add_words_field_2_databundle(data_bundle):
    train_cws_field = "data/wb_cws/train_cws_word.txt"
    dev_cws_field = "data/wb_cws/dev_cws_word.txt"
    test_cws_field = "data/wb_cws/test_cws_word.txt"
    train_field = _read_txt(train_cws_field)
    dev_field = _read_txt(dev_cws_field)
    test_field = _read_txt(test_cws_field)

    data_bundle.get_dataset('train').add_field(field_name="raw_words", fields=train_field)
    data_bundle.get_dataset('dev').add_field(field_name="raw_words", fields=dev_field)
    data_bundle.get_dataset('test').add_field(field_name="raw_words", fields=test_field)

    # build the word vocabulary
    words_vocab = Vocabulary()
    word_list = get_corpus_words(train_cws_field, dev_cws_field, test_cws_field)
    words_vocab.update(word_list)
    data_bundle.set_vocab(words_vocab, field_name="words")

    # convert raw_words to word ids
    for dataset in ["train", "dev", "test"]:
        raw_words = list(data_bundle.get_dataset(dataset)["raw_words"])
        words_ids = []
        for words in raw_words:
            words_id = []
            for word in words:
                words_id.append(words_vocab.to_index(word))
            words_ids.append(words_id)
        data_bundle.get_dataset(dataset).add_field(field_name="words", fields=words_ids)

    data_bundle.set_input('words')
    data_bundle.set_ignore_type('words', flag=False)
    data_bundle.set_pad_val("words", 0)
    return data_bundle
def test_case3(self):
    number_labels = 4
    # bio tag
    fastnlp_bio_vocab = Vocabulary(unknown=None, padding=None)
    fastnlp_bio_vocab.word_count = Counter(_generate_tags('BIO', number_labels))
    fastnlp_bio_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bio_vocab, only_gross=False)
    bio_sequence = torch.FloatTensor(
        [[[-0.4424, -0.4579, -0.7376,  1.8129,  0.1316,  1.6566, -1.2169, -0.3782,  0.8240],
          [-1.2348, -0.1876, -0.1462, -0.4834, -0.6692, -0.9735,  1.1563, -0.3562, -1.4116],
          [ 1.6550, -0.9555,  0.3782, -1.3160, -1.5835, -0.3443, -1.7858,  2.0023,  0.7075],
          [-0.3772, -0.5447, -1.5631,  1.1614,  1.4598, -1.2764,  0.5186,  0.3832, -0.1540],
          [-0.1011,  0.0600,  1.1090, -0.3545,  0.1284,  1.1484, -1.0120, -1.3508, -0.9513],
          [ 1.8948,  0.8627, -2.1359,  1.3740, -0.7499,  1.5019,  0.6919, -0.0842, -0.4294]],

         [[-0.2802,  0.6941, -0.4788, -0.3845,  1.7752,  1.2950, -1.9490, -1.4138, -0.8853],
          [-1.3752, -0.5457, -0.5305,  0.4018,  0.2934,  0.7931,  2.3845, -1.0726,  0.0364],
          [ 0.3621,  0.2609,  0.1269, -0.5950,  0.7212,  0.5959,  1.6264, -0.8836, -0.9320],
          [ 0.2003, -1.0758, -1.1560, -0.6472, -1.7549,  0.1264,  0.6044, -1.6857,  1.1571],
          [ 1.4277, -0.4915,  0.4496,  2.2027,  0.0730, -3.1792, -0.5125, -0.5837,  1.0184],
          [ 1.9495,  1.7145, -0.2143, -0.1230, -0.2205,  0.8250,  0.4943, -0.9025,  0.0864]]])

    bio_target = torch.LongTensor([[3, 6, 0, 8, 2, 4],
                                   [4, 1, 7, 0, 4, 7]])
    fastnlp_bio_metric({'pred': bio_sequence, 'seq_len': torch.LongTensor([6, 6])}, {'target': bio_target})
    expect_bio_res = {'pre-1': 0.333333, 'rec-1': 0.333333, 'f-1': 0.333333,
                      'pre-2': 0.5, 'rec-2': 0.5, 'f-2': 0.5,
                      'pre-0': 0.0, 'rec-0': 0.0, 'f-0': 0.0,
                      'pre-3': 0.0, 'rec-3': 0.0, 'f-3': 0.0,
                      'pre': 0.222222, 'rec': 0.181818, 'f': 0.2}
    self.assertDictEqual(expect_bio_res, fastnlp_bio_metric.get_metric())
def _generate_samples():
    target = []
    seq_len = []
    vocab = Vocabulary(unknown=None, padding=None)
    for i in range(3):
        target_i = []
        seq_len_i = 0
        for j in range(1, 10):
            word_len = np.random.randint(1, 5)
            seq_len_i += word_len
            if word_len == 1:
                target_i.append('S')
            else:
                target_i.append('B')
                target_i.extend(['M'] * (word_len - 2))
                target_i.append('E')
        vocab.add_word_lst(target_i)
        target.append(target_i)
        seq_len.append(seq_len_i)
    target_ = np.zeros((3, max(seq_len)))
    for i in range(3):
        target_i = [vocab.to_index(t) for t in target[i]]
        target_[i, :seq_len[i]] = target_i
    return target_, target, seq_len, vocab
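# A quick sanity check of the helper above (assumes numpy imported as np, as the
# helper itself does): the index matrix is zero-padded to the longest sequence,
# and each tag list has exactly one BMES tag per character.
target_, target, seq_len, vocab = _generate_samples()
assert target_.shape == (3, max(seq_len))
assert all(len(tags) == l for tags, l in zip(target, seq_len))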
def test_contains(self):
    vocab = Vocabulary(max_size=None, min_freq=None, unknown=None, padding=None)
    vocab.update(text)
    self.assertTrue(text[-1] in vocab)
    self.assertFalse("~!@#" in vocab)
    self.assertEqual(text[-1] in vocab, vocab.has_word(text[-1]))
    self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#"))
def test_additional_update(self):
    vocab = Vocabulary(max_size=None, min_freq=None)
    vocab.update(text)

    _ = vocab["well"]
    self.assertEqual(vocab.rebuild, False)

    vocab.add("hahaha")
    self.assertEqual(vocab.rebuild, True)

    _ = vocab["hahaha"]
    self.assertEqual(vocab.rebuild, False)
    self.assertTrue("hahaha" in vocab)
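# The lazy-rebuild contract exercised above, in isolation: any mutation marks the
# vocabulary dirty, and the next lookup transparently rebuilds the index.
vocab = Vocabulary()
vocab.add("new-word")    # rebuild flag flips to True
idx = vocab["new-word"]  # lookup forces build_vocab(); rebuild is False again
assert vocab.rebuild is False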
def test_warning(self):
    vocab = Vocabulary(max_size=len(set(text)), min_freq=None)
    vocab.update(text)
    self.assertEqual(vocab.rebuild, True)
    print(len(vocab))
    self.assertEqual(vocab.rebuild, False)

    vocab.update(["hahahha", "hhh", "vvvv", "ass", "asss", "jfweiong", "eqgfeg", "feqfw"])  # this will print a warning
    self.assertEqual(vocab.rebuild, True)
def mock_text_classify():
    os.makedirs("mock", exist_ok=True)
    text = ["世界物联网大会明日在京召开龙头股启动在即",
            "乌鲁木齐市新增一处城市中心旅游目的地",
            "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"]

    vocab = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    vocab.update(word_list)
    save_pickle(vocab, "./mock/", "word2id.pkl")

    idx2label = Vocabulary(need_default=False)
    label_list = ['class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F']
    idx2label.update(label_list)
    save_pickle(idx2label, "./mock/", "label2id.pkl")

    model_args = {"vocab_size": len(vocab), "word_emb_dim": 50, "rnn_hidden_units": 50,
                  "num_classes": len(idx2label)}
    config_file = """
    [test_section]
    vocab_size = {}
    word_emb_dim = 50
    rnn_hidden_units = 50
    num_classes = {}
    """.format(len(vocab), len(idx2label))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = CNNText(model_args)
    ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model)
def __init__(self, label_is_seq=False, share_vocab=False, add_char_field=False):
    """
    :param label_is_seq: bool, whether the label is a sequence. If True, the label
        vocabulary will reserve several special tokens for sequence processing.
    :param share_vocab: bool, whether the word sequence and the label sequence share
        the same vocabulary. Typically this is only available when label_is_seq is
        True. Default: False.
    :param add_char_field: bool, whether to add character representations to all
        TextFields. Default: False.
    """
    print("Preprocessor is about to be deprecated. Please use the DataSet class.")
    self.data_vocab = Vocabulary()
    if label_is_seq is True:
        if share_vocab is True:
            self.label_vocab = self.data_vocab
        else:
            self.label_vocab = Vocabulary()
    else:
        self.label_vocab = Vocabulary(need_default=False)
    self.character_vocab = Vocabulary(need_default=False)
    self.add_char_field = add_char_field
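# A sketch of the three vocabulary configurations the constructor above supports.
# Preprocessor is deprecated per its own message; this only illustrates the
# label_is_seq/share_vocab interaction shown in the code.
p1 = Preprocessor(label_is_seq=True, share_vocab=True)
assert p1.label_vocab is p1.data_vocab       # one shared vocabulary
p2 = Preprocessor(label_is_seq=True)
assert p2.label_vocab is not p2.data_vocab   # separate sequence-label vocabulary
p3 = Preprocessor()                          # classification labels: no default tokens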
def test_seq_label(self):
    model_args = {
        "vocab_size": 10,
        "word_emb_dim": 100,
        "rnn_hidden_units": 100,
        "num_classes": 5
    }

    infer_data = [['a', 'b', 'c', 'd', 'e'],
                  ['a', '@', 'c', 'd', 'e'],
                  ['a', 'b', '#', 'd', 'e'],
                  ['a', 'b', 'c', '?', 'e'],
                  ['a', 'b', 'c', 'd', '$'],
                  ['!', 'b', 'c', 'd', 'e']]

    vocab = Vocabulary()
    vocab.word2idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
    class_vocab = Vocabulary()
    class_vocab.word2idx = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4}

    os.system("mkdir save")
    save_pickle(class_vocab, "./save/", "class2id.pkl")
    save_pickle(vocab, "./save/", "word2id.pkl")

    model = SeqLabeling(model_args)
    predictor = Predictor("./save/", task="seq_label")

    results = predictor.predict(network=model, data=infer_data)

    self.assertTrue(isinstance(results, list))
    self.assertGreater(len(results), 0)
    for res in results:
        self.assertTrue(isinstance(res, list))
        self.assertEqual(len(res), 5)
        self.assertTrue(isinstance(res[0], str))

    os.system("rm -rf save")
    print("pickle path deleted")
class TestEmbedLoader(unittest.TestCase):
    glove_path = './test/data_for_tests/glove.6B.50d_test.txt'
    pkl_path = './save'
    raw_texts = ["i am a cat",
                 "this is a test of new batch",
                 "ha ha",
                 "I am a good boy .",
                 "This is the most beautiful girl ."]
    texts = [text.strip().split() for text in raw_texts]
    vocab = Vocabulary()
    vocab.update(texts)

    def test1(self):
        emb, _ = EmbedLoader.load_embedding(50, self.glove_path, 'glove', self.vocab, self.pkl_path)
        self.assertTrue(emb.shape[0] == (len(self.vocab)))
        self.assertTrue(emb.shape[1] == 50)
        os.remove(self.pkl_path)

    def test2(self):
        try:
            _ = EmbedLoader.load_embedding(100, self.glove_path, 'glove', self.vocab, self.pkl_path)
            self.fail(msg="loaded an embedding with a mismatched dimension")
        except ValueError:
            pass
def process(self, data_bundle: DataBundle):
    r"""Further process the loaded data. The raw data contains: raw_key, raw_speaker, raw_words, raw_clusters.

    .. csv-table::
        :header: "raw_key", "raw_speaker", "raw_words", "raw_clusters"

        "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1'], []]", "[['I', 'am'], []]", "[[[2,3],[6,7]],[[10,12],[20,22]]]"
        "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'Speaker#1'], []]", "[['He', 'is'], []]", "[[[2,3],[6,7]],[[10,12],[20,22]]]"
        "[...]", "[...]", "[...]", "[...]"

    :param data_bundle:
    :return:
    """
    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
    vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.RAW_WORDS(3))
    vocab.build_vocab()
    word2id = vocab.word2idx
    data_bundle.set_vocab(vocab, Const.INPUTS(0))
    if self.config.char_path:
        char_dict = get_char_dict(self.config.char_path)
    else:
        char_set = set()
        for i, w in enumerate(word2id):
            if i < 2:
                continue
            for c in w:
                char_set.add(c)
        char_dict = collections.defaultdict(int)
        char_dict.update({c: i for i, c in enumerate(char_set)})

    for name, ds in data_bundle.datasets.items():
        # genre
        ds.apply(lambda x: genres[x[Const.RAW_WORDS(0)][:2]], new_field_name=Const.INPUTS(0))

        # speaker_ids_np
        ds.apply(lambda x: speaker2numpy(x[Const.RAW_WORDS(1)], self.config.max_sentences, is_train=name == 'train'),
                 new_field_name=Const.INPUTS(1))

        # sentences
        ds.rename_field(Const.RAW_WORDS(3), Const.INPUTS(2))

        # doc_np
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[0],
                 new_field_name=Const.INPUTS(3))
        # char_index
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[1],
                 new_field_name=Const.CHAR_INPUT)
        # seq_len
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[2],
                 new_field_name=Const.INPUT_LEN)

        # clusters
        ds.rename_field(Const.RAW_WORDS(2), Const.TARGET)

        ds.set_ignore_type(Const.TARGET)
        ds.set_padder(Const.TARGET, None)
        ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT,
                     Const.INPUT_LEN)
        ds.set_target(Const.TARGET)

    return data_bundle
def test_auto_encoding_type_infer(self):
    # Check that the encoding type can be inferred automatically.
    vocabs = {}
    import random
    for encoding_type in ['bio', 'bioes', 'bmeso']:
        vocab = Vocabulary(unknown=None, padding=None)
        for i in range(random.randint(10, 100)):
            label = str(random.randint(1, 10))
            for tag in encoding_type:
                if tag != 'o':
                    vocab.add_word(f'{tag}-{label}')
                else:
                    vocab.add_word('o')
        vocabs[encoding_type] = vocab
    for e in ['bio', 'bioes', 'bmeso']:
        with self.subTest(e=e):
            metric = SpanFPreRecMetric(tag_vocab=vocabs[e])
            assert metric.encoding_type == e

    bmes_vocab = _generate_tags('bmes')
    vocab = Vocabulary()
    for tag, index in bmes_vocab.items():
        vocab.add_word(tag)
    metric = SpanFPreRecMetric(vocab)
    assert metric.encoding_type == 'bmes'

    # Cases where the encoding type cannot be inferred.
    vocab = Vocabulary()
    for i in range(10):
        vocab.add_word(str(i))
    with self.assertRaises(Exception):
        metric = SpanFPreRecMetric(vocab)
def test_training():
    # Config Loader
    trainer_args = ConfigSection()
    model_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {
        "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

    data_set = TokenizeDataSetLoader().load(data_path)
    word_vocab = Vocabulary()
    label_vocab = Vocabulary()
    data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
    data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
    data_set.set_origin_len("word_seq")
    data_set.rename_field("label_seq", "truth").set_target(truth=False)
    data_train, data_dev = data_set.split(0.3, shuffle=True)
    model_args["vocab_size"] = len(word_vocab)
    model_args["num_classes"] = len(label_vocab)

    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    trainer = SeqLabelTrainer(
        epochs=trainer_args["epochs"],
        batch_size=trainer_args["batch_size"],
        validate=False,
        use_cuda=False,
        pickle_path=pickle_path,
        save_best_dev=trainer_args["save_best_dev"],
        model_name=model_name,
        optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
    )

    # Model
    model = SeqLabeling(model_args)

    # Start training
    trainer.train(model, data_train, data_dev)

    # Saver
    saver = ModelSaver(os.path.join(pickle_path, model_name))
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(model_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))

    # Load test configuration
    tester_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args})

    # Tester
    tester = SeqLabelTester(batch_size=4,
                            use_cuda=False,
                            pickle_path=pickle_path,
                            model_name="seq_label_in_test.pkl",
                            evaluator=SeqLabelEvaluator())

    # Start testing with validation data
    data_dev.set_target(truth=True)
    tester.test(model, data_dev)