Example #1
    def load_embedding(emb_dim, emb_file, emb_type, vocab):
        """Load the pre-trained embedding and combine with the given dictionary.

        :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding.
        :param str emb_file: the pre-trained embedding file path.
        :param str emb_type: the pre-trained embedding format; only 'glove' is supported for now
        :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding
        :return embedding_tensor: Tensor of shape (len(vocab), emb_dim)
                vocab: the input vocab, or a vocab built from the pre-trained embedding

        """
        pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
        if vocab is None:
            # build vocabulary from pre-trained embedding
            vocab = Vocabulary()
            for w in pretrain.keys():
                vocab.add(w)
        # words absent from the pre-trained file keep their random initialization
        embedding_tensor = torch.randn(len(vocab), emb_dim)
        for w, v in pretrain.items():
            if len(v.shape) > 1 or emb_dim != v.shape[0]:
                raise ValueError(
                    "Pretrained embedding dim is {}. Dimension dismatched. Required {}".format(v.shape, (emb_dim,)))
            if vocab.has_word(w):
                embedding_tensor[vocab[w]] = v
        return embedding_tensor, vocab
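For orientation, a minimal usage sketch (hedged: it assumes load_embedding is exposed as a static method of fastNLP's EmbedLoader, and the GloVe file path is a placeholder):

    vocab = Vocabulary()
    vocab.update(["the", "cat", "sat"])
    # "glove.6B.50d.txt" is a hypothetical path to a GloVe-format file
    emb, vocab = EmbedLoader.load_embedding(50, "glove.6B.50d.txt", "glove", vocab)
    assert emb.shape == (len(vocab), 50)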
Example #2
    def test_vocab(self):
        import _pickle as pickle
        import os
        vocab = Vocabulary()
        filename = 'vocab'
        vocab.update(filename)
        vocab.update([filename, ['a'], [['b']], ['c']])
        idx = vocab[filename]
        before_pic = (vocab.to_word(idx), vocab[filename])

        # round-trip the vocabulary through pickle and check the mapping survives
        with open(filename, 'wb') as f:
            pickle.dump(vocab, f)
        with open(filename, 'rb') as f:
            vocab = pickle.load(f)
        os.remove(filename)

        vocab.build_reverse_vocab()
        after_pic = (vocab.to_word(idx), vocab[filename])
        TRUE_DICT = {'vocab': 5, 'a': 6, 'b': 7, 'c': 8}
        TRUE_DICT.update(DEFAULT_WORD_TO_INDEX)
        TRUE_IDXDICT = {
            0: '<pad>',
            1: '<unk>',
            2: '<reserved-2>',
            3: '<reserved-3>',
            4: '<reserved-4>',
            5: 'vocab',
            6: 'a',
            7: 'b',
            8: 'c'
        }
        self.assertEqual(before_pic, after_pic)
        self.assertDictEqual(TRUE_DICT, vocab.word2idx)
        self.assertDictEqual(TRUE_IDXDICT, vocab.idx2word)
Example #3
    def test_len(self):
        vocab = Vocabulary(max_size=None,
                           min_freq=None,
                           unknown=None,
                           padding=None)
        vocab.update(text)
        self.assertEqual(len(vocab), len(counter))
Example #4
    def test_encoding_type(self):
        # check that an error is raised when the given tag_vocab does not match encoding_type
        vocabs = {}
        import random
        from itertools import product
        for encoding_type in ['bio', 'bioes', 'bmeso']:
            vocab = Vocabulary(unknown=None, padding=None)
            for i in range(random.randint(10, 100)):
                label = str(random.randint(1, 10))
                for tag in encoding_type:
                    if tag != 'o':
                        vocab.add_word(f'{tag}-{label}')
                    else:
                        vocab.add_word('o')
            vocabs[encoding_type] = vocab
        for e1, e2 in product(['bio', 'bioes', 'bmeso'], ['bio', 'bioes', 'bmeso']):
            with self.subTest(e1=e1, e2=e2):
                if e1 == e2:
                    metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2)
                else:
                    s2 = set(e2)
                    s2.update(set(e1))
                    if s2 == set(e2):
                        # e1's tags are a subset of e2's, so the mismatch
                        # cannot be detected; skip this pair
                        continue
                    with self.assertRaises(AssertionError):
                        metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2)
        for encoding_type in ['bio', 'bioes', 'bmeso']:
            with self.assertRaises(AssertionError):
                metric = SpanFPreRecMetric(vocabs[encoding_type], encoding_type='bmes')

        with self.assertWarns(Warning):
            vocab = Vocabulary(unknown=None, padding=None).add_word_lst(list('bmes'))
            metric = SpanFPreRecMetric(vocab, encoding_type='bmeso')
            vocab = Vocabulary().add_word_lst(list('bmes'))
            metric = SpanFPreRecMetric(vocab, encoding_type='bmeso')
Example #5
    def process(self, paths, **kwargs):
        data_info = DataBundle()
        for name in ['train', 'test', 'dev']:
            data_info.datasets[name] = self.load(paths[name])

        config = Config()
        vocab = Vocabulary().from_dataset(*data_info.datasets.values(),
                                          field_name='sentences')
        vocab.build_vocab()
        word2id = vocab.word2idx

        char_dict = preprocess.get_char_dict(config.char_path)
        data_info.vocabs = vocab

        genres = {
            g: i
            for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])
        }

        for name, ds in data_info.datasets.items():
            # doc2numpy returns (doc_np, char_index, seq_len); each apply below
            # keeps one element of that tuple
            ds.apply(
                lambda x: preprocess.doc2numpy(x['sentences'],
                                               word2id,
                                               char_dict,
                                               max(config.filter),
                                               config.max_sentences,
                                               is_train=name == 'train')[0],
                new_field_name='doc_np')
            ds.apply(
                lambda x: preprocess.doc2numpy(x['sentences'],
                                               word2id,
                                               char_dict,
                                               max(config.filter),
                                               config.max_sentences,
                                               is_train=name == 'train')[1],
                new_field_name='char_index')
            ds.apply(
                lambda x: preprocess.doc2numpy(x['sentences'],
                                               word2id,
                                               char_dict,
                                               max(config.filter),
                                               config.max_sentences,
                                               is_train=name == 'train')[2],
                new_field_name='seq_len')
            ds.apply(lambda x: preprocess.speaker2numpy(
                x["speakers"], config.max_sentences, is_train=name == 'train'),
                     new_field_name='speaker_ids_np')
            ds.apply(lambda x: genres[x["doc_key"][:2]],
                     new_field_name='genre')

            ds.set_ignore_type('clusters')
            ds.set_padder('clusters', None)
            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre",
                         "char_index", "seq_len")
            ds.set_target("clusters")

        # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
        # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)

        return data_info
Example #6
    def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl):
        """Load the pre-trained embedding and combine with the given dictionary.

        :param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
        :param emb_file: str, the pre-trained embedding file path.
        :param emb_type: str, the pre-trained embedding format; only 'glove' is supported for now
        :param vocab: Vocabulary, a mapping from word to index, can be provided by user or built from pre-trained embedding
        :param emb_pkl: str, the embedding pickle file.
        :return embedding_tensor: Tensor of shape (len(vocab), emb_dim)
                vocab: the input vocab, or a vocab built from the pre-trained embedding
        TODO: fragile code
        """
        # If the embedding pickle exists, load it and return.
        # if os.path.exists(emb_pkl):
        #     with open(emb_pkl, "rb") as f:
        #         embedding_tensor, vocab = _pickle.load(f)
        #     return embedding_tensor, vocab
        # Otherwise, load the pre-trained embedding.
        pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
        if vocab is None:
            # build vocabulary from pre-trained embedding
            vocab = Vocabulary()
            for w in pretrain.keys():
                vocab.update(w)
        embedding_tensor = torch.randn(len(vocab), emb_dim)
        for w, v in pretrain.items():
            if len(v.shape) > 1 or emb_dim != v.shape[0]:
                raise ValueError('Pretrained embedding dim is {}; mismatch with required {}'.format(v.shape, (emb_dim,)))
            if vocab.has_word(w):
                embedding_tensor[vocab[w]] = v

        # save and return the result
        # with open(emb_pkl, "wb") as f:
        #     _pickle.dump((embedding_tensor, vocab), f)
        return embedding_tensor, vocab
Example #7
    def test_index(self):
        vocab = Vocabulary(need_default=True, max_size=None, min_freq=None)
        vocab.update(text)
        res = [vocab[w] for w in set(text)]
        self.assertEqual(len(res), len(set(res)))

        res = [vocab.to_index(w) for w in set(text)]
        self.assertEqual(len(res), len(set(res)))
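Both lookups above go through the same word-to-index mapping; a minimal round-trip sketch, assuming fastNLP's Vocabulary API:

    vocab = Vocabulary()
    vocab.update(["hello", "world"])
    idx = vocab.to_index("hello")  # same lookup as vocab["hello"]
    assert vocab.to_word(idx) == "hello"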
Example #8
class SeqLabelDataSet(DataSet):
    def __init__(self, instances=None, load_func=POSDataSetLoader().load):
        super(SeqLabelDataSet, self).__init__(name="",
                                              instances=instances,
                                              load_func=load_func)
        self.word_vocab = Vocabulary()
        self.label_vocab = Vocabulary()

    def convert(self, data):
        """Convert lists of strings into Instances with Fields.

        :param data: 3-level lists. Entries are strings.
        """
        bar = ProgressBar(total=len(data))
        for example in data:
            word_seq, label_seq = example[0], example[1]
            # list, list
            self.word_vocab.update(word_seq)
            self.label_vocab.update(label_seq)
            x = TextField(word_seq, is_target=False)
            x_len = LabelField(len(word_seq), is_target=False)
            y = TextField(label_seq, is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("truth", y)
            instance.add_field("word_seq_origin_len", x_len)
            self.append(instance)
            bar.move()
        self.index_field("word_seq", self.word_vocab)
        self.index_field("truth", self.label_vocab)
        # no need to index "word_seq_origin_len"

    def convert_with_vocabs(self, data, vocabs):
        for example in data:
            word_seq, label_seq = example[0], example[1]
            # list, list
            x = TextField(word_seq, is_target=False)
            x_len = LabelField(len(word_seq), is_target=False)
            y = TextField(label_seq, is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("truth", y)
            instance.add_field("word_seq_origin_len", x_len)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
        self.index_field("truth", vocabs["label_vocab"])
        # no need to index "word_seq_origin_len"

    def convert_for_infer(self, data, vocabs):
        for word_seq in data:
            # list
            x = TextField(word_seq, is_target=False)
            x_len = LabelField(len(word_seq), is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("word_seq_origin_len", x_len)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
Example #9
    def test_vocab(self):
        vocab = Vocabulary()
        word_list = "this is a word list".split()
        vocab.update(word_list)

        pred_dict = {"pred": torch.zeros(4, 3)}
        target_dict = {'target': torch.zeros(4)}
        metric = ConfusionMatrixMetric(vocab=vocab)
        metric(pred_dict=pred_dict, target_dict=target_dict)
        print(metric.get_metric())
Example #10
def train_test():
    # Config Loader
    train_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": train_args})

    # define dataset
    data_train = TokenizeDataSetLoader().load(cws_data_path)
    word_vocab = Vocabulary()
    label_vocab = Vocabulary()
    data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
    data_train.index_field("word_seq",
                           word_vocab).index_field("label_seq", label_vocab)
    data_train.set_origin_len("word_seq")
    data_train.rename_field("label_seq", "truth").set_target(truth=False)
    train_args["vocab_size"] = len(word_vocab)
    train_args["num_classes"] = len(label_vocab)

    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    # Trainer
    trainer = SeqLabelTrainer(**train_args.data)

    # Model
    model = SeqLabeling(train_args)

    # Start training
    trainer.train(model, data_train)

    # Saver
    saver = ModelSaver("./save/saved_model.pkl")
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(train_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, "./save/saved_model.pkl")

    # Load test configuration
    test_args = ConfigSection()
    ConfigLoader().load_config(config_path, {"POS_infer": test_args})
    test_args["evaluator"] = SeqLabelEvaluator()

    # Tester
    tester = SeqLabelTester(**test_args.data)

    # Start testing
    data_train.set_target(truth=True)
    tester.test(model, data_train)
Example #11
    def construct_vocab(self, *datasets):
        """
        使用传入的DataSet创建vocabulary

        :param datasets: DataSet类型的数据,用于构建vocabulary
        :return:
        """
        self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
        for dataset in datasets:
            assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
        self.vocab.build_vocab()
        if self.verbose:
            print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))
Example #12
def mock_cws():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"]

    word2id = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    word2id.update(word_list)
    save_pickle(word2id, "./mock/", "word2id.pkl")

    class2id = Vocabulary(need_default=False)
    label_list = ['B', 'M', 'E', 'S']
    class2id.update(label_list)
    save_pickle(class2id, "./mock/", "label2id.pkl")

    model_args = {
        "vocab_size": len(word2id),
        "word_emb_dim": 50,
        "rnn_hidden_units": 50,
        "num_classes": len(class2id)
    }
    config_file = """
    [test_section]
    vocab_size = {}
    word_emb_dim = 50
    rnn_hidden_units = 50
    num_classes = {}
    """.format(len(word2id), len(class2id))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/cws_basic_model_v_0.pkl").save_pytorch(model)
Example #13
def mock_pos_tag():
    os.makedirs("mock", exist_ok=True)
    text = ["这是最好的基于深度学习的中文分词系统。", "大王叫我来巡山。", "我党多年来致力于改善人民生活水平。"]

    vocab = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    vocab.update(word_list)
    save_pickle(vocab, "./mock/", "word2id.pkl")

    idx2label = Vocabulary(need_default=False)
    label_list = ['B-n', 'M-v', 'E-nv', 'S-adj', 'B-v', 'M-vn', 'S-adv']
    idx2label.update(label_list)
    save_pickle(idx2label, "./mock/", "label2id.pkl")

    model_args = {
        "vocab_size": len(vocab),
        "word_emb_dim": 50,
        "rnn_hidden_units": 50,
        "num_classes": len(idx2label)
    }
    config_file = """
        [test_section]
        vocab_size = {}
        word_emb_dim = 50
        rnn_hidden_units = 50
        num_classes = {}
        """.format(len(vocab), len(idx2label))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = AdvSeqLabel(model_args)
    ModelSaver("mock/pos_tag_model_v_0.pkl").save_pytorch(model)
Example #14
class VocabProcessor(Processor):
    def __init__(self, field_name):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary()

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
            for ins in dataset:
                tokens = ins[self.field_name]
                self.vocab.update(tokens)

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab
Example #15
class VocabProcessor(Processor):
    def __init__(self, field_name, min_freq=1, max_size=None):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab

    def get_vocab_size(self):
        return len(self.vocab)
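Typical use of the processor above (a sketch; DataSet is assumed to accept a dict of columns, as in fastNLP):

    train_ds = DataSet({"words": [["a", "b", "a"]]})
    dev_ds = DataSet({"words": [["b", "c"]]})
    proc = VocabProcessor("words")
    proc.process(train_ds, dev_ds)   # count tokens across both datasets
    vocab = proc.get_vocab()         # builds and returns the vocabulary
    assert proc.get_vocab_size() == len(vocab)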
Example #16
class TextClassifyDataSet(DataSet):
    def __init__(self, instances=None, load_func=ClassDataSetLoader().load):
        super(TextClassifyDataSet, self).__init__(name="",
                                                  instances=instances,
                                                  load_func=load_func)
        self.word_vocab = Vocabulary()
        self.label_vocab = Vocabulary(need_default=False)

    def convert(self, data):
        for example in data:
            word_seq, label = example[0], example[1]
            # list, str
            self.word_vocab.update(word_seq)
            self.label_vocab.update(label)
            x = TextField(word_seq, is_target=False)
            y = LabelField(label, is_target=True)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("label", y)
            self.append(instance)
        self.index_field("word_seq", self.word_vocab)
        self.index_field("label", self.label_vocab)

    def convert_with_vocabs(self, data, vocabs):
        for example in data:
            word_seq, label = example[0], example[1]
            # list, str
            x = TextField(word_seq, is_target=False)
            y = LabelField(label, is_target=True)
            instance = Instance()
            instance.add_field("word_seq", x)
            instance.add_field("label", y)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
        self.index_field("label", vocabs["label_vocab"])

    def convert_for_infer(self, data, vocabs):
        for word_seq in data:
            # list
            x = TextField(word_seq, is_target=False)
            instance = Instance()
            instance.add_field("word_seq", x)
            self.append(instance)
        self.index_field("word_seq", vocabs["word_vocab"])
Example #17
    def test_contains(self):
        vocab = Vocabulary(need_default=True, max_size=None, min_freq=None)
        vocab.update(text)
        self.assertTrue(text[-1] in vocab)
        self.assertFalse("~!@#" in vocab)
        self.assertEqual(text[-1] in vocab, vocab.has_word(text[-1]))
        self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#"))
Example #18
def add_words_field_2_databundle(data_bundle):
    train_cws_field = "data/wb_cws/train_cws_word.txt"
    dev_cws_field = "data/wb_cws/dev_cws_word.txt"
    test_cws_field = "data/wb_cws/test_cws_word.txt"

    train_field = _read_txt(train_cws_field)
    dev_field = _read_txt(dev_cws_field)
    test_field = _read_txt(test_cws_field)
    data_bundle.get_dataset('train').add_field(field_name="raw_words",
                                               fields=train_field)
    data_bundle.get_dataset('dev').add_field(field_name="raw_words",
                                             fields=dev_field)
    data_bundle.get_dataset('test').add_field(field_name="raw_words",
                                              fields=test_field)

    # build the word vocabulary
    words_vocab = Vocabulary()
    word_list = get_corpus_words(train_cws_field, dev_cws_field,
                                 test_cws_field)
    words_vocab.update(word_list)
    data_bundle.set_vocab(words_vocab, field_name="words")

    # convert raw_words to word ids
    for dataset in ["train", "dev", "test"]:
        raw_words = list(data_bundle.get_dataset(dataset)["raw_words"])
        words_ids = []
        for words in raw_words:
            words_id = []
            for word in words:
                words_id.append(words_vocab.to_index(word))
            words_ids.append(words_id)
        data_bundle.get_dataset(dataset).add_field(field_name="words",
                                                   fields=words_ids)
    data_bundle.set_input('words')
    data_bundle.set_ignore_type('words', flag=False)
    data_bundle.set_pad_val("words", 0)
    return data_bundle
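The manual id-conversion loop above can also be written with DataSet.apply; a sketch reusing the names from this example:

    for name in ["train", "dev", "test"]:
        data_bundle.get_dataset(name).apply(
            lambda x: [words_vocab.to_index(w) for w in x["raw_words"]],
            new_field_name="words")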
Example #19
    def test_case3(self):
        number_labels = 4
        # bio tag
        fastnlp_bio_vocab = Vocabulary(unknown=None, padding=None)
        fastnlp_bio_vocab.word_count = Counter(_generate_tags('BIO', number_labels))
        fastnlp_bio_metric = SpanFPreRecMetric(tag_vocab=fastnlp_bio_vocab, only_gross=False)
        bio_sequence = torch.FloatTensor([[[-0.4424, -0.4579, -0.7376,  1.8129,  0.1316,  1.6566, -1.2169,
          -0.3782,  0.8240],
         [-1.2348, -0.1876, -0.1462, -0.4834, -0.6692, -0.9735,  1.1563,
          -0.3562, -1.4116],
         [ 1.6550, -0.9555,  0.3782, -1.3160, -1.5835, -0.3443, -1.7858,
           2.0023,  0.7075],
         [-0.3772, -0.5447, -1.5631,  1.1614,  1.4598, -1.2764,  0.5186,
           0.3832, -0.1540],
         [-0.1011,  0.0600,  1.1090, -0.3545,  0.1284,  1.1484, -1.0120,
          -1.3508, -0.9513],
         [ 1.8948,  0.8627, -2.1359,  1.3740, -0.7499,  1.5019,  0.6919,
          -0.0842, -0.4294]],

        [[-0.2802,  0.6941, -0.4788, -0.3845,  1.7752,  1.2950, -1.9490,
          -1.4138, -0.8853],
         [-1.3752, -0.5457, -0.5305,  0.4018,  0.2934,  0.7931,  2.3845,
          -1.0726,  0.0364],
         [ 0.3621,  0.2609,  0.1269, -0.5950,  0.7212,  0.5959,  1.6264,
          -0.8836, -0.9320],
         [ 0.2003, -1.0758, -1.1560, -0.6472, -1.7549,  0.1264,  0.6044,
          -1.6857,  1.1571],
         [ 1.4277, -0.4915,  0.4496,  2.2027,  0.0730, -3.1792, -0.5125,
          -0.5837,  1.0184],
         [ 1.9495,  1.7145, -0.2143, -0.1230, -0.2205,  0.8250,  0.4943,
          -0.9025,  0.0864]]])
        bio_target = torch.LongTensor([[3, 6, 0, 8, 2, 4],
                                       [4, 1, 7, 0, 4, 7]])
        fastnlp_bio_metric({'pred': bio_sequence, 'seq_len': torch.LongTensor([6, 6])}, {'target': bio_target})
        expect_bio_res = {'pre-1': 0.333333, 'rec-1': 0.333333, 'f-1': 0.333333, 'pre-2': 0.5, 'rec-2': 0.5,
                          'f-2': 0.5, 'pre-0': 0.0, 'rec-0': 0.0, 'f-0': 0.0, 'pre-3': 0.0, 'rec-3': 0.0,
                          'f-3': 0.0, 'pre': 0.222222, 'rec': 0.181818, 'f': 0.2}

        self.assertDictEqual(expect_bio_res, fastnlp_bio_metric.get_metric())
Example #20
    def _generate_samples():
        target = []
        seq_len = []
        vocab = Vocabulary(unknown=None, padding=None)
        for i in range(3):
            target_i = []
            seq_len_i = 0
            for j in range(1, 10):
                word_len = np.random.randint(1, 5)
                seq_len_i += word_len
                if word_len == 1:
                    target_i.append('S')
                else:
                    target_i.append('B')
                    target_i.extend(['M'] * (word_len - 2))
                    target_i.append('E')
            vocab.add_word_lst(target_i)
            target.append(target_i)
            seq_len.append(seq_len_i)
        target_ = np.zeros((3, max(seq_len)))
        for i in range(3):
            target_i = [vocab.to_index(t) for t in target[i]]
            target_[i, :seq_len[i]] = target_i
        return target_, target, seq_len, vocab
Example #21
    def test_contains(self):
        vocab = Vocabulary(max_size=None,
                           min_freq=None,
                           unknown=None,
                           padding=None)
        vocab.update(text)
        self.assertTrue(text[-1] in vocab)
        self.assertFalse("~!@#" in vocab)
        self.assertEqual(text[-1] in vocab, vocab.has_word(text[-1]))
        self.assertEqual("~!@#" in vocab, vocab.has_word("~!@#"))
Example #22
    def test_additional_update(self):
        vocab = Vocabulary(max_size=None, min_freq=None)
        vocab.update(text)

        _ = vocab["well"]
        self.assertEqual(vocab.rebuild, False)

        vocab.add("hahaha")
        self.assertEqual(vocab.rebuild, True)

        _ = vocab["hahaha"]
        self.assertEqual(vocab.rebuild, False)
        self.assertTrue("hahaha" in vocab)
Example #23
    def test_warning(self):
        vocab = Vocabulary(max_size=len(set(text)), min_freq=None)
        vocab.update(text)
        self.assertEqual(vocab.rebuild, True)
        print(len(vocab))
        self.assertEqual(vocab.rebuild, False)

        vocab.update([
            "hahahha", "hhh", "vvvv", "ass", "asss", "jfweiong", "eqgfeg",
            "feqfw"
        ])
        # this will print a warning
        self.assertEqual(vocab.rebuild, True)
Example #24
def mock_text_classify():
    os.makedirs("mock", exist_ok=True)
    text = [
        "世界物联网大会明日在京召开龙头股启动在即", "乌鲁木齐市新增一处城市中心旅游目的地",
        "朱元璋的大明朝真的源于明教吗?——告诉你一个真实的“明教”"
    ]
    vocab = Vocabulary()
    word_list = [ch for ch in "".join(text)]
    vocab.update(word_list)
    save_pickle(vocab, "./mock/", "word2id.pkl")

    idx2label = Vocabulary(need_default=False)
    label_list = [
        'class_A', 'class_B', 'class_C', 'class_D', 'class_E', 'class_F'
    ]
    idx2label.update(label_list)
    save_pickle(idx2label, "./mock/", "label2id.pkl")

    model_args = {
        "vocab_size": len(vocab),
        "word_emb_dim": 50,
        "rnn_hidden_units": 50,
        "num_classes": len(idx2label)
    }
    config_file = """
            [test_section]
            vocab_size = {}
            word_emb_dim = 50
            rnn_hidden_units = 50
            num_classes = {}
            """.format(len(vocab), len(idx2label))
    with open("mock/test.cfg", "w", encoding="utf-8") as f:
        f.write(config_file)

    model = CNNText(model_args)
    ModelSaver("mock/text_class_model_v0.pkl").save_pytorch(model)
Example #25
    def __init__(self,
                 label_is_seq=False,
                 share_vocab=False,
                 add_char_field=False):
        """

        :param label_is_seq: bool, whether label is a sequence. If True, the label vocabulary will reserve
                several special tokens for sequence processing.
        :param share_vocab: bool, whether word sequence and label sequence share the same vocabulary. Typically, this
                is only available when label_is_seq is True. Default: False.
        :param add_char_field: bool, whether to add character representations to all TextFields. Default: False.
        """
        print("Preprocessor is about to deprecate. Please use DataSet class.")
        self.data_vocab = Vocabulary()
        if label_is_seq is True:
            if share_vocab is True:
                self.label_vocab = self.data_vocab
            else:
                self.label_vocab = Vocabulary()
        else:
            self.label_vocab = Vocabulary(need_default=False)

        self.character_vocab = Vocabulary(need_default=False)
        self.add_char_field = add_char_field
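A quick sketch of the vocab-sharing behavior (hedged: the class name Preprocessor is taken from the deprecation message above):

    p = Preprocessor(label_is_seq=True, share_vocab=True)
    assert p.label_vocab is p.data_vocab      # one shared vocabulary
    q = Preprocessor(label_is_seq=True)       # share_vocab defaults to False
    assert q.label_vocab is not q.data_vocab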
Example #26
    def test_seq_label(self):
        model_args = {
            "vocab_size": 10,
            "word_emb_dim": 100,
            "rnn_hidden_units": 100,
            "num_classes": 5
        }

        infer_data = [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e'],
                      ['a', 'b', '#', 'd', 'e'], ['a', 'b', 'c', '?', 'e'],
                      ['a', 'b', 'c', 'd', '$'], ['!', 'b', 'c', 'd', 'e']]

        vocab = Vocabulary()
        vocab.word2idx = {
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3,
            'e': 4,
            '!': 5,
            '@': 6,
            '#': 7,
            '$': 8,
            '?': 9
        }
        class_vocab = Vocabulary()
        class_vocab.word2idx = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4}

        os.system("mkdir save")
        save_pickle(class_vocab, "./save/", "class2id.pkl")
        save_pickle(vocab, "./save/", "word2id.pkl")

        model = SeqLabeling(model_args)
        predictor = Predictor("./save/", task="seq_label")

        results = predictor.predict(network=model, data=infer_data)

        self.assertTrue(isinstance(results, list))
        self.assertGreater(len(results), 0)
        for res in results:
            self.assertTrue(isinstance(res, list))
            self.assertEqual(len(res), 5)
            self.assertTrue(isinstance(res[0], str))

        os.system("rm -rf save")
        print("pickle path deleted")
Example #27
class TestEmbedLoader(unittest.TestCase):
    glove_path = './test/data_for_tests/glove.6B.50d_test.txt'
    pkl_path = './save'
    raw_texts = ["i am a cat",
                "this is a test of new batch",
                "ha ha",
                "I am a good boy .",
                "This is the most beautiful girl ."
                ]
    texts = [text.strip().split() for text in raw_texts]
    vocab = Vocabulary()
    vocab.update(texts)

    def test1(self):
        emb, _ = EmbedLoader.load_embedding(50, self.glove_path, 'glove', self.vocab, self.pkl_path)
        self.assertTrue(emb.shape[0] == (len(self.vocab)))
        self.assertTrue(emb.shape[1] == 50)
        os.remove(self.pkl_path)
    
    def test2(self):
        try:
            _ = EmbedLoader.load_embedding(100, self.glove_path, 'glove', self.vocab, self.pkl_path)
            self.fail(msg="load dismatch embedding")
        except ValueError:
            pass
Example #28
    def process(self, data_bundle: DataBundle):
        r"""
        Further process the loaded data. The raw data contains: raw_key, raw_speaker, raw_words, raw_clusters.
        
        .. csv-table::
           :header: "raw_key", "raw_speaker","raw_words","raw_clusters"

           "bc/cctv/00/cctv_0000_0", "[[Speaker#1, Speaker#1],[]]","[['I','am'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
           "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
           "[...]", "[...]","[...]","[...]"


        :param data_bundle:
        :return:
        """
        genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.RAW_WORDS(3))
        vocab.build_vocab()
        word2id = vocab.word2idx
        data_bundle.set_vocab(vocab, Const.INPUTS(0))
        if self.config.char_path:
            char_dict = get_char_dict(self.config.char_path)
        else:
            char_set = set()
            for i, w in enumerate(word2id):
                if i < 2:  # skip the special padding/unknown entries
                    continue
                for c in w:
                    char_set.add(c)

            char_dict = collections.defaultdict(int)
            char_dict.update({c: i for i, c in enumerate(char_set)})

        for name, ds in data_bundle.datasets.items():
            # genre
            ds.apply(lambda x: genres[x[Const.RAW_WORDS(0)][:2]], new_field_name=Const.INPUTS(0))

            # speaker_ids_np
            ds.apply(lambda x: speaker2numpy(x[Const.RAW_WORDS(1)], self.config.max_sentences, is_train=name == 'train'),
                     new_field_name=Const.INPUTS(1))

            # sentences
            ds.rename_field(Const.RAW_WORDS(3), Const.INPUTS(2))

            # doc_np
            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                    self.config.max_sentences, is_train=name == 'train')[0],
                     new_field_name=Const.INPUTS(3))
            # char_index
            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                    self.config.max_sentences, is_train=name == 'train')[1],
                     new_field_name=Const.CHAR_INPUT)
            # seq len
            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                    self.config.max_sentences, is_train=name == 'train')[2],
                     new_field_name=Const.INPUT_LEN)

            # clusters
            ds.rename_field(Const.RAW_WORDS(2), Const.TARGET)

            ds.set_ignore_type(Const.TARGET)
            ds.set_padder(Const.TARGET, None)
            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
            ds.set_target(Const.TARGET)

        return data_bundle
Example #29
    def test_auto_encoding_type_infer(self):
        # check whether the encoding type can be inferred automatically
        vocabs = {}
        import random
        for encoding_type in ['bio', 'bioes', 'bmeso']:
            vocab = Vocabulary(unknown=None, padding=None)
            for i in range(random.randint(10, 100)):
                label = str(random.randint(1, 10))
                for tag in encoding_type:
                    if tag != 'o':
                        vocab.add_word(f'{tag}-{label}')
                    else:
                        vocab.add_word('o')
            vocabs[encoding_type] = vocab
        for e in ['bio', 'bioes', 'bmeso']:
            with self.subTest(e=e):
                metric = SpanFPreRecMetric(tag_vocab=vocabs[e])
                assert metric.encoding_type == e

        bmes_vocab = _generate_tags('bmes')
        vocab = Vocabulary()
        for tag, index in bmes_vocab.items():
            vocab.add_word(tag)
        metric = SpanFPreRecMetric(vocab)
        assert metric.encoding_type == 'bmes'

        # some cases where the encoding type cannot be inferred
        vocab = Vocabulary()
        for i in range(10):
            vocab.add_word(str(i))
        with self.assertRaises(Exception):
            metric = SpanFPreRecMetric(vocab)
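A compact illustration of the inference path (a sketch; the tag strings are chosen by hand):

    vocab = Vocabulary(unknown=None, padding=None)
    vocab.add_word_lst(['b-1', 'i-1', 'b-2', 'i-2', 'o'])
    metric = SpanFPreRecMetric(tag_vocab=vocab)
    assert metric.encoding_type == 'bio'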
Example #30
def test_training():
    # Config Loader
    trainer_args = ConfigSection()
    model_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {
        "test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

    data_set = TokenizeDataSetLoader().load(data_path)
    word_vocab = Vocabulary()
    label_vocab = Vocabulary()
    data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
    data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
    data_set.set_origin_len("word_seq")
    data_set.rename_field("label_seq", "truth").set_target(truth=False)
    data_train, data_dev = data_set.split(0.3, shuffle=True)
    model_args["vocab_size"] = len(word_vocab)
    model_args["num_classes"] = len(label_vocab)

    save_pickle(word_vocab, pickle_path, "word2id.pkl")
    save_pickle(label_vocab, pickle_path, "label2id.pkl")

    trainer = SeqLabelTrainer(
        epochs=trainer_args["epochs"],
        batch_size=trainer_args["batch_size"],
        validate=False,
        use_cuda=False,
        pickle_path=pickle_path,
        save_best_dev=trainer_args["save_best_dev"],
        model_name=model_name,
        optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
    )

    # Model
    model = SeqLabeling(model_args)

    # Start training
    trainer.train(model, data_train, data_dev)

    # Saver
    saver = ModelSaver(os.path.join(pickle_path, model_name))
    saver.save_pytorch(model)

    del model, trainer

    # Define the same model
    model = SeqLabeling(model_args)

    # Dump trained parameters into the model
    ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))

    # Load test configuration
    tester_args = ConfigSection()
    ConfigLoader().load_config(config_dir, {"test_seq_label_tester": tester_args})

    # Tester
    tester = SeqLabelTester(batch_size=4,
                            use_cuda=False,
                            pickle_path=pickle_path,
                            model_name="seq_label_in_test.pkl",
                            evaluator=SeqLabelEvaluator()
                            )

    # Start testing with validation data
    data_dev.set_target(truth=True)
    tester.test(model, data_dev)