Example #1
    def test_local_textset_integration(self):
        local_set = LocalTextSet(self.texts, self.labels)
        assert local_set.is_local()
        assert not local_set.is_distributed()
        assert local_set.get_texts() == self.texts
        assert local_set.get_labels() == self.labels
        tokenized = ChainedPreprocessing([Tokenizer(), Normalizer()])(local_set)
        word_index = tokenized.generate_word_index_map(max_words_num=10)
        transformed = ChainedPreprocessing([WordIndexer(word_index), SequenceShaper(10),
                                            TextFeatureToSample()])(tokenized)
        assert transformed.is_local()
        word_index = transformed.get_word_index()
        assert len(word_index) == 10
        assert word_index["my"] == 1
        samples = transformed.get_samples()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 10

        vocab_file = create_tmp_path() + ".txt"
        transformed.save_word_index(vocab_file)
        local_set2 = LocalTextSet(self.texts, self.labels)
        local_set2.load_word_index(vocab_file)
        transformed2 = local_set2.tokenize().normalize().word2idx()\
            .shape_sequence(10).generate_sample()
        samples2 = transformed2.get_samples()
        for s1, s2 in zip(samples, samples2):
            assert np.allclose(s1.feature.to_ndarray(), s2.feature.to_ndarray())
        os.remove(vocab_file)

        model = TextClassifier(5, self.glove_path, word_index, 10)
        model.compile("adagrad", "sparse_categorical_crossentropy", ['accuracy'])
        tmp_log_dir = create_tmp_path()
        tmp_checkpoint_path = create_tmp_path()
        os.mkdir(tmp_checkpoint_path)
        model.set_tensorboard(tmp_log_dir, "textclassification")
        model.set_checkpoint(tmp_checkpoint_path)
        model.fit(transformed, batch_size=2, nb_epoch=2, validation_data=transformed)
        acc = model.evaluate(transformed, batch_size=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts()

        # Test for loaded model predict on TextSet
        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts()
        assert len(predicts) == len(loaded_predicts)

        for i in range(0, len(predicts)):  # (uri, prediction)
            assert not predicts[i][0]
            assert not loaded_predicts[i][0]  # uri is not recorded and thus None
            assert len(predicts[i][1]) == 1
            assert len(loaded_predicts[i][1]) == 1
            assert predicts[i][1][0].shape == (5, )
            assert np.allclose(predicts[i][1][0], loaded_predicts[i][1][0])
        shutil.rmtree(tmp_log_dir)
        shutil.rmtree(tmp_checkpoint_path)
        os.remove(tmp_path)
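Examples #1 and #3 assume a pytest fixture that defines self.texts, self.labels and self.glove_path (the module-level os/numpy imports used above are also assumed). The sketch below is only a plausible guess at such a fixture, written so that the assertions above can hold (three samples, "my" as the most frequent token, and the 14-word vocabulary checked in Example #3); the actual sentences, labels and resource path may differ.

    def setup_method(self, method):
        # Illustrative fixture: three short texts in which "my" is the most
        # frequent token (so word_index["my"] == 1) with 14 distinct words in total.
        self.texts = ["hello my friend please annotate my text",
                      "hello my world this is my second text",
                      "hello everyone my deep learning topic"]
        self.labels = [0., 1., 2.]
        resource_path = os.path.join(os.path.split(__file__)[0], "../../resources")
        self.glove_path = os.path.join(resource_path, "glove.6B/glove.6B.50d.txt")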
Example #2
    def test_forward_backward(self):
        resource_path = os.path.join(
            os.path.split(__file__)[0], "../../resources")
        glove_path = os.path.join(resource_path, "glove.6B/glove.6B.50d.txt")
        model = TextClassifier(10, glove_path)
        model.summary()
        input_data = np.random.randint(20, size=(4, 500))
        self.assert_forward_backward(model, input_data)
Example #3
    def test_distributed_textset_integration(self):
        texts_rdd = self.sc.parallelize(self.texts)
        labels_rdd = self.sc.parallelize(self.labels)
        distributed_set = DistributedTextSet(texts_rdd, labels_rdd)
        assert distributed_set.is_distributed()
        assert not distributed_set.is_local()
        assert distributed_set.get_texts().collect() == self.texts
        assert distributed_set.get_labels().collect() == self.labels

        sets = distributed_set.random_split([0.5, 0.5])
        train_texts = sets[0].get_texts().collect()
        test_texts = sets[1].get_texts().collect()
        assert set(train_texts + test_texts) == set(self.texts)

        tokenized = Tokenizer()(distributed_set)
        transformed = tokenized.normalize().word2idx().shape_sequence(
            5).generate_sample()
        word_index = transformed.get_word_index()
        assert len(word_index) == 14
        samples = transformed.get_samples().collect()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 5

        vocab_file = create_tmp_path() + ".txt"
        transformed.save_word_index(vocab_file)
        distributed_set2 = DistributedTextSet(texts_rdd, labels_rdd)
        distributed_set2.load_word_index(vocab_file)
        transformed2 = distributed_set2.tokenize().normalize().word2idx()\
            .shape_sequence(5).generate_sample()
        samples2 = transformed2.get_samples().collect()
        for s1, s2 in zip(samples, samples2):
            assert np.allclose(s1.feature.to_ndarray(),
                               s2.feature.to_ndarray())
        os.remove(vocab_file)

        model = TextClassifier(5,
                               self.glove_path,
                               word_index,
                               5,
                               encoder="lstm")
        model.compile(SGD(), SparseCategoricalCrossEntropy())
        model.fit(transformed, batch_size=2, nb_epoch=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts().collect()
        for predict in predicts:
            assert len(predict) == 1
            assert predict[0].shape == (5, )

        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts().collect()
        assert len(loaded_predicts) == len(predicts)
        os.remove(tmp_path)
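Example #3 additionally assumes a local SparkContext stored as self.sc. In Analytics Zoo such a context is normally created with init_nncontext(); the sketch below shows one way the fixture might do this, with an illustrative master setting and app name (the imports would normally live at module level).

    def setup_method(self, method):
        from zoo.common.nncontext import init_nncontext, init_spark_conf
        # Small local SparkContext for the distributed TextSet test.
        spark_conf = init_spark_conf().setMaster("local[4]").setAppName("test_textset")
        self.sc = init_nncontext(spark_conf)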
Example #4
    def test_forward_backward(self):
        model = TextClassifier(10, glove_path)
        model.summary()
        input_data = np.random.randint(20, size=(4, 500))
        self.assert_forward_backward(model, input_data)
        model.set_evaluate_status()
        # Forward twice will get the same output
        output1 = model.forward(input_data)
        output2 = model.forward(input_data)
        assert np.allclose(output1, output2)
Example #5
    # Split the preprocessed TextSet into training and validation sets.
    train_set, val_set = transformed.random_split(
        [float(options.training_split), 1 - float(options.training_split)])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        token_length = int(options.token_length)
        if not (token_length == 50 or token_length == 100
                or token_length == 200 or token_length == 300):
            raise ValueError(
                'token_length for GloVe can only be 50, 100, 200, 300, but got '
                + str(token_length))
        embedding_file = options.embedding_path + "/glove.6B." + str(
            token_length) + "d.txt"
        word_index = transformed.get_word_index()
        model = TextClassifier(int(options.class_num),
                               embedding_file, word_index,
                               int(options.sequence_length), options.encoder,
                               int(options.encoder_output_dim))

    model.compile(optimizer=Adagrad(learningrate=float(options.learning_rate),
                                    learningrate_decay=0.001),
                  loss="sparse_categorical_crossentropy",
                  metrics=['accuracy'])
    app_name = 'textclassification-' + dt.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    model.set_tensorboard(options.log_dir, app_name)
    model.fit(train_set,
              batch_size=int(options.batch_size),
              nb_epoch=int(options.nb_epoch),
              validation_data=val_set)
    predict_set = model.predict(val_set,
                                batch_per_thread=int(options.partition_num))
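Example #5 begins mid-script: `options` comes from command-line parsing and `transformed` is a TextSet that has already been tokenized and indexed earlier in the same script. A rough sketch of the option parsing is given below; it only covers option names that are actually referenced above, and every default value is an assumption.

from optparse import OptionParser

# All defaults are illustrative; optparse returns strings, which matches the
# int()/float() conversions applied in the script above.
parser = OptionParser()
parser.add_option("--model", dest="model", default=None)  # optional path to a saved TextClassifier
parser.add_option("--embedding_path", dest="embedding_path", default="/tmp/text_data")
parser.add_option("--class_num", dest="class_num", default="20")
parser.add_option("--token_length", dest="token_length", default="200")
parser.add_option("--sequence_length", dest="sequence_length", default="500")
parser.add_option("--encoder", dest="encoder", default="cnn")
parser.add_option("--encoder_output_dim", dest="encoder_output_dim", default="256")
parser.add_option("--training_split", dest="training_split", default="0.8")
parser.add_option("--batch_size", dest="batch_size", default="128")
parser.add_option("--nb_epoch", dest="nb_epoch", default="20")
parser.add_option("--learning_rate", dest="learning_rate", default="0.01")
parser.add_option("--log_dir", dest="log_dir", default="/tmp/.analytics-zoo")
parser.add_option("--partition_num", dest="partition_num", default="4")
options, args = parser.parse_args()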
Example #6
    def test_save_load(self):
        model = TextClassifier(20, 200)
        input_data = np.random.random([2, 500, 200])
        self.assert_zoo_model_save_load(model, input_data)
Example #7
    def test_forward_backward(self):
        model = TextClassifier(10, 30, 100)
        model.summary()
        input_data = np.random.random([3, 100, 30])
        self.assert_forward_backward(model, input_data)
Example #8
    # tokens_rdd holds (token_list, label) pairs; pad() right-pads each token
    # list with "##" up to sequence_len (both names are assumed from context).
    padded_tokens_rdd = tokens_rdd.map(lambda tokens_label: (pad(
        tokens_label[0], "##", sequence_len), tokens_label[1]))
    vector_rdd = padded_tokens_rdd.map(lambda tokens_label: ([
        to_vec(w, filtered_word2vec_broadcast.value, token_length)
        for w in tokens_label[0]
    ], tokens_label[1]))
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], token_length))

    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        model = TextClassifier(CLASS_NUM, token_length,
                               sequence_len, options.encoder,
                               int(options.encoder_output_dim))

    optimizer = Optimizer(model=model,
                          training_rdd=train_rdd,
                          criterion=ClassNLLCriterion(logProbAsInput=False),
                          end_trigger=MaxEpoch(int(options.nb_epoch)),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=float(
                              options.learning_rate),
                                               learningrate_decay=0.001))
    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
                             trigger=EveryEpoch(),
                             val_method=[Top1Accuracy()])
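Example #8 calls three helpers (pad, to_vec, to_sample) and a broadcast word2vec dictionary that are not shown. The sketch below is one way those helpers could be written, inferred from how they are called above; the exact signatures and the zero-vector handling of unknown tokens are assumptions.

import itertools

import numpy as np
from bigdl.util.common import Sample


def pad(tokens, fill_value, width):
    # Truncate or right-pad a token list to exactly `width` entries.
    if len(tokens) >= width:
        return tokens[:width]
    return tokens + [fill_value] * (width - len(tokens))


def to_vec(token, word2vec, token_length):
    # Look up the embedding of a token; unknown tokens map to a zero vector.
    return word2vec.get(token, np.zeros(token_length))


def to_sample(vectors, label, token_length):
    # Flatten the per-token vectors into a (sequence_len, token_length) feature
    # matrix and wrap it together with the label as a BigDL Sample.
    features = np.array(list(itertools.chain(*vectors)),
                        dtype="float32").reshape([-1, token_length])
    return Sample.from_ndarray(features, np.array(label))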
Example #9
    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        if not (token_length == 50 or token_length == 100
                or token_length == 200 or token_length == 300):
            raise ValueError(
                'token_length for GloVe can only be 50, 100, 200, 300, but got '
                + str(token_length))
        embedding_file = data_path + "/glove.6B/glove.6B." + str(
            token_length) + "d.txt"
        word_index = {w: i_f[0] for w, i_f in word_meta.items()}
        model = TextClassifier(class_num, embedding_file, word_index,
                               sequence_len, options.encoder,
                               int(options.encoder_output_dim))

    optimizer = Optimizer(model=model,
                          training_rdd=train_rdd,
                          criterion=SparseCategoricalCrossEntropy(),
                          end_trigger=MaxEpoch(int(options.nb_epoch)),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=float(
                              options.learning_rate),
                                               learningrate_decay=0.001))
    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
                             trigger=EveryEpoch(),
                             val_method=[Accuracy()])
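Both Example #8 and Example #9 stop after configuring validation. A BigDL Optimizer does not run anything until optimize() is called, so the usual next step is simply:

# Run distributed training until the end trigger fires and return the trained model.
trained_model = optimizer.optimize()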
Example #10
    def test_save_load(self):
        model = TextClassifier(20, glove_path, sequence_length=100)
        input_data = np.random.randint(20, size=(3, 100))
        self.assert_zoo_model_save_load(model, input_data)
Example #11
    def test_forward_backward(self):
        model = TextClassifier(10, glove_path)
        model.summary()
        input_data = np.random.randint(20, size=(4, 500))
        self.assert_forward_backward(model, input_data)