Example 1: a test case that exercises DistributedTextSet preprocessing, TextClassifier training, prediction, and model save/load
    def test_distributed_textset_integration(self):
        texts_rdd = self.sc.parallelize(self.texts)
        labels_rdd = self.sc.parallelize(self.labels)
        distributed_set = DistributedTextSet(texts_rdd, labels_rdd)
        assert distributed_set.is_distributed()
        assert not distributed_set.is_local()
        assert distributed_set.get_texts().collect() == self.texts
        assert distributed_set.get_labels().collect() == self.labels

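        # Randomly split the distributed set in half; together the two splits
        # should still cover every original text.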
        sets = distributed_set.random_split([0.5, 0.5])
        train_texts = sets[0].get_texts().collect()
        test_texts = sets[1].get_texts().collect()
        assert set(train_texts + test_texts) == set(self.texts)

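        # Preprocessing pipeline: tokenize -> normalize -> map words to indices
        # -> pad/truncate each sequence to length 5 -> wrap into Samples.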
        tokenized = Tokenizer()(distributed_set)
        transformed = tokenized.normalize().word2idx().shape_sequence(
            5).generate_sample()
        word_index = transformed.get_word_index()
        assert len(word_index) == 14
        samples = transformed.get_samples().collect()
        assert len(samples) == 3
        for sample in samples:
            assert sample.feature.shape[0] == 5

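        # Persist the word index, reload it into a fresh TextSet and check that
        # the resulting Samples match the ones from the original pipeline.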
        vocab_file = create_tmp_path() + ".txt"
        transformed.save_word_index(vocab_file)
        distributed_set2 = DistributedTextSet(texts_rdd, labels_rdd)
        distributed_set2.load_word_index(vocab_file)
        transformed2 = distributed_set2.tokenize().normalize().word2idx()\
            .shape_sequence(5).generate_sample()
        samples2 = transformed2.get_samples().collect()
        for s1, s2 in zip(samples, samples2):
            assert np.allclose(s1.feature.to_ndarray(),
                               s2.feature.to_ndarray())
        os.remove(vocab_file)

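        # Train a TextClassifier (5 classes, GloVe embeddings, LSTM encoder)
        # on the preprocessed TextSet and run distributed prediction.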
        model = TextClassifier(5,
                               self.glove_path,
                               word_index,
                               5,
                               encoder="lstm")
        model.compile(SGD(), SparseCategoricalCrossEntropy())
        model.fit(transformed, batch_size=2, nb_epoch=2)
        res_set = model.predict(transformed, batch_per_thread=2)
        predicts = res_set.get_predicts().collect()
        for predict in predicts:
            assert len(predict) == 1
            assert predict[0].shape == (5, )

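        # Save the trained model to disk, load it back and make sure the
        # reloaded model produces the same number of predictions.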
        tmp_path = create_tmp_path() + ".bigdl"
        model.save_model(tmp_path, over_write=True)
        loaded_model = TextClassifier.load_model(tmp_path)
        loaded_res_set = loaded_model.predict(transformed, batch_per_thread=2)
        loaded_predicts = loaded_res_set.get_predicts().collect()
        assert len(loaded_predicts) == len(predicts)
        os.remove(tmp_path)
Example 2: training a Transformer text classifier on IMDB with the Orca BigDL Estimator
def bigdl_estimator():
    from zoo.orca.learn.bigdl.estimator import Estimator
    from tensorflow.python.keras.datasets import imdb
    from tensorflow.python.keras.preprocessing import sequence
    from zoo.pipeline.api.keras.models import Model
    from zoo.pipeline.api.keras.objectives import SparseCategoricalCrossEntropy
    from zoo.orca.data import XShards
    from zoo.orca.learn.metrics import Accuracy
    import numpy as np
    # Additional imports assumed for the layers and optimizer used below
    # (Analytics Zoo Keras-style API); adjust the modules to your installed version.
    from zoo.orca import init_orca_context, stop_orca_context
    from zoo.pipeline.api.keras.layers import (Input, Dense, Dropout,
                                               GlobalAveragePooling1D,
                                               SelectTable, TransformerLayer)
    from bigdl.optim.optimizer import Adam

    # conf = {"spark.executor.extraJavaOptions": "-Xss512m", "spark.driver.extraJavaOptions": "-Xss512m"}

    # init_orca_context(cluster_mode="local", cores=8, memory="16g")
    init_orca_context(cluster_mode="local", cores=4, memory="16g")
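    # Cap the vocabulary at the 200 most frequent words and pad/truncate
    # each review to 20 tokens.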
    max_features = 200
    max_len = 20

    print("running bigdl estimator")

    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    x_test = x_test[-1000:]
    y_test = y_test[-1000:]
    
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

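    # The Transformer takes two inputs, token ids and absolute positions, so
    # build a [0, 1, ..., max_len-1] position row for every sequence.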
    train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
    val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
    for i in range(0, len(x_train)):
        train_pos[i, :] = np.arange(max_len)
    for i in range(0, len(x_test)):
        val_pos[i, :] = np.arange(max_len)

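    # Wrap the numpy arrays into XShards so Orca can distribute the data.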
    train_dataset = XShards.partition({"x": (x_train, train_pos), "y": np.array(y_train)})
    val_dataset = XShards.partition({"x": (x_test, val_pos), "y": np.array(y_test)})

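    # Keras-style functional model: Transformer encoder, take the sequence
    # output, average-pool over time, dropout, then a 2-way softmax classifier.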
    token_shape = (max_len,)
    position_shape = (max_len,)
    token_input = Input(shape=token_shape)
    position_input = Input(shape=position_shape)
    O_seq = TransformerLayer.init(vocab=max_features, hidden_size=128, n_head=8, seq_len=max_len)([token_input, position_input])
    # Select the first output of the Transformer. The second is the pooled output.
    O_seq = SelectTable(0)(O_seq)
    O_seq = GlobalAveragePooling1D()(O_seq)
    O_seq = Dropout(0.2)(O_seq)
    outputs = Dense(2, activation='softmax')(O_seq)

    model = Model([token_input, position_input], outputs)
    model.summary()
    batch_size = 64
    print("Train started")
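    # Wrap the model in an Orca Estimator and train twice: once with constant
    # gradient clipping, then again with L2-norm gradient clipping.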
    est = Estimator.from_bigdl(model=model, loss=SparseCategoricalCrossEntropy(), optimizer=Adam(), metrics=[Accuracy()])
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    result = est.evaluate(val_dataset)
    print(result)
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.5)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    print("Train finished") 
    
    print("Evaluating started")
    result = est.evaluate(val_dataset)
    print(result)
    print("Evaluating finished")
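    # Exercise the model save API and the training/validation summary APIs.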
    est.save('work/saved_model')
    # est.load('work/saved_model')
    print("load and save API finished")

    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    print("get summary API finished")


    stop_orca_context()
Example 3: training a TextClassifier with BigDL's Optimizer, with validation and TensorBoard summaries
    sample_rdd = vector_rdd.map(lambda vectors_label: to_sample(
        vectors_label[0], vectors_label[1], token_length))

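    # Split the Sample RDD into training and validation sets.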
    train_rdd, val_rdd = sample_rdd.randomSplit(
        [training_split, 1 - training_split])

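    # Either resume from a previously saved model or build a fresh
    # TextClassifier with the requested encoder.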
    if options.model:
        model = TextClassifier.load_model(options.model)
    else:
        model = TextClassifier(CLASS_NUM, token_length,
                               sequence_len, options.encoder,
                               int(options.encoder_output_dim))

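    # Distributed training with BigDL's Optimizer: Adagrad with learning-rate
    # decay, SparseCategoricalCrossEntropy loss, and accuracy measured on the
    # validation RDD after every epoch.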
    optimizer = Optimizer(model=model,
                          training_rdd=train_rdd,
                          criterion=SparseCategoricalCrossEntropy(),
                          end_trigger=MaxEpoch(int(options.nb_epoch)),
                          batch_size=batch_size,
                          optim_method=Adagrad(learningrate=float(
                              options.learning_rate),
                                               learningrate_decay=0.001))
    optimizer.set_validation(batch_size=batch_size,
                             val_rdd=val_rdd,
                             trigger=EveryEpoch(),
                             val_method=[Accuracy()])

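    # Write training/validation summaries for TensorBoard; dump parameter
    # summaries every 50 iterations.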
    log_dir = options.log_dir
    app_name = 'adam-' + dt.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_summary = TrainSummary(log_dir=log_dir, app_name=app_name)
    train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
    val_summary = ValidationSummary(log_dir=log_dir, app_name=app_name)
Example 4: fitting and evaluating a Transformer classifier with the Orca BigDL Estimator
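# This fragment assumes token_shape, position_shape, max_features, max_len,
# train_dataset and val_dataset were defined earlier (as in Example 2).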
token_input = Input(shape=token_shape)
position_input = Input(shape=position_shape)
O_seq = TransformerLayer.init(vocab=max_features,
                              hidden_size=128,
                              n_head=8,
                              seq_len=max_len)([token_input, position_input])
# Select the first output of the Transformer. The second is the pooled output.
O_seq = SelectTable(0)(O_seq)
O_seq = GlobalAveragePooling1D()(O_seq)
O_seq = Dropout(0.2)(O_seq)
outputs = Dense(2, activation='softmax')(O_seq)

model = Model([token_input, position_input], outputs)
model.summary()
batch_size = 128
print('Train...')

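# Train for one epoch with the Orca Estimator, then evaluate on the
# validation XShards.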
est = Estimator.from_bigdl(model=model,
                           loss=SparseCategoricalCrossEntropy(),
                           optimizer=Adam(),
                           metrics=[Accuracy()])
est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
print("Train finished.")

print('Evaluating...')
result = est.evaluate(val_dataset)
print(result)

print("finished...")
stop_orca_context()