Code example #1
File: dl_runner.py  Project: tspannhw/sparkflow
# Assumed imports and context (not shown on the original page); a
# SparkSession bound to `spark` is also assumed to exist:
import random
import numpy as np
import tensorflow as tf
from google.protobuf import json_format
from pyspark.ml.linalg import Vectors
from sparkflow.tensorflow_async import SparkAsyncDL, SparkAsyncDLModel
from sparkflow.graph_utils import build_graph, build_adam_config

def test_overlapping_gaussians():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10)))
           for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10)))
            for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])

    # Build the graph by hand and serialize it to MetaGraph JSON;
    # build_graph in the later examples wraps these same steps.
    first_graph = tf.Graph()
    with first_graph.as_default():
        create_random_model()
        mg = json_format.MessageToJson(tf.train.export_meta_graph())

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=35,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label')

    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0  # count exact label/prediction mismatches (loose sanity check)
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
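The helper create_random_model is defined elsewhere in the project's test
suite. A minimal sketch of what it has to provide, assuming a TensorFlow 1.x
graph whose tensor names match the ones the test references ('x:0', 'y:0',
'outer/Sigmoid:0'); the real helper may differ:

def create_random_model():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    hidden = tf.layers.dense(x, 12, activation=tf.nn.relu)
    # Naming the layer 'outer' puts its activation op at 'outer/Sigmoid'
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')
    # The sparkflow examples have the model function define and return a
    # loss tensor for the chosen optimizer to minimize
    return tf.losses.mean_squared_error(y, out)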
Code example #2
File: dl_runner.py  Project: tspannhw/sparkflow
def test_multi_partition_shuffle():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10)))
           for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10)))
            for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])

    mg = build_graph(create_random_model)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label',
                               partitionShuffles=2)
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
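Unlike example #1, this test builds the MetaGraph JSON with sparkflow's
build_graph helper (from sparkflow.graph_utils), and partitionShuffles=2
asks sparkflow to re-shuffle the data across partitions and train again
after the first pass. Roughly, and as a sketch rather than the library's
exact source, build_graph wraps the manual export from example #1:

import tensorflow as tf
from google.protobuf import json_format

def build_graph(model_fn):
    graph = tf.Graph()
    with graph.as_default():
        model_fn()
        # Serialize the freshly built graph to MetaGraph JSON
        return json_format.MessageToJson(tf.train.export_meta_graph())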
Code example #3
def test_auto_encoder(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_autoencoder)
    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel=None,
                               tfOutput='out/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.001,
                               iters=10,
                               predictionCol='predicted',
                               partitions=4,
                               miniBatchSize=10,
                               verbose=1)
    encoded = spark_model.fit(processed).transform(processed).take(10)
    print(encoded[0]['predicted'])
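With tfLabel=None there is no label tensor, so the graph's loss must be
computed against the input itself, which is how the autoencoder trains.
SparkFlowTests.create_autoencoder is not shown on this page; a sketch
consistent with the tensor names above (the 10-unit input width matches the
random data in the other tests, but is an assumption):

import tensorflow as tf

def create_autoencoder():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    encoded = tf.layers.dense(x, 4, activation=tf.nn.relu)
    # Naming the layer 'out' exposes the reconstruction as 'out/Sigmoid:0'
    decoded = tf.layers.dense(encoded, 10, activation=tf.nn.sigmoid, name='out')
    # Reconstruction loss against the input, since there is no label
    return tf.losses.mean_squared_error(x, decoded)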
Code example #4
def test_save_model(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=2,
                               predictionCol='predicted',
                               labelCol='label')
    fitted = spark_model.fit(processed)
    fitted.save('saved_model')
    model = SparkAsyncDLModel.load("saved_model")
    data = model.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
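SparkFlowTests.calculate_errors is not shown either, but examples #1 and #2
inline the same check, so it is presumably equivalent to:

def calculate_errors(data):
    # Count rows whose first predicted value differs from the label
    nb_errors = 0
    for d in data:
        if d['predicted'][0] != d['label']:
            nb_errors += 1
    return nb_errors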
Code example #5
    def test_small_sparse(self):
        xor = [(0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
               (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
               (1.0, Vectors.sparse(2, [0], [1.0])),
               (1.0, Vectors.sparse(2, [1], [1.0]))]
        processed = self.spark.createDataFrame(xor, ["label", "features"])

        mg = build_graph(SparkFlowTests.create_model)
        spark_model = SparkAsyncDL(inputCol='features',
                                   tensorflowGraph=mg,
                                   tfInput='x:0',
                                   tfLabel='y:0',
                                   tfOutput='outer/Sigmoid:0',
                                   tfOptimizer='adam',
                                   tfLearningRate=.1,
                                   iters=35,
                                   partitions=2,
                                   predictionCol='predicted',
                                   labelCol='label')
        assert spark_model.fit(processed).transform(
            processed).collect() is not None
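SparkFlowTests.create_model is referenced but not defined here. Given the
two-dimensional sparse XOR input and the tensor names in the test, a
plausible sketch (again an assumption, not the project's exact code):

import tensorflow as tf

def create_model():
    x = tf.placeholder(tf.float32, shape=[None, 2], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    hidden = tf.layers.dense(x, 12, activation=tf.nn.relu)
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')
    return tf.losses.mean_squared_error(y, out)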
Code example #6
File: simple_dnn.py  Project: zhongkailv/sparkflow
    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
    encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False).transform(va).select(['features', 'labels'])

    # Demonstration of options; not all are required.
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
        miniStochasticIters=-1,
        shufflePerIter=True,
        iters=20,
        predictionCol='predicted',
        labelCol='labels',
        partitions=4,
        verbose=1,
        optimizerOptions=adam_config
    )

    spark_model.fit(encoded).save('simple_dnn')
    x = SparkAsyncDLModel.load("simple_dnn").transform(encoded).take(10)
    print(x)
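build_graph and build_adam_config both come from sparkflow.graph_utils;
build_adam_config packages the Adam hyperparameters (learning rate and the
beta terms) for the optimizerOptions argument. small_model is defined in the
sparkflow examples; a sketch of a compatible MNIST classifier, assuming 784
input pixels and the tensor names used above:

import tensorflow as tf

def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    logits = tf.layers.dense(layer2, 10)
    # 'out:0' above refers to this predicted class index
    tf.argmax(logits, 1, name='out')
    return tf.losses.softmax_cross_entropy(y, logits)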
Code example #7
# The opening of this snippet was truncated; it configures a SparkAsyncDL
# estimator, presumably along these lines (inputCol, tensorflowGraph and
# tfInput reconstructed by analogy with the earlier examples):
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out/Sigmoid:0',
    tfLearningRate=0.001,  # learning rate of 0.001
    iters=50,  # train for 50 iterations
    predictionCol='probability',
    miniBatchSize=200,  # 200 rows per mini-batch
    verbose=1,
    tfOptimizer='adam'  # use the Adam optimizer
)

# Fit the model
pipeline = Pipeline(stages=[featuresCreator])
data_transformer = pipeline.fit(data_train)

ANN_model = spark_model.fit(
    data_transformer.transform(data_train)
)

# Model prediction
prediction = ANN_model.transform(
    data_transformer.transform(data_test)
)
results = prediction.select('id', 'label', 'probability')

# Threshold the raw sigmoid output at 0.5 to get a hard class label
# (renamed from `prediction` so the DataFrame above is not shadowed)
from pyspark.sql.functions import col, when
pred_label = when(col('probability') > 0.5, 1.0).otherwise(0.0)
results = results.withColumn('prediction', pred_label)

# Show the predictions (first 10 rows)
results.show(10)
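featuresCreator is defined earlier in the original post; from the Pipeline
usage it is presumably a VectorAssembler along these lines (the column
names here are placeholders, not the post's actual schema):

from pyspark.ml.feature import VectorAssembler

feature_columns = ['x1', 'x2', 'x3']  # placeholder input column names
featuresCreator = VectorAssembler(inputCols=feature_columns,
                                  outputCol='features')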
Code example #8
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[8]').config('spark.driver.memory', '4g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
    na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

    # Demonstration of options; not all are required.
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel=None,
        tfOutput='out:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted',
        partitions=3,
        miniBatchSize=256,
        verbose=1
    )

    spark_model.fit(na).save('auto_encoded')
    x = SparkAsyncDLModel.load("auto_encoded").transform(na).take(10)
    print(x)
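Since tfLabel=None, small_model here must be an unsupervised graph, unlike
the classifier of the same name in example #6; the model reconstructs its
L1-normalized input. A sketch consistent with the tensor names used here:

import tensorflow as tf

def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    encoded = tf.layers.dense(x, 64, activation=tf.nn.relu)
    decoded = tf.layers.dense(encoded, 784, activation=tf.nn.sigmoid)
    tf.identity(decoded, name='out')  # exposed as 'out:0'
    return tf.losses.mean_squared_error(x, decoded)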