Code Example #1
File: dl_runner.py    Project: tspannhw/sparkflow
def test_multi_partition_shuffle():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10)))
           for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10)))
            for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])

    mg = build_graph(create_random_model)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label',
                               partitionShuffles=2)
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
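The graph builder create_random_model referenced above is not shown in this snippet. A minimal sketch of what such a builder might look like, assuming a 10-dimensional input, a single sigmoid output layer named 'outer' (which yields the 'outer/Sigmoid:0' tensor used as tfOutput), and a mean-squared-error loss:

import tensorflow as tf

# Hypothetical sketch of the graph builder used above; layer sizes are assumptions,
# not the project's exact code. build_graph runs it in a fresh graph and serializes the result.
def create_random_model():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')   # matches the 10-dim features
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')    # binary label fed via tfLabel
    hidden = tf.layers.dense(x, 12, activation=tf.nn.relu)
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')  # 'outer/Sigmoid:0'
    loss = tf.losses.mean_squared_error(y, out)
    return loss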
Code Example #2
File: dl_runner.py    Project: tspannhw/sparkflow
def test_overlapping_guassians():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10)))
           for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10)))
            for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])

    first_graph = tf.Graph()
    with first_graph.as_default() as g:
        v = create_random_model()
        mg = json_format.MessageToJson(tf.train.export_meta_graph())

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=35,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label')

    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
Code Example #3
 def test_auto_encoder(self):
     processed = self.generate_random_data()
     mg = build_graph(SparkFlowTests.create_autoencoder)
     spark_model = SparkAsyncDL(inputCol='features',
                                tensorflowGraph=mg,
                                tfInput='x:0',
                                tfLabel=None,
                                tfOutput='out/Sigmoid:0',
                                tfOptimizer='adam',
                                tfLearningRate=.001,
                                iters=10,
                                predictionCol='predicted',
                                partitions=4,
                                miniBatchSize=10,
                                verbose=1)
     encoded = spark_model.fit(processed).transform(processed).take(10)
     print(encoded[0]['predicted'])
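SparkFlowTests.create_autoencoder is not shown here. Because tfLabel is None, the loss has to be defined against the input itself; a minimal sketch under that assumption, reusing the 10-dimensional random data of the earlier examples (layer sizes are illustrative):

# Hypothetical autoencoder graph for the example above; only the tensor names 'x:0' and
# 'out/Sigmoid:0' are taken from the snippet, the input size and layer widths are assumed.
def create_autoencoder():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    encoded = tf.layers.dense(x, 4, activation=tf.nn.relu)
    decoded = tf.layers.dense(encoded, 10, activation=tf.nn.sigmoid, name='out')  # 'out/Sigmoid:0'
    loss = tf.losses.mean_squared_error(x, decoded)  # reconstruct the input; no separate label
    return loss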
Code Example #4
 def test_save_model(self):
     processed = self.generate_random_data()
     mg = build_graph(SparkFlowTests.create_random_model)
     spark_model = SparkAsyncDL(inputCol='features',
                                tensorflowGraph=mg,
                                tfInput='x:0',
                                tfLabel='y:0',
                                tfOutput='outer/Sigmoid:0',
                                tfOptimizer='adam',
                                tfLearningRate=.1,
                                iters=20,
                                partitions=2,
                                predictionCol='predicted',
                                labelCol='label')
     fitted = spark_model.fit(processed)
     fitted.save('saved_model')
     model = SparkAsyncDLModel.load("saved_model")
     data = model.transform(processed).take(10)
     nb_errors = SparkFlowTests.calculate_errors(data)
     self.assertTrue(nb_errors < len(data))
Code Example #5
    def test_small_sparse(self):
        xor = [(0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
               (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
               (1.0, Vectors.sparse(2, [0], [1.0])),
               (1.0, Vectors.sparse(2, [1], [1.0]))]
        processed = self.spark.createDataFrame(xor, ["label", "features"])

        mg = build_graph(SparkFlowTests.create_model)
        spark_model = SparkAsyncDL(inputCol='features',
                                   tensorflowGraph=mg,
                                   tfInput='x:0',
                                   tfLabel='y:0',
                                   tfOutput='outer/Sigmoid:0',
                                   tfOptimizer='adam',
                                   tfLearningRate=.1,
                                   iters=35,
                                   partitions=2,
                                   predictionCol='predicted',
                                   labelCol='label')
        assert spark_model.fit(processed).transform(
            processed).collect() is not None
Code Example #6
def stage_result(data_train, message):
    """
	Input: Dataset
	Output: Pipeline model.
	"""
    #params_dict_LinearRegression = {'labelCol': 'price', 'featuresCol': '1_vector', 'predictionCol': 'prediction', 'aggregationDepth': 2, 'solver': 'auto', 'standardization': True, 'fitIntercept': True, 'elasticNetParam': 0, 'maxIter': 100, 'regParam': 0, 'tol': 1e-06, 'loss': 'squaredError', 'epsilon': 1.35}
    #modul_LinearRegression = LinearRegression(**params_dict_LinearRegression)

    import tensorflow as tf
    from pyspark.ml import Pipeline
    from sparkflow.pipeline_util import PysparkPipelineWrapper
    from sparkflow.graph_utils import build_graph
    from sparkflow.tensorflow_async import SparkAsyncDL

    # graph_model is assumed to be defined elsewhere in this project
    mg = build_graph(graph_model)
    params_dict_DNN = {
        "inputCol": "features",  # SparkAsyncDL takes inputCol, not featuresCol
    }
    params_dict_DNN["tensorflowGraph"] = mg
    modul_DNN = SparkAsyncDL(**params_dict_DNN)
    # modul_DNN = SparkAsyncDL(inputCol='features',
    #                          tensorflowGraph=mg,
    #                          tfInput='x:0',
    #                          tfLabel='y:0',
    #                          tfOutput='out:0',
    #                          tfLearningRate=.001,
    #                          iters=10,
    #                          predictionCol='predicted',
    #                          labelCol='labels',
    #                          verbose=1)

    #stages_pipeline = [modul_LinearRegression]
    stages_pipeline = [modul_DNN]

    pipeline = Pipeline(stages=stages_pipeline)
    model = pipeline.fit(data_train)

    prediction = model.transform(data_train)
    if message['output_col'] == 'all':
        data_prediction = prediction
    else:
        try:
            output_col = set(message['output_col'])
            prediction_col = set(prediction.columns)
            output_col = list(output_col.intersection(prediction_col))
            data_prediction = prediction.select(
                [col for col in prediction.columns if col in output_col])
        except Exception as er:
            raise er

    return model, data_prediction
Code Example #7
def test_overlapping_guassians():
    processed = generate_random_data()
    mg = build_graph(create_random_model)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=35,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label')
    handle_test(spark_model, processed)
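generate_random_data and handle_test are helpers from dl_runner.py that are not included in this snippet; based on the inline code in Code Examples #1 and #2 they can be reconstructed roughly as:

# Assumed helpers, reconstructed from the inline code in Code Examples #1 and #2.
def generate_random_data():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(200)]
    dat += [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(200)]
    random.shuffle(dat)
    return spark.createDataFrame(dat, ["label", "features"])

def handle_test(spark_model, processed):
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = sum(1 for d in data if d['predicted'][0] != d['label'])
    assert nb_errors < len(data)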
Code Example #8
 def test_multi_partition_shuffle(self):
     processed = self.generate_random_data()
     mg = build_graph(SparkFlowTests.create_random_model)
     spark_model = SparkAsyncDL(inputCol='features',
                                tensorflowGraph=mg,
                                tfInput='x:0',
                                tfLabel='y:0',
                                tfOutput='outer/Sigmoid:0',
                                tfOptimizer='adam',
                                tfLearningRate=.1,
                                iters=20,
                                partitions=2,
                                predictionCol='predicted',
                                labelCol='label',
                                partitionShuffles=2)
     self.handle_assertions(spark_model, processed)
Code Example #9
def stage_result():
    """
	Input: Dataset
	Output: Pipeline model.
	"""
    def graph_model():
        x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
        y = tf.placeholder(tf.float32, shape=[None, 10], name='y')

        layer0 = tf.layers.dense(x, 128, activation=tf.nn.relu)
        layer1 = tf.layers.dense(layer0, 128, activation=tf.nn.relu)
        layer2 = tf.layers.dense(layer1, 64, activation=tf.nn.relu)
        layer3 = tf.layers.dense(layer2, 32, activation=tf.nn.relu)

        out = tf.layers.dense(layer3, 10)
        z = tf.argmax(out, 1, name='out')
        loss = tf.losses.softmax_cross_entropy(y, out)
        return loss

    # The original dict listed 'labelCol' twice ('price' and 'labels'); only the last
    # value takes effect, so the duplicate key is dropped here.
    params_dict_DNN = {'tfLearningRate': 0.001, 'iters': 10,
                       'predictionCol': 'predicted', 'labelCol': 'labels', 'verbose': 1}

    params_dict_DNN["tensorflowGraph"] = build_graph(graph_model)
    print("tensorflowGraph type:", type(params_dict_DNN["tensorflowGraph"]))
    modul_DNN = SparkAsyncDL(**params_dict_DNN)

    stages_pipeline = [modul_DNN]

    pipeline = Pipeline(stages=stages_pipeline)
    #model = pipeline.fit(data_train)

    #prediction = model.transform(data_train)
    #if message['output_col'] == 'all':
    #    data_prediction = prediction
    #else:
    #    try:
    #        output_col = set(message['output_col'])
    #        prediction_col = set(prediction.columns)
    #        output_col = list(output_col.intersection(prediction_col))
    #        data_prediction = prediction.select([col for col in prediction.columns if col in output_col])
    #    except Exception as er:
    #        raise er

    return pipeline  #model, data_prediction
Code Example #10
File: dl_runner.py    Project: wh-forker/sparkflow
def test_rmsprop():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    options = build_rmsprop_config(learning_rate=0.1, decay=0.95, momentum=0.1, centered=False)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='rmsprop',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=options
    )
    handle_assertions(spark_model, processed)
Code Example #11
File: dl_runner.py    Project: wh-forker/sparkflow
def test_adam_optimizer_options():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    options = build_adam_config(learning_rate=0.1, beta1=0.85, beta2=0.98, epsilon=1e-8)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=options
    )
    handle_assertions(spark_model, processed)
Code Example #12
 def test_save_pipeline(self):
     processed = self.generate_random_data()
     mg = build_graph(SparkFlowTests.create_random_model)
     spark_model = SparkAsyncDL(inputCol='features',
                                tensorflowGraph=mg,
                                tfInput='x:0',
                                tfLabel='y:0',
                                tfOutput='outer/Sigmoid:0',
                                tfOptimizer='adam',
                                tfLearningRate=.1,
                                iters=20,
                                partitions=2,
                                predictionCol='predicted',
                                labelCol='label')
     p = Pipeline(stages=[spark_model]).fit(processed)
     p.write().overwrite().save('example_pipeline')
     p = PysparkPipelineWrapper.unwrap(
         PipelineModel.load('example_pipeline'))
     data = p.transform(processed).take(10)
     nb_errors = SparkFlowTests.calculate_errors(data)
     self.assertTrue(nb_errors < len(data))
Code Example #13
File: train.py    Project: jimmy-academia/Stock-Spark
    mg = build_graph(small_model)
    #Assemble and one hot encode
    va = VectorAssembler(inputCols=final_df.columns[1:151],
                         outputCol='features')
    encoded = OneHotEncoder(inputCol='result',
                            outputCol='labels',
                            dropLast=False)
    adam_config = build_adam_config(learning_rate=0.001,
                                    beta1=0.9,
                                    beta2=0.999)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='out:0',
                               tfLearningRate=.001,
                               iters=20,
                               predictionCol='predicted',
                               labelCol='labels',
                               verbose=1,
                               optimizerOptions=adam_config)

    ckptpath = os.path.join(ckptdir, task)
    print('saving model to', ckptpath)
    p = Pipeline(stages=[va, encoded, spark_model]).fit(final_df)
    p.write().overwrite().save(ckptpath)

    print('===task all done===')
Code Example #14
    out = tf.layers.dense(fc1, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss

# Build the graph
mg = build_graph(cnn_model)

spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
        miniStochasticIters=-1,
        shufflePerIter=True,
        iters=10,
        tfLearningRate=.001,
        predictionCol='predicted',
        labelCol='labels',
        verbose=1
    )


if __name__ == '__main__':

    from pyspark.ml.pipeline import Pipeline
    try:
        import time
        # Pipeline definition
Code Example #15
File: simple_dnn.py    Project: zhongkailv/sparkflow
    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
    encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False).transform(va).select(['features', 'labels'])

    #demonstration of options. Not all are required
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
        miniStochasticIters=-1,
        shufflePerIter=True,
        iters=20,
        predictionCol='predicted',
        labelCol='labels',
        partitions=4,
        verbose=1,
        optimizerOptions=adam_config
    )

    spark_model.fit(encoded).save('simple_dnn')
    x = SparkAsyncDLModel.load("simple_dnn").transform(encoded).take(10)
    print(x)
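small_model lives in the same simple_dnn.py file but is not included in this snippet. Given the 784 input columns, the 'y:0' label placeholder, and the 'out:0' prediction tensor, a sketch along the lines of the graph in Code Example #9 would be:

# Assumed sketch of small_model; hidden-layer sizes are illustrative, not the file's exact values.
def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 10)
    z = tf.argmax(out, 1, name='out')  # exposes the 'out:0' tensor used as tfOutput
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss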
Code Example #16
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[8]').config('spark.driver.memory', '4g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
    na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

    #demonstration of options. Not all are required
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel=None,
        tfOutput='out:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted',
        partitions=3,
        miniBatchSize=256,
        verbose=1
    ).fit(na)

    # spark_model is already fitted above, so save it directly instead of fitting again
    spark_model.save('auto_encoded')
    x = SparkAsyncDLModel.load("auto_encoded").transform(na).take(10)
    print(x)
Code Example #17
    layer1 = tf.layers.dense(x, 10, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 5, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 1, activation=tf.nn.sigmoid, name='out')
    loss = tf.losses.mean_squared_error(y, out)  # first argument is the true label, second the network output
    return loss


mg = build_graph(small_model)  # build the computation graph

spark_model = SparkAsyncDL(
    inputCol='features',  # input column
    labelCol='label',  # label column
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out/Sigmoid:0',
    tfLearningRate=0.001,  # learning rate of 0.001
    iters=50,  # train for 50 iterations
    predictionCol='probability',
    miniBatchSize=200,  # 200 samples per mini-batch
    verbose=1,
    tfOptimizer='adam'  # use the Adam optimizer
)

# Fit the model
pipeline = Pipeline(stages=[featuresCreator])
data_transformer = pipeline.fit(data_train)

ANN_model = spark_model.fit(
        data_transformer \
        .transform(data_train)
)
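featuresCreator and data_train are defined earlier in the original script and are not part of this snippet. A plausible stand-in for featuresCreator (an assumption, not the original code) is a VectorAssembler that builds the 'features' column:

# Hypothetical featuresCreator; the actual column names in data_train are not shown above.
from pyspark.ml.feature import VectorAssembler

featuresCreator = VectorAssembler(
    inputCols=[c for c in data_train.columns if c != 'label'],  # assumed layout: all non-label columns
    outputCol='features'
)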
Code Example #18
if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[4]').config('spark.driver.memory', '2g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('examples/mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
    na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

    #demonstration of options. Not all are required
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel=None,
        tfOutput='out/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.001,
        iters=10,
        predictionCol='predicted',
        partitions=4,
        miniBatchSize=256,
        verbose=1
    ).fit(na)

    t = spark_model.transform(na).take(1)
    print(t[0]['predicted'])