Example #1
#!/usr/bin/python
# -*- coding:utf-8 -*-
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
import datetime
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    time1 = datetime.datetime.now()
    sc = env.CreateSparkContext("MLPClassifier")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())

    print("===========DataProcess====================")

    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)

    count = 0
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
                  cat_dist[idx].labels[i])
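
The listing breaks off mid-script here. What follows is a hedged sketch of how this MLP pipeline would plausibly continue, mirroring the SplitData and VectorAssembler steps visible in the other examples; the derived column names and the layer sizes are illustrative assumptions, not values recovered from the original project.

    print("===========SplitData====================")
    train_df, test_df = df.randomSplit(env.split_prop)

    print("===========VectorAssembler====================")
    from pyspark.ml.feature import VectorAssembler
    # Assumption: the first label index is the target; every other column is a feature.
    label_col = df.schema.names[env.idx_label[0]]
    feature_cols = [c for c in df.schema.names if c != label_col]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    # Layer sizes are placeholders: input width, one hidden layer, output classes.
    mlp = MultilayerPerceptronClassifier(labelCol=label_col, featuresCol="features",
                                         layers=[len(feature_cols), 10, 2], maxIter=100)
    model = Pipeline(stages=[assembler, mlp]).fit(train_df)
    predictions = model.transform(test_df)

    evaluator = MulticlassClassificationEvaluator(labelCol=label_col,
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    print("accuracy =", evaluator.evaluate(predictions))
    # Confusion matrix via the RDD-based API the script imports.
    pairs = predictions.select("prediction", label_col).rdd.map(
        lambda r: (float(r[0]), float(r[1])))
    print(MulticlassMetrics(pairs).confusionMatrix().toArray())
    print("elapsed:", datetime.datetime.now() - time1)
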
Example #2
#!/usr/bin/python
# -*- coding:utf-8 -*-
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("SVMClassifier")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    label_name = [df.schema.names[i] for i in env.idx_label]

    print("===========DataProcess====================")
    df, cat_dist = dataprocess(df)

    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
                  cat_dist[idx].labels[i])

    print("===========SplitData====================")
    train_df, test_df = df.randomSplit(env.split_prop)

    print("===========VectorAssembler====================")
Example #3
#!/usr/bin/python
# -*- coding:utf-8 -*-
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("LogisticRegressionClassifier")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    
    print("===========DataProcess====================")
    
    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)

    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
                  cat_dist[idx].labels[i])
    
    print("===========SplitData====================")
Example #4
#!/usr/bin/python
# -*- coding:utf-8 -*-
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("DecisionTreeClassifier")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    label_name = [df.schema.names[i] for i in env.idx_label]

    print("===========DataProcess====================")

    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)

    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
                  cat_dist[idx].labels[i])
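
Truncated after the category printout. A sketch of a plausible continuation; maxDepth and the column derivation are assumptions. The trailing featureImportances line shows one thing a tree model offers beyond raw accuracy.

    print("===========SplitData====================")
    train_df, test_df = df.randomSplit(env.split_prop)

    from pyspark.ml.feature import VectorAssembler
    label_col = label_name[0]  # assumes dataprocess keeps the label column name
    feature_cols = [c for c in df.schema.names if c != label_col]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    dt = DecisionTreeClassifier(labelCol=label_col, featuresCol="features",
                                maxDepth=5)  # maxDepth is a placeholder
    model = Pipeline(stages=[assembler, dt]).fit(train_df)
    predictions = model.transform(test_df)

    acc = MulticlassClassificationEvaluator(labelCol=label_col,
                                            metricName="accuracy").evaluate(predictions)
    print("accuracy =", acc)
    print(model.stages[-1].featureImportances)  # per-feature split importance
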
Example #5
#!/usr/bin/python
# -*- coding:utf-8 -*-
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("NaiveBayesClassifier")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())

    print("===========DataProcess====================")

    if env.recode_label == True:
        df, cat_dist = dataprocess(df, True, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df, True)

    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
                  cat_dist[idx].labels[i])

    print("===========SplitData====================")
Example #6
#!/usr/bin/python
# -*- coding:utf-8 -*-
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import datetime
from pyspark.mllib.evaluation import MulticlassMetrics

if __name__ == "__main__":
    time1 = datetime.datetime.now()
    sc = env.CreateSparkContext("RandomForestClassifier")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())

    print("===========DataProcess====================")

    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)

    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
Example #7
#!/usr/bin/python
# -*- coding:utf-8 -*-
import sys
import ml_environment as env
from dataprocess_DF import *

from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

if __name__ == "__main__":
    print(sys.argv)
    sc = env.CreateSparkContext("LinearRegression")
    sqlContext = SQLContext(sc)
    env.Setargv()

    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    label_name = [df.schema.names[i] for i in env.idx_label]
    print("predicted_label=", label_name)

    print("===========DataProcess====================")
    df, cat_dist = dataprocess(df)

    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' +
                  cat_dist[idx].labels[i])

    print("===========SplitData====================")