#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a multilayer-perceptron classifier with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here.
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
import datetime
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    time1 = datetime.datetime.now()  # wall-clock start, presumably to time the run
    sc = env.CreateSparkContext("MLPClassifier")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    print("===========DataProcess====================")
    # Optionally recode the label columns during preprocessing.
    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)
    count = 0
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            # NOTE(review): source truncated mid-loop; body reconstructed from
            # the identical pattern in the sibling classifier scripts -- confirm.
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a linear SVM classifier with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here. Source is truncated after
# the VectorAssembler banner.
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("SVMClassifier")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    # Names of the label column(s), selected by index.
    # NOTE(review): "lable_name" is a typo for "label_name", kept because the
    # unseen remainder of this script may reference it.
    lable_name = [df.schema.names[i] for i in env.idx_label]
    print("===========DataProcess====================")
    df, cat_dist = dataprocess(df)
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
    print("===========SplitData====================")
    # Random train/test split with proportions configured in ml_environment.
    train_df, test_df = df.randomSplit(env.split_prop)
    print("===========VectorAssembler====================")
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a logistic-regression classifier with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here. Source is truncated after
# the SplitData banner.
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("LogisticRegressionClassifier")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    print("===========DataProcess====================")
    # Optionally recode the label columns during preprocessing.
    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
    print("===========SplitData====================")
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a decision-tree classifier with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here. Source is truncated after
# the category-encoding echo loop.
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("DecisionTreeClassifier")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    # Names of the label column(s), selected by index.
    # NOTE(review): "lable_name" is a typo for "label_name", kept because the
    # unseen remainder of this script may reference it.
    lable_name = [df.schema.names[i] for i in env.idx_label]
    print("===========DataProcess====================")
    # Optionally recode the label columns during preprocessing.
    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a naive-Bayes classifier with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here. Source is truncated after
# the SplitData banner.
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

if __name__ == "__main__":
    sc = env.CreateSparkContext("NaiveBayesClassifier")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    print("===========DataProcess====================")
    # Optionally recode the label columns during preprocessing.
    # NOTE(review): unlike the sibling scripts, this one passes an extra
    # positional True to dataprocess -- presumably a NaiveBayes-specific flag
    # (e.g. forcing non-negative features); confirm against dataprocess_DF.
    if env.recode_label == True:
        df, cat_dist = dataprocess(df, True, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df, True)
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
    print("===========SplitData====================")
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a random-forest classifier with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here.
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import datetime
from pyspark.mllib.evaluation import MulticlassMetrics

if __name__ == "__main__":
    time1 = datetime.datetime.now()  # wall-clock start, presumably to time the run
    sc = env.CreateSparkContext("RandomForestClassifier")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    print("===========DataProcess====================")
    # Optionally recode the label columns during preprocessing.
    if env.recode_label == True:
        df, cat_dist = dataprocess(df, recode=env.idx_label)
    else:
        df, cat_dist = dataprocess(df)
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            # NOTE(review): source truncated mid-expression; tail completed from
            # the identical pattern in the sibling classifier scripts -- confirm.
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Driver script: train/evaluate a linear-regression model with Spark ML.
# NOTE(review): the original file had this whole script collapsed onto a single
# '#!'-prefixed line (one giant comment), so none of it could execute;
# reconstructed into properly formatted Python here. Source is truncated after
# the SplitData banner.
import sys
import ml_environment as env
from dataprocess_DF import *
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

if __name__ == "__main__":
    print(sys.argv)  # echo CLI arguments before env.Setargv() consumes them
    sc = env.CreateSparkContext("LinearRegression")
    sqlContext = SQLContext(sc)  # SQLContext comes in via dataprocess_DF's star import
    env.Setargv()
    print("===========LoadData====================")
    df = Load_data(sqlContext)
    print(df.count())
    # Names of the label column(s), selected by index.
    # NOTE(review): "lable_name" is a typo for "label_name", kept because the
    # unseen remainder of this script may reference it.
    lable_name = [df.schema.names[i] for i in env.idx_label]
    print("predicted_label=", lable_name)
    print("===========DataProcess====================")
    df, cat_dist = dataprocess(df)
    # Echo the fitted category encodings (index -> original label string).
    for idx in range(0, len(env.idx_cat)):
        for i in range(0, len(cat_dist[idx].labels)):
            print("idx_cat" + str(env.idx_cat[idx]) + " " + str(i) + ':' + cat_dist[idx].labels[i])
    print("===========SplitData====================")