def create_estimators(self):
    """Append a logistic-regression estimator to ``self.stages``.

    The estimator is configured with ``maxIter=10`` and ``regParam=0.01``;
    it reads its label from the ``Survived`` column and its features from
    the ``feature_data`` column.
    """
    estimator_options = {
        'maxIter': 10,
        'regParam': 0.01,
        'labelCol': 'Survived',
        'featuresCol': 'feature_data',
    }
    self.stages.append(cl.LogisticRegression(**estimator_options))
def classification_ml(use_sqlite=False):
    """Train, inspect and persist a logistic-regression pipeline on the iris data.

    The ``label`` column is string-indexed into ``label_int``, columns
    ``a1``..``a4`` are assembled into a ``features`` vector, and a logistic
    regression is fitted on a 70/30 random split (seed 666). Predictions
    for the test split are shown, then both the unfitted estimator and the
    fitted pipeline model are written to disk, reloaded and compared with
    their in-memory counterparts.

    :param use_sqlite: when true, read the ``iris`` table from ``iris.db``
        through the SQLite JDBC driver (the driver package is requested via
        ``spark.jars.packages``). When false (the default, and the only path
        that was reachable before this flag existed) read
        ``dataset/iris.csv`` with a header row and schema inference.
    """
    builder = SparkSession.builder.appName('classification-ml')
    if use_sqlite:
        spark = builder \
            .config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.23.1') \
            .getOrCreate()
        df = spark.read \
            .format('jdbc') \
            .option('url', 'jdbc:sqlite:iris.db') \
            .option('driver', 'org.sqlite.JDBC') \
            .option('dbtable', 'iris') \
            .load()
    else:
        spark = builder.getOrCreate()
        df = spark.read \
            .option('header', 'true') \
            .option('inferSchema', 'true') \
            .format('csv') \
            .load('dataset/iris.csv')
    spark.sparkContext.setLogLevel('WARN')
    df.show()

    # Only the four measurement columns are used as model inputs.
    feature_cols = ['a1', 'a2', 'a3', 'a4']

    stringIndexer = ml_feature.StringIndexer(inputCol='label', outputCol='label_int')
    featuresCreator = ml_feature.VectorAssembler(inputCols=feature_cols, outputCol='features')

    # Create a model wired to the outputs of the two feature stages.
    logistic = ml_classification.LogisticRegression(
        featuresCol=featuresCreator.getOutputCol(),
        labelCol=stringIndexer.getOutputCol(),
        maxIter=10,
        regParam=0.01,
    )

    # Create a pipeline.
    pipeline = Pipeline(stages=[stringIndexer, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    df_train, df_test = df.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(df_train)
    test_result = model.transform(df_test)
    test_result.show(truncate=False)

    # Save and reload the unfitted estimator; its params must round-trip.
    lr_path = './lr'
    logistic.write().overwrite().save(lr_path)
    lr2 = ml_classification.LogisticRegression.load(lr_path)
    print('Param =', lr2.getRegParam())

    # Save and reload the fitted pipeline; stage 2 is the logistic model.
    model_path = './lr_model'
    model.write().overwrite().save(model_path)
    model2 = PipelineModel.load(model_path)
    print('Stages =', model.stages)
    print(model.stages[2].coefficientMatrix == model2.stages[2].coefficientMatrix)
    print(model.stages[2].interceptVector == model2.stages[2].interceptVector)
def test_multi_model_pipe():
    """Fit a chain that fans out into four classifiers and check the row count.

    Stages are combined with the ``|`` operator (its implementation is not
    visible here -- presumably a pipeline-composition helper imported by
    the test module). The chain is tokenizer, count vectorizer, a tuple of
    four classifiers, a vector assembler writing ``final_features`` and a
    final logistic regression. Transforming the two-row input must yield
    two rows.
    """
    rows = [
        Row(sentence='this is a test', label=0.),
        Row(sentence='this is another test', label=1.),
    ]
    df = SPARK_SESSION.sparkContext.parallelize(rows).toDF()

    tokenizer = feature.Tokenizer().setInputCol('sentence')
    pl = tokenizer | feature.CountVectorizer()

    models = (
        classification.LogisticRegression(),
        classification.RandomForestClassifier(),
        classification.LogisticRegression().setElasticNetParam(0.2),
        classification.GBTClassifier(),
    )

    # Built left to right, exactly as the chained expression would evaluate.
    fanned_out = pl | models
    assembled = fanned_out | feature.VectorAssembler().setOutputCol('final_features')
    ml = assembled | classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def test_ml_pipe():
    """Fit tokenizer | count vectorizer | logistic regression on two rows.

    Stages are combined with the ``|`` operator (implementation not visible
    here). Transforming the two-row input with the fitted model must yield
    two rows.
    """
    rows = [
        Row(sentence='this is a test', label=0.),
        Row(sentence='this is another test', label=1.),
    ]
    df = sc.parallelize(rows).toDF()

    tokenizer = feature.Tokenizer().setInputCol('sentence')
    pl = tokenizer | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def test_stackedml_pipe():
    """Fit a chain with a one-element classifier tuple and check the row count.

    Stages are combined with the ``|`` operator (implementation not visible
    here). The chain is tokenizer, count vectorizer, a single-element tuple
    holding a logistic regression, a vector assembler and a random-forest
    classifier. Transforming the two-row input must yield two rows.
    """
    rows = [
        Row(sentence='this is a test', label=0.),
        Row(sentence='this is another test', label=1.),
    ]
    df = SPARK_SESSION.sparkContext.parallelize(rows).toDF()

    tokenizer = feature.Tokenizer().setInputCol('sentence')
    pl = tokenizer | feature.CountVectorizer()

    # Built left to right, exactly as the chained expression would evaluate.
    stacked = pl | (classification.LogisticRegression(),)
    assembled = stacked | feature.VectorAssembler()
    ml = assembled | classification.RandomForestClassifier()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def logistic_regression_text(df, input_col):
    """
    Fit a tokenizer | count vectorizer | logistic regression chain on a
    text column and apply the fitted chain back to the same DataFrame.

    :param df: Pyspark dataframe to analyze (checked with ``assert_spark_df``)
    :param input_col: Name of the column handed to the tokenizer
    :return: Tuple ``(df_model, ml_model)``: the transformed DataFrame and
        the fitted model that produced it.
    """
    assert_spark_df(df)

    tokenizer = feature.Tokenizer().setInputCol(input_col)
    featurizer = tokenizer | feature.CountVectorizer()
    chain = featurizer | classification.LogisticRegression()

    fitted = chain.fit(df)
    return fitted.transform(df), fitted
def model_build(df, feature_events, cov_event, model_name = 'logistic'):
    """Fit a logistic regression on a 70/30 split and report AUC and coefficients.

    The columns named in ``feature_events`` are assembled into a
    ``features`` vector, the model is fitted on the training split with
    ``label`` as the label column, and the area under the ROC curve is
    printed for the training and the test split, followed by a table of
    per-feature coefficients sorted in descending order.

    :param df: Spark DataFrame holding the feature columns and ``label``.
    :param feature_events: list of input column names for the assembler.
    :param cov_event: accepted for interface compatibility; not used here.
    :param model_name: model to build; only ``'logistic'`` is supported.
    :return: tuple ``(model, va, df_train, df_test)`` -- the fitted model,
        the vector assembler, and the *untransformed* train/test splits.
    :raises ValueError: if ``model_name`` is not ``'logistic'``.
    """
    # Fail fast: previously an unknown name fell through to the return
    # statement with `model` never assigned.
    if model_name != 'logistic':
        raise ValueError(
            "unsupported model_name {!r}; only 'logistic' is implemented".format(model_name))

    df_train, df_test = df.randomSplit([0.7, 0.3], 23333)
    va = VectorAssembler(inputCols=feature_events, outputCol='features')
    df_train_converted = va.transform(df_train)
    df_test_converted = va.transform(df_test)

    clf = classification.LogisticRegression(regParam=0.01, labelCol='label')
    model = clf.fit(df_train_converted)
    train_result = model.evaluate(df_train_converted)
    test_result = model.evaluate(df_test_converted)
    print(train_result.areaUnderROC)
    print(test_result.areaUnderROC)

    # Materialise the pairs: older pandas releases reject a bare zip iterator.
    fi = pd.DataFrame(list(zip(feature_events, model.coefficients.toArray())),
                      columns=['feature', 'importance'])
    print(fi.sort_values(by='importance', ascending=False))
    return (model, va, df_train, df_test)
# Build the schema from the (column name, Spark type) pairs in `labels`.
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])

# Read the data.
births = spark.read.csv('births_transformed.csv', header=True, schema=schema)

# Cast the string BIRTH_PLACE to int and the label to double.
births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(typ.IntegerType()))
births = births.withColumn('INFANT_ALIVE_AT_REPORT', births['INFANT_ALIVE_AT_REPORT'].cast(typ.DoubleType()))

# One-hot encode the birth place.
encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

# Combine all predictor columns into a single feature vector.
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
    outputCol='features')

# Select the top features by chi-squared test against the label.
selector = ft.ChiSqSelector(
    numTopFeatures=6,
    featuresCol=featuresCreator.getOutputCol(),
    outputCol='selectedFeatures',
    labelCol="INFANT_ALIVE_AT_REPORT")

# Split the data.
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

# Build the classification model: logistic regression.
logistic = cl.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')

# Create the parameter grid.
grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter, [2, 10, 50]) \
    .addGrid(logistic.regParam, [0.01, 0.05, 0.03]) \
    .build()

# Create the (transform-only) pipeline.
pipeline = Pipeline(stages=[encoder, featuresCreator, selector])

# Specify the evaluator.
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol="INFANT_ALIVE_AT_REPORT")

# Fit the transformer pipeline once on the training split and reuse it for
# both splits, instead of refitting it for the test data.
data_transformer = pipeline.fit(births_train)

# Train the cross-validated model.
cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
cvmodel = cv.fit(data_transformer.transform(births_train))

# Test the model.
test_model = cvmodel.transform(data_transformer.transform(births_test))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))
fea_pool = data.columns fea_pool.remove('y') ##featuerCreator: featuerCreator = ft.VectorAssembler(inputCols=fea_pool, outputCol='features') ##weightCol: data = data.withColumn('weight', fn.when(data['y'] == 1, 1.0).otherwise(0.02)) train, test = data.randomSplit([0.7, 0.3], seed=1234) #42 lr_model = cl.LogisticRegression( # maxIter=10, # regParam=0.01, elasticNetParam=0, family='binomial', threshold=0.5, weightCol='weight', labelCol='y') grid = tune.ParamGridBuilder()\ .addGrid(lr_model.maxIter,[200,300,500,800])\ .addGrid(lr_model.regParam,[0.001,0.002])\ .build() evaluator = ev.BinaryClassificationEvaluator( rawPredictionCol='probability', labelCol='y') cv = tune.CrossValidator(estimator=lr_model, estimatorParamMaps=grid, evaluator=evaluator,
def __init__(self,
             fit_data_prep_args: Optional[dict] = None,
             probability_estimator: Optional[ml.Estimator] = None,
             response_col: str = 'response'):
    r"""Store the data-preparation options, probability estimator and response column.

    Parameters
    ----------
    fit_data_prep_args: Optional[dict] = None,
        arguments around preparing the data to be fit
        default args are
            default_fit_data_prep_args = {
                'class_balance': 1,
                'train_prop': .8,
                'bin_features':True,
                'remove_redundant_features':True,
            }

        'class balance' is ratio of control_candidates : treatment
        to train the model on

        train_prop is the proportion of the population (post-rebalance)
        that is in the training set

        'bin_features' can be bool, dict, or absent.
        if you do not want to bin them here, they MUST be binned prior.
        Unbinned features will undermine validity of outcome.
        if bin_features is absent or True, bin_features will be run with
        default args. If it is a dict, it will be passed as kwargs to
        bin_features. see utils.bin_features for arg details

        'remove_redundant_features' can be bool, dict or absent
        True or absent will run remove redundant features with default
        args. Dict will be passed as kwargs instead.
        see utils.remove_redundant_features for arg details

        When omitted, a copy of the class-level defaults is stored so
        that later changes to this instance's args cannot leak into the
        shared defaults.

    probability_estimator: ml.Estimator = mlc.LogisticRegression
        default args are
            default_probability_estimator_args = {
                "featuresCol": "features",
                "labelCol": "label",
                "predictionCol": "prediction",
                "maxIter": 10,
                "regParam": .2,
                "elasticNetParam": 0,
                "fitIntercept": True,
                "probabilityCol": "probability",
                "family": "binomial"
            }
        Correct labelCol and featuresCol are crucial so special attention
        should be paid

    response_col: str ='response'
        column in df containing the response

    Raises
    ------
    UncaughtExceptions

    """
    if probability_estimator is None:
        probability_estimator = mlc.LogisticRegression(
            **self.default_probability_estimator_args)

    if fit_data_prep_args is None:
        # Copy rather than alias the class-level default dict.
        fit_data_prep_args = dict(self.default_fit_data_prep_args)

    self.fit_data_prep_args = fit_data_prep_args
    self.probability_estimator = probability_estimator
    self.response_col = response_col

    # set vals to None - will be correctly assigned in fit
    self.train_set = None
    self.test_set = None
    self.rebalanced_df = None
def skl_predict(spark): print (1111) data = [(list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 1])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, 
-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, 
-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])), (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])) ] labels = ['_1', '_2', '_3', '_4','_5','_6','_7','_8','_9','_10','_11','_12','_13','_14','_15','_16','_17','_18','_19','_20','_21','_22','_23','_24','_25','_26','_27','_28','_29','_30', 'INFANT_ALIVE_AT_REPORT'] df = spark.createDataFrame(data, schema = labels) # df = df.withColumn( "age", df['age']+1 ) 
df.show() # df.select("age").distinct().show() # df.count() # 列数据合并 from pyspark.sql.functions import split, explode, concat, concat_ws df_concat = df.withColumn("_concat", concat(df['_1'], df['_2'], df['_3'], df['_4'])) print ('df_concat>>>>>>>>>>>>>>>>>>>') df_concat.show() # 将所有的特征整和到一起 featuresCreator = ft.VectorAssembler( inputCols=[ col for col in labels], outputCol='features' ) # 创建评估器 import pyspark.ml.classification as cl logistic = cl.LogisticRegression( maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT') print ('logistic:', logistic) # 创建一个管道 from pyspark.ml import Pipeline pipeline = Pipeline(stages=[ featuresCreator, logistic ]) # fit births_train, births_test = df.randomSplit([0.7, 0.3], seed=666) print ('births_train', births_train) print ( 'births_test', births_test ) # 运行管道,评估模型。 model = pipeline.fit(births_train) test_model = model.transform(births_test) print ('test_model:', test_model) test_model.take(1) print ('test_model.take(1):', test_model.take(1)) '''
inputCols=[ col[0] for col in labels[2:]] + \ [encoder.getOutputCol()], outputCol='features' ) # In[6]: import pyspark.ml.classification as cl # In[7]: logistic = cl.LogisticRegression(maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT') # In[8]: from pyspark.ml import Pipeline pipeline = Pipeline(stages=[encoder, featuresCreator, logistic]) # In[9]: births_train, births_test = births.randomSplit([0.7, 0.3], seed=666) # In[10]: model = pipeline.fit(births_train)
# 将所有的特征整和到一起 featuresCreator = ft.VectorAssembler( inputCols=[ col[0] for col in labels[2:]] + \ [encoder.getOutputCol()], outputCol='features' ) print ('featuresCreator:', featuresCreator) # 创建评估器 import pyspark.ml.classification as cl logistic = cl.LogisticRegression( maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT') print ('logistic:', logistic) # 创建一个管道 from pyspark.ml import Pipeline pipeline = Pipeline(stages=[ encoder, featuresCreator, logistic ]) # fit births_train, births_test = births \
# Group by building age and show the row count per group.
data_sample.groupby('age').count().show()
data_sample.agg({'age': 'skewness'}).show()

# Summary statistics for floor count and height before/after the earthquake.
numerical = [
    'floors_before',
    'floors_after',
    'height_before',
    'height_after',
]
desc = data_sample.describe(numerical)
desc.show()

########################## Build the logistic regression model ##########################

## Parameter hyper-tuning

### Create an estimator
import pyspark.ml.classification as cl

# Hyper-parameters are deliberately left unset here; they are tuned by the
# grid search below.
logistic = cl.LogisticRegression(labelCol='label')

### Grid search
import pyspark.ml.tuning as tune

grid = (
    tune.ParamGridBuilder()
    .addGrid(logistic.maxIter, [10, 50, 80])
    .addGrid(logistic.regParam, [0.01, 0.001])
    .build()
)

### Create a pipeline
from pyspark.ml import Pipeline
# Columns that are assembled into the single 'features' vector.
feature_columns = [
    'cool',
    'funny',
    'useful',
    'is_open',
    'business_review_count',
    'business_stars',
    'average_stars',
    'fans',
    'user_review_count',
    'sentiment_score',
]
featuresCreator = ft.VectorAssembler(inputCols=feature_columns, outputCol='features')


# ### 4: Estimator Creation
# Select the machine learning model to use. An Estimator object is created
# that holds the model together with the hyper-parameters passed to it.
# Here, LogisticRegression is used.

# In[48]:

import pyspark.ml.classification as cl

logistic_regression_model = cl.LogisticRegression(
    maxIter=10,
    regParam=0.01,
    labelCol='review_stars',
    family='multinomial',
)
print(type(logistic_regression_model))


# ### 5: Pipeline Creation

# In[50]:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[featuresCreator, logistic_regression_model])
print(type(pipeline))


# ### 6: Dataset Splitting

# In[52]:
def train_validation_splitting_ml():
    """Tune a logistic regression on the births data with TrainValidationSplit.

    The births CSV is read with an explicit schema, ``BIRTH_PLACE`` is cast
    to int and one-hot encoded, the predictors are assembled and reduced to
    the top five features by chi-squared selection, and a grid over
    ``maxIter`` and ``regParam`` is searched with ``TrainValidationSplit``.
    The area under the ROC and PR curves on the held-out 30% split is
    printed.
    """
    spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # (column name, Spark type) pairs, in file order.
    integer_columns = (
        'MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE',
        'CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI',
        'MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN',
        'DIABETES_PRE', 'DIABETES_GEST',
        'HYP_TENS_PRE', 'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM',
    )
    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
    ]
    labels.extend((name, types.IntegerType()) for name in integer_columns)

    fields = [types.StructField(name, dtype, False) for name, dtype in labels]
    schema = types.StructType(fields)
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    birth_place_int = births['BIRTH_PLACE'].cast(types.IntegerType())
    births = births.withColumn('BIRTH_PLACE_INT', birth_place_int)

    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

    # Every column after the label and BIRTH_PLACE, plus the encoded vector.
    predictor_columns = [name for name, _ in labels[2:]]
    predictor_columns.append(encoder.getOutputCol())
    featuresCreator = ml_feature.VectorAssembler(inputCols=predictor_columns, outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Select only the top five features.
    selector = ml_feature.ChiSqSelector(
        numTopFeatures=5,
        featuresCol=featuresCreator.getOutputCol(),
        outputCol='selectedFeatures',
        labelCol='INFANT_ALIVE_AT_REPORT',
    )

    # Create a purely transforming Pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
    data_transformer = pipeline.fit(births_train)

    # Create LogisticRegression and the parameter grid.
    logistic = ml_classification.LogisticRegression(
        labelCol='INFANT_ALIVE_AT_REPORT',
        featuresCol='selectedFeatures',
    )
    grid_builder = tune.ParamGridBuilder()
    grid_builder = grid_builder.addGrid(logistic.maxIter, [2, 10, 50])
    grid_builder = grid_builder.addGrid(logistic.regParam, [0.01, 0.05, 0.3])
    grid = grid_builder.build()

    # Define a way of comparing the models.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability',
        labelCol='INFANT_ALIVE_AT_REPORT',
    )

    # Create a TrainValidationSplit object.
    tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

    # Fit our data to the model.
    prepared_train = data_transformer.transform(births_train)
    tvsModel = tvs.fit(prepared_train)
    prepared_test = data_transformer.transform(births_test)

    # Calculate results.
    results = tvsModel.transform(prepared_test)
    for metric in ('areaUnderROC', 'areaUnderPR'):
        print(evaluator.evaluate(results, {evaluator.metricName: metric}))
def infant_survival_ml():
    """Train, evaluate and persist a logistic-regression pipeline on the births data.

    The births CSV is read with an explicit schema, ``BIRTH_PLACE`` is cast
    to int and one-hot encoded, the predictors are assembled into a
    ``features`` vector and a logistic regression is fitted on a 70/30
    random split (seed 666). The area under the ROC and PR curves on the
    test split is printed. Both the unfitted ``Pipeline`` and the fitted
    ``PipelineModel`` are then saved, reloaded and exercised.
    """
    spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType()),
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))

    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

    # Use the same `ml_feature` alias as the encoder above; the previous
    # `ml_ft` alias is not used anywhere else in this code.
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(
        maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print(test_model.take(1))

    # Evaluate the performance of the model.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

    # Save the Pipeline definition.
    pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
    pipeline.write().overwrite().save(pipelinePath)

    # Load the Pipeline definition and check that it can still be fitted.
    loadedPipeline = Pipeline.load(pipelinePath)
    loadedPipeline.fit(births_train).transform(births_test).take(1)

    # Save the PipelineModel.
    modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
    model.write().overwrite().save(modelPath)

    # Load the PipelineModel.
    loadedPipelineModel = PipelineModel.load(modelPath)
    test_reloadedModel = loadedPipelineModel.transform(births_test)
    print(test_reloadedModel.take(1))
def semi_supervised_batch_single_classifier_generate_approach(data, featureCols=None, labelCol='used_label', predictionCol='prediction', *args, **kwargs):
    """
    A first approach to a semi-supervised learning method.

    Combines k-means with logistic regression: the number of clusters is
    the number of distinct non-NaN values in the label column, k-means
    assigns every row to a cluster, and a logistic regression is trained
    with the cluster assignment as its label. The whole pipeline is
    cross-validated over the logistic regression's ``regParam``.

    NOTE(review): the evaluator is a BinaryClassificationEvaluator, so this
    looks intended for two clusters -- confirm before using more labels.

    @input: data: spark dataframe whose label column may contain missing
        (NaN) labels
    @input: featureCols: list of feature column names (required)
    @input: labelCol: name of the (partially filled) label column
    @input: predictionCol: name of the logistic regression's output column
    @input: kwargs: ``folds`` (default 3) sets the number of
        cross-validation folds; ``clusterModel`` and ``classificationModel``
        are read but not acted on yet
    returns spark dataframe with classified data, with help from the
    clustering method
    """
    import numpy as np
    from pyspark.sql import DataFrame
    from pyspark.sql import functions as F
    from pyspark.ml import clustering
    from pyspark.ml import feature
    from pyspark.ml import Pipeline
    from pyspark.ml import classification
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

    # Check the type first: the column check below needs `data.columns`.
    assert isinstance(data, DataFrame), 'Data is not of type Spark.DataFrame, but {}'.format(type(data))
    assert labelCol in data.columns, 'Lables are missing please provide a label column!'
    assert featureCols is not None, 'Please give a list of features as string!'

    # TODO: use these to make the approach more dynamic; currently unused.
    cluster_model = kwargs.get('clusterModel', 'KMeans')
    classification_model = kwargs.get('classificationModel', 'LogisticRegression')

    # Number of distinct non-NaN labels. `np.nan` is used because the
    # `np.NaN` alias no longer exists in NumPy 2.0.
    k_clusters = (data
                  .filter((F.col(labelCol) != np.nan))
                  .groupBy(labelCol)
                  .count()
                  .count()
                  )
    print(k_clusters)

    # Feature vectorizer and k-means model is initialized here!
    feature_vector = feature.VectorAssembler(
        inputCols=featureCols,
        outputCol='features')

    k_means = clustering.KMeans(
        featuresCol=feature_vector.getOutputCol(),
        predictionCol='Kmeans_prediction',
        k=k_clusters)

    # Classification begins here! The k-means assignment is the label.
    log_reg = classification.LogisticRegression(
        featuresCol=feature_vector.getOutputCol(),
        labelCol=k_means.getPredictionCol(),
        predictionCol=predictionCol)

    # Pipeline get assembled here!
    pipeline = Pipeline(stages=[feature_vector, k_means, log_reg])

    # CrossValidation gets build here!
    param_grid = (ParamGridBuilder()
                  .addGrid(log_reg.regParam, [0.1, 0.01])
                  .build()
                  )
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol=log_reg.getRawPredictionCol(),
        labelCol=k_means.getPredictionCol())

    folds = kwargs.get('folds', 3)

    cross_validator = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=folds)

    evaluated_pipeline = cross_validator.fit(data)
    cluster_fitted_data = evaluated_pipeline.transform(data)
    return cluster_fitted_data
def impact(df: pyspark.sql.DataFrame,
           response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""observe impact of treatment on response variable

    currently response must be binary
    if the df is small enough return naive difference in groupby label
    response mean. otherwise do additional regression on response col
    with label as predictor and use its coefficient as a measure of its
    impact. binning and dimensionality reduction will occur if necessary
    to do an effective regression

    Parameters
    ----------
    df: pyspark.sql.DataFrame
    response_col: str
    prob_mod: mlc.Model
        propensity model, mostly used to keep track of feature_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate`-`treatment_rate` or may have further bias adjustement

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2, or when
        the label column does not contain both a 0 and a 1 group
    UncaughtExceptions

    See Also
    --------
    bin_features
    reduce_dimensionality

    Notes
    -----

    """
    logger = logging.getLogger(__name__)

    _persist_if_unpersisted(df)

    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()

    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logger.critical("somehow have less than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have less than MINIMUM_POS_COUNT*2 rows, this shouldnt be happening"
        )

    # dict because 1, 0 for label col are not guaranteed to be ordered.
    # Built from every group so a missing or extra label value is reported
    # explicitly instead of surfacing as an IndexError/KeyError.
    avg_col = "avg({col})".format(col=response_col)
    naive_response_dict = {
        row[label_col]: row[avg_col]
        for row in df.groupby(label_col).mean(response_col).collect()
    }
    if 0 not in naive_response_dict or 1 not in naive_response_dict:
        raise ValueError(
            "label column {lc!r} must contain both 0 and 1, found {found}".format(
                lc=label_col, found=list(naive_response_dict)))

    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[0]
    logger.info(
        "treatment_rate:{tr:.2f}   control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logger.info(
            "additional bias adjustment inapplicable, returning naive difference"
        )
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logger.info("additional bias adjustment possible")

    # choose fewer features if appropriate to prevent overfit. round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logger.info("need max {n:,} predictors".format(n=num_preds))

    if num_preds < len(list(pred_cols)):
        logger.info(
            "desired predictors {np:,} is less than existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    # Regress the response on the predictors plus the treatment label; the
    # label's coefficient drives the adjusted response below.
    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r,
                                      outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)

    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))

    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logger.info(
        "bias asjusted response is {ar:.2f}".format(ar=adjusted_response))
    return treatment_rate, control_rate, adjusted_response
def hyper_parameter_optimization_ml():
    """Grid-search a logistic regression on the births data with cross-validation.

    The births CSV is read with an explicit schema, ``BIRTH_PLACE`` is cast
    to int and one-hot encoded, the predictors are assembled into a
    ``features`` vector, and a grid over ``maxIter`` and ``regParam`` is
    searched with ``CrossValidator``. The area under the ROC and PR curves
    on the held-out 30% split is printed, followed by the parameter
    combination with the highest average cross-validation metric.
    """
    spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # (column name, Spark type) pairs, in file order.
    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
    ]
    for column_name in (
            'MOTHER_AGE_YEARS', 'FATHER_COMBINED_AGE',
            'CIG_BEFORE', 'CIG_1_TRI', 'CIG_2_TRI', 'CIG_3_TRI',
            'MOTHER_HEIGHT_IN', 'MOTHER_PRE_WEIGHT',
            'MOTHER_DELIVERY_WEIGHT', 'MOTHER_WEIGHT_GAIN',
            'DIABETES_PRE', 'DIABETES_GEST',
            'HYP_TENS_PRE', 'HYP_TENS_GEST',
            'PREV_BIRTH_PRETERM'):
        labels.append((column_name, types.IntegerType()))

    schema = types.StructType(
        [types.StructField(name, dtype, False) for name, dtype in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn(
        'BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))

    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

    # Every column after the label and BIRTH_PLACE, plus the encoded vector.
    assembler_inputs = [name for name, _ in labels[2:]] + [encoder.getOutputCol()]
    featuresCreator = ml_feature.VectorAssembler(inputCols=assembler_inputs, outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Create a purely transforming Pipeline.
    data_transformer = Pipeline(stages=[encoder, featuresCreator]).fit(births_train)

    # Specify our model and the list of parameters we want to loop through.
    logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
    grid = (
        tune.ParamGridBuilder()
        .addGrid(logistic.maxIter, [2, 10, 50])
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
        .build()
    )

    # Define a way of comparing the models.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability',
        labelCol='INFANT_ALIVE_AT_REPORT',
    )

    # Create a logic that will do the validation work.
    cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(data_transformer.transform(births_train))

    # See if cvModel performed better than our previous model
    prepared_test = data_transformer.transform(births_test)
    predictions = cvModel.transform(prepared_test)
    for metric_name in ('areaUnderROC', 'areaUnderPR'):
        print(evaluator.evaluate(predictions, {evaluator.metricName: metric_name}))

    # Parameters which the best model has.
    scored_grid = []
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
        named_params = [{param.name: value} for param, value in params.items()]
        scored_grid.append((named_params, metric))
    print(sorted(scored_grid, key=lambda entry: entry[1], reverse=True)[0])