Example #1
    def create_estimators(self):

        logistic = cl.LogisticRegression(maxIter=10,
                                         regParam=0.01,
                                         labelCol='Survived',
                                         featuresCol='feature_data')
        self.stages.append(logistic)
Example #2
def classification_ml():
	if False:  # set to True to read the data from SQLite via JDBC instead of the CSV file
		spark = SparkSession.builder.appName('classification-ml') \
			.config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.23.1') \
			.getOrCreate()

		df = spark.read \
			.format('jdbc') \
			.option('url', 'jdbc:sqlite:iris.db') \
			.option('driver', 'org.sqlite.JDBC') \
			.option('dbtable', 'iris') \
			.load()
	else:
		spark = SparkSession.builder.appName('classification-ml').getOrCreate()
		df = spark.read.option('header', 'true').option('inferSchema', 'true').format('csv').load('dataset/iris.csv')
	spark.sparkContext.setLogLevel('WARN')
	df.show()

	labels = [
		('index', types.IntegerType()),
		('a1', types.FloatType()),
		('a2', types.FloatType()),
		('a3', types.FloatType()),
		('a4', types.FloatType()),
		('id', types.StringType()),
		('label', types.StringType())
	]

	stringIndexer = ml_feature.StringIndexer(inputCol='label', outputCol='label_int')
	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[1:5]], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(featuresCol=featuresCreator.getOutputCol(), labelCol=stringIndexer.getOutputCol(), maxIter=10, regParam=0.01)

	# Create a pipeline.
	pipeline = Pipeline(stages=[stringIndexer, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	df_train, df_test = df.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(df_train)
	test_result = model.transform(df_test)  # Dataframe.

	#print(test_result.take(1))
	#test_result.show(5, truncate=True, vertical=False)
	test_result.show(truncate=False)

	# Save and load.
	lr_path = './lr'
	logistic.write().overwrite().save(lr_path)
	lr2 = ml_classification.LogisticRegression.load(lr_path)
	print('Param =', lr2.getRegParam())

	model_path = './lr_model'
	model.write().overwrite().save(model_path)
	model2 = PipelineModel.load(model_path)
	print('Stages =', model.stages)
	print(model.stages[2].coefficientMatrix == model2.stages[2].coefficientMatrix)
	print(model.stages[2].interceptVector == model2.stages[2].interceptVector)
Example #3
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #4
def test_ml_pipe():
    df = sc. \
         parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
         toDF()

    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #5
def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.\
        RandomForestClassifier()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #6
def logistic_regression_text(df, input_col):
    """
    Runs a logistic regression on an input (text) DataFrame.
    :param df: PySpark dataframe to analyze
    :param input_col: Text column to tokenize and model on
    :return: Tuple of the transformed DataFrame with predictions and the fitted model.
    """

    assert_spark_df(df)

    pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    df_model = ml_model.transform(df)
    return df_model, ml_model
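A minimal usage sketch for the function above, assuming a Spark DataFrame df_text with a raw-text column named 'sentence' and a numeric 'label' column (both names are hypothetical here):

# Hypothetical call; 'df_text' and its 'sentence' column are assumptions.
df_scored, fitted_model = logistic_regression_text(df_text, input_col='sentence')
df_scored.select('prediction', 'probability').show(5)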
Example #7
def model_build(df, feature_events, cov_event, model_name='logistic'):
    df_train, df_test = df.randomSplit([0.7, 0.3], 23333)
    va = VectorAssembler(inputCols=feature_events, outputCol='features')
    df_train_converted = va.transform(df_train)
    df_test_converted = va.transform(df_test)
    if model_name == 'logistic':
        clf = classification.LogisticRegression(regParam=0.01, labelCol='label')
    model = clf.fit(df_train_converted)
    train_result = model.evaluate(df_train_converted)
    test_result = model.evaluate(df_test_converted)
    print(train_result.areaUnderROC)
    print(test_result.areaUnderROC)
    fi = pd.DataFrame(list(zip(feature_events, model.coefficients.toArray())),
                      columns=['feature', 'importance'])
    print(fi.sort_values(by='importance', ascending=False))
    return (model, va, df_train, df_test)
Example #8
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
# Read the data
births = spark.read.csv('births_transformed.csv', header=True, schema=schema)
# Cast string columns to numeric types
births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(typ.IntegerType()))
births = births.withColumn('INFANT_ALIVE_AT_REPORT', births['INFANT_ALIVE_AT_REPORT'].cast(typ.DoubleType()))
# One-hot encode BIRTH_PLACE
encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
# Assemble all variables into a single feature vector
featuresCreator = ft.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')
# Select the top features
selector = ft.ChiSqSelector(numTopFeatures=6, featuresCol=featuresCreator.getOutputCol(), outputCol='selectedFeatures', labelCol='INFANT_ALIVE_AT_REPORT')
# Split the data
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)
# Build the classification model
# Logistic regression
logistic = cl.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
# Build the parameter grid
grid = tune.ParamGridBuilder().addGrid(logistic.maxIter, [2, 10, 50]).addGrid(logistic.regParam, [0.01, 0.05, 0.03]).build()
# Create the transforming pipeline
pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
# Specify the evaluator
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
# Train the cross-validated model
cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
cvmodel = cv.fit(pipeline.fit(births_train).transform(births_train))
# Test the model
test_model = cvmodel.transform(pipeline.fit(births_train).transform(births_test))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))
Example #9
    fea_pool = data.columns
    fea_pool.remove('y')

    ##featuerCreator:
    featuerCreator = ft.VectorAssembler(inputCols=fea_pool,
                                        outputCol='features')

    ##weightCol:
    data = data.withColumn('weight',
                           fn.when(data['y'] == 1, 1.0).otherwise(0.02))

    train, test = data.randomSplit([0.7, 0.3], seed=1234)  #42
    lr_model = cl.LogisticRegression(
        # maxIter=10,
        # regParam=0.01,
        elasticNetParam=0,
        family='binomial',
        threshold=0.5,
        weightCol='weight',
        labelCol='y')

    grid = tune.ParamGridBuilder()\
        .addGrid(lr_model.maxIter,[200,300,500,800])\
        .addGrid(lr_model.regParam,[0.001,0.002])\
        .build()

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='y')

    cv = tune.CrossValidator(estimator=lr_model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator)
Example #10
    def __init__(self,
                 fit_data_prep_args: Optional[dict] = None,
                 probability_estimator: Optional[ml.Estimator] = None,
                 response_col: str = 'response'):
        r"""
        Parameters
        ----------
        fit_data_prep_args: Optional[dict] = None,
            arguments around preparing the data to be fit
            default args are
            default_fit_data_prep_args = {
                'class_balance': 1,
                'train_prop': .8,
                'bin_features':True,
                'remove_redundant_features':True,
            }

            'class_balance' is the ratio of control_candidates : treatment
            to train the model on

            'train_prop' is the proportion of the population (post-rebalance)
            that is in the training set

            'bin_features' can be bool, dict, or absent.
            If you do not want to bin features here, they MUST be binned
            beforehand; unbinned features will undermine the validity of the
            outcome. If bin_features is absent or True, bin_features will be
            run with default args; if it is a dict, it will be passed as
            kwargs to bin_features. See utils.bin_features for arg details.

            'remove_redundant_features' can be bool, dict, or absent.
            True or absent will run remove_redundant_features with default
            args; a dict will be passed as kwargs instead.
            See utils.remove_redundant_features for arg details.

        probability_estimator: ml.Estimator = mlc.LogisticRegression

                default args are
                default_probability_estimator_args = {
                "featuresCol": "features",
                "labelCol": "label",
                "predictionCol": "prediction",
                "maxIter": 10,
                "regParam": .2,
                "elasticNetParam": 0,
                "fitIntercept": True,
                "probabilityCol": "probability",
                "family": "binomial"
            }
            Correct labelCol and featuresCol are crucial, so special
            attention should be paid to them.


        response_col: str = 'response'
            column in df containing the response

        Raises
        ------
        UncaughtExceptions
        """

        if probability_estimator is None:
            probability_estimator = mlc.LogisticRegression(
                **self.default_probability_estimator_args)

        if fit_data_prep_args is None:
            fit_data_prep_args = self.default_fit_data_prep_args

        self.fit_data_prep_args = fit_data_prep_args
        self.probability_estimator = probability_estimator
        self.response_col = response_col

        # set vals to None - will be correctly assigned in fit

        self.train_set = None
        self.test_set = None
        self.rebalanced_df = None
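A minimal instantiation sketch for the constructor above; the enclosing class name (PropensityEstimator here) and the import alias are assumptions, since the snippet only shows __init__:

import pyspark.ml.classification as mlc

# Hypothetical class name; arguments mirror the defaults documented above.
estimator = PropensityEstimator(
    fit_data_prep_args={'class_balance': 1, 'train_prop': .8,
                        'bin_features': True, 'remove_redundant_features': True},
    probability_estimator=mlc.LogisticRegression(
        featuresCol='features', labelCol='label', maxIter=10,
        regParam=.2, elasticNetParam=0, family='binomial'),
    response_col='response')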
Example #11
def skl_predict(spark): 

    print (1111)
    
    
    data = [(list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 1])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0]))
        ]
    labels = ['_1', '_2', '_3', '_4','_5','_6','_7','_8','_9','_10','_11','_12','_13','_14','_15','_16','_17','_18','_19','_20','_21','_22','_23','_24','_25','_26','_27','_28','_29','_30', 'INFANT_ALIVE_AT_REPORT']
    df = spark.createDataFrame(data, schema = labels)

    # df = df.withColumn( "age", df['age']+1 ) 
    df.show()
    # df.select("age").distinct().show() 
    # df.count()

    # Concatenate column data into a single string column
    from pyspark.sql.functions import split, explode, concat, concat_ws
    df_concat = df.withColumn("_concat", concat(df['_1'], df['_2'], df['_3'], df['_4']))
    print ('df_concat>>>>>>>>>>>>>>>>>>>')
    df_concat.show()


    # Assemble all features into a single vector (exclude the label column to avoid leakage)
    featuresCreator = ft.VectorAssembler(inputCols=[col for col in labels if col != 'INFANT_ALIVE_AT_REPORT'], outputCol='features')


    # Create the estimator
    import pyspark.ml.classification as cl
    logistic = cl.LogisticRegression(
        maxIter=10, 
        regParam=0.01, 
        labelCol='INFANT_ALIVE_AT_REPORT')
    print ('logistic:', logistic)


    # Create a pipeline
    from pyspark.ml import Pipeline
    
    pipeline = Pipeline(stages=[
            featuresCreator, 
            logistic
        ])

    # fit 
    births_train, births_test = df.randomSplit([0.7, 0.3], seed=666)

    print ('births_train', births_train)
    print ( 'births_test', births_test )

    # Run the pipeline and evaluate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)

    print ('test_model:', test_model) 

    
    test_model.take(1)

    print ('test_model.take(1):', test_model.take(1))






Example #12
featuresCreator = ft.VectorAssembler(
    inputCols=[
        col[0]
        for col
        in labels[2:]] + \
    [encoder.getOutputCol()],
    outputCol='features'
)

# In[6]:

import pyspark.ml.classification as cl

# In[7]:

logistic = cl.LogisticRegression(maxIter=10,
                                 regParam=0.01,
                                 labelCol='INFANT_ALIVE_AT_REPORT')

# In[8]:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

# In[9]:

births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

# In[10]:

model = pipeline.fit(births_train)
Example #13
# Assemble all features together
featuresCreator = ft.VectorAssembler(
    inputCols=[
        col[0] 
        for col 
        in labels[2:]] + \
    [encoder.getOutputCol()], 
    outputCol='features'
)

print ('featuresCreator:', featuresCreator)

# Create the estimator
import pyspark.ml.classification as cl
logistic = cl.LogisticRegression(
    maxIter=10, 
    regParam=0.01, 
    labelCol='INFANT_ALIVE_AT_REPORT')
print ('logistic:', logistic)


# Create a pipeline
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
        encoder, 
        featuresCreator, 
        logistic
    ])

# fit 
births_train, births_test = births \
    .randomSplit([0.7, 0.3], seed=666)
Example #14
data_sample.groupby('age').count().show()  # Group by building construction year
data_sample.agg({'age': 'skewness'}).show()

numerical = ['floors_before', 'floors_after', 'height_before', 'height_after']
desc = data_sample.describe(numerical)  # Inspect floor-count and height changes before/after the earthquake
desc.show()

##########################  Build the logistic regression model ##########################

## Parameter hyper-tuning

### Create an estimator

import pyspark.ml.classification as cl

logistic = cl.LogisticRegression(
    labelCol='label')  # hyperparameters will be tuned below, so none are set here

### Grid search

import pyspark.ml.tuning as tune

grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter,
             [10, 50, 80]) \
    .addGrid(logistic.regParam,
             [0.01, 0.001]) \
    .build()

### Create a pipeline

from pyspark.ml import Pipeline
Example #15
featuresCreator = ft.VectorAssembler(inputCols=[
    'cool', 'funny', 'useful', 'is_open', 'business_review_count',
    'business_stars', 'average_stars', 'fans', 'user_review_count',
    'sentiment_score'
],
                                     outputCol='features')

# ### 4: Estimator Creation
# This is the step where we select the machine learning model we wish to use. We create an Estimator object that holds the model along with the hyperparameters to be passed to it. Here, we use LogisticRegression.

# In[48]:

import pyspark.ml.classification as cl

logistic_regression_model = cl.LogisticRegression(maxIter=10,
                                                  regParam=0.01,
                                                  labelCol='review_stars',
                                                  family='multinomial')
print(type(logistic_regression_model))

# ### 5: Pipeline Creation

# In[50]:

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[featuresCreator, logistic_regression_model])
print(type(pipeline))

# ### 6: Dataset Splitting

# In[52]:
Example #16
def train_validation_splitting_ml():
	spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Select only the top five features.
	selector = ml_feature.ChiSqSelector(
		numTopFeatures=5,
		featuresCol=featuresCreator.getOutputCol(),
		outputCol='selectedFeatures',
		labelCol='INFANT_ALIVE_AT_REPORT'
	)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
	data_transformer = pipeline.fit(births_train)

	# Create LogisticRegression and Pipeline.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a TrainValidationSplit object.
	tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	# Fit our data to the model.
	tvsModel = tvs.fit(data_transformer.transform(births_train))
	data_test = data_transformer.transform(births_test)

	# Calculate results.
	results = tvsModel.transform(data_test)
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
Example #17
def infant_survival_ml():
	spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(births_train)
	test_model = model.transform(births_test)

	print(test_model.take(1))

	# Evaluate the performance of the model.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

	# Save the Pipeline definition.
	pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
	pipeline.write().overwrite().save(pipelinePath)

	# Load the Pipeline definition.
	loadedPipeline = Pipeline.load(pipelinePath)
	loadedPipeline.fit(births_train).transform(births_test).take(1)

	# Save the PipelineModel.
	modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
	model.write().overwrite().save(modelPath)

	# Load the PipelineModel.
	loadedPipelineModel = PipelineModel.load(modelPath)
	test_reloadedModel = loadedPipelineModel.transform(births_test)

	print(test_reloadedModel.take(1))
Example #18
def semi_supervised_batch_single_classifier_generate_approach(data,
                                                              featureCols=None,
                                                              labelCol='used_label',
                                                              predictionCol='prediction',
                                                              *args,
                                                              **kwargs):
    """
    A first approach to a semi-supervised learning method. Uses a k-means combined with logistic regression to find
    the best classification of the data.
    @input: data: spark dataframe with missing lables, but all are missing!
    @input: featureCols:
    @input: labelCol:
    @input: predictionCol:
    returns spark dataframe with classified data, with help from the clustering method
    """
    import numpy as np
    import pandas as pd
    from pyspark.sql import DataFrame
    from pyspark.sql import functions as F
    from pyspark.ml import clustering
    from pyspark.ml import feature
    from pyspark.ml import Pipeline
    from pyspark.ml import classification
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

    assert labelCol in data.columns, 'Labels are missing, please provide a label column!'
    assert isinstance(data, DataFrame), 'Data is not of type Spark.DataFrame, but {}'.format(type(data))
    assert featureCols is not None, 'Please give a list of features as string!'

    cluster_model = kwargs.get('clusterModel', 'KMeans')  # TODO: future stuff that makes our semi-supervised approach more dynamic
    classification_model = kwargs.get('classificationModel', 'LogisticRegression')

    k_clusters = (data
                  .filter((F.col(labelCol) != np.NaN))
                  .groupBy(labelCol)
                  .count()
                  .count()
                  )
    print(k_clusters)

    # Feature vectorizer and k-means model is initialized here!
    feature_vector = feature.VectorAssembler(
        inputCols=featureCols,
        outputCol='features')

    k_means = clustering.KMeans(
        featuresCol=feature_vector.getOutputCol(),
        predictionCol='Kmeans_prediction',
        k=k_clusters)

    # Classification begins here!
    log_reg = classification.LogisticRegression(
        featuresCol=feature_vector.getOutputCol(),
        labelCol=k_means.getPredictionCol(),
        predictionCol=predictionCol)

    # Pipeline get assembled here!
    pipeline = Pipeline(stages=[feature_vector, k_means, log_reg])

    # CrossValidation gets build here!
    param_grid = (ParamGridBuilder()
                  .addGrid(log_reg.regParam, [0.1, 0.01])
                  .build()
                  )
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol=log_reg.getRawPredictionCol(),
        labelCol=k_means.getPredictionCol())

    folds = kwargs.get('folds', 3)

    cross_validator = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=folds)

    evaluated_pipeline = cross_validator.fit(data)
    cluster_fitted_data = evaluated_pipeline.transform(data)
    return cluster_fitted_data
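A minimal call sketch for the function above, assuming a DataFrame raw_df with feature columns and a partially filled label column named as shown (all names here are hypothetical):

# Hypothetical call; column names are assumptions.
labeled_df = semi_supervised_batch_single_classifier_generate_approach(
    raw_df,
    featureCols=['f1', 'f2', 'f3'],
    labelCol='used_label',
    predictionCol='prediction',
    folds=3)
labeled_df.select('used_label', 'prediction').show(5)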
Example #19
def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""observe impact of treatment on response variable

    Currently the response must be binary.
    If the df is small enough, return the naive difference in group-by-label
    response means; otherwise run an additional regression on the response col
    with the label as a predictor and use its coefficient as a measure of its
    impact. Binning and dimensionality reduction will occur if necessary
    to do an effective regression.

    Parameters
    ----------
    df: pyspark.sql.DataFrame
    response_col: str
    prob_mod: mlc.Model
        propensity model, mostly used to keep track of feature_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate`-`treatment_rate` or may have further bias adjustment

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2
    UncaughtExceptions

    See Also
    --------
    bin_features
    _reduce_dimensionality

    Notes
    -----

    """

    _persist_if_unpersisted(df)

    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()

    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "somehow have fewer than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have fewer than MINIMUM_POS_COUNT*2 rows, this shouldn't be happening"
        )

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    naive_response_dict[response_list[0][label_col]] = response_list[0][
        "avg({col})".format(col=response_col)]
    naive_response_dict[response_list[1][label_col]] = response_list[1][
        "avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[
        0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f}   control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference"
        )
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logging.getLogger(__name__).info("additional bias adjustment possible")
    # choose fewer features if appropriate to prevent overfit. round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "need max {n:,} predictors".format(n=num_preds))
    if num_preds < len(list(pred_cols)):
        logging.getLogger(__name__).info(
            "desired predictors {np:,} is less than existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r,
                                      outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)
    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))

    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logging.getLogger(__name__).info(
        "bias asjusted response is {ar:.2f}".format(ar=adjusted_response))
    return treatment_rate, control_rate, adjusted_response
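A small numeric sketch of the bias adjustment used above; the rate and coefficient values are made up purely for illustration:

import math

control_rate = 0.10   # hypothetical control response rate
label_coeff = -0.25   # hypothetical fitted coefficient on the treatment label

# exp(coeff) is the odds ratio associated with treatment in the logistic fit;
# the adjusted impact scales the control rate by (1 - exp(coeff)).
adjusted_response = control_rate * (1 - math.exp(label_coeff))
print(round(adjusted_response, 4))  # ~0.0221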
Example #20
def hyper_parameter_optimization_ml():
	spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator])
	data_transformer = pipeline.fit(births_train)

	# Specify our model and the list of parameters we want to loop through.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a logic that will do the validation work.
	cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	cvModel = cv.fit(data_transformer.transform(births_train))

	# See if cvModel performed better than our previous model
	data_test = data_transformer.transform(births_test)
	results = cvModel.transform(data_test)

	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

	# Parameters which the best model has.
	results = [
		([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric)
		for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
	]
	print(sorted(results, key=lambda el: el[1], reverse=True)[0])