Code example #1
    def exercise_in_machine_learning(self):
        self.static_data_frame.printSchema()

        prepped_data_frame = (
            self.static_data_frame.na.fill(0)
            .withColumn("day_of_week",
                        functions.date_format(functions.col("InvoiceDate"), "EEEE"))
            .coalesce(5))

        train_data_frame = prepped_data_frame.where(
            "InvoiceDate < '2011-03-01'")
        test_data_frame = prepped_data_frame.where(
            "InvoiceDate >= '2011-03-01'")

        print(f"TRAINING items: {train_data_frame.count()}")
        print(f"TEST DATA items: {test_data_frame.count()}")

        transformation_pipeline = Pipeline().setStages([
            feature.StringIndexer().setInputCol("day_of_week").setOutputCol(
                "day_of_week_index"),
            feature.OneHotEncoder().setInputCol(
                "day_of_week_index").setOutputCol("day_of_week_encoded"),
            feature.VectorAssembler().setInputCols(
                ["UnitPrice", "Quantity",
                 "day_of_week_encoded"]).setOutputCol("features"),
        ])

        fitted_pipeline = transformation_pipeline.fit(train_data_frame)
        transformed_training = fitted_pipeline.transform(train_data_frame)
        # transformed_training.cache()

        kmeans = clustering.KMeans().setK(2).setSeed(2)
        km_model = kmeans.fit(transformed_training)
        print(f"Training cost: {km_model.summary.trainingCost}")

        transformed_test = fitted_pipeline.transform(test_data_frame)
        transformed_test.summary().show()
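The snippet prints only the k-means training cost; a minimal sketch of scoring the held-out split as well, assuming the km_model and transformed_test variables above and pyspark.ml.evaluation.ClusteringEvaluator (silhouette is its default metric):

from pyspark.ml.evaluation import ClusteringEvaluator

# Assign clusters on the held-out data and compute the silhouette score.
test_predictions = km_model.transform(transformed_test)
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction")
print(f"Test silhouette: {evaluator.evaluate(test_predictions)}")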
Code example #2
def train_evaluate(train_data, test_data):
    # Convert the string categorical feature to a numeric index
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")

    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")

    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs,
                                   outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label",
    #                             featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder()\
        .addGrid(rf.impurity, ["gini", "entropy"])\
        .addGrid(rf.maxDepth, [5, 10, 15])\
        .addGrid(rf.maxBins, [10, 15, 20])\
        .addGrid(rf.numTrees, [10, 20, 30])\
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)

    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)

    AUC, AP = evaluate_model(cv_pipeline_model, test_data)

    return AUC, AP, best_parm, cv_pipeline_model
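get_best_param and evaluate_model are not shown in this snippet; a minimal sketch of what they might look like, assuming the same ev (pyspark.ml.evaluation) alias and that evaluate_model returns area under ROC and area under PR for the fitted pipeline:

def get_best_param(cv_model):
    # cv_model is the CrossValidatorModel taken from the last pipeline stage.
    best_index = cv_model.avgMetrics.index(max(cv_model.avgMetrics))
    best_param_map = cv_model.getEstimatorParamMaps()[best_index]
    return {param.name: value for param, value in best_param_map.items()}


def evaluate_model(pipeline_model, test_data):
    # Score the full PipelineModel on the held-out data.
    predictions = pipeline_model.transform(test_data)
    evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                                 labelCol="label")
    auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    ap = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
    return auc, ap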
Code example #3
File: pyspark ml.py  Project: YangZZhong/mygit
from pyspark.sql import SparkSession
import pyspark.sql.types as typ
import pyspark.ml.feature as ft
from pyspark.sql.types import StructType, StructField, StringType, LongType

# "id" is declared as a string in the schema below, so keep the values as strings.
stringCSVRDD = spark.sparkContext.parallelize([("123", "Katie", 19, "brown"),
                                               ("123", "Kkk", 19, "red"),
                                               ("234", "Michael", 22, "green"),
                                               ("345", "Simone", 23, "blue")])

schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

swimmers = spark.createDataFrame(stringCSVRDD, schema)
swimmers = swimmers.withColumn('id_int',
                               swimmers['id'].cast(typ.IntegerType()))
encoder = ft.OneHotEncoder(inputCol='id_int', outputCol='idvec')
# Apply the encoder before selecting its output column (on Spark 3.x the
# encoder is an Estimator and has to be fitted first).
swimmers = encoder.fit(swimmers).transform(swimmers)
swimmers.createOrReplaceTempView("swimmers")
swimmers.select("idvec").show()

labels = [('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
          ('BIRTH_PLACE', typ.IntegerType()),
          ('MOTHER_AGE_YEARS', typ.IntegerType()),
          ('FATHER_COMBINED_AGE', typ.IntegerType()),
          ('CIG_BEFORE', typ.IntegerType()), ('CIG_1_TRI', typ.IntegerType()),
          ('CIG_2_TRI', typ.IntegerType()), ('CIG_3_TRI', typ.IntegerType()),
          ('MOTHER_HEIGHT_IN', typ.IntegerType()),
          ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
Code example #4
import pyspark.sql.functions as ssf   # aliases inferred from the udf/indexer calls below
import pyspark.sql.types as sst
import pyspark.ml.feature as smf


def fill_empty_string(string_in, fill_value="missing"):  # default fill value is assumed
    if not isinstance(string_in, str):
        return fill_value
    elif not string_in:
        return fill_value
    else:
        return string_in


na_handler = ssf.udf(fill_empty_string, sst.StringType())

indexers = {}
for cat_col in cat_cols:
    merged = merged.withColumn(cat_col, na_handler(cat_col))
    indexer = smf.StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}Inx")
    indexer = indexer.fit(merged)
    merged = indexer.transform(merged)
    merged = merged.drop(cat_col).withColumnRenamed(f"{cat_col}Inx", cat_col)
    indexers[cat_col] = indexer

# merged.write.parquet('data/cached/indexed.parquet')

encoder = smf.OneHotEncoder(inputCols=cat_cols,
                            outputCols=[f"{x}Vec" for x in cat_cols])
encoder = encoder.fit(merged)
encoded = encoder.transform(merged)
encoded = encoded.drop(*cat_cols)

encoded = encoded.persist()

encoded.write.parquet("data/cached/encoded.parquet", mode='overwrite')
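As a usage note, the same empty-string handling can be done without a Python UDF; a minimal sketch using built-in column functions, assuming the same merged DataFrame, cat_cols list, and the "missing" fill value used above:

from pyspark.sql import functions as F

fill_value = "missing"  # assumed, matching the UDF default above
for cat_col in cat_cols:
    # Replace nulls and empty strings in place, without serializing rows to Python.
    merged = merged.withColumn(
        cat_col,
        F.when(F.col(cat_col).isNull() | (F.col(cat_col) == ""), fill_value)
         .otherwise(F.col(cat_col)))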
Code example #5
File: TuningApp.py  Project: prashantsuryav/alpha
    model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)
    # Convert to an integer
    model_data = model_data.withColumn("label",
                                       model_data.is_late.cast("integer"))
    # Remove missing values
    model_data = model_data.filter(
        "arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL"
    )

    model_data.show()

    # Create a StringIndexer
    carr_indexer = features.StringIndexer(inputCol="carrier",
                                          outputCol="carrier_index")
    # Create a OneHotEncoder
    carr_encoder = features.OneHotEncoder(inputCol="carrier_index",
                                          outputCol="carrier_fact")

    # Create a StringIndexer
    dest_indexer = features.StringIndexer(inputCol="dest",
                                          outputCol="dest_index")
    # Create a OneHotEncoder
    dest_encoder = features.OneHotEncoder(inputCol="dest_index",
                                          outputCol="dest_fact")

    # Make a VectorAssembler
    vec_assembler = features.VectorAssembler(
        inputCols=["month", "air_time", "carrier_fact", "dest_fact", "plane_age"],
        outputCol="features")

    # Make the pipeline
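The snippet stops at the "Make the pipeline" comment; a minimal sketch of the step it is leading up to, assuming the stages defined above and the Pipeline import used in the other examples on this page (the flights_pipe/piped_data names are illustrative):

    from pyspark.ml import Pipeline

    # Chain the indexers, encoders, and assembler so they run in order.
    flights_pipe = Pipeline(stages=[dest_indexer, dest_encoder,
                                    carr_indexer, carr_encoder,
                                    vec_assembler])
    piped_data = flights_pipe.fit(model_data).transform(model_data)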
Code example #6
File: Untitled1.py  Project: chane1214/ml
births = spark.read.csv('./data/births_transformed.csv.gz',
                        header=True,
                        schema=schema)

# In[3]:

import pyspark.ml.feature as ft
import pyspark.sql.types as typ

births = births.withColumn('BIRTH_PLACE_INT',
                           births['BIRTH_PLACE'].cast(typ.IntegerType()))

# In[4]:

encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT',
                           outputCol='BIRTH_PLACE_VEC')

# In[5]:


featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
    outputCol='features'
)

# In[6]:
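The notebook excerpt is cut off here; based on the very similar births examples later on this page (Code example #8 and Code example #12), the next cell typically builds and fits a logistic-regression pipeline. A minimal sketch under that assumption:

import pyspark.ml.classification as cl
from pyspark.ml import Pipeline

logistic = cl.LogisticRegression(maxIter=10, regParam=0.01,
                                 labelCol='INFANT_ALIVE_AT_REPORT')
pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)
model = pipeline.fit(births_train)
test_model = model.transform(births_test)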
Code example #7
data_matrix = topic_distributions.join(past_complaint_counts_df,
                                       on='grid_square',
                                       how='inner')
# So far, data_matrix contains Row(date, grid_square, topic_distributions, complaint_count).

# Get weekday from date.

get_weekday_udf = functions.udf(lambda d: d.weekday(),
                                returnType=types.IntegerType())
data_matrix = data_matrix.withColumn('weekday',
                                     get_weekday_udf(data_matrix['date']))

# Assemble the feature vectors.

weekday_one_hot_encoder = feature.OneHotEncoder(inputCol='weekday',
                                                outputCol='weekday_vector')
feature_vector_assembler = feature.VectorAssembler(
    inputCols=['weekday_vector', 'topic_distribution'],
    outputCol='final_feature_vector')
feature_assembly_pipeline = (ml.Pipeline(
    stages=[weekday_one_hot_encoder, feature_vector_assembler]).fit(
        data_matrix))

data_matrix = (feature_assembly_pipeline.transform(data_matrix).select(
    'date', 'grid_square', 'final_feature_vector', 'complaint_count'))

LOGGER.debug(
    str(data_matrix.count()) + " rows like " + str(data_matrix.take(1)))

#logistic_regression = classification.LogisticRegression(
#    maxIter=10, regParam=0.3, elasticNetParam=0.8,
Code example #8

# def main():


if __name__ == "__main__":
    # specify schema structure of the df
    schema = typ.StructType(
        [typ.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv(
        "../data/births_transformed.csv.gz", header=True, schema=schema)

    births = births.withColumn("BIRTH_PLACE_INT",
                               births["BIRTH_PLACE"].cast(typ.IntegerType()))

    encoder = ft.OneHotEncoder(inputCol="BIRTH_PLACE_INT",
                               outputCol="BIRTH_PLACE_VEC")
    # column with all features collected together
    features_creator = ft.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]]
                  + [encoder.getOutputCol()],
        outputCol="features")

    logistic = cl.LogisticRegression(
        maxIter=10, regParam=0.01, labelCol="INFANT_ALIVE_AT_REPORT")

    pipe = Pipeline(stages=[encoder, features_creator, logistic])
    # train and test
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)
    model = pipe.fit(births_train)
    model_test = model.transform(births_test)
    print(model_test.take(1))
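The example stops after printing one transformed row; a minimal sketch of scoring the fitted model, mirroring the evaluator used in the infant-survival examples further down and assuming pyspark.ml.evaluation is available as ev:

    import pyspark.ml.evaluation as ev

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability", labelCol="INFANT_ALIVE_AT_REPORT")
    print(evaluator.evaluate(model_test, {evaluator.metricName: "areaUnderROC"}))
    print(evaluator.evaluate(model_test, {evaluator.metricName: "areaUnderPR"}))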
Code example #9
File: ml_codes.py  Project: ywzhang188/spark-learn
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__='zhangyuwei37'

import pyspark.ml.feature as ft
from pyspark.ml import Pipeline

# Feature preprocessing: one-hot encode the categorical variables, scale the numeric
# variables, then assemble all features and output the PCA-reduced result
# onehot
indexers = [
    ft.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in nomial_features
]
encoders = [
    ft.OneHotEncoder(inputCol=indexer.getOutputCol(),
                     outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

# scaler
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")
Code example #10
def hyper_parameter_optimization_ml():
	spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator])
	data_transformer = pipeline.fit(births_train)

	# Specify our model and the list of parameters we want to loop through.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a logic that will do the validation work.
	cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	cvModel = cv.fit(data_transformer.transform(births_train))

	# See if cvModel performed better than our previous model
	data_train = data_transformer.transform(births_test)
	results = cvModel.transform(data_train)

	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

	# Parameters which the best model has.
	results = [
		([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric)
		for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
	]
	print(sorted(results, key=lambda el: el[1], reverse=True)[0])
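A minimal sketch of also pulling the best fitted model itself out of the cross-validator, assuming access to cvModel from the function above:

best_model = cvModel.bestModel
# Inspect the winning hyper-parameters and the fitted coefficients.
print({p.name: v for p, v in best_model.extractParamMap().items()
       if p.name in ('maxIter', 'regParam')})
print(best_model.coefficients)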
Code example #11
def train_validation_splitting_ml():
	spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Select only the top five features.
	selector = ml_feature.ChiSqSelector(
		numTopFeatures=5,
		featuresCol=featuresCreator.getOutputCol(),
		outputCol='selectedFeatures',
		labelCol='INFANT_ALIVE_AT_REPORT'
	)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
	data_transformer = pipeline.fit(births_train)

	# Create LogisticRegression and Pipeline.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a TrainValidationSplit object.
	tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	# Fit our data to the model.
	tvsModel = tvs.fit(data_transformer.transform(births_train))
	data_train = data_transformer.transform(births_test)

	# Calculate results.
	results = tvsModel.transform(data_train)
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
Code example #12
def infant_survival_ml():
	spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(births_train)
	test_model = model.transform(births_test)

	print(test_model.take(1))

	# Evaluate the performance of the model.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

	# Save the Pipeline definition.
	pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
	pipeline.write().overwrite().save(pipelinePath)

	# Load the Pipeline definition.
	loadedPipeline = Pipeline.load(pipelinePath)
	loadedPipeline.fit(births_train).transform(births_test).take(1)

	# Save the PipelineModel.
	modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
	model.write().overwrite().save(modelPath)

	# Load the PipelineModel.
	loadedPipelineModel = PipelineModel.load(modelPath)
	test_reloadedModel = loadedPipelineModel.transform(births_test)

	print(test_reloadedModel.take(1))
Code example #13
def main(spark):
    n = len(sys.argv) - 1
    if n < 1:
        print('\nParameters are needed!!\n')
        sys.exit()
    else:
        result_type = sys.argv[1]
        sku_type = sys.argv[2]
        end_date = sys.argv[3]
        end_date_1w = sys.argv[4]
        end_date_2w = sys.argv[5]
        input_train_data_table = sys.argv[6]
        input_predict_data_table = sys.argv[7]
        output_predict_result_table = sys.argv[8]
        predict_date = sys.argv[9]

    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set spark.sql.hive.mergeFiles=true")
    spark.sql("set hive.exec.orc.split.strategy=BI")
    spark.sql("set mapred.job.priority = HIGH")
    spark.sql("set hive.default.fileformat=Orc")
    spark.sql("set hive.exec.parallel=true")
    spark.sql("set hive.auto.convert.join=true")
    spark.sql("set hive.merge.mapfiles = true")
    spark.sql("set hive.merge.mapredfiles = true")
    spark.sql("set hive.merge.size.per.task = 256000000")
    spark.sql("set hive.merge.smallfiles.avgsize=128000000")
    spark.sql("set hive.merge.orcfile.stripe.level=false")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.max.dynamic.partitions=1000000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=1000000")
    spark.sql("set hive.exec.max.created.files=1000000")
    spark.sql("set mapreduce.job.counters.limit=10000")
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
    spark.sql("set spark.shuffle.service.enabled = true")
    spark.sql("set spark.sql.broadcastTimeout = 10000")

    print('end_date = {}\n'.format(end_date))
    print('sku_type = {}\n'.format(sku_type))
    print('result_type = {}\n'.format(result_type))

    ### Build the training and prediction samples

    # Determine the data-selection scope
    if sku_type == 'old':
        sku_type_sql = ' and otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql = ' and otc_days < 60'
    else:
        sku_type_sql = ''

    # Positive samples from the current week
    data_now = spark.sql("""
          select 
              t1.*
          from 
              (
              select * 
              from """ + input_train_data_table + """ 
              where end_date = '""" + end_date + """' and label > 0""" +
                         sku_type_sql + """
              )t1
          join
              (
              select 
                  item_third_cate_cd
              from 
                  app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
              where 
                  dt = '""" + predict_date + """'
                  and app_id = 4
                  and scene_id = 1
                  and status = 3
              group by 
                  item_third_cate_cd
              )t2
           on t1.item_third_cate_cd = t2.item_third_cate_cd
    """)

    # Positive samples unique to one week earlier
    data_1w = spark.sql("""
                select 
                    a.*
                from 
                    (
                    select 
                        t1.*
                    from 
                        (
                        select * 
                        from """ + input_train_data_table + """ 
                        where end_date = '""" + end_date_1w +
                        """' and label > 0""" + sku_type_sql + """
                        )t1
                    join
                        (
                        select 
                            item_third_cate_cd
                        from 
                            app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                        where 
                            dt = '""" + predict_date + """'
                            and app_id = 4
                            and scene_id = 1
                            and status = 3
                        group by 
                            item_third_cate_cd
                        )t2
                    on t1.item_third_cate_cd = t2.item_third_cate_cd
                    )a
                left join 
                    (
                    select 
                        item_sku_id,1 as index
                    from 
                        """ + input_train_data_table + """ 
                    where 
                        end_date = '""" + end_date + """' and label > 0""" +
                        sku_type_sql + """
                    )b 
                on 
                    a.item_sku_id=b.item_sku_id
                where 
                    index is null or index = ''
                """)

    # Positive samples unique to two weeks earlier
    data_2w = spark.sql("""
                select 
                    a.*
                from 
                    (
                    select 
                        t1.*
                    from 
                        (
                        select * 
                        from """ + input_train_data_table + """ 
                        where end_date = '""" + end_date_2w +
                        """' and label > 0""" + sku_type_sql + """
                        )t1
                    join
                        (
                        select 
                            item_third_cate_cd
                        from 
                            app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                        where 
                            dt = '""" + predict_date + """'
                            and app_id = 4
                            and scene_id = 1
                            and status = 3
                        group by 
                            item_third_cate_cd
                        )t2
                    on t1.item_third_cate_cd = t2.item_third_cate_cd
                    )a
                left join 
                    (
                    select 
                        item_sku_id,1 as index
                    from 
                        """ + input_train_data_table + """ 
                    where 
                        end_date = '""" + end_date + """' and label > 0""" +
                        sku_type_sql + """
                    )b 
                on 
                    a.item_sku_id=b.item_sku_id
                where 
                    index is null or index = ''
                """)

    # Merge the positive samples
    data = data_now.union(data_1w).union(data_2w)
    data_filter = data.filter("otc_days >= 0").filter("sku_status_cd = 3001")
    data_filter.cache()
    data_count = data_filter.count()
    print('positive data count = {}\n'.format(data_count))

    # Add negative samples
    data_neg = spark.sql("""
          select 
              t1.*
          from 
              (
              select * 
              from """ + input_train_data_table + """ 
              where end_date = '""" + end_date + """' and label = 0""" +
                         sku_type_sql + """
              and otc_days >= 0 and sku_status_cd = 3001
              )t1
          join
              (
              select 
                  item_third_cate_cd
              from 
                  app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
              where 
                  dt = '""" + predict_date + """'
                  and app_id = 4
                  and scene_id = 1
                  and status = 3
              group by 
                  item_third_cate_cd
              )t2
           on t1.item_third_cate_cd = t2.item_third_cate_cd
              """)
    data_neg.cache()
    data_neg_count = data_neg.count()
    neg_sample_ratio = min(data_count /
                           data_neg_count, 1.0) if data_neg_count > 0 else 0.0
    data_neg_sample = data_neg.sample(neg_sample_ratio, seed=66)

    # Combine positive and negative samples
    if result_type == 'ucvr':
        data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\
                          .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\
                          .withColumn('label_adjust',func.when(func.col('label') > 1,1).otherwise(func.col('label')))\
                          .drop('label').withColumnRenamed('label_adjust','label')
    else:
        data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\
                          .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\
                          .withColumn('label_binary',func.when(func.col('label') > 0,1).otherwise(0))\
                          .drop('label').withColumnRenamed('label_binary','label')

    # Join the SKU embedding features
    predict_date_str = ''.join(predict_date.split('-'))
    sku_vec = spark.sql(
        "select * from tmp.tmp_qzl_sink_search_08_sku2vec_features_{0}".format(
            predict_date_str))
    vec_size = len(sku_vec.columns) - 1
    data_union_sku2vec = data_union.join(sku_vec, on='item_sku_id', how='left')

    ### Train the model

    # Feature grouping
    # Non-feature columns
    features_useless = [
        'item_first_cate_name', 'item_second_cate_cd', 'item_second_cate_name',
        'item_third_cate_cd', 'item_third_cate_name', 'barndname_full',
        'sku_name', 'item_sku_id', 'uv_value_label', 'first_into_otc_tm',
        'end_date', 'sku_status_cd', 'red_price', 'red_price_level_rank'
    ]
    # Categorical features
    features_catagory = ['item_first_cate_cd']
    # Embedding features
    features_embedding = ['sku_vec_' + str(i) for i in range(vec_size)]
    # Numerical features
    features_numerical = [
        f for f in data_union_sku2vec.columns if f not in ['label'] +
        features_useless + features_catagory + features_embedding
    ]

    # Missing-value rate per feature
    feature_na = data_union_sku2vec.agg(
        *[(1 - (func.count(c) / func.count('*'))).alias(c)
          for c in data_union_sku2vec.columns])
    feature_na_DF = sqlDF2pandasDF(feature_na).T
    feature_na_DF = feature_na_DF.reset_index()
    feature_na_DF.columns = ['features', 'na_rate']
    for i, row in feature_na_DF.iterrows():
        print('{}: {}'.format(row['features'], row['na_rate']))

    # Fill missing values
    fillna_value = {c: -1 for c in features_numerical}
    fillna_value.update({c: -10 for c in features_embedding})
    data_union_sku2vec_fillna = data_union_sku2vec.fillna(fillna_value)

    # Data preprocessing
    stringIndexer_cd1 = ft.StringIndexer(inputCol="item_first_cate_cd",
                                         outputCol="item_first_cate_cd_index")
    encoder_cd1 = ft.OneHotEncoder(inputCol='item_first_cate_cd_index',
                                   outputCol='item_first_cate_cd_vec')
    featuresCreator = ft.VectorAssembler(inputCols=features_numerical +
                                         [encoder_cd1.getOutputCol()] +
                                         features_embedding,
                                         outputCol='features')
    pipeline = Pipeline(
        stages=[stringIndexer_cd1, encoder_cd1, featuresCreator])
    data_transformer = pipeline.fit(data_union_sku2vec_fillna)
    data_transformed = data_transformer.transform(data_union_sku2vec_fillna)
    data_transformed.cache()
    data_union_count = data_transformed.count()
    print('data_union_count = {}\n'.format(data_union_count))
    data_filter.unpersist()
    data_neg.unpersist()

    p_num = get_best_partition(data_union_count)
    data_transformed = data_transformed.repartition(p_num)

    # Start training
    best_depth = 12  # get_best_depth(data_union_count)
    best_iter = 150  # get_best_iter(data_union_count)
    f = '1.0'  # '0.8'
    s = 1.0  # 0.8

    if result_type == 'ucvr':
        gbdt = GBTRegressor(featuresCol='features',labelCol='label',predictionCol='prediction',lossType='squared',seed=66,maxMemoryInMB=2048,cacheNodeIds=True, \
                             maxDepth=best_depth,maxIter=best_iter,featureSubsetStrategy=f,subsamplingRate=s,stepSize=0.01)
    else:
        gbdt = GBTClassifier(featuresCol='features',labelCol='label',predictionCol='prediction',lossType='logistic',seed=66,maxMemoryInMB=2048,cacheNodeIds=True,\
                             maxDepth=best_depth,maxIter=best_iter,featureSubsetStrategy=f,subsamplingRate=s,stepSize=0.01)

    gbdt_model = gbdt.fit(data_transformed)

    ### Predict results for the candidate products

    # Build the samples to be predicted
    if sku_type == 'old':
        sku_type_sql_2 = ' where otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql_2 = ' where otc_days < 60'
    else:
        sku_type_sql_2 = ''

    data_test = spark.sql("select * from " + input_predict_data_table + "" +
                          sku_type_sql_2 + "")
    data_test = data_test.withColumn(
        'data_type_int',
        func.col('data_type').cast(
            IntegerType())).drop('data_type').withColumnRenamed(
                'data_type_int', 'data_type')
    data_test.cache()
    data_test_count = data_test.count()
    print('data_test_count = {}\n'.format(data_test_count))
    data_test = data_test.repartition(get_best_partition(data_test_count))

    # Preprocess the prediction samples
    data_test_sku2vec = data_test.join(sku_vec, on='item_sku_id', how='left')
    fillna_value_test = {c: -1 for c in features_numerical}
    fillna_value_test.update({c: -10 for c in features_embedding})
    data_test_fillna = data_test_sku2vec.fillna(fillna_value_test)
    data_transformer_test = pipeline.fit(data_test_fillna)
    data_transformed_test = data_transformer_test.transform(data_test_fillna)
    data_transformed_test.cache()
    data_test.unpersist()

    # Get and write out the predictions for the candidate product pool
    gbdt_pred_test = gbdt_model.transform(data_transformed_test)
    features_result = [
        'item_third_cate_cd', 'item_sku_id', 'prediction', 'red_price',
        'red_price_level_rank', 'otc_days'
    ]

    if result_type == 'binary_prob':
        gbdt_pred_test = gbdt_pred_test.select(['item_third_cate_cd','item_sku_id','probability','red_price','red_price_level_rank','otc_days'])\
                         .rdd.map(lambda row:(row['item_third_cate_cd'],row['item_sku_id'],float(row['probability'][1]),row['red_price'],row['red_price_level_rank'],row['otc_days'])).toDF(features_result)
    else:
        gbdt_pred_test = gbdt_pred_test.withColumn('prediction_adjust',func.when(func.col('prediction') > 1,1).when(func.col('prediction') < 0,0).otherwise(func.col('prediction')))\
                          .drop('prediction').withColumnRenamed('prediction_adjust','prediction')

    result = gbdt_pred_test.select(features_result).withColumn(
        'new_old',
        func.when(func.col('otc_days') < 90, 'new').otherwise('old'))
    result.createOrReplaceTempView("result_df")
    spark.sql("""
             insert overwrite table """ + output_predict_result_table + """ 
             partition(dt='""" + predict_date + """',sku_type='""" + sku_type +
              """',result_type='""" + result_type + """') 
             select * from result_df
    """)

    data_transformed.unpersist()
    data_transformed_test.unpersist()
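One design note on the prediction path above: the pipeline is refit on the prediction data (pipeline.fit(data_test_fillna)), so the StringIndexer/OneHotEncoder mappings are not guaranteed to match the ones learned at training time. A minimal sketch of the usual alternative, reusing the transformer fitted on the training data:

    # Reuse the transformer fitted on the training sample so the category
    # index/encoding applied to the prediction set matches training.
    data_transformed_test = data_transformer.transform(data_test_fillna)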