def exercise_in_machine_learning(self):
    self.static_data_frame.printSchema()
    prepped_data_frame = self.static_data_frame.na.fill(0) \
        .withColumn("day_of_week",
                    functions.date_format(functions.col("InvoiceDate"), "EEEE")) \
        .coalesce(5)
    train_data_frame = prepped_data_frame.where("InvoiceDate < '2011-03-01'")
    test_data_frame = prepped_data_frame.where("InvoiceDate >= '2011-03-01'")
    print(f"TRAINING items: {train_data_frame.count()}")
    print(f"TEST DATA items: {test_data_frame.count()}")

    transformation_pipeline = Pipeline().setStages([
        feature.StringIndexer().setInputCol("day_of_week")
            .setOutputCol("day_of_week_index"),
        feature.OneHotEncoder().setInputCol("day_of_week_index")
            .setOutputCol("day_of_week_encoded"),
        feature.VectorAssembler()
            .setInputCols(["UnitPrice", "Quantity", "day_of_week_encoded"])
            .setOutputCol("features"),
    ])
    fitted_pipeline = transformation_pipeline.fit(train_data_frame)
    transformed_training = fitted_pipeline.transform(train_data_frame)
    # transformed_training.cache()

    kmeans = clustering.KMeans().setK(2).setSeed(2)
    km_model = kmeans.fit(transformed_training)
    print(f"Training cost: {km_model.summary.trainingCost}")

    transformed_test = fitted_pipeline.transform(test_data_frame)
    transformed_test.summary().show()
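    # A hedged follow-up (not in the original): score the fitted k-means model on
    # the held-out split with the silhouette metric from pyspark.ml.evaluation.
    from pyspark.ml.evaluation import ClusteringEvaluator
    evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction")
    predictions = km_model.transform(transformed_test)
    print(f"Silhouette on test split: {evaluator.evaluate(predictions)}")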
def classification_ml():
    if False:
        spark = SparkSession.builder.appName('classification-ml') \
            .config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.23.1') \
            .getOrCreate()
        df = spark.read \
            .format('jdbc') \
            .option('url', 'jdbc:sqlite:iris.db') \
            .option('driver', 'org.sqlite.JDBC') \
            .option('dbtable', 'iris') \
            .load()
    else:
        spark = SparkSession.builder.appName('classification-ml').getOrCreate()
        df = spark.read.option('header', 'true').option('inferSchema', 'true') \
            .format('csv').load('dataset/iris.csv')
    spark.sparkContext.setLogLevel('WARN')
    df.show()

    labels = [
        ('index', types.IntegerType()),
        ('a1', types.FloatType()),
        ('a2', types.FloatType()),
        ('a3', types.FloatType()),
        ('a4', types.FloatType()),
        ('id', types.StringType()),
        ('label', types.StringType())
    ]

    stringIndexer = ml_feature.StringIndexer(inputCol='label', outputCol='label_int')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[1:5]], outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(
        featuresCol=featuresCreator.getOutputCol(),
        labelCol=stringIndexer.getOutputCol(),
        maxIter=10, regParam=0.01)

    # Create a pipeline.
    pipeline = Pipeline(stages=[stringIndexer, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    df_train, df_test = df.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(df_train)
    test_result = model.transform(df_test)  # DataFrame
    # print(test_result.take(1))
    # test_result.show(5, truncate=True, vertical=False)
    test_result.show(truncate=False)

    # Save and load.
    lr_path = './lr'
    logistic.write().overwrite().save(lr_path)
    lr2 = ml_classification.LogisticRegression.load(lr_path)
    print('Param =', lr2.getRegParam())

    model_path = './lr_model'
    model.write().overwrite().save(model_path)
    model2 = PipelineModel.load(model_path)
    print('Stages =', model.stages)
    print(model.stages[2].coefficientMatrix == model2.stages[2].coefficientMatrix)
    print(model.stages[2].interceptVector == model2.stages[2].interceptVector)
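    # A hedged follow-up (not in the original): score the pipeline's predictions
    # with multiclass accuracy using pyspark.ml's evaluator.
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label_int', predictionCol='prediction', metricName='accuracy')
    print('Accuracy =', evaluator.evaluate(test_result))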
def buildModel(self, save_pipe_path=None):
    df = self.getModelData()
    label_index = fea.StringIndexer(inputCol='user_type', outputCol='label')
    reTokenizer = fea.RegexTokenizer(inputCol='appnames', outputCol='appname_token', pattern=',')
    cnt_vector = fea.CountVectorizer(inputCol='appname_token', outputCol='appname_vector')
    vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol='feature')
    scaler = fea.StandardScaler(inputCol='feature', outputCol='features')

    if not save_pipe_path:
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler])
        pipe = pipeline.fit(df)
        pipe_out = pipe.transform(df)

        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        model = cv.fit(pipe_out)

        print(evaluator.evaluate(model.transform(pipe_out)))
        print('Best Param (elasticNetParam): ', model.bestModel._java_obj.getElasticNetParam())

        predict_result = model.transform(pipe_out).select('probability', 'label').toPandas()
        predict_result.to_csv('/home/chenchen/data/predict_result1.csv', index=False)
    else:
        lr = LogisticRegression(elasticNetParam=1.0)
        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler, lr])
        model = pipeline.fit(df)
        model.save(save_pipe_path)
        print('pipe saved')
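# Hedged usage sketch (not in the original): reload the pipeline that buildModel
# saved and score fresh data. Both the path and `new_df` are hypothetical names;
# new_df must carry the same 'user_type' and 'appnames' columns used above.
from pyspark.ml import PipelineModel
loaded_model = PipelineModel.load('/path/to/saved_pipe')
scored = loaded_model.transform(new_df)
scored.select('probability', 'prediction').show(5)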
def train_evaluate(train_data, test_data):
    # Convert the text category feature to numeric indices, then one-hot encode it
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol='alchemy_category_Index')
    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol='alchemy_category_IndexVec')
    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs, outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label", featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                                 labelCol='label',
                                                 metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder() \
        .addGrid(rf.impurity, ["gini", "entropy"]) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .addGrid(rf.maxBins, [10, 15, 20]) \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)
    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )

    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)
    AUC, AP = evaluate_model(cv_pipeline_model, test_data)

    return AUC, AP, best_parm, cv_pipeline_model
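# The evaluate_model helper referenced above is not shown in this snippet; a
# minimal sketch of what it might look like, assuming it returns areaUnderROC
# and areaUnderPR computed on the test split with the same `ev` module alias:
def evaluate_model(pipeline_model, data):
    predictions = pipeline_model.transform(data)
    auc = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                           labelCol="label",
                                           metricName="areaUnderROC").evaluate(predictions)
    ap = ev.BinaryClassificationEvaluator(rawPredictionCol="probability",
                                          labelCol="label",
                                          metricName="areaUnderPR").evaluate(predictions)
    return auc, ap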
def fill_empty_string(string_in, fill_value='unknown'):
    if not isinstance(string_in, str):
        return fill_value
    elif not string_in:
        return fill_value
    else:
        return string_in


na_handler = ssf.udf(fill_empty_string, sst.StringType())

indexers = {}
for cat_col in cat_cols:
    merged = merged.withColumn(cat_col, na_handler(cat_col))
    indexer = smf.StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}Inx")
    indexer = indexer.fit(merged)
    merged = indexer.transform(merged)
    merged = merged.drop(cat_col).withColumnRenamed(f"{cat_col}Inx", cat_col)
    indexers[cat_col] = indexer
# merged.write.parquet('data/cached/indexed.parquet')

encoder = smf.OneHotEncoder(inputCols=cat_cols,
                            outputCols=[f"{x}Vec" for x in cat_cols])
encoder = encoder.fit(merged)
encoded = encoder.transform(merged)
encoded = encoded.drop(*cat_cols)
encoded = encoded.persist()
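# Hedged usage sketch (not in the original): the fitted indexers and encoder can
# be replayed on a later DataFrame so new data gets the same encodings; `new_df`
# is a hypothetical name carrying the same categorical columns.
for cat_col in cat_cols:
    new_df = new_df.withColumn(cat_col, na_handler(cat_col))
    new_df = indexers[cat_col].transform(new_df)
    new_df = new_df.drop(cat_col).withColumnRenamed(f"{cat_col}Inx", cat_col)
new_df = encoder.transform(new_df).drop(*cat_cols)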
model_data.show()

# Create is_late
model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)

# Convert to an integer
model_data = model_data.withColumn("label", model_data.is_late.cast("integer"))

# Remove missing values
model_data = model_data.filter(
    "arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL"
)
model_data.show()

# Create a StringIndexer
carr_indexer = features.StringIndexer(inputCol="carrier", outputCol="carrier_index")

# Create a OneHotEncoder
carr_encoder = features.OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")

# Create a StringIndexer
dest_indexer = features.StringIndexer(inputCol="dest", outputCol="dest_index")

# Create a OneHotEncoder
dest_encoder = features.OneHotEncoder(inputCol="dest_index", outputCol="dest_fact")

# Make a VectorAssembler
vec_assembler = features.VectorAssembler(inputCols=[
    "month", "air_time", "carrier_fact", "dest_fact", "plane_age"
],
chn = []
for col in df_train.columns:
    count = df_train.filter('{} is null'.format(col)).count()
    if count > 0:
        chn.append({col: count})
print('columns with null values: {}'.format(chn))

# handle missing data
df_train = df_train.na.fill(value=df_train.groupby().avg('Age').take(1)[0][0],
                            subset=['Age'])
df_train = df_train.dropna(subset=['Age', 'Embarked'])

# convert categorical data to numeric
convert_cols = ['Sex', 'Embarked']
for col in convert_cols:
    label_indexer = feature.StringIndexer(inputCol=col, outputCol=col + 'C').fit(df_train)
    df_train = label_indexer.transform(df_train).drop(col)

# create optimal model with backward elimination
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

df_p = df_train.toPandas()
# sc = StandardScaler()
X = df_p.iloc[:, 1:].values
# X = sc.fit_transform(X)
# X = np.append(arr=np.ones((712, 1)).astype(int), values=X, axis=1)
y = df_p.iloc[:, 0].values

X_Opt = X[:, [0, 1, 2, 3, 4, 5, 6]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
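# Hedged sketch of the backward-elimination loop implied above (not in the
# original): repeatedly drop the predictor with the highest p-value until all
# remaining p-values fall below a chosen significance level (0.05 here is an
# assumption). Note the model has no intercept, matching the commented-out
# np.ones step above.
import numpy as np
cols = list(range(X.shape[1]))
while True:
    regressor_OLS = sm.OLS(endog=y, exog=X[:, cols]).fit()
    worst = int(np.argmax(regressor_OLS.pvalues))
    if regressor_OLS.pvalues[worst] <= 0.05:
        break
    del cols[worst]
print(regressor_OLS.summary())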
# These rows are null in every field except district_id, so they can simply be
# dropped; 6052 rows remain.
data_sample = data_sample.dropna()
print('Count of rows: {0}'.format(data_sample.count()))

# Inspect the schema and save the data
data_sample.printSchema()

####################### Create a transformer #######################
# Transform the six variables 'land_condition', 'foundation_type', 'roof_type',
# 'ground_floor_type', 'position' and 'y'.
import pyspark.ml.feature as ft

# For the five multi-class variables 'land_condition', 'foundation_type',
# 'roof_type', 'ground_floor_type' and 'position': first convert the type with
# StringIndexer, then one-hot encode with OneHotEncoderEstimator.
indexer1 = ft.StringIndexer(inputCol="land_condition", outputCol="land_condition_index")
data_sample = indexer1.fit(data_sample).transform(data_sample)
indexer2 = ft.StringIndexer(inputCol="foundation_type", outputCol="foundation_type_index")
data_sample = indexer2.fit(data_sample).transform(data_sample)
indexer3 = ft.StringIndexer(inputCol="roof_type", outputCol="roof_type_index")
data_sample = indexer3.fit(data_sample).transform(data_sample)
indexer4 = ft.StringIndexer(inputCol="ground_floor_type", outputCol="ground_floor_type_index")
data_sample = indexer4.fit(data_sample).transform(data_sample)
indexer5 = ft.StringIndexer(inputCol="position", outputCol="position_index")
data_sample = indexer5.fit(data_sample).transform(data_sample)

encoder = ft.OneHotEncoderEstimator(
    inputCols=['land_condition_index', 'foundation_type_index', 'roof_type_index',
               'ground_floor_type_index', 'position_index'],
    outputCols=['land_condition_vec', 'foundation_type_vec', 'roof_type_vec',
                'ground_floor_type_vec', 'position_vec'])
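# Hedged continuation (not in the original): fit the estimator and apply the
# one-hot encoding in one step. Note that in Spark 3.x OneHotEncoderEstimator
# was renamed to OneHotEncoder.
data_sample = encoder.fit(data_sample).transform(data_sample)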
logger.info("Starting Spark Context") spark = sparknlp.start() conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true")) sc = pyspark.SparkContext.getOrCreate(conf=conf) sqlcontext = pyspark.SQLContext(sc) training_set = (sqlcontext.read.format("parquet").option( "header", True).load(data_dir)) # TF cv = sf.CountVectorizer(inputCol=features, outputCol="tf_features") # IDF idf = sf.IDF(inputCol="tf_features", outputCol="features") # StringIndexer label_string = sf.StringIndexer(inputCol=label, outputCol="label") # Logistic regression lr = LogisticRegression(maxIter=10, family="multinomial") pipeline = Pipeline(stages=[cv, idf, label_string, lr]) paramGrid = (ParamGridBuilder().addGrid(cv.vocabSize, [500, 1000, 1500]).addGrid( lr.regParam, [0.1, 0.01, 0.001]).build()) logger.info("Pipeline created ...") logger.info("Starts grid search ...") crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator(),
def to_index(df, col):
    outcol = col + "_idx"
    indexer = mlf.StringIndexer(inputCol=col, outputCol=outcol)
    # print indexer.params()
    return indexer.fit(df).transform(df).drop(col)
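# Hypothetical usage (column names are illustrative, not from the original):
# replace each categorical column with its indexed counterpart in turn.
for c in ["carrier", "origin", "dest"]:
    df = to_index(df, c)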
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# __author__='zhangyuwei37'

import pyspark.ml.feature as ft
from pyspark.ml import Pipeline

# Feature preprocessing: one-hot encode the categorical variables, scale the
# numeric variables, assemble everything, and output a PCA-reduced result.

# one-hot
indexers = [
    ft.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in nomial_features
]
encoders = [
    ft.OneHotEncoder(inputCol=indexer.getOutputCol(),
                     outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

# scaler
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")
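# The header comment above promises a PCA-reduced output, which is not shown in
# this snippet. A minimal sketch of how that stage could be appended; k=10 is an
# illustrative choice and `df` stands for the input DataFrame (hypothetical name).
pca = ft.PCA(k=10, inputCol="final_features", outputCol="pca_features")
pipeline = Pipeline(stages=indexers + encoders +
                    [assembler_onehot, assembler_numeric, std_scaler,
                     assembler_final, pca])
pca_result = pipeline.fit(df).transform(df).select("pca_features")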
logger.info("Starting Spark Context") conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true")) sc = pyspark.SparkContext.getOrCreate(conf=conf) sqlcontext = pyspark.SQLContext(sc) training_set = (sqlcontext.read.format("parquet").option( "header", True).load(data_dir)) # TF cv = sf.CountVectorizer(inputCol="text", outputCol="tf_features", vocabSize=input_dim) # IDF idf = sf.IDF(inputCol="tf_features", outputCol="features") label_string = sf.StringIndexer(inputCol="first_label", outputCol="label") pipeline_dl = Pipeline(stages=[cv, idf, label_string]) df = pipeline_dl.fit(training_set).transform(training_set) df = df.rdd.map(lambda x: (LabeledPoint(x[ 'label'], MLLibVectors.fromML(x['features'])))) logger.info("Pipeline created ...") logger.info("Transforms the text into tf idf RDD ...") model = create_keras_model(input_dim, output_dim) logger.info("Starts Training ...") spark_model = SparkMLlibModel(model=model, frequency='epoch', mode='asynchronous', parameter_server_mode='socket') spark_model.fit(df, epochs=epochs,
    .builder \
    .appName("pass") \
    .getOrCreate()

# TRAIN
# reading data_train
data_train = spark.read.csv("train.csv", header=True, inferSchema=True)

# preprocessing data
data_train = data_train.drop("PassengerId", "Name", "SibSp", "Parch", "Ticket",
                             "Cabin", "Embarked")
data_train.count()
# data_train = data_train.na.fill(value=data_train.groupby().avg('Age').take(1)[0][0], subset=['Age'])

# convert sex
label_indexer = feature.StringIndexer(inputCol='Sex', outputCol='Sex_num').fit(data_train)
data_train = label_indexer.transform(data_train).drop('Sex')

# choose feature cols
feature_cols = data_train.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
data_train = assembler.setHandleInvalid("skip").transform(data_train)
data_train = data_train.withColumnRenamed('Survived', 'label')
data_train = data_train.select('features', 'label')

# TEST
# reading data_test
data_test = spark.read.csv("test.csv", header=True, inferSchema=True)

# preprocessing data
data_test = data_test.drop("PassengerId", "Name", "SibSp", "Parch", "Ticket",
def main(spark):
    n = len(sys.argv) - 1
    if n < 1:
        print('\nParameters are needed!!\n')
        sys.exit()
    else:
        result_type = sys.argv[1]
        sku_type = sys.argv[2]
        end_date = sys.argv[3]
        end_date_1w = sys.argv[4]
        end_date_2w = sys.argv[5]
        input_train_data_table = sys.argv[6]
        input_predict_data_table = sys.argv[7]
        output_predict_result_table = sys.argv[8]
        predict_date = sys.argv[9]

    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set spark.sql.hive.mergeFiles=true")
    spark.sql("set hive.exec.orc.split.strategy=BI")
    spark.sql("set mapred.job.priority = HIGH")
    spark.sql("set hive.default.fileformat=Orc")
    spark.sql("set hive.exec.parallel=true")
    spark.sql("set hive.auto.convert.join=true")
    spark.sql("set hive.merge.mapfiles = true")
    spark.sql("set hive.merge.mapredfiles = true")
    spark.sql("set hive.merge.size.per.task = 256000000")
    spark.sql("set hive.merge.smallfiles.avgsize=128000000")
    spark.sql("set hive.merge.orcfile.stripe.level=false")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.max.dynamic.partitions=1000000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=1000000")
    spark.sql("set hive.exec.max.created.files=1000000")
    spark.sql("set mapreduce.job.counters.limit=10000")
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
    spark.sql("set spark.shuffle.service.enabled = true")
    spark.sql("set spark.sql.broadcastTimeout = 10000")

    print('end_date = {}\n'.format(end_date))
    print('sku_type = {}\n'.format(sku_type))
    print('result_type = {}\n'.format(result_type))

    ### Build the training and prediction samples
    # Decide the data-pull scope
    if sku_type == 'old':
        sku_type_sql = ' and otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql = ' and otc_days < 60'
    else:
        sku_type_sql = ''

    # Positive samples from the current week
    data_now = spark.sql("""
        select t1.*
        from
        (
            select *
            from """ + input_train_data_table + """
            where end_date = '""" + end_date + """' and label > 0""" + sku_type_sql + """
        )t1
        join
        (
            select item_third_cate_cd
            from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
            where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3
            group by item_third_cate_cd
        )t2
        on t1.item_third_cate_cd = t2.item_third_cate_cd
    """)

    # Positive samples unique to one week earlier
    data_1w = spark.sql("""
        select a.*
        from
        (
            select t1.*
            from
            (
                select *
                from """ + input_train_data_table + """
                where end_date = '""" + end_date_1w + """' and label > 0""" + sku_type_sql + """
            )t1
            join
            (
                select item_third_cate_cd
                from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3
                group by item_third_cate_cd
            )t2
            on t1.item_third_cate_cd = t2.item_third_cate_cd
        )a
        left join
        (
            select item_sku_id, 1 as index
            from """ + input_train_data_table + """
            where end_date = '""" + end_date + """' and label > 0""" + sku_type_sql + """
        )b
        on a.item_sku_id = b.item_sku_id
        where index is null or index = ''
    """)

    # Positive samples unique to two weeks earlier
    data_2w = spark.sql("""
        select a.*
        from
        (
            select t1.*
            from
            (
                select *
                from """ + input_train_data_table + """
                where end_date = '""" + end_date_2w + """' and label > 0""" + sku_type_sql + """
            )t1
            join
            (
                select item_third_cate_cd
                from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da
                where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3
                group by item_third_cate_cd
            )t2
            on t1.item_third_cate_cd = t2.item_third_cate_cd
        )a
        left join
        (
            select item_sku_id, 1 as index
            from """ + input_train_data_table + """
            where end_date = '""" + end_date + """' and label > 0""" + sku_type_sql + """
        )b
        on a.item_sku_id = b.item_sku_id
        where index is null or index = ''
""") # 合并正样本 data = data_now.union(data_1w).union(data_2w) data_filter = data.filter("otc_days >= 0").filter("sku_status_cd = 3001") data_filter.cache() data_count = data_filter.count() print('positive data count = {}\n'.format(data_count)) # 补充负样本 data_neg = spark.sql(""" select t1.* from ( select * from """ + input_train_data_table + """ where end_date = '""" + end_date + """' and label = 0""" + sku_type_sql + """ and otc_days >= 0 and sku_status_cd = 3001 )t1 join ( select item_third_cate_cd from app.app_vdp_ai_sink_dept3_cate3_scope_mid_da where dt = '""" + predict_date + """' and app_id = 4 and scene_id = 1 and status = 3 group by item_third_cate_cd )t2 on t1.item_third_cate_cd = t2.item_third_cate_cd """) data_neg.cache() data_neg_count = data_neg.count() neg_sample_ratio = min(data_count / data_neg_count, 1.0) if data_neg_count > 0 else 0.0 data_neg_sample = data_neg.sample(neg_sample_ratio, seed=66) # 合并正负样本 if result_type == 'ucvr': data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\ .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\ .withColumn('label_adjust',func.when(func.col('label') > 1,1).otherwise(func.col('label')))\ .drop('label').withColumnRenamed('label_adjust','label') else: data_union = data_filter.union(data_neg_sample).orderBy(func.rand(seed=66)).filter("item_first_cate_cd is not null")\ .withColumn('data_type_int', func.col('data_type').cast(IntegerType())).drop('data_type').withColumnRenamed('data_type_int','data_type')\ .withColumn('label_binary',func.when(func.col('label') > 0,1).otherwise(0))\ .drop('label').withColumnRenamed('label_binary','label') # 合并sku embedding特征 predict_date_str = ''.join(predict_date.split('-')) sku_vec = spark.sql( "select * from tmp.tmp_qzl_sink_search_08_sku2vec_features_{0}".format( predict_date_str)) vec_size = len(sku_vec.columns) - 1 data_union_sku2vec = data_union.join(sku_vec, on='item_sku_id', how='left') ### 训练模型 # 特征分类 # 非特征 features_useless = [ 'item_first_cate_name', 'item_second_cate_cd', 'item_second_cate_name', 'item_third_cate_cd', 'item_third_cate_name', 'barndname_full', 'sku_name', 'item_sku_id', 'uv_value_label', 'first_into_otc_tm', 'end_date', 'sku_status_cd', 'red_price', 'red_price_level_rank' ] # 类别型特征 features_catagory = ['item_first_cate_cd'] # embedding特征 features_embedding = ['sku_vec_' + str(i) for i in range(vec_size)] # 数值型特征 features_numerical = [ f for f in data_union_sku2vec.columns if f not in ['label'] + features_useless + features_catagory + features_embedding ] # 特征缺失值统计 feature_na = data_union_sku2vec.agg( *[(1 - (func.count(c) / func.count('*'))).alias(c) for c in data_union_sku2vec.columns]) feature_na_DF = sqlDF2pandasDF(feature_na).T feature_na_DF = feature_na_DF.reset_index() feature_na_DF.columns = ['features', 'na_rate'] for i, row in feature_na_DF.iterrows(): print('{}: {}'.format(row['features'], row['na_rate'])) # 处理缺失值 fillna_value = {c: -1 for c in features_numerical} fillna_value.update({c: -10 for c in features_embedding}) data_union_sku2vec_fillna = data_union_sku2vec.fillna(fillna_value) # 数据预处理 stringIndexer_cd1 = ft.StringIndexer(inputCol="item_first_cate_cd", outputCol="item_first_cate_cd_index") encoder_cd1 = ft.OneHotEncoder(inputCol='item_first_cate_cd_index', outputCol='item_first_cate_cd_vec') featuresCreator = ft.VectorAssembler(inputCols=features_numerical + [encoder_cd1.getOutputCol()] + features_embedding, 
                                         outputCol='features')
    pipeline = Pipeline(stages=[stringIndexer_cd1, encoder_cd1, featuresCreator])
    data_transformer = pipeline.fit(data_union_sku2vec_fillna)
    data_transformed = data_transformer.transform(data_union_sku2vec_fillna)
    data_transformed.cache()
    data_union_count = data_transformed.count()
    print('data_union_count = {}\n'.format(data_union_count))
    data_filter.unpersist()
    data_neg.unpersist()
    p_num = get_best_partition(data_union_count)
    data_transformed = data_transformed.repartition(p_num)

    # Start training
    best_depth = 12   # get_best_depth(data_union_count)
    best_iter = 150   # get_best_iter(data_union_count)
    f = '1.0'         # '0.8'
    s = 1.0           # 0.8
    if result_type == 'ucvr':
        gbdt = GBTRegressor(featuresCol='features', labelCol='label', predictionCol='prediction',
                            lossType='squared', seed=66, maxMemoryInMB=2048, cacheNodeIds=True,
                            maxDepth=best_depth, maxIter=best_iter, featureSubsetStrategy=f,
                            subsamplingRate=s, stepSize=0.01)
    else:
        gbdt = GBTClassifier(featuresCol='features', labelCol='label', predictionCol='prediction',
                             lossType='logistic', seed=66, maxMemoryInMB=2048, cacheNodeIds=True,
                             maxDepth=best_depth, maxIter=best_iter, featureSubsetStrategy=f,
                             subsamplingRate=s, stepSize=0.01)
    gbdt_model = gbdt.fit(data_transformed)

    ### Predict for the candidate items
    # Build the samples to be scored
    if sku_type == 'old':
        sku_type_sql_2 = ' where otc_days >= 60'
    elif sku_type == 'new':
        sku_type_sql_2 = ' where otc_days < 60'
    else:
        sku_type_sql_2 = ''
    data_test = spark.sql("select * from " + input_predict_data_table + sku_type_sql_2)
    data_test = data_test.withColumn('data_type_int',
                                     func.col('data_type').cast(IntegerType())) \
        .drop('data_type').withColumnRenamed('data_type_int', 'data_type')
    data_test.cache()
    data_test_count = data_test.count()
    print('data_test_count = {}\n'.format(data_test_count))
    data_test = data_test.repartition(get_best_partition(data_test_count))

    # Prepare the prediction samples
    data_test_sku2vec = data_test.join(sku_vec, on='item_sku_id', how='left')
    fillna_value_test = {c: -1 for c in features_numerical}
    fillna_value_test.update({c: -10 for c in features_embedding})
    data_test_fillna = data_test_sku2vec.fillna(fillna_value_test)
    # Reuse the transformer fitted on the training data, so category indices and
    # one-hot positions match what the model was trained on.
    data_transformed_test = data_transformer.transform(data_test_fillna)
    data_transformed_test.cache()
    data_test.unpersist()

    # Score the candidate item pool and write out the result
    gbdt_pred_test = gbdt_model.transform(data_transformed_test)
    features_result = [
        'item_third_cate_cd', 'item_sku_id', 'prediction', 'red_price',
        'red_price_level_rank', 'otc_days'
    ]
    if result_type == 'binary_prob':
        gbdt_pred_test = gbdt_pred_test.select(
            ['item_third_cate_cd', 'item_sku_id', 'probability', 'red_price',
             'red_price_level_rank', 'otc_days']) \
            .rdd.map(lambda row: (row['item_third_cate_cd'], row['item_sku_id'],
                                  float(row['probability'][1]), row['red_price'],
                                  row['red_price_level_rank'], row['otc_days'])) \
            .toDF(features_result)
    else:
        gbdt_pred_test = gbdt_pred_test.withColumn(
            'prediction_adjust',
            func.when(func.col('prediction') > 1, 1)
                .when(func.col('prediction') < 0, 0)
                .otherwise(func.col('prediction'))) \
            .drop('prediction').withColumnRenamed('prediction_adjust', 'prediction')
    result = gbdt_pred_test.select(features_result).withColumn(
        'new_old', func.when(func.col('otc_days') < 90, 'new').otherwise('old'))

    result.createOrReplaceTempView("result_df")
    spark.sql("""
        insert overwrite table """ + output_predict_result_table + """
        partition(dt='""" + predict_date + """',sku_type='""" + sku_type + """',result_type='""" + result_type + """')
        select * from result_df
    """)

    data_transformed.unpersist()
    data_transformed_test.unpersist()
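    # Hedged addition (not in the original): inspect which assembled features the
    # GBDT leaned on most, mapping featureImportances indices back to column names
    # via the ml_attr metadata that VectorAssembler writes on the 'features' column.
    attrs = data_transformed.schema['features'].metadata['ml_attr']['attrs']
    name_by_idx = {a['idx']: a['name'] for group in attrs.values() for a in group}
    importances = gbdt_model.featureImportances.toArray()
    for idx, score in sorted(enumerate(importances), key=lambda t: -t[1])[:20]:
        print('{}: {}'.format(name_by_idx.get(idx, 'f{}'.format(idx)), score))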
# convert them to vectors
df_conv01 = convDf(df01)

# prepare for ml
df_prepped01 = prep(df_conv01)
df_prepped02 = df02.select("name").distinct()

# function to apply labels
df_labeled = get_labels(df_prepped01, df_prepped02)
df_labeled = df_labeled.na.drop().drop("version_idx")
cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

# pipeline stages
# index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
# vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
# classify
classifier = DecisionTreeClassifier(labelCol="Label_idx", featuresCol="Features",
                                    maxDepth=10, maxBins=200)

# create a pipeline of the stages and use it to train and test
pipeline = ml.Pipeline(stages=[labelIndexer, toVec, classifier])
train, test = df_labeled.randomSplit([0.7, 0.3], seed=12345)
df_pip = pipeline.fit(train)
predicted = df_pip.transform(test)

# print result
predicted.select("name", "Label_idx", "prediction", "rawPrediction",
# Use city as the label and the pollutant concentrations, time fields and air
# quality level as features. Split the original data into training and test
# sets, predict the city, and finally evaluate with multiclass accuracy (0.4085).
data = spark.sql(
    "select PM25,PM10,NO2,SO2,O3_1,O3_8h,CO,AQI,level,year,month,date,hour,city from init_df"
)

# Assemble the feature columns into a single vector
vector_assembler = ft.VectorAssembler(inputCols=[
    "PM25", "PM10", "NO2", "SO2", "O3_1", "O3_8h", "CO", "AQI", "level",
    "year", "month", "date", "hour"
], outputCol="features")
data = vector_assembler.transform(data)
# data.show()

# Encode city as a numeric label
label_indexer = ft.StringIndexer(inputCol="city", outputCol="city_int").fit(data)
label_converter = ft.IndexToString(inputCol="pred_int", outputCol="pred",
                                   labels=label_indexer.labels)

train, test = data.randomSplit([0.7, 0.3])

# Define the random forest classifier
classifier = RandomForestClassifier(labelCol="city_int",
                                    featuresCol="features",
                                    predictionCol="pred_int",
                                    maxDepth=8,
                                    maxBins=128,
                                    maxMemoryInMB=512,
                                    numTrees=50)

# Train the model and predict
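# A hedged sketch of the step announced above (not in the original): index the
# label on both splits, fit the forest, map predicted indices back to city names,
# and score multiclass accuracy (the 0.4085 figure quoted in the header comment).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
model = classifier.fit(label_indexer.transform(train))
predictions = model.transform(label_indexer.transform(test))
predictions = label_converter.transform(predictions)
accuracy = MulticlassClassificationEvaluator(labelCol="city_int",
                                             predictionCol="pred_int",
                                             metricName="accuracy").evaluate(predictions)
print("accuracy = {}".format(accuracy))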
    .builder \
    .appName("HW-5-1") \
    .getOrCreate()

# reading data_train
data_train = spark.read.csv("iris_train.csv", header=True, inferSchema=True)

# vectorize all numerical columns into a single feature column
feature_cols = data_train.columns[:-1]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
data_train = assembler.transform(data_train)

# convert text labels into indices
data_train = data_train.select(['features', 'class'])
label_indexer = feature.StringIndexer(inputCol='class', outputCol='label').fit(data_train)
data_train = label_indexer.transform(data_train)

# reading data_test
data_test = spark.read.csv("iris_test.csv", header=True, inferSchema=True)

# vectorize all numerical columns into a single feature column
feature_cols = data_test.columns[:-1]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
data_test = assembler.transform(data_test)

# convert text labels into indices
data_test = data_test.select(['features', 'Species'])
label_indexer = feature.StringIndexer(inputCol='Species', outputCol='label').fit(data_test)
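# Hedged continuation (not in the original): apply the indexer to the test split.
# Note that fitting a separate StringIndexer on the test file can assign indices
# in a different order than the training-set indexer; reusing the indexer fitted
# on data_train (when the label column names match) keeps the encoding consistent.
data_test = label_indexer.transform(data_test)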