def quantile_discretizer(self, df, column, num_buckets):
    """Bucket one or several numeric columns by quantile.

    Every input column ``c`` gets a companion column ``c_quant`` holding the
    bucket index produced by ``QuantileDiscretizer``.

    :param df: input DataFrame.
    :param column: a single column name, or a list of column names.
    :param num_buckets: number of buckets. When ``column`` is a list this is
        passed as ``numBucketsArray`` and its length is printed, so it must
        be a sequence (one entry per column).
    :return: ``df`` repartitioned to a single partition, with the bucket
        column(s) appended.
    """
    print('QuantileDiscretizerExample')
    df = df.repartition(1)

    # The estimator is created with handleInvalid="keep" directly; the
    # original passed "error" and then immediately overrode it with
    # setHandleInvalid("keep"), so the effective setting is unchanged.
    if isinstance(column, list):
        # Fix: output names are derived from the requested columns. The
        # original iterated over `numic_columns`, a name that is not defined
        # in this function.
        output_column = [str(v) + '_quant' for v in column]
        print(len(column), len(output_column), len(num_buckets))
        discretizer = QuantileDiscretizer(
            relativeError=0.01,
            handleInvalid="keep",
            numBucketsArray=num_buckets,
            inputCols=column,
            outputCols=output_column,
        )
    else:
        # numBuckets sets the number of buckets for the single column.
        discretizer = QuantileDiscretizer(
            numBuckets=num_buckets,
            relativeError=0.01,
            handleInvalid="keep",
            inputCol=column,
            outputCol=column + '_quant',
        )

    result = discretizer.fit(df).transform(df)
    return result
def data_transformation(self, df):
    """Add calendar, temperature-bucket and store-category columns to ``df``.

    Columns added: ``date`` (parsed from ``DateStringYYYYMMDD``), ``Year``,
    ``Day``, ``Month``, ``temp_quantile`` (6 quantile buckets of
    ``AvgHourlyTemp``), ``store_variability`` and ``store_rank``. Stores not
    listed in any group get no value for the two category columns because the
    ``when`` chains have no ``otherwise``.

    :param df: input DataFrame.
    :return: the DataFrame with the columns above appended.
    """
    # Store groupings; per the original notes these come from boxplot
    # variation and from mean revenue per store id respectively.
    variable_stores = ('16', '17', '18', '20', '22', '31')
    stable_stores = ('11', '2', '21', '23', '32', '34', '36', '38')
    great_stores = ('31', '17', '20', '38', '2', '36', '21')
    good_stores = ('18', '32', '34', '23')
    ok_stores = ('11', '22', '16')

    parsed_date = from_unixtime(
        unix_timestamp('DateStringYYYYMMDD', 'yyyyMMdd'))
    df = (
        df.withColumn("date", parsed_date)
        .withColumn("Year", year("date"))
        .withColumn("Day", dayofmonth("date"))
        .withColumn("Month", month("date"))
    )

    # Bin the continuous temperature variable by quantiles.
    temp_binner = QuantileDiscretizer(numBuckets=6,
                                      inputCol="AvgHourlyTemp",
                                      outputCol="temp_quantile")
    df = temp_binner.fit(df).transform(df)

    store_id = F.col("Store_ID")
    variability = (
        F.when(store_id.isin(*variable_stores), "variable")
        .when(store_id.isin(*stable_stores), "not-variable")
    )
    rank = (
        F.when(store_id.isin(*great_stores), "great")
        .when(store_id.isin(*good_stores), "good")
        .when(store_id.isin(*ok_stores), "ok")
    )
    df = df.withColumn('store_variability', variability)
    df = df.withColumn('store_rank', rank)
    return df
def discrete(self):
    """Demonstrate ``Bucketizer`` and ``QuantileDiscretizer`` on toy data.

    Builds two small DataFrames through ``self.session``, buckets the first
    with fixed splits and the second into 3 quantile buckets, and prints the
    results with ``show()``. Returns nothing.
    """
    # Bucketizer: fixed, user-supplied split points.
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
    dataFrame = self.session.createDataFrame(data, ["features"])
    bucketizer = Bucketizer(splits=splits,
                            inputCol="features",
                            outputCol="bucketedFeatures")
    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()

    # QuantileDiscretizer: split points learned from the data.
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    # Fix: use the session, as the Bucketizer part above does. The original
    # called `self.createDataFrame`, inconsistent with `self.session` used a
    # few lines earlier.
    df = self.session.createDataFrame(data, ["id", "hour"])
    discretizer = QuantileDiscretizer(numBuckets=3,
                                      inputCol="hour",
                                      outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()
def Run(self):
    """Build a per-decile gain / KS / lift table from ``self._df``.

    Rows are ordered by ``self._proba_column`` descending, given an ``id``
    and split into 10 quantile buckets of that id (``deciles``). For every
    decile the method counts rows (``cnt``), rows whose
    ``self._target_column`` equals ``int(self._posLabel)`` (``cnt_resp``) and
    the remainder (``cnt_non_resp``), then derives cumulative percentages,
    ``KS``, ``Lift at Decile`` and ``Total_Lift``.

    Side effect: ``self._df`` is replaced by the ordered DataFrame with the
    extra ``id`` and ``deciles`` columns.

    :return: the per-decile DataFrame (without the helper ``id`` column).
    """
    def Lift(cum_resp, decil_no):
        # decil_no is the row id assigned to the ordered decile table below.
        return (cum_resp / float(decil_no + 1))

    self._df = self._df.orderBy(self._proba_column, ascending=False)
    # NOTE: monotonically_increasing_id yields increasing but not necessarily
    # consecutive ids; a row_number() over a window ordered by the
    # probability column was the alternative considered in the original.
    self._df = self._df.withColumn("id", monotonically_increasing_id())
    discretizer = QuantileDiscretizer(numBuckets=10,
                                      inputCol="id",
                                      outputCol="deciles")
    self._df = discretizer.fit(self._df).transform(self._df)

    Rank = self._df.groupby('deciles').agg(
        F.count(self._df[self._proba_column]).alias('cnt'),
        F.count(when(self._df[self._target_column] == int(self._posLabel),
                     True)).alias('cnt_resp'))
    Rank = Rank.withColumn('cnt_non_resp', Rank['cnt'] - Rank['cnt_resp'])
    Rank = Rank.orderBy('deciles', ascending=True)

    # Improvement: the grand totals are fetched once. The original issued a
    # separate select(...).collect() for every use (cnt_resp twice,
    # cnt_non_resp and cnt once each).
    totals = Rank.agg(F.sum('cnt_resp'),
                      F.sum('cnt_non_resp'),
                      F.sum('cnt')).collect()[0]
    total_resp = totals[0]
    total_non_resp = totals[1]
    total_cnt = totals[2]

    cumsum_window = (Window.orderBy(Rank['deciles'])
                     .rangeBetween(Window.unboundedPreceding,
                                   Window.currentRow))
    Rank = Rank.withColumn("cum_resp", F.sum('cnt_resp').over(cumsum_window))
    Rank = Rank.withColumn("cum_non_resp",
                           F.sum('cnt_non_resp').over(cumsum_window))
    Rank = Rank.withColumn(
        "% Responders(Cumulative)",
        F.round(old_div(Rank["cum_resp"] * 100, total_resp), 2))
    Rank = Rank.withColumn(
        "% Non-Responders(Cumulative)",
        F.round(old_div(Rank["cum_non_resp"] * 100, total_non_resp), 2))
    Rank = Rank.withColumn("cum_population",
                           F.sum("cnt").over(cumsum_window))
    Rank = Rank.withColumn(
        "pop_pct_per_decile",
        F.round(old_div(Rank["cnt"] * 100, total_cnt)))
    Rank = Rank.withColumn(
        "% Population(Cumulative)",
        F.round(F.sum('pop_pct_per_decile').over(cumsum_window)))
    Rank = Rank.withColumn(
        "KS",
        F.round(Rank["% Responders(Cumulative)"]
                - Rank["% Non-Responders(Cumulative)"], 2))
    Rank = Rank.withColumn(
        "Lift at Decile",
        F.round(old_div(Rank["cnt_resp"] * Rank["pop_pct_per_decile"] * 100,
                        total_resp), 2))

    # Temporary id used only as the decile position for Total_Lift.
    Rank = Rank.withColumn("id", monotonically_increasing_id())
    Lift_udf = udf(lambda x, y: Lift(x, y), FloatType())
    Rank = Rank.withColumn("Total_Lift",
                           F.round(Lift_udf("cum_resp", "id"), 2))
    Rank = Rank.drop('id')
    return (Rank)
def load(dev):
    """Load the submissions through the MongoDB Spark source.

    Reads the collection via ``dev``, keeps four fields, converts
    ``is_nfsw`` to float and ``text_embedded`` to a dense vector, and adds
    ``upvote_class``: 6 quantile buckets of ``upvote_ratio``. The schema is
    printed before and after, and the final frame is shown.

    :param dev: object exposing ``read`` and ``sql`` (used like a Spark
        session).
    :return: the transformed DataFrame.
    """
    def to_row(r):
        # Rebuild each record with the types the ML stages expect.
        return Row(score=r['score'],
                   upvote_ratio=r['upvote_ratio'],
                   is_nfsw=float(r['is_nfsw']),
                   text_embedded=Vectors.dense(r['text_embedded']))

    logging.info('Loading submissions...')
    submissions = dev.read.format(
        'com.mongodb.spark.sql.DefaultSource').load()
    submissions.createOrReplaceTempView('submissions')
    submissions.printSchema()

    query = 'select score, upvote_ratio, is_nfsw, text_embedded from \
        submissions'
    selected = dev.sql(query)
    converted = selected.rdd.map(to_row).toDF()

    bucketer = QuantileDiscretizer(numBuckets=6,
                                   inputCol='upvote_ratio',
                                   outputCol='upvote_class')
    bucketed = bucketer.fit(converted).transform(converted)
    bucketed.printSchema()
    bucketed.show()
    return bucketed
def test_quantilediscretizer_converter(self):
    """A fitted 2-bucket QuantileDiscretizer converts to torch and agrees.

    Fits the discretizer on iris ``sepal_length``, converts the fitted model
    with ``convert(..., "torch", ...)`` and checks that the converted
    model's output matches Spark's bucket column within 1e-6.
    """
    column = "sepal_length"
    bucket_column = "sepal_length_bucket"

    iris = load_iris()
    features = [
        "sepal_length", "sepal_width", "petal_length", "petal_width"
    ]
    pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                         columns=features + ["target"])
    test_df = sql.createDataFrame(pd_df).select(column)

    model = QuantileDiscretizer(inputCol=column,
                                outputCol=bucket_column,
                                numBuckets=2).fit(test_df)

    torch_model = convert(model, "torch", test_df)
    self.assertTrue(torch_model is not None)

    expected = model.transform(test_df).select(bucket_column).toPandas()
    actual = torch_model.transform(pd_df[[column]])
    np.testing.assert_allclose(expected.to_numpy(),
                               actual,
                               rtol=1e-06,
                               atol=1e-06)
def Binning(df, num_col, no_of_buckets):
    """Quantile-bin ``num_col`` and attach a readable range label per row.

    Adds ``bucket_no`` (bucket index from ``QuantileDiscretizer``) and
    ``<num_col>_bucket_range`` (``"<lower> to <upper>"`` text for that
    bucket). ``num_col`` is cast to double for fitting and cast back to its
    original type afterwards.

    :param df: input DataFrame.
    :param num_col: name of the numeric column to bin.
    :param no_of_buckets: requested number of buckets.
    :return: ``(DataFrame, bucket_dict)`` where ``bucket_dict`` maps each
        bucket index (as float) to its range label.
    """
    tdf = df.withColumn(num_col, col(num_col).cast('double'))
    # The cast above already fails if the column is missing, so this lookup
    # cannot leave the original dtype undefined the way the old loop could.
    o_dtype = dict(df.dtypes)[num_col]

    qds = QuantileDiscretizer(numBuckets=no_of_buckets,
                              inputCol=num_col,
                              outputCol="bucket_no")
    bucketizer = qds.fit(tdf)
    splits = bucketizer.getSplits()
    tdf = bucketizer.transform(tdf)

    # Fix: label the buckets that actually exist. The discretizer may return
    # fewer buckets than requested (e.g. too few distinct values), in which
    # case indexing splits[i + 1] for i < no_of_buckets raised IndexError.
    bucket_dict = {
        float(i): str(lower) + ' to ' + str(upper)
        for i, (lower, upper) in enumerate(zip(splits, splits[1:]))
    }

    mapping_expr = create_map([lit(x) for x in chain(*bucket_dict.items())])
    tdf = tdf.withColumn(num_col + '_bucket_range',
                         mapping_expr.getItem(col('bucket_no')))
    tdf = tdf.withColumn(num_col, col(num_col).cast(o_dtype))
    return tdf, bucket_dict
def ratingFeatures(ratingSamples):
    """Show per-movie rating features after bucketing and scaling.

    Aggregates ``ratingSamples`` by ``movieId`` (count, average and variance
    of ``rating``), buckets the count into 100 quantile buckets, min-max
    scales the average, and prints intermediate and final frames. Nothing is
    returned.

    :param ratingSamples: DataFrame with ``movieId`` and ``rating`` columns.
    """
    # Wrap the average in a one-element vector (the scaler below takes a
    # vector column) and unwrap it again afterwards.
    to_vector = udf(lambda x: Vectors.dense(x), VectorUDT())
    first_component = udf(lambda v: float(v[0]), FloatType())

    ratingSamples.printSchema()
    ratingSamples.show()

    # Per-movie statistics: number of ratings, mean rating, rating variance.
    per_movie = ratingSamples.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('ratingCount'),
        F.avg("rating").alias("avgRating"),
        F.variance('rating').alias('ratingVar'))
    movieFeatures = per_movie.withColumn('avgRatingVec',
                                         to_vector('avgRating'))
    movieFeatures.show(10)

    # Feature pipeline: quantile bucketing of ratingCount followed by
    # min-max normalisation of the average-rating vector.
    stages = [
        QuantileDiscretizer(numBuckets=100,
                            inputCol="ratingCount",
                            outputCol="ratingCountBucket"),
        MinMaxScaler(inputCol="avgRatingVec", outputCol="scaleAvgRating"),
    ]
    fitted = Pipeline(stages=stages).fit(movieFeatures)
    movieProcessedFeatures = fitted.transform(movieFeatures)

    # Bucket index to integer, scaled vector back to a plain float.
    movieProcessedFeatures = movieProcessedFeatures.withColumn(
        'ratingCountBucket',
        F.col('ratingCountBucket').cast(IntegerType()))
    movieProcessedFeatures = movieProcessedFeatures.withColumn(
        'scaleAvgRating', first_component(F.col('scaleAvgRating')))
    movieProcessedFeatures = movieProcessedFeatures.drop(
        F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)
def test_pipeline3(self):
    """Pipeline of two discretizers, assembler and LR converts to torch.

    Fits the pipeline on iris, converts it with ``convert(..., "torch",
    ...)`` and checks that predictions and class probabilities of the
    converted model match Spark's within tolerance.
    """
    raw_columns = [
        "sepal_length", "sepal_width", "petal_length", "petal_width"
    ]
    iris = load_iris()
    pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                         columns=raw_columns + ["label"])
    df = sql.createDataFrame(pd_df)

    length_buckets = QuantileDiscretizer(inputCol="sepal_length",
                                         outputCol="sepal_length_bucket",
                                         numBuckets=2)
    width_buckets = QuantileDiscretizer(inputCol="sepal_width",
                                        outputCol="sepal_width_bucket",
                                        numBuckets=2)
    assembler = VectorAssembler(
        inputCols=["sepal_length_bucket", "sepal_width_bucket"] + raw_columns,
        outputCol="features")
    model = Pipeline(stages=[
        length_buckets, width_buckets, assembler, LogisticRegression()
    ]).fit(df)

    # Score on the raw feature columns only.
    df = df.select(raw_columns)
    pd_df = pd_df[raw_columns]

    torch_model = convert(model, "torch", df)
    self.assertTrue(torch_model is not None)

    spark_predictions = np.array(
        model.transform(df).select("prediction").collect()).reshape(-1)
    np.testing.assert_allclose(
        spark_predictions,
        torch_model.predict(pd_df),
        rtol=1e-06,
        atol=1e-06,
    )

    spark_probabilities = np.array(
        model.transform(df).select("probability").collect()).reshape(-1, 3)
    np.testing.assert_allclose(
        spark_probabilities,
        torch_model.predict_proba(pd_df),
        rtol=1e-06,
        atol=1e-05,
    )
def quantile_discretizer(dataFrame, inputCol, numBuckets=4):
    """Append a quantile-bucket column for ``inputCol``.

    :param dataFrame: input DataFrame.
    :param inputCol: name of the column to discretize.
    :param numBuckets: number of buckets requested (default 4).
    :return: ``dataFrame`` with ``<inputCol>_bucketizer`` appended.
    """
    bucket_column = '{}_bucketizer'.format(inputCol)
    estimator = QuantileDiscretizer(numBuckets=numBuckets,
                                    inputCol=inputCol,
                                    outputCol=bucket_column)
    fitted = estimator.fit(dataFrame)
    return fitted.transform(dataFrame)
def aggregate_spark(data, features, args):
    """Return the quantile split points of one column.

    :param data: DataFrame to fit on.
    :param features: mapping whose ``"col"`` entry names the input column.
    :param args: mapping whose ``"num_buckets"`` entry is the bucket count.
    :return: the splits of the fitted discretizer.
    """
    from pyspark.ml.feature import QuantileDiscretizer

    # The output column is never materialised; "_" is only a placeholder.
    estimator = QuantileDiscretizer(numBuckets=args["num_buckets"],
                                    inputCol=features["col"],
                                    outputCol="_")
    fitted = estimator.fit(data)
    return fitted.getSplits()
def do_quantile_discretizer(input_data, result_data, column, prefix="buckets_", num_buckets=100):
    """Fit quantile buckets on one DataFrame and apply them to another.

    :param input_data: DataFrame the bucket boundaries are learned from.
    :param result_data: DataFrame the learned boundaries are applied to.
    :param column: name of the column to discretize.
    :param prefix: prefix of the output column name.
    :param num_buckets: number of buckets requested.
    :return: ``result_data`` with ``prefix + column`` appended.
    """
    estimator = QuantileDiscretizer(inputCol=column,
                                    outputCol=prefix + column,
                                    numBuckets=num_buckets)
    return estimator.fit(input_data).transform(result_data)
def ReturnQuartile(df, cols):
    """Add a quartile column for each of the given columns.

    :param df: input DataFrame.
    :param cols: list of column names to split into 4 quantile buckets.
    :return: ``df`` with one ``<name>bin`` column per entry of ``cols``.
    """
    binned = df
    for name in cols:
        # Learn the quartile boundaries on the frame built so far, then
        # apply them to it.
        quartiles = QuantileDiscretizer(numBuckets=4,
                                        inputCol=name,
                                        outputCol=name + "bin",
                                        relativeError=0.01,
                                        handleInvalid="error")
        binned = quartiles.fit(binned).transform(binned)
    return binned
def qcut(input_col, output_col, num_buckets):
    """
    Bin a column into n quantile buckets.

    ``self`` is not a parameter; it is taken from the enclosing scope and is
    used as the DataFrame to fit and transform.
    :param input_col: name of the column to bin
    :param output_col: name of the column receiving the bucket index
    :param num_buckets: number of buckets requested
    :return: the DataFrame with ``output_col`` appended
    """
    estimator = QuantileDiscretizer(numBuckets=num_buckets,
                                    inputCol=input_col,
                                    outputCol=output_col)
    fitted = estimator.fit(self)
    return fitted.transform(self)
def qcut(input_col, output_col, num_buckets):
    """
    Bin a column into n buckets with a Quantile Discretizer.

    The DataFrame operated on is ``self``, which this function reads from
    its enclosing scope rather than receiving as an argument.
    :param input_col: Input column to be processed
    :param output_col: Output column holding the bin number
    :param num_buckets: Number of buckets the column will be divided into
    :return: the DataFrame with the bin column added
    """
    params = dict(numBuckets=num_buckets,
                  inputCol=input_col,
                  outputCol=output_col)
    model = QuantileDiscretizer(**params).fit(self)
    binned = model.transform(self)
    return binned
def quantiles(df, input_col):
    """Return the quantile split points of ``input_col``.

    Any exception raised while building or fitting the discretizer is
    printed and then re-raised unchanged.

    :param df: DataFrame to fit on.
    :param input_col: name of the column to analyse.
    :return: the splits of the fitted discretizer.
    """
    try:
        estimator = QuantileDiscretizer(
            # 254 is used so the 255th can be inf
            numBuckets=254,
            inputCol=input_col,
            outputCol='bucketed',
            relativeError=1. / 2550,
            handleInvalid='error')
        fitted = estimator.fit(df)
        return fitted.getSplits()
    except Exception as e:
        print(e)
        raise
def quantile_bucketize(col_list: list, num_buckets : int = 3):
    """Replace columns by quantile buckets learned from non-zero values.

    The DataFrame is ``self``, read from the enclosing scope. For every name
    in ``col_list`` that exists in the schema, bucket boundaries are fitted
    on the rows where the column is not 0, applied to the whole frame with
    invalid values kept, and the source column is dropped in favour of
    ``<name>_bucket``. Names absent from the schema are ignored.

    :param col_list: names of the columns to bucketize.
    :param num_buckets: number of buckets requested (default 3).
    :return: the transformed DataFrame.
    """
    from pyspark.ml.feature import QuantileDiscretizer

    frame = self
    for name in col_list:
        if name not in frame.schema.names:
            continue
        non_zero = frame.select(name).where(col(name) != 0)
        estimator = QuantileDiscretizer(numBuckets=num_buckets,
                                        inputCol=name,
                                        outputCol=name + "_bucket")
        model = estimator.fit(non_zero).setHandleInvalid('keep')
        frame = model.transform(frame).drop(name)
    return frame
def qcut(columns, num_buckets, handle_invalid="skip"):
    """
    Bin columns into n buckets with a Quantile Discretizer.

    The DataFrame is ``self``, read from the enclosing scope. ``columns`` is
    resolved through ``parse_columns`` with the numeric dtype filter, and
    each resulting column gets a companion ``<name>_qcut`` column.
    :param columns: Input columns to be processed
    :param num_buckets: Number of buckets in which each column is divided
    :param handle_invalid: value passed to the discretizer's handleInvalid
    :return: the DataFrame with the ``_qcut`` columns appended
    """
    selected = parse_columns(self, columns,
                             filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    binned = self
    for name in selected:
        estimator = QuantileDiscretizer(numBuckets=num_buckets,
                                        inputCol=name,
                                        outputCol=name + "_qcut",
                                        handleInvalid=handle_invalid)
        binned = estimator.fit(binned).transform(binned)
    return binned
def _sparse_feature_with_quantile(train_data: DataFrame, test_data: DataFrame, sc: SparkContext):
    """
    Discretize the continuous feature columns.

    Split points are learned on ``train_data`` for every column in
    ``config.CONTINUOUS_COLUMNS``, written to HDFS as JSON, and then applied
    to both data sets with a ``Bucketizer`` that skips invalid rows.
    :param train_data: training DataFrame (also used for fitting)
    :param test_data: test DataFrame
    :param sc: sparkContext
    :return: train and test DataFrames with the bucket features, and the
        dict of split points per column
    """
    bucket_count = config.CONTINUOUS_COLUMNS_BUCKET_NUM
    prefix = config.BUCKET_FEATURE_PREFIX

    # Pass 1: learn the split points of every continuous column.
    feature_bucket_splits_dict = dict()
    for column_name in config.CONTINUOUS_COLUMNS:
        estimator = QuantileDiscretizer(numBuckets=bucket_count,
                                        inputCol=column_name,
                                        outputCol=prefix + column_name,
                                        relativeError=0.01,
                                        handleInvalid="error")
        fitted = estimator.fit(train_data)
        feature_bucket_splits_dict[column_name] = fitted.getSplits()

    # Persist the splits so the same boundaries can be reused when serving.
    utils.write_to_hdfs(sc,
                        config.CONTINUOUS_COLUMNS_BUCKETS,
                        json.dumps(feature_bucket_splits_dict),
                        overwrite=True)
    print("[INFO] save continuous columns buckets {0}".format(
        config.CONTINUOUS_COLUMNS_BUCKETS))

    # Pass 2: apply the stored splits to both data sets.
    for input_col, splits in feature_bucket_splits_dict.items():
        output_col = prefix + input_col
        bucket_model = feature.Bucketizer(
            inputCol=input_col, outputCol=output_col,
            splits=splits).setHandleInvalid("skip")
        train_data = bucket_model.transform(train_data)
        test_data = bucket_model.transform(test_data)
        train_data.select(output_col).show(10, False)
        print("[INFO] continuous sparse transform {0} to {1}".format(
            input_col, output_col))

    return train_data, test_data, feature_bucket_splits_dict
def numerical_example(data: DataFrame):
    """Print per-movie rating features before and after bucketing/scaling.

    Aggregates ``data`` by ``movieId`` (count, average and variance of
    ``rating``), buckets the count into 100 quantile buckets and min-max
    scales the average-rating vector. Both frames are passed to
    ``print_info``; nothing is returned.

    :param data: DataFrame with ``movieId`` and ``rating`` columns.
    """
    aggregated = data.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("ratingCount"),
        F.avg("rating").alias("avgRating"),
        F.variance("rating").alias("ratingVar"))
    movie_features = aggregated.withColumn(
        "avgRatingVec", udf_avg_rating_to_vec(F.col("avgRating")))
    print_info(movie_features)

    # Stage 1 buckets the rating count, stage 2 normalises the average.
    feature_pipeline = Pipeline(stages=[
        QuantileDiscretizer(numBuckets=100,
                            inputCol="ratingCount",
                            outputCol="ratingCountBucket"),
        MinMaxScaler(inputCol="avgRatingVec", outputCol="scaleAvgRating"),
    ])
    fitted = feature_pipeline.fit(movie_features)
    movie_processed_features = fitted.transform(movie_features)
    print_info(movie_processed_features)
def ratingFeatures(ratingSamples):
    """Show per-movie rating statistics and their processed features.

    Computes count, average and variance of ``rating`` per ``movieId``,
    then runs a pipeline that buckets the count into 100 quantile buckets
    and min-max scales the average-rating vector. Intermediate and final
    frames are printed; nothing is returned.

    :param ratingSamples: DataFrame with ``movieId`` and ``rating`` columns.
    """
    ratingSamples.printSchema()
    ratingSamples.show()

    # Average movie rating score and rating count per movie.
    as_vector = udf(lambda x: Vectors.dense(x), VectorUDT())
    stats = ratingSamples.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('ratingCount'),
        F.avg("rating").alias("avgRating"),
        F.variance('rating').alias('ratingVar'))
    movieFeatures = stats.withColumn('avgRatingVec', as_vector('avgRating'))
    movieFeatures.show(10)

    bucketing = QuantileDiscretizer(numBuckets=100,
                                    inputCol="ratingCount",
                                    outputCol="ratingCountBucket")
    normalization = MinMaxScaler(inputCol="avgRatingVec",
                                 outputCol="scaleAvgRating")
    featurePipeline = Pipeline(stages=[bucketing, normalization])
    pipelineModel = featurePipeline.fit(movieFeatures)
    movieProcessedFeatures = pipelineModel.transform(movieFeatures)
    movieProcessedFeatures.show(10)
def _load_dataframe(sc):
    """Load the submissions through the configured Spark SQL source.

    Reads the collection via ``sc``, keeps four fields, converts
    ``is_nfsw`` to float and ``text_embedded`` to a dense vector, and adds
    ``upvote_class``: 6 quantile buckets of ``upvote_ratio``.

    :param sc: object exposing ``read`` and ``sql`` (used like a Spark
        session).
    :return: the transformed DataFrame.
    """
    df = sc.read.format(SPARK_SQL_DEFAULT).load()
    df.createOrReplaceTempView('submissions')

    # Fix: the separator before "from" is now part of the literal. The
    # original used a backslash continuation right after "text_embedded", so
    # the query was only valid while the continuation line stayed indented.
    query = ('select score, upvote_ratio, is_nfsw, text_embedded '
             'from submissions')
    df = sc.sql(query)

    # Rebuild each record with the types the ML stages expect.
    df = df.rdd.map(
        lambda r: Row(score=r['score'],
                      upvote_ratio=r['upvote_ratio'],
                      is_nfsw=float(r['is_nfsw']),
                      text_embedded=Vectors.dense(r['text_embedded']))
    ).toDF()

    qd = QuantileDiscretizer(numBuckets=6,
                             inputCol='upvote_ratio',
                             outputCol='upvote_class')
    df = qd.fit(df).transform(df)
    return df
def pipeline_preprocess(inp_df):
    """Fit the preprocessing pipeline on ``inp_df``.

    Stages, in order: a string indexer plus one-hot encoder per column of
    the module-level ``cat_cols``; 4 quantile buckets of ``accountlength``;
    a string indexer turning ``churn`` into ``label``; and a vector
    assembler producing ``features`` from the encoded categoricals and the
    module-level ``num_cols`` (with ``accountlength`` replaced by its
    bucket column).

    :param inp_df: DataFrame to fit on.
    :return: the fitted pipeline model.
    """
    stages = []  # steps of the data transformation, in execution order

    # Categorical variables need both indexing and encoding, so the two
    # stages are added together.
    for name in cat_cols:
        indexer = StringIndexer(inputCol=name, outputCol=name + "Index")
        encoder = OneHotEncoderEstimator(
            inputCols=[indexer.getOutputCol()],
            outputCols=[name + "catVec"])
        stages.extend([indexer, encoder])

    stages.append(QuantileDiscretizer(numBuckets=4,
                                      inputCol="accountlength",
                                      outputCol="acctlen_bin"))

    # Label index for the output column.
    stages.append(StringIndexer(inputCol="churn", outputCol="label"))

    # Assemble everything into the feature vector.
    numeric_cols = num_cols.copy()
    numeric_cols.remove("accountlength")
    numeric_cols.append("acctlen_bin")
    inp_features = [name + "catVec" for name in cat_cols] + numeric_cols
    stages.append(VectorAssembler(inputCols=inp_features,
                                  outputCol="features"))

    return Pipeline(stages=stages).fit(inp_df)
def create_quantilesDiscretizer(input_col: str, nq:int) -> QuantileDiscretizer:
    """
    Build an unfitted Quantile Discretizer for one column.

    The output column is the input column name followed by ``_encoded``;
    the estimator is configured with relativeError 0.05 and
    handleInvalid 'keep'.

    Parameters
    ----------
    input_col: str
        Name of the input column
    nq: int
        Number of quantiles (buckets) to use

    Return
    ------
    QuantileDiscretizer
    """
    settings = {
        "numBuckets": nq,
        "relativeError": 0.05,
        "handleInvalid": 'keep',
        "inputCol": input_col,
        "outputCol": input_col + "_encoded",
    }
    return QuantileDiscretizer(**settings)
def evaluateKs(self, predictions: 'sparkdf', tableName: 'string', prob: 'string' = "core") -> 'double':
    """Compute the KS statistic of ``predictions`` over 10 score buckets.

    Registers ``predictions`` as the temp view ``tableName``, selects
    ``score`` (as ``prob``) and ``label``, splits ``prob`` into 10 quantile
    buckets, aggregates label counts and prob range per bucket and hands the
    collected rows to ``self.compute_ks``. The KS value and the per-bucket
    lines are printed.

    Side effect: creates or replaces the temp views ``tableName``,
    ``tableName + "_result"`` and ``tableName + "_result_with_cut"``.

    :param predictions: DataFrame with ``score`` and ``label`` columns.
    :param tableName: base name for the temp views.
    :param prob: not used by this method.
        NOTE(review): the query hard-codes the ``score`` column; confirm
        whether this parameter was meant to select it.
    :return: the KS value as float.
    """
    predictions.createOrReplaceTempView(tableName)
    result = self.spark.sql("SELECT score as prob, label FROM %s" % tableName)

    viewName = tableName + "_result"
    # Fix: the method is createOrReplaceTempView; the original spelled it
    # "createOrReplaceTemView" here and below, which raises AttributeError.
    result.createOrReplaceTempView(viewName)

    quantileDiscretizer = QuantileDiscretizer(numBuckets=10,
                                              inputCol='prob',
                                              outputCol='prob_cut')
    discreDF = quantileDiscretizer.fit(result).transform(result)

    cut_view_name = viewName + "_with_cut"
    discreDF.createOrReplaceTempView(cut_view_name)

    sql_str = r"SELECT count(label) as label_all, sum(label) as label_bad, min(prob) as min, max(prob) as max, prob_cut FROM " + cut_view_name + " group by prob_cut order by prob_cut"
    # Fix: query through self.spark like the first query; the original used
    # a bare `spark` name here.
    resultLocal = self.spark.sql(sql_str).collect()

    ks, ks_cut_local = self.compute_ks(resultLocal)
    print(r"ks:\t%s" % str(ks))
    for line in ks_cut_local:
        print(line)
    return float(ks)
def create_categorical_feature(self, dataframe, base_field, categorical_field, levels, increment=0):
    """Add an integer categorical field derived from quantiles of a field.

    ``base_field`` is first passed through ``self.fix_data_type`` with
    ``'double'``, then split into ``levels`` quantile buckets; the bucket
    index is cast to int and shifted by ``increment``.

    :param dataframe: the PySpark dataframe
    :param base_field: the field whose values drive the bucketing
    :param categorical_field: the name of the categorical field to create
    :param levels: the number of levels (buckets) requested
    :param increment: the value added to each level (Default value = 0)
    :returns: the dataframe with ``categorical_field`` plus all fields of
        the supplied dataframe

    """
    prepared = self.fix_data_type(dataframe, [base_field], 'double')
    estimator = QuantileDiscretizer(numBuckets=levels,
                                    inputCol=base_field,
                                    outputCol=categorical_field)
    bucketed = estimator.fit(prepared).transform(prepared)
    shifted_level = bucketed[categorical_field].cast('int') + increment
    return (bucketed.withColumn(categorical_field, shifted_level))
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession

# Stand-alone example: split the "hour" column of a five-row DataFrame into
# three quantile buckets and print the result.
if __name__ == "__main__":
    spark = (
        SparkSession.builder
        .appName("QuantileDiscretizerExample")
        .getOrCreate()
    )

    # $example on$
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = spark.createDataFrame(data, ["id", "hour"])
    # $example off$

    # Output of QuantileDiscretizer for such small datasets can depend on
    # the number of partitions. Here we force a single partition to ensure
    # consistent results. Note this is not necessary for normal use cases.
    df = df.repartition(1)

    # $example on$
    discretizer = QuantileDiscretizer(
        numBuckets=3,
        inputCol="hour",
        outputCol="result",
    )

    result = discretizer.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()
def process(spark, train_data, test_data):
    """Train, save and evaluate the CTR random-forest pipeline.

    Reads the train and test parquet files, builds a pipeline (three
    quantile discretizers, vector assembler, vector indexer, random forest
    regressor on ``ctr``), fits it through a 3-fold cross-validator over a
    single-point parameter grid, saves the result to ``MODEL_PATH`` and
    computes RMSE on both data sets.

    :param spark: Spark session used for reading.
    :param train_data: path of the training parquet data.
    :param test_data: path of the test parquet data.
    :return: ``(train_rmse, test_rmse, maxDepth, numTrees, maxBins,
        featureSubsetStrategy)``; the RMSE values are rounded to 5 digits
        and the rest are read from the best model's last stage.
    """
    ads_train = spark.read.parquet(train_data)
    ads_test = spark.read.parquet(test_data)

    # --- Build the pipeline ---
    # Part 1: feature generation.
    discretize_audience = QuantileDiscretizer(
        numBuckets=5, inputCol='target_audience_count', outputCol='aud_bins')
    discretize_days = QuantileDiscretizer(
        numBuckets=3, inputCol='day_count', outputCol='days_bins')
    discretize_cost = QuantileDiscretizer(
        numBuckets=4, inputCol='ad_cost', outputCol='cost_bins')
    featureGenerator = [discretize_audience, discretize_days, discretize_cost]

    # Part 2: feature vectorisation - every input column plus the generated
    # bins, except the label.
    # Fix: compare with `!=`. The original `col not in 'ctr'` was a substring
    # test against the string 'ctr' and would also have dropped columns
    # named e.g. 'c', 't', 'r', 'ct' or 'tr'.
    candidate_columns = ads_train.columns + ['aud_bins', 'days_bins',
                                             'cost_bins']
    featurize_list = [name for name in candidate_columns if name != 'ctr']
    featurizer = VectorAssembler(inputCols=featurize_list,
                                 outputCol='raw_features')

    # Part 3: feature indexing.
    featureIndexer = VectorIndexer(inputCol='raw_features',
                                   outputCol='features',
                                   maxCategories=5)

    # Part 4: the model.
    rf = RandomForestRegressor(labelCol='ctr', featuresCol='features')
    rfEval = RegressionEvaluator(predictionCol='prediction',
                                 labelCol='ctr',
                                 metricName='rmse')

    # The full search is not repeated; only the best configuration is kept.
    # Grid explored previously, per the original notes:
    #   maxDepth [6, 8, 10, 12], numTrees [50, 75, 150, 300],
    #   maxBins [32, 64, 128], featureSubsetStrategy ['auto', 'sqrt', 'log2']
    paramGrid = (
        ParamGridBuilder()
        .addGrid(rf.maxDepth, [10])
        .addGrid(rf.numTrees, [75])
        .addGrid(rf.maxBins, [64])
        .addGrid(rf.featureSubsetStrategy, ['auto'])
        .build()
    )

    # Full ML pipeline (the previously unused featureGenerator list now
    # supplies the first three stages; order is unchanged).
    pipeline = Pipeline(
        stages=featureGenerator + [featurizer, featureIndexer, rf])

    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=rfEval,
                        numFolds=3)
    rfModel = cv.fit(ads_train)
    rfModel.save(MODEL_PATH)

    trainPreds = rfModel.transform(ads_train)
    testPreds = rfModel.transform(ads_test)
    train_rmse = round(rfEval.evaluate(trainPreds), 5)
    test_rmse = round(rfEval.evaluate(testPreds), 5)

    # Parameters of the best model, read from its last stage (the forest).
    best_forest = rfModel.bestModel.stages[-1]._java_obj
    maxDepth = best_forest.getMaxDepth()
    numTrees = best_forest.getNumTrees()
    maxBins = best_forest.getMaxBins()
    featureSubsetStrategy = best_forest.getFeatureSubsetStrategy()

    return train_rmse, test_rmse, maxDepth, numTrees, maxBins, featureSubsetStrategy
# Hash `feature` into `output_col` and fold the hash into a value below 50:
# negative hashes are negated before taking the remainder.
# NOTE(review): `df_val`, `feature` and `output_col` are defined outside the
# visible part of the script - confirm the enclosing context.
df_val = df_val.withColumn(output_col, f.hash(f.col(feature)))
df_val = df_val.withColumn(
    output_col,
    f.when(f.col(output_col) < 0, (f.col(output_col) * -1) % 50)
    .otherwise(f.col(output_col) % 50))

# Set the numbers of quantiles/buckets for the baseline approach
nq = 50

from pyspark.ml.feature import QuantileDiscretizer, StringIndexer, FeatureHasher, HashingTF, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One quantile discretizer per numeric feature; invalid values are kept.
AllQuantileDiscretizers = [
    QuantileDiscretizer(
        numBuckets=nq,
        inputCol=numeric_name,
        outputCol=(numeric_name + "_bucketized"),
        handleInvalid="keep",
    )
    for numeric_name in numeric_features
]

# One string indexer per categorical feature.
AllStringIndexers = [
    StringIndexer(inputCol=categorical_name,
                  outputCol=(categorical_name + "_indexed"))
    for categorical_name in categorical_features
]

### FeatureHasher has been adapted to a hardcoded feature hashing + bucketing in the preprocessing step
#AllFeatureHashers = [FeatureHasher(numFeatures=nq,
#                                   inputCols=[col],
#                                   outputCol=(col + "_hashed")) for col in id_features]
#AllHashingTF = [HashingTF(inputCol=col,
#                          outputCol=(col + "_vectorized")) for col in text_features]
# Continuous demo data: the ids 0..19 produced by spark.range, cast to double.
contDF = spark.range(20).selectExpr("cast(id as double)")

# COMMAND ----------

# Bucketing with explicit borders.
from pyspark.ml.feature import Bucketizer
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer(splits=bucketBorders, inputCol="id")
bucketer.transform(contDF).show()

# COMMAND ----------

# Bucketing with borders learned from quantiles of the data.
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=5,
                                  inputCol="id",
                                  outputCol="quantiles")
# `bucketer` is rebound to an unfitted discretizer; it is not used again in
# the visible cells.
bucketer = QuantileDiscretizer(numBuckets=5, inputCol="id")
result = discretizer.fit(contDF).transform(contDF)
result.show()

# COMMAND ----------

# Standard scaling of the "features" column of scaleDF (defined outside the
# visible cells).
from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler(inputCol="features",
                         outputCol="features_st_scaled")
sScaler.fit(scaleDF).transform(scaleDF).show()

# COMMAND ----------