def data_transformation(self, df):
    df = df.withColumn(
        "date",
        from_unixtime(unix_timestamp('DateStringYYYYMMDD', 'yyyyMMdd')))
    df = df.withColumn("Year", year("date"))       # extract year
    df = df.withColumn("Day", dayofmonth("date"))  # extract day
    df = df.withColumn("Month", month("date"))     # extract month

    # bin continuous variable based on quantiles (temperature)
    discretizer = QuantileDiscretizer(numBuckets=6, inputCol="AvgHourlyTemp",
                                      outputCol="temp_quantile")
    df = discretizer.fit(df).transform(df)

    # based on variation seen in boxplots
    df = df.withColumn(
        'store_variability',
        F.when(F.col("Store_ID").isin('16', '17', '18', '20', '22', '31'), "variable")
         .when(F.col("Store_ID").isin('11', '2', '21', '23', '32', '34', '36', '38'), "not-variable"))

    # based on mean revenue per store id
    df = df.withColumn(
        'store_rank',
        F.when(F.col("Store_ID").isin('31', '17', '20', '38', '2', '36', '21'), "great")
         .when(F.col("Store_ID").isin('18', '32', '34', '23'), "good")
         .when(F.col("Store_ID").isin('11', '22', '16'), "ok"))
    return df
def discrete(self):
    # Bucketizer
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
    dataFrame = self.session.createDataFrame(data, ["features"])

    bucketizer = Bucketizer(splits=splits, inputCol="features",
                            outputCol="bucketedFeatures")

    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
    bucketedData.show()

    # QuantileDiscretizer
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = self.session.createDataFrame(data, ["id", "hour"])  # presumably self.session, as above

    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()
def quantile_discretizer(self, df, column, num_buckets):
    """Bucket columns by quantiles with QuantileDiscretizer, using the given number of buckets."""
    print('QuantileDiscretizerExample')
    df = df.repartition(1)
    # quantile-based bucketing / discretization
    if isinstance(column, list):
        output_column = [str(v) + '_quant' for v in column]
        print(len(column), len(output_column), len(num_buckets))
        discretizer = QuantileDiscretizer(relativeError=0.01,
                                          handleInvalid="error",
                                          numBucketsArray=num_buckets,
                                          inputCols=column,
                                          outputCols=output_column)
    else:
        # numBuckets sets the number of buckets
        discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                          relativeError=0.01,
                                          handleInvalid="error",
                                          inputCol=column,
                                          outputCol=column + '_quant')
    result = discretizer.setHandleInvalid("keep").fit(df).transform(df)
    return result
def Run(self):
    def Lift(cum_resp, decil_no):
        return cum_resp / float(decil_no + 1)

    self._df = self._df.orderBy(self._proba_column, ascending=False)
    self._df = self._df.withColumn("id", monotonically_increasing_id())
    # w = Window.orderBy(desc(self._proba_column))
    # self._df = self._df.withColumn('id', row_number().over(w))
    discretizer = QuantileDiscretizer(numBuckets=10, inputCol="id", outputCol="deciles")
    self._df = discretizer.fit(self._df).transform(self._df)

    Rank = self._df.groupby('deciles').agg(
        F.count(self._df[self._proba_column]).alias('cnt'),
        F.count(when(self._df[self._target_column] == int(self._posLabel), True)).alias('cnt_resp'))
    Rank = Rank.withColumn('cnt_non_resp', Rank['cnt'] - Rank['cnt_resp'])
    Rank = Rank.orderBy('deciles', ascending=True)

    cumsum_window = (Window.orderBy(Rank['deciles'])
                     .rangeBetween(Window.unboundedPreceding, Window.currentRow))
    Rank = Rank.withColumn("cum_resp", F.sum('cnt_resp').over(cumsum_window))
    Rank = Rank.withColumn("cum_non_resp", F.sum('cnt_non_resp').over(cumsum_window))
    Rank = Rank.withColumn("% Responders(Cumulative)",
                           F.round(old_div(Rank["cum_resp"] * 100,
                                           Rank.select(F.sum('cnt_resp')).collect()[0][0]), 2))
    Rank = Rank.withColumn("% Non-Responders(Cumulative)",
                           F.round(old_div(Rank["cum_non_resp"] * 100,
                                           Rank.select(F.sum('cnt_non_resp')).collect()[0][0]), 2))
    Rank = Rank.withColumn("cum_population", F.sum("cnt").over(cumsum_window))
    Rank = Rank.withColumn("pop_pct_per_decile",
                           F.round(old_div(Rank["cnt"] * 100,
                                           Rank.select(F.sum('cnt')).collect()[0][0])))
    Rank = Rank.withColumn("% Population(Cumulative)",
                           F.round(F.sum('pop_pct_per_decile').over(cumsum_window)))
    Rank = Rank.withColumn("KS",
                           F.round(Rank["% Responders(Cumulative)"] -
                                   Rank["% Non-Responders(Cumulative)"], 2))
    Rank = Rank.withColumn("Lift at Decile",
                           F.round(old_div(Rank["cnt_resp"] * Rank["pop_pct_per_decile"] * 100,
                                           Rank.select(F.sum('cnt_resp')).collect()[0][0]), 2))
    Rank = Rank.withColumn("id", monotonically_increasing_id())
    Lift_udf = udf(lambda x, y: Lift(x, y), FloatType())
    Rank = Rank.withColumn("Total_Lift", F.round(Lift_udf("cum_resp", "id"), 2))
    Rank = Rank.drop('id')
    return Rank
def test_quantilediscretizer_converter(self):
    iris = load_iris()
    features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                         columns=features + ["target"])
    df = sql.createDataFrame(pd_df).select("sepal_length")

    quantile = QuantileDiscretizer(inputCol="sepal_length",
                                   outputCol="sepal_length_bucket",
                                   numBuckets=2)
    model = quantile.fit(df)

    test_df = df
    torch_model = convert(model, "torch", test_df)
    self.assertTrue(torch_model is not None)

    spark_output = model.transform(test_df).select("sepal_length_bucket").toPandas()
    torch_output_np = torch_model.transform(pd_df[["sepal_length"]])
    np.testing.assert_allclose(spark_output.to_numpy(), torch_output_np,
                               rtol=1e-06, atol=1e-06)
def load(dev):
    """Loads the submissions from the MongoDB database."""
    logging.info('Loading submissions...')
    df = dev.read.format('com.mongodb.spark.sql.DefaultSource').load()
    df.createOrReplaceTempView('submissions')
    df.printSchema()

    query = 'select score, upvote_ratio, is_nfsw, text_embedded from submissions'
    df = dev.sql(query)
    df = df.rdd.map(
        lambda r: Row(score=r['score'],
                      upvote_ratio=r['upvote_ratio'],
                      is_nfsw=float(r['is_nfsw']),
                      text_embedded=Vectors.dense(r['text_embedded']))).toDF()

    qd = QuantileDiscretizer(numBuckets=6, inputCol='upvote_ratio',
                             outputCol='upvote_class')
    df = qd.fit(df).transform(df)
    df.printSchema()
    df.show()
    return df
def Binning(df, num_col, no_of_buckets):
    # remember the original dtype so it can be restored after casting to double
    for (a, b) in df.dtypes:
        if a == num_col:
            o_dtype = b

    tdf = df.withColumn(num_col, col(num_col).cast('double'))

    qds = QuantileDiscretizer(numBuckets=no_of_buckets, inputCol=num_col,
                              outputCol="bucket_no")
    bucketizer = qds.fit(tdf)
    splits = bucketizer.getSplits()
    tdf = bucketizer.transform(tdf)

    bucket_dict = dict()
    for i in range(no_of_buckets):
        bucket_dict[float(i)] = str(splits[i]) + ' to ' + str(splits[i + 1])
    # tdf = tdf.withColumn('bucket_no', col(num_col).cast('string'))
    mapping_expr = create_map([lit(x) for x in chain(*bucket_dict.items())])
    tdf = tdf.withColumn(num_col + '_bucket_range',
                         mapping_expr.getItem(col('bucket_no')))
    tdf = tdf.withColumn(num_col, col(num_col).cast(o_dtype))
    return tdf, bucket_dict
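# A hypothetical usage sketch (not from the original snippet) for Binning(): it returns the
# bucketed DataFrame plus a dict mapping each bucket index to its value range. The
# `sales_df` DataFrame and its "revenue" column are made-up stand-ins.
binned_df, ranges = Binning(sales_df, "revenue", no_of_buckets=5)
binned_df.select("revenue", "bucket_no", "revenue_bucket_range").show(5)
print(ranges)  # e.g. {0.0: '-Infinity to 10.0', 1.0: '10.0 to 25.0', ...}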
def aggregate_spark(data, features, args):
    from pyspark.ml.feature import QuantileDiscretizer

    discretizer = QuantileDiscretizer(
        numBuckets=args["num_buckets"], inputCol=features["col"], outputCol="_"
    ).fit(data)

    return discretizer.getSplits()
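# A minimal sketch (not part of the original snippet) of how the splits returned by
# aggregate_spark() might later be applied with a Bucketizer. The `data` DataFrame and
# the "price" column are hypothetical stand-ins.
from pyspark.ml.feature import Bucketizer

splits = aggregate_spark(data, {"col": "price"}, {"num_buckets": 4})
bucketizer = Bucketizer(splits=splits, inputCol="price", outputCol="price_bucket",
                        handleInvalid="keep")
bucketed = bucketizer.transform(data)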
def quantile_discretizer(dataFrame, inputCol, numBuckets=4):
    # quantile-based bucketing / discretization; numBuckets sets the number of buckets
    discretizer = QuantileDiscretizer(numBuckets=numBuckets,
                                      inputCol=inputCol,
                                      outputCol='%s_bucketizer' % inputCol)
    bucketedData = discretizer.fit(dataFrame).transform(dataFrame)
    return bucketedData
def do_quantile_discretizer(input_data, result_data, column, prefix="buckets_", num_buckets=100):
    discretizer = QuantileDiscretizer(inputCol=column,
                                      outputCol=prefix + column,
                                      numBuckets=num_buckets)
    fittedBucketer = discretizer.fit(input_data)
    return fittedBucketer.transform(result_data)
def qcut(input_col, output_col, num_buckets):
    """
    Bin a column into n buckets. Quantile Discretizer
    :param input_col: Input column to be processed
    :param output_col: Output column with the bin number
    :param num_buckets: Number of buckets in which the column will be divided
    :return:
    """
    # `self` here is presumably the DataFrame captured from the enclosing scope
    # (this helper is attached to a DataFrame), so the snippet is not standalone.
    discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                      inputCol=input_col,
                                      outputCol=output_col)
    return discretizer.fit(self).transform(self)
def ReturnQuartile(df, cols):
    # This function groups a column into quartiles. The input is cols, which is a list of columns.
    for c in cols:
        # Create the quartiles
        discretizer = QuantileDiscretizer(numBuckets=4, inputCol=c,
                                          outputCol=c + "bin",
                                          relativeError=0.01,
                                          handleInvalid="error")
        # Apply the quartiles
        df = discretizer.fit(df).transform(df)
    return df
def qcut(input_col, output_col, num_buckets):
    """
    Bin a column into n buckets
    :param input_col:
    :param output_col:
    :param num_buckets:
    :return:
    """
    discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                      inputCol=input_col,
                                      outputCol=output_col)
    return discretizer.fit(self).transform(self)
def quantiles(df, input_col):
    try:
        qds = QuantileDiscretizer(
            # 254 is used so the 255th can be inf
            numBuckets=254,
            inputCol=input_col,
            outputCol='bucketed',
            relativeError=1. / 2550,
            handleInvalid='error')
        return qds.fit(df).getSplits()
    except Exception as e:
        print(e)
        raise
def create_categorical_feature(self, dataframe, base_field, categorical_field, levels, increment=0):
    """Produces a PySpark dataframe containing a categorical field based on a specified field.

    :param dataframe: the PySpark dataframe
    :param base_field: the field that provides the values used to create the categorical field
    :param categorical_field: the name of the categorical field to be created
    :param levels: the number of levels to be created in the categorical field
    :param increment: the value to add to each level (Default value = 0)
    :returns: the PySpark dataframe containing a categorical field and all fields in the supplied dataframe
    """
    dataframe = self.fix_data_type(dataframe, [base_field], 'double')
    discretizer = QuantileDiscretizer(numBuckets=levels,
                                      inputCol=base_field,
                                      outputCol=categorical_field)
    dataframe = discretizer.fit(dataframe).transform(dataframe)
    return dataframe.withColumn(categorical_field,
                                dataframe[categorical_field].cast('int') + increment)
def quantile_bucketize(col_list: list, num_buckets: int = 3):
    from pyspark.ml.feature import QuantileDiscretizer

    df = self
    for c in col_list:
        if c in df.schema.names:
            # fit the quantile splits on the non-zero values only
            non_zero_values = df.select(c).where(col(c) != 0)
            bucketizer = QuantileDiscretizer(
                numBuckets=num_buckets,
                inputCol=c,
                outputCol=c + "_bucket"
            ).fit(non_zero_values).setHandleInvalid('keep')
            df = bucketizer.transform(df).drop(c)
    return df
def qcut(columns, num_buckets, handle_invalid="skip"):
    """
    Bin columns into n buckets. Quantile Discretizer
    :param columns: Input columns to be processed
    :param num_buckets: Number of buckets in which the column will be divided
    :param handle_invalid:
    :return:
    """
    df = self
    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    for col_name in columns:
        output_col = col_name + "_qcut"
        discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                          inputCol=col_name,
                                          outputCol=output_col,
                                          handleInvalid=handle_invalid)
        df = discretizer.fit(df).transform(df)
    return df
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()

    # calculate average movie rating score and rating count:
    # group by movieId with count(1) as ratingCount (number of ratings),
    # avg(rating) as avgRating, and variance(rating) as ratingVar
    movieFeatures = ratingSamples.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('ratingCount'),
        F.avg("rating").alias("avgRating"),
        F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    # store the average rating as a single-element vector, as required by the scaler below
    movieFeatures.show(10)

    ######## feature processing pipeline ########
    # bucketing: split the continuous ratingCount into 100 equal-frequency buckets
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization: min-max scale the average-rating vector
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec", outputCol="scaleAvgRating")
    # build the pipeline
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(movieFeatures)

    # cast the bucket index to an integer and extract the scaled rating from its vector
    movieProcessedFeatures = movieProcessedFeatures \
        .withColumn('ratingCountBucket', F.col('ratingCountBucket').cast(IntegerType())) \
        .withColumn('scaleAvgRating',
                    udf(lambda v: float(v[0]), FloatType())(F.col('scaleAvgRating'))) \
        .drop(F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)
def test_pipeline3(self):
    iris = load_iris()
    features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                         columns=features + ["label"])
    df = sql.createDataFrame(pd_df)

    quantile1 = QuantileDiscretizer(inputCol="sepal_length",
                                    outputCol="sepal_length_bucket",
                                    numBuckets=2)
    quantile2 = QuantileDiscretizer(inputCol="sepal_width",
                                    outputCol="sepal_width_bucket",
                                    numBuckets=2)
    features = ["sepal_length_bucket", "sepal_width_bucket"] + features
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    pipeline = Pipeline(stages=[quantile1, quantile2, assembler, LogisticRegression()])
    model = pipeline.fit(df)

    df = df.select(["sepal_length", "sepal_width", "petal_length", "petal_width"])
    pd_df = pd_df[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
    torch_model = convert(model, "torch", df)
    self.assertTrue(torch_model is not None)

    np.testing.assert_allclose(
        np.array(model.transform(df).select("prediction").collect()).reshape(-1),
        torch_model.predict(pd_df),
        rtol=1e-06,
        atol=1e-06,
    )
    np.testing.assert_allclose(
        np.array(model.transform(df).select("probability").collect()).reshape(-1, 3),
        torch_model.predict_proba(pd_df),
        rtol=1e-06,
        atol=1e-05,
    )
def _sparse_feature_with_quantile(train_data: DataFrame, test_data: DataFrame, sc: SparkContext):
    """
    Discretize continuous features.
    :param train_data:
    :param test_data:
    :param sc: sparkContext
    :return: DataFrames with the discretized features, and the bucket splits
    """
    feature_bucket_splits_dict = dict()

    # fit the splitter on the training data
    for col in config.CONTINUOUS_COLUMNS:
        sparser = QuantileDiscretizer(
            numBuckets=config.CONTINUOUS_COLUMNS_BUCKET_NUM,
            inputCol=col,
            outputCol=config.BUCKET_FEATURE_PREFIX + col,
            relativeError=0.01,
            handleInvalid="error")
        sparser_model = sparser.fit(train_data)
        feature_bucket_splits_dict[col] = sparser_model.getSplits()

    # save the splits so that online serving uses the same bucket boundaries
    utils.write_to_hdfs(sc, config.CONTINUOUS_COLUMNS_BUCKETS,
                        json.dumps(feature_bucket_splits_dict),
                        overwrite=True)
    print("[INFO] save continuous columns buckets {0}".format(
        config.CONTINUOUS_COLUMNS_BUCKETS))

    # transform both train and test data with the fitted splits
    for k, v in feature_bucket_splits_dict.items():
        input_col, output_col, splits = k, config.BUCKET_FEATURE_PREFIX + k, v
        bucket_model = feature.Bucketizer(inputCol=input_col,
                                          outputCol=output_col,
                                          splits=splits)
        train_data = bucket_model.setHandleInvalid("skip").transform(train_data)
        test_data = bucket_model.setHandleInvalid("skip").transform(test_data)
        train_data.select(output_col).show(10, False)
        print("[INFO] continuous sparse transform {0} to {1}".format(input_col, output_col))

    return train_data, test_data, feature_bucket_splits_dict
def _load_dataframe(sc):
    """Loads the submissions from the MongoDB database."""
    df = sc.read.format(SPARK_SQL_DEFAULT).load()
    df.createOrReplaceTempView('submissions')

    query = 'select score, upvote_ratio, is_nfsw, text_embedded from submissions'
    df = sc.sql(query)
    df = df.rdd.map(
        lambda r: Row(score=r['score'],
                      upvote_ratio=r['upvote_ratio'],
                      is_nfsw=float(r['is_nfsw']),
                      text_embedded=Vectors.dense(r['text_embedded']))).toDF()

    qd = QuantileDiscretizer(numBuckets=6, inputCol='upvote_ratio',
                             outputCol='upvote_class')
    df = qd.fit(df).transform(df)
    return df
def evaluateKs(self, predictions: 'sparkdf', tableName: 'string', prob: 'string' = "core") -> 'double':
    predictions.createOrReplaceTempView(tableName)
    result = self.spark.sql("SELECT score as prob, label FROM %s" % tableName)
    viewName = tableName + "_result"
    result.createOrReplaceTempView(viewName)

    quantileDiscretizer = QuantileDiscretizer(numBuckets=10,
                                              inputCol='prob',
                                              outputCol='prob_cut')
    discreDF = quantileDiscretizer.fit(result).transform(result)
    cut_view_name = viewName + "_with_cut"
    discreDF.createOrReplaceTempView(cut_view_name)

    sql_str = ("SELECT count(label) as label_all, sum(label) as label_bad, "
               "min(prob) as min, max(prob) as max, prob_cut FROM " + cut_view_name +
               " group by prob_cut order by prob_cut")
    resultLocal = self.spark.sql(sql_str).collect()
    ks, ks_cut_local = self.compute_ks(resultLocal)
    print("ks:\t%s" % str(ks))
    for line in ks_cut_local:
        print(line)
    return float(ks)
def create_categorical_feature(self, dataframe, base_field, categorical_field, levels, increment=0):
    """Produces a PySpark dataframe containing a categorical field based on a specified field.

    :param dataframe: the PySpark dataframe
    :param base_field: the field that provides the values used to create the categorical field
    :param categorical_field: the name of the categorical field to be created
    :param levels: the number of levels to be created in the categorical field
    :param increment: the value to add to each level (Default value = 0)
    :returns: the PySpark dataframe containing a categorical field and all fields in the supplied dataframe
    """
    dataframe = self.fix_data_type(dataframe, [base_field], 'double')
    discretizer = QuantileDiscretizer(numBuckets=levels,
                                      inputCol=base_field,
                                      outputCol=categorical_field)
    dataframe = discretizer.fit(dataframe).transform(dataframe)
    return (dataframe.withColumn(
        categorical_field,
        dataframe[categorical_field].cast('int') + increment))
def numerical_example(data: DataFrame):
    movie_features = data.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("ratingCount"),
        F.avg("rating").alias("avgRating"),
        F.variance("rating").alias("ratingVar")).withColumn(
            "avgRatingVec", udf_avg_rating_to_vec(F.col("avgRating")))
    print_info(movie_features)

    # bucketing
    rating_count_discretizer = QuantileDiscretizer(numBuckets=100,
                                                   inputCol="ratingCount",
                                                   outputCol="ratingCountBucket")
    # normalization
    rating_scaler = MinMaxScaler(inputCol="avgRatingVec", outputCol="scaleAvgRating")

    pipeline_stage = [rating_count_discretizer, rating_scaler]
    feature_pipeline = Pipeline(stages=pipeline_stage)
    movie_processed_features = feature_pipeline.fit(movie_features).transform(movie_features)
    print_info(movie_processed_features)
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()

    # calculate average movie rating score and rating count
    movieFeatures = ratingSamples.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('ratingCount'),
        F.avg("rating").alias("avgRating"),
        F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    movieFeatures.show(10)

    # bucketing
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec", outputCol="scaleAvgRating")

    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(movieFeatures)
    movieProcessedFeatures.show(10)
def create_quantilesDiscretizer(input_col: str, nq: int) -> QuantileDiscretizer:
    """
    Create a QuantileDiscretizer for a specified column.
    Uses the input column name + "_encoded" as the output column.

    Parameters
    ----------
    input_col: str
        Name of the input column
    nq: int
        Number of quantiles to use

    Return
    ------
    QuantileDiscretizer
    """
    output_col = input_col + "_encoded"
    return QuantileDiscretizer(numBuckets=nq,
                               relativeError=0.05,
                               handleInvalid='keep',
                               inputCol=input_col,
                               outputCol=output_col)
def pipeline_preprocess(inp_df):
    # Categorical variables need both string indexing and one-hot encoding,
    # so both operations are added together.
    stages = []  # this list holds the stages of the data transformation
    for col in cat_cols:
        stringIndexer = StringIndexer(inputCol=col, outputCol=col + "Index")
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                         outputCols=[col + "catVec"])
        stages += [stringIndexer, encoder]

    acctlen_bin = QuantileDiscretizer(numBuckets=4,
                                      inputCol="accountlength",
                                      outputCol="acctlen_bin")
    stages += [acctlen_bin]

    # Create the label index for the output column
    label_Idx = StringIndexer(inputCol="churn", outputCol="label")
    stages += [label_Idx]

    # Create the vector assembler stage to assemble the data into labels and features
    numeric_cols = num_cols.copy()
    numeric_cols.remove("accountlength")
    numeric_cols.append("acctlen_bin")
    inp_features = [c + "catVec" for c in cat_cols] + numeric_cols
    assembler = VectorAssembler(inputCols=inp_features, outputCol="features")
    stages += [assembler]

    data_preprocess = Pipeline(stages=stages).fit(inp_df)
    return data_preprocess
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("QuantileDiscretizerExample")\
        .getOrCreate()

    # $example on$
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    df = spark.createDataFrame(data, ["id", "hour"])
    # $example off$

    # Output of QuantileDiscretizer for such small datasets can depend on the number of
    # partitions. Here we force a single partition to ensure consistent results.
    # Note this is not necessary for normal use cases
    df = df.repartition(1)

    # $example on$
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

    result = discretizer.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()
        'weekday_is_friday', 'weekday_is_saturday', 'weekday_is_sunday',
        'is_weekend', 'global_subjectivity', 'global_sentiment_polarity',
        'title_subjectivity', 'title_sentiment_polarity',
        'abs_title_subjectivity', 'abs_title_sentiment_polarity'],
    outputCol='features')

new_data = assembler.transform(data)
final_data = new_data.select('features', 'shares')

from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result', 'features')

from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250, labelCol='result', featuresCol='features')

train_data, test_data = finalData.randomSplit([0.7, 0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))

test_data.head(1)
# import os, sys
# import pandas
# QuantileDiscretizer takes a column with continuous features and outputs a
# column with binned categorical features. The number of bins is set by the
# numBuckets parameter. It is possible that the number of buckets used will be
# smaller than this value, for example, if there are too few distinct values of
# the input to create enough distinct quantiles.
#
# NaN values: NaN values will be removed from the column during
# QuantileDiscretizer fitting. This will produce a Bucketizer model for making
# predictions. During the transformation, Bucketizer will raise an error when
# it finds NaN values in the dataset, but the user can also choose to either
# keep or remove NaN values within the dataset by setting handleInvalid. If the
# user chooses to keep NaN values, they will be handled specially and placed
# into their own bucket, for example, if 4 buckets are used, then non-NaN data
# will be put into buckets[0-3], but NaNs will be counted in a special
# bucket[4].
from pyspark.sql import SparkSession
from pyspark.ml.feature import QuantileDiscretizer

spark = SparkSession.builder.appName("QuantileDiscretizer").getOrCreate()

data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
df = spark.createDataFrame(data, ["id", "hour"])

discretizer = QuantileDiscretizer(inputCol="hour", outputCol="result", numBuckets=3)
discretizerModel = discretizer.fit(df)
result = discretizerModel.transform(df)
result.show()

spark.stop()
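# A minimal, self-contained sketch (not part of the example above) of the NaN handling
# described in the comments: with handleInvalid="keep", rows whose value is NaN land in
# an extra bucket with index numBuckets (here bucket 3.0). The data is made up for
# illustration.
from pyspark.sql import SparkSession
from pyspark.ml.feature import QuantileDiscretizer

spark = SparkSession.builder.appName("QuantileDiscretizerNaNExample").getOrCreate()
nan_data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, float("nan")), (4, 2.2)]
nan_df = spark.createDataFrame(nan_data, ["id", "hour"])

discretizer = QuantileDiscretizer(inputCol="hour", outputCol="result",
                                  numBuckets=3, handleInvalid="keep")
discretizer.fit(nan_df).transform(nan_df).show()
spark.stop()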
# $example on$
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("QuantileDiscretizerExample")\
        .getOrCreate()

    # $example on$
    data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)]
    df = spark.createDataFrame(data, ["id", "hour"])
    # $example off$

    # Output of QuantileDiscretizer for such small datasets can depend on the number of
    # partitions. Here we force a single partition to ensure consistent results.
    # Note this is not necessary for normal use cases
    df = df.repartition(1)

    # $example on$
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

    result = discretizer.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import QuantileDiscretizer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate()

    # $example on$
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    dataFrame = spark.createDataFrame(data, ["id", "hour"])

    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

    result = discretizer.fit(dataFrame).transform(dataFrame)
    result.show()
    # $example off$

    spark.stop()
df_val = df_val.withColumn(output_col, f.hash(f.col(feature)))
df_val = df_val.withColumn(
    output_col,
    f.when(f.col(output_col) < 0, f.col(output_col) * -1 % 50).otherwise(f.col(output_col) % 50))

# Set the number of quantiles/buckets for the baseline approach
nq = 50

from pyspark.ml.feature import QuantileDiscretizer, StringIndexer, FeatureHasher, HashingTF, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

AllQuantileDiscretizers = [
    QuantileDiscretizer(numBuckets=nq,
                        inputCol=col,
                        outputCol=(col + "_bucketized"),
                        handleInvalid="keep")
    for col in numeric_features
]
AllStringIndexers = [
    StringIndexer(inputCol=col, outputCol=(col + "_indexed"))
    for col in categorical_features
]

### FeatureHasher has been replaced by a hardcoded feature hashing + bucketing step
### in the preprocessing above.
# AllFeatureHashers = [FeatureHasher(numFeatures=nq,
#                                    inputCols=[col],
#                                    outputCol=(col + "_hashed")) for col in id_features]
# AllHashingTF = [HashingTF(inputCol=col,
#                           outputCol=(col + "_vectorized")) for col in text_features]
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 4 15:01:21 2017

@author: 10639497
"""
from __future__ import print_function

from pyspark.sql import SparkSession
from pyspark.ml.feature import QuantileDiscretizer

session = SparkSession.builder.appName("Quantile").getOrCreate()

data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
dataset = session.createDataFrame(data, ['id', 'hour'])
dataset.show()

discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")
result = discretizer.fit(dataset).transform(dataset).select('result')
result.show()