def test_list_list_float(self):
    """Nested integer splits are coerced to floats; non-numeric entries raise."""
    b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
    expected = [[-0.1, 0.5, 3.0], [-5.0, 1.5]]
    self.assertEqual(b.getSplitsArray(), expected)
    # Every element of splitsArray must come back as a list of exact floats.
    for split_list in b.getSplitsArray():
        self.assertTrue(type(split_list) == list)
        for value in split_list:
            self.assertTrue(type(value) == float)
    # Non-numeric split values must be rejected at construction time.
    self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
    self.assertRaises(TypeError,
                      lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))
def bucketize(self, df, c):
    """Add quartile (B4_) and decile (B10_) bucket columns for column ``c``.

    Returns a frame with snapshotDate, ID and the two bucket columns.
    """
    quartile_splits = [-float("inf"), 0, 0.25, 0.5, 0.75, 1.0, float("inf")]
    decile_splits = [-float("inf"), 0, 0.1, 0.2, 0.3, 0.4, 0.5,
                     0.6, 0.7, 0.8, 0.9, 1.0, float("inf")]
    quartile_bucketizer = Bucketizer(splits=quartile_splits,
                                     inputCol=c, outputCol="B4_" + c)
    decile_bucketizer = Bucketizer(splits=decile_splits,
                                   inputCol=c, outputCol="B10_" + c)
    bucketed = quartile_bucketizer.transform(df.select('snapshotDate', 'ID', c))
    bucketed = decile_bucketizer.transform(bucketed)
    return bucketed.select('snapshotDate', 'ID', 'B4_' + c, 'B10_' + c)
def calc(df, col_x: str, col_y: str, bins=50, bin_width=None):
    """Calculate the buckets and weights for a 2-D histogram.

    Parameters
    ----------
    df : Spark DataFrame
    col_x, col_y : str
        Names of the two numerical columns to histogram.
    bins : int
        Number of buckets per axis.
    bin_width : optional
        Fixed bucket width, forwarded to ``utils.spark_buckets``.

    Returns
    -------
    (buckets_x, buckets_y, weights) : tuple
        Bucket edges per axis and a masked (bins, bins) count matrix
        (zero cells are masked so they render transparent).

    Raises
    ------
    ValueError
        If either selected column is not a supported numerical type.
    """
    data = df[[col_x, col_y]]
    # Validate BOTH columns (the original only checked the first one).
    numeric_types = (IntegerType, LongType, FloatType, DoubleType, DecimalType)
    for field in data.schema.fields:
        if not isinstance(field.dataType, numeric_types):
            raise ValueError(
                "hist2d method requires numerical or datetime columns, nothing to plot."
            )
    # Calculate bucket edges per axis
    buckets_x = utils.spark_buckets(data, col_x, bins=bins, bin_width=bin_width)
    buckets_y = utils.spark_buckets(data, col_y, bins=bins, bin_width=bin_width)
    # Assign each row an (x, y) bucket pair
    bucketizer = Bucketizer(splits=buckets_x, inputCol=col_x, outputCol="bucket_x")
    buckets_df = bucketizer.transform(data)
    bucketizer = Bucketizer(splits=buckets_y, inputCol=col_y, outputCol="bucket_y")
    buckets_df = bucketizer.transform(buckets_df)
    histogram = buckets_df.groupby("bucket_x", "bucket_y").agg(
        F.count(col_x).alias("count"))
    # Create weights matrix (locally)
    hist_pd = histogram.toPandas()
    weights = np.zeros((bins, bins))
    for _, row in hist_pd.iterrows():
        weights[int(row["bucket_x"]), int(row["bucket_y"])] = row["count"]
    # Mask values that are zero so they look transparent
    weights = np.ma.masked_where(weights == 0, weights)
    # (Removed two dead `len(...)` expression statements that had no effect.)
    return buckets_x, buckets_y, weights
def main_emm_recode_demos(emm_raw_sdf):
    """Build the demographic-recoding pipeline for EMM panel data.

    Stages bucketize age into three standard break sets (age1/age7/age8) and
    recode flag/code columns via the project's IfElse/YesNo/IsIn transformers.

    NOTE(review): the pipeline is constructed but never fitted or applied, the
    `emm_raw_sdf` argument is unused, and the function returns None — this
    looks unfinished; confirm the intended behavior before relying on it.
    """
    recode_demo_pipeline = Pipeline(stages=[
        # Three standard age groupings over the same input column.
        Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age', outputCol="age1"),
        Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age', outputCol="age7"),
        Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                   inputCol='age', outputCol="age8"),
        # Income recodes kept for reference but currently disabled:
        # Bucketizer(splits=[-25, 0, 25., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income1"),
        # Bucketizer(splits=[-25, 0, 25., 35., 50., 75., 100., float('Inf')], inputCol='income_amt', outputCol="income9"),
        IfElseTransformer(vals=[83], inputCol='hispanicid', outputCol='hispanic'),
        IfElseTransformer(vals=['M'], inputCol='gender_char', outputCol='gender'),
        IfElseTransformer(vals=[86], inputCol='raceid', outputCol='race_back'),
        IfElseTransformer(vals=[88], inputCol='raceid', outputCol='race_asian'),
        # Yes/no flag columns recoded to indicator outputs.
        YesNoTransformer(inputCol='dvr_flag', outputCol='dvr'),
        YesNoTransformer(inputCol='cable_plus_flag', outputCol='cableplus'),
        YesNoTransformer(inputCol='video_game_owner_flag', outputCol='video_game'),
        YesNoTransformer(inputCol='internet_access_flag', outputCol='internet'),
        YesNoTransformer(inputCol='pay_cable_flag', outputCol='paycable'),
        YesNoTransformer(
            inputCol='television_high_definition_display_capability_flag',
            outputCol='hdtv'),
        YesNoTransformer(inputCol='alternative_delivery_flag',
                         outputCol='satellite'),
        # Code columns mapped onto membership-defined bins.
        IsInTransformer(isin_bins=[[0, 1], [2], [3, 4, 5, 6, 7], [8]],
                        inputCol='nielsen_occupation_code',
                        outputCol='occupation1'),
        IsInTransformer(isin_bins=[[0, 8, 9, 10, 11, 12], [13, 14, 15], [16],
                                   [18, 19, 20]],
                        inputCol='education_level_number',
                        outputCol='education7'),
        IsInTransformer(isin_bins=[[16, 18, 19, 20],
                                   [0, 8, 9, 10, 11, 12, 13, 14, 15]],
                        inputCol='education_level_number',
                        outputCol='education2'),
        IsInTransformer(isin_bins=[['A'], ['B'], ['C'], ['D']],
                        inputCol='county_size_code', outputCol='county_size')
    ])
    return None
def discrete(self):
    """Demonstrate Bucketizer and QuantileDiscretizer on small toy datasets.

    Prints the bucket count and shows both transformed frames; returns None.
    """
    # Bucketizer
    from pyspark.ml.feature import Bucketizer
    splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    data = [(-999.9, ), (-0.5, ), (-0.3, ), (0.0, ), (0.2, ), (999.9, )]
    dataFrame = self.session.createDataFrame(data, ["features"])
    bucketizer = Bucketizer(splits=splits, inputCol="features",
                            outputCol="bucketedFeatures")
    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
    # QuantileDiscretizer
    data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
    # Bug fix: was `self.createDataFrame(...)` — DataFrames are created on the
    # Spark session (as done above), not on this object.
    df = self.session.createDataFrame(data, ["id", "hour"])
    discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour",
                                      outputCol="result")
    result = discretizer.fit(df).transform(df)
    result.show()
def create_buckets(percentage_of_missing_ctus_per_partyid):
    """Divide party ids into 5 buckets by percentage of missing ctus.

    Bucket edges: [0, 0.25), [0.25, 0.5), [0.5, 0.75), [0.75, 0.99), [0.99, inf);
    invalid values are kept in an extra bucket (handleInvalid="keep").

    Output:
    +--------+-----------------------+-------+
    |party_id|percentage_missing_ctus|buckets|
    +--------+-----------------------+-------+
    |       1|                    0.2|    0.0|
    |       3|                    1.0|    4.0|
    +--------+-----------------------+-------+
    """
    edges = [0, 0.25, 0.5, 0.75, 0.99, float('Inf')]
    bucketizer = Bucketizer(splits=edges,
                            inputCol="percentage_missing_ctus",
                            outputCol="buckets")
    bucketizer = bucketizer.setHandleInvalid("keep")
    return bucketizer.transform(percentage_of_missing_ctus_per_partyid)
def _compute_hist(sdf, bins):
    """Histogram counts of the last column of `sdf` over the given bin edges."""
    # 'sdf' is a Spark DataFrame whose last column is the one to histogram.
    assert isinstance(bins, (np.ndarray, np.generic))
    colname = sdf.columns[-1]
    bucket_name = "__{}_bucket".format(colname)
    # Map each value to its bin index; values outside the edges are dropped.
    bucketizer = Bucketizer(
        splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip"
    )
    # Count rows per bucket and collect the (small) result locally.
    counts = (
        bucketizer.transform(sdf)
        .select(bucket_name)
        .groupby(bucket_name)
        .agg(F.count("*").alias("count"))
        .toPandas()
        .sort_values(by=bucket_name)
    )
    # One row per bin index — grouping alone omits empty bins.
    all_bins = pd.DataFrame(
        {bucket_name: np.arange(0, len(bins) - 1), "bucket": bins[:-1]}
    )
    # Left-merge so empty bins get a zero count.
    merged = all_bins.merge(counts, how="left", on=[bucket_name]).fillna(0)[["count"]]
    merged.columns = [bucket_name]
    return merged[bucket_name]
def get_binned_stat(self, df, colname, col_stat, n_split=10):
    """Histogram of `colname` over `n_split` equal-width bins.

    Returns a list of {"name": bin_label, "value": count} dicts ordered by
    bin, with a trailing "null" bin collecting missing values.
    """
    edges = sorted(CommonUtils.frange(col_stat["min"], col_stat["max"],
                                      num_steps=n_split))
    ranges = [(edges[i], edges[i + 1]) for i in range(len(edges) - 1)]
    splits_data = {"splits": edges, "splits_range": ranges}
    splits = splits_data["splits"]
    double_df = df.withColumn(colname, df[colname].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=colname, outputCol="BINNED_INDEX")
    bucketizer.setSplits(splits)
    binned_df = bucketizer.transform(double_df)
    histogram_df = binned_df.groupBy("BINNED_INDEX").count().toPandas()
    # Human-readable "lo to hi" label per bin, plus a sentinel for nulls.
    labels = [" to ".join([str(lo), str(hi)]) for lo, hi in ranges]
    bin_name_dict = dict(zip(range(len(ranges)), labels))
    bin_name_dict[n_split] = "null"
    histogram_df["orderIndex"] = histogram_df["BINNED_INDEX"].apply(
        lambda x: n_split if pd.isnull(x) else x)
    histogram_df["bins"] = histogram_df["orderIndex"].apply(
        lambda x: bin_name_dict[int(x)])
    rows = histogram_df[["bins", "count", "orderIndex"]].T.to_dict().values()
    ordered = sorted(rows, key=lambda row: row["orderIndex"])
    return [{"name": row["bins"], "value": row["count"]} for row in ordered]
def calc_histogram(self, bins):
    """Count values of self.colname per histogram bin defined by `bins` edges."""
    bucket_name = '__{}_bucket'.format(self.colname)
    # Map each value to its bin index; out-of-range values are skipped.
    bucketizer = Bucketizer(splits=bins,
                            inputCol=self.colname,
                            outputCol=bucket_name,
                            handleInvalid="skip")
    counts = (bucketizer
              .transform(self.data._kdf._sdf)
              .select(bucket_name)
              .groupby(bucket_name)
              .agg(F.count('*').alias('count'))
              .toPandas()
              .sort_values(by=bucket_name))
    # One row per bin index — empty bins would otherwise be missing.
    indexes = pd.DataFrame({bucket_name: np.arange(0, len(bins) - 1),
                            'bucket': bins[:-1]})
    # Left-merge so empty bins get zero counts.
    data = indexes.merge(counts, how='left', on=[bucket_name]).fillna(0)[['count']]
    data.columns = [bucket_name]
    return data
def test_bucketizer(self):
    """Round-trip a Bucketizer through ONNX and compare with Spark's output."""
    values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
    data = self.spark.createDataFrame(values, ["features"])
    model = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
                       inputCol="features",
                       outputCol="buckets")
    feature_count = len(data.select('features').first())
    model_onnx = convert_sparkml(
        model, 'Sparkml Bucketizer',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # Produce the expected bucket assignments with the Spark model itself.
    predicted = model.setHandleInvalid("error").transform(data)
    expected = predicted.select("buckets").toPandas().values.astype(
        numpy.float32)
    data_np = [data.toPandas().values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBucketizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['buckets'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def OneHotEncoder(self):
    """Configure bucketizing, string-indexing and one-hot-encoding stages.

    Sets self.bucketizer, self.varIdxer and self.oneHot; returns None.

    NOTE(review): this method shares its name with pyspark's OneHotEncoder
    class. Inside the body the bare name still resolves to the imported class
    (methods are not in scope as bare names), but the overlap is confusing —
    consider renaming the method.
    """
    time_splits = [-float("inf"), 500, 1200, 1700, float("inf")]
    # Identical time-of-day splits for all three schedule columns.
    self.bucketizer = Bucketizer(
        splitsArray=[time_splits, time_splits, time_splits],
        inputCols=["CRSDepTime", "CRSArrTime", "DepTime"],
        outputCols=["CatCRSDepTime", "CatCRSArrTime", "CatDepTime"])
    self.varIdxer = StringIndexer(
        inputCol="OrigDest",
        outputCol="IndOrigDest").setHandleInvalid("skip")
    self.oneHot = OneHotEncoder(
        inputCols=['Month', 'DayOfWeek', 'CatCRSDepTime', 'CatCRSArrTime',
                   'IndOrigDest', 'CatDepTime'],
        outputCols=['HotMonth', 'HotDayOfWeek', 'HotCRSCatDepTime',
                    'HotCRSCatArrTime', 'HotIndOrigDest',
                    'HotDepTime']).setHandleInvalid("keep")
def get_column_hist(self, column, bins):
    """Return a list of counts corresponding to `bins` for `column`."""
    edges = list(copy.deepcopy(bins))  # local copy: we insert and pop below
    # Ensure the first edge is -inf, remembering whether we had to add it.
    if edges[0] == -np.inf or edges[0] == -float("inf"):
        added_min = False
        edges[0] = -float("inf")
    else:
        added_min = True
        edges.insert(0, -float("inf"))
    # Likewise ensure the last edge is +inf.
    if edges[-1] == np.inf or edges[-1] == float("inf"):
        added_max = False
        edges[-1] = float("inf")
    else:
        added_max = True
        edges.append(float("inf"))
    non_null = self.spark_df.select(column).where(col(column).isNotNull())
    bucketizer = Bucketizer(splits=edges, inputCol=column, outputCol="buckets")
    bucketed = bucketizer.setHandleInvalid("skip").transform(non_null)
    # Bucketizer cannot handle values outside its range (hence the -/+inf
    # padding above) and follows lower_bound <= bin < upper_bound for all but
    # the last bin. Since our last real bin's upper bound may not be +inf,
    # values exactly equal to that bound must be counted and added back.
    if added_max:
        upper_bound_count = (non_null.select(column)
                             .filter(col(column) == edges[-2]).count())
    else:
        upper_bound_count = 0
    hist_rows = bucketed.groupBy("buckets").count().collect()
    # Spark only returns buckets that have nonzero counts.
    hist = [0] * (len(edges) - 1)
    for row in hist_rows:
        hist[int(row["buckets"])] = row["count"]
    hist[-2] += upper_bound_count
    if added_min:
        below_bins = hist.pop(0)
        edges.pop(0)
        if below_bins > 0:
            logger.warning("Discarding histogram values below lowest bin.")
    if added_max:
        above_bins = hist.pop(-1)
        edges.pop(-1)
        if above_bins > 0:
            logger.warning("Discarding histogram values above highest bin.")
    return hist
def model_train(zipcode, complaint, day):
    """Train a resolution-time classifier on NYC 311 data and predict one case.

    Parameters
    ----------
    zipcode : convertible to int — incident zip code
    complaint : str — complaint type group
    day : convertible to int — day of week (Spark dayofweek convention)

    Returns
    -------
    str — human-readable prediction of how long resolution will take.
    """
    print("Loading Data ...")
    data311 = spark.read.format("csv").option("header",
                                              "true").load("Data_Final/*.csv")
    data311.registerTempTable("data311")
    data311 = data311.withColumn("ResTimeH",
                                 data311.Resolution_Time_Hours.cast('int'))
    data311 = data311.withColumn('day_of_week',
                                 dayofweek(data311['Created Date']))
    data311 = data311.withColumn("Zip", data311["Incident Zip"].cast('int'))
    # Keep only plausible resolution times.
    data311 = data311.filter(data311.ResTimeH > 0)
    data311 = data311.filter(data311.ResTimeH < 99)
    # Bucket resolution hours into 3 classes: <2h, 2-6h, >6h.
    bucketizer = Bucketizer(splits=[0, 2, 6, float('Inf')],
                            inputCol="ResTimeH",
                            outputCol="categories")
    data311 = bucketizer.setHandleInvalid("keep").transform(data311)
    X = data311['Zip', 'Complaint_Type_Groups', 'day_of_week', 'categories']
    X = X.filter(X["Zip"].isNotNull())
    X = X.filter(X["Complaint_Type_Groups"].isNotNull())
    X = X.filter(X["day_of_week"].isNotNull())
    stage_1 = StringIndexer(inputCol="Complaint_Type_Groups",
                            outputCol="categoryIndex")
    stage_2 = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])
    stage_3 = VectorAssembler(inputCols=['Zip', 'day_of_week', 'categoryVec'],
                              outputCol="features")
    stage_4 = StandardScaler().setInputCol("features").setOutputCol(
        "Scaled_ip_features")
    stage_5 = LogisticRegression(labelCol="categories",
                                 featuresCol="Scaled_ip_features")
    # Set up and fit the full pipeline.
    pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
    pipeline_model = pipeline.fit(X)
    # Score the single user-supplied case.
    input_variables = pd.DataFrame(
        [[int(zipcode), complaint, int(day)]],
        columns=['Zip', 'Complaint_Type_Groups', 'day_of_week'])
    transformed = pipeline_model.transform(spark.createDataFrame(input_variables))
    ans = transformed.select(collect_list('prediction')).first()[0]
    # Fix: the first message's string literal was broken across a physical
    # line break (syntax error). Also removed three unused locals
    # (infer_schema, first_row_is_header, delimiter).
    if ans[0] == 0.0:
        prediction = "Your complaint will be resolved within 2 hours."
    elif ans[0] == 1.0:
        prediction = "Your complaint will be resolved within 2-6 hours."
    else:
        prediction = "Your complaint will be resolved after 6 hours"
    return prediction
def transform_spark(data, columns, args, transformed_column_name):
    """Bucketize the `num` column at args['bucket_boundaries'], cast to int."""
    from pyspark.ml.feature import Bucketizer
    import pyspark.sql.functions as F

    bucketizer = Bucketizer(
        splits=args["bucket_boundaries"],
        inputCol=columns["num"],
        outputCol=transformed_column_name,
    )
    bucketed = bucketizer.transform(data)
    # Bucketizer emits double-valued indices; downstream expects integers.
    return bucketed.withColumn(
        transformed_column_name,
        F.col(transformed_column_name).cast("int"),
    )
def get_binned_dataframe(df, bin_name, variable_name, edges):
    """Add a column `bin_name` holding `variable_name` binned by `edges`.

    The edges are padded with -inf/+inf so out-of-range values land in the
    first/last bucket instead of failing.
    """
    padded_splits = [-float('inf')] + list(edges) + [float('inf')]
    bucketizer = Bucketizer(splits=padded_splits,
                            inputCol=variable_name,
                            outputCol=bin_name)
    return bucketizer.transform(df)
def strat_scatterplot(sdf, col1, col2, n=30):
    """Fit a pipeline bucketing both columns into `n` strata.

    Returns (fitted pipeline model, total row count of `sdf`).
    """
    stages = []
    for column in (col1, col2):
        # Derive bucket edges from the column's values.
        edges = get_buckets(sdf.select(column).rdd.map(itemgetter(0)), n)
        stages.append(Bucketizer(splits=edges,
                                 inputCol=column,
                                 outputCol="__{}_bucket".format(column),
                                 handleInvalid="skip"))
    model = Pipeline(stages=stages).fit(sdf)
    return model, sdf.count()
def age_recoder(spark_df, age_col):
    """Append three standard age-break columns (age1/age7/age8) to spark_df.

    :param spark_df: input Spark DataFrame
    :param age_col: name of the numeric age column
    :return: DataFrame with the three bucket columns added; invalid ages are
        kept (handleInvalid="keep") in an extra bucket.
    """
    breaks = {
        "age1": [0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
        "age7": [0, 18, 25, 35, 45, 55, 65, 150],
        "age8": [0, 12, 18, 25, 35, 45, 55, 65, 150],
    }
    result = spark_df
    # Apply each grouping in declaration order (dicts preserve insertion order).
    for out_col, splits in breaks.items():
        bucketizer = Bucketizer(splits=splits, inputCol=age_col,
                                outputCol=out_col)
        result = bucketizer.setHandleInvalid("keep").transform(result)
    return result
def bucketizer_splits(dataFrame, inputCol,
                      splits=None):
    """Discretize `inputCol` into buckets at the given split boundaries.

    Parameters
    ----------
    dataFrame : Spark DataFrame
    inputCol : str — column to bucketize
    splits : list of floats, optional — bucket boundaries; defaults to
        [-inf, -0.5, 0.0, 0.5, inf].

    Returns
    -------
    DataFrame with an added '<inputCol>_bucketizer' column.
    """
    # Fix: avoid a mutable default argument; build the default per call.
    if splits is None:
        splits = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
    # `splits` defines the bucket boundaries for the discretization.
    bucketizer = Bucketizer(splits=splits,
                            inputCol=inputCol,
                            outputCol='%s_bucketizer' % (inputCol))
    bucketedData = bucketizer.transform(dataFrame)
    print('Bucketizer output with %d buckets' %
          (len(bucketizer.getSplits()) - 1))
    return bucketedData
def test_save_and_load_on_nested_list_params(self):
    """splitsArray (a nested-list param) must survive a save/load round trip."""
    temp_path = tempfile.mkdtemp()
    splitsArray = [
        [-float("inf"), 0.5, 1.4, float("inf")],
        [-float("inf"), 0.1, 1.2, float("inf")],
    ]
    bucketizer = Bucketizer(splitsArray=splitsArray,
                            inputCols=["values", "values"],
                            outputCols=["b1", "b2"])
    save_path = temp_path + "/bk"
    bucketizer.write().overwrite().save(save_path)
    restored = Bucketizer.load(save_path)
    assert restored.getSplitsArray() == splitsArray
def buckert(self, df, column):
    """Bucketize `column` at the fixed boundaries [-inf, -0.5, 0.0, 0.5, inf].

    Returns the frame with an added '<column>_bucketed' index column.
    """
    boundaries = [-float('inf'), -0.5, 0.0, 0.5, float('inf')]
    # `splits` specifies the bucket boundaries for the discretization.
    bucketizer = Bucketizer(splits=boundaries,
                            inputCol=column,
                            outputCol=column + '_bucketed')
    bucketed = bucketizer.transform(df)
    print('Bucketizer output with %d buckets' %
          (len(bucketizer.getSplits()) - 1))
    return bucketed
def pre_processing(dataFrame):
    """Bucketize the 'features' column and display the result.

    NOTE(review): the bucketed DataFrame is shown but not returned — confirm
    callers do not expect a return value.
    """
    boundaries = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
    bucketizer = Bucketizer(splits=boundaries,
                            inputCol="features",
                            outputCol="bucketedFeatures")
    # Transform original data into its bucket index.
    bucketedData = bucketizer.transform(dataFrame)
    print("Bucketizer output with %d buckets" %
          (len(bucketizer.getSplits()) - 1))
    bucketedData.show()
def add_age_id(spark, df, logger):
    """Calculate age_id by bucketing the visitor age column (i94bir).

    Ages equal to -1 map to the sentinel id 999; invalid values are kept by
    the bucketizer in an extra bucket. The result is persisted.
    """
    age_bucketizer = Bucketizer(
        splits=[float('-Inf'), 0, 2, 11, 16, 21, 26, 36, 46, 56, 66,
                float('Inf')],
        inputCol="i94bir",
        outputCol="agebuckets")
    bucketed_df = age_bucketizer.setHandleInvalid("keep").transform(df)
    age_id_df = bucketed_df.withColumn(
        "age_id",
        when(col("i94bir") == -1, 999)
        .otherwise(col("agebuckets").cast(IntegerType())))
    logger.info("Added age_id")
    age_id_df.persist()
    return age_id_df
def _bucketize_age_column(
        self, dataframe: DataFrame, input_col: str,
        output_col: str) -> Tuple[DataFrame, int, List[str]]:
    """Bucket `input_col` by self.age_groups into `output_col`.

    Returns the transformed frame, the number of age groups, and one
    "[lo, hi)" label per group; invalid ages are kept in an extra bucket.
    """
    bucketizer = Bucketizer(splits=self.age_groups,
                            inputCol=input_col,
                            outputCol=output_col)
    output = bucketizer.setHandleInvalid("keep").transform(dataframe)
    edges = list(bucketizer.getSplits())
    # One label per consecutive pair of edges.
    mapping = ["[{}, {})".format(lo, hi) for lo, hi in zip(edges, edges[1:])]
    return output, len(mapping), mapping
def add_duration_id(spark, df, logger):
    """Calculate visitduration_id by bucketing the stay length in days.

    Rows missing either arrival or departure date get the sentinel id 999;
    other invalid values are kept by the bucketizer in an extra bucket.
    """
    with_days = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    duration_bucketizer = Bucketizer(
        splits=[float('-Inf'), 0, 4, 8, 11, 15, 22, 29, float('Inf')],
        inputCol="duration_days",
        outputCol="ddbuckets")
    bucketed = duration_bucketizer.setHandleInvalid("keep").transform(with_days)
    dur_id_df = bucketed.withColumn(
        "visitduration_id",
        when(isnull(col("arrdate")) | isnull(col("depdate")), 999)
        .otherwise(col("ddbuckets").cast(IntegerType())))
    logger.info("Added duration_id")
    return dur_id_df
def _transform_data(self, data):
    """Apply configured data handling: pairwise interactions, binning,
    per-feature transformations, and feature removal.

    Parameters
    ----------
    data : Spark DataFrame

    Returns
    -------
    Transformed Spark DataFrame.
    """
    data_handling = self.data_settings.get('data_handling', {})
    # Interactions: add (col1 + 1) * (col2 + 1) for each unordered pair of
    # feature columns (the target column is excluded).
    if data_handling.get('interactions', False):
        columns_list = list(data.columns)
        columns_list.remove(self.model_settings['variable_to_predict'])
        for col1 in columns_list:
            for col2 in columns_list:
                if col1 == col2:
                    continue
                name = str(col1) + '_' + str(col2)
                reverse_name = str(col2) + '_' + str(col1)
                # Skip if the mirrored pair was already added.
                if reverse_name not in data.columns:
                    data = data.withColumn(
                        name, (F.col(col1) + 1) * (F.col(col2) + 1))
    # Binning: pad configured bins with the column's min-1 / max+1 so every
    # observed value falls inside some bucket.
    for feature_to_bin in data_handling.get("features_to_bin", []):
        min_val = data.agg({feature_to_bin['name']: "min"}).collect()[0][0]
        max_val = data.agg({feature_to_bin['name']: "max"}).collect()[0][0]
        full_bins = [(min_val - 1)] + feature_to_bin['bins'] + [(max_val + 1)]
        bucketizer = Bucketizer(splits=full_bins,
                                inputCol=feature_to_bin['name'],
                                outputCol=feature_to_bin['name'] + '_binned')
        data = bucketizer.transform(data)
    # Per-feature transformations.
    for feature in data_handling.get("features_handling", {}):
        transformation_array = data_handling["features_handling"][feature].get(
            "transformation", [])
        for method in transformation_array:
            # SECURITY NOTE(review): eval() on a config-supplied method name —
            # only safe if the settings source is trusted; consider
            # getattr(F, method) with a whitelist instead.
            data = data.withColumn(feature + '_' + method,
                                   eval('F.' + method)(feature))
    # Drop configured features that are actually present.
    features_to_remove = data_handling.get('features_to_remove', [])
    if features_to_remove:
        data = data.drop(
            *[f for f in features_to_remove if f in data.columns])
    return data
def generateGroupedMeasureDataDict(self, measure_column):
    """Bin `measure_column` at project-computed splits.

    Returns {"bins": {bin_index: (low, high)}, "data": binned DataFrame}.
    """
    splits_data = self.get_measure_column_splits(self._data_frame,
                                                 measure_column, 4)
    splits = splits_data["splits"]
    double_df = self._data_frame.withColumn(
        measure_column, self._data_frame[measure_column].cast(DoubleType()))
    bucketizer = Bucketizer(inputCol=measure_column, outputCol="BINNED_INDEX")
    bucketizer.setSplits(splits)
    binned_df = bucketizer.transform(double_df)
    # Pair each bin index actually present with its (low, high) range.
    observed = [int(row[0]) for row in
                binned_df.select("BINNED_INDEX").distinct().collect()]
    binned_index_dict = dict(zip(observed, splits_data["splits_range"]))
    return {"bins": binned_index_dict, "data": binned_df}
def strat_scatterplot(sdf, col1, col2, n=30):
    """Fit bucketizers splitting col1/col2 into `n` equal-width strata.

    Returns (fitted pipeline model, total row count of `sdf`).
    """
    stages = []
    for column in (col1, col2):
        lo_hi = sdf.agg(F.min(column), F.max(column)).rdd.map(tuple).collect()[0]
        # n + 1 edges give exactly n buckets over [min, max].
        edges = np.linspace(*lo_hi, n + 1)
        stages.append(
            Bucketizer(splits=edges,
                       inputCol=column,
                       outputCol='__{}_bucket'.format(column),
                       handleInvalid="skip"))
    model = Pipeline(stages=stages).fit(sdf)
    return model, sdf.count()
def bucketize(self, df, field):
    """Add `<field>_bucketized`: `field` bucketed into ~stddev-wide buckets.

    The number of interior edges is (max - min) // stddev (1 when the column
    is constant), padded with -inf/+inf so every value is covered.
    """
    df = df.withColumn(field, df[field].cast("double"))
    # Renamed locals so the builtins max/min are not shadowed.
    max_val = df.agg({field: "max"}).collect()[0][0]
    min_val = df.agg({field: "min"}).collect()[0][0]
    stddev = df.agg({field: "stddev"}).collect()[0][0]
    number_of_buckets = 1
    if stddev != 0:
        number_of_buckets = ((max_val - min_val) // (stddev))
    # Fix: np.float was deprecated in NumPy 1.20 and removed in 1.24 —
    # the builtin float is the documented replacement.
    buckets = np.arange(number_of_buckets, dtype=float).tolist()
    buckets = [-float('inf')] + buckets + [float('inf')]
    bucketizer = Bucketizer(splits=buckets,
                            inputCol=field,
                            outputCol=field + '_bucketized')
    print("Bucketizing column: ", field)
    bucketized_features = bucketizer.transform(df)
    return bucketized_features
def transform_data(content_items):
    """Derive receive_date, bin days_from_eula, and encode the bin as a char."""
    content_items = content_items.withColumn(
        'receive_date', F.to_date(F.col('time'))).drop('time')
    bucketizer = Bucketizer(splits=DAYS_FROM_EULA_BINS,
                            inputCol='days_from_eula',
                            outputCol='days_from_eula_bin',
                            handleInvalid='skip')
    binned = bucketizer.transform(content_items).drop('days_from_eula')
    # Shift the bin index by the character baseline and encode it as a char.
    binned = binned.withColumn(
        'days_from_eula_bin',
        convert_to_char(
            F.col('days_from_eula_bin').astype('int') + INT_TO_CHAR_BASELINE))
    print('content item data transformed')
    return binned
def test_measures(self, targetDimension, testMeasure):
    """Chi-square test of association between a dimension column and a
    (possibly bucketed) measure column.

    When the measure has more than 10 non-null rows it is cast to double and
    bucketed into 5 equal-width bins before the crosstab; otherwise the raw
    values are crosstabbed directly. Returns a populated ChiSquareResult
    (chi-square stats, contingency table, Cramer's V, split values).

    NOTE(review): `splits` is only assigned in the count > 10 branch, but
    `freq_table.update_col2_names(splits)` and set_split_values below use it
    unconditionally — the small-count path would raise NameError. Confirm
    whether that path is ever taken.
    """
    chisquare_result = ChiSquareResult()
    # Measure must be numeric (double) for describe/bucketing.
    df = self._data_frame.withColumn(
        testMeasure, self._data_frame[testMeasure].cast(DoubleType()))
    # describe() yields count/mean/stddev/min/max as string pairs.
    measureSummaryDict = dict(df.describe([testMeasure]).toPandas().values)
    if float(measureSummaryDict["count"]) > 10:
        maxval = float(measureSummaryDict["max"])
        minval = float(measureSummaryDict["min"])
        step = (maxval - minval) / 5.0
        # 5 equal-width bins; outer edges rounded outward so min/max fit.
        splits = [
            math.floor(minval), minval + step, minval + (step * 2),
            minval + (step * 3), minval + (step * 4),
            math.ceil(maxval)
        ]
        bucketizer = Bucketizer(splits=splits,
                                inputCol=testMeasure,
                                outputCol="bucketedColumn")
        # bucketedData = bucketizer.transform(df)
        # Nulls are dropped first — Bucketizer cannot place them.
        bucketedData = bucketizer.transform(df.na.drop(subset=testMeasure))
        pivot_table = bucketedData.stat.crosstab(
            "{}".format(targetDimension), 'bucketedColumn')
    else:
        pivot_table = df.stat.crosstab("{}".format(targetDimension),
                                       testMeasure)
    # Flatten the crosstab (minus its label column) column-major for the
    # dense matrix constructor.
    rdd = list(
        chain(*zip(*pivot_table.drop(pivot_table.columns[0]).collect())))
    data_matrix = Matrices.dense(pivot_table.count(),
                                 len(pivot_table.columns) - 1, rdd)
    result = Statistics.chiSqTest(data_matrix)
    chisquare_result.set_params(result)
    freq_table = self._get_contingency_table_of_freq(pivot_table)
    freq_table.update_col2_names(splits)
    freq_table.set_tables()
    chisquare_result.set_table_result(freq_table)
    # Cramers V Calculation
    stat_value = result.statistic
    n = freq_table.get_total()
    t = min(len(freq_table.column_one_values),
            len(freq_table.column_two_values))
    v_value = math.sqrt(float(stat_value) / (n * float(t)))
    chisquare_result.set_v_value(v_value)
    chisquare_result.set_split_values([float(x) for x in splits])
    # chisquare_result.set_buckeddata(bucketedData)
    return chisquare_result