def __init__(self, kdf_or_kser, window, min_periods=None): from databricks.koalas import DataFrame, Series from databricks.koalas.groupby import SeriesGroupBy, DataFrameGroupBy if window < 0: raise ValueError("window must be >= 0") if (min_periods is not None) and (min_periods < 0): raise ValueError("min_periods must be >= 0") self._window_val = window if min_periods is not None: self._min_periods = min_periods else: # TODO: 'min_periods' is not equivalent in pandas because it does not count NA as # a value. self._min_periods = window self.kdf_or_kser = kdf_or_kser if not isinstance(kdf_or_kser, (DataFrame, Series, DataFrameGroupBy, SeriesGroupBy)): raise TypeError( "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser)) if isinstance(kdf_or_kser, (DataFrame, Series)): self._index_scols = kdf_or_kser._internal.index_scols self._window = Window.orderBy(self._index_scols).rowsBetween( Window.currentRow - (self._window_val-1), Window.currentRow) self._unbounded_window = Window.orderBy(self._index_scols)
def main(spark, model_file, test_file): model = MatrixFactorizationModel.load(sc, model_file) #model = MatrixFactorizationModel.load(sc, 'hdfs:/user/xx852/als_model') test_df = spark.read.parquet(test_file) #test_df = spark.read.parquet('hdfs:/user/xx852/cf_test_small.parquet') test_df = test_df.select('user_label', 'track_label', 'count') #predictions = model.recommendProductsForUsers(500) predictions = model.recommendProductsForUsers(2) prediction_flat = predictions.flatMap(lambda p: p[1]) prediction_df = prediction_flat.toDF() intersections = prediction_df.join(test_df, (prediction_df.product == test_df.track_label)& (prediction_df.user == test_df.user_label), how = 'inner') predLabel = intersections.select('rating', 'count') #predLabel_rdd = predLabel.rdd.map(lambda x: Row(x[0], x[1])) #metrics = RankingMetrics(predLabel_rdd) #print(metrics.meanAveragePrecision) from pyspark.sql import Window import pyspark.sql.functions as psf w_rating = Window.orderBy(psf.desc('rating')) w_count = Window.orderBy(psf.desc('count')) predLabel = predLabel.withColumn('rating_rank', \ psf.dense_rank().over(w_rating)).withColumn('count_rank', \ psf.dense_rank().over(w_count)) predLabel = predLabel.select('rating_rank', 'count_rank') #predLabel_rdd = predLabel.rdd.map(lambda x: Row(x[0], x[1])) predLabel_rdd = predLabel.rdd metrics = RankingMetrics(predLabel_rdd)
def AnadeColumnaSpark(SQLContext, DFInput, NombreColumna, ListaAInsertar, cambioADouble): # Se anade una columna a un df de spark a partir de una lista y poniendole # un nombre a la columna #Se crea el dataframe de la Lista a Insertar DFListaAInsertar = SQLContext.createDataFrame([(l, ) for l in ListaAInsertar], [NombreColumna]) #add 'sequential' index and join both dataframe to get the final result DFInput = DFInput.withColumn( "row_idx", row_number().over(Window.orderBy(monotonically_increasing_id()))) DFListaAInsertar = DFListaAInsertar.withColumn( "row_idx", row_number().over(Window.orderBy(monotonically_increasing_id()))) DFSalida = DFInput.join( DFListaAInsertar, DFInput.row_idx == DFListaAInsertar.row_idx).drop("row_idx") if cambioADouble == True: DFSalida = DFSalida.withColumn( NombreColumna, DFSalida[NombreColumna].cast(DecimalType(15, 3))) return DFSalida
def with_row_number(output_col: str, order_by: list, df: DataFrame, sort="asc", zero_indexed=True) -> DataFrame: """Assign a sequential row number to each member of a dataframe""" is_desc = sort.lower() in ["desc", "descending"] if isinstance(order_by, str) or isinstance(order_by, Column): order_by = [order_by] elif not isinstance(order_by, list): msg = "Ordering criteria must be a string column name or a list of string column names" raise Exception(msg) # create a window function depending on the sort order if is_desc: window = Window.orderBy(*[F.desc(i) for i in order_by]) else: window = Window.orderBy(*[F.asc(i) for i in order_by]) # if the client wants to start from row 1 then that's fine if not zero_indexed: return df.withColumn(output_col, F.row_number().over(window)) # otherwise start from row number 0 return df.withColumn(output_col, F.row_number().over(window) - 1)
def calc_profit2(df): ''' Creating new column with shifted Close price by 1 day Profit label calculation 1 if stock risen up, 0 is it went down :param df: :return: ''' df_daily_return = df.withColumn( 'prev_day_price', F.lag(df['Open']).over(Window.orderBy("id"))) df_daily_return = df_daily_return.filter( df_daily_return.prev_day_price.isNotNull()) df_profit = df_daily_return.withColumn( 'Profit', profit_udf(df_daily_return.Open, df_daily_return.prev_day_price)) # df_profit = df_profit.withColumn( # 'prediction', BHC(df_profit.Profit, F.lag(df_profit['Profit']).over( # Window.orderBy("id")))) df_profit = df_profit.withColumn('prediction', RC(df_profit.Profit)) df_shifted_profit = df_profit.withColumn( 'Profit', F.lag(df_profit['Profit'], count=-1).over(Window.orderBy("id"))) final_df = df_shifted_profit.filter(df_shifted_profit.Profit.isNotNull()) final_df = final_df.drop("Daily return") final_df = final_df.drop("prev_day_price") return final_df
def filterEnglish(df, sqlContext): before = df.count() df = df.dropna(subset=['tweet']) after = df.count() df = df.filter(df.country == 'United States of America') print("Success: removed " + str(before - after) + " null tweets and filtered USA") tweetList = list(df.select('tweet').toPandas()['tweet']) print("Success: made tweet list") languages = [ld.detect_language(text) for text in tweetList] print("Success: languages detected") languageDf = sqlContext.createDataFrame([(l, ) for l in languages], ['languages']) df = df.withColumn( "row_idx", row_number().over(Window.orderBy(monotonically_increasing_id()))) languageDf = languageDf.withColumn( "row_idx", row_number().over(Window.orderBy(monotonically_increasing_id()))) df = df.join(languageDf, df.row_idx == languageDf.row_idx).drop("row_idx") df = df.filter(df.languages == 'english').drop("languages") print("Success: filtered languages") return df
def spark_join(df, column, column_name, sql_context): b = sql_context.createDataFrame([(int(l),) for l in column], [column_name]) # add 'sequential' index and join both dataframe to get the final result df = df.withColumn("row_idx_2", row_number().over(Window.orderBy(monotonically_increasing_id()))) b = b.withColumn("row_idx", row_number().over(Window.orderBy(monotonically_increasing_id()))) final_df = df.join(b, df.row_idx_2 == b.row_idx). \ drop("row_idx_2") return final_df
def _cal_percent_rank(self, dfBase, col_name, order): print "sort column[{0}],order by[{1}]".format(col_name, order) if order == "asc": window = Window.orderBy(col_name) else: window = Window.orderBy(fn.desc(col_name)) df = dfBase.where("%s is not null" % col_name) \ .withColumn(col_name + "_rank", fn.percent_rank().over(window)) \ .persist(storageLevel=StorageLevel.DISK_ONLY) \ .union(dfBase.where("%s is null" % col_name) .withColumn(col_name + "_rank", fn.lit(None))) \ .repartition(20).persist(storageLevel=StorageLevel.DISK_ONLY) df.count() return df
def _is_monotonic_decreasing(self) -> Series: window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1) cond = SF.lit(True) has_not_null = SF.lit(True) for scol in self._internal.index_spark_columns[::-1]: data_type = self._internal.spark_type_for(scol) prev = F.lag(scol, 1).over(window) compare = MultiIndex._comparator_for_monotonic_increasing(data_type) # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex. # Therefore, we should check `has_not_null` over the all levels. has_not_null = has_not_null & scol.isNotNull() cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__lt__)) cond = has_not_null & (prev.isNull() | cond) cond_name = verify_temp_column_name( self._internal.spark_frame.select(self._internal.index_spark_columns), "__is_monotonic_decreasing_cond__", ) sdf = self._internal.spark_frame.select( self._internal.index_spark_columns + [cond.alias(cond_name)] ) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[ scol_for(sdf, col) for col in self._internal.index_spark_column_names ], index_names=self._internal.index_names, index_fields=self._internal.index_fields, ) return first_series(DataFrame(internal))
def modify_group_for_dim(model_dict, df, d, colname): '''given a DF with a groups assigned (variable colname), apply a dictionary to post-process the groups according to that one dimension. returns original DF with modified colname column. e.g. move specific seasons to the holdout or throwaway sets. ''' dim_props = model_dict['dimensional_dataset_proportions'].iteritems() for grp, grp_dict_list in dim_props: for grp_dict in grp_dict_list: window = Window.orderBy('dim_rnd')\ .partitionBy(grp_dict['dim'], colname) df = df.withColumn('dim_rk', F.percent_rank().over(window)) ## if (1) the column is within the set values, ## (2) the pre-existing group falls within those set values, and ## (3) the random value is below the set threshold, ## then override and modify the group membership if grp_dict['prop_to_move'] > 0: df = df.withColumn( colname, F.when((col(grp_dict['dim']).isin(grp_dict['vals'])) & (col(colname).isin(grp_dict['from_groups'])) & (col('dim_rk') >= 1 - grp_dict['prop_to_move']), grp).otherwise(col(colname))) return df
def generate_brand_data(): try: hbase = {"table": brand_table, "families": ["0"], "row": "brand_id"} print(f"{dt.now()} 生成品牌名称 {brand_table}") #写入品牌表 plm_item = get_plm_item(spark).select("item_id", "item_name", "item_kind", "is_mrb") # item_kind !=4 4为罚没 is_mrb=1 1为在使用 去掉空格 去掉* 中文括号用英文括号替换 #处理item_name #??? 可以用所有数据 然后drop掉没有括号的 plm_item_brand = plm_item\ .where(col("item_kind") != "4") \ .where(col("is_mrb") == "1") \ .withColumn("brand_name",item_name_udf(col("item_name")))\ .dropDuplicates(["brand_name"]) \ .where(col("brand_name") != "") from pyspark.sql import Window win = Window.orderBy("item_id") brand=plm_item_brand.select("item_id", "item_name", "brand_name") \ .withColumn("brand_id", f.row_number().over(win)) brand.foreachPartition( lambda x: write_hbase1(x, ["brand_name"], hbase)) except: tb.print_exc()
def state_dimension(business_df): # state dimension states = business_df.select(business_df.state).distinct() state_dim = states.withColumn( "state_id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) return state_dim
def shift(self, periods=1, fill_value=None): """ Shift Series/Index by desired number of periods. .. note:: the current implementation of shift uses Spark's Window without specifying partition specification. This leads to move all data into single partition in single machine and could cause serious performance degradation. Avoid this method against very large dataset. Parameters ---------- periods : int Number of periods to shift. Can be positive or negative. fill_value : object, optional The scalar value to use for newly introduced missing values. The default depends on the dtype of self. For numeric data, np.nan is used. Returns ------- Copy of input Series/Index, shifted. Examples -------- >>> df = ks.DataFrame({'Col1': [10, 20, 15, 30, 45], ... 'Col2': [13, 23, 18, 33, 48], ... 'Col3': [17, 27, 22, 37, 52]}, ... columns=['Col1', 'Col2', 'Col3']) >>> df.Col1.shift(periods=3) 0 NaN 1 NaN 2 NaN 3 10.0 4 20.0 Name: Col1, dtype: float64 >>> df.Col2.shift(periods=3, fill_value=0) 0 0 1 0 2 0 3 13 4 23 Name: Col2, dtype: int64 """ if len(self._internal.index_columns) == 0: raise ValueError("Index must be set.") if not isinstance(periods, int): raise ValueError('periods should be an int; however, got [%s]' % type(periods)) col = self._scol index_columns = self._kdf._internal.index_columns window = Window.orderBy(index_columns).rowsBetween(-periods, -periods) shifted_col = F.lag(col, periods).over(window) col = F.when( shifted_col.isNull() | F.isnan(shifted_col), fill_value ).otherwise(shifted_col) return self._with_new_scol(col).alias(self.name)
def to_indexed_ids(df: DataFrame, col_name: str) -> DataFrame: """node id to index number. Args: df (DataFrame): source data frame. col_name (str): column to generate index. Returns: DataFrame: ``` +-----------------+-----------+..+ |customer_id_index|customer_id|..| +-----------------+-----------+..| | 1| 10001105|..| | 2| 10007421|..| | 3| 10008274|..| | 4| 10010722|..| | 5| 10012171|..| +-----------------+-----------+--+ ``` """ # ref: https://towardsdatascience.com/adding-sequential-ids-to-a-spark-dataframe-fa0df5566ff6 index_window = Window.orderBy(F.col(col_name)) index_df = df.withColumn(f"{col_name}_index", F.row_number().over(index_window) - 1) return index_df
def tr_create_state_dim(df): df_dim = df.select(col('state').alias('StateName')).drop_duplicates() window = Window.orderBy(df_dim.StateName) df_dim_final = df_dim.withColumn('StateId', row_number().over(window)) return df_dim_final
def ffill_windows(cls, df, time_col, columns_to_fill): """ Forward filling strategy. This strategy fills empty spots using the last know value of a column """ import sys from pyspark.sql import Window from pyspark.sql.functions import last # define the window (and order it by time) window = Window.orderBy(time_col)\ .rowsBetween(-sys.maxsize, 0) # fill every column and replace columns for col_entry in columns_to_fill: col_name_to_fill = col_entry[0] col_name_new = col_entry[1] if (col_name_new is None): col_name_new = col_name_to_fill df = df.withColumn( col_name_new, last(df[col_name_to_fill], ignorenulls=True).over(window)) return df
def cal_sku_band(sp): """ Calculate the band of sp. """ sku_id = 'sku_id' sale = 'sale' split_list = [0.2, 0.4, 0.6, 0.8] split_name = [1, 2, 3, 4, 5] cond_str = '' for i, percent in enumerate(split_list): cond_str += ''' WHEN rank_percent < {percent} THEN {name} '''.format( percent=percent, name=split_name[i]) cond_str += ''' ELSE {name}'''.format(name=split_name[-1]) split_cond = ''' CASE {cond_str} END AS band '''.format(cond_str=cond_str) group_sp = sp.groupBy(sku_id).agg(F.sum(F.col(sale)).alias('sale_sum')) distinct_sp = group_sp.select(F.countDistinct(sku_id).alias('cnt')) windowspec_r = Window.orderBy(F.col('sale_sum').desc()) rank_sp = group_sp.select( 'sku_id', F.rank().over(windowspec_r).alias('rank')).crossJoin(distinct_sp) rank_sp = rank_sp.withColumn('rank_percent', F.col('rank') / F.col('sale_sum')) band_sp = rank_sp.selectExpr('sku_id', split_cond) return band_sp
def test_window_functions_without_partitionBy(self): df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) w = Window.orderBy("key", df.value) from pyspark.sql import functions as F sel = df.select( df.value, df.key, F.max("key").over(w.rowsBetween(0, 1)), F.min("key").over(w.rowsBetween(0, 1)), F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))), F.row_number().over(w), F.rank().over(w), F.dense_rank().over(w), F.ntile(2).over(w), ) rs = sorted(sel.collect()) expected = [ ("1", 1, 1, 1, 4, 1, 1, 1, 1), ("2", 1, 1, 1, 4, 2, 2, 2, 1), ("2", 1, 2, 1, 4, 3, 2, 2, 2), ("2", 2, 2, 2, 4, 4, 4, 3, 2), ] for r, ex in zip(rs, expected): self.assertEqual(tuple(r), ex[:len(r)])
def execute(self): """Execute the link. :returns: status code of execution :rtype: StatusCode """ ds = process_manager.service(DataStore) # --- your algorithm code goes here self.logger.debug('Now executing link: {link}.', link=self.name) df = ds[self.read_key] window = Window.orderBy(self.datetime_col) \ .rowsBetween(0, Window.unboundedFollowing) if self.partitionby_cols: window = window.partitionBy(self.partitionby_cols) date_col = f.to_date(self.datetime_col) # only dates per row reset_col = f.when(f.col(self.event_col) > 0, date_col) # dates of events bfilled_reset_col = f.first(reset_col, ignorenulls=True).over( window) # backfilled dates of events countdown_col = f.datediff( bfilled_reset_col, date_col) # difference between next event date and today ds[self.store_key] = df.withColumn(self.countdown_col_name, countdown_col) return StatusCode.Success
def events_popularity_score(df): df.registerTempTable("df_tbl") df_popularity=spark.sql(''' select item_id,country, count(distinct USER_ID) as count_score from df_tbl group by 1,2 order by 3 desc ''') ''' df_popularity=df_popularity.coalesce(1) from pyspark.sql.functions import monotonically_increasing_id df_popularity=df_popularity.withColumn('score', monotonically_increasing_id()) ''' from pyspark.sql import Window import pyspark.sql.functions as psf cnt_score = Window.orderBy(psf.desc("count_score")) df_popularity_sc = df_popularity.withColumn("score", psf.dense_rank().over(cnt_score) ) return df_popularity_sc
def run(self): session = SparkSession.builder.appName("KueskiRaitings").master( "local[*]").config("spark.driver.memory", "8g").config("spark.executor.memory", "8g").getOrCreate() dataFrameReader = session.read df = dataFrameReader.option("header", "true").option("inferSchema", "true").csv(self.get_data()) df = df.withColumn( "index", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1) df = df.withColumn(USER_ID, df[USER_ID].cast(IntegerType())) df = df.withColumn(MOVIE_ID, df[MOVIE_ID].cast(IntegerType())) df = df.withColumn(RATING, df[RATING].cast(FloatType())) df = df.withColumn(TS, df[TS].cast(TimestampType())) w1 = Window.partitionBy(USER_ID).orderBy(TS) df = df.withColumn("nb_previous_ratings", row_number().over(w1) - 1) df = df.withColumn("avg_ratings_previous", mean(RATING).over(w1)) df = df.withColumn("avg_ratings_previous", lag('avg_ratings_previous', 1, 0).over(w1)) obj = df.select([ c for c in df.columns if c in ['nb_previous_ratings', 'avg_ratings_previous'] ]) return obj.toPandas().to_json(orient="records")
def get_street_categories_index(spark, col_name, pos_samples, road_features): n_roads = road_features.count() street_cat_distrib = ( road_features.select(col_name) .na.fill("unknown") .groupBy(col_name) .count() .withColumn("p", col("count") / lit(n_roads)) .orderBy(col("p").desc()) .drop("count") ) return ( pos_samples.select(col_name) .na.fill("unknown") .groupBy(col_name) .count() .withColumn("p_pos", col("count") / lit(pos_samples.count())) .drop("count") .join(street_cat_distrib, col_name) .na.fill(0, ["p_pos"]) .select(col_name, (col("p_pos") - col("p")).alias("risk")) .withColumn( col_name + "_indexed", row_number().over(Window.orderBy(col("risk").desc())) ) .drop("risk") )
def Run(self): def Lift(cum_resp,decil_no): return (cum_resp/float(decil_no+1)) self._df=self._df.orderBy(self._proba_column,ascending=False) self._df = self._df.withColumn("id", monotonically_increasing_id()) # w = Window.orderBy(desc(self._proba_column)) # self._df = self._df.withColumn('id',row_number().over(w)) discretizer = QuantileDiscretizer(numBuckets=10, inputCol="id", outputCol="deciles") self._df = discretizer.fit(self._df).transform(self._df) Rank=self._df.groupby('deciles').agg(F.count(self._df[self._proba_column]).alias('cnt'), F.count(when(self._df[self._target_column] == int(self._posLabel), True)).alias('cnt_resp')) Rank=Rank.withColumn('cnt_non_resp',Rank['cnt']-Rank['cnt_resp']) Rank=Rank.orderBy('deciles',ascending=True) cumsum_window = (Window.orderBy(Rank['deciles']).rangeBetween(Window.unboundedPreceding, Window.currentRow)) Rank=Rank.withColumn("cum_resp",F.sum('cnt_resp').over(cumsum_window)) Rank=Rank.withColumn("cum_non_resp",F.sum('cnt_non_resp').over(cumsum_window)) Rank=Rank.withColumn("% Responders(Cumulative)",F.round(old_div(Rank["cum_resp"]*100,Rank.select(F.sum('cnt_resp')).collect()[0][0]),2)) Rank=Rank.withColumn("% Non-Responders(Cumulative)",F.round(old_div(Rank["cum_non_resp"]*100,Rank.select(F.sum('cnt_non_resp')).collect()[0][0]),2)) Rank=Rank.withColumn("cum_population",F.sum("cnt").over(cumsum_window)) Rank=Rank.withColumn("pop_pct_per_decile",F.round(old_div(Rank["cnt"]*100,Rank.select(F.sum('cnt')).collect()[0][0]))) Rank=Rank.withColumn("% Population(Cumulative)",F.round(F.sum('pop_pct_per_decile').over(cumsum_window))) Rank=Rank.withColumn("KS",F.round(Rank["% Responders(Cumulative)"] - Rank["% Non-Responders(Cumulative)"],2)) Rank=Rank.withColumn("Lift at Decile",F.round(old_div(Rank["cnt_resp"]*Rank["pop_pct_per_decile"]*100,Rank.select(F.sum('cnt_resp')).collect()[0][0]),2)) Rank = Rank.withColumn("id", monotonically_increasing_id()) Lift_udf=udf(lambda x,y:Lift(x,y),FloatType()) Rank=Rank.withColumn("Total_Lift",F.round(Lift_udf("cum_resp","id"),2)) Rank=Rank.drop('id') return(Rank)
def main(): """ Run pipeline: - Create spark session - Get config - Read all dataframes with meta - Merge country names - Correct country names - Generate an id column - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get('country') country_mapping_path = config.get('country_mapping_path') (gdp_per_capita, human_capital_index, press_freedom_index, temperatures_by_country, immigration) = read_data(spark, config=config) df = merge_country_names(gdp_per_capita, human_capital_index, press_freedom_index, temperatures_by_country, immigration) df = correct_country_names(df=df, country_col='country_name', country_mapping_path=country_mapping_path) df = df.withColumn('country_id', F.row_number().over(Window.orderBy('country_name'))) write_with_meta(df=df, df_meta=config['output_meta'])
def induce_graph(graph, relabel=True, partitions=[]): """Remove extra edges that do not belong to the graph""" # small dataframe for reindexing/relabeling window = Window.orderBy("id") if partitions: window = window.partitionBy(partitions) # ensure 0 index for mapping into a scipy.sparse matrix rank = graph.vertices.select( "id", F.row_number().over(window).alias("rank")).withColumn( "rank", F.expr("rank - 1")) vertices = graph.vertices.join(rank, on="id", how="left") edges = graph.edges.join(vertices.selectExpr("id as src", "rank as rank_src"), on="src", how="inner").join(vertices.selectExpr( "id as dst", "rank as rank_dst"), on="dst", how="inner") if relabel: vertices = vertices.withColumn("relabeled_id", F.col("id")).withColumn( "id", F.col("rank")) edges = (edges.withColumn("relabeled_src", F.col("src")).withColumn( "relabeled_dst", F.col("dst")).withColumn("src", F.col("rank_src")).withColumn( "dst", F.col("rank_dst"))) vertices = vertices.drop("rank") edges = edges.drop("rank_src", "rank_dst") return GraphFrame(vertices, edges)
def tr_create_city_dim(df): df_dim = df.select(col('county').alias('CityName')).drop_duplicates() window = Window.orderBy(df_dim.CityName) df_dim_final = df_dim.withColumn('CityId', row_number().over(window)) return df_dim_final
def create_ids(df): df_titles = df.select("title").distinct() windowspec = Window.orderBy("title") df_ids = df_titles.withColumn("id", row_number().over(windowspec)) df_ids = df_ids.withColumn("id", col("id") - 1) count = df_ids.count() return df_ids, count
def _calc_model_metrics(df: DataFrame, prob_col: str, label_col: str) -> Dict[str, float]: r"""calc model metrics at max f1 given probabilities and labels Parameters ---------- df : pyspark.sql.DataFrame prob_col : str colname w/ raw probabilities of being in class 1 label_col : str Returns ------- max_metrics : dict dict with keys : 'tp', 'tn', 'fp', 'fn', 'f1', 'accuracy', 'precision', 'recall' and corresponding values as floats Raises ------ UncaughtException """ _persist_if_unpersisted(df) metrics_df = df.groupby(prob_col).pivot(label_col).count().fillna(value=0) metrics_df.persist(StorageLevel(False, True, False, False)) window = Window.orderBy(prob_col).rowsBetween(Window.unboundedPreceding, -1) metrics_df = metrics_df.withColumn('fn', F.sum(F.col(str(1))).over(window)) metrics_df = metrics_df.withColumn( 'tn', F.sum(F.col(str(0))).over(window)).fillna(value=0) metrics_df.persist(StorageLevel(False, True, False, False, 1)) all_count = df.count() pos_count = df.where(F.col(label_col) == 1).count() neg_count = all_count - pos_count metrics_df = metrics_df.withColumn('tp', pos_count - F.col('fn')) metrics_df = metrics_df.withColumn('fp', neg_count - F.col('tn')) metrics_df = metrics_df.withColumn('precision', (F.col('tp')) / (F.col('tp') + F.col('fp'))) metrics_df = metrics_df.withColumn('recall', F.col('tp') / pos_count) metrics_df = metrics_df.withColumn( 'informativeness', 2 * (F.col('precision') * F.col('recall')) / (F.col('precision') + F.col('recall'))) metrics_df.persist(StorageLevel(False, True, False, False)) max_metrics = metrics_df.where( F.col('informativeness') == metrics_df.select( F.max(F.col('informativeness'))).take(1)[0][0]).take( 1)[0].asDict() max_metrics['accuracy'] = (max_metrics['tp'] + max_metrics['tn']) / all_count max_metrics['threshold'] = max_metrics[prob_col] return max_metrics
def topK_df(df, key_col, K): """ Using window functions. Handles ties OK. """ window = Window.orderBy(functions.col(key_col).desc()) return (df.withColumn("rank", functions.rank().over(window)).filter( functions.col('rank') <= K).drop('rank'))
def category_dimension(b_df): # category dimension categories = b_df.select(b_df.restaurant_category).filter( b_df.restaurant_category.isNotNull()).distinct() category_dim = categories.withColumn( "category_id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))) return category_dim
def spark_timestamp_split( data, ratio=0.75, col_user=DEFAULT_USER_COL, col_item=DEFAULT_ITEM_COL, col_timestamp=DEFAULT_TIMESTAMP_COL, ): """Spark timestamp based splitter The splitter splits the data into sets by timestamps without stratification on either user or item. The ratios are applied on the timestamp column which is divided accordingly into several partitions. Args: data (spark.DataFrame): Spark DataFrame to be split. ratio (float or list): Ratio for splitting data. If it is a single float number it splits data into two sets and the ratio argument indicates the ratio of training data set; if it is a list of float numbers, the splitter splits data into several portions corresponding to the split ratios. If a list is provided and the ratios are not summed to 1, they will be normalized. Earlier indexed splits will have earlier times (e.g the latest time in split[0] <= the earliest time in split[1]) col_user (str): column name of user IDs. col_item (str): column name of item IDs. col_timestamp (str): column name of timestamps. Float number represented in seconds since Epoch. Returns: list: Splits of the input data as spark.DataFrame. """ multi_split, ratio = process_split_ratio(ratio) ratio = ratio if multi_split else [ratio, 1 - ratio] ratio_index = np.cumsum(ratio) window_spec = Window.orderBy(col(col_timestamp)) rating = data.withColumn("rank", row_number().over(window_spec)) data_count = rating.count() rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count) splits = [] for i, _ in enumerate(ratio_index): if i == 0: rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop( "rank" ) else: rating_split = rating_rank.filter( (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1]) ).drop("rank") splits.append(rating_split) return splits
| 3| a| | 1| b| | 2| b| | 3| b| | 1| c| | 3| c| +----+---+ """ schema = StructType([StructField("attr", StringType(), True), StructField("obj", StringType(), True)]) aoDF = sqlCtx.createDataFrame(aoPair, schema) #Window that moves over rows of same obj and sorted by attr window = Window.orderBy("attr").partitionBy("obj") ## Prev column contains previous attr of the same object """ Transformed Table +----+---+----+ |attr|obj|prev| +----+---+----+ | 1| a|null| | 2| a| 1| | 3| a| 2| | 1| b|null| | 2| b| 1| | 3| b| 2| | 1| c|null| | 3| c| 1|