def main():
    # Load the CSV files directly as DataFrames.
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option("header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header", "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option("header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option("header", "true").load("simpsons_script_lines.csv")

    # Build a DataFrame keyed by episode_id with the approximate count of distinct characters.
    dfe = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(df4.character_id).alias("character_count")
    ).selectExpr("episode_id as id", "character_count")

    # DataFrame with id, imdb_rating and the character count.
    dfjoin1 = df2.join(dfe, on="id", how='outer').select(
        "id", "imdb_rating", "character_count")

    # Build (character_id, gender) and (character_id, episode_id) and join them on the character ids.
    dfc = df1.select('id', 'gender').selectExpr("id as character_id", "gender")
    dfe = df4.select('character_id', 'episode_id')
    dfjoin2 = dfc.join(dfe, on="character_id", how='outer').select(
        "episode_id", "character_id", "gender")

    # To count characters by gender, filter on the gender, group by episode id
    # and count the distinct characters of each episode.
    dfCharacMasc = dfjoin2.filter(dfjoin2.gender == 'm').groupBy('episode_id').agg(
        functions.approx_count_distinct(dfjoin2.character_id).alias("masc_count")
    ).selectExpr("episode_id as id", "masc_count")
    dfCharacFem = dfjoin2.filter(dfjoin2.gender == 'f').groupBy('episode_id').agg(
        functions.approx_count_distinct(dfjoin2.character_id).alias("fem_count")
    ).selectExpr("episode_id as id", "fem_count")

    # Finally, join all the fields into a single DataFrame and show it.
    dfjoin3 = dfjoin1.join(dfCharacMasc, on="id", how='outer').select(
        "id", "imdb_rating", "character_count", "masc_count")
    dfjoin4 = dfjoin3.join(dfCharacFem, on="id", how='outer').select(
        "id", "imdb_rating", "character_count", "masc_count", "fem_count")
    dfjoin4.show()
    sc.stop()

def join_to_weekly_budget(df, email_vols):
    return (
        df.join(email_vols, on="prim_party_id")
        .withColumn("email_budget", F.col("nonflight"))
        .drop(*["flight", "nonflight"])
        .select(
            "*",
            F.rank().over(
                Window.partitionBy(F.col("prim_party_id"))
                .orderBy(F.col("score").desc())).alias("rank"))
        .withColumn(
            "rank_adj",
            F.when(F.col("OFFER_TYPE") != "recommended",
                   F.col("rank") * 100).otherwise(F.col("rank")))
        .drop(*["rank"])
        .select(
            "*",
            F.rank().over(
                Window.partitionBy(F.col("prim_party_id"))
                .orderBy(F.col("rank_adj").desc())).alias('rank'))
        .withColumn(
            "elig_for_week",
            F.approx_count_distinct("action_id").over(
                Window.partitionBy(F.col("prim_party_id"))))
        .withColumn(
            "email_budget_week",
            F.when(F.col("elig_for_week") < F.col("email_budget"),
                   F.col("elig_for_week")).otherwise(F.col("email_budget")))
        .filter(F.col("rank") <= F.col("email_budget_week"))
        .drop(*[
            "email_budget", "score", "rank_adj", "offer_send_time",
            "send_date", "elig_for_week"
        ]))

def perform_eda(data_pipeline, target, feature, nonnull_cutoff, except_for_these, save_fig):
    spark = _loadSpark()
    base = spark.sql(data_pipeline)
    base.persist()

    cols_drop = []
    for col_name in base.columns:
        if base.select(col_name).filter(
                (F.col(col_name).isNotNull()) |
                (F.col(col_name).cast('double') == 0.0)).count() <= nonnull_cutoff:
            cols_drop.append(col_name)
    base2 = base.drop(*cols_drop)
    print('*** Dropped null-only or columns having non-null values less than or equal to '
          + str(nonnull_cutoff) + '...')
    print(str(cols_drop))
    base.unpersist()
    base2.persist()

    print('\n*** Excluded features including a target...')
    print(str(except_for_these))
    print('\n*** Excluded timestamp type and single value numeric features...')

    num = []
    cat = []
    for c in base2.drop(*except_for_these).dtypes:
        if c[1] == 'string':
            cat += [c[0]]
            base2 = base2.withColumn(
                c[0],
                F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' ') | (F.col(c[0]).isNull()),
                       F.lit('None')).otherwise(F.col(c[0])))
        if c[1] not in ('string', 'timestamp') and base2.select(
                F.approx_count_distinct(c[0])).collect()[0][0] > 1:
            num += [c[0]]
            base2 = (base2.withColumn(
                c[0],
                F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' '),
                       F.lit(None)).otherwise(F.col(c[0])))
                .withColumn(c[0], F.col(c[0]).cast('double')))

    print('*** Distribution by target value ***')
    base2.groupBy(target).count().show()

    print('*** Describe categorical variables in the data set ***')
    for i in cat:
        base2.groupBy(i).count().orderBy('count', ascending=False).show(2000, truncate=False)

    print('*** Describe numeric variables in the data set ***')
    for i in num:
        base2.select(i).summary().show(truncate=False)
        if save_fig:
            tmp = base2.select(i).toPandas()
            tmp.hist(figsize=(12, 10))
            plt.savefig('./fig/' + i + '.png')
            plt.close()

def main():
    # Load the CSV files directly as DataFrames.
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option("header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header", "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option("header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option("header", "true").load("simpsons_script_lines.csv")

    # Approximate count of distinct locations per episode.
    dfScriptLines = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(df4.location_id).alias("count")
    ).selectExpr("episode_id as id", "count")

    dfLocations = df2.join(dfScriptLines, on="id", how='outer').select(
        "id", "imdb_rating", "count")
    dfLocations.show()
    sc.stop()

def getColumnUniqueCounts(clickDF):
    """
    Approximate distinct-value counts for every column whose name starts with "c".

    Parameters
    ----------
    clickDF : class Dataframe
    """
    distvals = clickDF.agg(*(approx_count_distinct(col(c)).alias(c)
                             for c in clickDF.columns if str.startswith(c, "c")))
    print(type(distvals))
    return distvals

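# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark` and that `approx_count_distinct` and `col` are already imported from
# pyspark.sql.functions, as getColumnUniqueCounts expects. The DataFrame and column names
# are illustrative only.
clicks_df = spark.createDataFrame(
    [("c1", "us", 10), ("c1", "de", 12), ("c2", "us", 10)],
    ["campaign", "zone", "clicks"])
# Only the "c"-prefixed columns ("campaign", "clicks") are aggregated; "zone" is skipped.
getColumnUniqueCounts(clicks_df).show()
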
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100],
                     sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, u'99'),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])

def aggregate_job(self, df_news):
    df_finalized = df_news.groupby('action_state', 'Year', 'Actor1Type1Code') \
        .agg(
            F.approx_count_distinct('GLOBALEVENTID').alias('events_count'),
            F.sum('normg_scale').alias('norm_score_cale'))
    # Calculate avg by dividing by event count
    df_finalized.show(df_finalized.count())
    # df_finalized.printSchema()
    return df_finalized

def distinct_count(table, col_names, method="exact"):
    uniques = None
    try:
        if method == "exact" or method is None:
            from pyspark.sql.functions import col, countDistinct
            uniques = table.agg(*(countDistinct(col(c)).alias(c) for c in col_names))
        elif method == "approx":
            from pyspark.sql.functions import col, approx_count_distinct
            uniques = table.agg(*(approx_count_distinct(col(c)).alias(c) for c in col_names))
        else:
            raise ValueError(
                "Unknown method {}, choose between ['exact', 'approx']".format(method))
    except Exception:
        print("Cannot resolve column: {}".format(col_names))
    return uniques

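# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark`; the DataFrame and column names are illustrative.
sales_df = spark.createDataFrame(
    [(1, "books"), (2, "books"), (3, "games")], ["order_id", "category"])
# Exact distinct counts per column.
distinct_count(sales_df, ["order_id", "category"], method="exact").show()
# Approximate (HyperLogLog++-based) distinct counts per column.
distinct_count(sales_df, ["order_id", "category"], method="approx").show()
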
def approx_cardinalities(df, cols):
    from functools import reduce

    counts = df.groupBy(
        F.lit(True).alias("drop_me")
    ).agg(
        F.count('*').alias("total"),
        *[F.approx_count_distinct(F.col(c)).alias(c) for c in cols]
    ).drop("drop_me").cache()

    result = reduce(
        lambda l, r: l.unionAll(r),
        [counts.select(F.lit(c).alias("field"), F.col(c).alias("approx_count"))
         for c in counts.columns]).collect()
    counts.unpersist()
    return dict([(r[0], r[1]) for r in result])

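# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark` and pyspark.sql.functions imported as F, as approx_cardinalities expects.
events_df = spark.createDataFrame(
    [("a", 1), ("b", 1), ("a", 2), ("c", 3)], ["user", "item"])
print(approx_cardinalities(events_df, ["user", "item"]))
# Expected output is roughly {'total': 4, 'user': 3, 'item': 3} (counts are approximate).
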
def aggregate_job(self, df_joined):
    '''
    This function is used to perform group by and aggregation on joined data frame
    :param df_joined: joined events and mentions data frame will be provided
    :return: aggregated data frame will be returned.
    '''
    df_finalized = df_joined.groupby('state', 'Year', 'MonthYear', 'Actor1Type1Code') \
        .agg(
            functions.approx_count_distinct('GLOBALEVENTID').alias('events_count'),
            functions.sum('GoldsteinScale_norm').alias('GoldsteinScale_norm_sum'),
            functions.sum('Avg_Confidence').alias('confidence_sum')
        )
    df_finalized.show()
    return df_finalized

def aggregate(uri, conf):
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    parsed_df = spark.read.format('csv').options(header=True, inferschema=True).load(uri)
    columns = parsed_df.columns
    identify_aggregations(columns)
    print("unique_params_indices: " + str(unique_params_indices))
    print("numeric_params_indices: " + str(numeric_params_indices))

    # initiate output DF with record count
    cnt_col_name = spark.conf.get("output") + "_count"
    output = parsed_df.groupBy([parsed_df.temp_res, parsed_df.spat_res]) \
        .count()
    output = output.withColumnRenamed("count", cnt_col_name)

    # select and handle categorical attributes
    cat_col_names = [columns[i] for i in unique_params_indices]
    for name in cat_col_names:
        col = parsed_df.groupBy([parsed_df.temp_res, parsed_df.spat_res]) \
            .agg(F.approx_count_distinct(name).alias(name + "_uniq"))
        output = output.join(col, ["temp_res", "spat_res"], 'left')

    # select and handle numeric attributes
    num_col_names = [columns[i] for i in numeric_params_indices]
    # fill in NaN numeric values to avoid error
    parsed_df = parsed_df.fillna(0, subset=num_col_names)
    for name in num_col_names:
        col = parsed_df.groupBy([parsed_df.temp_res, parsed_df.spat_res]) \
            .agg(F.mean(name).alias(name + "_avg"))
        output = output.join(col, ["temp_res", "spat_res"], 'left')

    # write aggregated dataset
    out_dir = spark.conf.get("output")
    out_dir = "aggregates/" + out_dir
    # output.printSchema()
    # output.show(40)
    # print(output.count())
    print("output directory is: " + out_dir)
    output.write.csv(out_dir, header=True)
    spark.stop()

def group_distinct(self, df_, columns: list) -> dict:
    """
    :param df_: pyspark df
    :param columns: columns of type str in the dataframe
    :return: dict of columns and distinct values
    """
    data = []
    for c in columns:
        cnt = df_.agg(approx_count_distinct(col(c)).alias(c)).collect()
        data.append(cnt[0][0])
    df_pd = pd.DataFrame(index=columns, columns=["num_distinct"], data=data)
    return df_pd.sort_values("num_distinct", axis=0,
                             ascending=True).to_dict()["num_distinct"]

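# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark`, pandas imported as pd, and `approx_count_distinct`/`col` imported from
# pyspark.sql.functions, as group_distinct expects. The method body never touches `self`,
# so None is passed for it in this quick check.
demo_df = spark.createDataFrame(
    [("x", "red"), ("y", "red"), ("x", "blue")], ["code", "color"])
print(group_distinct(None, demo_df, ["code", "color"]))
# Roughly {'code': 2, 'color': 2}, ordered by ascending distinct count.
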
def process_measures_from_grouped_message(self):
    try:
        input_df = self.get_kafka_consumer()
        flat_message_df = self.get_parsed_message(input_df)

        unique_users_measure_df = flat_message_df.groupBy('time_window').agg(
            F.approx_count_distinct("users").alias("unique_users")
        ).orderBy('time_window', 'unique_users')

        country_count_df1 = flat_message_df.groupBy(
            'time_window', 'country').count().orderBy('time_window', 'count')

        most_represented_countries_df = country_count_df1.withColumn(
            "rank_max",
            F.rank().over(Window.partitionBy("time_window").orderBy(F.desc("count")))
        ).where(F.col("rank_max") == 1).orderBy("time_window").select(
            F.col("time_window"),
            F.col("country").alias("most_represented_country"),
            F.col("count").alias("most_represented_country_count"))

        least_represented_countries_df = country_count_df1.withColumn(
            "rank_min",
            F.rank().over(Window.partitionBy("time_window").orderBy(F.col("count")))
        ).where(F.col("rank_min") == 1).orderBy("time_window").select(
            F.col("time_window"),
            F.col("country").alias("least_represented_country"),
            F.col("count").alias("least_represented_country_count"))

        most_represented_country_df = self.get_first_record_from_grouped_data(
            most_represented_countries_df, "most_represented_country")
        least_represented_country_df = self.get_first_record_from_grouped_data(
            least_represented_countries_df, "least_represented_country")

        measures_df = most_represented_country_df \
            .join(least_represented_country_df, "time_window") \
            .join(unique_users_measure_df, "time_window") \
            .select(most_represented_countries_df.time_window,
                    F.col("most_represented_country"),
                    F.col("most_represented_country_count"),
                    F.col("least_represented_country"),
                    F.col("least_represented_country_count"),
                    F.col("unique_users"))

        measures_df.write.format("csv").save(self.output_file_dir)
        # self.logger.info(most_represented_country_df.collect())
        # self.logger.info(least_represented_country_df.collect())
        # self.logger.info(unique_users_measure_df.collect())
    except Exception as e:
        self.logger.error(e)

def removeInvalidCols(df):
    '''
    Remove columns with all NaNs or all same values

    Parameter:
        - dataframe
    Returns:
        - dataframe (after removing cols)
    '''
    count_distinct = df.select([
        F.approx_count_distinct(c).alias("{0}".format(c)) for c in df.columns
    ])
    # consider only columns with >= 2 distincts
    distinct = count_distinct.toPandas().to_dict(orient='list')
    to_consider = [k for k in distinct if distinct[k][0] >= 2]
    # return the cleaned dataframe
    return df[to_consider]

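# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark`, pyspark.sql.functions imported as F, and pandas available for toPandas().
raw_df = spark.createDataFrame(
    [(1, "x", None), (2, "x", None), (3, "x", None)],
    "id INT, constant STRING, empty STRING")
# "constant" has a single distinct value and "empty" has none, so only "id" survives.
removeInvalidCols(raw_df).show()
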
def _deduplication_approximated_todo(columns, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results
    requested by the parameters.

    :param columns:
    :type columns: list
    :param df:
    :type df: DataFrame
    :return: Pyspark columns representing what to compute.
    """
    if columns is None:
        print(
            "Approximated count distinct spanning over the whole row is currently not supported"
        )
    else:
        # multiple count distinct, one column each
        todo = [approx_count_distinct(col(c)).alias(c) for c in columns]
        return todo

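# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark` and that `approx_count_distinct` and `col` are imported from pyspark.sql.functions,
# as _deduplication_approximated_todo expects. The returned column expressions are meant to
# be passed to DataFrame.agg().
dedup_df = spark.createDataFrame([(1, "a"), (1, "b"), (2, "b")], ["id", "tag"])
todo = _deduplication_approximated_todo(["id", "tag"], dedup_df)
dedup_df.agg(*todo).show()  # one approximate distinct count per requested column
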
def main():
    # base streams from kafka
    submission_stream = stream_from_kafka('SubmissionProducer')
    comment_stream = stream_from_kafka('CommentProducer')
    # subreddit_stream = stream_from_kafka('SubredditProducer')
    # redditor_stream = stream_from_kafka('RedditorProducer')

    # persist base streams to hdfs for batch processing
    write_to_hdfs(submission_stream, 'submission')
    write_to_hdfs(comment_stream, 'comment')
    # write_to_hdfs(subreddit_stream, 'subreddit')
    # write_to_hdfs(redditor_stream, 'redditor')

    # streaming stats
    windowed_submissions = submission_stream \
        .groupby(window('created_utc', '5 seconds', '5 seconds')) \
        .count() \
        .withColumn('window', col('window')['end']) \
        .withColumn('date', to_date('window'))

    windowed_comments = comment_stream \
        .groupby(window('created_utc', '5 seconds', '5 seconds')) \
        .count() \
        .withColumn('window', col('window')['end']) \
        .withColumn('date', to_date('window'))

    windowed_unique_active_subreddits = submission_stream \
        .select('created_utc', 'subreddit') \
        .union(comment_stream.select('created_utc', 'subreddit')) \
        .groupby(window('created_utc', '5 seconds', '5 seconds')) \
        .agg(approx_count_distinct('subreddit'), count('subreddit')) \
        .withColumn('window', col('window')['end']) \
        .withColumn('date', to_date('window')) \
        .withColumnRenamed('approx_count_distinct(subreddit)', 'active_subreddits') \
        .withColumnRenamed('count(subreddit)', 'submissions_and_comments')

    # persist streaming stats to cassandra
    stream_to_cassandra(windowed_submissions, 'windowed_submissions')
    stream_to_cassandra(windowed_comments, 'windowed_comments')
    stream_to_cassandra(windowed_unique_active_subreddits, 'unique_active_subreddits')

    spark_session.streams.awaitAnyTermination()

def profile(df):
    x = spark.createDataFrame(sc.emptyRDD(), SCHEMA)
    df.cache()
    for field in df.schema.fields:
        if isinstance(field.dataType, (pst.StructField, pst.ArrayType, pst.MapType,
                                       pst.StructType, pst.BooleanType)):
            continue
        else:
            y = df.agg(
                F.lit(field.name).alias("desc"),
                F.count(field.name).alias('count'),
                F.min(field.name).alias('min'),
                F.max(field.name).alias('max'),
                F.sum(field.name).alias('sum'),
                F.stddev(field.name).alias('stddev'),
                F.max(F.length(field.name)).alias('max_length'),
                F.approx_count_distinct(field.name).alias('cardinality'))
            x = x.union(y)
    return x

def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100],
                     sorted(g.agg({"key": "max", "value": "count"}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions

    self.assertEqual((0, "99"),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
    # test deprecated countDistinct
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])

def readFile(spark, input_file):
    '''read file (clickstream.parquet) and add new features to the dataframe'''
    parquetFile = spark.read.parquet(input_file)  # clickstream.parquet
    parquetFile = parquetFile.distinct()  # drop duplicates
    w = Window.partitionBy('ad_id')
    parquetFile = parquetFile.withColumn(
        'day_count', fn.approx_count_distinct('date').over(w))
    parquetFile = parquetFile.withColumn(
        'is_cpm', (fn.col('ad_cost_type') == 'CPM').cast('integer'))
    parquetFile = parquetFile.withColumn(
        'is_cpc', (fn.col('ad_cost_type') == 'CPC').cast('integer'))
    parquetFile = parquetFile.withColumn(
        'is_click', (fn.col('event') == 'click').cast('integer'))
    parquetFile = parquetFile.withColumn(
        'is_view', (fn.col('event') == 'view').cast('integer'))
    parquetFile = parquetFile.withColumn(
        'CTR',
        fn.when(fn.sum('is_view').over(w) == 0, 0).otherwise(
            fn.sum('is_click').over(w) / fn.sum('is_view').over(w)))
    return parquetFile

("James", "Sales", 3000), ("Scott", "Finance", 3300), ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000), ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100) ] schema = ["employee_name", "department", "salary"] df = spark.createDataFrame(data=simpleData, schema = schema) df.printSchema() df.show(truncate=False) print("approx_count_distinct: " + \ str(df.select(approx_count_distinct("salary")).collect()[0][0])) print("avg: " + str(df.select(avg("salary")).collect()[0][0])) df.select(collect_list("salary")).show(truncate=False) df.select(collect_set("salary")).show(truncate=False) df2 = df.select(countDistinct("department", "salary")) df2.show(truncate=False) print("Distinct Count of Department & Salary: "+str(df2.collect()[0][0])) print("count: "+str(df.select(count("salary")).collect()[0])) df.select(first("salary")).show(truncate=False) df.select(last("salary")).show(truncate=False) df.select(kurtosis("salary")).show(truncate=False)
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = [None, 0.8],
    null_perc_bounds: Tuple[float, float] = [None, 0.2],
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment data
    profiling using entropy-based methods.

    If no target column is provided, the maximum-entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments, default 30
    :param include_columns: additional non-string columns to consider in automatic
        segmentation. Warning: high cardinality columns will degrade performance.
    :param unique_perc_bounds: tuple of form [lower, upper] with bounds on the percentage
        of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form [lower, upper] with bounds on the percentage
        of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    if not unique_perc_bounds[0]:
        unique_perc_bounds[0] = float("-inf")
    if not unique_perc_bounds[1]:
        unique_perc_bounds[1] = float("inf")
    if not null_perc_bounds[0]:
        null_perc_bounds[0] = float("-inf")
    if not null_perc_bounds[1]:
        null_perc_bounds[1] = float("inf")

    valid_column_names = set()
    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {
        col for col in df.columns
        if (df.select(col).dtypes[0][1] == "string" or col in include_columns)
    }

    print("Gathering cardinality information...")
    n_uniques = {
        col: df.agg(F.approx_count_distinct(col)).collect()[0][0]
        for col in valid_column_names
    }

    print("Gathering missing value information...")
    n_nulls = {col: df.filter(df[col].isNull()).count() for col in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for col in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[col] / count
        unique_perc = 0.0 if count == 0 else n_uniques[col] / count
        if (
            col in segments
            or n_uniques[col] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(col)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none were specified...")
        for col in valid_column_names:
            col_entropy = _simple_entropy(df, col)
            if n_uniques[col] > 1:
                col_entropy /= math.log(n_uniques[col])
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, col)
        target_field = max_entropy_column[1]
        print(f"Using {target_field} column as target field.")

    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)

    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        valid_column_names = {
            col for col in valid_column_names
            if (col not in segments
                and n_uniques[col] * segments_used <= (max_segments - segments_used))
        }

        _, segment_column_name = _find_best_split(
            countdf,
            current_split_columns,
            list(valid_column_names),
            target_column_name=target_field,
            normalization=n_uniques,
        )

        if not segment_column_name:
            break
        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments

def get_data(self):
    """ Returns statistics about attributes in a data frame """
    from pyspark.sql import functions

    # Correlation pairs
    corr_pairs = list(
        chunks(list(itertools.product(self.attrs, self.attrs)), len(self.attrs)))

    # Cache data
    self.data.cache()
    df_count = self.data.count()

    # TODO: Implement median using df.approxQuantile('col', [.5], .25)
    stats = []
    for i, name in enumerate(self.attrs):
        df_col = functions.col(name)
        stats.append(functions.lit(name))
        stats.append(functions.max(df_col).alias('max_{}'.format(name)))
        stats.append(functions.min(df_col).alias('min_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.stddev(df_col),
                                4).alias('stddev_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
        stats.append(functions.count(df_col).alias('count_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.avg(df_col), 4).alias('avg_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
        stats.append(
            functions.approx_count_distinct(df_col).alias('distinct_{}'.format(name)))
        stats.append(
            (df_count - functions.count(df_col)).alias('missing_{}'.format(name)))
        if name in self.numeric_attrs:
            stats.append(
                functions.round(functions.skewness(df_col),
                                2).alias('skewness_{}'.format(name)))
            stats.append(
                functions.round(functions.kurtosis(df_col),
                                2).alias('kurtosis_{}'.format(name)))
        else:
            stats.append(functions.lit('-'))
            stats.append(functions.lit('-'))

        if self.params['correlation']:
            for pair in corr_pairs[i]:
                if all([pair[0] in self.numeric_attrs,
                        pair[1] in self.numeric_attrs]):
                    stats.append(
                        functions.round(functions.corr(*pair),
                                        4).alias('corr_{}'.format(i)))
                else:
                    stats.append(functions.lit('-'))

    self.data = self.data.agg(*stats)
    aggregated = self.data.take(1)[0]
    n = len(self.names)
    rows = [aggregated[i:i + n] for i in range(0, len(aggregated), n)]

    return {"rows": rows, "attributes": self.get_column_names().split(',')}

def _get_approx_distinct_count_for_col(self, df: DataFrame, _dcol: str, _rsd=0.05):
    return df.select(F.approx_count_distinct(col(_dcol), rsd=_rsd)) \
        .rdd.map(lambda row: row[0]).collect()[0]

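# Usage sketch (not from the original snippet above): assumes an active SparkSession named
# `spark` and the imports the method relies on (pyspark.sql.functions as F, plus `col` and
# `DataFrame`). The body never uses `self`, so None is passed for it in this quick check;
# a smaller rsd tightens the estimate's relative standard deviation at the cost of memory.
ids_df = spark.createDataFrame([(i % 50,) for i in range(1000)], ["user_id"])
print(_get_approx_distinct_count_for_col(None, ids_df, "user_id", _rsd=0.01))  # ~50
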
data = data.select(
    col("data").getItem(0).alias("TS"),
    col("data").getItem(1).alias("UID"),
    col("data").getItem(2).alias("URL"),
    col("data").getItem(3).alias("Title"),
    col("data").getItem(4).alias("User-Agent"),
)
data = data.selectExpr("TS", "UID", 'parse_url(URL, "HOST") as domain')
data = data.withColumn("TS", from_unixtime(col("TS").cast("int")))
data = data.withColumn(
    "zone", when(col("domain").endswith(".ru"), "ru").otherwise("not ru"))

data = data.groupBy("zone", window("TS", "2 seconds", "1 second")).agg(
    count("UID").alias("view"),
    approx_count_distinct(col("UID")).alias("unique"))

data = data.sort(["window", "view", "zone"], ascending=[True, False, True]).limit(20)
data = data.select(col("window"), col("zone"), col("view"), col("unique"))

query = (data.writeStream.outputMode("complete").format("console")
         .option("truncate", "false")
         .trigger(once=args.once, processingTime=args.processing_time)
         .start())
query.awaitTermination()

StructField("merchant_id", LongType(), True), StructField("created_time_ist", StringType(), True) ]) txnFctDf = sparkSession.createDataFrame(txnFctRdd, txnFctSchema) txnFctDf.printSchema() txnFctDf.show(5, False) # Applying tranformation on dataframe using DSL (Domain Specific Language) txnFctDf = txnFctDf\ .withColumn("created_time_ist", unix_timestamp(txnFctDf["created_time_ist"], "yyyy-MM-dd HH:mm:ss").cast(TimestampType())) txnFctDf.printSchema() txnFctDf.show(5, False) print("# of records = " + str(txnFctDf.count())) print("# of merchants = " + str(txnFctDf.select(txnFctDf["merchant_id"]).distinct().count())) txnAggDf = txnFctDf\ .repartition(10, txnFctDf["merchant_id"])\ .groupBy("merchant_id")\ .agg(sum("amount"), approx_count_distinct("status")) txnAggDf.show(5, False) txnAggDf\ .withColumnRenamed("sum(amount)", "total_amount")\ .withColumnRenamed("approx_count_distinct(status)", "dist_status_count")\ txnAggDf.show(5, False)
df_news.show(10)

df_news = df_news.filter(df_news.ActionGeo_CountryCode == 'US')
df_news = df_news.filter(df_news.Actor1Code != 'null')
df_news = df_news.filter(df_news.Actor1Type1Code != 'null')

# Insert a dash between the country and state parts of ActionGeo_ADM1Code (e.g. "USCA" -> "US-CA").
name = 'ActionGeo_ADM1Code'
udf = UserDefinedFunction(lambda x: x[:2] + '-' + x[2:], StringType())
df_news = df_news.select(
    *[udf(column).alias(name) if column == name else column
      for column in df_news.columns])

split_col = F.split(df_news['ActionGeo_ADM1Code'], '-')
df_news = df_news.withColumn('action_state', split_col.getItem(1))
df_news.show()

df_news = df_news.groupby('action_state', 'Year', 'Actor1Type1Code').agg(
    F.approx_count_distinct('GLOBALEVENTID').alias('event_code'),
    # F.col('SQLDATE'),
    # F.collect_list('EventCode'),
    # F.col('Actor1Name'),
    # F.col('Actor2Name'),
    # F.collect_list('ActionGeo_ADM1Code'),
    # F.collect_list('action_state'),
    # F.collect_list('ActionGeo_Fullname'),
    F.avg('GoldsteinScale').alias('goldstein_scale'),
    F.avg('AvgTone').alias('avg_tone'))

df_news.show(df_news.count())
df_news.printSchema()

postgres_dump(config, df_news)

df.createOrReplaceTempView("dfTable") # COMMAND ---------- from pyspark.sql.functions import count df.select(count("StockCode")).show() # 541909 # COMMAND ---------- from pyspark.sql.functions import countDistinct df.select(countDistinct("StockCode")).show() # 4070 # COMMAND ---------- from pyspark.sql.functions import approx_count_distinct df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364 # COMMAND ---------- from pyspark.sql.functions import first, last df.select(first("StockCode"), last("StockCode")).show() # COMMAND ---------- from pyspark.sql.functions import min, max df.select(min("Quantity"), max("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import sum df.select(sum("Quantity")).show() # 5176450
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/")
        # check collect_list and collect_set
        # collect_set() returns all values from an input column with duplicate values eliminated.
        # collect_list() returns all values from an input column with duplicates.
        logging.info(
            'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')

        simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000), ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)

        print("approx_count_distinct: " +
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)

        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))

        print("count: " + str(df.select(count("salary")).collect()[0]))

        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)

        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)

        df.select(stddev("salary"), stddev_samp("salary"),
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)

        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occurred while running the pipeline > " + str(exp))
        # send email notification
        # log error to database
        sys.exit(1)
    return

def create_base(_undersample, undersample_rate, _save_fig, data_pipeline, target,
                except_for_these, nonnull_cutoff, unique_num_value, balThreshold,
                n_feat, use_wgt):
    base = spark.sql(data_pipeline)
    base.persist()

    # Drop all Nulls or less than a threshold
    cols_drop = []
    for col_name in base.drop(*except_for_these).columns:
        if base.select(col_name).filter(
                (F.col(col_name).isNotNull()) |
                (F.col(col_name).cast('double') == 0.0)).count() <= nonnull_cutoff:
            cols_drop.append(col_name)
    base2 = base.drop(*cols_drop)
    print('*** Dropped null-only or columns having non-null values less than or equal to '
          + str(nonnull_cutoff) + '...')
    print(str(cols_drop))
    base.unpersist()
    base2.persist()

    print('\n*** Excluded features including a target...')
    print(str(except_for_these))

    num = []
    cat = []
    for c in base2.drop(*except_for_these).dtypes:
        if c[1] == 'string':
            cat += [c[0]]
            base2 = base2.withColumn(
                c[0],
                F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' ') | (F.col(c[0]).isNull()),
                       F.lit('None')).otherwise(F.col(c[0])))
        if c[1] not in ('string', 'timestamp') and base2.select(
                F.approx_count_distinct(c[0])).collect()[0][0] > unique_num_value:
            num += [c[0]]
            base2 = (base2.withColumn(
                c[0],
                F.when((F.col(c[0]) == '') | (F.col(c[0]) == ' '),
                       F.lit(None)).otherwise(F.col(c[0])))
                .withColumn(c[0], F.col(c[0]).cast('double')))

    print('\n*** Distribution by target value ***')
    base2.groupBy(target).count().show()

    # Create batches for faster processing
    base = (df1.join(df_seg, ['party_id', 'cmdb_partition'], 'left')
            .withColumn('rnum', F.row_number().over(W.orderBy('party_id'))))

    # Replace empty strings to None string or null
    n_batch = 20
    batch_size = int(n_sample / n_batch)
    total_nulls = {}
    dfs = {}
    print('\n*** Replace empty strings to None string or null...***')
    for i in range(n_batch):
        print('*** batch :' + str(i + 1) + ' out of ' + str(n_batch) +
              ' with a size of ' + str(batch_size))
        lb = i * batch_size + 1
        ub = i * batch_size + batch_size
        df_b = base.filter((F.col('rnum') >= lb) & (F.col('rnum') <= ub))
        for c in df_b.drop(*except_for_these).columns:
            if c in cat:
                df_b = df_b.withColumn(
                    c,
                    F.when((F.col(c) == '') | (F.col(c) == ' ') | (F.col(c).isNull()),
                           F.lit('None')).otherwise(F.col(c)))
            if c in num:
                df_b = (df_b.withColumn(
                    c,
                    F.when((F.col(c) == '') | (F.col(c) == ' '),
                           F.lit(None)).otherwise(F.col(c)))
                    .withColumn(c, F.col(c).cast('double')))
            nulls = df_b.select(c).filter(
                (F.col(c).isNull()) | (F.col(c) == 'None')).count()
            total_nulls.setdefault(c, []).append(nulls)
        dfs[str(i)] = df_b

    drop_list = []
    for col, nulls in total_nulls.items():
        print('Column : ' + col + ' - null count : ' + str(nulls))
        if sum(nulls) > 0:
            drop_list.append(col)
    print('\n*** Drop list ***')
    print(str(drop_list))

    print('\n*** Merge batches into one data frame...')
    from functools import reduce

    def unionAll(dfs):
        return reduce(lambda df1, df2: df1.unionByName(df2), dfs)

    base2 = (unionAll(list(dfs.values())).drop(*drop_list))
    print('*** Merged as base2 ...')

    # Check imbalance - weighting for the minority class
    total_size = float(base2.count())
    tp_size = float(base2.select(target).filter(F.col(target) == 1).count())
    tn_size = float(total_size - tp_size)
    if tp_size / tn_size < balThreshold:
        print("Imbalance issue exists....target vs non-target ratio : " +
              str(tp_size / tn_size))
        if use_wgt:
            class_weight = tn_size / total_size
            base2 = base2.withColumn(
                'classWeight',
                F.when(F.col(target) == 0, class_weight).otherwise(1 - class_weight))
        else:
            pass
    else:
        pass

    print('\n*** Describe categorical variables in the data set ***')
    print(str(cat))
    for i in cat:
        base2.groupBy(i).count().orderBy('count', ascending=False).show(2000, truncate=False)

    print('*** Describe numeric variables in the data set ***')
    print(str(num))
    for i in num:
        base2.select(i).summary().show(truncate=False)
        if _save_fig:
            tmp = base2.select(i).toPandas()
            tmp.hist(figsize=(12, 10))
            plt.savefig('./fig/' + i + '.png')
            plt.close()

    if _undersample:
        print('*** Undersampling major class due to imbalanced data ***')
        base_tp = base2.filter(F.col(target) == 1)
        _count = base_tp.count()
        print('*** Target size :' + str(_count * undersample_rate))
        base_tn = (base2.filter(F.col(target) == 0)
                   .withColumn('rand_num', F.lit(F.rand(_seed)))
                   .withColumn('row_num', F.row_number().over(W.orderBy('rand_num')))
                   .filter(F.col('row_num') <= _count * undersample_rate)
                   .drop('rand_num', 'row_num'))
        df = base_tp.unionByName(base_tn)
    else:
        df = base2

    base2.unpersist()
    return df, num, cat

def main():
    # Load the CSV files directly as DataFrames.
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    df1 = spark.read.format("csv").option("header", "true").load("simpsons_characters.csv")
    df2 = spark.read.format("csv").option("header", "true").load("simpsons_episodes.csv")
    df3 = spark.read.format("csv").option("header", "true").load("simpsons_locations.csv")
    df4 = spark.read.format("csv").option("header", "true").load("simpsons_script_lines.csv")

    # A) number of different locations appearing in each episode
    dfScriptLines = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(df4.location_id).alias("count")
    ).selectExpr("episode_id as id", "count")
    dfLocations = df2.join(dfScriptLines, on="id", how='outer').select(
        "id", "imdb_rating", "count")
    dfaux1 = dfLocations.withColumn(
        "imdb_rating_double", dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("count_double", dfLocations["count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.count_double).alias('Pearson-Imdb-NumLocations'))
    dfcof.show()

    # B) number of characters appearing in each episode
    dfe = df4.groupBy('episode_id').agg(
        functions.approx_count_distinct(df4.character_id).alias("character_count")
    ).selectExpr("episode_id as id", "character_count")
    dfjoin1 = df2.join(dfe, on="id", how='outer').select(
        "id", "imdb_rating", "character_count")
    dfaux1 = dfjoin1.withColumn(
        "imdb_rating_double", dfLocations["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn(
        "character_count_double", dfaux1["character_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.character_count_double).alias('Pearson-Imdb-NumCharacters'))
    dfcof.show()

    # C and D) number of male and female characters appearing in each episode
    dfc = df1.select('id', 'gender').selectExpr("id as character_id", "gender")
    dfe = df4.select('character_id', 'episode_id')
    dfjoin2 = dfc.join(dfe, on="character_id", how='outer').select(
        "episode_id", "character_id", "gender")
    dfCharacMasc = dfjoin2.filter(dfjoin2.gender == 'm').groupBy('episode_id').agg(
        functions.approx_count_distinct(dfjoin2.character_id).alias("masc_count")
    ).selectExpr("episode_id as id", "masc_count")
    dfCharacFem = dfjoin2.filter(dfjoin2.gender == 'f').groupBy('episode_id').agg(
        functions.approx_count_distinct(dfjoin2.character_id).alias("fem_count")
    ).selectExpr("episode_id as id", "fem_count")
    dfjoin3 = dfjoin1.join(dfCharacMasc, on="id", how='outer').select(
        "id", "imdb_rating", "character_count", "masc_count")
    dfjoin4 = dfjoin3.join(dfCharacFem, on="id", how='outer').select(
        "id", "imdb_rating", "character_count", "masc_count", "fem_count")
    dfaux1 = dfjoin4.withColumn(
        "imdb_rating_double", dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("masc_count_double", dfaux1["masc_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.masc_count_double).alias('Pearson-Imdb-NumCharactersMasc'))
    dfcof.show()
    dfaux1 = dfjoin4.withColumn(
        "imdb_rating_double", dfjoin4["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("fem_count_double", dfaux1["fem_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.fem_count_double).alias('Pearson-Imdb-NumCharactersFem'))
    dfcof.show()

    # E) number of words
    dfe = df4.groupBy('episode_id').agg(
        functions.sum(df4.word_count).alias("word_count")
    ).selectExpr("episode_id as id", "word_count")
    dfjoin1 = df2.join(dfe, on="id", how='outer').select(
        "id", "imdb_rating", "word_count")
    dfaux1 = dfjoin1.withColumn(
        "imdb_rating_double", dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn("word_count_double", dfaux1["word_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.word_count_double).alias('Pearson-Imdb-NumWords'))
    dfcof.show()

    # F) total number of dialogue lines
    dfe2 = df4.filter(df4.speaking_line == True).groupBy('episode_id').agg(
        functions.count(df4.raw_text).alias("raw_text_count")
    ).selectExpr("episode_id as id", "raw_text_count")
    dfjoin2 = dfjoin1.join(dfe2, on="id", how='outer').select(
        "id", "imdb_rating", "word_count", "raw_text_count")
    dfaux1 = dfjoin2.withColumn(
        "imdb_rating_double", dfjoin1["imdb_rating"].cast(DoubleType()))
    dfaux2 = dfaux1.withColumn(
        "raw_text_count_double", dfaux1["raw_text_count"].cast(DoubleType()))
    dfcof = dfaux2.agg(
        functions.corr(dfaux2.imdb_rating_double,
                       dfaux2.raw_text_count_double).alias('Pearson-Imdb-NumRawText'))
    dfcof.show()

    sc.stop()