def test_first_last_ignorenulls(self):
    from pyspark.sql import functions
    df = self.spark.range(0, 100)
    df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id"))
    df3 = df2.select(functions.first(df2.id, False).alias('a'),
                     functions.first(df2.id, True).alias('b'),
                     functions.last(df2.id, False).alias('c'),
                     functions.last(df2.id, True).alias('d'))
    self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
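# A minimal standalone sketch of the same first/last ignorenulls behaviour, assuming a
# local SparkSession (the names below are illustrative, not part of the test suite).
# Note that without an explicit ordering, first/last are non-deterministic in general;
# the test above relies on the row order produced by range().
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo = spark.createDataFrame([(None,), (1,), (2,), (None,)], "id int")
demo.select(
    F.first("id", ignorenulls=False).alias("with_nulls"),   # may return NULL
    F.first("id", ignorenulls=True).alias("skip_nulls"),    # first non-null value
    F.last("id", ignorenulls=True).alias("last_non_null"),  # last non-null value
).show()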
def reduce_to_ohlc(time, rdd):
    row_rdd = rdd.map(lambda row: row.split(',')) \
                 .filter(lambda row: len(row) == 3) \
                 .map(lambda row: Row(
                     symbol=row[0],
                     tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
                     price=float(row[1])
                 ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    data.write.format('org.apache.spark.sql.cassandra') \
        .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()

    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
        .orderBy('tx_time') \
        .groupBy('symbol', 'batch_time') \
        .agg(
            F.first(data.price).alias('open'),
            F.max(data.price).alias('high'),
            F.min(data.price).alias('low'),
            F.last(data.price).alias('close'),
            F.first(data.tx_time).alias('open_time'),
            F.last(data.tx_time).alias('close_time')
        )

    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .load() \
        .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')

    merged_ohlc = ohlc.join(
        existing_ohlc,
        (ohlc.symbol == existing_ohlc.symbol) &
        (ohlc.batch_time == existing_ohlc.batch_time),
        'left'
    )

    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )

    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, u'99'),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def levenshtein_cluster(df, col_name):
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(col_name).groupby(col_name).agg(F.count(col_name).alias("count"))
    df = KeyCollision.fingerprint(df, col_name)

    df_t = df.groupby(col_name + "_FINGERPRINT").agg(
        F.collect_list(col_name).alias("cluster"),
        F.size(F.collect_list(col_name)).alias("cluster_size"),
        F.first(col_name).alias("recommended"),
        F.sum("count").alias("count"))

    # Filter min distance
    df_l = DistanceCluster.levenshtein_filter(df, col_name)

    # Cluster
    df_l = df_l.join(df_t, (df_l[col_name + "_FROM"] == df_t[col_name + "_FINGERPRINT"]), how="left") \
        .cols.drop(col_name + "_FINGERPRINT") \
        .cols.drop([col_name + "_FROM", col_name + "_TO", col_name + "_LEVENSHTEIN_DISTANCE"]).table()

    return df_l
def saveCompletedJobRunScheduleData(microBatchDF):
    scheduleExplodeDF = microBatchDF.select(microBatchDF.job_id,
                                            microBatchDF.run_id,
                                            explode(microBatchDF.schedule))
    scheduleDF = scheduleExplodeDF.groupBy("job_id", "run_id").pivot("key").agg(first("value"))
    if DeltaTable.isDeltaTable(spark, completed_job_run_schedule_path):
        # merge data
        deltaTable = DeltaTable.forPath(spark, completed_job_run_schedule_path)
        (deltaTable.alias("target").merge(
            scheduleDF.alias("source"),
            "source.job_id=target.job_id and source.run_id=target.run_id").
         whenMatchedUpdateAll().whenNotMatchedInsertAll().execute())
    else:
        (scheduleDF.write.format("delta").mode("overwrite").option(
            "mergeSchema", "true").save(completed_job_run_schedule_path))
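# A minimal sketch of the explode -> pivot -> first("value") pattern used above, assuming
# the schedule column is a MapType; the sample data and column names are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, first

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(1, 10, {"cron": "0 * * * *", "timezone": "UTC"})],
    ["job_id", "run_id", "schedule"])
wide = (sample.select("job_id", "run_id", explode("schedule"))  # yields key/value columns
        .groupBy("job_id", "run_id")
        .pivot("key")
        .agg(first("value")))  # one column per map key
wide.show()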
def statistic_school_address(df):
    """
    Analyze the correspondence between school and work location
    :param df:
    :return:
    """
    df = df.filter(df.address.isNotNull())
    groups = ("school_name", "degree", "address")
    df = add_median_salary(df, groups)
    sda_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sda_df = sda_df.filter(sda_df.person_num > MIN_NUM)

    # Analysis without restricting degree
    sa_df = sda_df.groupby("school_name", "address").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    sa_df = sa_df.withColumn("degree", F.lit(NA))
    sda_df = sda_df.unionByName(sa_df)
    return sda_df
def deduplication(logger, df_dict: Dict[str, DataFrame], rules: Dict[str, List[str]]):
    """
    Deduplicate rows on a subset of columns and merge the data from the duplicates

    Args:
        logger: Logger instance used to log events
        df_dict: Dictionary of the datasets with the structure {Name: Dataframe}
        rules: {Dataset Name: [column1, column2]}

    Returns:
        Dict updated in place
    """
    try:
        for df_name, columns in rules.items():
            df_dict[df_name] = df_dict.get(df_name).groupBy(*columns) \
                .agg(*[first(x, ignorenulls=True).alias(x)
                       for x in df_dict.get(df_name).columns if x not in columns])
        logger.info("Dataframes cleaning deduplication applied")
    except Exception as e:
        logger.error("Cleaning duplicate rows couldn't be performed: {}".format(e),
                     traceback.format_exc())
        raise e
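# Hypothetical usage of the helper above, assuming a SparkSession `spark` and a `logger`
# are in scope; the dataset and column names are made up for illustration.
df_dict = {"customers": spark.createDataFrame(
    [(1, "Ann", None), (1, None, "NY"), (2, "Bob", "LA")],
    ["id", "name", "city"])}
deduplication(logger, df_dict, {"customers": ["id"]})
# id=1 typically collapses to a single row with name="Ann" and city="NY", since
# first(..., ignorenulls=True) keeps the first non-null value seen per column.
df_dict["customers"].show()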
def main():
    spark = SparkSession.builder.master("local").appName("Word Count").config(
        "spark.some.config.option", "some-value").getOrCreate()
    # sc = SparkContext()
    l = [(None, 1), ('Aliceaa', 3), ('Alices', None), ('Alicesssss', 1), ('Alices', 3)]
    x = spark.createDataFrame(l, ['name', 'age'])

    def myFunc(data_list):
        for val in data_list:
            if val is not None and val != '':
                return val
        return None

    myUdf = udf(myFunc, StringType())

    x = x.groupBy('age') \
        .agg(first('name').alias('name'))

    # dropping duplicates from the dataframe
    x.dropDuplicates().show()
def group_batched_logs(df_logs):
    # group logs by did + interval_time + keyword.
    # group 1: group by did + interval_starting_time + keyword
    df = df_logs.groupBy('aid', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        first('age').alias('age'),
        first('gender_index').alias('gender_index'),
        first('aid_bucket').alias('aid_bucket'),
        fn.sum(col('is_click')).alias('kw_clicks_count'),
        fn.sum(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('kw_shows_count'),
    )
    # df = df.orderBy('keyword_index')
    df = df.withColumn('kwi_clicks_count', concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
    df = df.withColumn('kwi_shows_count', concat_ws(":", col('keyword_index'), col('kw_shows_count')))
    df = df.withColumn('kw_clicks_count', concat_ws(":", col('keyword'), col('kw_clicks_count')))
    df = df.withColumn('kw_shows_count', concat_ws(":", col('keyword'), col('kw_shows_count')))

    # group 2: group by did + interval_starting_time
    df = df.groupBy('aid', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('kwi'),
        concat_ws(",", collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
        concat_ws(",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
        concat_ws(",", collect_list('kw_shows_count')).alias('kw_show_counts'),
        first('age').alias('age'),
        first('gender_index').alias('gender_index'),
        first('aid_bucket').alias('aid_bucket'))
    return df
def search_clients_daily(main_summary):
    return agg_search_data(
        main_summary,
        [
            'client_id',
            'submission_date',
            'engine',
            'source',
        ],
        list(map(agg_first, [
            'country',
            'app_version',
            'distribution_id',
            'locale',
            'search_cohort',
            'addon_version',
            'os',
            'channel',
            'profile_creation_date',
            'default_search_engine',
            'default_search_engine_data_load_path',
            'default_search_engine_data_submission_url',
            'sample_id',
        ])) + [
            # Count of 'first' subsessions seen for this client_day
            (count(when(col('subsession_counter') == 1, 1))
             .alias('sessions_started_on_this_day')),
            first(
                datediff(
                    'subsession_start_date',
                    from_unixtime(col('profile_creation_date') * 24 * 60 * 60))
            ).alias('profile_age_in_days'),
            sum(col('subsession_length') / 3600.0).alias('subsession_hours_sum'),
            mean(size('active_addons')).alias('active_addons_count_mean'),
            (max('scalar_parent_browser_engagement_max_concurrent_tab_count')
             .alias('max_concurrent_tab_count_max')),
            (sum('scalar_parent_browser_engagement_tab_open_event_count')
             .alias('tab_open_event_count_sum')),
            (sum(col('active_ticks') * 5 / 3600.0).alias('active_hours_sum')),
        ])
def statistic_school_rank(df):
    """
    School ranking
    :param df:
    :return:
    """
    groups = ("school_name", "degree")
    df = add_median_salary(df, groups)
    sd_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sd_df = sd_df.filter(sd_df.person_num > MIN_NUM)

    # Analysis without restricting degree
    s_df = sd_df.groupby("school_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    s_df = s_df.withColumn("degree", F.lit(NA))
    sd_df = sd_df.unionByName(s_df)
    sd_df = sd_df.filter(sd_df.person_num > MIN_NUM)
    sd_df = add_rank(sd_df, "degree")
    return sd_df
def _event_prop(event_type: str, expr: Column) -> Column:
    """Get a property from the event of a certain type within the ad session.

    Parameters
    ----------
    event_type
        Event type.
    expr
        Value expression.

    Returns
    -------
    Column
        Column expression that evaluates to the provided `expr` if the event
        type matches the specified one, or None otherwise.
    """
    return first(
        when(col('type') == event_type, expr),
        ignorenulls=True
    )
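# Hypothetical usage: inside a groupBy over ad sessions, pull one property per event type.
# The `events` dataframe and its column names are assumptions for illustration only.
from pyspark.sql.functions import col

per_session = events.groupBy('session_id').agg(
    _event_prop('click', col('timestamp')).alias('click_time'),
    _event_prop('impression', col('price')).alias('impression_price'))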
def pose():
    spark = SparkSession.builder.appName('csql_demo1').master('local[*]').getOrCreate()
    SparkSession_2 = spark.newSession()

    # query = "(SELECT * FROM attribute_kv) as r"
    query = "(SELECT * FROM attribute_kv WHERE entity_type = 'DEVICE') as r"
    get_data = spark.read.format('jdbc').option(
        'driver', 'org.postgresql.Driver').option(
        'url', 'jdbc:postgresql://192.168.1.36:5432/thingsboard').option(
        'user', 'postgres').option('password', 'postgres').option('dbtable', query).load()

    dx = get_data.withColumn(
        'value',
        concat_ws('', get_data.bool_v, get_data.long_v, get_data.dbl_v,
                  get_data.json_v, get_data.str_v))
    dx = dx.filter(dx.attribute_type == 'SERVER_SCOPE')
    nl = dx.groupBy('entity_id', 'attribute_type').pivot('attribute_key').agg(first('value'))
    ld = nl.withColumnRenamed('entity_id', 'device_id')

    query = "(SELECT name, type, id FROM device) as r"
    sk = spark.read.format('jdbc').option(
        'driver', 'org.postgresql.Driver').option(
        'url', 'jdbc:postgresql://192.168.1.36:5432/thingsboard').option(
        'user', 'postgres').option('password', 'postgres').option('dbtable', query).load()

    joined_data = ld.join(sk, ld.device_id == sk.id)
    req_det = joined_data.rdd.map(lambda x: [
        x.name, x.device_id, x.attribute_type, x.scNo, x.simNo, x.imeiNumber,
        x.boardNumber, x.zoneName, x.wardName, x.location, x.phase, x.ccmsType,
        x.kva, x.baseWatts, x.baseLine, x.connectedWatts, x.roadType,
        x.latitude, x.longitude
    ]).collect()
    return req_det
def filter_df_on_start_activities_nocc(df, nocc, sa_count0=None,
                                       timestamp_key=DEFAULT_TIMESTAMP_KEY,
                                       case_id_glue=CASE_CONCEPT_NAME,
                                       activity_key=DEFAULT_NAME_KEY,
                                       grouped_df=None):
    """Filters the Spark dataframe, keeping only cases whose start activity
    occurs at least `nocc` times"""
    if grouped_df is None:
        grouped_df = df.groupby(case_id_glue)
    if sa_count0 is None:
        parameters = {
            PARAMETER_CONSTANT_TIMESTAMP_KEY: timestamp_key,
            PARAMETER_CONSTANT_CASEID_KEY: case_id_glue,
            PARAMETER_CONSTANT_ACTIVITY_KEY: activity_key,
            GROUPED_DATAFRAME: grouped_df
        }
        sa_count0 = get_start_activities(df, parameters=parameters)
    sa_count = [k for k, v in sa_count0.items() if v >= nocc]
    if len(sa_count) < len(sa_count0):
        grouped_df = grouped_df.agg(F.first(activity_key).alias(activity_key + "_1"))
        df_start = grouped_df.filter(grouped_df[activity_key + "_1"].isin(sa_count))
        return df.join(F.broadcast(df_start), grouped_df.columns[0]).drop(activity_key + "_1")
    return df
def test_aggregator(self):
    df = self.df
    g = df.groupBy()
    self.assertEqual([99, 100], sorted(g.agg({"key": "max", "value": "count"}).collect()[0]))
    self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

    from pyspark.sql import functions
    self.assertEqual((0, "99"),
                     tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
    self.assertTrue(95 < g.agg(functions.approx_count_distinct(df.key)).first()[0])
    # test deprecated countDistinct
    self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def saveCompletedJobRunTaskData(microBatchDF):
    taskExplode1 = microBatchDF.select(microBatchDF.job_id,
                                       microBatchDF.run_id,
                                       explode(microBatchDF.cluster_spec))
    taskExplode2 = taskExplode1.select(taskExplode1.job_id,
                                       taskExplode1.run_id,
                                       taskExplode1.key.alias("task_type"),
                                       explode(taskExplode1.value))
    taskDF = taskExplode2.groupBy("job_id", "run_id", "task_type").pivot("key").agg(first("value"))
    if DeltaTable.isDeltaTable(spark, completed_job_run_task_path):
        # merge data
        deltaTable = DeltaTable.forPath(spark, completed_job_run_task_path)
        (deltaTable.alias("target").merge(
            taskDF.alias("source"),
            "source.job_id=target.job_id and source.run_id=target.run_id").
         whenMatchedUpdateAll().whenNotMatchedInsertAll().execute())
    else:
        (taskDF.write.format("delta").mode("overwrite").option(
            "mergeSchema", "true").save(completed_job_run_task_path))
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with the cluster of strings related to each string
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    # Prepare a group so we don't need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(
        F.collect_list(input_col).alias(cluster_col),
        F.size(F.collect_list(input_col)).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col)).repartition(1)

    # if Optimus.cache:
    #     df_t = df_t.cache()

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)
    if Optimus.cache:
        df_l = df_l.cache()

    # Create cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", name_col(input_col, "LEVENSHTEIN_DISTANCE")])

    return df_l
def gen_freq_distr_user_data(userDF, attrs):
    userWt = weightCol
    # get weighted frequencies of each category of a user's categorical attributes
    print("[getCategoryFreqs] Grouping records by users")
    categoryFreqInfo = {}
    udf = userDF
    for attr in attrs:
        print("Processing attribute %s" % attr)
        # get wt for each individual user
        _tbl = udf.filter(udf[attr].isNotNull()) \
            .groupby(userIdCol, attr) \
            .agg(F.first(userWt).alias(userWt))
        # sum up the weight for each value of the attribute
        _tbl = _tbl.groupby(attr).agg(F.sum(userWt).alias('wt'))
        attrInfo = _tbl.collect()
        # build a dict of {attrValue: freq}
        vals = {x[attr]: x['wt'] for x in attrInfo}
        # sum of all occurrences of the attribute
        tot = sum(vals.values())
        # compute relative freq w.r.t. the total occurrences of the attribute
        info = {val: float(wt) / tot for val, wt in vals.items()}
        categoryFreqInfo[attr] = info
    return categoryFreqInfo
def statistic_major_position(df):
    """
    Positions corresponding to each major
    :param df:
    :return:
    """
    groups = ("major", "degree", "position_name")
    df = df.filter(df.position_name.isNotNull())

    # Position aliases
    df = df.withColumn("position_title", F.lower(F.trim(df.position_title)))
    pdf = df.groupby("position_name", "position_title").agg(F.count("*").alias("total"))
    pdf = pdf.groupby("position_name").apply(filter_position)
    pdf = pdf.groupby("position_name").agg(F.collect_set("position_title").alias("position_set"))
    pdf = pdf.withColumn("position_alias", F.udf(lambda x: "/".join(x))(pdf.position_set))
    pdf = pdf.select("position_name", "position_alias")

    # Industries corresponding to each position
    idf = df.groupby("position_name", "industry").agg(F.count("*").alias("total"))
    idf = idf.groupby("position_name").apply(filter_industry)
    idf = idf.groupby("position_name").agg(F.collect_set("industry").alias("industry_set"))
    idf = idf.withColumn("industry_alias", F.udf(lambda x: "/".join(x))(idf.industry_set))
    idf = idf.select("position_name", "industry_alias")

    # Analysis restricted by degree
    df = add_median_salary(df, groups)
    mdp_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first(df.avg_salary).alias("avg_salary"))
    mdp_df = mdp_df.filter(mdp_df.person_num > MIN_NUM)

    # Analysis without restricting degree
    mp_df = mdp_df.groupby("major", "position_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    mp_df = mp_df.withColumn("degree", F.lit(NA))
    mdp_df = mdp_df.unionByName(mp_df)

    # Merge in the position aliases
    mdp_df = mdp_df.join(pdf, "position_name")
    # Merge in the industries corresponding to each position
    mdp_df = mdp_df.join(idf, "position_name")
    return mdp_df
def user_cluster_model(spark, ratings, movies, k, genres):
    """ Returns a clustering model for users' genre preferences """
    # Get all user ids
    all_user_ids = ratings.select("userId").distinct().rdd.flatMap(lambda x: x).collect()

    # Calculate scores for each user
    scores = user_genre_scores(spark, ratings, movies, all_user_ids) \
        .sort(col("userId"), col("genre"))

    # Convert genres in rows to columns
    scores = scores.groupBy("userId").pivot("genre").agg(first("score")).na.fill(0)

    # Ignore movies without genres
    if "(no genres listed)" in scores.columns:
        scores = scores.drop("(no genres listed)")
    scores.cache()

    # Find the genres used in the dataset
    genres_in_scores = scores.drop("userId").columns

    # Train a k-means model
    scores = VectorAssembler(inputCols=genres_in_scores, outputCol="features").transform(scores)
    kmeans_model = KMeans().setK(k).setSeed(5052).fit(scores)

    # Save the genres used to the model object
    kmeans_model.genres = genres_in_scores

    # Calculate the silhouette score & save it to the model
    train_predictions = kmeans_model.transform(scores)
    kmeans_model.sihlouette_score = ClusteringEvaluator().evaluate(train_predictions)
    return kmeans_model
def create_yearly_weather(spark) -> DataFrame:
    """
    Reads in 3 years of daily weather reports from throughout the world.
    After filtering to US stations only and keeping only the most prevalent
    key weather metrics, the dataframe is pivoted so it can be easily joined
    with the review and distances dataframes.
    """
    yearly_weather_path = f"s3://{s3_bucket}/ghcn/year_*"
    elements_to_keep = ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

    yearly_weather = (
        spark.read.csv(yearly_weather_path, header=False, schema=yearly_weather_schema)
        .filter(col('element').isin(elements_to_keep))
        .filter(col('station_id').startswith('US'))
        .withColumn('year', substring(col('date'), 1, 4))
        .withColumn('month', substring(col('date'), 5, 2))
        .withColumn('day', substring(col('date'), 7, 2))
        .withColumn('weather_date',
                    to_date(concat_ws('-', col('year'), col('month'), col('day'))))
        .select(col('station_id'), col('weather_date'), col('element'),
                col('value').cast(IntegerType()))
        .repartition(200, 'station_id', 'weather_date'))

    yearly_weather_pivot = (
        yearly_weather.groupby('station_id', 'weather_date')
        .pivot('element')
        .agg(first('value'))
        .dropna(subset=['PRCP', 'TMAX', 'TMIN'])
        .repartition(200, 'station_id', 'weather_date'))

    return yearly_weather_pivot
def assoc_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    h_fn = partial(harmonic_fn,
                   partition_cols=group_by_cols,
                   over_col="evs_score",
                   output_col=harmonic_col)
    assoc_df = (
        df.withColumn("evs_score",
                      array_min(array(col("evidence_score") / 10.0, lit(1.0))))
        .transform(h_fn)
        .groupBy(*gbc)
        .agg(countDistinct(col("pmid")).alias("f"),
             mean(col("evidence_score")).alias("mean"),
             stddev(col("evidence_score")).alias("std"),
             max(col("evidence_score")).alias("max"),
             min(col("evidence_score")).alias("min"),
             expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").alias("q"),
             count(col("pmid")).alias("N"),
             first(col(harmonic_col)).alias(harmonic_col))
        .withColumn("median", element_at(col("q"), 2))
        .withColumn("q1", element_at(col("q"), 1))
        .withColumn("q3", element_at(col("q"), 3))
        .drop("q"))
    return assoc_df
def run_job(trades: DataFrame):
    """
    Generates daily summaries of the provided trade data grouped by security.

    Returns a DataFrame with the following columns:
    - Security
    - Date
    - TradedVolume
    - NumberOfTrades
    - StartPrice
    - EndPrice
    - HighPrice
    - LowPrice
    - Volatility

    The DataFrame is ordered alphabetically by security, then in reverse
    chronological order.

    :param trades: DataFrame
    :return: DataFrame
    """
    with_roc = trades.withColumn(
        "ROC", udf(calculate_roc, FloatType())(trades.StartPrice, trades.EndPrice))
    grouped = with_roc.groupBy('Mnemonic', 'Date') \
        .agg(
            sum('TradedVolume').alias('TradedVolume'),
            sum('NumberOfTrades').alias('NumberOfTrades'),
            first('StartPrice').alias('StartPrice'),
            last('EndPrice').alias('EndPrice'),
            max('MaxPrice').alias('HighPrice'),
            min('MinPrice').alias('LowPrice'),
            sum('ROC').alias('Volatility'),
        ) \
        .withColumnRenamed('Mnemonic', 'Security') \
        .orderBy(asc('Security'), desc('Date'))
    return grouped
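# Hypothetical invocation of run_job, assuming a SparkSession `spark` and the
# calculate_roc helper are in scope; the sample row and its values are illustrative.
trades = spark.createDataFrame(
    [("SAP1", "2018-03-26", 100, 5, 98.0, 99.5, 100.0, 97.5)],
    ["Mnemonic", "Date", "TradedVolume", "NumberOfTrades",
     "StartPrice", "EndPrice", "MaxPrice", "MinPrice"])
run_job(trades).show()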
def process_df(self, df):
    def detect_anomaly(ts):
        """
        Args:
            ts: list of downsampled values collected per id
        rtype: int
        """
        outliers_indices = seasonal_esd(ts, hybrid=True, max_anomalies=10)
        return len(outliers_indices)

    grouped_df = df.groupBy(["id"]).agg(
        F.collect_list("downsample_avg").alias("downsampled_ts"),
        first("start_ts").alias("start_ts"),
        last("end_ts").alias("end_ts"))

    anomaly_udf = udf(detect_anomaly, IntegerType())
    # apply the anomaly detector to the aggregated column, "downsampled_ts"
    processed_df = grouped_df.withColumn(
        "num_anomaly", anomaly_udf("downsampled_ts")).sort(desc("num_anomaly"))
    final_df = processed_df.select("id", "start_ts", "end_ts", "num_anomaly")

    try:
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb", "datanode", "password")
        connector.write(final_df, "global_anomalies_table", "append")
    except Exception as e:
        print(e)
        pass
def group_batched_logs(df_logs_batched):
    # group the logs to generate the train-ready data from the basic unit of
    # uckey + interval_time + keyword.
    # group 1: group by uckey + interval_starting_time + keyword
    df = df_logs_batched.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
        first('keyword').alias('keyword'),
        fn.sum(col('is_click')).alias('keyword_click_count'),
        fn.count(fn.when(col('is_click') == 0, 1).otherwise(0)).alias('keyword_show_count')
    )
    df = df.withColumn('keyword_index_click_count',
                       concat_ws(":", col('keyword_index'), col('keyword_click_count')))
    df = df.withColumn('keyword_index_show_count',
                       concat_ws(":", col('keyword_index'), col('keyword_show_count')))
    df = df.withColumn('keyword_click_count',
                       concat_ws(":", col('keyword'), col('keyword_click_count')))
    df = df.withColumn('keyword_show_count',
                       concat_ws(":", col('keyword'), col('keyword_show_count')))

    # group 2: group by uckey + interval_starting_time
    df = df.groupBy('uckey', 'interval_starting_time').agg(
        concat_ws(",", collect_list('keyword_index')).alias('interval_keyword_indexes'),
        concat_ws(",", collect_list('keyword_index_click_count')).alias('interval_keyword_indexes_click_counts'),
        concat_ws(",", collect_list('keyword_index_show_count')).alias('interval_keyword_indexes_show_counts'),
        concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
        concat_ws(",", collect_list('keyword_click_count')).alias('interval_keywords_click_counts'),
        concat_ws(",", collect_list('keyword_show_count')).alias('interval_keywords_show_counts')
    )
    return df
def pivotSummary(df: DataFrame) -> DataFrame:
    '''
    Combined with the melt function, this function takes the summary of a
    dataframe produced by the `.describe()` function and outputs it in a
    longer, more readable format, especially for dataframes with many
    variables.
    '''
    schema = df.schema
    slist = [field.name for field in schema]
    id1 = slist[0]
    slist.remove('summary')
    longFormat = melt(df, id_vars=[id1], value_vars=slist)
    wideDF = longFormat.groupBy('variable').pivot(
        'summary', ['count', 'mean', 'stddev', 'min', 'max']).agg(first('value'))
    return wideDF
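# Hypothetical usage, assuming the melt helper referenced in the docstring is in scope:
summary = df.describe()        # rows: count/mean/stddev/min/max, one column per variable
pivotSummary(summary).show()   # one row per variable, one column per statistic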
idf_model.write().overwrite().save(
    "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/idf_model")
# df_standard = idf_model.transform(df_standard)
# df_standard.show()

# --------- A new job should be started from here; I skipped writing it ------------

# 5. Build the company labels from the existing cross-join data
df_result = load_training_data(spark)  # data to be cleaned

# Build a new label for the manufacturer
df_mnf_label = df_result.where(df_result.label == 1.0).select(
    "id", "MANUFACTURER_NAME", "MANUFACTURER_NAME_STANDARD", "MANUFACTURER_NAME_EN_STANDARD")
df_mnf_label = df_mnf_label.groupBy("id").agg(
    first(df_mnf_label.MANUFACTURER_NAME).alias("MANUFACTURER_NAME_ANSWER"),
    first(df_mnf_label.MANUFACTURER_NAME_STANDARD).alias("MANUFACTURER_NAME_STANDARD_ANSWER"),
    first(df_mnf_label.MANUFACTURER_NAME_EN_STANDARD).alias("MANUFACTURER_NAME_EN_STANDARD_ANSWER"))
df_result = df_result.join(df_mnf_label, how="left", on="id")
df_result = df_result.withColumn(
    "mnf_label",
    when((df_result.MANUFACTURER_NAME_STANDARD == df_result.MANUFACTURER_NAME_STANDARD_ANSWER)
         | (df_result.MANUFACTURER_NAME_EN_STANDARD == df_result.MANUFACTURER_NAME_EN_STANDARD_ANSWER),
         1.0).otherwise(0.0))
# df_result.select("id", "MANUFACTURER_NAME", "MANUFACTURER_NAME_STANDARD",
#                  "MANUFACTURER_NAME_STANDARD_ANSWER", "mnf_label").show()
df_result = df_result.drop("MANUFACTURER_NAME_ANSWER",
                           "MANUFACTURER_NAME_STANDARD_ANSWER",
                           "MANUFACTURER_NAME_EN_STANDARD_ANSWER")
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " +
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))
print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))

print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"),
          stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
def agg_first(col):
    return first(col).alias(col)
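# Hypothetical usage of the helper above: keep the first observed value of each metadata
# column when rolling up to one row per client (dataframe and column names are illustrative).
rolled = df.groupBy('client_id').agg(*[agg_first(c) for c in ['country', 'locale', 'os']])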
def hash(self, df_trajectory_processed, df_type='pandas'):
    # Assert implemented methods
    assert df_type in {'pandas', 'spark'}, \
        'hash@<TrajectoryHasherJacardEstimation>: df_type = "{}" is not implemented!'.format(df_type)

    # Hash
    if df_type == 'pandas':
        # Bounds
        id_timestamp_min = df_trajectory_processed['id_timestamp'].min()
        id_timestamp_max = df_trajectory_processed['id_timestamp'].max()

        # Select ID timestamps for hashes
        id_timestamps_selected = np.random.choice(
            np.arange(id_timestamp_min, id_timestamp_max + 1),
            self.n_hashes,
            replace=False) if self.n_hashes < (id_timestamp_max - id_timestamp_min + 1) else list(
                range(id_timestamp_min, id_timestamp_max + 1))
        id_timestamps_selected_set = set(id_timestamps_selected)

        # Filter
        df_result = df_trajectory_processed[
            df_trajectory_processed['id_timestamp'].map(
                lambda x: x in id_timestamps_selected_set)].copy().sort_values(
                    ['id_user', 'id_timestamp'])

        # Index locations
        location_indices = []
        lat_lng_to_idx = dict()
        for lat, lng in zip(df_result['lat'], df_result['lng']):
            key = (lat, lng)
            if key not in lat_lng_to_idx:
                lat_lng_to_idx[key] = len(lat_lng_to_idx)  # add new index
            location_indices.append(lat_lng_to_idx[key])
        df_result['location_indices'] = location_indices

        # Calculate hashes
        df_hashes = None
        for i, id_timestamp in enumerate(id_timestamps_selected):
            if df_hashes is None:
                df_hashes = df_result[df_result['id_timestamp'] == id_timestamp][
                    ['id_user', 'location_indices']].copy()
            else:
                df_hashes['location_indices'] = df_result[
                    df_result['id_timestamp'] == id_timestamp]['location_indices'].values
            # Rename the new column
            colname = 'hash_{}'.format(i)
            df_hashes.rename(columns={'location_indices': colname}, inplace=True)
    elif df_type == 'spark':
        # ID timestamp bounds
        row = df_trajectory_processed.agg(
            sql_functions.min(sql_functions.col("id_timestamp")).alias("id_timestamp_min"),
            sql_functions.max(sql_functions.col("id_timestamp")).alias("id_timestamp_max")).head()
        id_timestamp_min, id_timestamp_max = row['id_timestamp_min'], row['id_timestamp_max']

        # Chosen timestamps
        id_timestamps_selected = np.random.choice(
            np.arange(id_timestamp_min, id_timestamp_max + 1),
            self.n_hashes,
            replace=False).tolist() if self.n_hashes < (id_timestamp_max - id_timestamp_min + 1) else list(
                range(id_timestamp_min, id_timestamp_max + 1))

        # Create a Spark DataFrame of the chosen timestamps
        df_id_timestamps = self.params['spark'].createDataFrame(
            [[id_timestamp, 'hash_{}'.format(i)]
             for i, id_timestamp in enumerate(id_timestamps_selected)],
            schema=sql_types.StructType([
                sql_types.StructField('id_timestamp', sql_types.IntegerType(), False),
                sql_types.StructField('hash_name', sql_types.StringType(), False),
            ]))

        # ID locations
        df_location_ids = df_trajectory_processed.select("lat", "lng") \
            .distinct().withColumn(
                "id_location",
                sql_functions.row_number().over(Window.orderBy("lat", "lng")))

        # Join hashes
        df_result = df_trajectory_processed \
            .join(df_id_timestamps, on=['id_timestamp'], how='inner') \
            .join(df_location_ids, on=['lat', 'lng'], how='inner')

        # Turn into a table
        df_hashes = df_result.groupby("id_user") \
            .pivot("hash_name") \
            .agg(sql_functions.first("id_location"))

    # Return
    return df_hashes
key = my_bucket_object.key
tablename = key.split("_")
table = tablename[0]
val = tablename[2].split(".")[0]
path = f's3a://{bucket}/{key}'
df = spark.read.option("header", True).csv(path)

s = [s for s in df.columns if '2019' in s]
selected = ['State', 'Metro'] + s
df2 = df.select(*selected)
newdf = df2.withColumn('house_average', sum(df2[col] for col in s) / len(s))
df1 = newdf.withColumn("house_average", F.round(newdf["house_average"], 1))
d = df1.drop(*s)
d = d.withColumn('bedrooms', F.lit(val))
d = d.withColumnRenamed('Metro', 'city')
d = d.dropna(subset=["city"])
d = d.groupBy('state').agg(F.avg('house_average').alias('house_average'),
                           F.first('bedrooms'))
# d = d.withColumnRenamed('first(state)', 'state')
d = d.withColumnRenamed('first(bedrooms)', 'bedrooms')
d = d.withColumn("house_average", F.round(d["house_average"], 1))
d = d.sort('state')
d.show()

d.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
    .option("dbtable", "house_prices") \
    .option("user", "test") \
    .option("password", "test") \
    .option("driver", "org.postgresql.Driver") \
    .mode("Append") \
    .save()
sc = SparkContext(appName='generateDOIBoost')
spark = SparkSession(sc)

# Load the CrossRef dataframe
crossref = spark.read.load('/data/df/crossref.parquet', format="parquet")

# Load the MAG dataframe
microsoft = spark.read.load("/data/df/mag.parquet", format="parquet")
# Alias each column with _mag
microsoft = microsoft.select(*(col(x).alias(x + '_mag') for x in microsoft.columns))
# Group by DOI since we have repeated DOIs with multiple abstracts; for the moment we take the first one
mag = microsoft.groupBy('doi_mag').agg(
    first('authors_mag').alias('author_mag'),
    first('abstract_mag').alias('abstract_mag'),
    first('collectedFom_mag').alias('collectedFrom_mag'))

# Load the ORCID dataframe
orcid = spark.read.load("/data/df/ORCID.parquet", format="parquet")
# Fix the missing value in collectedFrom
orcid = orcid.withColumn('collectedFrom', array(lit('ORCID')))
# Alias each column with _orcid
orcid = orcid.select(*(col(x).alias(x + '_orcid') for x in orcid.columns))

# Load the UnpayWall dataframe
uw = spark.read.load("/data/df/unpaywall.parquet", format="parquet")