def clean_flat_column_names(df, col_name):
    """Strip a ``<col_name>_`` prefix from every column of *df*.

    Each column name is split on ``'<col_name>_'`` and only the text after
    the last occurrence is kept, e.g. with ``col_name='brewery'`` the column
    ``'brewery_location_city'`` becomes ``'location_city'``.  Columns that do
    not contain the prefix are renamed to themselves (a no-op).

    :param df: Spark DataFrame whose columns should be renamed.
    :param col_name: prefix (without the trailing underscore) to remove.
    :return: a new DataFrame with the cleaned column names.
    """
    # Hoist the loop-invariant separator out of the loop.
    prefix = '{}_'.format(col_name)
    df_clean = df
    for col in df.columns:
        # [-1] keeps everything after the LAST occurrence of the prefix.
        df_clean = df_clean.withColumnRenamed(col, col.split(prefix)[-1])
    return df_clean
# COMMAND ----------

# MAGIC %md
# MAGIC ### Brewery

# COMMAND ----------

from pyspark.sql.functions import explode, when

# Flatten the nested 'brewery' struct into one column per leaf field.
df_brewery_flattened = flatten_df(df.select(df.brewery))

# COMMAND ----------

# Drop the 'brewery_' prefix that flattening adds to each column name.
# Reuse the shared helper instead of repeating the rename loop inline.
df_brewery_flattened = clean_flat_column_names(df_brewery_flattened, 'brewery')

# COMMAND ----------

# Persist the flattened brewery data as a Delta table and register it.
# NOTE(review): the final True flag's meaning is defined by
# create_register_delta_table — presumably overwrite/recreate; confirm.
create_register_delta_table(
    df_brewery_flattened, 'brewery', untappd_base_query_path + 'brewery', True)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments

# COMMAND ----------
# COMMAND ----------

# MAGIC %md
# MAGIC ### Brewery

# COMMAND ----------

from pyspark.sql.functions import explode, when

# Flatten the nested 'brewery' struct into one column per leaf field.
df_brewery_flattened = flatten_df(df.select(df.brewery))

# COMMAND ----------

# Drop the 'brewery_' prefix that flattening adds to each column name.
# Reuse the shared helper instead of repeating the rename loop inline.
df_brewery_flattened = clean_flat_column_names(df_brewery_flattened, 'brewery')

# COMMAND ----------

# left_anti keeps only rows whose 'id' has NO match in the existing
# 'brewery' table, so the write below adds just the new breweries.
df_brewery_flattened_upsert = df_brewery_flattened.join(
    spark.table('brewery'), 'id', 'left_anti')

# NOTE(review): the final True flag's meaning is defined by
# create_register_delta_table — presumably overwrite/recreate; confirm.
create_register_delta_table(
    df_brewery_flattened_upsert, 'brewery',
    untappd_base_query_path + 'brewery', True)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments
def filter_non_restaurants(col):
    """Return True when no category in the comma-separated *col* string
    mentions 'restaurant' or 'food' (case-insensitive)."""
    # Replace the comma separators with spaces, lower-case once, and test
    # both keywords against the combined string.
    lowered = ' '.join(col.split(',')).lower()
    return 'restaurant' not in lowered and 'food' not in lowered
def filter_restaurants(col):
    """Return True when any category in the comma-separated *col* string
    mentions 'restaurant' or 'food' (case-insensitive)."""
    return any(
        'restaurant' in category.lower() or 'food' in category.lower()
        for category in col.split(',')
    )
# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments

# COMMAND ----------

from pyspark.sql.functions import explode, when

# One row per comment: explode the 'items' array and carry the two count
# fields alongside each exploded element.
df_comments = df.select(
    df.comments.count.alias('comments_count'),
    explode(df.comments.items),
    df.comments.total_count.alias('total_count'),
)
df_comments_flattened = flatten_df(df_comments)

# COMMAND ----------

# Drop the 'col_' prefix the explode/flatten step adds to the item columns.
# Reuse the shared helper instead of repeating the rename loop inline.
df_comments_flattened = clean_flat_column_names(df_comments_flattened, 'col')

# Explode the nested user detail arrays into temporary columns, drop the
# originals, then rename the temporaries back to the original names.
df_comments_flattened = df_comments_flattened.withColumn(
    'user_brewery_details_tmp', explode('user_brewery_details'))
df_comments_flattened = df_comments_flattened.withColumn(
    'user_venue_details_tmp', explode('user_venue_details'))
df_comments_flattened = df_comments_flattened.drop(
    'user_venue_details', 'user_brewery_details')
df_comments_flattened = (
    df_comments_flattened
    .withColumnRenamed('user_venue_details_tmp', 'user_venue_details')
    .withColumnRenamed('user_brewery_details_tmp', 'user_brewery_details'))

# COMMAND ----------

# Write the stream out once (trigger once=True) to the comments Delta path.
# NOTE(review): the checkpoint path mixes separators ('comments' is appended
# without a slash while '/checkpoints' has one) and is not unique to this
# stream — if another stream uses the same checkpoint dir they will collide.
# Confirm the intended layout before changing.
df_comments_flattened.writeStream \
    .format('delta') \
    .option('path', untappd_base_query_path + 'comments') \
    .option('checkpointLocation', untappd_base_query_path + '/checkpoints') \
    .trigger(once=True) \
    .start()

# COMMAND ----------

# %sql
#register_delta_table(name = 'beer')

# COMMAND ----------

# MAGIC %md
# MAGIC ### Brewery

# COMMAND ----------

from pyspark.sql.functions import explode, when

# Flatten the nested 'brewery' struct into one column per leaf field.
df_brewery_flattened = flatten_df(df.select(df.brewery))

# COMMAND ----------

# Drop the 'brewery_' prefix that flattening adds to each column name.
# Reuse the shared helper instead of repeating the rename loop inline.
df_brewery_flattened = clean_flat_column_names(df_brewery_flattened, 'brewery')

# COMMAND ----------

# Write the flattened data out as the 'brewery' Delta table ...
write_delta_table(df_brewery_flattened, 'brewery')

# COMMAND ----------

# ... then register it in the metastore.
register_delta_table('brewery')

# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments