Example #1
def clean_flat_column_names(df, col_name):
  # Strip the '<col_name>_' prefix that flattening adds to nested columns.
  df_clean = df
  for col in df.columns:
    # Keep everything after the last occurrence of the prefix.
    name = col.split('{}_'.format(col_name))[-1]
    df_clean = df_clean.withColumnRenamed(col, name)
  return df_clean
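A minimal usage sketch, assuming a DataFrame whose columns carry a 'brewery_' prefix from a flattening step (the DataFrame and values here are hypothetical):

# Hypothetical input: columns produced by flattening a 'brewery' struct.
df_flat = spark.createDataFrame([(1, 'x', 'y')],
                                ['brewery_id', 'brewery_name', 'brewery_state'])

# Strips the 'brewery_' prefix, leaving columns: id, name, state.
df_named = clean_flat_column_names(df_flat, 'brewery')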
Example #2
# COMMAND ----------

# MAGIC %md
# MAGIC ### Brewery

# COMMAND ----------

from pyspark.sql.functions import explode, when

df_brewery_flattened = flatten_df(df.select(df.brewery))

# COMMAND ----------

# Strip the 'brewery_' prefix that flattening added to each column name.
for col in df_brewery_flattened.columns:
    name = col.split('brewery_')[-1]
    df_brewery_flattened = df_brewery_flattened.withColumnRenamed(col, name)

# COMMAND ----------

create_register_delta_table(df_brewery_flattened, 'brewery',
                            untappd_base_query_path + 'brewery', True)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments

# COMMAND ----------
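Several of these examples rely on a flatten_df helper that is not shown. A minimal sketch of what such a helper might look like, assuming it promotes struct fields to '<parent>_<child>' top-level columns (an assumption inferred from the prefix-stripping loops, not the original implementation):

from pyspark.sql.types import StructType

def flatten_df(nested_df):
    # Assumed sketch: repeatedly expand struct columns into prefixed
    # top-level columns until no struct columns remain.
    flat_df = nested_df
    while True:
        struct_cols = [f.name for f in flat_df.schema.fields
                       if isinstance(f.dataType, StructType)]
        if not struct_cols:
            return flat_df
        kept = [c for c in flat_df.columns if c not in struct_cols]
        expanded = [flat_df[sc][f.name].alias('{}_{}'.format(sc, f.name))
                    for sc in struct_cols
                    for f in flat_df.schema[sc].dataType.fields]
        flat_df = flat_df.select(kept + expanded)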
Example #3
# COMMAND ----------

# MAGIC %md
# MAGIC ### Brewery

# COMMAND ----------

from pyspark.sql.functions import explode, when

df_brewery_flattened = flatten_df(df.select(df.brewery))

# COMMAND ----------

# Strip the 'brewery_' prefix that flattening added to each column name.
for col in df_brewery_flattened.columns:
    name = col.split('brewery_')[-1]
    df_brewery_flattened = df_brewery_flattened.withColumnRenamed(col, name)

# COMMAND ----------

df_brewery_flattened_upsert = df_brewery_flattened.join(
    spark.table('brewery'), 'id', 'left_anti')
create_register_delta_table(df_brewery_flattened_upsert, 'brewery',
                            untappd_base_query_path + 'brewery', True)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments
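
The left_anti join above keeps only rows whose id is absent from the existing brewery table, so re-running the notebook inserts only new breweries. A tiny illustration of the join semantics (hypothetical data):

existing = spark.createDataFrame([(1,), (2,)], ['id'])
incoming = spark.createDataFrame([(2,), (3,)], ['id'])

# left_anti keeps incoming rows with no match in existing: only id 3 survives.
incoming.join(existing, 'id', 'left_anti').show()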
Example #4
def filter_non_restaurants(col):
    # Treat the comma-separated category string as one blob of text and keep
    # the row only if neither 'restaurant' nor 'food' appears anywhere in it.
    categories = col.replace(',', ' ').lower()
    return 'restaurant' not in categories and 'food' not in categories
Example #5
def filter_restaurants(col):
    # Keep the row if any comma-separated category mentions restaurants or food.
    for category in col.split(','):
        if 'restaurant' in category.lower() or 'food' in category.lower():
            return True
    return False
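A usage sketch for these predicates, assuming they are applied to a comma-separated category column through a UDF (df_venues and its categories column are hypothetical):

from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

is_restaurant = udf(filter_restaurants, BooleanType())

# df_venues is a hypothetical DataFrame with a 'categories' string column.
df_restaurants = df_venues.filter(is_restaurant(df_venues.categories))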
Example #6
# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments

# COMMAND ----------

from pyspark.sql.functions import explode, when
df_comments = df.select(df.comments.count.alias('comments_count'),
                        explode(df.comments.items),
                        df.comments.total_count.alias('total_count'))
df_comments_flattened = flatten_df(df_comments)

# COMMAND ----------

# Strip the 'col_' prefix left by exploding into the default 'col' column.
for col in df_comments_flattened.columns:
  name = col.split('col_')[-1]
  df_comments_flattened = df_comments_flattened.withColumnRenamed(col, name)

# Explode the remaining array columns into temporary columns, drop the
# originals, then rename the temporaries back to the original names.
df_comments_flattened = df_comments_flattened.withColumn('user_brewery_details_tmp', explode('user_brewery_details'))
df_comments_flattened = df_comments_flattened.withColumn('user_venue_details_tmp', explode('user_venue_details'))
df_comments_flattened = df_comments_flattened.drop('user_venue_details', 'user_brewery_details')
df_comments_flattened = (df_comments_flattened
                         .withColumnRenamed('user_venue_details_tmp', 'user_venue_details')
                         .withColumnRenamed('user_brewery_details_tmp', 'user_brewery_details'))


# COMMAND ----------

(df_comments_flattened.writeStream
 .format('delta')
 .option('path', untappd_base_query_path + 'comments')
 .option('checkpointLocation', untappd_base_query_path + '/checkpoints')
 .trigger(once=True)
 .start())

# COMMAND ----------

# %sql
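
Because the stream uses trigger(once=True), it processes whatever input is available and then stops. The written Delta files can afterwards be read back as a batch, for example:

# Read the comments table that the streaming write produced above.
df_comments_check = spark.read.format('delta').load(untappd_base_query_path + 'comments')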
Example #7
#register_delta_table(name = 'beer')

# COMMAND ----------

# MAGIC %md
# MAGIC ### Brewery

# COMMAND ----------

from pyspark.sql.functions import explode, when
df_brewery_flattened = flatten_df(df.select(df.brewery))

# COMMAND ----------

# Strip the 'brewery_' prefix that flattening added to each column name.
for col in df_brewery_flattened.columns:
    name = col.split('brewery_')[-1]
    df_brewery_flattened = df_brewery_flattened.withColumnRenamed(col, name)

# COMMAND ----------

write_delta_table(df_brewery_flattened, 'brewery')

# COMMAND ----------

register_delta_table('brewery')

# COMMAND ----------

# MAGIC %md
# MAGIC ### Comments
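
The write_delta_table and register_delta_table helpers are not shown. A plausible sketch, with signatures inferred from the calls above (the bodies are assumptions, not the original definitions):

def write_delta_table(df, name):
    # Assumed: persist the DataFrame as Delta files under the base path.
    df.write.format('delta').mode('overwrite').save(untappd_base_query_path + name)

def register_delta_table(name):
    # Assumed: register the Delta files as a metastore table.
    spark.sql("CREATE TABLE IF NOT EXISTS {0} USING DELTA LOCATION '{1}'".format(
        name, untappd_base_query_path + name))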