def init_df():
    # Load the article revision data, drop rows without an author,
    # then remove revisions made by known bot accounts.
    df = load_to_spark.init_article_hotspot_df(filenames)
    df = df.where(col("author").isNotNull())
    df_bots = df.where(col("author").rlike("|".join(bots)))
    df = df.subtract(df_bots)
    print(df.count())
    return df
def init_dataframes(filenames):
    # Load revisions, drop missing authors and filter out known bot accounts.
    df = load_to_spark.init_article_hotspot_df(filenames)
    df = df.where(col("author").isNotNull())
    df_bots = df.where(col("author").rlike("|".join(bots)))
    df = df.subtract(df_bots)
    # Map each article title to its categories via the article id.
    df_category = load_to_spark.create_category_df()
    df_t_id = df.select("title", "id")
    df_category = df_category.join(df_t_id, "id").select("title", "category")
    return df, df_category
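# Usage sketch (hypothetical, not part of the original pipeline): assumes
# `filenames` already lists the revision dump JSON files and that
# load_to_spark has created the SparkSession; the variable names below are
# illustrative only.
#
#   df_revs, df_title_category = init_dataframes(filenames)
#   df_revs.show(5)
#   df_title_category.show(5)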
def draw_histogram(df):
    fig, axes = plt.subplots()
    fig.set_size_inches(20, 20)
    hist(axes, [df], bins=20, color=['red'])
    plt.savefig(
        "/scratch/wikipedia-dump/plots/hotspots/article_category_hotspot_jaccard.png"
    )

#for i in range(1, 6):
#    filenames.append(base_path + str(i) + ".json")
filenames.append(base_path + "11.json")

#revID|author|timestamp|title
df_hot = load_to_spark.init_article_hotspot_df(filenames)

#title|window|rev_count
df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df_hot)

#id|category
df_categories = load_to_spark.create_category_df(
    "/scratch/wikipedia-dump/categorylinks.json")

#id|title|author|authorID|editTime
df = load_to_spark.main_init_df(filenames)
df = df.select("title", "author", col("editTime").alias("timestamp"),
               col("id").alias("id1")).distinct()

#title|author|category|timestamp
df_joined = df.join(df_categories, col("id") == col("id1")).drop("id1", "id")
df_joined.count()
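# Hypothetical example (not in the original script): draw_histogram expects a
# DataFrame with a single numeric column. For instance, the per-window revision
# counts from df_hotspots could be plotted like this:
#
#   draw_histogram(df_hotspots.select("rev_count"))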
def init_df():
    df = load_to_spark.init_article_hotspot_df(filenames)
    return df