def init_df():
    df = load_to_spark.main_init_df(filenames)
    #df = df.sample(False, fraction=1000000.0 / df.count(), seed=int(round(time.time())))
    df.cache()
    df = df.where(col("author").isNotNull())
    df_bots = df.where(col("author").rlike("|".join(bots)))
    df = df.subtract(df_bots)
    return df
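# --- Illustrative sketch (not part of the original script) ---------------------------------
# Shows how the rlike("|".join(bots)) / subtract() pattern used in init_df() above splits
# bot accounts from real authors. The local SparkSession and the toy rows are assumptions
# made only for this self-contained demo.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def _bot_filter_demo():
    spark = SparkSession.builder.master("local[2]").appName("bot_filter_demo").getOrCreate()
    demo_bots = ["Bot", "Bots"]
    demo = spark.createDataFrame(
        [("Alice", "Article A"), ("ExampleBot", "Article A"), ("Bob", "Article B")],
        ["author", "title"])
    matched = demo.where(col("author").rlike("|".join(demo_bots)))  # authors matching a bot pattern
    real = demo.subtract(matched)                                   # everything else
    real.show()  # only Alice and Bob remain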
axes[0, 1].set_ylim([0, 200])
hist(axes[0, 0], [df1], bins=20, color=['red'])
axes[0, 0].set_title('Number of revisions over all articles')
axes[0, 0].set_xlabel('Number of revisions')
axes[0, 0].set_ylabel('Number of articles')
hist(axes[0, 1], [df2], bins=20, color=['blue'])
axes[0, 1].set_title('Number of revisions per author per article')
axes[0, 1].set_xlabel('Number of revisions per author')
axes[0, 1].set_ylabel('Number of articles')
plt.savefig('Number_of_revisions_per_article')

#get start time
start_time = time.time()
df = load_to_spark.main_init_df()
#retrieve spark worker count
worker_count = load_to_spark.sc._jsc.sc().getExecutorMemoryStatus().size() - 1
total_edits_per_article = number_of_revisions_per_article(df)
total_edits_per_article.cache()
total_edits_per_article.show()
df_revision_length = total_edits_per_article\
    .select(col("edit history length").alias("revision_length"))\
    .orderBy(desc("revision_length"))
total_number_of_authors_per_article = number_of_authors_per_article(df)
total_number_of_authors_per_article.cache()
total_number_of_authors_per_article.show()
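# --- Sketch of the helpers used above (assumption) -----------------------------------------
# number_of_revisions_per_article() and number_of_authors_per_article() are defined elsewhere
# in the project. A plausible reading, assuming one input row per revision with "title" and
# "author" columns; the output column names here are assumptions, not the project's API.
from pyspark.sql.functions import col, countDistinct

def number_of_revisions_per_article_sketch(df):
    # one row per revision -> the row count per title is the length of the edit history
    return df.groupBy(col("title")).count()\
             .withColumnRenamed("count", "edit history length")

def number_of_authors_per_article_sketch(df):
    # distinct authors per title
    return df.groupBy(col("title")).agg(countDistinct(col("author")).alias("author count"))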
#    save_to_log(file_count, worker_count, dur1, 'groupby')
    save_to_log(file_count, worker_count, dur2, 'groupby_count')
    print('groupby test complete')

def test_crossjoin(df1, df2):
    dur1, dur2 = crossjoin(df1, df2)
    file_count = len(filenames)
    worker_count = load_to_spark.sc._jsc.sc().getExecutorMemoryStatus().size() - 1
#    save_to_log(file_count, worker_count, dur1, 'crossjoin')
    save_to_log(file_count, worker_count, dur2, 'crossjoin_count')
    print('crossjoin test complete')

abs_start_time = time.time()
first_file_titles = 22288
df = load(filenames)
rows = df.select("title").take(len(filenames) * first_file_titles)
df_titles = load_to_spark.createDataFrame(rows)
df_titles.show()
df = df.withColumn('revision', explode(df.revision))
#test_select()
#test_filter()
df2 = load_to_spark.main_init_df(filenames)
load_to_spark.create_session()
first_file_titles = 22288
test_groupby(df2)
test_crossjoin(df_titles, df_titles)
df_titles.unpersist()
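# --- Sketch of the timing helper used above (assumption) -----------------------------------
# crossjoin() is defined elsewhere in this benchmark script; it evidently returns two
# durations. A rough sketch of what such a helper could look like; everything beyond the two
# return values is an assumption.
import time

def crossjoin_sketch(df1, df2):
    start = time.time()
    df_joined = df1.crossJoin(df2)   # build the cross product (lazy, so this is cheap)
    dur1 = time.time() - start
    start = time.time()
    df_joined.count()                # force evaluation to measure the actual join cost
    dur2 = time.time() - start
    return dur1, dur2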
import load_to_spark
from pyspark.sql.functions import col, avg

filenames = []
#for i in range(1, 27):
#    path = "/scratch/wikipedia-dump/wiki_small_" + str(i) + ".json"
#    filenames.append(path)
path = "/scratch/wikipedia-dump/wiki_small_11.json"
filenames.append(path)

df = load_to_spark.main_init_df(filenames).select("title", "author").distinct()
print(df.count())
df_grouped = df.groupBy(col("title")).count()
df_avg = df_grouped.select(avg(col("count")))
df_avg.show()
from pyspark.sql.functions import col
import load_to_spark

base_path = "/scratch/wikipedia-dump/wiki_small_"
filenames = []
bots = ["Bot", "Bots"]
for i in range(1, 27):
    path = base_path + str(i) + ".json"
    filenames.append(path)

print("Selecting real users")
df = load_to_spark.main_init_df(filenames).select(col("author"), col("title"))
df = df.where(col("author").isNotNull())
df_bots = df.where(col("author").rlike("|".join(bots)))
df_authors = df.subtract(df_bots).distinct()
df_authors.cache()

print("counting active and inactive")
df_authors = df_authors.groupBy(col("author")).count()
df_active = df_authors.where(col("count") > 10)
df_inactive = df_authors.subtract(df_active)
active = df_active.count()
inactive = df_inactive.count()

output = "active: " + str(active) + ", inactive: " + str(inactive)
file = open("/home/ubuntu/BA_Project/log/active_inactive_comparison.txt", "+w")
file.write(output)
file.close()
print("Done")
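# Note (sketch, not in the original script): after the groupBy there is exactly one row per
# author, so the inactive set could also be selected with a complementary filter instead of
# the more expensive subtract(), e.g.:
#     df_active = df_authors.where(col("count") > 10)
#     df_inactive = df_authors.where(col("count") <= 10)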
import load_to_spark
import jaccard_similarity
from pyspark.sql.functions import col, avg

bots = ["Bot", "Bots"]

df = load_to_spark.main_init_df(
    "/scratch/wikipedia-dump/wiki_small_11.json").select("title", "author").distinct()
df = df.where(col("author").isNotNull())
dfbots = df.where(col("author").rlike("|".join(bots)))
df = df.subtract(dfbots)
df.cache()

df_jaccard = jaccard_similarity.jaccard_with_min_hashing(df, "title", "author")
df_jaccard.cache()
df_similar = df_jaccard.where(col("jaccard") < 0.3).select("title1", "title2")
df_rest = df_jaccard.where((col("jaccard") >= 0.3) & (col("jaccard") <= 0.7)).select(
    "title1", "title2")
df_sim_t = df_similar.select(col("title1")).union(df_similar.select(col("title2")))\
    .distinct()
df_sim_t.show()
df_rest_t = df_rest.select(col("title1")).union(df_rest.select(
    col("title2"))).distinct()
df_rest_t.show()
df_sim_joined = df.join(df_sim_t, col("title1") == col("title"))\
    .select("author", "title").distinct()
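# --- Sketch of jaccard_with_min_hashing (assumption) ----------------------------------------
# The real implementation lives in the project's jaccard_similarity module and is not shown
# here. The sketch below approximates the same idea with Spark's MinHashLSH: map each title
# to the set of its authors, build binary indicator vectors, and compute the approximate
# Jaccard distance between titles. The thresholds used above (< 0.3 treated as "similar")
# suggest the "jaccard" column is a distance; all names and parameters below are assumptions.
from pyspark.ml.feature import CountVectorizer, MinHashLSH
from pyspark.sql.functions import col, collect_set

def jaccard_with_min_hashing_sketch(df, key_col, value_col):
    # one row per key with the set of its values, e.g. title -> set of authors
    df_sets = df.groupBy(key_col).agg(collect_set(value_col).alias("items"))
    # binary indicator vectors over the value vocabulary
    cv_model = CountVectorizer(inputCol="items", outputCol="features", binary=True).fit(df_sets)
    df_vec = cv_model.transform(df_sets)
    # approximate pairwise Jaccard distances via min hashing
    mh_model = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5).fit(df_vec)
    pairs = mh_model.approxSimilarityJoin(df_vec, df_vec, threshold=1.0, distCol="jaccard")
    return pairs.select(col("datasetA." + key_col).alias(key_col + "1"),
                        col("datasetB." + key_col).alias(key_col + "2"),
                        col("jaccard"))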
def revisions_per_author():
    df = load_to_spark.main_init_df()
    return number_of_revisions_per_author(df)
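# number_of_revisions_per_author() is defined elsewhere in this file. A minimal sketch of the
# aggregation, assuming one input row per revision with an "author" column:
from pyspark.sql.functions import col

def number_of_revisions_per_author_sketch(df):
    return df.groupBy(col("author")).count()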
"/scratch/wikipedia-dump/plots/hotspots/different_article_author_hotspots.png", True) for i in range(1, 27): filenames.append(base_path + str(i) + ".json") #load hotspot df df_hot = load_to_spark.init_article_hotspot_df(filenames) df_hot = df_hot.where(col("author").isNotNull()) df_h_bots = df_hot.where(col("author").rlike("|".join(bots))) df_hot = df_hot.subtract(df_h_bots) #load main df df = load_to_spark.main_init_df(filenames).select( "author", "title", col("editTime").alias("timestamp")) df = df.where(col("author").isNotNull()) df_bots = df.where(col("author").rlike("|".join(bots))) df = df.subtract(df_bots) df.show() df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df_hot).select( "window", col("title").alias("title1")) df_hotspots.show() df_joined = df.join(df_hotspots, (col("title") == col("title1")) & (col("timestamp").between(col("window")["start"], col("window")["end"])))\ .select("author", "title", "window").distinct() df_grouped = df_joined.groupBy(col("author")).count() df_hist = df_grouped.select(col("count"))
fig.set_size_inches(20, 20)
hist(axes[0, 0], [df], bins=20, color=['red'])
axes[0, 0].set_xlabel('Jaccard coefficient')
axes[0, 0].set_ylabel('Number of articles')
plt.savefig('Jaccard_Similarity')

def sparse_vec(r):
    li = set(r[1])
    li = sorted(li)
    l = len(li)
    vals = [1.0 for x in range(l)]
    return r[0], Vectors.sparse(942000, li, vals)

df_gn = load_to_spark.main_init_df()
df_titles = df_gn.select("title", "author").where(col("author").isNotNull())
# Determine all distinct authors, exclude bots and generate IDs for each
df_all_authors = df_gn.select("author").where(
    col("author").isNotNull()).distinct()
# Select only Bots
df_bots = df_all_authors.where(col("author").rlike('|'.join(search_text)))
# Select all authors except bots
df_real_users = df_all_authors.subtract(df_bots)
windowSpec = W.orderBy("author")
df_authors = df_real_users.withColumn("Id", f.row_number().over(windowSpec))
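# --- Illustration of sparse_vec (assumption) ------------------------------------------------
# Each input row is expected to look like (title, [author ids]); sparse_vec() turns it into a
# sparse indicator vector over the author-id space (dimension 942000 above). The import and
# the toy row below are assumptions for a self-contained check.
from pyspark.ml.linalg import Vectors

example_title, example_vec = sparse_vec(("Some article", [3, 17, 42, 17]))
# example_vec has 1.0 at positions 3, 17 and 42 and is zero elsewhere
print(example_title, example_vec)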