def init_df():
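    # Load the raw revision dataframe, drop rows without an author and remove bot accounts.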
    df = load_to_spark.main_init_df(filenames)
    #df = df.sample(False, fraction=1000000.0 / df.count(), seed=int(round(time.time())))
    df.cache()
    df = df.where(col("author").isNotNull())
    df_bots = df.where(col("author").rlike("|".join(bots)))
    df = df.subtract(df_bots)
    return df

def plot_revision_histograms(df1, df2):
    # Function name and figure setup are assumed; the original header and imports sit outside this excerpt.
    fig, axes = plt.subplots(2, 2)
    axes[0, 1].set_ylim([0, 200])
    hist(axes[0, 0], [df1], bins=20, color=['red'])
    axes[0, 0].set_title('Number of revisions across all articles')
    axes[0, 0].set_xlabel('Number of revisions')
    axes[0, 0].set_ylabel('Number of articles')
    hist(axes[0, 1], [df2], bins=20, color=['blue'])
    axes[0, 1].set_title('Number of revisions per author per article')
    axes[0, 1].set_xlabel('Number of revisions per author')
    axes[0, 1].set_ylabel('Number of articles')
    plt.savefig('Number_of_revisions_per_article')

#get start time
start_time = time.time()

df = load_to_spark.main_init_df()

#retrieve spark worker count
worker_count = load_to_spark.sc._jsc.sc().getExecutorMemoryStatus().size() - 1

total_edits_per_article = number_of_revisions_per_article(df)
total_edits_per_article.cache()
total_edits_per_article.show()
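# pull out the per-article revision counts as a single column, sorted in descending order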
df_revision_length = total_edits_per_article\
    .select(col("edit history length").alias("revision_length"))\
    .orderBy(desc("revision_length"))


total_number_of_authors_per_article = number_of_authors_per_article(df)
total_number_of_authors_per_article.cache()
total_number_of_authors_per_article.show()
Example #3
def test_groupby(df1):
    # Header reconstructed; assumes a groupby() timing helper analogous to crossjoin() below.
    dur1, dur2 = groupby(df1)
    file_count = len(filenames)
    worker_count = load_to_spark.sc._jsc.sc().getExecutorMemoryStatus().size() - 1
#    save_to_log(file_count, worker_count, dur1, 'groupby')
    save_to_log(file_count, worker_count, dur2, 'groupby_count')
    print('groupby test complete')

def test_crossjoin(df1, df2):
    dur1, dur2 = crossjoin(df1, df2)
    file_count = len(filenames)
    worker_count = load_to_spark.sc._jsc.sc().getExecutorMemoryStatus().size() - 1
#    save_to_log(file_count, worker_count, dur1, 'crossjoin')
    save_to_log(file_count, worker_count, dur2, 'crossjoin_count')
    print('crossjoin test complete')

abs_start_time = time.time()

first_file_titles = 22288  # titles contained in the first dump file; bounds the take() below
df = load(filenames)
rows = df.select("title").take(len(filenames) * first_file_titles)
df_titles = load_to_spark.createDataFrame(rows)
df_titles.show()
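# flatten the nested revision array: one row per revision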
df = df.withColumn('revision', explode(df.revision))
#test_select()
#test_filter()

df2 = load_to_spark.main_init_df(filenames)

load_to_spark.create_session()
first_file_titles = 22288
test_groupby(df2)
test_crossjoin(df_titles, df_titles)
df_titles.unpersist()
Example #4
import load_to_spark
from pyspark.sql.functions import col, avg

filenames = []

#for i in range(1, 27):
#    path = "/scratch/wikipedia-dump/wiki_small_" + str(i) + ".json"
#    filenames.append(path)

path = "/scratch/wikipedia-dump/wiki_small_11.json"
filenames.append(path)
df = load_to_spark.main_init_df(filenames).select("title", "author").distinct()
print(df.count())
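# distinct authors per article, then the average over all articles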
df_grouped = df.groupBy(col("title")).count()
df_avg = df_grouped.select(avg(col("count")))
df_avg.show()
Example #5
from pyspark.sql.functions import col
import load_to_spark

base_path = "/scratch/wikipedia-dump/wiki_small_"
filenames = []
bots = ["Bot", "Bots"]

for i in range(1, 27):
    path = base_path + str(i) + ".json"
    filenames.append(path)

print("Selecting real users")
df = load_to_spark.main_init_df(filenames).select(col("author"), col("title"))
df = df.where(col("author").isNotNull())
df_bots = df.where(col("author").rlike("|".join(bots)))
df_authors = df.subtract(df_bots).distinct()
df_authors.cache()
print("counting active and inactive")
df_authors = df_authors.groupBy(col("author")).count()
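# authors who edited more than 10 distinct articles count as active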
df_active = df_authors.where(col("count") > 10)
df_inactive = df_authors.subtract(df_active)

active = df_active.count()
inactive = df_inactive.count()
output = "active: " + str(active) + ", inactive: " + str(inactive)

with open("/home/ubuntu/BA_Project/log/active_inactive_comparison.txt", "w") as file:
    file.write(output)
print("Done")
Example #6
import load_to_spark
import jaccard_similarity
from pyspark.sql.functions import col, avg

bots = ["Bot", "Bots"]

df = load_to_spark.main_init_df(
    "/scratch/wikipedia-dump/wiki_small_11.json").select("title",
                                                         "author").distinct()
df = df.where(col("author").isNotNull())
dfbots = df.where(col("author").rlike("|".join(bots)))
df = df.subtract(dfbots)
df.cache()

df_jaccard = jaccard_similarity.jaccard_with_min_hashing(df, "title", "author")
df_jaccard.cache()

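# bucket the article pairs by jaccard value: below 0.3 vs. between 0.3 and 0.7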
df_similar = df_jaccard.where(col("jaccard") < 0.3).select("title1", "title2")
df_rest = df_jaccard.where((col("jaccard") >= 0.3)
                           & (col("jaccard") <= 0.7)).select(
                               "title1", "title2")

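# distinct titles appearing in either column of each bucket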
df_sim_t = df_similar.select(col("title1")).union(df_similar.select(col("title2")))\
    .distinct()
df_sim_t.show()
df_rest_t = df_rest.select(col("title1")).union(df_rest.select(
    col("title2"))).distinct()
df_rest_t.show()

df_sim_joined = df.join(df_sim_t, col("title1") == col("title"))\
    .select("author", "title").distinct()
def revisions_per_author():
    df = load_to_spark.main_init_df()
    return number_of_revisions_per_author(df)
# orphaned arguments of a call that was truncated in this excerpt:
#        "/scratch/wikipedia-dump/plots/hotspots/different_article_author_hotspots.png",
#        True)


for i in range(1, 27):
    filenames.append(base_path + str(i) + ".json")

#load hotspot df
df_hot = load_to_spark.init_article_hotspot_df(filenames)
df_hot = df_hot.where(col("author").isNotNull())
df_h_bots = df_hot.where(col("author").rlike("|".join(bots)))
df_hot = df_hot.subtract(df_h_bots)

#load main df
df = load_to_spark.main_init_df(filenames).select(
    "author", "title",
    col("editTime").alias("timestamp"))
df = df.where(col("author").isNotNull())
df_bots = df.where(col("author").rlike("|".join(bots)))
df = df.subtract(df_bots)
df.show()

df_hotspots = hotspot_detection.sliding_window_hotspots_by_time(df_hot).select(
    "window",
    col("title").alias("title1"))
df_hotspots.show()
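# keep revisions whose timestamp falls inside a detected hotspot window of the same article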
df_joined = df.join(df_hotspots, (col("title") == col("title1")) & (col("timestamp").between(col("window")["start"], col("window")["end"])))\
    .select("author", "title", "window").distinct()

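# count, per author, the distinct (title, window) hotspots they contributed to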
df_grouped = df_joined.groupBy(col("author")).count()
df_hist = df_grouped.select(col("count"))
Example #9
def plot_jaccard_histogram(df):
    # Function name and figure setup are assumed; the original header and imports sit outside this excerpt.
    fig, axes = plt.subplots(2, 2)
    fig.set_size_inches(20, 20)
    hist(axes[0, 0], [df], bins=20, color=['red'])
    axes[0, 0].set_xlabel('Jaccard coefficient')
    axes[0, 0].set_ylabel('Number of articles')
    plt.savefig('Jaccard_Similarity')


def sparse_vec(r):
    # Build a fixed-size sparse indicator vector (dimension 942000) from the distinct indices in r[1].
    indices = sorted(set(r[1]))
    vals = [1.0] * len(indices)
    return r[0], Vectors.sparse(942000, indices, vals)


df_gn = load_to_spark.main_init_df()

df_titles = df_gn.select("title", "author").where(col("author").isNotNull())

# Determine all distinct authors, exclude bots and generate IDs for each
df_all_authors = df_gn.select("author").where(
    col("author").isNotNull()).distinct()

# Select only Bots
df_bots = df_all_authors.where(col("author").rlike('|'.join(search_text)))

# Select all authors except bots
df_real_users = df_all_authors.subtract(df_bots)

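# assign each real user a sequential Id via row_number() over an author-ordered window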
windowSpec = W.orderBy("author")
df_authors = df_real_users.withColumn("Id", f.row_number().over(windowSpec))