axes[0, 0].set_xlim([0, 30])
    axes[0, 0].set_title('Aktualität des Artikels')
    axes[0, 0].set_xlabel('Zeitraum (Anazhl Monaten)')
    axes[0, 0].set_ylabel('Länge (Anzahl Revisionen)')
    hist(axes[0, 1], [df2], bins=20, color=['blue'])
    axes[0, 1].set_title('Länge der Artikel-Historie')
    axes[0, 1].set_xlabel('Zeitraum (Anazhl Monaten)')
    axes[0, 1].set_ylabel('Länge (Anzahl Revisionen)')
    hist(axes[1, 0], [df3], bins=20, color=['purple'])
    axes[1, 0].set_title('Erstellungsdatum des Artikels')
    axes[1, 0].set_xlabel('Zeitraum (Anazhl Monaten)')
    axes[1, 0].set_ylabel('Anzahl Artikeln')
    plt.savefig('CreationTime_ActiveTime_TimeSinceLastEdit_Histogram')


# Load the input frame and build the joined result; the columns selected
# below ("time_since_creation", "active time") are expected to come from
# join_last_and_creation_dates.
df = load_to_spark.init()
df_res = join_last_and_creation_dates(df)
df_res.show()

# Draw histograms
# Each single-column frame is sorted descending and cached exactly once.
# cache() is already persist() with the default storage level, so an extra
# persist() call on the same frame adds nothing and only makes Spark log an
# "already cached" warning.
df_article_creation = df_res.select(col("time_since_creation")).orderBy(
    desc("time_since_creation"))
df_article_creation.cache()
df_article_creation.show()

# NOTE(review): the source column is spelled "active time" (with a space);
# confirm this matches the schema produced upstream.
df_active_time = df_res.select(col("active time").alias("active")).orderBy(
    desc("active"))
df_active_time.cache()
df_active_time.show()
# Example #2
    axes[0, 0].legend()
    hist(axes[0, 1], [df2], bins=20, color=['blue'])
    axes[0, 1].set_title('Anzahl von Revisionen pro Monat')
    axes[0, 1].set_xlabel('Länge der Revisionen')
    axes[0, 1].set_ylabel('Anzahl der Artikeln')
    axes[0, 1].legend()
    plt.savefig('Average_Number_Of_Revisions_per_Month')


# Obtain (or reuse) the Spark session used by the rest of the script.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.executor.memory", "128g")
    .getOrCreate()
)

# Distinct values of the "title" column from the initial frame.
df_gn = load_to_spark.init()
df_groups = df_gn.select("title").distinct()

df = load_to_spark.main_init_df()

# "<year>-<month>" string derived from editTime, month zero-padded to 2 digits.
_yearmonth_label = f.concat(
    f.year("editTime"),
    f.lit('-'),
    format_string("%02d", f.month("editTime")),
)

# Row counts per (yearmonth, title), with the label cast to a timestamp,
# sorted by count descending.
df_monthly_ts = df.withColumn("yearmonth", _yearmonth_label)
df_monthly_ts = df_monthly_ts.withColumn(
    "yearmonth", col("yearmonth").cast("timestamp"))
df_monthly_ts = (
    df_monthly_ts
    .groupBy("yearmonth", "title")
    .count()
    .orderBy(desc("count"))
)

# Same aggregation, but keeping the year-month label as a plain string.
df = df.withColumn("yearmonth", _yearmonth_label)
df_monthly = (
    df.groupBy("yearmonth", "title")
    .count()
    .orderBy(desc("count"))
)