# NOTE(review): the statements down to plt.savefig(...) appear to be the tail
# of a plotting helper whose `def` line lies outside this view (they use
# `axes`, `df2` and `df3`, none of which are bound here). They are kept
# unchanged; re-indent them under that header when restoring the file layout.
# NOTE(review): 'Anazhl' in the x-axis labels looks like a misspelling of
# 'Anzahl' -- confirm and fix together with the rest of the helper.
axes[0, 0].set_xlim([0, 30])
axes[0, 0].set_title('Aktualität des Artikels')
axes[0, 0].set_xlabel('Zeitraum (Anazhl Monaten)')
axes[0, 0].set_ylabel('Länge (Anzahl Revisionen)')

hist(axes[0, 1], [df2], bins=20, color=['blue'])
axes[0, 1].set_title('Länge der Artikel-Historie')
axes[0, 1].set_xlabel('Zeitraum (Anazhl Monaten)')
axes[0, 1].set_ylabel('Länge (Anzahl Revisionen)')

hist(axes[1, 0], [df3], bins=20, color=['purple'])
axes[1, 0].set_title('Erstellungsdatum des Artikels')
axes[1, 0].set_xlabel('Zeitraum (Anazhl Monaten)')
axes[1, 0].set_ylabel('Anzahl Artikeln')

plt.savefig('CreationTime_ActiveTime_TimeSinceLastEdit_Histogram')

# ---------------------------------------------------------------------------
# Script section: build the per-article date DataFrame and the two single
# column DataFrames used for the histograms.
# ---------------------------------------------------------------------------
df = load_to_spark.init()
df_res = join_last_and_creation_dates(df)
df_res.show()

# Draw histograms
# "time_since_creation" column only, sorted in descending order.
df_article_creation = df_res.select(col("time_since_creation")).orderBy(
    desc("time_since_creation"))
# cache() already requests default-level persistence; the persist() call that
# used to follow it repeated the same request on an already-cached DataFrame
# and has been dropped.
df_article_creation.cache()
df_article_creation.show()

# "active time" column (renamed to "active"), sorted in descending order.
df_active_time = df_res.select(col("active time").alias("active")).orderBy(
    desc("active"))
# Same as above: a single cache() call is sufficient.
df_active_time.cache()
df_active_time.show()
# NOTE(review): the statements down to plt.savefig(...) appear to be the tail
# of a plotting helper whose `def` line lies outside this view (they use
# `axes` and `df2`, which are not bound here). They are kept unchanged;
# re-indent them under that header when restoring the file layout.
axes[0, 0].legend()

hist(axes[0, 1], [df2], bins=20, color=['blue'])
axes[0, 1].set_title('Anzahl von Revisionen pro Monat')
axes[0, 1].set_xlabel('Länge der Revisionen')
axes[0, 1].set_ylabel('Anzahl der Artikeln')
axes[0, 1].legend()

plt.savefig('Average_Number_Of_Revisions_per_Month')

# ---------------------------------------------------------------------------
# Script section: create the Spark session and count edits per month/title.
# ---------------------------------------------------------------------------
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.executor.memory", "128g")
    .getOrCreate()
)

# Distinct titles taken from the DataFrame returned by load_to_spark.init().
df_gn = load_to_spark.init()
df_groups = df_gn.select("title").distinct()

df = load_to_spark.main_init_df()

# Column expression "<year>-<two-digit month>" derived from "editTime";
# it is used for both the timestamp-typed and the string-typed variant below.
yearmonth_expr = f.concat(
    f.year("editTime"),
    f.lit('-'),
    format_string("%02d", f.month("editTime")),
)

# Variant 1: "yearmonth" cast to a timestamp, then edits counted per
# (yearmonth, title) and sorted by count in descending order.
df_monthly_ts = df.withColumn("yearmonth", yearmonth_expr)
df_monthly_ts = df_monthly_ts.withColumn(
    "yearmonth", col("yearmonth").cast("timestamp"))
df_monthly_ts = (
    df_monthly_ts
    .groupBy("yearmonth", "title")
    .count()
    .orderBy(desc("count"))
)

# Variant 2: "yearmonth" kept as the concatenated string; `df` itself is
# rebound to the DataFrame that carries the extra column.
df = df.withColumn("yearmonth", yearmonth_expr)
df_monthly = (
    df
    .groupBy("yearmonth", "title")
    .count()
    .orderBy(desc("count"))
)