def main(argv):
    """Cluster video statistics with bisecting k-means and print the result.

    Reads the ``VIDEO_STATISTIC`` table over JDBC, assembles four engagement
    counters into a ``features`` vector, fits a ``BisectingKMeans`` model
    (k=8) on at most 1000 rows, then assigns every row of the table to a
    cluster and prints a sample of the ``features``/``prediction`` columns.

    Args:
        argv: Command-line arguments, forwarded unchanged to
            ``handle_params``.
    """
    feature_cols = ['play_count', 'favorite_count', 'comment_count', 'barrage_count']

    # Driver/executor resources are launch-time settings: they are read when
    # the driver and executors start, so calling spark.conf.set() on them
    # after getOrCreate() has no effect (and recent Spark versions reject the
    # call). They are therefore passed to the builder before the session is
    # created.
    # NOTE(review): when this job is launched through spark-submit the driver
    # JVM already exists; driver memory/cores would then have to come from the
    # command line or spark-defaults.conf -- confirm how the job is started.
    spark = (
        SparkSession.builder
        .appName('VIDEO_CLUSTERING')
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT))
        .config('spark.sql.execution.arrow.enabled', 'true')
        .config('spark.driver.maxResultSize', '0')
        .config('spark.driver.cores', '4')
        .config('spark.driver.memory', '4g')
        .config('spark.executor.memory', '4g')
        .config('spark.executor.cores', '4')
        .getOrCreate()
    )

    # The parsed value is not used by this job; the call is kept in case
    # handle_params validates argv or has other side effects.
    handle_params(argv)

    # NOTE(review): the connection URL and the root/root credentials are
    # hard-coded; consider loading them from configuration or the environment.
    video_df = (
        spark.read.format('jdbc')
        .option('url', 'jdbc:mysql://192.168.174.133:3306/big_data')
        .option('driver', 'com.mysql.cj.jdbc.Driver')
        .option('dbtable', 'VIDEO_STATISTIC')
        .option('user', 'root')
        .option('password', 'root')
        .load()
    )

    assembler = (
        VectorAssembler()
        .setInputCols(feature_cols)
        .setOutputCol('features')
    )
    counters_df = video_df.select(*feature_cols)

    # Fit on a bounded sample (at most 1000 rows) ...
    training_vectors = assembler.transform(counters_df.limit(1000))
    bkm = BisectingKMeans(k=8, minDivisibleClusterSize=1.0)
    model = bkm.fit(training_vectors)

    # ... then label every row of the table with its cluster index.
    video_vector = assembler.transform(counters_df)
    transformed = model.transform(video_vector).select('features', 'prediction')
    transformed.show()
def test(argv):
    """Run only the argument-handling step on ``argv``.

    The value returned by ``handle_params`` is bound to a local and then
    discarded; the function itself returns ``None``.
    """
    parsed_type_code = handle_params(argv)
# main(argv) -- 'HOT_VIDEO' job.
#
# What the unambiguous part of this definition does:
#   * gets or creates a SparkSession named 'HOT_VIDEO' on
#     spark://SPARK_HOST_ADDR:SPARK_HOST_PORT and sets several config values;
#   * obtains video_type_code from handle_params(argv) and builds the input
#     path with create_filepath(video_type_code, 'video_details');
#   * loads that path as CSV with a header row, selects seven columns,
#     drops rows containing nulls, and renames the Chinese column headers to
#     av / play_count / favorite_count / comment_count / barrage_count /
#     script_time / publish_time;
#   * wraps calc_compre_heat, calc_participation, column_filter and
#     calc_pass_duration as UDFs and filters rows with the column_filter UDF
#     applied to the four count columns.
#
# NOTE(review): this definition arrived with its line breaks collapsed onto
# two physical lines. As the text stands, everything after the first '#' on
# a line is a comment, so it cannot be determined from here which of the
# later statements (the compre_heat / participation / pass_time / type_code
# columns, sort, dropna, dropDuplicates, limit(64), show, write.jdbc) were
# live code and which were commented out. Restore the original layout from
# version control before changing this function.
# NOTE(review): spark.driver.* and spark.executor.memory are set through
# spark.conf.set() after getOrCreate(); these look like launch-time settings
# that presumably need to be passed via SparkSession.builder.config()
# instead -- confirm against the Spark version in use.
# NOTE(review): another main(argv) is defined earlier in this text; if both
# really live in one module, this later definition replaces the earlier one
# -- verify whether these are two separate scripts.
def main(argv): spark = SparkSession.builder \ .appName('HOT_VIDEO') \ .master('spark://{}:{}'.format(SPARK_HOST_ADDR, SPARK_HOST_PORT)) \ .getOrCreate() spark.conf.set('spark.sql.execution.arrow.enabled', 'true') spark.conf.set('spark.driver.maxResultSize', '0') spark.conf.set('spark.driver.cores', '4') spark.conf.set('spark.driver.memory', '4g') spark.conf.set('spark.executor.memory', '4g') video_type_code = handle_params(argv) filepath = create_filepath(video_type_code, 'video_details') video_df = spark.read.format('csv').options(header='true').load(filepath) video_info_df = video_df.select('av号', '收藏数', '播放量', '弹幕数', '评论数', '投稿时间', '发布时间') video_info_df = video_info_df.dropna() create_video_entity_udf = udf(calc_compre_heat, DoubleType()) create_video_participation_udf = udf(calc_participation, DoubleType()) filter_udf = udf(column_filter, BooleanType()) pass_time_udf = udf(calc_pass_duration, IntegerType()) video_info_df = video_info_df.withColumnRenamed('av号', 'av') video_info_df = video_info_df.withColumnRenamed('播放量', 'play_count') video_info_df = video_info_df.withColumnRenamed('收藏数', 'favorite_count') video_info_df = video_info_df.withColumnRenamed('评论数', 'comment_count') video_info_df = video_info_df.withColumnRenamed('弹幕数', 'barrage_count') video_info_df = video_info_df.withColumnRenamed('投稿时间', 'script_time') video_info_df = video_info_df.withColumnRenamed('发布时间', 'publish_time') video_info_df = video_info_df.filter( filter_udf('play_count', 'favorite_count', 'comment_count', 'barrage_count')) # video_info_df.foreach(print) # tmp = video_info_df.count() # video_info_df = video_info_df.withColumn( 'compre_heat', create_video_entity_udf('play_count', 'favorite_count', 'comment_count', 'barrage_count')) video_info_df = video_info_df.withColumn( 'participation', create_video_participation_udf('play_count', 'favorite_count', 'comment_count', 'barrage_count')) video_info_df = video_info_df.withColumn( 'pass_time', pass_time_udf('script_time', 
'publish_time')) # video_info_df = video_info_df.withColumn( 'type_code', udf(lambda: video_type_code, IntegerType())()) # video_info_df = video_info_df.sort('compre_heat', ascending=False) video_info_df = video_info_df.dropna() video_info_df = video_info_df.dropDuplicates(['av']) # video_info_df = video_info_df.limit(64) video_info_df.show() # video_info_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data', 'VIDEO_STATISTIC', 'append', mysql_conn_param)