Example #1
0
def main(argv):
    """Cluster the rows of the VIDEO_STATISTIC table with bisecting k-means.

    Loads the table over JDBC, assembles four count columns into a
    ``features`` vector, fits a ``BisectingKMeans`` model (k=8) on a sample
    of at most 1000 rows, then assigns every row to a cluster and prints the
    leading rows of the ``features`` / ``prediction`` result.

    Args:
        argv: Command-line arguments, forwarded to ``handle_params``.
    """
    feature_cols = ['play_count', 'favorite_count', 'comment_count',
                    'barrage_count']

    # Driver/executor resources are application-level settings that are
    # fixed when the application starts, so they are passed to the builder.
    # Setting them with ``spark.conf.set`` after ``getOrCreate()`` has no
    # effect on Spark 2.x and is rejected with an error on Spark 3.x.
    spark = SparkSession.builder \
        .appName('VIDEO_CLUSTERING') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .config('spark.driver.maxResultSize', '0') \
        .config('spark.driver.cores', '4') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '4g') \
        .config('spark.executor.cores', '4') \
        .getOrCreate()
    # Runtime SQL option; this one may be changed on a live session.
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

    # The return value is not used in this job; the call is kept because
    # handle_params may validate argv (defined elsewhere - TODO confirm).
    handle_params(argv)

    # NOTE(review): the JDBC URL and root credentials are hard-coded here;
    # consider loading them from configuration instead.
    video_df = spark.read.format('jdbc')\
        .option('url', 'jdbc:mysql://192.168.174.133:3306/big_data')\
        .option('driver', 'com.mysql.cj.jdbc.Driver')\
        .option('dbtable', 'VIDEO_STATISTIC')\
        .option('user', 'root').option('password', 'root').load()

    assembler = VectorAssembler()\
        .setInputCols(feature_cols)\
        .setOutputCol('features')

    counts_df = video_df.select(*feature_cols)

    # Fit on a bounded sample so training cost does not grow with the table.
    training_vectors = assembler.transform(counts_df.limit(1000))
    bkm = BisectingKMeans(k=8, minDivisibleClusterSize=1.0)
    model = bkm.fit(training_vectors)

    # Score the complete table with the model fitted on the sample.
    video_vector = assembler.transform(counts_df)
    transformed = model.transform(video_vector).select('features', 'prediction')

    transformed.show()
Example #2
0
def test(argv):
    """Run only the argument-handling step on ``argv``.

    The value returned by ``handle_params`` is discarded and this function
    returns ``None``.
    """
    handle_params(argv)
Example #3
0
def main(argv):
    """Compute heat metrics for one video type and append them to MySQL.

    Reads the ``video_details`` CSV located by ``create_filepath`` for the
    video type code obtained from ``argv``, renames the columns used to
    English names, filters rows with ``column_filter``, and adds the columns
    ``compre_heat``, ``participation``, ``pass_time`` and ``type_code``. The
    result is cleaned of nulls and duplicate ``av`` values, shown on stdout,
    and appended to the ``VIDEO_STATISTIC`` table over JDBC.

    Args:
        argv: Command-line arguments, forwarded to ``handle_params``.
    """
    # Driver/executor resources are application-level settings that are
    # fixed when the application starts, so they are passed to the builder.
    # Setting them with ``spark.conf.set`` after ``getOrCreate()`` has no
    # effect on Spark 2.x and is rejected with an error on Spark 3.x.
    spark = SparkSession.builder \
        .appName('HOT_VIDEO') \
        .master('spark://{}:{}'.format(SPARK_HOST_ADDR, SPARK_HOST_PORT)) \
        .config('spark.driver.maxResultSize', '0') \
        .config('spark.driver.cores', '4') \
        .config('spark.driver.memory', '4g') \
        .config('spark.executor.memory', '4g') \
        .getOrCreate()
    # Runtime SQL option; this one may be changed on a live session.
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

    video_type_code = handle_params(argv)
    filepath = create_filepath(video_type_code, 'video_details')

    video_df = spark.read.format('csv').options(header='true').load(filepath)
    video_info_df = video_df.select('av号', '收藏数', '播放量', '弹幕数', '评论数', '投稿时间',
                                    '发布时间')
    video_info_df = video_info_df.dropna()

    create_video_entity_udf = udf(calc_compre_heat, DoubleType())
    create_video_participation_udf = udf(calc_participation, DoubleType())
    filter_udf = udf(column_filter, BooleanType())
    pass_time_udf = udf(calc_pass_duration, IntegerType())

    # Map the CSV header names to the column names used from here on.
    column_renames = (
        ('av号', 'av'),
        ('播放量', 'play_count'),
        ('收藏数', 'favorite_count'),
        ('评论数', 'comment_count'),
        ('弹幕数', 'barrage_count'),
        ('投稿时间', 'script_time'),
        ('发布时间', 'publish_time'),
    )
    for source_name, target_name in column_renames:
        video_info_df = video_info_df.withColumnRenamed(source_name,
                                                        target_name)

    count_cols = ('play_count', 'favorite_count', 'comment_count',
                  'barrage_count')

    # Keep only the rows accepted by column_filter.
    video_info_df = video_info_df.filter(filter_udf(*count_cols))

    video_info_df = video_info_df.withColumn(
        'compre_heat', create_video_entity_udf(*count_cols))
    video_info_df = video_info_df.withColumn(
        'participation', create_video_participation_udf(*count_cols))
    video_info_df = video_info_df.withColumn(
        'pass_time', pass_time_udf('script_time', 'publish_time'))

    # Constant column holding the video type code for every row.
    # NOTE(review): a Python UDF declared as IntegerType yields null when the
    # returned value is not an int - confirm handle_params returns an int.
    video_info_df = video_info_df.withColumn(
        'type_code',
        udf(lambda: video_type_code, IntegerType())())

    # NOTE(review): dropDuplicates is not guaranteed to preserve the order
    # produced by the preceding sort; verify whether sorted output is needed.
    video_info_df = video_info_df.sort('compre_heat', ascending=False)
    video_info_df = video_info_df.dropna()
    video_info_df = video_info_df.dropDuplicates(['av'])

    video_info_df.show()

    video_info_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                             'VIDEO_STATISTIC', 'append', mysql_conn_param)