def main(argv):
    spark = SparkSession.builder \
        .appName('AUTHOR_STATISTIC') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')

    video_type_code = FuncUtils.handle_params(argv)
    filepath = FuncUtils.create_filepath(video_type_code, 'video_details/video_info_type')
    video_df = spark.read.format('csv').options(header='true').load(filepath)

    # Keep only the columns we need and rename them to ASCII identifiers.
    video_df = video_df.select('av号', '是否收费')
    video_df = video_df.withColumnRenamed('av号', 'av')
    video_df = video_df.withColumnRenamed('是否收费', 'is_pay')

    # Drop rows whose pay flag is missing or not a plain digit, then count videos per flag.
    video_rdd = video_df.rdd.filter(lambda x: x['is_pay'] is not None and x['is_pay'].isdigit())
    pair_rdd = video_rdd.map(lambda x: (x['is_pay'], 1))
    pair_rdd = pair_rdd.reduceByKey(lambda a, b: a + b)

    pair_df = pair_rdd.map(lambda pair: Row(**create_is_pay_entity(
        pair, video_type_code))).toDF()
    pair_df.show()
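
# create_is_pay_entity is defined elsewhere in the project and is not shown in this
# section. A minimal sketch of what it could look like follows; the key names
# ('is_pay', 'count', 'video_type_code') are assumptions, not the project's actual schema.
def create_is_pay_entity(pair, video_type_code):
    # pair is (is_pay_flag, video_count) as produced by the reduceByKey above.
    is_pay, count = pair
    return {
        'is_pay': is_pay,
        'count': count,
        'video_type_code': video_type_code,
    }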
def main(argv):
    spark = SparkSession.builder \
        .appName('AUTHOR_STATISTIC') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')

    video_type_code = FuncUtils.handle_params(argv)
    filepath = FuncUtils.create_filepath(video_type_code, 'video_details/video_info_type')
    video_df = spark.read.format('csv').options(header='true').load(filepath)
    # video_df = None
    # for i in range(1, 15):
    #     filepath = FuncUtils.create_filepath(i, 'video_details/video_info_type')
    #     df = spark.read.format('csv').options(header='true').load(filepath)
    #     if video_df is None:
    #         video_df = df
    #     else:
    #         video_df = video_df.union(df)

    # Keep only the uploader ('Up主') and play-count ('播放量') columns.
    author_df = video_df.select('Up主', '播放量')
    author_rdd = author_df.rdd
    author_rdd = author_rdd.filter(lambda x: x['播放量'] is not None and x['播放量'].isdigit())

    # (uploader, (total plays, video count)) -> average play count per uploader.
    pair_rdd = author_rdd.map(lambda x: (x['Up主'], (int(x['播放量']), 1)))
    pair_rdd = pair_rdd.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    pair_rdd = pair_rdd.filter(lambda x: x[1][0] != 0)
    pair_rdd = pair_rdd.map(lambda x: (x[0], x[1][0] / x[1][1]))

    pair_df = pair_rdd.map(
        lambda pair: Row(**create_author_entity(pair, video_type_code))
    ).toDF()

    # Keep the 200 uploaders with the highest average play count and persist them.
    pair_df = pair_df.sort('avg_play_count', ascending=False)
    pair_df = pair_df.limit(200)
    pair_df.show()
    pair_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                       'AUTHOR_STATISTIC', 'append', mysql_conn_param)
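
# create_author_entity and mysql_conn_param also come from project modules that are
# not shown here. The sketches below are assumptions: the entity must at least expose
# an 'avg_play_count' key (the sort column used above), and the properties dict is the
# standard form accepted by DataFrameWriter.jdbc. Credentials and driver class are
# placeholders, not the project's real values.
def create_author_entity(pair, video_type_code):
    # pair is (uploader_name, average_play_count) as produced by the map above.
    author, avg_play_count = pair
    return {
        'author': author,
        'avg_play_count': avg_play_count,
        'video_type_code': video_type_code,
    }

mysql_conn_param = {
    'user': 'root',                      # placeholder credential
    'password': '******',                # placeholder credential
    'driver': 'com.mysql.jdbc.Driver',
}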
def main(argv):
    # --- TODO ---
    global video_type_code
    spark = SparkSession.builder \
        .appName('BARRAGE_WORD_COUNT') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')

    video_type_code = FuncUtils.handle_params(argv)
    filepath = FuncUtils.create_filepath(video_type_code, 'barrage_data/barrage_type')
    video_df = spark.read.format('csv').options(header='true').load(filepath)

    # Tokenize the barrage text ('弹幕内容'), drop stop words, and count word frequencies.
    text_data = video_df.select('弹幕内容')
    text_rdd = text_data.rdd
    text_rdd = text_rdd.filter(lambda x: x['弹幕内容'] is not None)
    word_data = text_rdd.flatMap(word_split)
    word_data = word_data.filter(lambda word: word not in WORD_FILTER)
    pair_data = word_data.map(lambda word: (word, 1))
    pair_data = pair_data.reduceByKey(lambda a, b: a + b)

    word_df = pair_data.map(
        lambda pair: Row(**create_word_entity(pair, 'barrage', video_type_code))
    ).toDF()
    word_df.createOrReplaceTempView('WORD')

    # Keep the 500 most frequent words and persist them.
    word_df = word_df.sort('count', ascending=False)
    word_df.show()
    word_df = word_df.limit(500)
    word_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                       'WORD_FREQ', 'append', mysql_conn_param)
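
# word_split, WORD_FILTER and create_word_entity live in shared project modules that
# are not part of this section. One possible shape is sketched below, assuming jieba
# is used for Chinese word segmentation; segmenting the last string field of each Row
# and the entity key names are assumptions, not the project's actual implementation.
import jieba

WORD_FILTER = {' ', ',', '。', '!', '?'}  # assumed stop list

def word_split(row):
    # Each input is a Row; the text to segment is its last field
    # ('弹幕内容' in the barrage job, 'title' in the title word-count job).
    text = row[len(row) - 1]
    return list(jieba.cut(text))

def create_word_entity(pair, source, video_type_code):
    # pair is (word, frequency); source marks where the word came from.
    word, count = pair
    return {
        'word': word,
        'count': count,
        'source': source,                 # 'barrage' or 'title'
        'video_type_code': video_type_code,
    }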
def main(argv):
    global video_type_code
    spark = SparkSession.builder \
        .appName('VIDEO_TITLE_WORD_COUNT') \
        .master('spark://{}:{}'.format(SPARK_MASTER_ADDR, SPARK_MASTER_PORT)) \
        .getOrCreate()
    spark.conf.set('spark.sql.execution.arrow.enabled', 'true')
    spark.conf.set('spark.driver.maxResultSize', '0')
    spark.conf.set('spark.driver.cores', '4')
    spark.conf.set('spark.driver.memory', '4g')
    spark.conf.set('spark.executor.memory', '4g')

    video_type_code = FuncUtils.handle_params(argv)
    filepath = FuncUtils.create_filepath(video_type_code, 'video_details/video_info_type')
    video_df = spark.read.format('csv').options(header='true').load(filepath)

    # Keep the AV id ('av号') and title ('标题') columns, renamed to ASCII identifiers.
    video_info_df = video_df.select('av号', '标题')
    video_info_df = video_info_df.withColumnRenamed('av号', 'av')
    video_info_df = video_info_df.withColumnRenamed('标题', 'title')

    # Tokenize the titles, drop stop words, and count word frequencies.
    title_word_data = video_info_df.rdd.filter(lambda x: x['title'] is not None).flatMap(word_split)
    title_word_data = title_word_data.filter(lambda word: word not in WORD_FILTER)
    pair_data = title_word_data.map(lambda word: (word, 1))
    pair_data = pair_data.reduceByKey(lambda a, b: a + b)

    word_df = pair_data.map(
        lambda pair: Row(**create_word_entity(pair, 'title', video_type_code))
    ).toDF()
    word_df.show()

    # Keep the 500 most frequent title words and persist them.
    word_df = word_df.sort('count', ascending=False).limit(500)
    word_df.write.jdbc('jdbc:mysql://192.168.174.133:3306/big_data',
                       'WORD_FREQ', 'append', mysql_conn_param)
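
# FuncUtils.handle_params, FuncUtils.create_filepath, the cluster constants, and the
# script entry point are defined outside this section. The sketch below shows one way
# they could be shaped; the master host, port, and HDFS path layout are assumptions.
import sys

from pyspark.sql import SparkSession, Row

SPARK_MASTER_ADDR = '192.168.174.133'    # assumed master host
SPARK_MASTER_PORT = 7077                 # default standalone master port

class FuncUtils:
    @staticmethod
    def handle_params(argv):
        # The first CLI argument selects the video type partition to process.
        return int(argv[1])

    @staticmethod
    def create_filepath(video_type_code, prefix):
        # e.g. hdfs://<host>:9000/video_details/video_info_type1.csv -- assumed layout.
        return 'hdfs://{}:9000/{}{}.csv'.format(SPARK_MASTER_ADDR, prefix, video_type_code)

if __name__ == '__main__':
    main(sys.argv)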