def filter_data():
    """Export the Portuguese-market high-quality channels to a CSV file.

    The per-channel statistics are read from ``high_quality_video.txt`` under
    ``data_path`` when that file exists; otherwise they are built (and cached
    to that path) by ``get_full_high_data``.  A ``high_rate`` column -- the
    share of high-quality videos, as a percentage -- is added to the
    statistics table, and the channels passing every threshold below whose
    ``area`` is Brazil or Portugal are written to
    ``high_quality_channel_portuguese.csv`` with the channel id expanded to a
    full YouTube channel URL.

    Returns:
        None.  The result is only written to disk.
    """
    total_video_limit = 14
    last_month_limit = 4
    high_rate = 80
    sub_limit = 50000
    portuguese_country = ['Brazil', 'Portugal']

    high_quality_file_path = data_path + "high_quality_video.txt"
    high_quality_channel = data_path + "high_quality_channel_portuguese.csv"

    if os.path.exists(high_quality_file_path):
        full_chanel = pandas_lib.get_big_data(high_quality_file_path, '')
    else:
        full_chanel = get_full_high_data(high_quality_file_path)

    # Vectorized ratio: unlike a row-wise float division this neither raises
    # ZeroDivisionError on a zero total nor breaks on an empty table (such
    # rows become inf/NaN and are rejected by the thresholds below).
    ratio = (full_chanel['high_quality_count'].astype(float)
             / full_chanel['total_video_count'].astype(float))
    full_chanel['high_rate'] = ratio.round(3) * 100

    keep = ((full_chanel['high_rate'] >= high_rate)
            & (full_chanel['total_video_count'] >= total_video_limit)
            & (full_chanel['last_month_count'] >= last_month_limit)
            & (full_chanel['subscribe'] >= sub_limit)
            & full_chanel['area'].isin(portuguese_country))

    # Work on an explicit copy so the column assignment below does not write
    # into a slice of full_chanel (SettingWithCopyWarning).
    final_chanel = full_chanel.loc[keep].copy()
    final_chanel['channel'] = final_chanel['channel'].map(
        lambda x: "https://www.youtube.com/channel/" + x)

    final_chanel.to_csv(high_quality_channel, index=False, sep=',')
def filter_data():
    """Debug driver: print the per-channel video statistics and exit.

    For every regular file in ``video_src_path`` the videos published between
    ``start_date`` and ``end_date`` by channels with at least ``sub_limit``
    subscribers are scored with ``pandas_lib.get_standard_rate``; rows with
    missing values are dropped.  Each channel then contributes one row (its
    first video row) extended with ``total_video_count`` and
    ``high_quality_count`` (videos whose ``percent`` reaches
    ``rate_percent``).

    The combined table is printed and the interpreter is terminated with
    exit status 0, so this function never returns.

    NOTE(review): the print/exit pair is executed once, after all files have
    been processed -- confirm this matches the intended debugging flow.
    """
    start_date = '2018-05-01'
    end_date = '2018-08-16'
    sub_limit = 5000
    rate_percent = 78
    count_columns = ['channel', 'total_video_count', 'high_quality_count']

    sub_rate = pandas_lib.get_sub_rate(sub_file)
    per_file_frames = []
    for file_name in os.listdir(video_src_path):
        path = os.path.join(video_src_path, file_name)
        if not os.path.isfile(path):
            continue

        pdt = pandas_lib.get_big_data(path)
        in_window = ((pdt['publish_date'] >= start_date)
                     & (pdt['publish_date'] <= end_date)
                     & (pdt['subscribe'] >= sub_limit))
        # Copy so that adding the 'percent' column does not write into a
        # slice of the loaded frame.
        pdt = pdt.loc[in_window].copy()
        if pdt.empty:
            continue

        pdt['percent'] = pdt.apply(
            lambda x: pandas_lib.get_standard_rate(
                sub_rate, round(x.rate / 100, 5), x.subscribe),
            axis=1)
        pdt = pdt.dropna()
        if pdt.empty:
            continue

        # Per-channel counts.  The high-quality count is taken per channel
        # (not over the whole file) and channels without any high-quality
        # video get 0.
        total = pdt['channel'].value_counts()
        high = pdt.loc[pdt['percent'] >= rate_percent, 'channel'].value_counts()
        chanel_base = pd.DataFrame(
            {
                'channel': total.index,
                'total_video_count': total.values,
                'high_quality_count': high.reindex(total.index, fill_value=0).values,
            },
            columns=count_columns)

        pd_rm_same = pdt.drop_duplicates(subset=['channel'], keep='first')
        per_file_frames.append(pd.merge(pd_rm_same, chanel_base, on='channel'))

    # One concat at the end instead of DataFrame.append (removed in pandas 2.0).
    if per_file_frames:
        full_chanel = pd.concat(per_file_frames)
    else:
        full_chanel = pd.DataFrame()

    print(full_chanel)
    sys.exit(0)
def get_data():
    """Return the enriched video table, building and caching it on first use.

    When ``full_file`` already exists it is loaded with
    ``pandas_lib.get_big_data`` and returned unchanged.  Otherwise the table
    is derived from ``video_file``: rows with missing values are dropped, a
    ``week`` column (``pandas_lib.get_week_day`` of the publish date) and a
    ``position`` column (``pandas_lib.get_standard_rate`` of the video rate
    and subscriber count) are added, rows with missing values are dropped
    again, and the result is written to ``full_file`` before being returned.

    If neither file exists the process exits via ``sys.exit()``.
    """
    # Fast path: a previous run already produced the enriched file.
    if os.path.exists(full_file):
        return pandas_lib.get_big_data(full_file, '')

    # First call: the enriched file has to be generated from the video file.
    if not os.path.exists(video_file):
        # Regenerating the video file (multi_get_data) is currently disabled,
        # so there is nothing to build from.
        sys.exit()
    video_data = pandas_lib.get_big_data(video_file, '')

    fr = video_data.dropna()

    # Subscriber-rate reference table used to rank each video's rate.
    sub_rate = pandas_lib.get_sub_rate(sub_file)

    def week_of(row):
        return pandas_lib.get_week_day(row.publish_date)

    def position_of(row):
        return pandas_lib.get_standard_rate(
            sub_rate, round(row.rate / 100, 5), row.subscribe)

    # Week and rate position for every video.
    fr['week'] = fr.apply(week_of, axis=1)
    fr['position'] = fr.apply(position_of, axis=1)

    fr = fr.dropna()
    fr.to_csv(full_file, index=False, sep=',')
    return fr
def _summarize_channels(pdt, rate_percent, last_month_date):
    """Return one row per channel of *pdt* extended with its video counts.

    The row kept for a channel is its first row in *pdt*; the added columns
    are ``total_video_count``, ``high_quality_count`` (rows whose ``percent``
    is at least *rate_percent*) and ``last_month_count`` (rows published on
    or after *last_month_date*).
    """
    total = pdt['channel'].value_counts()
    high = pdt.loc[pdt['percent'] >= rate_percent, 'channel'].value_counts()
    recent = pdt.loc[pdt['publish_date'] >= last_month_date, 'channel'].value_counts()

    # Channels missing from a filtered count simply have zero such videos.
    chanel_base = pd.DataFrame(
        {
            'channel': total.index,
            'total_video_count': total.values,
            'high_quality_count': high.reindex(total.index, fill_value=0).values,
            'last_month_count': recent.reindex(total.index, fill_value=0).values,
        },
        columns=['channel', 'total_video_count', 'high_quality_count',
                 'last_month_count'])

    pd_rm_same = pdt.drop_duplicates(subset=['channel'], keep='first')
    return pd.merge(pd_rm_same, chanel_base, on='channel')


def get_full_high_data(high_quality_file_path):
    """Build the per-channel video statistics table and cache it as CSV.

    Every regular file in ``video_src_path`` is loaded with
    ``pandas_lib.get_big_data`` and restricted to videos published between
    ``start_date`` and ``end_date`` by channels with at least ``sub_limit``
    subscribers.  Each video gets a ``percent`` score from
    ``pandas_lib.get_standard_rate``; rows with missing values are dropped.
    Per file, each remaining channel contributes one row carrying
    ``total_video_count``, ``high_quality_count`` and ``last_month_count``.
    Files that yield no rows contribute nothing.

    Directory entries that are not regular files are reported through
    ``logger.error`` and skipped.

    Args:
        high_quality_file_path: Path the combined table is written to
            (comma-separated, without the index).

    Returns:
        The combined DataFrame (empty when no file produced any row).
    """
    start_date = '2018-05-01'
    last_month_date = '2018-07-16'
    end_date = '2018-08-16'
    sub_limit = 5000
    rate_percent = 78

    sub_rate = pandas_lib.get_sub_rate(sub_file)
    per_file_frames = []
    row_count = 0
    for file_name in os.listdir(video_src_path):
        path = os.path.join(video_src_path, file_name)
        if not os.path.isfile(path):
            logger.error('file not exist:' + file_name)
            continue

        pdt = pandas_lib.get_big_data(path)
        in_window = ((pdt['publish_date'] >= start_date)
                     & (pdt['publish_date'] <= end_date)
                     & (pdt['subscribe'] >= sub_limit))
        # Copy so that adding the 'percent' column does not write into a
        # slice of the loaded frame.
        pdt = pdt.loc[in_window].copy()

        if not pdt.empty:
            pdt['percent'] = pdt.apply(
                lambda x: pandas_lib.get_standard_rate(
                    sub_rate, round(x.rate / 100, 5), x.subscribe),
                axis=1)
            pdt = pdt.dropna()

        # An empty frame has no channel to summarize; skipping it avoids
        # merging against a table without a 'channel' column.
        if not pdt.empty:
            summary = _summarize_channels(pdt, rate_percent, last_month_date)
            per_file_frames.append(summary)
            row_count += summary.shape[0]

        logger.info('completed file:' + file_name + ", count:" + str(row_count))

    # One concat at the end instead of repeated DataFrame.append calls
    # (append was removed in pandas 2.0 and copies the frame every time).
    if per_file_frames:
        full_chanel = pd.concat(per_file_frames, ignore_index=True)
    else:
        full_chanel = pd.DataFrame()

    full_chanel.to_csv(high_quality_file_path, index=False, sep=',')
    return full_chanel
#!/usr/bin/env python # -*- coding: utf-8 -*- # import numpy as np import pandas as pd import pandas_lib import matplotlib.pyplot as plt ##########################get video base info############################## file = 'C:\Users\lijiang\KOL\py\data\DataSet.csv' base_data = pandas_lib.get_big_data(file, 'video_id') ########################count and choose avaliable data#################### same_file = 'C:\Users\lijiang\KOL\py\data\same_value.txt' view_fit = pandas_lib.choose_avail_data(same_file) fit = pd.DataFrame(view_fit) #################join base data info and available data info################ fr = base_data.join(fit) fr = fr.dropna() ############################get sub rate##################################### sub_file = 'C:\Users\lijiang\KOL\py\data\sub_num_region.txt' sub_rate = pandas_lib.get_sub_rate(sub_file) #######count views/followers and merge time and transfer timezone########### fr.eval(""" percent0=seven_view/follower0 percent1=seven_view/follower1 avg_percent=seven_view*2/(follower0+follower1)""",