Ejemplo n.º 1
0
def filter_data():
    """Write the high-quality channels from Brazil/Portugal to a CSV file.

    Per-channel statistics are read from ``high_quality_video.txt`` under
    ``data_path`` when that file exists; otherwise they are built by
    ``get_full_high_data``. A ``high_rate`` column is added: the share of
    ``high_quality_count`` in ``total_video_count``, rounded to three
    decimals and expressed as a percentage.

    A channel is kept when it meets every threshold below and its ``area``
    is one of the Portuguese-speaking countries. Kept channels get their
    ``channel`` value prefixed with the YouTube channel URL and are written
    to ``high_quality_channel_portuguese.csv`` under ``data_path``.
    """
    total_video_limit = 14
    last_month_limit = 4
    high_rate = 80
    sub_limit = 50000
    portuguese_country = ['Brazil', 'Portugal']
    high_quality_file_path = data_path + "high_quality_video.txt"
    high_quality_channel = data_path + "high_quality_channel_portuguese.csv"

    if os.path.exists(high_quality_file_path):
        full_chanel = pandas_lib.get_big_data(high_quality_file_path, '')
    else:
        full_chanel = get_full_high_data(high_quality_file_path)

    full_chanel['high_rate'] = full_chanel.apply(
        lambda x: round(
            float(x.high_quality_count) / float(x.total_video_count), 3) * 100,
        axis=1)

    meets_thresholds = (
        (full_chanel['high_rate'] >= high_rate)
        & (full_chanel['total_video_count'] >= total_video_limit)
        & (full_chanel['last_month_count'] >= last_month_limit)
        & (full_chanel['subscribe'] >= sub_limit)
    )
    # Take an explicit copy: assigning a column on the result of a boolean
    # selection otherwise triggers pandas' SettingWithCopy warning.
    final_chanel = full_chanel.loc[meets_thresholds].copy()
    final_chanel['channel'] = final_chanel['channel'].map(
        lambda x: "https://www.youtube.com/channel/" + x)

    final_chanel = final_chanel.loc[
        final_chanel["area"].isin(portuguese_country)]
    final_chanel.to_csv(high_quality_channel, index=False, sep=',')
Ejemplo n.º 2
0
def filter_data():
    """Build per-channel video counts for the first file and print them.

    For the first regular file found in ``video_src_path``: keep the rows
    published between ``start_date`` and ``end_date`` with at least
    ``sub_limit`` subscribers, compute a ``percent`` column with
    ``pandas_lib.get_standard_rate``, drop rows containing NaN, and count
    per channel the total videos and the videos whose ``percent`` is at
    least ``rate_percent``. The counts are merged onto one representative
    row per channel and the result is printed.

    NOTE(review): the function prints the frame and calls ``sys.exit(0)``
    after the first file, which looks like debugging scaffolding; that
    behaviour is preserved here — confirm whether it is still wanted.
    """
    start_date = '2018-05-01'
    end_date = '2018-08-16'
    sub_limit = 5000
    rate_percent = 78
    count_columns = ['channel', 'total_video_count', 'high_quality_count']

    full_chanel = pd.DataFrame()
    sub_rate = pandas_lib.get_sub_rate(sub_file)
    for file_name in os.listdir(video_src_path):
        path = os.path.join(video_src_path, file_name)
        if not os.path.isfile(path):
            continue

        pdt = pandas_lib.get_big_data(path)
        in_scope = (
            (pdt['publish_date'] >= start_date)
            & (pdt['publish_date'] <= end_date)
            & (pdt['subscribe'] >= sub_limit)
        )
        # Copy so the new column is not written onto a slice of the source.
        pdt = pdt.loc[in_scope].copy()
        # 100.0 keeps the division fractional even for integer rates on
        # interpreters where int / int truncates.
        pdt['percent'] = pdt.apply(
            lambda x: pandas_lib.get_standard_rate(
                sub_rate, round(x.rate / 100.0, 5), x.subscribe),
            axis=1)
        pdt = pdt.dropna()
        pdt_tar = pdt.loc[pdt['percent'] >= rate_percent]

        # Per-channel counts. Channels without any high-quality video get 0.
        channels = pdt['channel'].unique()
        total_counts = pdt['channel'].value_counts()
        high_counts = pdt_tar['channel'].value_counts()
        chanel_base = pd.DataFrame(
            {
                'channel': channels,
                'total_video_count':
                    total_counts.reindex(channels, fill_value=0).values,
                'high_quality_count':
                    high_counts.reindex(channels, fill_value=0).values,
            },
            columns=count_columns)

        pd_rm_same = pdt.drop_duplicates(subset=['channel'], keep='first')
        merged = pd.merge(pd_rm_same, chanel_base, on='channel')
        # DataFrame.append is gone in pandas 2.0; concat is the replacement.
        full_chanel = pd.concat([full_chanel, merged])
        print(full_chanel)
        sys.exit(0)
Ejemplo n.º 3
0
def get_data():
    """Return the enriched video frame, building and caching it if needed.

    When ``full_file`` exists it is loaded and returned as is. Otherwise the
    raw data is loaded from ``video_file`` (the process exits if that file
    is missing), rows containing NaN are dropped, and two columns are added:
    ``week`` from ``pandas_lib.get_week_day`` applied to ``publish_date``,
    and ``position`` from ``pandas_lib.get_standard_rate``. Rows containing
    NaN are dropped again, the frame is written to ``full_file`` and
    returned.
    """
    # Fast path: the enriched file was already generated by an earlier run.
    if os.path.exists(full_file):
        return pandas_lib.get_big_data(full_file, '')

    # Regenerating the raw video data is disabled, so without the raw file
    # there is nothing to work from.
    if not os.path.exists(video_file):
        sys.exit()

    video_data = pandas_lib.get_big_data(video_file, '')
    fr = video_data.dropna()

    # Subscriber-rate table consumed by the position computation below.
    sub_rate = pandas_lib.get_sub_rate(sub_file)

    def week_of(row):
        return pandas_lib.get_week_day(row.publish_date)

    def position_of(row):
        return pandas_lib.get_standard_rate(
            sub_rate, round(row.rate/100, 5), row.subscribe)

    fr['week'] = fr.apply(week_of, axis=1)
    fr['position'] = fr.apply(position_of, axis=1)
    fr = fr.dropna()

    fr.to_csv(full_file, index=False, sep=',')
    return fr
Ejemplo n.º 4
0
def get_full_high_data(high_quality_file_path):
    """Aggregate per-channel video counts over every file in video_src_path.

    For each regular file in ``video_src_path``: keep the rows published
    between ``start_date`` and ``end_date`` with at least ``sub_limit``
    subscribers, compute a ``percent`` column with
    ``pandas_lib.get_standard_rate``, drop rows containing NaN, then count
    per channel:

    * ``total_video_count``  - all remaining videos,
    * ``high_quality_count`` - videos whose ``percent`` >= ``rate_percent``,
    * ``last_month_count``   - videos published from ``last_month_date`` on.

    The counts are merged onto one representative row per channel (the
    first one encountered). Entries that are not regular files are logged
    as errors and skipped.

    Args:
        high_quality_file_path: Destination CSV path for the combined frame.

    Returns:
        The combined DataFrame across all files (empty when nothing was
        processed); the same frame is written to ``high_quality_file_path``.
    """
    start_date = '2018-05-01'
    last_month_date = '2018-07-16'
    end_date = '2018-08-16'
    sub_limit = 5000
    rate_percent = 78
    count_columns = ['channel', 'total_video_count', 'high_quality_count',
                     'last_month_count']

    sub_rate = pandas_lib.get_sub_rate(sub_file)
    per_file_frames = []
    total_rows = 0
    for file_name in os.listdir(video_src_path):
        path = os.path.join(video_src_path, file_name)
        if not os.path.isfile(path):
            logger.error('file not exist:' + file_name)
            continue

        pdt = pandas_lib.get_big_data(path)
        in_scope = (
            (pdt['publish_date'] >= start_date)
            & (pdt['publish_date'] <= end_date)
            & (pdt['subscribe'] >= sub_limit)
        )
        # Copy so the new column is not written onto a slice of the source.
        pdt = pdt.loc[in_scope].copy()
        # 100.0 keeps the division fractional even for integer rates on
        # interpreters where int / int truncates.
        pdt['percent'] = pdt.apply(
            lambda x: pandas_lib.get_standard_rate(
                sub_rate, round(x.rate / 100.0, 5), x.subscribe),
            axis=1)
        pdt = pdt.dropna()
        pdt_tar = pdt.loc[pdt['percent'] >= rate_percent]
        pdt_recent = pdt.loc[(pdt['publish_date'] >= last_month_date)
                             & (pdt['publish_date'] <= end_date)]

        # One pass per counter instead of rescanning the frame for every
        # channel. Channels absent from a subset are reported as 0.
        channels = pdt['channel'].unique()
        total_counts = pdt['channel'].value_counts()
        high_counts = pdt_tar['channel'].value_counts()
        recent_counts = pdt_recent['channel'].value_counts()
        chanel_base = pd.DataFrame(
            {
                'channel': channels,
                'total_video_count':
                    total_counts.reindex(channels, fill_value=0).values,
                'high_quality_count':
                    high_counts.reindex(channels, fill_value=0).values,
                'last_month_count':
                    recent_counts.reindex(channels, fill_value=0).values,
            },
            columns=count_columns)

        pd_rm_same = pdt.drop_duplicates(subset=['channel'], keep='first')
        merged = pd.merge(pd_rm_same, chanel_base, on='channel')
        per_file_frames.append(merged)
        total_rows += merged.shape[0]
        logger.info('completed file:' + file_name + ", count:" +
                    str(total_rows))

    # DataFrame.append is gone in pandas 2.0; concatenate once at the end.
    # pd.concat rejects an empty list, hence the fallback.
    if per_file_frames:
        full_chanel = pd.concat(per_file_frames, ignore_index=True)
    else:
        full_chanel = pd.DataFrame()
    full_chanel.to_csv(high_quality_file_path, index=False, sep=',')
    return full_chanel
Ejemplo n.º 5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#

import numpy as np
import pandas as pd
import pandas_lib
import matplotlib.pyplot as plt

##########################get video base info##############################
# Windows paths are written as raw strings: in a plain literal the "\U" of
# "C:\Users" is parsed as a unicode escape and is a SyntaxError on Python 3.
file = r'C:\Users\lijiang\KOL\py\data\DataSet.csv'
base_data = pandas_lib.get_big_data(file, 'video_id')

########################count and choose available data####################
same_file = r'C:\Users\lijiang\KOL\py\data\same_value.txt'
view_fit = pandas_lib.choose_avail_data(same_file)
fit = pd.DataFrame(view_fit)

#################join base data info and available data info################
# Index-aligned join; rows left with any NaN after the join are discarded.
fr = base_data.join(fit)
fr = fr.dropna()

############################get sub rate#####################################
sub_file = r'C:\Users\lijiang\KOL\py\data\sub_num_region.txt'
sub_rate = pandas_lib.get_sub_rate(sub_file)

#######count views/followers and merge time and transfer timezone###########
fr.eval("""
	percent0=seven_view/follower0
	percent1=seven_view/follower1
	avg_percent=seven_view*2/(follower0+follower1)""",