video_df = video_df.drop_duplicates()
print(video_df.shape)
print('Users: ', len(video_df.USER.unique()))
print('Titles: ', len(video_df.TITLE.str.encode('utf-8').unique()))
print('IDs: ', len(video_df.ID.unique()))

# <markdowncell>

# There are 908 titles in the Beraldo dataset, of which **289** are unique. There are 246 users -- people (robots?) who upload videos.
#
# Comparing Davide's results with the ones I get from the Youtube API:

# <codecell>

df_am = yt.youtube_search(query='anonymous,internet,freedom', max_results=1000, with_statistics=True)

# <codecell>

df_am.columns
df_am.drop_duplicates(subset='videoId', inplace=True)  # 'subset' replaces the deprecated 'cols' argument
print(df_am.shape)
print('Unique ids: ', len(df_am.videoId.unique()))

# <markdowncell>

# So not a huge difference in numbers -- 525 vs 289. But are they more or less the same videos?

# <codecell>

davide_set = set(video_df.ID.tolist())
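# <markdowncell>

# To check whether the two collections actually overlap, rather than just having similar sizes, a minimal sketch -- assuming `df_am.videoId` holds the same YouTube id format as `video_df.ID`:

# <codecell>

# rough overlap check between Davide's ids and the API ids
api_set = set(df_am.videoId.tolist())
shared = davide_set & api_set
print('Shared videos: ', len(shared))
print("Only in Davide's set: ", len(davide_set - api_set))
print('Only in API results: ', len(api_set - davide_set))
print('Jaccard overlap: ', len(shared) / float(len(davide_set | api_set)))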
# <codecell>

ops.applymap?

# <codecell>

ops.apply?

# <codecell>

import YT_api_generate as yt

# <codecell>

ops = yt.format_durations(ops)

# <markdowncell>

# ## THIS IS THE KEY BIT TO LINK TITLES AND DURATIONS

# <codecell>

dur_ti = ops.groupby(ops.duration_time)['title'].value_counts()

# <codecell>

dt.time?  # IPython help lookup; assumes datetime was imported as dt

# <codecell>
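# <markdowncell>

# A minimal sketch of how `dur_ti` can surface candidate duplicates: durations that appear under more than one distinct title are likely re-uploads or mirrors of the same footage. This assumes `yt.format_durations` has normalised `duration_time` into comparable values.

# <codecell>

# count distinct titles per duration; durations shared by several titles
# are candidate matches for the same underlying video
titles_per_duration = ops.groupby('duration_time')['title'].nunique()
shared_durations = titles_per_duration[titles_per_duration > 1]
print('Durations shared by multiple titles: ', len(shared_durations))
if len(shared_durations):
    # the titles (and their counts) behind one shared duration
    print(dur_ti.loc[shared_durations.index[0]])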
import YT_api_generate as yt
import pandas as pd
import re
import numpy as np
import ggplot
import matplotlib.pyplot as plt
import seaborn

# <codecell>

# raw search results from the Youtube API
ops_df = pd.read_pickle('data/operations_results.pyd')

# <codecell>

ops = yt.title_clean_operations(ops_df)
ops = yt.format_durations(ops)
ops.shape

# <codecell>

print(len(ops.title_short.unique()))
np.sort(ops.title_short.unique())

# <codecell>

# calculate the duration of operations:
# an operation runs from the first to the last video published for it
# (duration_time is the length of a video, used to find matches)
end = ops.groupby(['title_short', 'duration_time']).publishedAt.max()
start = ops.groupby(['title_short', 'duration_time']).publishedAt.min()
operation_duration = end - start
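# <markdowncell>

# The subtraction above only yields Timedeltas if `publishedAt` is a datetime column; if the pickle stored it as strings (an assumption -- check the dtype), convert first. A sketch that converts, recomputes, and lists the longest-running operations in days:

# <codecell>

# convert publishedAt if it was pickled as strings (assumption)
ops['publishedAt'] = pd.to_datetime(ops['publishedAt'])

grouped = ops.groupby(['title_short', 'duration_time']).publishedAt
operation_duration = grouped.max() - grouped.min()

# longest-running operations, in days
print(operation_duration.dt.days.sort_values(ascending=False).head(10))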
pd.set_option("display.notebook_repr_html", True)
from optparse import OptionParser

# <codecell>

operations = [op.replace('\n', '') for op in open('data/query list').readlines()]

# <codecell>

operations

# <codecell>

df_ops = pd.DataFrame()
for op in operations:
    df = yt.youtube_search(op, 1000, True)
    df['operation'] = op
    df_ops = df_ops.append(df)

# <codecell>

df = yt.youtube_search(operations[-1], 1000, True)
df['operation'] = operations[-1]
df_ops = df_ops.append(df)

# <codecell>

print('overall video details: ' + str(df_ops.shape))

# <codecell>
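# <markdowncell>

# A side note on the loop above: growing a DataFrame with repeated `.append` copies the whole frame on every iteration (and `DataFrame.append` was removed in pandas 2.0). An equivalent sketch that collects the per-operation frames and concatenates once:

# <codecell>

# collect per-operation results and concatenate in one go
frames = []
for op in operations:
    df = yt.youtube_search(op, 1000, True)
    df['operation'] = op
    frames.append(df)
df_ops = pd.concat(frames, ignore_index=True)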
# <codecell>

print('There are %d operations' % len(operations))

# <codecell>

df_ops = pd.DataFrame()

# <codecell>

## the basic operations
for op in operations[177:]:
    df = yt.youtube_search(op, 500, True)
    df['query'] = op
    df_ops = df_ops.append(df)
    print(operations.index(op))  # progress counter

# <codecell>

df_ops.shape

# <markdowncell>

# Run all the same queries with 'mirror' appended as well. This might help address the unpredictability of the Youtube search results -- its 'denial of search results' -- so we take a 'supply of service' approach here.

# <codecell>

## run the same operations with 'mirror'
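# a sketch of the mirror pass, mirroring the loop above;
# appending ' mirror' to the query string is an assumed phrasing
for op in operations:
    df = yt.youtube_search(op + ' mirror', 500, True)
    df['query'] = op
    df['mirror'] = True  # flag mirror results so they can be separated later
    df_ops = df_ops.append(df)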