# Render DataFrames as HTML tables when displayed in the notebook.
pd.set_option("display.notebook_repr_html", True)
from optparse import OptionParser

# <codecell>

# Read one search query per line. The context manager closes the file,
# which the bare open(...).readlines() call never did.
with open('data/query list') as query_file:
    operations = [op.replace('\n', '') for op in query_file]

# <codecell>

operations

# <codecell>

# Run every query through yt.youtube_search and tag the returned rows with
# the query that produced them.
# NOTE(review): the positional arguments look like (query, max_results,
# with_statistics) -- confirm against the yt module.
df_ops = pd.DataFrame()
for op in operations:
    df = yt.youtube_search(op, 1000, True)
    df['operation'] = op
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # Concatenating on every iteration (rather than once at the end) keeps the
    # rows gathered so far in df_ops if a later search raises.
    df_ops = pd.concat([df_ops, df])

# <codecell>

# Search the last query once more on its own and append those rows too.
df = yt.youtube_search(operations[-1], 1000, True)
df['operation'] = operations[-1]
df_ops = pd.concat([df_ops, df])

# <codecell>

print('overall video details:' + str(df_ops.shape))

# <codecell>
# Remove fully duplicated rows, then report how many distinct users, titles
# and video ids remain.
video_df = video_df.drop_duplicates()
print(video_df.shape)
print('Users: ', len(video_df.USER.unique()))
print('Titles: ', len(video_df.TITLE.str.encode('utf-8').unique()))
print('IDS: ', len(video_df.ID.unique()))

# <markdowncell>

# There are 908 titles in the Beraldo dataset, of which **289** are unique. There are 246 users -- people (robots?) who upload videos.
#
# Comparing Davide's results with the ones I get from the YouTube API:

# <codecell>

df_am = yt.youtube_search(query='anonymous,internet,freedom', max_results=1000, with_statistics=True)

# <codecell>

df_am.columns
# `cols=` was renamed to `subset=` in pandas and the old keyword was later
# removed, so `subset=` is the spelling that works on current releases.
df_am.drop_duplicates(inplace=True, subset='videoId')
print(df_am.shape)
# Was a Python 2 print statement; now a call, like every other print here.
print('Unique ids: ', len(df_am.videoId.unique()))

# <markdowncell>

# So not a huge difference in numbers -- 525 vs 289. But are they the same videos more or less?

# <codecell>

# Video ids from the existing dataset, as a set.
davide_set = set(video_df.ID.tolist())
# <codecell>

print('There are %d operations' % len(operations))

# <codecell>

df_ops = pd.DataFrame()

# <codecell>

## the basic operations
# NOTE(review): starting at index 177 presumably resumes a run that was
# interrupted part-way through the query list -- confirm before re-running.
start_at = 177
for position, op in enumerate(operations[start_at:], start=start_at):
    df = yt.youtube_search(op, 500, True)
    df['query'] = op
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # Concatenating per iteration keeps partial results if a search fails.
    df_ops = pd.concat([df_ops, df])
    # Progress indicator. enumerate() gives the true position even when the
    # same query appears twice, and avoids an O(n) list.index() scan per step.
    print(position)

# <codecell>

df_ops.shape

# <markdowncell>

# Run all the same queries with 'mirror' as well. This might help address the unpredictability of the YouTube search results and its 'denial of search results'. So, we undertake a 'supply of service' approach here.

# <codecell>

## run the same operations with 'mirror'