Esempio n. 1
0
# Turn on pandas' "display.notebook_repr_html" option so pandas objects use
# their HTML representation when shown in the notebook.
pd.set_option("display.notebook_repr_html", True)
from optparse import OptionParser

# <codecell>

# Load the search queries, one per line of 'data/query list', with the newline
# characters removed. The context manager closes the file handle, which the
# bare open(...).readlines() call left open.
with open('data/query list') as query_file:
    operations = [op.replace('\n', '') for op in query_file]

# <codecell>

# Echo the query list as the cell's output for inspection.
operations

# <codecell>

# Run a search for every query and gather the results in one frame, tagging
# each row with the query that produced it in the 'operation' column.
# NOTE(review): the positional arguments 1000 and True are presumably
# max_results and with_statistics -- confirm against yt.youtube_search.
df_ops = pd.DataFrame()
for op in operations:
    df = yt.youtube_search(op, 1000, True)
    df['operation'] = op
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # Concatenating on every iteration (rather than once at the end) keeps the
    # results gathered so far in df_ops if a later query raises.
    df_ops = pd.concat([df_ops, df])

# <codecell>

# Search for the final query on its own and add its rows to df_ops.
# NOTE(review): if the loop over all operations has already completed, this
# adds the last query's rows a second time -- confirm this cell is only meant
# as a manual retry.
last_op = operations[-1]
df = yt.youtube_search(last_op, 1000, True)
df['operation'] = last_op
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df_ops = pd.concat([df_ops, df])

# <codecell>

# Report the (rows, columns) shape of the accumulated results frame.
print ('overall video details:' + str(df_ops.shape))

# <codecell>
Esempio n. 2
0
# Discard fully duplicated rows, then report the frame's shape followed by the
# number of distinct values in the USER, TITLE and ID columns.
video_df = video_df.drop_duplicates()
print(video_df.shape)
print('Users: ', len(video_df['USER'].unique()))
# Titles are encoded to UTF-8 before the distinct values are counted.
print('Titles: ', len(video_df['TITLE'].str.encode('utf-8').unique()))
print('IDS: ', len(video_df['ID'].unique()))

# <markdowncell>

# There are 908 titles in the Beraldo dataset, of which **289** are unique. There are 246 users -- people (robots?) who upload videos. 
# 
# Comparing Davide's results with the ones I get from the YouTube API:

# <codecell>

# Fetch up to 1000 results, with statistics, for the combined query string.
df_am = yt.youtube_search(
    query='anonymous,internet,freedom',
    max_results=1000,
    with_statistics=True,
)

# <codecell>

# Column listing; as a bare expression in the middle of a cell it has no
# visible effect (only a cell's last expression is echoed by default).
df_am.columns
# Keep a single row per video id. `subset` is the keyword drop_duplicates
# accepts for this; the older `cols` spelling is rejected by current pandas.
df_am.drop_duplicates(subset='videoId', inplace=True)
print(df_am.shape)
# Function-call form of print, matching the other cells; the print statement
# form is a syntax error on Python 3.
print('Unique ids: ', len(df_am.videoId.unique()))

# <markdowncell>

# So not a huge difference in numbers -- 525 vs 289. But are they the same videos more or less? 

# <codecell>

# Distinct values of the ID column of video_df, collected into a set
# (presumably for comparison against the API results -- the rest is not shown).
davide_set = set(video_df['ID'].tolist())
Esempio n. 3
0
# <codecell>

# Report how many entries the `operations` list holds.
print('There are %d operations' % len(operations))

# <codecell>



# Start from an empty frame; the search loop in the following cell adds each
# query's results to it.
df_ops = pd.DataFrame()

# <codecell>

## the basic operations

# Search for every query from position 177 onward, tag each result row with
# its query in the 'query' column, and add the rows to df_ops.
# enumerate(..., start=177) yields each query's position in `operations`
# directly; operations.index(op) rescanned the list on every iteration and
# reported the first occurrence when the same query appears more than once.
for position, op in enumerate(operations[177:], start=177):
    df = yt.youtube_search(op, 500, True)
    df['query'] = op
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # Concatenating per iteration keeps the rows gathered so far in df_ops if
    # a later query raises.
    df_ops = pd.concat([df_ops, df])
    # Progress indicator: position of the query just completed.
    print(position)

# <codecell>

# Echo the (rows, columns) shape of the accumulated results as the cell output.
df_ops.shape

# <markdowncell>

# Run all the same queries with 'mirror' as well. This might help address the unpredictability of the YouTube search results and its 'denial of search results'. So, we take a 'supply of service' approach here.

# <codecell>

## run the same operations with 'mirror'