# drop exact duplicate rows, then summarise the dataset
video_df = video_df.drop_duplicates()
print(video_df.shape)
print('Users: ', len(video_df.USER.unique()))
print('Titles: ', len(video_df.TITLE.str.encode('utf-8').unique()))
print('IDs: ', len(video_df.ID.unique()))
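
# <codecell>

# a quick sketch: which titles repeat most often (same TITLE column as above)
video_df.TITLE.value_counts().head(10)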

# <markdowncell>

# There are 908 titles in the Beraldo dataset, of which **289** are unique. There are 246 users -- the people (or robots?) who upload the videos.
# 
# Next, compare Davide's results with the ones I get from the YouTube API:

# <codecell>

df_am = yt.youtube_search(query='anonymous,internet,freedom', max_results=1000, with_statistics=True)

# <codecell>

df_am.columns
df_am.drop_duplicates(subset='videoId', inplace=True)
print(df_am.shape)
print('Unique ids: ', len(df_am.videoId.unique()))

# <markdowncell>

# So not a huge difference in numbers -- 525 vs 289. But are they more or less the same videos?

# <codecell>

davide_set = set(video_df.ID.tolist())
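
# <codecell>

# a minimal sketch to answer the question above: how much do Davide's video
# IDs and the API results overlap? (assumes df_am from the search above)
api_set = set(df_am.videoId.tolist())
print('Overlap: ', len(davide_set & api_set))
print("Only in Davide's set: ", len(davide_set - api_set))
print('Only in the API results: ', len(api_set - davide_set))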
# <codecell>

ops.applymap?

# <codecell>

ops.apply?

# <codecell>

import YT_api_generate as yt

# <codecell>

# normalise the raw video durations into a comparable format
ops = yt.format_durations(ops)

# <markdowncell>

# ## THIS IS THE KEY BIT TO LINK TITLES AND DURATIONS

# <codecell>

# for each distinct duration, count how often each title occurs --
# the duration is what is used to match videos to each other
dur_ti = ops.groupby(ops.duration_time)['title'].value_counts()
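
# <codecell>

# a sketch of how dur_ti can be read: for each duration, pick the single most
# frequent title (idxmax over the per-duration counts)
dur_ti.groupby(level=0).idxmax()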

# <codecell>

# IPython help lookup on datetime.time
import datetime as dt
dt.time?

# <codecell>

import YT_api_generate as yt
import pandas as pd
import re
import numpy as np
import ggplot
import matplotlib.pyplot as plt
import seaborn

# <codecell>

# raw search results from Youtube api
ops_df = pd.read_pickle('data/operations_results.pyd')

# <codecell>

ops = yt.title_clean_operations(ops_df)
ops = yt.format_durations(ops)
ops.shape

# <codecell>

# number of distinct (cleaned) titles, then the sorted list itself
print(len(ops.title_short.unique()))
np.sort(ops.title_short.unique())

# <codecell>

# calculate the duration of each operation: the span between the first and
# last published video matching it (duration_time, the length of a video,
# is used to find matches)
end = ops.groupby(['title_short', 'duration_time']).publishedAt.max()
start = ops.groupby(['title_short', 'duration_time']).publishedAt.min()
operation_duration = end - start
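
# <codecell>

# a quick look at the result (sketch): the operations with the longest span
# between their first and last matched video
operation_duration.sort_values(ascending=False).head(10)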

# <codecell>

pd.set_option("display.notebook_repr_html", True)
from optparse import OptionParser

# <codecell>

# read the list of operation names, one query per line
operations = [op.strip() for op in open('data/query list').readlines()]

# <codecell>

operations

# <codecell>

# run each operation as a YouTube search and collect the results
frames = []
for op in operations:
    df = yt.youtube_search(op, 1000, True)
    df['operation'] = op
    frames.append(df)
df_ops = pd.concat(frames)

# <codecell>

# re-run the last operation on its own and append its results
df = yt.youtube_search(operations[-1], 1000, True)
df['operation'] = operations[-1]
df_ops = pd.concat([df_ops, df])

# <codecell>

print('overall video details: ' + str(df_ops.shape))

# <codecell>

print('There are %d operations' % len(operations))

# <codecell>



df_ops = pd.DataFrame()

# <codecell>

## the basic operations -- resume the search loop from query 177

frames = []
for i, op in enumerate(operations[177:], start=177):
    df = yt.youtube_search(op, 500, True)
    df['query'] = op
    frames.append(df)
    print(i)  # progress indicator
df_ops = pd.concat([df_ops] + frames)

# <codecell>

df_ops.shape

# <markdowncell>

# Run all the same queries with 'mirror' added as well. This might help address the unpredictability of YouTube search results and its 'denial of search results'. In other words, we take a 'supply of service' approach here.

# <codecell>

## run the same operations with 'mirror'
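# a minimal sketch of the mirror pass, assuming the intent is simply to append
# the word 'mirror' to each query string (the exact wording is an assumption)
frames = []
for op in operations:
    df = yt.youtube_search(op + ' mirror', 500, True)
    df['query'] = op + ' mirror'
    frames.append(df)
df_ops = pd.concat([df_ops] + frames)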