-
Notifications
You must be signed in to change notification settings - Fork 0
/
uploaded_ad_data.py
98 lines (80 loc) · 3.62 KB
/
uploaded_ad_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import csv
from shared import make_stream_id, storage_formatter_factory
def is_ad(url):
    """Return True if ``url`` looks like an ad-serving request.

    Matches against a fixed list of substrings observed in ad traffic
    (DoubleClick, FreeWheel ``fwmrm.net``, generic ``pagead``/``api/ads``
    paths, ...).
    """
    ad_strings = ['instream_ad', 'pagead', 'doubleclick.net', 'ad_data_204', 'stats/ads', 'fwmrm.net/ad', 'api/ads']
    # any() short-circuits on the first hit and, unlike the old
    # reduce(lambda x, y: x or y, ...), needs no import on Python 3.
    return any(marker in url for marker in ad_strings)
def get_ad_streams(input_file, home_ip, website='youtube.com', is_incoming=False):
    """Group request URLs by TCP stream for traffic between home_ip and website.

    Reads the packet CSV at ``input_file`` and, for every non-ACK TCP row
    whose website column contains ``website`` and whose src (or dst, when
    ``is_incoming`` is True) equals ``home_ip``, records the row's non-empty
    URL under the stream id produced by ``make_stream_id``.

    Returns a dict mapping stream id -> list of URLs seen on that stream.
    """
    streams = {}
    with open(input_file, 'rb') as csv_file:
        for row in csv.DictReader(csv_file, delimiter=','):
            # Skip ACK-only packets and anything that is not TCP.
            if row['is_ack'] == 'True' or row['protocol'] != 'TCP':
                continue
            if website not in row['website']:
                continue
            # Direction of interest: dst for downloads, src for uploads.
            endpoint = 'dst' if is_incoming else 'src'
            if row[endpoint] != home_ip or row['url'] == '':
                continue
            streams.setdefault(make_stream_id(row), []).append(row['url'])
    return streams
def get_stream_sizes(input_file, home_ip, stream_ids, is_incoming=False):
    """Sum per-stream packet payload sizes, plus the overall total.

    Reads the packet CSV at ``input_file`` and, for every non-ACK TCP row
    whose src (or dst, when ``is_incoming`` is True) equals ``home_ip``,
    adds the row's ``len`` to the running total and, when the row's stream
    id is one of ``stream_ids``, to that stream's bucket.

    Returns ``(sizes, all_sizes)`` where ``sizes[i]`` corresponds to the
    i-th element of ``stream_ids`` (in iteration order) and ``all_sizes``
    is the total over all matching rows.
    """
    stream_ids = list(stream_ids)  # fix an iteration order (callers may pass a set)
    # Map stream id -> bucket index once, instead of a linear scan per row
    # (the original was O(rows * streams)).  setdefault keeps first-match
    # semantics if a stream id is duplicated, matching the old early break.
    index_of = {}
    for i, stream_id in enumerate(stream_ids):
        index_of.setdefault(stream_id, i)
    sizes = [0] * len(stream_ids)
    all_sizes = 0
    with open(input_file, 'rb') as csv_file:
        data_reader = csv.DictReader(csv_file, delimiter=',')
        for row in data_reader:
            if row['is_ack'] == 'True':
                continue
            if row['protocol'] != 'TCP':
                continue
            if row['dst' if is_incoming else 'src'] != home_ip:
                continue
            length = int(row['len'])
            all_sizes += length
            i = index_of.get(make_stream_id(row))
            if i is not None:
                sizes[i] += length
    return sizes, all_sizes
def get_ad_sizes(print_urls=False):
input_file = 'chrome_combined_dataset.csv'
home_ip = '10.0.2.15'
website = 'youtube.com'
ad_dict = get_ad_streams(input_file, home_ip, website=website, is_incoming=False)
qu = []
for stream_id, url_list in ad_dict.items():
if reduce(lambda x, y: x and y, map(is_ad, url_list), True):
qu.append(stream_id)
if print_urls:
print stream_id, len(url_list)
for url in url_list:
print '\t', url
sizes, total = get_stream_sizes(input_file, home_ip, qu)
print 'Total ad data sent:', storage_formatter_factory(unit_speed=False)(sum(sizes))
print 'Total data sent:', storage_formatter_factory(unit_speed=False)(total)
print 'Percentage ad data {0:0.1f}%'.format(sum(sizes) * 100 / float(total))
def netflix_uploaded_data():
input_file = 'hannibal_dump/chrome.csv'
home_ip = '192.168.1.2'
website = 'netflix.com'
ad_dict = get_ad_streams(input_file, home_ip, website=website, is_incoming=False)
# https://www.netflix.com/api/msl/NFCDCH-LX-/cadmium/pblifecycle
stream_ids = set([])
for key in ad_dict:
# print key
for url in ad_dict[key]:
if 'pblifecycle' in url:
stream_ids.add(key)
# if 'nflxvideo.net/range' in url:
# stream_ids.add(key)
# print '\t', url
sizes, total = get_stream_sizes(input_file, home_ip, stream_ids)
print 'Total data sent:', storage_formatter_factory(unit_speed=False)(total)
print 'Percentage range data {0:0.1f}%'.format(sum(sizes) * 100 / float(total))
print 'Total ad data sent:', storage_formatter_factory(unit_speed=False)(sum(sizes))
# Script entry point: analyse the Netflix capture by default; swap the
# calls to run the YouTube ad analysis instead.
if __name__ == '__main__':
    netflix_uploaded_data()
    #get_ad_sizes(print_urls=True)