/
Heavy User Attributes No SC.py
573 lines (460 loc) · 23.1 KB
/
Heavy User Attributes No SC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
# Databricks notebook source
# MAGIC %md
# MAGIC # Heavy User Attributes - No Search Counts
# COMMAND ----------
# MAGIC %md
# MAGIC What is the frequency and percentage of heavy users in 1 day for:
# MAGIC * Tab Count
# MAGIC * Window Count
# MAGIC * Active Addons Count
# MAGIC * Session Started on this Day
# MAGIC * Normalized Channel
# MAGIC * OS
# MAGIC * Is Default Browser
# MAGIC * Country
# MAGIC * Locale
# MAGIC * Default Search Engine
# COMMAND ----------
import pyspark.sql.functions as F
import pyspark.sql.types as st
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
# COMMAND ----------
# MAGIC %md
# MAGIC ###Prepare Data
# COMMAND ----------
# Global notebook parameters: the 1% sample, the analysis week, the single
# day analyzed, and the heavy-user cutoffs.  The cutoffs stay strings because
# they are spliced directly into Spark SQL filter expressions.
sample_id = 42
week_1_start = '20180920'
week_1_end = '20180926'
wday = '20180926'
wday_cond = "submission_date_s3 = '{}'".format(wday)
heavy_uri = '169'
heavy_ah = '0.93'
# COMMAND ----------
# From the telemetry docs for how clients_daily determines values: collapse
# main_summary pings to one row per client per day.  td_* columns are daily
# totals, max() columns are daily high-water marks, and first() picks an
# arbitrary ping's value for slow-changing attributes.  The three {}
# placeholders are filled via str.format with (start_date, end_date, sample_id).
sum_query = """
SELECT
client_id,
submission_date_s3,
sum(coalesce(scalar_parent_browser_engagement_total_uri_count, 0)) AS td_uri,
sum(coalesce(scalar_parent_browser_engagement_active_ticks, 0)) AS td_active_ticks,
sum(coalesce(scalar_parent_browser_engagement_active_ticks, 0)*5/3600) AS td_active_hours,
sum(subsession_length/3600) AS td_subsession_hours,
sum(CASE WHEN subsession_counter = 1 THEN 1 ELSE 0 END) AS sessions_started_on_this_day,
mean(active_addons_count) AS active_addons_count_mean,
max(scalar_parent_browser_engagement_max_concurrent_tab_count) AS tab_count_max,
max(scalar_parent_browser_engagement_max_concurrent_window_count) AS window_count_max,
max(scalar_parent_browser_engagement_unique_domains_count) AS domains_count_max,
first(profile_creation_date) AS profile_creation_date,
first(previous_build_id) AS previous_build_id,
first(normalized_channel) AS normalized_channel,
first(os) AS os,
first(normalized_os_version) AS normalized_os_version,
first(windows_build_number) AS windows_build_number,
first(install_year) AS install_year,
first(distribution_id) AS distribution_id,
count(distinct document_id) AS pings_aggregated_by_this_row,
first(app_build_id) AS app_build_id,
first(app_display_version) AS app_display_version,
first(update_channel) AS update_channel,
first(update_enabled) AS update_enabled,
first(update_auto_download) AS update_auto_download,
first(timezone_offset) AS timezone_offset,
first(vendor) AS vendor,
first(is_default_browser) AS is_default_browser,
first(default_search_engine) AS default_search_engine,
sum(devtools_toolbox_opened_count) AS devtools_toolbox_opened_count_sum,
mean(places_bookmarks_count) AS places_bookmarks_count_mean,
mean(places_pages_count) AS places_pages_count_mean,
sum(scalar_parent_browser_engagement_tab_open_event_count) AS td_tab_event_count,
sum(scalar_parent_browser_engagement_window_open_event_count) AS td_window_event_count,
first(CASE WHEN country IS NOT NULL AND country != '??' THEN country ELSE NULL END) as country,
first(CASE WHEN country IS NOT NULL AND country != '??'
THEN CASE WHEN city IS NOT NULL THEN city ELSE '??' END
ELSE NULL END) AS city,
first(geo_subdivision1) AS geo_subdivision1,
first(locale) AS locale,
first(sync_configured) AS sync_configured,
sum(sync_count_desktop) AS sync_count_desktop,
sum(sync_count_mobile) AS sync_count_mobile,
first(sample_id) AS sample_id
FROM main_summary
WHERE
app_name='Firefox'
AND submission_date_s3 >= '{}'
AND submission_date_s3 <= '{}'
AND sample_id = '{}'
GROUP BY
1, 2
"""
# Per-engine daily search counts from search_clients_daily; summed into a
# single daily total downstream.  Same (start_date, end_date, sample_id)
# placeholders as sum_query.
search_query = """
SELECT client_id,
submission_date_s3,
engine,
SUM(sap) as sap,
SUM(tagged_sap) as tagged_sap,
SUM(tagged_follow_on) as tagged_follow_on,
SUM(organic) as in_content_organic
FROM search_clients_daily
WHERE
submission_date_s3 >= '{}'
AND submission_date_s3 <= '{}'
AND sample_id = '{}'
GROUP BY
1, 2, 3
"""
# COMMAND ----------
# Run both aggregation queries over the full week of the chosen 1% sample.
ms_1week_sum = spark.sql(sum_query.format(week_1_start,week_1_end,sample_id))
search_wk = spark.sql(search_query.format(week_1_start,week_1_end,sample_id))
# COMMAND ----------
# Fill null search counts with 0s
search_wk = search_wk.na.fill(0)
# Sum over the day
# NOTE: the withColumnRenamed source name must exactly match the column name
# Spark auto-generates for the aggregate expression.
search_wk_sum = search_wk.groupBy('client_id', 'submission_date_s3') \
.agg(F.sum(F.col('sap')+F.col('in_content_organic'))) \
.withColumnRenamed('sum((sap + in_content_organic))','td_search_counts') \
.sort('client_id', 'submission_date_s3')
# Join the search table with the other counts; full outer join keeps clients
# that appear in only one of the two tables, with missing counts filled as 0.
ms_1week = ms_1week_sum.join(search_wk_sum, ['client_id', 'submission_date_s3'], 'full_outer').na.fill(0)
# Average the total daily values over the week; each rename maps Spark's
# auto-generated 'avg(col)' name to a friendlier one.
ms_1week_avg = ms_1week.groupBy('client_id').avg() \
.withColumnRenamed('avg(td_uri)','avg_uri') \
.withColumnRenamed('avg(td_active_ticks)','avg_active_ticks') \
.withColumnRenamed('avg(td_active_hours)','avg_active_hours') \
.withColumnRenamed('avg(td_subsession_hours)','avg_subsession_hours') \
.withColumnRenamed('avg(sessions_started_on_this_day)','avg_sessions_started') \
.withColumnRenamed('avg(active_addons_count_mean)','avg_addons_count') \
.withColumnRenamed('avg(tab_count_max)','avg_tab_count') \
.withColumnRenamed('avg(window_count_max)','avg_window_count') \
.withColumnRenamed('avg(domains_count_max)','avg_domains_count') \
.withColumnRenamed('avg(pings_aggregated_by_this_row)', 'avg_pings') \
.withColumnRenamed('avg(sync_count_desktop)', 'avg_sync_desktop') \
.withColumnRenamed('avg(sync_count_mobile)', 'avg_sync_mobile') \
.withColumnRenamed('avg(td_search_counts)', 'avg_search_counts')
# COMMAND ----------
# Single day (wday) of per-client daily aggregates; this dataframe is the
# default input for all the attribute frequency plots below.
ms_wday = ms_1week.where(wday_cond)
# COMMAND ----------
# MAGIC %md
# MAGIC ### Attributes of Heavy Users
# COMMAND ----------
def get_freq_arrays(heavy_cutoff, heavy_column_name, attr_column_name, gte=True, df=ms_wday):
    """
    Given a heavy cutoff, a heavy column name and an attribute column name,
    return two arrays for a frequency graph.

    Params:
      heavy_cutoff (string): cutoff defining a heavy user; kept as a string
        because it is concatenated into a Spark SQL condition
      heavy_column_name (string): name of the column for heavy user
      attr_column_name (string): name of the column for the attribute we're interested in
      gte (boolean, default True): True if >= cutoff (heavy), False if < cutoff (non-heavy)
      df (dataframe, default ms_wday): dataframe to calculate frequency arrays from

    Returns two numpy arrays for graphing: the attribute values and the
    distinct-client count per value, sorted by attribute value with null
    attribute values dropped.
    """
    # Select one day of heavy or non-heavy users via a SQL condition string.
    # 'op' avoids shadowing the builtin eval().
    op = '>=' if gte else '<'
    hu = df.where(heavy_column_name + op + heavy_cutoff)
    # Distinct-client frequency per attribute value; the rename source must
    # match Spark's auto-generated aggregate column name exactly.
    freq_hu = hu.groupby(attr_column_name) \
        .agg(F.countDistinct('client_id')).withColumnRenamed('count(DISTINCT client_id)', 'num_clients') \
        .sort(attr_column_name) \
        .filter(F.col(attr_column_name).isNotNull())
    # Collect both columns into local numpy arrays for matplotlib.
    attr_count_arr = array(freq_hu.select(attr_column_name).rdd.flatMap(lambda x: x).collect())
    num_clients_arr = array(freq_hu.select('num_clients').rdd.flatMap(lambda x: x).collect())
    if gte:
        # print() call with a single pre-formatted string works identically
        # under Python 2 and 3 (the original py2 print statement does not).
        print('max heavy num clients: {}'.format(num_clients_arr.max()))
    return attr_count_arr, num_clients_arr
def get_list_freq_arrays(attr_column_name, df=ms_wday):
    """
    Given an attribute column name, return a list of eight arrays for
    frequency graphs.  (Search-count users are intentionally excluded in
    this "No Search Counts" notebook, so there are eight arrays, not twelve.)

    Params:
      attr_column_name (string): name of the column for the attribute we're interested in
      df (dataframe, default ms_wday): dataframe to calculate frequency arrays from

    Returns a list of eight arrays for graphing:
      [0] heavy URI attribute values          [1] heavy URI client counts
      [2] non-heavy URI attribute values      [3] non-heavy URI client counts
      [4] heavy active-hours attribute values [5] heavy active-hours client counts
      [6] non-heavy active-hours attr values  [7] non-heavy active-hours client counts
    """
    arrlist = []
    # Heavy / non-heavy split by total daily URI count.
    arrlist.extend(get_freq_arrays(heavy_uri, 'td_uri', attr_column_name, True, df))
    arrlist.extend(get_freq_arrays(heavy_uri, 'td_uri', attr_column_name, False, df))
    # Heavy / non-heavy split by total daily active hours.
    arrlist.extend(get_freq_arrays(heavy_ah, 'td_active_hours', attr_column_name, True, df))
    arrlist.extend(get_freq_arrays(heavy_ah, 'td_active_hours', attr_column_name, False, df))
    return arrlist
# COMMAND ----------
def calc_plot_heavy_bars(ax, heavy_cutoff, heavy_column_name, attr_column_name, xlabel, xulim, yulim, title, width=0.4, xllim=0, yllim=0, conv_xaxis=False, pct_label=True, xlabel_ra=False):
    """
    Given a heavy cutoff, a heavy column name and an attribute column name,
    compute the frequency arrays and draw the heavy vs non-heavy bar chart.

    Params:
      ax (axis): axis for the plot
      heavy_cutoff (string): a cutoff to define a heavy user
      heavy_column_name (string): name of the column for heavy user
      attr_column_name (string): name of the column for the attribute we're interested in
      xlabel (string): x-axis label
      xulim (number): upper limit for x-axis
      yulim (number): upper limit for y-axis
      title (string): title for the plot
      width (number): width of bars (default 0.4)
      xllim (number): lower limit for x-axis (default 0)
      yllim (number): lower limit for y-axis (default 0)
      conv_xaxis (boolean): convert the x-axis arrays to number (default False)
      pct_label (boolean): add the percent label on top of the bar (default True)
      xlabel_ra (boolean): align the xlabel to the right, for long labels (default False)
    """
    # Heavy users (>= cutoff) vs non-heavy users (< cutoff), from ms_wday.
    h_attr_arr, h_clients_arr = get_freq_arrays(heavy_cutoff, heavy_column_name, attr_column_name)
    nh_attr_arr, nh_clients_arr = get_freq_arrays(heavy_cutoff, heavy_column_name, attr_column_name, False)
    plot_heavy_bars(ax, h_attr_arr, h_clients_arr, nh_attr_arr, nh_clients_arr,
                    xlabel, xulim, yulim, title, width, xllim, yllim,
                    conv_xaxis, pct_label, xlabel_ra)
    return None
# COMMAND ----------
def plot_heavy_bars(ax, h_attr_arr, h_clients_arr, nh_attr_arr, nh_clients_arr,
                    xlabel, xulim, yulim, title, width=0.4, xllim=0, yllim=0,
                    conv_xaxis=False, pct_label=True, xlabel_ra=False):
    """
    Given an axis, four frequency arrays and graph variables, draw a grouped
    bar chart of heavy vs non-heavy client counts.

    Params:
      ax (axis): axis for the plot
      h_attr_arr (array): heavy attribute array
      h_clients_arr (array): heavy number of clients array
      nh_attr_arr (array): non-heavy attribute array
      nh_clients_arr (array): non-heavy number of clients array
      xlabel (string): x-axis label
      xulim (number): upper limit for x-axis
      yulim (number): upper limit for y-axis
      title (string): title for the plot
      width (number): width of bars (default 0.4)
      xllim (number): lower limit for x-axis (default 0)
      yllim (number): lower limit for y-axis (default 0)
      conv_xaxis (boolean): treat the attribute arrays as categorical and
        convert them to numeric tick positions (default False)
      pct_label (boolean): add the percent label on top of the bar (default True)
      xlabel_ra (boolean): align the xlabel to the right, for long labels (default False)
    """
    if (conv_xaxis):
        xllim = 1
        # The two categorical axes must line up bar-for-bar; reconcile any
        # category present in only one of the two arrays.
        if (len(h_attr_arr) != len(nh_attr_arr)):
            print('Mismatch in xarray lengths!')
            extras = np.setxor1d(h_attr_arr, nh_attr_arr)
            for item in extras:
                print(item)
                if item in nh_attr_arr:  # extra item in non-heavy array
                    nhidx = nh_attr_arr.tolist().index(item)
                    print('Delete ' + item + ' from non-heavy array')
                    nh_attr_arr = np.delete(nh_attr_arr, nhidx)
                    nh_clients_arr = np.delete(nh_clients_arr, nhidx)
                else:  # extra item in heavy array
                    hidx = h_attr_arr.tolist().index(item)
                    print('Insert ' + item + ' into non-heavy array')
                    nh_attr_arr = np.insert(nh_attr_arr, hidx, item)
                    nh_clients_arr = np.insert(nh_clients_arr, hidx, 0)
        # Replace categorical values with evenly spaced numeric tick positions.
        xticks = array(list(range(1, len(h_attr_arr)+1)))
        ax.set_xticks(xticks)
        if (xlabel_ra):
            ax.set_xticklabels(h_attr_arr, rotation=45, ha='right')
        else:
            ax.set_xticklabels(h_attr_arr, rotation=45)
        h_attr_arr = xticks
        nh_attr_arr = xticks
    # the bar chart: non-heavy bars left of each tick, heavy bars right
    nhbars = ax.bar(nh_attr_arr - width/2, nh_clients_arr, width,
                    label='Non Heavy', color='lightsalmon')
    hbars = ax.bar(h_attr_arr + width/2, h_clients_arr, width,
                   label='Heavy', color='mediumaquamarine')
    ax.set_xlabel(xlabel)
    ax.set_xlim(xllim-1, xulim)
    ax.set_ylabel('Number of Clients')
    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax.set_ylim(yllim, yulim)
    ax.set_title(title)
    ax.legend(loc='best')
    yrange = yulim - yllim
    if (pct_label):
        # Add percent labels above the heavy bars: heavy share of all
        # clients at that x position, for bars inside the visible x range.
        for hrect, nhrect in zip(hbars, nhbars):
            xloc = hrect.get_x()
            if (xloc >= xllim and xloc <= (xulim - 1)):
                hheight = hrect.get_height()
                nhheight = nhrect.get_height()
                htext = hheight*1.0/(hheight + nhheight)*100
                ax.text(xloc + hrect.get_width()/2.0, hheight+.01*yrange, '{0:,.2f}%'.format(htext),
                        ha='center', va='bottom', rotation='vertical', fontsize=8)
    return None
def plot_three_heavy_bars(arrlist, fig_size, xlabel, title, xulim, yulim,
                          width=0.4, xllim=0, yllim=0, conv_xaxis=False,
                          pct_label=True, xlabel_ra=False, rows=True):
    """
    Given a list of eight arrays and some graph parameters, prepare a figure
    with two bar charts (URI-based and active-hours-based heavy users).
    The name is kept for compatibility: the third chart, search counts, was
    dropped in this "No Search Counts" notebook.

    Params:
      arrlist (list of arrays): list of 8 arrays to graph
        [0] heavy URI attribute array
        [1] heavy URI number of clients array
        [2] non-heavy URI attribute array
        [3] non-heavy URI number of clients array
        [4] heavy active hours attribute array
        [5] heavy active hours number of clients array
        [6] non-heavy active hours attribute array
        [7] non-heavy active hours number of clients array
      fig_size (tuple of integers): size of the figure
      xlabel (string): x-axis label
      title (string): title for the plot
      xulim (number): upper limit for x-axis
      yulim (number): upper limit for y-axis
      width (number): width of bars (default 0.4)
      xllim (number): lower limit for x-axis (default 0)
      yllim (number): lower limit for y-axis (default 0)
      conv_xaxis (boolean): convert the x-axis arrays to number (default False)
      pct_label (boolean): add the percent label on top of the bar (default True)
      xlabel_ra (boolean): align the xlabel to the right, for long labels (default False)
      rows (boolean): stack the 2 charts vertically if true, side by side if false (default True)
    """
    plt.gcf().clear()
    if rows:
        fig, (ax1, ax3) = plt.subplots(nrows=2, ncols=1, figsize=fig_size)
    else:
        fig, (ax1, ax3) = plt.subplots(nrows=1, ncols=2, figsize=fig_size)
    # the bar chart for uri
    title_uri = title + ' for URI Users'
    plot_heavy_bars(ax1, arrlist[0], arrlist[1], arrlist[2], arrlist[3], xlabel, xulim, yulim,
                    title_uri, width, xllim, yllim, conv_xaxis, pct_label, xlabel_ra)
    # the bar chart for ah
    title_ah = title + ' for Active Hours Users'
    plot_heavy_bars(ax3, arrlist[4], arrlist[5], arrlist[6], arrlist[7], xlabel, xulim, yulim,
                    title_ah, width, xllim, yllim, conv_xaxis, pct_label, xlabel_ra)
    plt.tight_layout()
    # Databricks-provided display() renders the figure in the notebook cell.
    display(fig)
    return None
# COMMAND ----------
# MAGIC %md
# MAGIC ####Tab Count
# COMMAND ----------
# Frequency of max concurrent tab count, heavy vs non-heavy.
list_tc_plot_arrs = get_list_freq_arrays('tab_count_max')
plot_three_heavy_bars(list_tc_plot_arrs, (10,8), 'Tab Count', 'Frequency of Tab Counts', 45, 32000)
# COMMAND ----------
# Zoomed view of the tail: tab counts of 10 and above.
plot_three_heavy_bars(list_tc_plot_arrs, (10,8), 'Tab Count', 'Frequency of Tab Counts', 40, 12500, xllim=10)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Window Count
# COMMAND ----------
list_wc_plot_arrs = get_list_freq_arrays('window_count_max')
plot_three_heavy_bars(list_wc_plot_arrs, (11,7), 'Window Count', 'Frequency', 12, 110000, rows=False)
# COMMAND ----------
# Zoomed view of the tail: window counts of 7 and above.
plot_three_heavy_bars(list_wc_plot_arrs, (12,6), 'Window Count', 'Frequency', 25, 2000, xllim=7, rows=False)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Active Addons Count
# COMMAND ----------
list_aa_plot_arrs = get_list_freq_arrays('active_addons_count_mean')
plot_three_heavy_bars(list_aa_plot_arrs, (12,6), 'Active Addons Count', 'Frequency', 25, 72000, pct_label=False, rows=False)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Sessions Started on this Day
# COMMAND ----------
list_ss_plot_arrs = get_list_freq_arrays('sessions_started_on_this_day')
plot_three_heavy_bars(list_ss_plot_arrs, (10,8), 'Sessions Started', 'Frequency of Sessions Started', 25, 60000)
# COMMAND ----------
# Zoomed view of the tail: 11+ sessions started.
plot_three_heavy_bars(list_ss_plot_arrs, (10,8), 'Sessions Started', 'Frequency of Sessions Started', 40, 3700, xllim=11)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Normalized Channel
# COMMAND ----------
# Categorical attribute: conv_xaxis converts channel names to tick positions.
list_nc_plot_arrs = get_list_freq_arrays('normalized_channel')
plot_three_heavy_bars(list_nc_plot_arrs, (10,6), 'Normalized Channel', 'Frequency', 7, 250000,
conv_xaxis=True, rows=False)
# COMMAND ----------
# Same data with a much smaller y-limit to make minority channels visible.
plot_three_heavy_bars(list_nc_plot_arrs, (10,6), 'Normalized Channel', 'Frequency', 7, 14000, conv_xaxis=True, rows=False)
# COMMAND ----------
# MAGIC %md
# MAGIC ####OS
# COMMAND ----------
list_os_plot_arrs = get_list_freq_arrays('os')
plot_three_heavy_bars(list_os_plot_arrs, (10,6), 'OS', 'Frequency', 5, 230000, conv_xaxis=True, rows=False)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Is Default Browser
# COMMAND ----------
list_db_plot_arrs = get_list_freq_arrays('is_default_browser')
plot_three_heavy_bars(list_db_plot_arrs, (10,6), 'Is Default Browser', 'Frequency', 3, 450000, conv_xaxis=True, rows=False)
# COMMAND ----------
#country - 239
# Too many distinct countries to plot; restrict to the top 20 by volume.
top_ctry = ['US', 'DE', 'FR', 'IN', 'BR', 'RU', 'PL', 'CN', 'ID', 'IT', 'ES', 'GB', 'JP', 'CA', 'MX', 'IR', 'AU', 'UA', 'NL', 'CZ']
ctr_wday = ms_wday.where(F.col('country').isin(top_ctry))
list_c_plot_arrs = get_list_freq_arrays('country', df=ctr_wday)
plot_three_heavy_bars(list_c_plot_arrs, (10,8), 'Country', 'Frequency of Country', 21, 47000, conv_xaxis=True)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Locale
# COMMAND ----------
#locale - 93!
# First tier of locales by volume.
top_locale = ['en-US', 'de', 'fr', 'ru', 'es-ES']
locale_wday = ms_wday.where(F.col('locale').isin(top_locale))
list_tl_plot_arrs = get_list_freq_arrays('locale', df=locale_wday)
plot_three_heavy_bars(list_tl_plot_arrs, (12,6), 'Locale', 'Frequency', 6, 100000, conv_xaxis=True, rows=False)
# COMMAND ----------
#locale - 93!
# Second tier of locales, plotted separately so the smaller counts are visible.
sec_locale = ['pt-BR', 'pl', 'zh-CN', 'it', 'en-GB', 'ja', 'es-MX', 'nl', 'cs', 'hu', 'es-AR', 'en-CA', 'id', 'tr', 'zh-TW', 'el']
slocale_wday = ms_wday.where(F.col('locale').isin(sec_locale))
list_sl_plot_arrs = get_list_freq_arrays('locale', df=slocale_wday)
plot_three_heavy_bars(list_sl_plot_arrs, (10,10), 'Locale', 'Frequency', 17, 38000, conv_xaxis=True)
# COMMAND ----------
# MAGIC %md
# MAGIC ####Default Search Engine
# COMMAND ----------
#default_search_engine - 1329
# too many for one plot - select a set of search engines, and pass in that df
# change get_freq_arrays to take an optional df
top_dse = ['google-2018', 'baidu', 'yandex-ru', 'bing', 'ddg', 'yasearch']
tdse_wday = ms_wday.where(F.col('default_search_engine').isin(top_dse))
list_tdse_plot_arrs = get_list_freq_arrays('default_search_engine', df=tdse_wday)
plot_three_heavy_bars(list_tdse_plot_arrs, (10,7), 'Default Search Engine', 'Frequency', 7, 40000, conv_xaxis=True, rows=False)
# COMMAND ----------
#default_search_engine - 1329
# too many for one plot - select a set of search engines, and pass in that df
# change get_freq_arrays to take an optional df
sec_dse = ['other-Bing Search Engine', 'other-Ask Web Search', 'other-Yahoo! Powered',
'other-Bing Search', 'other-Ad-Aware SecureSearch', 'google-nocodes', 'yandex-tr']
sdse_wday = ms_wday.where(F.col('default_search_engine').isin(sec_dse))
list_sdse_plot_arrs = get_list_freq_arrays('default_search_engine', df=sdse_wday)
plot_three_heavy_bars(list_sdse_plot_arrs, (11,7), 'Default Search Engine', 'Frequency', 8, 5500,
conv_xaxis=True, xlabel_ra=True, rows=False)
# COMMAND ----------
#default_search_engine - 1329
# too many for one plot - select a set of search engines, and pass in that df
# change get_freq_arrays to take an optional df
third_dse = ['yahoo', 'yandex', 'other-YHS', 'google-2018-sysaddon', 'other-Norton Search', 'qwant']
thdse_wday = ms_wday.where(F.col('default_search_engine').isin(third_dse))
list_thdse_plot_arrs = get_list_freq_arrays('default_search_engine', df=thdse_wday)
plot_three_heavy_bars(list_thdse_plot_arrs, (11,7), 'Default Search Engine', 'Frequency', 7, 3000,
xllim=1, conv_xaxis=True, xlabel_ra=True, rows=False)
# COMMAND ----------