This repository has been archived by the owner on Jun 22, 2018. It is now read-only.
/
experiments.py
executable file
·1181 lines (927 loc) · 31.2 KB
/
experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/python
'''
Name: Sravan Bhamidipati
Date: 5th February, 2013
Purpose: To do different experiments on the collected logs. Functions annotated
with "INTERFACE" are high-level and can be called by the user. Functions
annotated with "INTERNAL" are internal ones which may be called by the interface
functions.
'''
import adAnalyzer, adLib, adOps, adParser
import ast, os, itertools, pylab, random, re, sys
def make_dir(dirpath):
    '''Make sure the directory used to save experiment results exists.

    An existing directory is left untouched. Missing intermediate
    directories are created along with the leaf.

    Args:
        dirpath: Path to directory.
    Raises:
        SystemExit: With status -1 when dirpath exists but is not a
        directory.
    '''
    if os.path.isdir(dirpath):
        return
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        return
    # Something that is not a directory already occupies the path.
    print("ERROR: %s exists and is not a directory." % dirpath)
    sys.exit(-1)
def types_count_str(types_count):
    '''Render a types_count dictionary as one tab-separated string.

    Args:
        types_count: Dictionary with the keys "D", "R" and "X" mapped to
        their respective counts.
    Return:
        The three counts, in D, R, X order, joined by tabs.
    '''
    ordered_counts = [str(types_count[kind]) for kind in ("D", "R", "X")]
    return "\t".join(ordered_counts)
def churn(adset_file, results_dir):
    '''INTERFACE: Churn is the number of ads per number of trials.

    For every user in the config, the ads of successive trials are merged
    cumulatively. After each trial one row (user, trial number, number of
    ads so far, D/R/X counts) is recorded. The rows are written to
    "<results_dir>/churn.txt" and each user's final ad list is written to
    "<results_dir>/<user>.txt".

    Args:
        adset_file: Config file like "accounts.cf".
        results_dir: Directory path to save experiment results.
    '''
    file_set_lists = adParser.parse_conf(adset_file)
    ad_truth = adLib.true_ds_of_ads("dbs/adTruth.db")
    rows = ["User\tTrials\tAll\tD\tR\tX\n"]
    make_dir(results_dir)

    for user in file_set_lists:
        seen_ads = []
        # Trial 0 is a synthetic all-zero starting point for every user.
        rows.append("\t".join([user, "0", "0", "0", "0", "0"]) + "\n")

        for trial, file_set in enumerate(file_set_lists[user], 1):
            trial_ads = adParser.parse_html_set(file_set)
            seen_ads = adOps.union([seen_ads, trial_ads])
            type_counts = adLib.ad_types_count(seen_ads, ad_truth)
            fields = [user, str(trial), str(len(seen_ads)),
                      types_count_str(type_counts)]
            rows.append("\t".join(fields) + "\n")

        with open(results_dir + "/" + user + ".txt", "w") as fd:
            fd.write(adOps.get_ads_str(seen_ads))

    with open(results_dir + "/churn.txt", "w") as fd:
        fd.write("".join(rows))
def avg_churn(churn, min_uid, max_uid):
    '''INTERNAL: Find the avg churn across users in a range.

    Entries whose name contains "avg" are ignored. Averaging stops at the
    first trial index that any selected user lacks, and never goes beyond
    100 trials, so every returned list has the same length.

    Args:
        churn: Multi-level dictionary of users, trials/total/d_s/r_s/x_s/
        knee_x/knee_y, with values for number of trials and corresponding
        number of ads, targeted ads, random ads and other targeted ads.
        min_uid: Integer, lowest user id (inclusive) to average over.
        max_uid: Integer, highest user id (inclusive) to average over.
    Return:
        avgs: A dictionary of lists of avg trials, total, d_s, r_s, x_s for
        each trial across the selected users. The averages are floats; all
        lists are empty when no user falls in the range.
    '''
    avgs = {"trials": [], "total": [], "d_s": [], "r_s": [], "x_s": []}

    selected = []
    for user in churn:
        if "avg" in user:
            continue
        # User names look like "ccloudauditor<id>"; strip() removes the
        # letters of the prefix and leaves the numeric id.
        uid = int(user.strip("ccloudauditor"))
        if min_uid <= uid <= max_uid:
            selected.append(churn[user])
    if not selected:
        return avgs

    # Only trial indexes available for every selected user are averaged.
    shared_trials = min(len(record["trials"]) for record in selected)
    user_count = float(len(selected))
    for i in range(min(shared_trials, 100)):
        for key in avgs:
            # float() keeps Python 2 from truncating the average to an int.
            avgs[key].append(sum(record[key][i] for record in selected)
                             / user_count)
    return avgs
def save_churn_png(results_dir, churn, min_uid, max_uid, knee):
    '''INTERNAL: Save the churn PNG with all rings and bells.

    One figure is produced per series (total, d_s, r_s, x_s) and saved as
    "<results_dir>/<series>-<min_uid>-<max_uid>.png". The "total" figure
    additionally marks each user's knee point and the common knee line.

    Args:
        results_dir: Directory path to save experiment results.
        churn: Multi-level dictionary of users, trials/total/d_s/r_s/x_s/
        knee_x/knee_y, with values for number of trials and corresponding
        number of ads, targeted ads, random ads and other targeted ads.
        min_uid: Integer, lowest user id (inclusive) to plot.
        max_uid: Integer, highest user id (inclusive) to plot.
        knee: The "fictional" knee point for all users.
    '''
    # (y-axis label, figure title) for every plotted series.
    captions = {
        "total": ("All Ads", "Number of All Ads Vs Number of Trials"),
        "d_s": ("TargetedDs", "Number of TargetedDs Vs Number of Trials"),
        "r_s": ("Rs", "Number of Rs Vs Number of Trials"),
        "x_s": ("TargetedXs", "Number of TargetedXs Vs Number of Trials"),
    }

    for y_key in ("total", "d_s", "r_s", "x_s"):
        is_total = (y_key == "total")

        for user in churn:
            if "avg" in user:
                continue
            uid = int(user.strip("ccloudauditor"))
            if not (min_uid <= uid <= max_uid):
                continue
            record = churn[user]
            pylab.plot(record["trials"], record[y_key], "-", label=user)
            if is_total:
                pylab.plot(record["knee_x"], record["knee_y"], "o",
                           color="black")

        pylab.xticks(range(0, 100, 10))
        pylab.xlim([0, 100])
        pylab.xlabel("Trials")
        pylab.legend(loc="best", prop={'size':10})

        if is_total:
            pylab.axvline(x=knee, color="black")
            pylab.annotate("knee=" + str(knee), xy=(knee, 0))
        y_label, title = captions[y_key]
        pylab.ylabel(y_label)
        pylab.title(title)

        png_name = y_key + "-" + str(min_uid) + "-" + str(max_uid) + ".png"
        pylab.savefig(results_dir + "/" + png_name)
        pylab.clf()
def save_avg_churn_png(results_dir, churn):
    '''INTERNAL: Save the average churn PNGs.

    Only entries whose name contains "avg" are plotted. Each one gets its
    own figure, saved as "<results_dir>/<name>.png", with one line per
    series other than "trials".

    Args:
        results_dir: Directory path to save experiment results.
        churn: Multi-level dictionary of users, trials/total/d_s/r_s/x_s/
        knee_x/knee_y, with values for number of trials and corresponding
        number of ads, targeted ads, random ads and other targeted ads.
    '''
    averaged = [name for name in churn if "avg" in name]

    for user in averaged:
        series = churn[user]
        for y_key in series:
            if y_key != "trials":
                pylab.plot(series["trials"], series[y_key], "-",
                           label=user + ", " + y_key)

        pylab.xticks(range(0, 100, 10))
        pylab.xlim([0, 100])
        pylab.xlabel("Trials")
        pylab.ylabel("Number of Ads")
        pylab.title("Avg. Number of Ads Vs Number of Trials")
        pylab.legend(loc="best", prop={'size':10})
        pylab.savefig(results_dir + "/" + user + ".png")
        pylab.clf()
def plot_churn(results_dir):
    '''INTERFACE: Plot the churn file as PNGs.

    Reads "<results_dir>/churn.txt" (as written by churn), finds for every
    user the first trial at which more than 75% of that user's final ad
    count has been seen (the knee), prints the common knee, and saves the
    per-user and averaged plots for user ids 10-20 and 21-30.

    Args:
        results_dir: Directory path to save experiment results.
    '''
    columns = ("trials", "total", "d_s", "r_s", "x_s")
    churn = {}
    with open(results_dir + "/churn.txt", "r") as fd:
        for line in fd:
            if "Trials" in line:
                # Header row.
                continue
            user, trials, total, d_s, r_s, x_s = line.strip().split()
            values = [int(trials), int(total), int(d_s), int(r_s), int(x_s)]
            if user not in churn:
                churn[user] = {"trials": [], "total": [], "d_s": [],
                               "r_s": [], "x_s": []}
            for column, value in zip(columns, values):
                churn[user][column].append(value)

    ratio = 0.75
    knees = []
    for user in churn:
        totals = churn[user]["total"]
        for index in range(len(churn[user]["trials"])):
            if totals[index] / float(totals[-1]) > ratio:
                churn[user]["knee_x"] = index
                churn[user]["knee_y"] = totals[index]
                knees.append(index)
                break

    knees.sort()
    # "//" keeps the integer index arithmetic of the Python 2 original.
    middle = len(knees) // 2
    if len(knees) > 2:
        knee = knees[middle + 1]
    else:
        knee = knees[middle]
    print("Knee: %s" % knee)

    churn["avg_10-20"] = avg_churn(churn, 10, 20)
    churn["avg_21-30"] = avg_churn(churn, 21, 30)
    make_dir(results_dir)
    for low, high in ((10, 20), (21, 30)):
        save_churn_png(results_dir, churn, low, high, knee)
    save_avg_churn_png(results_dir, churn)
def all_ads(adset_file):
    '''INTERFACE: Print all ads to stderr. Needs a line in adParser to be
    uncommented.

    The file sets of every user and trial in the config are merged into a
    single set, which is then handed to adParser.parse_html_set.

    Args:
        adset_file: Config file like "accounts.cf".
    '''
    file_sets = adParser.parse_conf(adset_file)
    merged = set()
    for user in file_sets:
        for trial_files in file_sets[user]:
            merged |= trial_files
    adParser.parse_html_set(merged)
def unique_ads(ads_file, unique_ads_file):
    '''INTERFACE: Extract all unique ads from a file containing a list of ads
    (probably dumped before ad matching).

    An ad is the run of lines from a line containing "START" up to and
    including the next line containing "END". Lines outside such a run are
    ignored. Unique ads are written in sorted order so that the output is
    reproducible.

    Args:
        ads_file: File containing the string format of a list of ads.
        unique_ads_file: File where the unique ads seen in ads_file are
        written. Its parent directory is created when missing.
    '''
    ad_set = set()
    inside_ad = False
    current = []
    with open(ads_file, "r") as fd:
        for line in fd:
            if "START" in line:
                inside_ad = True
            if inside_ad:
                current.append(line)
            if "END" in line:
                inside_ad = False
                ad_set.add("".join(current))
                current = []

    # The original called make_dir() on an undefined name here; what is
    # actually needed is the directory that will hold the output file.
    out_dir = os.path.dirname(unique_ads_file)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(unique_ads_file, "w") as fd:
        for ad in sorted(ad_set):
            fd.write(ad)
'''
AWK script to measure performance of ad matching.
awk 'BEGIN {tp=fp=fn=0} {for (i=1; i<=NF; i++) {if($i~/TP/) tp+=$i; if($i~/FP/) fp+=$i; if($i~/FN/) fn+=$i}} END {print tp" "fp" "fn}' performance_file
'''
def compare_accounts(adset_file, results_dir):
    '''INTERFACE: Compare a "base" and "other" account to see which of the ads
    in "base" are found in "other".

    For each base trial, the ads of the other account are accumulated trial
    by trial and removed from the base ads. A row is recorded whenever the
    number of not-found base ads changes, and at the last other trial if
    nothing was recorded before. The rows go to
    "<results_dir>/results.txt".

    Args:
        adset_file: Config file like "accounts.cf" specifying "base" and
        "other".
        results_dir: Directory path to save experiment results.
    '''
    file_sets = adParser.parse_conf(adset_file)
    ad_truth = adLib.true_ds_of_ads("dbs/adTruth.db")

    if "base" not in file_sets or "other" not in file_sets:
        print("ERROR: %s doesn't specify base and other accounts."
              % adset_file)
        return
    base_file_sets = file_sets["base"]
    other_file_sets = file_sets["other"]

    make_dir(results_dir)
    rows = ["Base\tCount\tDs\tRs\tXs\tOther\tNF\tDs\tRs\tXs\tCommon\tDs\tRs\tXs\n"]
    last_other = len(other_file_sets) - 1

    for b in range(len(base_file_sets)):
        base_ads = adParser.parse_html_set(base_file_sets[b])
        base_count = len(base_ads)
        base_tc = adLib.ad_types_count(base_ads, ad_truth)
        other_ads = []
        prev_diff = base_count
        printed = False

        for o in range(len(other_file_sets)):
            newly_seen = adParser.parse_html_set(other_file_sets[o])
            other_ads = adOps.union([other_ads, newly_seen])
            # base_ads shrinks to the base ads not yet seen in "other".
            base_ads = adOps.difference(base_ads, other_ads)
            diff = len(base_ads)
            diff_tc = adLib.ad_types_count(base_ads, ad_truth)
            common = base_count - diff
            common_tc = dict((key, base_tc[key] - diff_tc[key])
                             for key in base_tc)

            if diff != prev_diff or (o == last_other and not printed):
                printed = True
                rows.append("\t".join([
                    str(b), str(base_count), types_count_str(base_tc),
                    str(o), str(diff), types_count_str(diff_tc),
                    str(common), types_count_str(common_tc)]) + "\n")
                prev_diff = diff

            if diff == 0:
                # Every base ad has been found; later trials add nothing.
                break

    with open(results_dir + "/results.txt", "w") as fd:
        fd.write("".join(rows))
def plot_comparison(results_dir):
    '''INTERFACE: Plot the results of comparing two accounts for common ads.

    Reads "<results_dir>/results.txt" and saves
    "<results_dir>/results.png". Two row layouts are understood:
      * 4 columns: base trial, ad count, other trial, ads not found.
      * 14 columns, as written by compare_accounts: base trial, ad count,
        D/R/X counts, other trial, ads not found, and further counts.
    Header and blank lines (first field not a number) are skipped. When a
    base trial has several rows, the last one wins.

    Args:
        results_dir: Directory path containing the comparison results file
        and also where the comparison plot should be saved.
    Raises:
        ValueError: When a data row has neither of the known layouts.
    '''
    bases = []
    counts = []
    others = []
    misses = []
    founds = []

    with open(results_dir + "/results.txt", "r") as fd:
        for line in fd:
            words = line.strip().split()
            if not words or not words[0].isdigit():
                # Blank line or header ("BaseTrial ..." / "Base Count ...").
                continue
            if len(words) == 4:
                base, count, other, missed = words
            elif len(words) == 14:
                # compare_accounts layout: "Other" is column 5 and "NF"
                # (not found) is column 6.
                base, count, other, missed = (words[0], words[1], words[5],
                                              words[6])
            else:
                raise ValueError("Unrecognised comparison row: " +
                                 line.strip())

            # Trials are stored 0-based but plotted 1-based.
            base = int(base) + 1
            count = int(count)
            other = int(other) + 1
            missed = int(missed)
            found = 1 - (missed / float(count))

            if bases and bases[-1] == base:
                others[-1] = other
                misses[-1] = missed
                founds[-1] = found
            else:
                bases.append(base)
                counts.append(count)
                others.append(other)
                misses.append(missed)
                founds.append(found)

    if not bases:
        print("ERROR: %s/results.txt has no comparison rows." % results_dir)
        return

    pylab.ylim([0, 1])
    pylab.yticks(adLib.float_range(0, 1, 0.1))
    pylab.xlabel("Base Trial")
    pylab.ylabel("Fraction of Ads Found")
    pylab.plot(bases, founds, "b.", label="Found " + \
            str(round(sum(founds)/len(founds), 3)))
    pylab.legend(loc="upper left", prop={'size':10})
    pylab.title("Common Ads in Identical Accounts")

    # Second y-axis: how many "other" trials were needed per base trial.
    pylab.twinx()
    pylab.xlim([0, 100])
    pylab.xticks(range(0, 100, 10))
    pylab.ylim([0, 15])
    pylab.yticks(range(0, 15, 1))
    pylab.ylabel("Number of Trials to Find Base Ads")
    pylab.plot(bases, others, "r.", label="In Trials " + \
            str(round(sum(others)/float(len(others)), 3)))
    pylab.legend(loc="lower right", prop={'size':10})

    pylab.savefig(results_dir + "/results.png")
    pylab.clf()
    print("%s %s %s" % (results_dir, sum(counts)/float(len(counts)),
                        sum(misses)/float(len(misses))))
def analyze_comparison(results_file):
    '''INTERFACE: To compute various averages of a comparison.

    Every data row starts with a base trial index. When a base trial has
    several rows only the last one is kept. The remaining columns are then
    averaged over all base trials and printed on one tab-separated line:
    file name without ".txt", number of base trials, column averages.

    Args:
        results_file: File like the "results.txt" of compare_accounts.
    '''
    results = []
    with open(results_file, "r") as fd:
        for line in fd:
            if "Base" in line:
                # Header row.
                continue
            ints = [int(word) for word in line.strip().split()]
            if not ints:
                # Blank line.
                continue
            base = ints.pop(0)
            if base == len(results):
                results.append(ints)
            else:
                results[base] = ints

    if not results:
        # Reported on stderr so that redirected stdout, which is meant to
        # hold only result lines, stays clean.
        sys.stderr.write("ERROR: %s has no comparison rows.\n" % results_file)
        return

    row_count = float(len(results))
    avgs = [sum(row[i] for row in results) / row_count
            for i in range(len(results[0]))]

    # str.strip(".txt") would remove any leading/trailing '.', 't' or 'x'
    # characters (e.g. "test.txt" -> "es"); drop the suffix explicitly.
    name = os.path.basename(results_file)
    if name.endswith(".txt"):
        name = name[:-len(".txt")]

    result = name + "\t" + str(len(results)) + "\t" + \
            "\t".join([str(x) for x in avgs])
    print(result)
def analyze_comparisons(comparisons_file):
    '''INTERFACE: To compute various averages across comparisons.

    Lines are grouped by their first field, using the first matching
    pattern below, and lines matching none are ignored. For every group
    the number of lines is printed, followed by the per-column average,
    maximum and minimum of the remaining fields, and a blank line.

    Args:
        comparisons_file: File with various analyze_comparison outputs.
    '''
    # Order matters: a name is assigned to the first pattern it matches.
    groups = [
        (re.compile(r'similar_1|similar_20'), []),
        (re.compile(r'similar_2\d-1|similar_30-20'), []),
        (re.compile(r'similar_[23]'), []),
    ]

    with open(comparisons_file, "r") as fd:
        for line in fd:
            words = line.strip().split()
            if not words:
                continue
            for pattern, rows in groups:
                if pattern.match(words[0]):
                    rows.append(words[1:])
                    break

    for _, rows in groups:
        print(len(rows))
        # Transpose rows into columns; an empty group has no columns.
        columns = [[float(value) for value in column]
                   for column in zip(*rows)]
        avg = [sum(column) / len(column) for column in columns]
        # min()/max() instead of the 1000.0 / 0.0 sentinels, which gave
        # wrong answers for values above 1000 or below 0.
        maxi = [max(column) for column in columns]
        mini = [min(column) for column in columns]
        print("Avg " + "\t".join([str(x) for x in avg]))
        print("Max " + "\t".join([str(x) for x in maxi]))
        print("Min " + "\t".join([str(x) for x in mini]))
        print("")
def write_conf(base, other):
    '''Write a conf file for comparing accounts.

    The file "tests/test.cf" is overwritten with two tab-separated lines
    that map "base" and "other" to the log directories of the given users.

    Args:
        base: Id appended to "logs/ccloudauditor" for the base account.
        other: Id appended to "logs/ccloudauditor" for the other account.
    '''
    entries = [("base", base), ("other", other)]
    conf_lines = [role + "\tlogs/ccloudauditor" + str(uid) + "\n"
                  for role, uid in entries]
    with open("tests/test.cf", "w") as fd:
        fd.write("".join(conf_lines))
def analyze_areas(areas_file):
    '''INTERFACE: Analyze the precision-recall areas for various models,
    alphas, betas, thresholds, and find the optimal parameters.

    Each line is split on ". ": the first piece is the model, the last is
    the area, and the pieces in between may give "Alpha", "Beta" and
    "Threshold" values (rounded to one decimal; -1 when absent). Records
    with alpha or beta equal to 0 are skipped. The function then prints,
    per model, whether swapping alpha and beta gives the same area, and
    the parameter combinations that reach the largest area of each kind
    (AB: no threshold, AT: no beta, BT: no alpha).

    Args:
        areas_file: A file of areas dump, e.g.
        "results/models_34t/areas.txt"
    '''
    areas = {}
    max_areas = {}
    models = set()
    alphas = set()
    betas = set()
    thresholds = set()

    with open(areas_file, "r") as fd:
        for line in fd:
            words = line.strip().split(". ")
            model = words.pop(0)
            value = float(words.pop())

            params = {"Alpha": -1, "Beta": -1, "Threshold": -1}
            for word in words:
                for label in ("Alpha", "Beta", "Threshold"):
                    if word.startswith(label):
                        # lstrip() drops the label's characters and the
                        # space, leaving the number.
                        params[label] = round(
                            float(word.lstrip(label + " ")), 1)
                        break
            alpha = params["Alpha"]
            beta = params["Beta"]
            threshold = params["Threshold"]

            if alpha == 0 or beta == 0:
                continue

            if model not in areas:
                areas[model] = {}
                max_areas[model] = {"ab": 0, "at": 0, "bt": 0}
            by_beta = areas[model].setdefault(alpha, {})
            by_beta.setdefault(beta, {})[threshold] = value

            best = max_areas[model]
            if threshold == -1 and best["ab"] < value:
                best["ab"] = value
            elif beta == -1 and best["at"] < value:
                best["at"] = value
            elif alpha == -1 and best["bt"] < value:
                best["bt"] = value

            models.add(model)
            alphas.add(alpha)
            betas.add(beta)
            thresholds.add(threshold)

    # -1 only marks "parameter absent"; it is not a real value to report.
    for values in (alphas, betas, thresholds):
        values.discard(-1)

    for model in models:
        table = areas[model]
        best = max_areas[model]

        for alpha in alphas:
            for beta in betas:
                area1 = table[alpha][beta][-1]
                area2 = table[beta][alpha][-1]
                if area1 == area2:
                    print("Equal area %s %s %s" % (model, alpha, beta))
                else:
                    print("Unequal area %s %s %s %s %s"
                          % (model, alpha, beta, area1, area2))
                if table[alpha][beta][-1] == best["ab"]:
                    print("AB %s %s %s %s"
                          % (model, alpha, beta, best["ab"]))

        for alpha in alphas:
            for threshold in thresholds:
                if table[alpha][-1][threshold] == best["at"]:
                    print("AT %s %s %s %s"
                          % (model, alpha, threshold, best["at"]))

        for beta in betas:
            for threshold in thresholds:
                if table[-1][beta][threshold] == best["bt"]:
                    print("BT %s %s %s %s"
                          % (model, beta, threshold, best["bt"]))
def compute_prs(ads_file):
    '''INTERFACE: Compute precision and recall for various combinations of
    model, alpha, beta, threshold for the set of ads dumped into a file.

    The ads are loaded, analyzed, paired with their true values and
    verified; the verifications are then passed to
    adAnalyzer.aggregate_verifications with its second argument set to
    True.

    Args:
        ads_file: File containing dumped ads. Usually after merging across
        accounts.
    '''
    ads = adLib.load_ads(ads_file)
    adwiser = {"ads": ads}
    adwiser.update({"prediction": adAnalyzer.analyze_ads(ads)})
    # To try all possible thresholds, also set adwiser["scores"] to
    # adAnalyzer.get_scores(adwiser["prediction"]) at this point.
    adwiser.update({"truth": adAnalyzer.true_ds_of_ad_list(ads)})
    adwiser.update({"verification": adAnalyzer.verify_predictions(adwiser)})
    adAnalyzer.aggregate_verifications(adwiser, True)
def show_verifications(ads_file):
    '''INTERFACE: Print the verifications of predictions made for the set of
    ads dumped into a file.

    For every ad three things are printed: the ad string together with
    its prediction, the verification of that prediction, and a blank
    line.

    Args:
        ads_file: File containing dumped ads. Usually after merging across
        accounts.
    '''
    adwiser = {"ads": adLib.load_ads(ads_file)}
    adwiser["prediction"] = adAnalyzer.analyze_ads(adwiser["ads"])
    adwiser["truth"] = adAnalyzer.true_ds_of_ad_list(adwiser["ads"])
    adwiser["verification"] = adAnalyzer.verify_predictions(adwiser)

    for index, ad in enumerate(adwiser["ads"]):
        # NOTE(review): assumes get_ad_str() returns a string -- confirm.
        ad_text = ad.get_ad_str()
        # Python 2's print statement leaves out the separating space when
        # the previous item ended in whitespace other than ' ' (such as a
        # newline); reproduce that here.
        tail = ad_text[-1:]
        gap = "" if (tail.isspace() and tail != " ") else " "
        print("%s%s%s" % (ad_text, gap, adwiser["prediction"][index]))
        print(adwiser["verification"][index])
        print("")
def dump_all_ads(conf_file, results_dir, base_user="ccloudauditor10",
                 num_trials=91):
    '''INTERFACE: Dump all ads in base trials and cumulative shadow trials.

    For every trial the base user's ads go to
    "<results_dir>/base/base_<trial>.txt". The ads of the file set
    returned by adParser.get_file_set for that trial are merged into a
    running union, which goes to
    "<results_dir>/shadow/shadow_<trial>.txt". A summary line with the
    trial number and the three ad counts is printed per trial.

    Args:
        conf_file: Config file like "accounts.cf". (Previously ignored in
        favour of a hard-coded "accounts.cf".)
        results_dir: Directory into which the ads are to be dumped into.
        base_user: Name of the base account in the config.
        num_trials: Number of trials to dump, starting from trial 0.
    '''
    file_sets = adParser.parse_conf(conf_file)
    shadow_ads = []
    make_dir(results_dir + "/base")
    make_dir(results_dir + "/shadow")
    print("Trial Base Shadow Cumulative")

    for i in range(num_trials):
        base_ads = adParser.parse_html_set(file_sets[base_user][i])
        adLib.dump_ads(base_ads,
                       results_dir + "/base/base_" + str(i) + ".txt")

        shadow_file_set = adParser.get_file_set(file_sets, i, base_user)
        sads = adParser.parse_html_set(shadow_file_set)
        shadow_ads = adOps.union([shadow_ads, sads])
        adLib.dump_ads(shadow_ads,
                       results_dir + "/shadow/shadow_" + str(i) + ".txt")

        print("%d %d %d %d" % (i, len(base_ads), len(sads),
                               len(shadow_ads)))
'''
Common + Diff ads.
for i in range(0, 91):
print i
for j in [9, 33, 72, 90]:
base_ads = adLib.load_ads("results/dumped_ads/base/base_" + str(i) + \
".txt")
shadow_ads = adLib.load_ads("results/dumped_ads/shadow/shadow_" + \
str(j) + ".txt")
common_ads = adOps.intersection([base_ads, shadow_ads])
adOps.difference(base_ads, common_ads)
adLib.dump_ads([base_ads, common_ads], \
"results/dumped_ads/analyzed/analyzed_" + str(i) + \
"_" + str(j) + ".txt")
'''
def find_good_abs(pr_file):
    '''INTERNAL: Find the sets of (alpha, beta) tuples in a precision-recall
    file for which both precision and recall reach a given level.

    Args:
        pr_file: File with whitespace-separated model, alpha, beta,
        threshold, precision, recall records.
    Return:
        Dictionary mapping each level in 0.6, 0.65, ..., 0.95 to the set of
        (alpha, beta) tuples (kept as strings) having at least one record
        whose precision and recall are both at least that level.
    '''
    records = []
    with open(pr_file, "r") as fd:
        for line in fd:
            words = line.strip().split()
            records.append((words[1], words[2],
                            float(words[4]), float(words[5])))

    levels = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
    alpha_betas = {}
    for level in levels:
        alpha_betas[level] = set(
            (alpha, beta)
            for alpha, beta, precision, recall in records
            if precision >= level and recall >= level)
    return alpha_betas
def find_optimal_abs(rawdir, result_prefix):
    '''INTERFACE: Identify the optimal set of (alpha, beta) tuples.

    For each t in (33, 72), the files
    "<rawdir>/optimal_<i>_<t>.txt" for i in 0..90 are analysed with
    find_good_abs. For every precision/recall level, the number of files
    in which each (alpha, beta) tuple is good is counted, and one line
    "<level> <dictionary of counts>" is written to
    "<result_prefix>_<t>.txt".

    Args:
        rawdir: Directory containing optimal_b_s.txt files.
        result_prefix: Prefix to the results file to be written.
    '''
    docs = 91
    levels = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

    for t in (33, 72):
        per_doc = [find_good_abs(rawdir + "/optimal_" + str(i) + "_" +
                                 str(t) + ".txt")
                   for i in range(docs)]

        report = []
        for level in levels:
            ab_freqs = {}
            for good in per_doc:
                for ab in good[level]:
                    ab_freqs[ab] = ab_freqs.get(ab, 0) + 1
            report.append(str(level) + " " + str(ab_freqs) + "\n")

        with open(result_prefix + "_" + str(t) + ".txt", "w") as fd:
            fd.write("".join(report))
def area(x_values, y_values):
    '''INTERNAL: To calculate the area under a curve.

    Consecutive points are joined by straight lines and the areas of the
    resulting trapezoids are added up.

    Args:
        x_values: List of x-coordinates.
        y_values: List of y-coordinates.
    Return:
        A number indicating the area under the plot of x Vs y; 0 when there
        are fewer than two points.
    '''
    return sum(
        0.5 * (x_values[right] - x_values[right - 1])
        * (y_values[right] + y_values[right - 1])
        for right in range(1, len(x_values)))
def find_good_thresholds(pr_file, alpha_betas):
    '''INTERNAL: Find the set of thresholds in a precision-recall file for
    which both precision and recall are at least min_val (0.6), for the
    given set of (alpha, beta) tuples.

    Args:
        pr_file: File with whitespace-separated model, alpha, beta,
        threshold, precision, recall records.
        alpha_betas: Set of (alpha, beta) tuples, with alpha and beta as the
        strings that appear in pr_file.
    Return:
        thresholds: Dictionary of good thresholds and their respective
        (precision, recall) tuples for each (alpha, beta) tuple in
        alpha_betas. Tuples without any good threshold are absent.
    '''
    min_val = 0.6
    thresholds = {}
    with open(pr_file, "r") as fd:
        for line in fd:
            words = line.strip().split()
            precision = float(words[4])
            recall = float(words[5])
            good_enough = precision >= min_val and recall >= min_val
            for (alpha, beta) in alpha_betas:
                if not (words[1] == alpha and words[2] == beta
                        and good_enough):
                    continue
                per_pair = thresholds.setdefault((alpha, beta), {})
                per_pair[float(words[3])] = (precision, recall)
    return thresholds
def _credit_limits(entry, precision, recall):
    '''INTERNAL: Add one to the "bases" count of every limit in entry that
    both precision and recall reach.'''
    for i, limit in enumerate(entry["limits"]):
        if precision >= limit and recall >= limit:
            entry["bases"][i] += 1


def find_optimal_thresholds(rawdir, file_suffix, alpha_betas):
    '''INTERFACE: Find the optimal thresholds for the given set of
    (alpha, beta) tuples for the given model.

    Every file in rawdir ending in file_suffix is scanned with
    find_good_thresholds. For each (alpha, beta) and each threshold seen
    in any file, the function counts in how many files the threshold
    reaches each precision/recall limit. A file that lacks the threshold
    is judged by its largest smaller threshold, if any. The thresholds
    with the largest area under the "limit Vs count" curve are printed as
    "alpha beta threshold details". Tuples for which no file had a good
    threshold are skipped.

    Args:
        rawdir: Directory containing optimal_b_s.txt files.
        file_suffix: Suffix of the optimal_b_s.txt files which'ld be
        considered.
        alpha_betas: Set of (alpha, beta) tuples.
    '''
    good_thresholds = []
    optimals = dict((ab, {}) for ab in alpha_betas)

    # Pass 1: collect the per-file results and every threshold they mention.
    for filename in os.listdir(rawdir):
        if not filename.endswith(file_suffix):
            continue
        good_ts = find_good_thresholds(rawdir + "/" + filename, alpha_betas)
        for ab in good_ts:
            for t in good_ts[ab]:
                if t not in optimals[ab]:
                    optimals[ab][t] = {
                        "limits": (0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
                                   0.95),
                        "bases": [0, 0, 0, 0, 0, 0, 0, 0]}
        good_thresholds.append(good_ts)

    # Pass 2: credit each threshold once per file.
    for good_ts in good_thresholds:
        for ab in good_ts:
            found = good_ts[ab]
            for t in optimals[ab]:
                if t in found:
                    source = t
                else:
                    smaller_ts = [t2 for t2 in found if t2 <= t]
                    if not smaller_ts:
                        continue
                    source = max(smaller_ts)
                precision, recall = found[source]
                _credit_limits(optimals[ab][t], precision, recall)

    for ab in optimals:
        candidates = optimals[ab]
        if not candidates:
            # No file had a good threshold for this tuple; max() below
            # would fail on an empty list.
            continue
        for t in candidates:
            candidates[t]["area"] = area(candidates[t]["limits"],
                                         candidates[t]["bases"])
        max_area = max([candidates[t]["area"] for t in candidates])
        for t in candidates:
            if candidates[t]["area"] >= max_area:
                print("%s %s %s %s" % (ab[0], ab[1], t, candidates[t]))
def f_measure(precision, recall, beta=1):
    '''INTERNAL: Return F-measure from precision and recall.

    Computes (1 + beta^2) * P * R / (beta^2 * P + R). The original code
    left out the (1 + beta^2) factor and therefore returned half of the
    F1 score.

    Args:
        precision: Precision value.
        recall: Recall value.
        beta: Weight of recall relative to precision; 1 gives the F1 score.
    Return:
        The F-measure as a float; 0.0 when the denominator is zero (for
        example when precision and recall are both 0).
    '''
    beta_sq = beta * beta
    denominator = (beta_sq * precision) + recall
    if denominator == 0:
        return 0.0
    # float() avoids Python 2 integer division for integer inputs.
    return (1 + beta_sq) * precision * recall / float(denominator)
def threshold_f_measures(pr_file, alpha_betas):
    '''INTERNAL: Find the set of thresholds in a precision-recall file for
    the given set of (alpha, beta) tuples, and their respective F-measures.

    Args:
        pr_file: File with whitespace-separated model, alpha, beta,
        threshold, precision, recall records.
        alpha_betas: Set of (alpha, beta) tuples, with alpha and beta as the
        strings that appear in pr_file.
    Return:
        thresholds: Dictionary of thresholds and their respective F-measures
        for each (alpha, beta) tuple in alpha_betas that occurs in the file.
    '''
    thresholds = {}
    with open(pr_file, "r") as fd:
        for line in fd:
            words = line.strip().split()
            precision = float(words[4])
            recall = float(words[5])
            for (alpha, beta) in alpha_betas:
                if words[1] != alpha or words[2] != beta:
                    continue
                per_pair = thresholds.setdefault((alpha, beta), {})
                per_pair[float(words[3])] = f_measure(precision, recall)
    return thresholds
def summarize_optimal_params(optimal_file, rawdir):
    '''INTERFACE: Final round of identifying optimal parameters. Based on
    area under the curve of "limit Vs base".

    Each line of optimal_file holds a limit followed by a stringified
    dictionary mapping parameters to a number of base trials. For every
    parameter the (limit, bases) points are collected in file order and
    the area under them is computed with area().

    Args:
        optimal_file: File containing precision and recall limits and a
        stringified dictionary of parameters (tuples) and their
        corresponding number of base trials. e.g.
        "results/optimal/p_exp_33.txt"
        rawdir: Directory containing optimal_b_s.txt files. Not used by the
        current implementation.
    Return:
        alpha_betas: Set of the parameters whose area is the largest.
    '''
    summary = {}
    with open(optimal_file, "r") as fd:
        for line in fd:
            words = line.strip().split()
            limit = float(words[0])
            # literal_eval only accepts Python literals, so the file
            # content is parsed without being executed.
            params_dict = ast.literal_eval(" ".join(words[1:]))
            for param in params_dict:
                entry = summary.setdefault(param,
                                           {"limits": [], "bases": []})
                entry["limits"].append(limit)
                entry["bases"].append(params_dict[param])

    areas = dict((param, area(entry["limits"], entry["bases"]))
                 for param, entry in summary.items())
    best_area = max(areas.values())
    return set(param for param in summary if areas[param] >= best_area)
def summarize_all_optimal_params(optimal_dir):
'''INTERFACE:
Args:
optimal_dir: Directory containing outputs of find_optimal_abs.
'''
summaries = {}
for optimal_file in os.listdir(optimal_dir):
summaries[optimal_file.strip(".txt")] = \
summarize_optimal_params(optimal_dir + "/" + optimal_file)
for model in summaries:
output = model + "\t" + str(summaries[model]["area"])
for key in summaries[model]:
if key != "area":
output += "\t" + str(key[1])
print output