forked from liming-thu/TestDB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Hybrid_TimeOverQuality.py
executable file
·222 lines (186 loc) · 7.48 KB
/
Hybrid_TimeOverQuality.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# Time Over Quality
# For a given keyword and target quality,
# get a list of (r, k) value pairs that can satisfy this quality requirement
# and then run hybrid queries for different (r, k) pairs
# and draw T curves of different (r, k) pairs in one canvas
import time
import Conf
import DatabaseFactory
import KeywordsUtil
import Modeler
import PlotUtil
import numpy as np
import json
###########################################################
# Configurations
###########################################################
dbType = Conf.DBTYPE
databaseName = Conf.DATABASE
tableName = Conf.TABLE
db = DatabaseFactory.getDatabase(dbType)
# From what frequency, choose keywords
frequencies = [100000, 500000, 1000000, 2000000, 3000000, 5000000, 8000000, 10000000, 12000000, 15000000, 20000000]
# For each frequency, how many keywords we choose
numOfKeywords = 3
# Target Quality
quality = 0.85
# When True, run the r values from largest to smallest; output files get a
# matching 'desc'/'asc' suffix so the two runs do not overwrite each other.
reversed_order = False
order_suffix = 'desc' if reversed_order else 'asc'
# Choose keywords with different frequencies
keywords = []
for freq in frequencies:
    keywords.extend(KeywordsUtil.pickNearestKeywordToFrequency(freq, numOfKeywords))
# Remove keywords with non alphabetic symbols
keywords[:] = [kw for kw in keywords if kw[0].isalpha()]
print(keywords)
# keywords = [('girl', 1090320)]
# r percentages 10%, 20%, ..., 100%; r_values must be real lists because
# they are indexed by position (r_values[i]) below.
r_percentages = range(10, 110, 10)
r_values = [rp / 100.0 for rp in r_percentages]
r_labels = [str(rp) + '%' for rp in r_percentages]
keyword_labels = [kw[0] + ':' + str(kw[1]) for kw in keywords]
# For fill cache: query the "other" table so the measured table's pages are
# pushed out of the DB cache between measurements.
fillCacheTableName = 'coord_tweets' if tableName == 'coord_tweets_sorted' else 'coord_tweets_sorted'
fillCacheKeywords = KeywordsUtil.randomPickInFrequencyRange(3000000, 8000000, 3)
###########################################################
# Run Script
###########################################################
print('=================================================')
print(' ' + dbType + ' Experiment - Time over quality ')
print('- Hybrid approach')
print('=================================================')
print('table:', tableName)
print('keywords:', keywords)
print('r_percentage:', r_labels)
print('-------------------------------------------------')
start = time.time()
# rk_pairs Dictionary stores for each keyword a list of list:
# {'soccer': [[r=R0,k=K0], [r=R1,k=K1], ..., [r=Rn,k=Kn]], 'rain': [[...]]}
# load rk_pairs dictionary from json file first
rk_pairs_file = dbType + '_' + databaseName + '_' + tableName + '_rk_pairs.json'
rk_pairs = {}
try:
    with open(rk_pairs_file) as f:
        rk_pairs = json.load(f)
except IOError:
    print(rk_pairs_file, ' does not exist.')
# 1. Collect (r, k) pairs for each keyword that not in current dictionary
#    For each keyword:
#        run Modeler.findKROfQuality() to get a list of (r, k) pairs
progress = 0
t0 = time.time()
for i_keyword in keywords:
    print('Processing keyword =', i_keyword[0] + ' ...')
    if i_keyword[0] in rk_pairs:
        # Already computed in a previous run; the JSON cache saves the
        # (expensive) Modeler call.
        print('already exists in dictionary rk_pairs:', rk_pairs[i_keyword[0]])
    else:
        i_rk_pairs = Modeler.findKROfQuality(i_keyword[0], quality, r_values)
        rk_pairs[i_keyword[0]] = i_rk_pairs
    progress += 1
    # Integer percentage (floor division) to match the original Python 2 output.
    print('[Total time]', time.time() - t0,
          '[Progress]', str(progress * 100 // len(keywords)) + '%')
print(rk_pairs)
# Save rk_pairs dictionary into json file
with open(rk_pairs_file, 'w+') as f:
    json.dump(rk_pairs, f)
# 2. For each r value: (for one r value, there's only one k value respectively)
#    For each keyword, run hybrid query:
#        Send dummy query
#        Get the execution time of the query
# Time Dictionary stores for each keyword a list of Time
# {'soccer': [t(r=R0), t(r=R1), ...], 'rain': [...]}
# load times dictionary from json file first
# times_file = 'hybrid_' + tableName + '_freq-' + str(min(frequencies)) + '-' + str(max(frequencies)) + '_q-' + str(quality) + '_times.json'
times_file = 'hybrid_' + tableName + '_' + keywords[0][0] + '_q-' + str(quality) + '_times_' + order_suffix + '.json'
times = {}
draw_curves_directly = False
try:
    with open(times_file) as f:
        times = json.load(f)
    # Times already measured in a previous run: skip the queries, just plot.
    draw_curves_directly = True
except IOError:
    print(times_file, ' does not exist.')
    # if json file does not exist, new an empty dictionary
    # (NaN placeholders so missing measurements leave gaps in the curves)
    for keyword in keywords:
        times[keyword[0]] = [np.nan] * len(r_values)
print(times)
if not draw_curves_directly:
    progress = 0
    t0 = time.time()
    i_start = 0
    i_end = len(r_values)
    i_step = 1
    # run the queries in reversed order
    if reversed_order:
        i_start = len(r_values) - 1
        i_end = -1
        i_step = -1
    for i in range(i_start, i_end, i_step):
        r = r_values[i]
        print('Processing r =', str(int(r * 100)) + '% ...')
        for keyword in keywords:
            i_rk_pairs = rk_pairs[keyword[0]]
            k = i_rk_pairs[i][1]
            # if there is no valid k for this r for this keyword (k < 0),
            # assign the time to be NaN and skip the query
            if k < 0:
                times[keyword[0]][i] = np.nan
                continue
            # Send a dummy query (warm up the connection)
            t1 = time.time()
            db.queryDummy()
            t2 = time.time()
            print('dummy query takes', t2 - t1, 's')
            for fillCacheKW in fillCacheKeywords:
                # Send a fill Cache query
                t1 = time.time()
                db.SumCoordinateHybrid(fillCacheTableName, fillCacheKW[0], 1.0, fillCacheKW[1])
                t2 = time.time()
                print('fill cache query takes', t2 - t1, 's')
            # The measured hybrid query itself
            t_start = time.time()
            # l_coordinates_hybrid = db.GetCoordinateHybrid(tableName, keyword[0], r, k)
            l_coordinates_hybrid = db.SumCoordinateHybrid(tableName, keyword[0], r, k)
            t_end = time.time()
            print('This query takes', t_end - t_start, 's')
            times[keyword[0]][i] = t_end - t_start
            progress += 1
            print('[Total time]', time.time() - t0,
                  '[Progress]', str(progress * 100 // (len(keywords) * len(r_values))) + '%')
            # Restart DB — presumably to clear DB caches between measurements
            # (indentation was lost in this copy; placement inside the
            # per-keyword loop inferred from the cache-control logic — verify)
            db.restart()
    print(times)
    # Save times into json file
    with open(times_file, 'w+') as f:
        json.dump(times, f)
# 3. Plot the T-(r, k) curves of different keywords in one canvas
print('Plotting images ...')
# i_fileName_head = 'hybrid_' + tableName + '_freq-' + str(min(frequencies)) + '-' + str(max(frequencies)) + '_q-' + str(quality)
i_fileName_head = 'hybrid_' + tableName + '_' + keywords[0][0] + '_q-' + str(quality)
# (1) Plot T-(r, k) curves of different keywords
i_fileName = i_fileName_head + '_t-r-k_' + order_suffix
i_labels = keyword_labels
print('i_labels:')
print(i_labels)
i_x = r_labels
i_curves = []
print('keywords:')
for keyword in keywords:
    print(keyword[0])
    i_curves.append(times[keyword[0]])
i_x_label = '(r, k) pair for r'
i_y_label = 'Execution Time(s)'
# i_title = 'F=[' + str(min(frequencies)) + '-' + str(max(frequencies)) + '] Q=' + str(quality) + ' - T-(r,k) curves'
i_title = 'F=[' + str(keywords[0][1]) + '] Q=' + str(quality) + ' - T-(r,k) curves'
print('Plotting', i_title)
PlotUtil.plotCurves(i_fileName, i_labels, i_x, i_curves, i_x_label, i_y_label, i_title)
end = time.time()
print('=================================================')
print(' ' + dbType + ' Experiment - Time over quality ')
print('- Hybrid approach')
print('=================================================')
print('table:', tableName)
print('keywords:', keywords)
print('r_percentage:', r_labels)
print('-------------------------------------------------')
print('Finished!', end - start, 'seconds spent.')