import ast
import csv
import json
import os
import sys
import time

from _pybgpstream import BGPStream, BGPRecord

burst2writeA = {}  # Dictionary recording the A (announcement) update bursts
burst2writeW = {}  # Dictionary recording the W (withdrawal) update bursts
graph_points = {}  # Dictionary of all peers and their burst counts, used to plot the burst graphs

def main():
    # Register the starting point in seconds since epoch
    start = int(sys.argv[1])
    # Register the stopping point in seconds since epoch
    stop = int(sys.argv[2])
    print('Interval: ' + str(stop - start))
    # Register the list of collectors we want to take the updates from
    collectors = ast.literal_eval(sys.argv[3])
    # Register the window: once we detect a burst, we record the last [window] seconds of updates
    window = int(sys.argv[4])
    # Threshold in number of updates above which a burst is recorded
    threshold = int(sys.argv[5])
    # Time when we start the loading process
    s = time.time()
    # Launch the loading process
    load_data(start, stop, collectors, window, threshold)
    # Time when we stop the loading process
    e = time.time()
    # Total time it took to collect and process all updates
    print('Total time: ' + str(e - s))
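
# For illustration, a hypothetical invocation (all argument values below are made up):
#   python dataLoaderNew.py 1451606400 1451692800 "['rrc00', 'rrc01']" 120 1000
# would load updates for 2016-01-01 UTC from collectors rrc00 and rrc01, keeping a
# 120-second sliding window and flagging bursts of more than 1000 updates.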

# Save in a json-ready structure the points we display in plotGraph.js
def saveGraphPoint(queue, updateType, peer, timestamp, collectors, threshold):
    # We register the burst points only if the threshold is reached
    if len(queue) >= threshold:
        # If the peer is not yet in graph_points, append its name to a csv file
        # We use an intermediate csv file because a json file can't be appended to as easily
        if peer not in graph_points:
            with open('csv_peernames-' + '-'.join(collectors) + '.csv', 'a') as fd:
                # Append to the csv file the json file name used in plotGraph
                fd.write(peer.replace(':', '_') + '/' + peer.replace(':', '_') + '-graph.json' + '\n')
            # Create a slot in graph_points to record the burst updates
            graph_points[peer] = {}
        # If the update timestamp is not yet recorded, create a slot for it in the dictionary
        if timestamp not in graph_points[peer]:
            graph_points[peer][timestamp] = {'A': 0, 'W': 0}
        # Finally record the size of the burst at this timestamp
        graph_points[peer][timestamp][updateType] = len(queue)
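
# For a peer that reached the threshold, graph_points ends up shaped like this
# (peer address, timestamp and counts are purely illustrative):
#   graph_points['192.0.2.1'] == {1451606530: {'A': 1042, 'W': 0}}
# This is the structure dumped per peer into <peer>-graph.json at the end of load_data.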

# Properly flush old updates out of a peer's queue
# Updates are evicted once their arrival time falls outside the sliding window
def cleanQueue(queue, timestamp, window):
    # Only clean when the current timestamp is past the last timestamp recorded
    if queue and timestamp > queue[-1]['tst']:
        while queue:
            # If the oldest timestamp is out of the window, delete that element of the queue
            if queue[0]['tst'] + window < timestamp:
                del queue[0]
            else:
                break
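
# Illustrative behaviour (made-up values): with window=10 and a queue holding
# updates with tst 1, 5 and 12, a call with timestamp=13 evicts the tst=1
# element (1 + 10 < 13) and keeps tst=5 and tst=12, which are still in the window.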

# Return the last burst start timestamp recorded for the peer, or None if the window has been exceeded
# If the window is exceeded, the next burst will have to be recorded under the current timestamp
def currentBurstTime(burstqueue, peer, timestamp, window):
    if peer not in burstqueue or not burstqueue[peer]:
        return None
    lasttst = max(burstqueue[peer].keys())
    # We want to record twice the size of the window: the [window] seconds kept
    # in the queue before detection plus [window] seconds after the burst started
    if timestamp > lasttst + window:
        return None
    return lasttst

# Write the raw update bursts of a peer to a csv file
def writeBurst(peer, burstqueue, updatetype, timestamp):
    # Replace ':' by '_' so IPv6 peer addresses yield valid file names
    peer_file_name = peer.replace(':', '_')
    # Create a directory for the peer
    if not os.path.exists(peer_file_name):
        os.makedirs(peer_file_name)
    # Create the csv file containing the timestamps and prefixes in the burst
    # The layout is identical for A and W bursts; only the file suffix differs
    filename = peer_file_name + '/' + peer_file_name + '-' + str(timestamp) + '-burst' + updatetype + '.csv'
    with open(filename, 'a') as f:
        for elem in burstqueue[peer][timestamp]:
            f.write(str(elem['tst']) + ',' + elem['prefix'] + '\n')
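
# Each row of the resulting csv is "<timestamp>,<prefix>", for example
# (illustrative values): 1451606530,203.0.113.0/24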

# Handle a new update collected by the loader
def handleUpdate(queue, burstQueue, update, peer, updatetype, timestamp, window, threshold):
    # Clean the queue before handling the update
    cleanQueue(queue, timestamp, window)
    queue.append(update)
    length = len(queue)
    lasttst = currentBurstTime(burstQueue, peer, timestamp, window)
    # If a burst window is still open for this peer, record the update in it
    if lasttst is not None:
        burstQueue[peer][lasttst].append(update)
    # No burst is being recorded and we are detecting a new one
    elif length > threshold:
        if peer not in burstQueue:
            burstQueue[peer] = {}
        burstQueue[peer][timestamp] = []
        # Record what we already have in the queue in burstQueue
        for elem in queue:
            burstQueue[peer][timestamp].append(elem)
        # Flush the oldest finished burst to a csv file, keeping only the current one in memory
        if len(burstQueue[peer]) > 1:
            writeBurst(peer, burstQueue, updatetype, min(burstQueue[peer]))
            del burstQueue[peer][min(burstQueue[peer])]
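
# After a burst is detected, burstQueue is shaped like this (illustrative values):
#   burstQueue['192.0.2.1'] == {1451606530: [{'tst': 1451606530, 'prefix': '203.0.113.0/24'}, ...]}
# Only the burst currently being recorded stays in memory; older ones are
# flushed to csv by writeBurst above.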

# Main function to load the data and process it
def load_data(start, stop, collectors, window, threshold):
    peers = {}
    # collectors is the list of collectors we want to include
    # start and stop define the interval of data we are looking at
    # Create a new BGPStream instance and a reusable BGPRecord instance
    stream = BGPStream()
    rec = BGPRecord()
    # Add a filter for each collector.
    # If no collector is given, consider the 16 RIPE RIS collectors rrc00-rrc15
    if collectors:
        for collector in collectors:
            print(collector)
            stream.add_filter('collector', collector)
    else:
        for i in range(0, 10):
            stream.add_filter('collector', 'rrc0' + str(i))
        for i in range(10, 16):
            stream.add_filter('collector', 'rrc' + str(i))
    stream.add_filter('record-type', 'updates')
    # Consider the interval from "start" to "stop" in seconds since epoch
    stream.add_interval_filter(start, stop)
    # Start the stream
    stream.start()
    # For each record (a record can hold multiple elements sharing the same timestamp), handle its updates
    while stream.get_next_record(rec):
        timestamp = rec.time
        if rec.status != "valid":
            print(rec.project, rec.collector, rec.type, timestamp, rec.status)
        else:
            # Go through all elements of the record
            elem = rec.get_next_elem()
            while elem:
                # Consider only the A (announcement) and W (withdrawal) updates
                if elem.type not in ['A', 'W']:
                    elem = rec.get_next_elem()
                    continue
                peer = elem.peer_address
                updatetype = elem.type
                prefix = elem.fields['prefix']
                if peer not in peers:
                    peers[peer] = {
                        'A': [],
                        'W': []
                    }
                update = {'tst': timestamp, 'prefix': prefix}
                if updatetype == 'A':
                    handleUpdate(peers[peer]['A'], burst2writeA, update, peer, updatetype, timestamp, window, threshold)
                    saveGraphPoint(peers[peer]['A'], updatetype, peer, timestamp, collectors, threshold)
                else:
                    handleUpdate(peers[peer]['W'], burst2writeW, update, peer, updatetype, timestamp, window, threshold)
                    saveGraphPoint(peers[peer]['W'], updatetype, peer, timestamp, collectors, threshold)
                elem = rec.get_next_elem()
    # After processing all records, write the graph json files with the graph points recorded for each peer
    for peer in graph_points:
        peer_file_name = peer.replace(':', '_')
        if not os.path.exists(peer_file_name):
            os.makedirs(peer_file_name)
        with open(peer_file_name + '/' + peer_file_name + '-graph.json', 'w') as outfile:
            json.dump(graph_points[peer], outfile, indent=2)
    # Write the last burst of A updates if there is one left
    for peer in burst2writeA:
        for timestamp in burst2writeA[peer]:
            writeBurst(peer, burst2writeA, 'A', timestamp)
    # Write the last burst of W updates if there is one left
    for peer in burst2writeW:
        for timestamp in burst2writeW[peer]:
            writeBurst(peer, burst2writeW, 'W', timestamp)
    # Transform the csv list of peer names into a json file so getJSON can load it in plotGraph
    # The intermediate csv step avoids the json format, since appending to a json
    # file would mean rewriting the whole file every time a name is added
    jsonlist = []
    csv_name = 'csv_peernames-' + '-'.join(collectors) + '.csv'
    # The csv file only exists if at least one burst was recorded
    if os.path.exists(csv_name):
        with open(csv_name, 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                jsonlist.append(row[0])
    jsondata = json.dumps(jsonlist, indent=2)
    with open('json_file_names-' + '-'.join(collectors) + '.json', 'w') as fd:
        fd.write(jsondata)

# Launch the program
if __name__ == '__main__':
    main()
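
# Note: the _pybgpstream bindings used above are the legacy v1 API. As a
# minimal, untested sketch (assuming the modern pybgpstream >= 2 package is
# installed; the times and collectors below are hypothetical), the same
# stream setup could be written as:
#
#   import pybgpstream
#   stream = pybgpstream.BGPStream(
#       from_time="2016-01-01 00:00:00", until_time="2016-01-02 00:00:00",
#       collectors=["rrc00", "rrc01"],
#       record_type="updates",
#   )
#   for elem in stream:
#       if elem.type in ("A", "W"):
#           print(elem.peer_address, elem.type, elem.fields.get("prefix"))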