-
Notifications
You must be signed in to change notification settings - Fork 0
/
microbehavior_logic.py
388 lines (317 loc) · 13.5 KB
/
microbehavior_logic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
#author:jzadeh
import re
import entropy as en
import pandas as pd
class HTTPMicroBehaviors:
    """URI and referrer features computed over a window of HTTP requests.

    The class is used purely as a namespace: every method is a static
    helper invoked as ``HTTPMicroBehaviors.<name>(...)``.  The aggregate
    helpers accept any iterable of values (a pandas Series or a plain
    list); entries that cannot be measured (for example ``NaN``/``None``
    rows) are skipped rather than aborting the whole computation.
    """

    # Base64 shape check: groups of four alphabet characters, with the last
    # group optionally padded by '=' or '=='.  Compiled once for reuse.
    _BASE64_PATTERN = re.compile(
        '^([A-Za-z0-9+/]{4})*'
        '([A-Za-z0-9+/]{4}|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}==)$'
    )

    @staticmethod
    def isBase64(s):
        """Return True when ``s`` has the shape of a base64 encoded string.

        The check is purely syntactic (length divisible by four, valid
        alphabet, valid padding); it does not try to decode ``s``.

        Args:
            s: value to test, normally a ``str``.

        Returns:
            bool: True on a match; False for a non-match, for the empty
            string, and for values the regex cannot handle (``None``,
            ``NaN``, bytes, ...).
        """
        try:
            return HTTPMicroBehaviors._BASE64_PATTERN.match(s) is not None
        except TypeError:
            # Non-string input cannot be a base64 string.
            return False

    @staticmethod
    def isUrlEncoded(s):
        """Return True when at least one third of ``s`` is '%' characters.

        Args:
            s: value to test, normally a ``str``.

        Returns:
            bool: False for empty input and for values that cannot be
            iterated or measured (``None``, ``NaN``, ...).
        """
        try:
            percent_count = sum(1 for char in s if char == '%')
            return (percent_count / len(s)) >= (1 / 3)
        except (TypeError, ZeroDivisionError):
            # TypeError: not iterable / no len(); ZeroDivisionError: empty.
            return False

    @staticmethod
    def _slash_counts(inList):
        """Return the '/' count of every entry that supports ``.count('/')``."""
        counts = []
        for uri in inList:
            try:
                counts.append(uri.count('/'))
            except (AttributeError, TypeError):
                # Non-string row (e.g. NaN) or incompatible type: skip it.
                continue
        return counts

    @staticmethod
    def _lengths(inList):
        """Return ``len()`` of every entry that has a length."""
        lengths = []
        for item in inList:
            try:
                lengths.append(len(item))
            except TypeError:
                # Entry without a length (e.g. NaN): skip it.
                continue
        return lengths

    @staticmethod
    def _entropies(inList):
        """Return ``en.shannon_entropy`` of every entry it can be computed for.

        NOTE(review): the exact exceptions raised by the external
        ``entropy`` module are not visible here; the tuple below is the one
        the previous implementation guarded against.
        """
        values = []
        for uri in inList:
            try:
                values.append(en.shannon_entropy(uri))
            except (IndexError, TypeError, KeyError):
                continue
        return values

    @staticmethod
    def _count_distinct(inSeries):
        """Return the number of distinct values in ``inSeries``.

        Accepts a pandas Series (anything exposing ``tolist``) or any other
        iterable.
        """
        if hasattr(inSeries, 'tolist'):
            values = inSeries.tolist()
        else:
            values = list(inSeries)
        try:
            return len(set(values))
        except TypeError:
            # Unhashable entries: fall back to an equality-based scan.
            seen = []
            for value in values:
                if value not in seen:
                    seen.append(value)
            return len(seen)

    @staticmethod
    def max_path_length(inList):
        """Return the largest number of '/' characters found in any URI.

        The result is never smaller than 1: that floor is kept from the
        previous implementation so existing feature values do not shift.
        Entries that are not strings are ignored.
        """
        deepest = max(HTTPMicroBehaviors._slash_counts(inList), default=0)
        return max(deepest, 1)

    @staticmethod
    def min_path_length(inList):
        """Return the smallest number of '/' characters found in any URI.

        Entries that are not strings are ignored; returns 0 when there is
        no usable entry (including empty input).
        """
        return min(HTTPMicroBehaviors._slash_counts(inList), default=0)

    @staticmethod
    def max_length(inList):
        """Return the length of the longest URI, or 0 if none is measurable."""
        return max(HTTPMicroBehaviors._lengths(inList), default=0)

    @staticmethod
    def min_length(inList):
        """Return the length of the shortest URI, or 0 if none is measurable."""
        return min(HTTPMicroBehaviors._lengths(inList), default=0)

    @staticmethod
    def max_entropy(inList):
        """Return the maximum Shannon entropy over the URIs in ``inList``.

        Entries the entropy function rejects are skipped silently; returns
        0.0 when no entropy could be computed.
        """
        return max(HTTPMicroBehaviors._entropies(inList), default=0.0)

    @staticmethod
    def min_entropy(inList):
        """Return the minimum Shannon entropy over the URIs in ``inList``.

        Entries the entropy function rejects are skipped silently; returns
        0.0 when no entropy could be computed.
        """
        return min(HTTPMicroBehaviors._entropies(inList), default=0.0)

    @staticmethod
    def base_64_match(inList):
        """Return how many entries of ``inList`` look like base64 strings."""
        return sum(1 for uri in inList if HTTPMicroBehaviors.isBase64(uri))

    @staticmethod
    def percent_encoding_match(inList):
        """Return how many entries of ``inList`` are at least one third '%'."""
        return sum(1 for uri in inList if HTTPMicroBehaviors.isUrlEncoded(uri))

    @staticmethod
    def uri_distinct(inSeries):
        """Return the number of distinct URIs in ``inSeries``."""
        return HTTPMicroBehaviors._count_distinct(inSeries)

    @staticmethod
    def referrers_distinct(inSeries):
        """Return the number of distinct referrers in ``inSeries``."""
        return HTTPMicroBehaviors._count_distinct(inSeries)

    @staticmethod
    def max_length_referrer(inList):
        """Return the length of the longest referrer, or 0 if none is measurable.

        The previous body was a copy of the minimum-length loop; the method
        now computes the maximum its name promises.
        """
        return max(HTTPMicroBehaviors._lengths(inList), default=0)

    @staticmethod
    def behaviorVector(inFrame):
        """Build the feature dictionary for one window of requests.

        Args:
            inFrame: DataFrame providing the ``'uri'`` and ``'referrer'``
                columns, plus whatever ``TimeBehaviors.behavior_vector``
                reads from it.

        Returns:
            dict: the keys ``max_length_referrer``, ``referrers_distinct``,
            ``max_path_depth``, ``min_path_depth``, ``max_length``,
            ``min_length``, ``uri_Distinct``, ``max_entropy`` and
            ``min_entropy``, followed by every key returned by
            ``TimeBehaviors.behavior_vector(inFrame)``.
        """
        uris = inFrame['uri']
        referrers = inFrame['referrer']
        features = {
            'max_length_referrer': HTTPMicroBehaviors.max_length_referrer(referrers),
            'referrers_distinct': HTTPMicroBehaviors.referrers_distinct(referrers),
            'max_path_depth': HTTPMicroBehaviors.max_path_length(uris),
            'min_path_depth': HTTPMicroBehaviors.min_path_length(uris),
            'max_length': HTTPMicroBehaviors.max_length(uris),
            'min_length': HTTPMicroBehaviors.min_length(uris),
            'uri_Distinct': HTTPMicroBehaviors.uri_distinct(uris),
            'max_entropy': HTTPMicroBehaviors.max_entropy(uris),
            'min_entropy': HTTPMicroBehaviors.min_entropy(uris),
        }
        # Timing features are merged last, so they win on a key collision.
        features.update(TimeBehaviors.behavior_vector(inFrame))
        return features
class TimeBehaviors:
    """Timing features derived from the ``'epochTime'`` column of a window.

    The class is used purely as a namespace: every method is a static
    helper invoked as ``TimeBehaviors.<name>(...)``.  The timestamps must
    support subtraction yielding an object with ``total_seconds()`` (for
    example ``datetime`` / pandas ``Timestamp`` values).
    NOTE(review): rows are presumably in chronological order - confirm
    against the code that builds the windows.
    """

    @staticmethod
    def time_delta(times):
        """Return the seconds elapsed between ``times[0]`` and ``times[1]``.

        Args:
            times: indexable holding at least two timestamps.

        Returns:
            float: ``(times[1] - times[0]).total_seconds()``.
        """
        delta = times[1] - times[0]
        return delta.total_seconds()

    @staticmethod
    def get_time_interval(inFrame):
        """Return the deltas, in seconds, between consecutive timestamps.

        Args:
            inFrame: DataFrame with an ``'epochTime'`` column.

        Returns:
            list: one float per adjacent pair of rows, in row order; empty
            when the frame has fewer than two rows.
        """
        times = inFrame['epochTime'].tolist()
        # Pair each timestamp with its successor in a single linear pass.
        return [TimeBehaviors.time_delta(pair) for pair in zip(times, times[1:])]

    @staticmethod
    def max_time_interval(inFrame):
        """Return the largest consecutive delta in seconds (0.0 if none)."""
        return max(TimeBehaviors.get_time_interval(inFrame), default=0.0)

    @staticmethod
    def min_time_interval(inFrame):
        """Return the smallest consecutive delta in seconds (0.0 if none)."""
        return min(TimeBehaviors.get_time_interval(inFrame), default=0.0)

    @staticmethod
    def interval_length(inFrame):
        """Return the time span of the window in seconds.

        This is the difference between the last and the first timestamp,
        as the method has always been documented.  The previous body
        returned ``max delta - min delta`` instead, which contradicted
        that contract.

        Returns:
            float: ``last - first`` in seconds; 0.0 when the frame has
            fewer than two rows.
        """
        times = inFrame['epochTime'].tolist()
        if len(times) < 2:
            return 0.0
        return TimeBehaviors.time_delta((times[0], times[-1]))

    @staticmethod
    def get_max_deltas(inFrame, n=5):
        """Return the ``n`` largest consecutive deltas, largest first.

        ``n == 0`` is treated as a request for the default of 5.
        """
        ordered = sorted(TimeBehaviors.get_time_interval(inFrame), reverse=True)
        limit = 5 if n == 0 else n
        return ordered[:limit]

    @staticmethod
    def get_min_deltas(inFrame, n=5):
        """Return the ``n`` smallest consecutive deltas, smallest first.

        ``n == 0`` is treated as a request for the default of 5.
        """
        ordered = sorted(TimeBehaviors.get_time_interval(inFrame))
        limit = 5 if n == 0 else n
        return ordered[:limit]

    @staticmethod
    def _ratio_of_deltas(inFrame, in_bucket):
        """Return (number of deltas accepted by ``in_bucket``) / window length.

        Returns 0.0 when the window length is zero.
        """
        matching = sum(
            1 for delta in TimeBehaviors.get_time_interval(inFrame)
            if in_bucket(delta)
        )
        try:
            return matching / TimeBehaviors.interval_length(inFrame)
        except ZeroDivisionError:
            return 0.0

    # Group of time-delta ratio counters.  The buckets are (-inf, 1],
    # (1, 5], (5, 10], (10, 20] and [100, inf); deltas in (20, 100) are not
    # counted by any of them.
    @staticmethod
    def ratio_of_deltas_A(inFrame):
        """Return (count of deltas <= 1 second) / (window length in seconds)."""
        return TimeBehaviors._ratio_of_deltas(inFrame, lambda d: d <= 1)

    @staticmethod
    def ratio_of_deltas_B(inFrame):
        """Return (count of deltas in (1, 5] seconds) / (window length in seconds)."""
        return TimeBehaviors._ratio_of_deltas(inFrame, lambda d: 1 < d <= 5)

    @staticmethod
    def ratio_of_deltas_C(inFrame):
        """Return (count of deltas in (5, 10] seconds) / (window length in seconds)."""
        return TimeBehaviors._ratio_of_deltas(inFrame, lambda d: 5 < d <= 10)

    @staticmethod
    def ratio_of_deltas_D(inFrame):
        """Return (count of deltas in (10, 20] seconds) / (window length in seconds)."""
        return TimeBehaviors._ratio_of_deltas(inFrame, lambda d: 10 < d <= 20)

    @staticmethod
    def ratio_of_deltas_E(inFrame):
        """Return (count of deltas >= 100 seconds) / (window length in seconds)."""
        return TimeBehaviors._ratio_of_deltas(inFrame, lambda d: d >= 100)

    @staticmethod
    def behavior_vector(self, n=5):
        """Build the dictionary of timing features for one window.

        Args:
            self: the window DataFrame (NOT an instance; the parameter name
                is kept only for backward compatibility with callers).
            n: how many entries to keep in ``max_deltas`` / ``min_deltas``.

        Returns:
            dict: the keys ``time_interval``, ``max_deltas``,
            ``min_deltas``, ``ratio_of_deltas_A`` .. ``ratio_of_deltas_E``,
            ``max_time_interval``, ``min_time_interval`` and
            ``interval_length``.
        """
        return {
            'time_interval': TimeBehaviors.get_time_interval(self),
            'max_deltas': TimeBehaviors.get_max_deltas(self, n),
            'min_deltas': TimeBehaviors.get_min_deltas(self, n),
            'ratio_of_deltas_A': TimeBehaviors.ratio_of_deltas_A(self),
            'ratio_of_deltas_B': TimeBehaviors.ratio_of_deltas_B(self),
            'ratio_of_deltas_C': TimeBehaviors.ratio_of_deltas_C(self),
            'ratio_of_deltas_D': TimeBehaviors.ratio_of_deltas_D(self),
            'ratio_of_deltas_E': TimeBehaviors.ratio_of_deltas_E(self),
            'max_time_interval': TimeBehaviors.max_time_interval(self),
            'min_time_interval': TimeBehaviors.min_time_interval(self),
            'interval_length': TimeBehaviors.interval_length(self),
        }