-
Notifications
You must be signed in to change notification settings - Fork 0
/
Nasa_log.py
379 lines (301 loc) · 13.2 KB
/
Nasa_log.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# NASA Apache Web Server Log Analysis --Apache Spark
#Data set from NASA Kennedy Space Center web server in Florida
#The full data set is freely available at http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html, and it contains all HTTP requests for two months.
#Loading the log file
# Specify path to downloaded log file
import sys
import os
from pyspark.sql.functions import split, regexp_extract
log_file_path = 'dbfs:/' + os.path.join('databricks-datasets', 'cs100', 'lab2', 'data-001', 'apache.access.log.PROJECT')
base_df = sqlContext.read.text(log_file_path)
# Let's look at the schema
base_df.printSchema()
#Parsing the log file
split_df = base_df.select(regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('path'),
regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size'))
split_df.show(truncate=False)
#Data Cleaning
base_df.filter(base_df['value'].isNull()).count()
#check how clean the data is
bad_rows_df = split_df.filter(split_df['host'].isNull() |
split_df['timestamp'].isNull() |
split_df['path'].isNull() |
split_df['status'].isNull() |
split_df['content_size'].isNull())
bad_rows_df.count()
#Find which column is affected
from pyspark.sql.functions import col, sum
def count_null(col_name):
return sum(col(col_name).isNull().cast('integer')).alias(col_name)
# Build up a list of column expressions, one per column.
#
# This could be done in one line with a Python list comprehension, but we're keeping
# it simple for those who don't know Python very well.
exprs = []
for col_name in split_df.columns:
exprs.append(count_null(col_name))
# Run the aggregation. The *exprs converts the list of expressions into
# variable function arguments.
split_df.agg(*exprs).show()
#Let's see if there are any lines that do not end with one or more digits.
bad_content_size_df = base_df.filter(~ base_df['value'].rlike(r'\d+$'))
bad_content_size_df.count()
# Take a look at some of the bad column values. Since it's possible that the rows end in extra white space, we'll tack a marker character onto
# the end of each line, to make it easier to see trailing white space.
from pyspark.sql.functions import lit, concat
bad_content_size_df.select(concat(bad_content_size_df['value'], lit('*'))).show(truncate=False)
#Fix the rows with null content_size
# Replace all null content_size values with 0.
cleaned_df = split_df.na.fill({'content_size': 0})
# Ensure that there are no nulls left.
exprs = []
for col_name in cleaned_df.columns:
exprs.append(count_null(col_name))
cleaned_df.agg(*exprs).show()
#Parsing the timestamp.
month_map = {
'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
'Aug':8, 'Sep': 9, 'Oct':10, 'Nov': 11, 'Dec': 12
}
def parse_clf_time(s):
""" Convert Common Log time format into a Python datetime object
Args:
s (str): date and time in Apache time format [dd/mmm/yyyy:hh:mm:ss (+/-)zzzz]
Returns:
a string suitable for passing to CAST('timestamp')
"""
# NOTE: We're ignoring time zone here. In a production application, we want to handle that.
return "{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}".format(
int(s[7:11]),
month_map[s[3:6]],
int(s[0:2]),
int(s[12:14]),
int(s[15:17]),
int(s[18:20])
)
u_parse_time = udf(parse_clf_time)
logs_df = cleaned_df.select('*', u_parse_time(cleaned_df['timestamp']).cast('timestamp').alias('time')).drop('timestamp')
total_log_entries = logs_df.count()
logs_df.printSchema()
display(logs_df)
#Cache logs_df. We're going to be using it quite a bit from here forward.
logs_df.cache()
#Content Size Statistics
#Calculate statistics based on the content size.
content_size_summary_df = logs_df.describe(['content_size'])
content_size_summary_df.show()
#We can use SQL to directly calculate these statistics
from pyspark.sql import functions as sqlFunctions
content_size_stats = (logs_df
.agg(sqlFunctions.min(logs_df['content_size']),
sqlFunctions.avg(logs_df['content_size']),
sqlFunctions.max(logs_df['content_size']))
.first())
print 'Using SQL functions:'
print 'Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*content_size_stats)
#HTTP Status Analysis
status_to_count_df =(logs_df
.groupBy('status')
.count()
.sort('status')
.cache())
status_to_count_length = status_to_count_df.count()
print 'Found %d response codes' % status_to_count_length
status_to_count_df.show()
display(status_to_count_df)
log_status_to_count_df = status_to_count_df.withColumn('log(count)', sqlFunctions.log(status_to_count_df['count']))
display(log_status_to_count_df)
# np is just an alias for numpy.
# cm and plt are aliases for matplotlib.cm (for "color map") and matplotlib.pyplot, respectively.
# prepareSubplot is a helper.
from spark_notebook_helpers import prepareSubplot, np, plt, cm
data = log_status_to_count_df.drop('count').collect()
x, y = zip(*data)
index = np.arange(len(x))
bar_width = 0.7
colorMap = 'Set1'
cmap = cm.get_cmap(colorMap)
fig, ax = prepareSubplot(np.arange(0, 6, 1), np.arange(0, 14, 2))
plt.bar(index, y, width=bar_width, color=cmap(0))
plt.xticks(index + bar_width/2.0, x)
display(fig)
#Frequent Hosts
# Any hosts that has accessed the server more than 10 times.
host_sum_df =(logs_df
.groupBy('host')
.count())
host_more_than_10_df = (host_sum_df
.filter(host_sum_df['count'] > 10)
.select(host_sum_df['host']))
print 'Any 20 hosts that have accessed more then 10 times:\n'
host_more_than_10_df.show(truncate=False)
#Visualize the number of hits to paths (URIs) in the log
paths_df = (logs_df
.groupBy('path')
.count()
.sort('count', ascending=False))
paths_counts = (paths_df
.select('path', 'count')
.map(lambda r: (r[0], r[1]))
.collect())
paths, counts = zip(*paths_counts)
colorMap = 'Accent'
cmap = cm.get_cmap(colorMap)
index = np.arange(1000)
fig, ax = prepareSubplot(np.arange(0, 1000, 100), np.arange(0, 70000, 10000))
plt.xlabel('Paths')
plt.ylabel('Number of Hits')
plt.plot(index, counts[:1000], color=cmap(0), linewidth=3)
plt.axhline(linewidth=2, color='#999999')
display(fig)
display(paths_df)
#Find the top paths (URIs) in the log.
# Top Paths
print 'Top Ten Paths:'
paths_df.show(n=10, truncate=False)
#Find Top Ten Error Paths
# DataFrame containing all accesses that did not return a code 200
from pyspark.sql.functions import desc
not200DF = logs_df.filter(logs_df.status != 200)
#print not200DF
not200DF.show(10)
# Sorted DataFrame containing all paths and the number of times they were accessed with non-200 return code
logs_sum_df = not200DF.groupBy('path').count().sort('count',ascending=False)
print 'Top Ten failed URLs:'
logs_sum_df.show(10, False)
#Find Number of Unique Hosts
unique_host_count = logs_df.select('host').distinct().count()
print 'Unique hosts: {0}'.format(unique_host_count)
#Number of Unique Daily Hosts
from pyspark.sql.functions import dayofmonth
day_to_host_pair_df = logs_df.select('host',dayofmonth('time').alias('day'))
#day_to_host_pair_df.show(10,False)
day_group_hosts_df = day_to_host_pair_df.dropDuplicates()
#day_group_hosts_df.show(10,False)
daily_hosts_df = day_group_hosts_df.groupBy('day').count()
print 'Unique hosts per day:'
daily_hosts_df.show(30, False)
daily_hosts_df.cache()
#Visualizing the Number of Unique Daily Hosts
days_with_hosts =[]
hosts = []
for i in range (len(daily_hosts_df.collect())):
days_with_hosts.append(daily_hosts_df.collect()[i][0])
hosts.append(daily_hosts_df.collect()[i][1])
#print(daily_hosts_df.collect())
#print(days_with_hosts)
#print(days_with_hosts[0][1])
print(hosts)
fig, ax = prepareSubplot(np.arange(0, 30, 5), np.arange(0, 5000, 1000))
colorMap = 'Dark2'
cmap = cm.get_cmap(colorMap)
plt.plot(days_with_hosts, hosts, color=cmap(0), linewidth=3)
plt.axis([0, max(days_with_hosts), 0, max(hosts)+500])
plt.xlabel('Day')
plt.ylabel('Hosts')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)
display(daily_hosts_df)
#Average Number of Daily Requests per Host
total_req_per_day_df = logs_df.groupBy(dayofmonth('time').alias('day')).count()
#total_req_per_day_df.show()
avg_daily_req_per_host_df = (
(total_req_per_day_df.alias("total")
.join(daily_hosts_df.alias("host"), ["day"])
.select(col("day"), (col("total.count") / col("host.count")).alias("avg_reqs_per_host_per_day")))
)
print 'Average number of daily requests per Hosts is:\n'
avg_daily_req_per_host_df.show()
avg_daily_req_per_host_df.cache()
#Visualizing the Average Daily Requests per Unique Host
days_with_avg = (avg_daily_req_per_host_df.select('day').collect())
avgs = (avg_daily_req_per_host_df.select('avg_reqs_per_host_per_day').collect())
for i in range (len(avgs)):
days_with_avg[i]=days_with_avg[i][0]
avgs[i]=avgs[i][0]
print(days_with_avg)
print(avgs)
fig, ax = prepareSubplot(np.arange(0, 20, 5), np.arange(0, 16, 2))
colorMap = 'Set3'
cmap = cm.get_cmap(colorMap)
plt.plot(days_with_avg, avgs, color=cmap(0), linewidth=3)
plt.axis([0, max(days_with_avg), 0, max(avgs)+2])
plt.xlabel('Day')
plt.ylabel('Average')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)
display(avg_daily_req_per_host_df)
#Exploring 404 Status Codes
#Counting 404 Response Codes
not_found_df = logs_df.filter(logs_df.status==404)
print('Found {0} 404 URLs').format(not_found_df.count())
not_found_df.cache()
#Listing 404 Status Code Records
not_found_paths_df = not_found_df.select('path')
unique_not_found_paths_df = not_found_paths_df.distinct()
print '404 URLS:\n'
unique_not_found_paths_df.show(n=40, truncate=False)
#Listing the Top Twenty 404 Response Code paths
top_20_not_found_df = not_found_paths_df.groupBy('path').count().sort('count',ascending=False)
print 'Top Twenty 404 URLs:\n'
top_20_not_found_df.show(n=20, truncate=False)
#Listing the Top Twenty-five 404 Response Code Hosts
hosts_404_count_df = not_found_df.groupBy('host').count().sort('count',ascending=False)
print 'Top 25 hosts that generated errors:\n'
hosts_404_count_df.show(n=25, truncate=False)
#Listing 404 Errors per Day
errors_by_date_sorted_df = not_found_df.groupBy(dayofmonth('time').alias('day')).count()
print '404 Errors by day:\n'
errors_by_date_sorted_df.show()
errors_by_date_sorted_df.cache()
#Visualizing the 404 Errors by Day
days_with_errors_404 = errors_by_date_sorted_df.select('day').collect()
#print days_with_errors_404[2][0]
errors_404_by_day = errors_by_date_sorted_df.select('count').collect()
for i in range (len(days_with_errors_404)):
days_with_errors_404[i]=days_with_errors_404[i][0]
errors_404_by_day[i]=errors_404_by_day[i][0]
print days_with_errors_404
print errors_404_by_day
fig, ax = prepareSubplot(np.arange(0, 20, 5), np.arange(0, 600, 100))
colorMap = 'rainbow'
cmap = cm.get_cmap(colorMap)
plt.plot(days_with_errors_404, errors_404_by_day, color=cmap(0), linewidth=3)
plt.axis([0, max(days_with_errors_404), 0, max(errors_404_by_day)])
plt.xlabel('Day')
plt.ylabel('404 Errors')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)
display(errors_by_date_sorted_df)
#Top Five Days for 404 Errors
top_err_date_df = errors_by_date_sorted_df.sort('count',ascending=False)
print 'Top Five Dates for 404 Requests:\n'
top_err_date_df.show(5)
#Hourly 404 Errors
from pyspark.sql.functions import hour
hour_records_sorted_df = not_found_df.groupBy(hour('time').alias('hour')).count()
print 'Top hours for 404 requests:\n'
hour_records_sorted_df.show(24)
hour_records_sorted_df.cache()
#Visualizing the 404 Response Codes by Hour
hours_with_not_found =map (lambda hour: hour[0],(hour_records_sorted_df.select('hour').collect()))
not_found_counts_per_hour = map (lambda counts: counts[0],(hour_records_sorted_df.select('count').collect()))
print hours_with_not_found
print not_found_counts_per_hour
fig, ax = prepareSubplot(np.arange(0, 25, 5), np.arange(0, 500, 50))
colorMap = 'seismic'
cmap = cm.get_cmap(colorMap)
plt.plot(hours_with_not_found, not_found_counts_per_hour, color=cmap(0), linewidth=3)
plt.axis([0, max(hours_with_not_found), 0, max(not_found_counts_per_hour)])
plt.xlabel('Hour')
plt.ylabel('404 Errors')
plt.axhline(linewidth=3, color='#999999')
plt.axvline(linewidth=2, color='#999999')
display(fig)
display(hour_records_sorted_df)