forked from asdw12345/phish_detective
/
phish_detective.py
565 lines (485 loc) · 20 KB
/
phish_detective.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
"""
This scipt takes a sitedata json file as an input, extracts keywords, and does
a google query on those. A simple decision tree -based workflow categorizes website in to
- unresolved
- definitely not phishing
- probably not phishing
- probably phishing
- definitely phishing
"""
import bs4
import collections
import json
import keywords
import ocr
import os
import re
import requests
import simple_logger
import sys
import time
import urllib
import utils
import website_fetcher
import goslate
# import ngrams
logger = simple_logger.SimpleLogger()
logger.activate()
#############
# ARGUMENTS #
#############
# header data for google search
HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
# regular expression for extracting urls
GOOGLERX = re.compile("""http[s]?://[^\s'"]*""")
# max number of keywords extracted from a website
MAXCOUNT = 5
# stopmlds: mlds that almost always appear in Google's response
STOPMLDS = ['google', 'youtube', 'blogger', 'googleusercontent', 'schema']
####################
# hidden functions #
####################
def _load_json(jspath):
try:
with open(jspath) as f:
js = json.load(f)
except FileNotFoundError:
js = {}
return js
def _get_screenshot_path(jspath):
js = _load_json(jspath)
base = os.path.dirname(os.path.dirname(jspath))
path = os.path.join(base, 'websites', js['siteid'] + '.png')
return path
def _asks_password(js, value=None):
"""
Does the site contain a password field?
"""
sources = [source for source in js.get('external_source', {}).values()]
sources.append(js['source'])
for source in sources:
source = utils._replace_ampersands(source)
source = utils._replace_ad(source)
soup = bs4.BeautifulSoup(source, 'lxml')
for inputfield in soup.find_all('input'):
type = inputfield.get('type', None)
if type == 'password':
# logger.print('found a password field')
return True
# logger.print("no password fields")
return False
def _ocr_on_json(jspath):
js = _load_json(jspath)
if 'ocr' in js:
logger.print("this site has been ocr'd before")
return js
sspath = _get_screenshot_path(jspath)
stopwords = set(line.strip() for line in open('data/stopwords_en.txt'))
stopwords |= set(line.strip() for line in open('data/stopwords_www.txt'))
logger.print("doing ocr")
ocrtext = ocr.do_ocr(sspath, 'eng')
ocrtokens = [word for word in ocrtext.lower().split() if word not in stopwords]
js['ocr'] = ' '.join(ocrtokens)
with open(jspath, 'w') as f:
json.dump(js, f, indent=0, sort_keys=True)
return js
####################
# public functions #
####################
def fetch_urls(query):
"""
Do a Google search with a given query. Extract all URLs from the response for later processing.
Arguments
---------
query: str
string containing keywords separated by whitespace
Returns
-------
urls: set
the set of URLs returned by Google
"""
# quote whitespace so that the query can be placeed in a URL.
query = urllib.parse.quote(query)
url = 'http://www.google.com/search?q={}'.format(query)
r = requests.get(url, headers=HEADERS)
html = r.text
urls = set()
for url in GOOGLERX.findall(html):
if '<' in url or '\\' in url: # Google highlights search results
continue
mld, ps = keywords.split_mld_ps(url)
domain = mld + '.' + ps
if domain == 'google.fi' or domain == 'googleusercontent.com':
continue
urls.add(url)
return urls
def extract_domains(url_set, logging=False):
"""
Extract mail level domains and public suffixes from a set of urls.
Returns
-------
domains : set
tuples of form (mld, ps)
"""
domains = set()
for url in url_set:
mld, ps = keywords.split_mld_ps(url)
domains.add((mld, ps))
logger.print(" domains returned by google:", logging=logging)
for mld, ps in domains:
if mld not in STOPMLDS:
logger.print("{}.{}".format(mld, ps), nots=True, logging=logging)
return domains
def prominent_domains(js, keyw, domains=set(), extend_search=True):
"""
Given keywords of a site and domains returned by a google search, check
which domains are found from the site. Split the result on primary and
secondary targets.
Primary target is a domain that is found from the keywords or in the
domains guesses from the domain. Secondary target is a domain that is not
found from the keywords, but appears elsewhere in the site.
Parameters
----------
js : json object
contains site data
keyw : list
contains site keywords
domains : set
set of tuples (mld, ps)
extend_search : boolean
whether to look for prominent domains from text and links as well
Returns
-------
prominent : set
set of string "mld.ps" that either appear in keywords or can be guessed from the keywords
"""
prominent = set()
mld_guesses = keywords.guess_mld(js)
url_tokens = re.split('\W+', (js['starturl'] + ' ' + js['landurl']).lower())
title_tokens = re.split('\W+', js['title'].lower())
# logger.print("checking for prominent domains:")
for mld, ps in domains:
mld = mld.lower()
ps = ps.lower()
# segments = ngrams.segment(mld)
if mld in keyw:
logger.print("mld found from keywords: {}.{}".format(mld, ps), nots=True)
prominent.add('.'.join([mld, ps]))
# prominent.add((mld, ps))
elif mld in mld_guesses:
logger.print("mld found from mld-guessing: {}.{}".format(mld, ps), nots=True)
prominent.add('.'.join([mld, ps]))
# prominent.add((mld, ps))
# elif extend_search and ' '.join(segments) in ' '.join(js['text'].lower().split()) and mld not in STOPMLDS:
# logger.print("found by segmentation from text: {}.{}".format(mld, ps), nots=True)
# prominent.add('.'.join([mld, ps]))
# elif all(item in title_tokens for item in segments):
# logger.print("found by segmentation from title: {}.{}".format(mld, ps), nots=True)
# prominent.add('.'.join([mld, ps]))
# # prominent.add((mld, ps))
elif mld in url_tokens:
logger.print("mld in url: {}.{}".format(mld, ps), nots=True)
prominent.add('.'.join([mld, ps]))
# prominent.add((mld, ps))
if extend_search:
link_domains = set(keywords.split_mld_ps(link) for link in utils.extract_urls(js['source']))
link_domains |= set(keywords.split_mld_ps(link) for link in js['loglinks'])
# remove mlds that often occur: google, blogger, ... These are STOPMLDS
link_domains = set((mld, ps) for (mld, ps) in link_domains if mld not in STOPMLDS)
for dom in domains:
if dom in link_domains and dom not in prominent:
logger.print("mld found from links: {}.{}".format(*dom), nots=True)
prominent.add('.'.join(dom))
# prominent.add(dom)
return prominent
def build_query_domains(js):
google_domains = set()
# google search with mld_guesses
mld_guesses = keywords.guess_mld(js)
if mld_guesses:
mld_guess_str = ' '.join(['"{}"'.format(x) for x in mld_guesses])
logger.print("mld guesses: {}".format(mld_guess_str))
urls = fetch_urls(mld_guess_str)
google_domains |= extract_domains(urls)
else:
logger.print("no mld guesses")
langid = 'en'
# langid = goslate.Goslate().detect(js['text'])
logger.print("langid: {}".format(langid))
# google search with keywords
keyw = keywords.keywords(js, max_count=MAXCOUNT, boost=True, langid=langid)
# keyw = keywords.keywords(js, max_count=MAXCOUNT, augment=False)
keywstring = ' '.join(keyw)
if keywstring:
logger.print("keywords: {}".format(keywstring))
urls = fetch_urls(keywstring)
google_domains |= extract_domains(urls)
else:
logger.print("no keywords")
# google search with augmented keywords
augkeyw = keywords.keywords(js, max_count=MAXCOUNT, augment=True, langid=langid)
augkeywstring = ' '.join(augkeyw)
if augkeywstring != keywstring:
logger.print("augmented keywords: {}".format(augkeywstring))
urls = fetch_urls(augkeywstring)
google_domains |= extract_domains(urls)
return google_domains, keyw, augkeyw
def is_phish(js={}, jspath='', url=''):
"""
Decide whether a website is phishing using its keywords and a Google search
based on those.
Parameters
----------
js: dict, optional
contains site data
jspath: str, optional
path to a json file with the site data
url: str, optional
url of a website
Returns
-------
rank: int
* -1 = unresolved, fetching a website failed
* 0 = not phish
* 1 = suspicious
* 2 = phish
description: str
above description for the numerical values
targets: set
potential targets in case of 1 or 2, empty when rank is -1, 0
"""
# load json
if url:
fetcher = website_fetcher.WebsiteFetcher(logging=True, confirm=True)
# sitedata, screenshot = fetcher.fetch_sitedata_and_screenshot(url)
# js = sitedata
jspath, sspath = fetcher.fetch_and_save_data(url)
js = _load_json(jspath)
elif jspath:
js = _load_json(jspath)
sspath = _get_screenshot_path(jspath)
if not js:
logger.print("json file is empty; cannot continue")
return -1, 'unresolved', set()
logger.print("siteid: {}".format(js['siteid']))
landurl = js['landurl']
# logger.print("landing url: {}".format(landurl[:80]))
logger.print("loglinks:")
for link in js['loglinks']:
logger.print(link, nots=True)
mld, ps = keywords.split_mld_ps(landurl)
# logger.print("main level domain: {}".format(mld))
# 1. TESTS
# password?
pw = _asks_password(js)
if pw:
logger.print("asks for a password")
else:
logger.print("does not ask for a password")
google_domains, keyw, augkeyw = build_query_domains(js)
logger.print("query-domains:")
for dom in google_domains:
logger.print('.'.join(dom), nots=True)
if pw:
# logger.print("asks for a password")
prominent_domains_found = prominent_domains(js, keyw + augkeyw, google_domains, extend_search=False)
mld_in_gmld = (mld, ps) in google_domains
if mld_in_gmld:
logger.print("a query-mld matches with site-mld -> not phish")
return 0, 'not phish', set()
else:
logger.print("no query-mld matches with site-mld")
if prominent_domains_found:
logger.print("prominent domains found -> phish")
return 2, 'phish', prominent_domains_found
else:
logger.print("did not find prominent mlds")
logger.print("doing ocr")
js = _ocr_on_json(jspath)
ocrkeyw = keywords.keywords(js, max_count=MAXCOUNT, augment=True, use_ocr=True)
keywstring = ' '.join(ocrkeyw)
logger.print("ocr keywords: {}".format(keywstring))
urls = fetch_urls(keywstring)
google_domains = extract_domains(urls)
prominent_domains_found = prominent_domains(js, ocrkeyw, google_domains, extend_search=True)
mld_in_gmld = (mld, ps) in google_domains
if mld_in_gmld:
logger.print("a query-mld matches with site-mld -> not phish")
return 0, 'not phish', set()
else:
logger.print("no query-mld matches with site-mld")
if prominent_domains_found:
logger.print("prominent domains found -> phish")
return 2, 'phish', prominent_domains_found
else:
logger.print("prominent domains not found -> possibly phish")
return 1, 'suspicious', set()
else:
# logger.print("password not found")
prominent_domains_found = prominent_domains(js, keyw, google_domains, extend_search=False)
mld_in_gmld = (mld, ps) in google_domains
if mld_in_gmld:
logger.print("a query-mld matches with site-mld -> not phish")
return 0, 'not phish', set()
else:
logger.print("no query-mld matches with site-mld")
if prominent_domains_found:
logger.print("prominent domains found -> suspicious")
return 1, 'suspicious', prominent_domains_found
else:
logger.print("prominent domains not found -> not phish")
return 0, 'not phish', set()
# logger.print("did not find prominent mlds")
# logger.print("doing ocr")
# js = _ocr_on_json(jspath)
# ocrkeyw = keywords.keywords(js, max_count=MAXCOUNT, augment=True, use_ocr=True)
# keywstring = ' '.join(ocrkeyw)
# logger.print("ocr keywords: {}".format(keywstring))
# urls = fetch_urls(keywstring)
# google_domains = extract_domains(urls)
# prominent_domains_found = prominent_domains(js, ocrkeyw, google_domains, extend_search=True)
# mld_in_gmld = (mld, ps) in google_domains
# if mld_in_gmld:
# logger.print("a query-mld matches with site-mld -> not phish")
# return 0, 'not phish', set()
# else:
# logger.print("no query-mld matches with site-mld")
# if prominent_domains_found:
# logger.print("prominent domains found -> suspicious")
# return 1, 'suspicious', prominent_domains_found
# else:
# logger.print("prominent domains not found -> not phish")
# return 0, 'not phish', set()
def is_phish2(ws=None):
"""
Decide whether a website is phishing using its keywords and a Google search
based on those.
Parameters
----------
ws: website object or None
contains all downloaded information about the site
Returns
-------
rank: int
* -1 = unresolved, fetching a website failed
* 0 = not phish
* 1 = suspicious
* 2 = phish
description: str
above description for the numerical values
targets: set
potential targets in case of 1 or 2, empty when rank is -1, 0
"""
if ws is None:
# logger.print("website object is None is empty; cannot continue")
return -1, 'unresolved', set()
# logger.print("siteid: {}".format(ws.siteid))
# logger.print("landing url: {}".format(ws.landurl[:80]))
mld, ps = keywords.split_mld_ps(ws.landurl)
# 1. TESTS
# password?
pw = ws.has_password
# if pw:
# logger.print("asks for a password")
# else:
# logger.print("does not ask for a password")
google_domains, keyw, augkeyw = extract_domains(ws.js['urls_keywords'] + ws.js['urls_augmented']), ws.keywords(), ws.augmented_keywords()
# logger.print("query-domains:")
# for dom in google_domains:
# logger.print('.'.join(dom), nots=True)
if pw:
# logger.print("password found")
prominent_domains_found = prominent_domains(ws.js, keyw + augkeyw, google_domains, extend_search=False)
mld_in_gmld = (mld, ps) in google_domains
if mld_in_gmld:
# logger.print("a query-mld matches with site-mld -> not phish")
# return 0
return 0, 'not phish', set()
else:
# logger.print("no query-mld matches with site-mld")
if prominent_domains_found:
# logger.print("prominent domains found -> phish")
# return 2
return 2, 'phish', prominent_domains_found
else:
# logger.print("did not find prominent mlds")
# logger.print("doing ocr")
ocrkeyw = keywords.keywords(ws.js, max_count=MAXCOUNT, augment=True, use_ocr=True)
keywstring = ' '.join(ocrkeyw)
# logger.print("ocr keywords: {}".format(keywstring))
urls = ws.js['urls_ocr']
google_domains = extract_domains(urls)
prominent_domains_found = prominent_domains(ws.js, ocrkeyw, google_domains, extend_search=True)
mld_in_gmld = (mld, ps) in google_domains
if mld_in_gmld:
# logger.print("a query-mld matches with site-mld -> not phish")
# return 0
return 0, 'not phish', set()
else:
# logger.print("no query-mld matches with site-mld")
if prominent_domains_found:
# logger.print("prominent domains found -> phish")
# return 2
return 2, 'phish', prominent_domains_found
else:
# logger.print("prominent domains not found -> possibly phish")
# return 1
return 1, 'suspicious', set()
else:
# logger.print("password not found")
prominent_domains_found = prominent_domains(ws.js, keyw, google_domains, extend_search=False)
mld_in_gmld = (mld, ps) in google_domains
if mld_in_gmld:
# logger.print("a query-mld matches with site-mld -> not phish")
# return 0
return 0, 'not phish', set()
else:
# logger.print("no query-mld matches with site-mld")
if prominent_domains_found:
# logger.print("prominent domains found -> suspicious")
# return 1
return 1, 'suspicious', prominent_domains_found
else:
# logger.print("prominent domains not found -> not phish")
# return 0
return 0, 'not phish', set()
# logger.print("did not find prominent mlds")
# logger.print("doing ocr")
# js = _ocr_on_json(jspath)
# ocrkeyw = keywords.keywords(js, max_count=MAXCOUNT, augment=True, use_ocr=True)
# keywstring = ' '.join(ocrkeyw)
# logger.print("ocr keywords: {}".format(keywstring))
# urls = fetch_urls(keywstring)
# google_domains = extract_domains(urls)
# prominent_domains_found = prominent_domains(js, ocrkeyw, google_domains, extend_search=True)
# mld_in_gmld = (mld, ps) in google_domains
# if mld_in_gmld:
# logger.print("a query-mld matches with site-mld -> not phish")
# return 0, 'not phish', set()
# else:
# logger.print("no query-mld matches with site-mld")
# if prominent_domains_found:
# logger.print("prominent domains found -> suspicious")
# return 1, 'suspicious', prominent_domains_found
# else:
# logger.print("prominent domains not found -> not phish")
# return 0, 'not phish', set()
########
# main #
########
if __name__ == '__main__':
usage = 'usage:\n\tpython {0} --url <url>\n\tpython {0} --json <jspath>'.format(__file__)
if not sys.argv[1:]:
print(usage)
else:
logger.activate()
if sys.argv[1] == '--url':
url = sys.argv[2]
print(is_phish(url=url))
elif sys.argv[1] == '--json':
jspath = sys.argv[2]
print(is_phish(jspath=jspath))
else:
print(sys.argv[1])
print(usage)
print()