def SiteURLSQL(item, LOG, SQL, TABLEname, PROXY, UAFILE, UAG):
    # remove URL containing UID-style strings
    siteURL = quote(re.split("(?:[0-9a-fA-F]:?){32}", item['page']['url'])[0], ':/')
    dn = dirname(siteURL)
    # Test if entry still exist in DB
    if SQL.SQLiteVerifyEntry(TABLEname, dn) == 0:
        now = str(TimestampNow().Timestamp())
        siteDomain = urlparse(item['page']['url']).netloc
        source_url = item['result'].replace("/api/v1", "")
        try:
            IPaddress = socket.gethostbyname(siteDomain)
            if IPaddress:
                rASN = NetInfo()
                ASN = rASN.GetASN(IPaddress).strip('\"')
            else:
                pass
        # can't resolv
        except:
            IPaddress = ""
            ASN = ""
        # HTTP connection
        try:
            proxies = {'http': PROXY, 'https': PROXY}
            UA = UAG.ChooseUA(UAFILE)
            user_agent = {'User-agent': UA}
            try:
                r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True)
                lastHTTPcode = str(r.status_code)
            except ValueError:
                # No user-agent configured
                r = requests.get(siteURL, proxies=proxies, allow_redirects=True)
                lastHTTPcode = str(r.status_code)
            except requests.exceptions.Timeout:
                lastHTTPcode = "timeout"
            except requests.exceptions.ConnectionError:
                lastHTTPcode = "aborted"
            except:
                lastHTTPcode = "---"
                pass
        except Exception as e:
            # Unknown status code
            LOG.error("Connection error: {}".format(e))
            pass
        LOG.info(siteURL + " " + siteDomain + " " + IPaddress + " " + source_url + " " + now + " " + lastHTTPcode)
        SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode, ASN)
    else:
        LOG.debug("Entry still known: " + siteURL)
        pass

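# --- Illustrative sketch (not part of the original module; function name and example URL are
# --- hypothetical): the UID-stripping step shared by the collectors above, using the module-level
# --- re/quote imports. A URL is truncated before the first 32-hex-character (MD5/UID-style) token
# --- so per-victim unique links collapse to a common prefix before deduplication.
def strip_uid_url(url):
    """Truncate a URL before the first 32-hex-char token and URL-quote it (keeping ':' and '/')."""
    return quote(re.split(r"(?:[0-9a-fA-F]:?){32}", url)[0], ':/')

# strip_uid_url("http://bad.example/kit/0123456789abcdef0123456789abcdef/index.php")
# -> "http://bad.example/kit/"
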
def OpenphishExtractor(openphish_file, SearchString, LOG, SQL, TABLEname, PROXY, UAFILE):
    UAG = UAgent()
    with open(openphish_file, "rt") as txt:
        for entry in txt:
            ## Search
            if SearchString in entry:
                # remove URL containing UID-style strings
                siteURL = re.split("(?:[0-9a-fA-F]:?){32}", entry.rstrip())[0]
                dn = dirname(siteURL)
                ## Test if entry still exist in DB
                #if SQL.SQLiteVerifyEntry(TABLEname, siteURL) == 0:
                if SQL.SQLiteVerifyEntry(TABLEname, dn) == 0:
                    now = str(TimestampNow().Timestamp())
                    siteDomain = urlparse(entry).netloc
                    source_url = openphish_file
                    try:
                        IPaddress = socket.gethostbyname(siteDomain)
                    # can't resolv
                    except:
                        IPaddress = ""
                    # HTTP connection
                    try:
                        proxies = {'http': PROXY, 'https': PROXY}
                        UA = UAG.ChooseUA(UAFILE)
                        user_agent = {'User-agent': UA}
                        try:
                            r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                            lastHTTPcode = str(r.status_code)
                        except ValueError:
                            # No user-agent configured
                            r = requests.get(siteURL, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                            lastHTTPcode = str(r.status_code)
                        except requests.exceptions.Timeout:
                            lastHTTPcode = "timeout"
                        except requests.exceptions.ConnectionError:
                            lastHTTPcode = "aborted"
                        except:
                            lastHTTPcode = "---"
                            err = sys.exc_info()
                            LOG.error("HTTP error: " + str(err))
                            pass
                    except:
                        # Unknown status code
                        err = sys.exc_info()
                        LOG.error("Connection error: " + str(err))
                        pass
                    # Add data into database
                    LOG.info(siteURL)
                    SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode)
                else:
                    LOG.debug("Entry still known: " + siteURL)
                    pass
            else:
                pass

def PKDownloadOpenDir(siteURL, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE):
    global Ziplst
    proxies = {'http': PROXY, 'https': PROXY}
    UAG = UAgent()
    UA = UAG.ChooseUA(UAFILE)
    user_agent = {'User-agent': UA}
    now = str(TimestampNow().Timestamp())
    SHA = SHA256()
    Ziplst = []
    rhtml = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
    thtml = BeautifulSoup(rhtml.text, 'html.parser')
    PageTitle = thtml.title.text
    thtmlatag = thtml.select('a')
    Ziplst += [siteURL + "/" + tag['href'] for tag in thtmlatag if '.zip' in tag.text]
    for file in Ziplst:
        try:
            r = requests.get(file, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
            lastHTTPcode = str(r.status_code)
            zzip = file.replace('/', '_').replace(':', '')
            if zipfile.is_zipfile(io.BytesIO(r.content)):
                savefile = DLDir + zzip
                # Still collected file
                if os.path.exists(savefile):
                    LOG.info("[DL ] Found still collected archive: " + savefile)
                    return
                # New file to download
                else:
                    LOG.info("[DL ] Found archive in an open dir, downloaded it as: " + savefile)
                    with open(savefile, "wb") as code:
                        code.write(r.content)
                        pass
                    ZipFileName = str(zzip)
                    ZipFileHash = SHA.hashFile(savefile)
                    SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                    return
            else:
                pass
        except requests.exceptions.ContentDecodingError:
            LOG.error("[DL ] content-type error")
        except:
            pass

def AddUniqueURL(URLadd, LOG, SQL, TABLEname, PROXY, UAFILE):
    UAG = UAgent()
    # add schema
    if URLadd.startswith("http://") or URLadd.startswith("https://"):
        pass
    else:
        URLadd = "http://{}".format(URLadd)
    # remove URL containing UID-style strings
    siteURL = re.split("(?:[0-9a-fA-F]:?){32}", URLadd.rstrip())[0]
    # Test if entry still exist in DB
    if SQL.SQLiteVerifyEntry(TABLEname, siteURL) == 0:
        now = str(TimestampNow().Timestamp())
        siteDomain = urlparse(URLadd).netloc
        source_url = "Manual"
        try:
            IPaddress = socket.gethostbyname(siteDomain)
            rASN = NetInfo()
            ASN = rASN.GetASN(IPaddress).strip('\"')
        # can't resolv
        except:
            IPaddress = ""
            ASN = ""
        # HTTP connection
        try:
            proxies = {'http': PROXY, 'https': PROXY}
            UA = UAG.ChooseUA(UAFILE)
            user_agent = {'User-agent': UA}
            try:
                r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                lastHTTPcode = str(r.status_code)
            except ValueError:
                # No user-agent configured
                r = requests.get(siteURL, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                lastHTTPcode = str(r.status_code)
            except requests.exceptions.Timeout:
                lastHTTPcode = "timeout"
            except requests.exceptions.ConnectionError:
                lastHTTPcode = "aborted"
            except:
                lastHTTPcode = "---"
                err = sys.exc_info()
                LOG.error("HTTP error: " + str(err))
                pass
        except:
            # Unknown status code
            err = sys.exc_info()
            LOG.error("Connection error: " + str(err))
            pass
        # Add data into database
        LOG.info(siteURL)
        SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode, ASN)
    else:
        LOG.info("Entry still known: " + siteURL)
        pass

def UrlqueryExtractor(LOG, SQL, TABLEname, PROXY, UAFILE):
    UAG = UAgent()
    ## Search in Urlquery HTML file
    try:
        m = re.findall(r"<td><a title='(.*?)' href='(.*?)'>", HTMLText)
        for line in m:
            # remove URL containing UID-style strings
            siteURL = re.split("(?:[0-9a-fA-F]:?){32}", line[0])[0]
            if siteURL.startswith('https:'):
                siteDomain = siteURL.split('/')[2]
            else:
                siteDomain = siteURL.split('/')[0]
                siteURL = "http://" + siteURL
            ## Test if entry still exist in DB
            if SQL.SQLiteVerifyEntry(TABLEname, siteURL) == 0:
                ## Proceed to information retrieval
                source_url = "https://urlquery.net/" + line[1]
                try:
                    IPaddress = socket.gethostbyname(siteDomain)
                # can't resolv
                except:
                    IPaddress = ""
                now = str(TimestampNow().Timestamp())
                # HTTP connection
                try:
                    proxies = {'http': PROXY, 'https': PROXY}
                    UA = UAG.ChooseUA(UAFILE)
                    user_agent = {'User-agent': UA}
                    try:
                        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                        lastHTTPcode = str(r.status_code)
                    except ValueError:
                        # No user-agent configured
                        r = requests.get(siteURL, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                        lastHTTPcode = str(r.status_code)
                    except requests.exceptions.Timeout:
                        lastHTTPcode = "timeout"
                    except requests.exceptions.ConnectionError:
                        lastHTTPcode = "aborted"
                    except:
                        lastHTTPcode = "---"
                        pass
                except:
                    # Unknown status code
                    err = sys.exc_info()
                    LOG.error("Connection error: " + str(err))
                    pass
                LOG.info(siteURL + " " + siteDomain + " " + IPaddress + " " + source_url + " " + now + " " + lastHTTPcode)
                SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode)
            else:
                LOG.debug("Entry still known: " + siteURL)
                pass
    except:
        err = sys.exc_info()
        LOG.error("HTML parser Error! " + str(err))

def PhishtankExtractor(phishtank_file, SearchString, LOG, SQL, TABLEname, PROXY, UAFILE):
    UAG = UAgent()
    # Search in Phishtank JSON file
    file = json.loads(open(phishtank_file).read())
    for entry in file:
        # Search
        if SearchString in entry['url']:
            # remove URL containing UID-style strings
            siteURL = re.split("(?:[0-9a-fA-F]:?){32}", entry['url'])[0]
            dn = dirname(siteURL)
            # Test if entry still exist in DB
            if SQL.SQLiteVerifyEntry(TABLEname, dn) == 0:
                IPaddress = entry['details'][0]['ip_address']
                source_url = entry['phish_detail_url']
                siteDomain = urlparse(entry['url']).netloc
                now = str(TimestampNow().Timestamp())
                try:
                    IPaddress = socket.gethostbyname(siteDomain)
                # can't resolv
                except:
                    IPaddress = ""
                # HTTP connection
                try:
                    proxies = {'http': PROXY, 'https': PROXY}
                    UA = UAG.ChooseUA(UAFILE)
                    user_agent = {'User-agent': UA}
                    try:
                        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                        lastHTTPcode = str(r.status_code)
                    except ValueError:
                        # No user-agent configured
                        r = requests.get(siteURL, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                        lastHTTPcode = str(r.status_code)
                    except requests.exceptions.Timeout:
                        lastHTTPcode = "timeout"
                    except requests.exceptions.ConnectionError:
                        lastHTTPcode = "aborted"
                    except:
                        lastHTTPcode = "---"
                        err = sys.exc_info()
                        LOG.error("HTTP error: " + str(err))
                        pass
                except:
                    # Unknown status code
                    err = sys.exc_info()
                    LOG.error("Connection error: " + str(err))
                    pass
                LOG.info(siteURL)
                SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode)
            else:
                LOG.debug("Entry still known: " + siteURL)
                pass
        else:
            pass

def TryPKDownload(siteURL, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE):
    global ziplist
    proxies = {'http': PROXY, 'https': PROXY}
    UAG = UAgent()
    UA = UAG.ChooseUA(UAFILE)
    user_agent = {'User-agent': UA}
    now = str(TimestampNow().Timestamp())
    SHA = SHA256()
    PsiteURL = None
    ResiteURL = siteURL
    PsiteURL = urlparse(ResiteURL)
    if len(PsiteURL.path.split("/")[1:]) >= 2:
        siteURL = ResiteURL.rsplit('/', 1)[0]
    else:
        siteURL = ResiteURL
    # Let's try to find a phishing kit source archive
    try:
        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
        if (str(r.status_code) != "404"):
            LOG.info("[" + str(r.status_code) + "] " + r.url)
            SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
            if SQL.SQLiteInvestigVerifyEntry(InvTABLEname, siteDomain, IPaddress) == 0:
                SQL.SQLiteInvestigInsert(InvTABLEname, siteURL, siteDomain, IPaddress, now, str(r.status_code))
            else:
                pass
            ziplist = []
            path = siteURL
            pathl = '/'.join(path.split("/")[:3])
            pathlist = path.split("/")[3:]
            # Make list
            current = 0
            newpath = ""
            while current < len(pathlist):
                if current == 0:
                    newpath = pathlist[current]
                else:
                    newpath = newpath + "/" + pathlist[current]
                current = current + 1
                pathD = pathl + "/" + newpath
                ziplist.append(pathD)
            # Get page title
            try:
                if len(ziplist) >= 1:
                    rhtml = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                    thtml = rhtml.text
                    tit = re.search('<title>(.*?)</title>', thtml, re.IGNORECASE)
                    if tit is not None:
                        PageTitle = tit.group(1)
                        LOG.info(PageTitle)
                        SQL.SQLiteInvestigUpdateTitle(InvTABLEname, siteURL, PageTitle)
                    else:
                        pass
            except AttributeError:
                pass
            except requests.exceptions.ReadTimeout:
                pass
            except:
                err = sys.exc_info()
                LOG.error("Get PageTitle Error: " + siteURL + str(err))
            try:
                # Try to find and download phishing kit archive (.zip)
                if len(ziplist) >= 1:
                    for zip in ziplist:
                        if not any(c in os.path.basename(os.path.normpath(zip)) for c in (' = ', '%', '?', '-', '@')):
                            try:
                                LOG.info("trying " + zip + ".zip")
                                rz = requests.get(zip + ".zip", headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                                if str(rz.status_code) != "404":
                                    lastHTTPcode = str(rz.status_code)
                                    zzip = zip.replace('/', '_').replace(':', '')
                                    try:
                                        if "application/zip" in rz.headers['content-type'] or "application/octet-stream" in rz.headers['content-type']:
                                            savefile = DLDir + zzip + '.zip'
                                            # Still collected file
                                            if os.path.exists(savefile):
                                                LOG.info("[DL ] Found still collected archive: " + savefile)
                                                return
                                            # New file to download
                                            else:
                                                LOG.info("[DL ] Found archive, downloaded it as: " + savefile)
                                                with open(savefile, "wb") as code:
                                                    code.write(rz.content)
                                                    pass
                                                ZipFileName = str(zzip + '.zip')
                                                ZipFileHash = SHA.hashFile(savefile)
                                                SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                                                return
                                        else:
                                            pass
                                    except requests.exceptions.ContentDecodingError:
                                        LOG.error("[DL ] content-type error")
                                    except:
                                        pass
                                # 404
                                else:
                                    pass
                            except requests.exceptions.ReadTimeout:
                                LOG.debug("Connection Timeout: " + siteURL)
                            except requests.exceptions.ConnectTimeout:
                                LOG.debug("Connection Timeout")
                            except:
                                err = sys.exc_info()
                                LOG.error("Error: " + str(err))
                                print("Error: " + str(err))
                                pass
                        # else:
                        #     pass
                        else:
                            pass
                    else:
                        pass
                # Ziplist empty
                else:
                    pass
            except:
                err = sys.exc_info()
                LOG.error("DL Error: " + str(err))
        else:
            LOG.debug("[" + str(r.status_code) + "] " + r.url)
            lastHTTPcode = str(r.status_code)
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, lastHTTPcode)
            SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
    except requests.exceptions.ConnectionError:
        err = sys.exc_info()
        if '0x05: Connection refused' in str(err):
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. refused')
        if '0x04: Host unreachable' in str(err):
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Unreachable')
        else:
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. error')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection error: " + siteURL)
    except requests.exceptions.ConnectTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. timeout')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection Timeout: " + siteURL)
    except requests.exceptions.ReadTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. readtimeout')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection Read Timeout: " + siteURL)
    except requests.exceptions.MissingSchema:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Malformed URL')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Malformed URL, skipping: " + siteURL + "\n")
    except:
        err = sys.exc_info()
        LOG.error("Error: " + str(err))

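# --- Illustrative sketch (not from the original source; names are hypothetical): the candidate list
# --- built in TryPKDownload above enumerates every cumulative path prefix of the URL; ".zip" is then
# --- appended to each candidate when probing for a phishing-kit archive.
def candidate_paths(url):
    """Return scheme://host/seg1, scheme://host/seg1/seg2, ... for each path segment of url."""
    parts = url.split("/")
    root = "/".join(parts[:3])  # scheme://host
    out = []
    newpath = ""
    for seg in parts[3:]:
        newpath = seg if not newpath else newpath + "/" + seg
        out.append(root + "/" + newpath)
    return out

# candidate_paths("http://bad.example/a/b/index.php")
# -> ['http://bad.example/a', 'http://bad.example/a/b', 'http://bad.example/a/b/index.php']
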
def PKDownloadOpenDir(siteURL, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE, ASN):
    global Ziplst
    proxies = {'http': PROXY, 'https': PROXY}
    UAG = UAgent()
    UA = UAG.ChooseUA(UAFILE)
    user_agent = {'User-agent': UA}
    now = str(TimestampNow().Timestamp())
    SHA = SHA256()
    Ziplst = []
    rhtml = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
    thtml = BeautifulSoup(rhtml.text, 'html.parser')
    try:
        PageTitle = thtml.title.text.strip()
    except:
        PageTitle = None
    if PageTitle is not None:
        PageTitle = re.sub(r'\s+', ' ', PageTitle)
        SQL.SQLiteInvestigUpdateTitle(InvTABLEname, siteURL, PageTitle)
    else:
        pass
    thtmlatag = thtml.select('a')
    Ziplst += [siteURL + "/" + tag['href'] for tag in thtmlatag if '.zip' in tag.text]
    for f in Ziplst:
        try:
            r = requests.get(f, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
            lastHTTPcode = str(r.status_code)
            # Reduce filename length
            if len(f) > 250:
                zzip = f.replace('/', '_').replace(':', '')[:250]
            else:
                zzip = f.replace('/', '_').replace(':', '')
            try:
                savefile = DLDir + zzip
                # Still collected file
                if os.path.exists(savefile):
                    LOG.info("[DL ] Found still collected archive: " + savefile)
                # New file to download
                else:
                    if zipfile.is_zipfile(io.BytesIO(r.content)):
                        LOG.info("[DL ] Found archive in an open dir, downloaded it as: " + savefile)
                        with open(savefile, "wb") as code:
                            code.write(r.content)
                            pass
                        ZipFileName = str(zzip + '.zip')
                        ZipFileHash = SHA.hashFile(savefile)
                        # Extract e-mails from downloaded file
                        try:
                            ZS = ZipSearch()
                            extracted_emails = str(ZS.PKzipSearch(InvTABLEname, SQL, LOG, DLDir, savefile)).strip("[]").replace("'", "")
                            LOG.info("[Emails] found: {}".format(extracted_emails))
                            SQL.SQLiteInvestigInsertEmail(InvTABLEname, extracted_emails, ZipFileName)
                        except Exception as e:
                            LOG.info("Extracted emails exception: {}".format(e))
                        SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                    else:
                        pass
            except Exception as e:
                LOG.error("Error downloading file: {}".format(e))
        except requests.exceptions.ContentDecodingError:
            LOG.error("[DL ] content-type error")

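# --- Illustrative helper (assumption: mirrors the in-memory check used by the open-dir downloader
# --- above, not an API of the project): validate downloaded bytes as a ZIP archive before writing
# --- them to disk, using the module-level zipfile/io imports.
def looks_like_zip(content):
    """True when the raw HTTP response body parses as a ZIP archive."""
    return zipfile.is_zipfile(io.BytesIO(content))

# Typical use against a requests response: looks_like_zip(r.content)
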
def TryPKDownload(siteURL, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE, ASN):
    global ziplist
    global PageTitle
    proxies = {'http': PROXY, 'https': PROXY}
    UAG = UAgent()
    UA = UAG.ChooseUA(UAFILE)
    user_agent = {'User-agent': UA}
    now = str(TimestampNow().Timestamp())
    SHA = SHA256()
    # Let's try to find a phishing kit source archive
    try:
        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
        # Generate page hash
        try:
            soup = BeautifulSoup(r.content, 'lxml')
            # Body hash only
            # body = soup.find('body')
            try:
                # page_body = body.findChildren()
                page_body = soup
            except:
                # print(r.content) ## print, frequently, JS in <head>
                pass
            try:
                sha1 = hashlib.sha1()
                sha1.update(repr(page_body).encode("utf-8"))
                PageHash = sha1.hexdigest()
                SQL.SQLiteInsertPageHash(TABLEname, siteURL, PageHash)
            except:
                pass
        except Exception as e:
            print(e)
        # if (str(r.status_code) != "404"):
        LOG.info("[" + str(r.status_code) + "] " + r.url)
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        if SQL.SQLiteInvestigVerifyEntry(InvTABLEname, siteDomain, IPaddress) == 0:
            SQL.SQLiteInvestigInsert(InvTABLEname, siteURL, siteDomain, IPaddress, now, str(r.status_code))
        else:
            pass
        ziplist = []
        path = siteURL
        pathl = '/'.join(path.split("/")[:3])
        pathlist = path.split("/")[3:]
        # Make list
        current = 0
        newpath = ""
        while current < len(pathlist):
            if current == 0:
                newpath = pathlist[current]
            else:
                newpath = newpath + "/" + pathlist[current]
            current = current + 1
            pathD = pathl + "/" + newpath
            ziplist.append(pathD)
        rootpath = pathl + "/"
        if rootpath != pathD:
            ziplist.append(rootpath)
        else:
            pass
        # Get page title
        try:
            if len(ziplist) >= 1:
                rhtml = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                thtml = BeautifulSoup(rhtml.text, 'html.parser')
                try:
                    PageTitle = thtml.title.text.strip()
                except:
                    PageTitle = None
                if PageTitle is not None:
                    PageTitle = re.sub(r'\s+', ' ', PageTitle)
                    LOG.info(PageTitle)
                    SQL.SQLiteInvestigUpdateTitle(InvTABLEname, siteURL, PageTitle)
                else:
                    pass
        except AttributeError:
            pass
        except requests.exceptions.ReadTimeout:
            pass
        except requests.exceptions.ConnectTimeout:
            pass
        except:
            err = sys.exc_info()
            LOG.error("Get PageTitle Error: " + siteURL + str(err))
        # Try to find and download phishing kit archive (.zip)
        try:
            if len(ziplist) >= 1:
                for zip in ziplist:
                    if not any(c in os.path.basename(os.path.normpath(zip)) for c in (' = ', '%', '?', '-', '@')):
                        try:
                            # if URL is not rootpath siteURL
                            if int(len(zip.split("/")[3:][0])) > 0:
                                LOG.info("trying " + zip + ".zip")
                                # Try to use cfscraper if Cloudflare's check
                                if "Cloudflare" in PageTitle:
                                    scraper = cfscrape.create_scraper()
                                    rz = scraper.get(zip + ".zip", headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                                else:
                                    rz = requests.get(zip + ".zip", headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                                # if str(rz.status_code) != "404":
                                lastHTTPcode = str(rz.status_code)
                                # Reduce filename length
                                if len(zip) > 250:
                                    zzip = zip.replace('/', '_').replace(':', '')[:250]
                                else:
                                    zzip = zip.replace('/', '_').replace(':', '')
                                try:
                                    savefile = DLDir + zzip + '.zip'
                                    # Still collected file
                                    if os.path.exists(savefile):
                                        LOG.info("[DL ] Found still collected archive: " + savefile)
                                        return
                                    # New file to download
                                    else:
                                        if zipfile.is_zipfile(io.BytesIO(rz.content)):
                                            LOG.info("[DL ] Found archive, downloaded it as: " + savefile)
                                            with open(savefile, "wb") as code:
                                                code.write(rz.content)
                                                pass
                                            ZipFileName = str(zzip + '.zip')
                                            ZipFileHash = SHA.hashFile(savefile)
                                            SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                                            # Extract e-mails from downloaded file
                                            try:
                                                ZS = ZipSearch()
                                                extracted_emails = str(ZS.PKzipSearch(InvTABLEname, SQL, LOG, DLDir, savefile)).strip("[]").replace("'", "")
                                                LOG.info("[Email] Found: {}".format(extracted_emails))
                                                SQL.SQLiteInvestigInsertEmail(InvTABLEname, extracted_emails, ZipFileName)
                                            except Exception as e:
                                                LOG.info("Extracted emails exception: {}".format(e))
                                            return
                                        else:
                                            pass
                                except requests.exceptions.ContentDecodingError:
                                    LOG.error("[DL ] content-type error")
                                except:
                                    pass
                            # rootpath of siteURL
                            else:
                                rr = requests.get(zip, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                                thtml = BeautifulSoup(rr.text, 'html.parser')
                                try:
                                    PageTitle = thtml.title.text.strip()
                                except:
                                    PageTitle = None
                                if PageTitle is not None:
                                    PageTitle = re.sub(r'\s+', ' ', PageTitle)
                                else:
                                    pass
                        except requests.exceptions.ReadTimeout:
                            LOG.debug("Connection Timeout: " + siteURL)
                        except requests.exceptions.ConnectTimeout:
                            LOG.debug("Connection Timeout")
                        except Exception as e:
                            LOG.error("Error Downloading zip: {}".format(e))
                            pass
                        # Search for OpenDir
                        try:
                            if PageTitle is not None:
                                # OpenDir's zip search
                                if 'Index of' in PageTitle:
                                    PKDownloadOpenDir(zip, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE, ASN)
                                # 000webhostapp OpenDir-like zip search
                                elif '.000webhostapp.com Free Website' in PageTitle:
                                    PKDownloadOpenDir(zip, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE, ASN)
                                else:
                                    pass
                            else:
                                pass
                        except Exception as e:
                            LOG.error("Potential OpenDir connection error: " + str(e))
                            pass
                    else:
                        pass
                else:
                    pass
            # Ziplist empty
            else:
                pass
        except Exception as e:
            LOG.error("DL Error: " + str(e))
    except requests.exceptions.ConnectionError:
        err = sys.exc_info()
        if '0x05: Connection refused' in str(err):
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. refused')
        if '0x04: Host unreachable' in str(err):
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Unreachable')
        else:
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. error')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection error: " + siteURL)
    except requests.exceptions.ConnectTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. timeout')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection Timeout: " + siteURL)
    except requests.exceptions.ReadTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. readtimeout')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection Read Timeout: " + siteURL)
    except requests.exceptions.MissingSchema:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Malformed URL')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Malformed URL, skipping: " + siteURL)
    except requests.exceptions.InvalidURL:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Malformed URL')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Malformed URL, skipping: " + siteURL)
    except requests.exceptions.ChunkedEncodingError:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Can\'t read data')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Can't read data, skipping: " + siteURL)
    except requests.exceptions.TooManyRedirects:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Too many redirects')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Too many redirects, skipping: " + siteURL)
    except KeyboardInterrupt:
        LOG.info("Shutdown requested...exiting")
        os._exit(0)
    except Exception as e:
        LOG.error("Error trying to find kit: " + str(e))

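# --- Illustrative sketch (assumption, not the project's API; the helper name is hypothetical):
# --- cfscrape.create_scraper() returns a requests-compatible session that solves Cloudflare's
# --- JavaScript challenge, which is why the download loop above switches to it when the landing
# --- page title mentions "Cloudflare".
def fetch_maybe_cloudflare(url, page_title, **kwargs):
    """Use a cfscrape session when the page looks Cloudflare-protected, plain requests otherwise."""
    if page_title and "Cloudflare" in page_title:
        return cfscrape.create_scraper().get(url, **kwargs)
    return requests.get(url, **kwargs)
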
def SiteURLSQL(phishtank_file, entry, LOG, SQL, TABLEname, PROXY, UAFILE, UAG):
    # remove URL containing UID-style strings
    siteURL = quote(re.split("(?:[0-9a-fA-F]:?){32}", entry['url'])[0], ':/')
    dn = dirname(siteURL)
    # Test if entry still exist in DB
    if SQL.SQLiteVerifyEntry(TABLEname, dn) == 0:
        IPaddress = entry['details'][0]['ip_address']
        source_url = entry['phish_detail_url']
        siteDomain = urlparse(entry['url']).netloc
        now = str(TimestampNow().Timestamp())
        try:
            IPaddress = socket.gethostbyname(siteDomain)
            if IPaddress:
                rASN = NetInfo()
                ASN = rASN.GetASN(IPaddress).strip('\"')
            else:
                pass
        # can't resolv
        except:
            IPaddress = ""
            ASN = ""
        # HTTP connection
        try:
            proxies = {'http': PROXY, 'https': PROXY}
            UA = UAG.ChooseUA(UAFILE)
            user_agent = {'User-agent': UA}
            try:
                r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                # Follow redirect and add new URI to database
                if (len(r.history) > 1) and ("301" in str(r.history[-1])) and (siteURL != r.url) \
                        and (siteURL.split('/')[:-1] != r.url.split('/')[:-2]) and (siteURL + '/' != r.url):
                    lastHTTPcode = str(r.status_code)
                    SQL.SQLiteInsertPK(TABLEname, r.url, siteDomain, IPaddress, source_url, now, lastHTTPcode, ASN)
                else:
                    pass
                lastHTTPcode = str(r.status_code)
            except ValueError:
                # No user-agent configured
                r = requests.get(siteURL, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                lastHTTPcode = str(r.status_code)
            except requests.exceptions.Timeout:
                lastHTTPcode = "timeout"
            except requests.exceptions.ConnectionError:
                lastHTTPcode = "aborted"
            except:
                lastHTTPcode = "---"
                err = sys.exc_info()
                LOG.error("HTTP error: " + str(err))
                pass
        except:
            # Unknown status code
            err = sys.exc_info()
            LOG.error("Connection error: " + str(err))
            pass
        # Add data into database
        LOG.info(siteURL)
        SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode, ASN)
    else:
        LOG.debug("Entry still known: " + siteURL)
        pass

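# --- Illustrative sketch (hypothetical helper, restating the dense condition above): a permanently
# --- redirected location is stored as an extra row only when it points somewhere genuinely
# --- different from the original URL.
def is_new_redirect(original_url, response):
    """True when the request was 301-redirected to a location that is not a trivial variant of original_url."""
    return (len(response.history) > 1
            and "301" in str(response.history[-1])
            and original_url != response.url
            and original_url.split('/')[:-1] != response.url.split('/')[:-2]
            and original_url + '/' != response.url)
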
def UrlscanExtractor(LOG, SQL, TABLEname, PROXY, UAFILE):
    UAG = UAgent()
    ## Search in Urlscan results
    try:
        for item in HTMLText['results']:
            # remove URL containing UID-style strings
            siteURL = re.split("(?:[0-9a-fA-F]:?){32}", item['page']['url'])[0]
            dn = dirname(siteURL)
            ## Test if entry still exist in DB
            if SQL.SQLiteVerifyEntry(TABLEname, dn) == 0:
                source_url = item['result'].replace("/api/v1", "")
                siteDomain = urlparse(item['page']['url']).netloc
                try:
                    IPaddress = socket.gethostbyname(siteDomain)
                # can't resolv
                except:
                    IPaddress = ""
                now = str(TimestampNow().Timestamp())
                # HTTP connection
                try:
                    proxies = {'http': PROXY, 'https': PROXY}
                    UA = UAG.ChooseUA(UAFILE)
                    user_agent = {'User-agent': UA}
                    try:
                        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                        lastHTTPcode = str(r.status_code)
                    except ValueError:
                        # No user-agent configured
                        r = requests.get(siteURL, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                        lastHTTPcode = str(r.status_code)
                    except requests.exceptions.Timeout:
                        lastHTTPcode = "timeout"
                    except requests.exceptions.ConnectionError:
                        lastHTTPcode = "aborted"
                    except:
                        lastHTTPcode = "---"
                        pass
                except:
                    # Unknown status code
                    err = sys.exc_info()
                    LOG.error("Connection error: " + str(err))
                    pass
                LOG.info(siteURL + " " + siteDomain + " " + IPaddress + " " + source_url + " " + now + " " + lastHTTPcode)
                SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode)
            else:
                LOG.debug("Entry still known: " + siteURL)
                pass
    except:
        err = sys.exc_info()
        LOG.error("HTML parser Error! " + str(err))

def SiteURLSQL(SearchString, line, LOG, SQL, TABLEname, PROXY, UAFILE, UAG):
    # remove URL containing UID-style strings
    siteURL = quote(re.split("(?:[0-9a-fA-F]:?){32}", line[0])[0], ':/')
    if siteURL.startswith('https:'):
        siteDomain = siteURL.split('/')[2]
    else:
        siteDomain = siteURL.split('/')[0]
        siteURL = "http://" + siteURL
    dn = dirname(siteURL)
    # Test if entry still exist in DB
    if SQL.SQLiteVerifyEntry(TABLEname, dn) == 0:
        # Proceed to information retrieval
        now = str(TimestampNow().Timestamp())
        source_url = "https://urlquery.net/" + line[1]
        try:
            IPaddress = socket.gethostbyname(siteDomain)
            if IPaddress:
                rASN = NetInfo()
                ASN = rASN.GetASN(IPaddress).strip('\"')
            else:
                pass
        # can't resolv
        except:
            IPaddress = ""
            ASN = ""
        # HTTP connection
        try:
            proxies = {'http': PROXY, 'https': PROXY}
            UA = UAG.ChooseUA(UAFILE)
            user_agent = {'User-agent': UA}
            try:
                r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True)
                # Follow redirect and add new URI to database
                if (len(r.history) > 1) and ("301" in str(r.history[-1])) and (siteURL != r.url) \
                        and (siteURL.split('/')[:-1] != r.url.split('/')[:-2]) and (siteURL + '/' != r.url):
                    lastHTTPcode = str(r.status_code)
                    SQL.SQLiteInsertPK(TABLEname, r.url, siteDomain, IPaddress, source_url, now, lastHTTPcode, ASN)
                else:
                    pass
                lastHTTPcode = str(r.status_code)
            except ValueError:
                # No user-agent configured
                r = requests.get(siteURL, proxies=proxies, allow_redirects=True)
                lastHTTPcode = str(r.status_code)
            except requests.exceptions.Timeout:
                lastHTTPcode = "timeout"
            except requests.exceptions.ConnectionError:
                lastHTTPcode = "aborted"
            except:
                lastHTTPcode = "---"
                pass
        except Exception as e:
            # Unknown status code
            LOG.error("Connection error: {}".format(e))
            pass
        # Add data into database
        LOG.info(siteURL + " " + siteDomain + " " + IPaddress + " " + source_url + " " + now + " " + lastHTTPcode)
        SQL.SQLiteInsertPK(TABLEname, siteURL, siteDomain, IPaddress, source_url, now, lastHTTPcode, ASN)
    else:
        LOG.debug("Entry still known: " + siteURL)
        pass

def TryPKDownload(siteURL, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE):
    proxies = {'http': PROXY, 'https': PROXY}
    UAG = UAgent()
    UA = UAG.ChooseUA(UAFILE)
    user_agent = {'User-agent': UA}
    now = str(TimestampNow().Timestamp())
    SHA = SHA256()
    try:
        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
        LOG.info("[" + str(r.status_code) + "] " + r.url)
        if str(r.status_code) == "200":
            SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
            if SQL.SQLiteInvestigVerifyEntry(InvTABLEname, siteDomain, IPaddress) == 0:
                SQL.SQLiteInvestigInsert(InvTABLEname, siteURL, siteDomain, IPaddress, now, '200')
            else:
                pass
            ziplist = []
            path = siteURL
            pathl = '/'.join(path.split("/")[:3])
            pathlist = path.split("/")[3:]
            # Make list
            current = 0
            newpath = ""
            while current < len(pathlist):
                if current == 0:
                    newpath = pathlist[current]
                else:
                    newpath = newpath + "/" + pathlist[current]
                current = current + 1
                pathD = pathl + "/" + newpath
                ziplist.append(pathD)
            try:
                # Try to find and download phishing kit archive (.zip)
                if len(ziplist) > 1:
                    for zip in ziplist:
                        if not any(c in os.path.basename(os.path.normpath(zip)) for c in ('=', '%', '?', '-', '@', '.')):
                            if '/' not in zip[-1:] and '.' not in zip[-3:]:
                                try:
                                    LOG.info("trying " + zip + ".zip")
                                    rz = requests.get(zip + ".zip", headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12))
                                    if str(rz.status_code) != "404":
                                        lastHTTPcode = str(rz.status_code)
                                        zzip = zip.replace('/', '_').replace(':', '')
                                        if "application/zip" in rz.headers['content-type'] or "application/octet-stream" in rz.headers['content-type']:
                                            savefile = DLDir + zzip + '.zip'
                                            # Still collected file
                                            if os.path.exists(savefile):
                                                LOG.info("[DL ] Found still collected archive: " + savefile)
                                            # New file to download
                                            else:
                                                LOG.info("[DL ] Found archive, downloaded it as: " + savefile)
                                                with open(savefile, "wb") as code:
                                                    code.write(rz.content)
                                                    pass
                                                ZipFileName = str(zzip + '.zip')
                                                ZipFileHash = SHA.hashFile(savefile)
                                                SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                                        else:
                                            pass
                                    # 404
                                    else:
                                        pass
                                except:
                                    err = sys.exc_info()
                                    LOG.error("Error: " + str(err))
                                    print("Error: " + str(err))
                                    pass
                            else:
                                pass
                        else:
                            pass
                    else:
                        pass
                # Ziplist empty
                else:
                    pass
            except:
                pass
        elif str(r.status_code) == "404":
            lastHTTPcode = str(r.status_code)
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, lastHTTPcode)
            SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        else:
            lastHTTPcode = str(r.status_code)
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, lastHTTPcode)
    except requests.exceptions.ConnectionError:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Err')
        LOG.debug("Connection error: " + siteURL)
    except requests.exceptions.ConnectTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'To')
        LOG.debug("Connection Timeout: " + siteURL)
    except requests.exceptions.ReadTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'RTo')
        LOG.debug("Connection Read Timeout: " + siteURL)
    except requests.exceptions.MissingSchema:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Err')
        LOG.debug("Malformed URL, skipping: " + siteURL + "\n")
    except:
        err = sys.exc_info()
        LOG.error("Error: " + str(err))

def TryPKDownload(siteURL, siteDomain, IPaddress, TABLEname, InvTABLEname, DLDir, SQL, PROXY, LOG, UAFILE):
    global ziplist
    proxies = {'http': PROXY, 'https': PROXY}
    UAG = UAgent()
    UA = UAG.ChooseUA(UAFILE)
    user_agent = {'User-agent': UA}
    now = str(TimestampNow().Timestamp())
    SHA = SHA256()
    PsiteURL = None
    ResiteURL = siteURL
    PsiteURL = urlparse(ResiteURL)
    if len(PsiteURL.path.split("/")[1:]) >= 2:
        siteURL = ResiteURL.rsplit('/', 1)[0]
    else:
        siteURL = ResiteURL
    # Let's try to find a phishing kit source archive
    try:
        r = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
        if (str(r.status_code) != "404"):
            LOG.info("[" + str(r.status_code) + "] " + r.url)
            SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
            if SQL.SQLiteInvestigVerifyEntry(InvTABLEname, siteDomain, IPaddress) == 0:
                SQL.SQLiteInvestigInsert(InvTABLEname, siteURL, siteDomain, IPaddress, now, str(r.status_code))
            else:
                pass
            ziplist = []
            path = siteURL
            pathl = '/'.join(path.split("/")[:3])
            pathlist = path.split("/")[3:]
            # Make list
            current = 0
            newpath = ""
            while current < len(pathlist):
                if current == 0:
                    newpath = pathlist[current]
                else:
                    newpath = newpath + "/" + pathlist[current]
                current = current + 1
                pathD = pathl + "/" + newpath
                ziplist.append(pathD)
            # Get page title
            try:
                if len(ziplist) >= 1:
                    rhtml = requests.get(siteURL, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                    thtml = rhtml.text
                    tit = re.search('<title>(.*?)</title>', thtml, re.IGNORECASE)
                    if tit is not None:
                        PageTitle = tit.group(1)
                        LOG.info(PageTitle)
                        SQL.SQLiteInvestigUpdateTitle(InvTABLEname, siteURL, PageTitle)
                    else:
                        pass
            except AttributeError:
                pass
            except requests.exceptions.ReadTimeout:
                pass
            except:
                err = sys.exc_info()
                LOG.error("Get PageTitle Error: " + siteURL + str(err))
            # Set redis to record URL Done list
            redis_set = redis.Redis(db=1)
            redis_set.sadd('StalkPhishURLs', 0)
            # Try to retrieve all possible paths for one URL and find whether there are .zip files
            try:
                if len(ziplist) >= 1:
                    for url in [pathl] + ziplist:
                        if redis_set.sismember('StalkPhishURLs', url):
                            continue
                        LOG.info("Retrieving Path " + url)
                        urllist = RetriveIndexPath(url, proxies, user_agent, [])
                        for urlzip in urllist:
                            if redis_set.sismember('StalkPhishURLs', urlzip):
                                continue
                            LOG.info("trying " + urlzip)
                            rz = requests.get(urlzip, headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                            if str(rz.status_code) != "404":
                                lastHTTPcode = str(rz.status_code)
                                zzip = urlzip.replace('/', '_').replace(':', '')
                                try:
                                    if "application/zip" in rz.headers['content-type'] or "application/octet-stream" in rz.headers['content-type']:
                                        savefile = DLDir + zzip
                                        # Still collected file
                                        if os.path.exists(savefile):
                                            LOG.info("[DL ] Found still collected archive: " + savefile)
                                        # New file to download
                                        else:
                                            LOG.info("[DL ] Found archive, downloaded it as: " + savefile)
                                            with open(savefile, "wb") as code:
                                                code.write(rz.content)
                                                pass
                                            ZipFileName = str(zzip)
                                            ZipFileHash = SHA.hashFile(savefile)
                                            SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                                    else:
                                        pass
                                except requests.exceptions.ContentDecodingError:
                                    LOG.error("[DL ] content-type error")
                                except:
                                    pass
                            # 404
                            else:
                                pass
                        redis_set.sadd('StalkPhishURLs', *urllist, url)
            except:
                err = sys.exc_info()
                LOG.error("DL Error: " + str(err))
            # Add popular Kit names into ziplist
            try:
                import pandas as pd
                perct = 0.1
                topNames = pd.read_csv('/home/chaoxu18/StalkPhish/stalkphish/ExtracKitNameCount.csv')
                topNamesNum = round(perct * len(topNames))
                if topNamesNum > 0:
                    topNames = topNames['kitname'].values[:topNamesNum]
                    for topname in topNames:
                        ziplist.append(siteURL + '/' + topname + '.zip')
            except:
                err = sys.exc_info()
                LOG.error("Top Kit Name Error: " + str(err))
                pass
            try:
                # Try to find and download phishing kit archive (.zip)
                if len(ziplist) >= 1:
                    for zip in ziplist:
                        if redis_set.sismember('StalkPhishURLs', zip + ".zip"):
                            continue
                        else:
                            redis_set.sadd('StalkPhishURLs', zip + ".zip")
                        if not any(c in os.path.basename(os.path.normpath(zip)) for c in ('=', '%', '?', '-', '@')):
                            try:
                                LOG.info("trying " + zip + ".zip")
                                rz = requests.get(zip + ".zip", headers=user_agent, proxies=proxies, allow_redirects=True, timeout=(5, 12), verify=False)
                                if str(rz.status_code) != "404":
                                    lastHTTPcode = str(rz.status_code)
                                    zzip = zip.replace('/', '_').replace(':', '')
                                    try:
                                        if "application/zip" in rz.headers['content-type'] or "application/octet-stream" in rz.headers['content-type']:
                                            savefile = DLDir + zzip + '.zip'
                                            # Still collected file
                                            if os.path.exists(savefile):
                                                LOG.info("[DL ] Found still collected archive: " + savefile)
                                            # New file to download
                                            else:
                                                LOG.info("[DL ] Found archive, downloaded it as: " + savefile)
                                                with open(savefile, "wb") as code:
                                                    code.write(rz.content)
                                                    pass
                                                ZipFileName = str(zzip + '.zip')
                                                ZipFileHash = SHA.hashFile(savefile)
                                                SQL.SQLiteInvestigUpdatePK(InvTABLEname, siteURL, ZipFileName, ZipFileHash, now, lastHTTPcode)
                                        else:
                                            pass
                                    except requests.exceptions.ContentDecodingError:
                                        LOG.error("[DL ] content-type error")
                                    except:
                                        pass
                                # 404
                                else:
                                    pass
                            except requests.exceptions.ReadTimeout:
                                LOG.debug("Connection Timeout: " + siteURL)
                            except requests.exceptions.ConnectTimeout:
                                LOG.debug("Connection Timeout")
                            except:
                                err = sys.exc_info()
                                LOG.error("Error: " + str(err))
                                print("Error: " + str(err))
                                pass
                        # else:
                        #     pass
                        else:
                            pass
                    else:
                        pass
                # Ziplist empty
                else:
                    pass
            except:
                err = sys.exc_info()
                LOG.error("DL Error: " + str(err))
        else:
            LOG.debug("[" + str(r.status_code) + "] " + r.url)
            lastHTTPcode = str(r.status_code)
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, lastHTTPcode)
            SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
    except requests.exceptions.ConnectionError:
        err = sys.exc_info()
        if '0x05: Connection refused' in str(err):
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. refused')
        if '0x04: Host unreachable' in str(err):
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Unreachable')
        else:
            SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. error')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection error: " + siteURL)
    except requests.exceptions.ConnectTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. timeout')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection Timeout: " + siteURL)
    except requests.exceptions.ReadTimeout:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Conn. readtimeout')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Connection Read Timeout: " + siteURL)
    except requests.exceptions.MissingSchema:
        SQL.SQLiteInvestigUpdateCode(InvTABLEname, siteURL, now, 'Malformed URL')
        SQL.SQLiteInsertStillTryDownload(TABLEname, siteURL)
        LOG.debug("Malformed URL, skipping: " + siteURL + "\n")
    except:
        err = sys.exc_info()
        LOG.error("Error: " + str(err))

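# --- Illustrative sketch (assumption: follows the CSV layout used above, with a 'kitname' column;
# --- the helper name and parameters are hypothetical): build extra "<siteURL>/<name>.zip"
# --- candidates from the most frequent phishing-kit names.
def top_kit_candidates(site_url, csv_path, fraction=0.1):
    """Return candidate archive URLs for the top fraction of kit names listed in csv_path."""
    import pandas as pd
    names = pd.read_csv(csv_path)
    top_n = round(fraction * len(names))
    return [site_url + '/' + str(name) + '.zip' for name in names['kitname'].values[:top_n]]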