def test_unicode_space_terminates_urls(self):
    """Non-breaking spaces (U+00A0) must act as URL terminators."""
    cases = [
        ('as\xa0example[.]com/Es3tC0deR3name.exe):',
         ['example[.]com/Es3tC0deR3name.exe']),
        ('as\xa0example[.]com\xa0words):',
         ['example[.]com']),
        ('as\xa0http://example.com/test\xa0words):',
         ['http://example.com/test']),
    ]
    for text, expected in cases:
        self.assertEqual(list(iocextract.extract_urls(text)), expected)
def test_refang_url(self):
    """Every supported defanged variant must refang to the canonical URL."""
    defanged_variants = [
        'http://example.com/test',
        'http:// example .com /test',
        'http://example[.]com/test',
        'http://example[.]com[/]test',
        'http://example(.)com(/)test',
        'http://example[dot]com/test',
        'hxxp://example.com/test',
        'example [.] com/test',
        'example(.)com/test',
        'hxxp://example[.com/test',
        'hxxp://example.]com/test',
        'hxxp://exampledot]com/test',
        'hxxp://example[dotcom/test',
        'hxxp://example.com[/test',
        'http__example.com/test',
    ]
    expected = 'http://example.com/test'
    for variant in defanged_variants:
        extracted = list(iocextract.extract_urls(variant, refang=True))
        self.assertEqual(extracted[0], expected)
        self.assertEqual(iocextract.refang_url(variant), expected)

    # 'ftx' is treated as a defanged 'ftp' scheme.
    self.assertEqual(iocextract.refang_url('ftx://example.com/test'),
                     'ftp://example.com/test')

    # IPv6 works as expected: bracketed netloc passes through unchanged.
    ipv6_url = 'http://[2001:db8:85a3:0:0:8a2e:370:7334]:80/test'
    self.assertEqual(iocextract.refang_url(ipv6_url), ipv6_url)
    self.assertEqual(
        list(iocextract.extract_urls(ipv6_url, refang=True))[0], ipv6_url)
def test_url_generic_regex_tight_edge_cases(self):
    """Edge cases for characters allowed immediately after the scheme."""
    cases = [
        ('https://+test+/', 0),
        ('https://[test]/', 1),
        ('https:// test /', 1),
    ]
    for text, expected_count in cases:
        matches = list(iocextract.extract_urls(text))
        self.assertEqual(len(matches), expected_count)
def test_urlencoded_url_extraction(self):
    """Percent-encoded URLs are extracted, and decoded when refang=True.

    Fix: ``assertEquals`` is a deprecated alias of ``assertEqual`` and was
    removed in Python 3.12; use the canonical name.
    """
    self.assertEqual(
        list(iocextract.extract_urls(
            'rget="http%3A%2F%2Fexample%2Ecom%2Fwhite%2Ehta"/>',
            refang=True))[0],
        'http://example.com/white.hta')
    self.assertEqual(
        list(iocextract.extract_urls('http%3A%2F%2Fexample%2Ecom',
                                     refang=True))[0],
        'http://example.com')
    # Without refang, the raw percent-encoded form is returned untouched.
    self.assertEqual(
        list(iocextract.extract_urls('http%3A%2F%2Fexample%2Ecom'))[0],
        'http%3A%2F%2Fexample%2Ecom')
    self.assertEqual(
        list(iocextract.extract_urls('http%3A%2F%2Fexa-mple%2Ecom',
                                     refang=True))[0],
        'http://exa-mple.com')
def test_url_strip(self):
    """strip=True removes trailing garbage after the URL path."""
    cases = [
        ('http://schemas.openxmlformats.org/drawingml/2006/main"><a:graphicData',
         'http://schemas.openxmlformats.org/drawingml/2006/main'),
        ("http://127.0.0.1:%u/')%%IMPORT%%Command",
         "http://127.0.0.1:%u/"),
    ]
    for text, expected in cases:
        stripped = list(iocextract.extract_urls(text, strip=True))
        self.assertEqual(stripped[0], expected)
def test_backslash_url_extraction(self):
    """Backslash-escaped dots are refanged to plain dots.

    Fix: patterns containing ``\\.`` are now raw strings. In a normal
    string literal ``'\\.'`` is an unrecognized escape sequence — the
    backslash is preserved, but modern Python emits a SyntaxWarning
    (an error in future versions). Behavior is unchanged.
    """
    # (input text, index of the expected match, expected refanged URL)
    cases = [
        (r'example\.com', 0, 'http://example.com'),
        (r'test\.example\.com', 0, 'http://test.example.com'),
        (r'test \. example \. com', 0, 'http://test.example.com'),
        (r'test\.example \. com', 0, 'http://test.example.com'),
        # With an explicit scheme the refanged form is the second match.
        (r'http://test \. example \. com', 1, 'http://test.example.com'),
        (r'test.example\.com', 0, 'http://test.example.com'),
        (r'test\.example.com', 0, 'http://test.example.com'),
        (r'a.b.c.test\.example.com', 0, 'http://a.b.c.test.example.com'),
        (r'a\.b.c.test\.example.com', 0, 'http://a.b.c.test.example.com'),
    ]
    for text, index, expected in cases:
        self.assertEqual(
            list(iocextract.extract_urls(text, refang=True))[index],
            expected)
def test_b64_url_extraction_just_url(self):
    """Base64-encoded URLs are detected, with or without wrapper text."""
    encoded_urls = [
        base64.b64encode(b'http://example.com').decode('ascii'),
        base64.b64encode(b'http://example.com/some/url').decode('ascii'),
        base64.b64encode(b'http://example.com/some/url').decode('ascii'),
        base64.b64encode(b'FtP://example.com/some/url').decode('ascii'),
    ]
    # Identity first, then each wrapper helper from this module.
    wrappers = [lambda s: s, _wrap_spaces, _wrap_tabs,
                _wrap_newlines, _wrap_nonwords]
    for encoded in encoded_urls:
        for wrap in wrappers:
            self.assertEqual(
                list(iocextract.extract_urls(wrap(encoded)))[0], encoded)
def test_defang_http(self):
    """Supported scheme-defang styles must refang back to plain http."""
    content = "http://example.com"
    # Paren/bracket/brace prefixes plus the classic 'hxxp' substitution.
    defang_styles = ["(http", ")http",
                     "[http", "]http",
                     "{http", "}http",
                     "hxxp"]
    for defang_style in defang_styles:
        defanged_content = content.replace("http", defang_style)
        result = list(iocextract.extract_urls(defanged_content, refang=True))
        self.assertEqual(len(result), 1,
                         "failed defang on: " + defang_style)
        self.assertEqual(result[0], content, "incorrectly refanged")
def extract_text_obserables(username, text):
    """Extract IP and URL observables from tweet text for *username*.

    NOTE(review): the function name is misspelled ('obserables') but is
    kept unchanged so existing callers are not broken.
    """
    observables = []
    user_id = '@{0}'.format(username)
    user_url = 'https://twitter.com/{0}'.format(username)
    try:
        for ip in iocextract.extract_ips(text, refang=True):
            if validate_ip(ip):
                observables.append(
                    TwitterObservable(user_id, user_url, 'ip', ip))
        for url in iocextract.extract_urls(text, refang=True):
            # Paste-site links are fetched and mined for nested observables.
            if 'ghostbin.com' in url or 'pastebin.com' in url:
                paste_observables = extract_paste_observables(username, url)
                if len(paste_observables) > 0:
                    observables.extend(paste_observables)
            elif validate_url(url):
                observables.append(
                    TwitterObservable(user_id, user_url, 'url',
                                      clean_url(url)))
    except Exception as e:
        # Best-effort: log and return whatever was collected so far.
        logger.warning('Exception parsing text: {0}'.format(e))
    return observables
def test_defang_unsupported_dot_slash_slash(self):
    """Unsupported '://' defang styles must NOT refang to a single URL."""
    content = "https://www.example.com"
    unsupported_styles = [
        "(://(", "(://)", ")://(", ")://)", "(://", ")://",
        "[://[", "[://]", "]://[", "]://]", "[://", "]://",
        "{://{", "{://}", "}://{", "}://}", "{://", "}://",
    ]
    for defang_style in unsupported_styles:
        defanged_content = content.replace("://", defang_style)
        result = list(iocextract.extract_urls(defanged_content, refang=True))
        self.assertNotEqual(
            len(result), 1,
            "should fail on defanging style : " + defang_style)
def _sniff_text(text):
    """Check every extractor/regex for findings in *text*.

    Returns a dict mapping category name ("urls", "ips", ...) to the items
    found; categories with no findings are omitted.

    Improvements: single-key ``results.update({...})`` calls replaced with
    plain item assignment, and the five copy-pasted iocextract calls
    expressed as one data-driven loop (insertion order preserved).
    """
    results = {}
    if args.ioc:
        print("")
        extractors = {
            "urls": iocextract.extract_urls,
            "ips": iocextract.extract_ips,
            "emails": iocextract.extract_emails,
            "hashes": iocextract.extract_hashes,
            "rules": iocextract.extract_yara_rules,
        }
        for key, extract in extractors.items():
            found = list(extract(text))
            if found:
                results[key] = found
    else:
        for key, pattern in regexList.items():
            findings = set(re.findall(pattern, text))
            if findings:
                results[key] = findings
    return results
def remove_html_tags(self, list_dots):
    """Strip HTML tags with BeautifulSoup, then re-extract URLs.

    :param list_dots: candidate short URLs that passed the dot check
    :return: list of URLs extracted from the tag-stripped text

    Fix: the original copied ``list_dots`` into a fresh list element by
    element before joining; the input is now joined directly.
    """
    text = "\n".join(list_dots)
    soup = BeautifulSoup(text, features="html.parser")
    # Remove <a> tags entirely before re-extraction.
    valid_tags = ['a']
    for tag in soup.find_all(True):
        if tag.name in valid_tags:
            tag.extract()
    cleaned = "\n".join(soup)
    # Run iocextract again over the cleaned text.
    return list(iocextract.extract_urls(cleaned))
def test_b64_url_extraction_with_wrappers(self):
    """Base64 URLs are found regardless of surrounding wrapper bytes."""
    wrapped_payloads = [
        b' http://example.com/test ',
        b'words in front http://example.com/test ',
        b' http://example.com/test words after',
        b' http://example.com/test\x99\x80 ',
        b'sadasdasdasdhttp://example.com/test ',
        b'adasdasdasdhttp://example.com/test ',
        b'dasdasdasdhttp://example.com/test ',
        b'asdasdasdhttp://example.com/test ',
        b'sdasdasdhttp://example.com/test ',
        b'reallylongreallylongreallylongreallylongreallylongreallylong'
        b'reallylonghttp://example.com/test reallylong',
    ]
    for payload in wrapped_payloads:
        encoded = base64.b64encode(payload).decode('ascii')
        self.assertEqual(
            list(iocextract.extract_urls(encoded, refang=True))[0],
            'http://example.com/test')
def extractIOC(path):
    """Run strings64.exe over *path* and extract URL/IP/email IOCs.

    :param path: path of the binary to scan
    :return: tuple (urls, ipv4s, ipv6s, emails)

    Fix: both bare ``except:`` clauses narrowed to ``except Exception`` so
    KeyboardInterrupt/SystemExit are no longer swallowed. Behavior is
    otherwise unchanged.
    """
    extractor = URLExtract()
    try:
        out = execute_command('src\\strings64.exe ' + path)
    except Exception:
        # Retry once — the first invocation occasionally fails.
        out = execute_command('src\\strings64.exe ' + path)
    out = out.decode("utf-8").split('\n')

    extract_url = []
    ipv4 = []
    ipv6 = []
    emails = []

    for url in iocextract.extract_urls(str(out), refang=True, strip=True):
        candidates = extractor.find_urls(url)
        try:
            # candidates may be empty — IndexError is handled below.
            cleaned = str(candidates[0]).replace("\\r", "")
            extract_url.append(cleaned)
        except Exception:
            pass
    # De-duplicate; order is not preserved (matches original behavior).
    extract_url = list(set(extract_url))

    for ip4 in iocextract.extract_ipv4s(str(out), refang=True):
        ipv4.append(ip4)
    for ip6 in iocextract.extract_ipv6s(str(out)):
        ipv6.append(ip6)
    for email in iocextract.extract_emails(str(out), refang=True):
        emails.append(str(email).replace("\\r", ""))

    return (extract_url, ipv4, ipv6, emails)
def test_bracket_url_dots_in_netloc(self):
    """Bracket-defanged dots inside the netloc are matched verbatim."""
    defanged_inputs = [
        'hXXps://192.168.149[.]100/api/info',
        'hXXps://subdomain.example[.]com/some/path',
        'h__ps__subdomain.example[.]com/some/path',
        'http://subdomain.example.com/test[.]doc',
    ]
    for defanged in defanged_inputs:
        for extracted in iocextract.extract_urls(defanged):
            self.assertEqual(extracted, defanged)

    # We terminate on any character not in the allowed set of domain +
    # scheme characters. That means these will show up from the generic
    # regex, but not the bracket regex. Note the space termination in the
    # second result:
    self.assertEqual(
        list(iocextract.extract_urls('hXXps:// 192.168.149[.]100/api/info')),
        ['hXXps:// 192.168.149[.]100/api/info',
         '192.168.149[.]100/api/info'])
def test_path_refang(self):
    """Bracketed dots in the path (and netloc) refang correctly."""
    expected = 'http://example.com/test.htm'
    for defanged in ['http://example.com/test[.]htm',
                     'http://example[.]com/test[.]htm']:
        self.assertEqual(
            list(iocextract.extract_urls(defanged, refang=True))[0],
            expected)
        self.assertEqual(iocextract.refang_url(defanged), expected)
def extract_all_url_simple(self, content_raw):
    """Extract every URL from the text using only iocextract.

    :param content_raw: text to scan for URLs
    :return: iterable of the URLs found in the text

    Fix: the first parameter was misspelled ``slef``; renamed to the
    conventional ``self`` (positional/instance callers are unaffected).
    """
    return iocextract.extract_urls(content_raw)
def test_b64_url_extraction_whitespace(self):
    """Whitespace embedded in base64 content does not block extraction."""
    whitespace_variants = [
        'aHR0cDovL2V4 YW1wbGUuY29t',
        'aHR0cD\r\n ovL2V4Y\r\n W1wbGUuY29tIA==',
        'aHR0c\t\t\t\t\t\tDovL 2V4YW1wbGUuY29tI CA=',
        'a H R 0 c D o v L 2 V 4 Y W 1 w b G U u Y 2 9 t I C A g ',
    ]
    for variant in whitespace_variants:
        extracted = list(iocextract.extract_urls(variant, refang=True))
        self.assertEqual(extracted[0], 'http://example.com')
def _utility_ioc_extractor_function(self, event, *args, **kwargs):
    """Strip HTML from an incident text field and extract IOCs by type.

    Yields a FunctionResult dict of de-duplicated IOC lists, or a
    FunctionError on any failure.
    """
    def _unique(values):
        # OrderedDict.fromkeys() preserves order and removes duplicates.
        return list(OrderedDict.fromkeys(values))

    results = {}
    results["was_successful"] = False
    try:
        # Function parameters.
        incident_id = kwargs.get("incident_id")  # number
        text_string = kwargs.get("text_string")  # text

        log = logging.getLogger(__name__)  # Establish logging

        # Strip HTML and normalize text.
        text_string = unicodedata.normalize(
            "NFKD",
            BeautifulSoup(text_string, "html.parser").get_text(' '))

        # Parse IOCs by type from text_string.
        results["ipv4s"] = _unique(
            iocextract.extract_ipv4s(text_string, refang=True))
        results["ipv6s"] = _unique(iocextract.extract_ipv6s(text_string))
        # URLs and the domains derived from them.
        results["urls"] = _unique(
            iocextract.extract_urls(text_string, refang=True))
        results["domains"] = _unique(
            urlparse(url).netloc for url in results["urls"])
        results["email_addresses"] = _unique(
            iocextract.extract_emails(text_string, refang=True))
        results["email_domains"] = _unique(
            email.split('@')[1] for email in results["email_addresses"])
        results["md5_hashes"] = _unique(
            iocextract.extract_md5_hashes(text_string))
        results["sha256_hashes"] = _unique(
            iocextract.extract_sha256_hashes(text_string))
        results["was_successful"] = True

        # Produce a FunctionResult with the results.
        yield FunctionResult(results)
    except Exception:
        yield FunctionError()
def deobfuscate(self, url):
    """Refang *url* via iocextract; fall back to manual deobfuscation.

    Returns None for empty input.
    """
    if not url:
        return None
    # Parse the URL via iocextract.
    url_set = set(iocextract.extract_urls(url, refang=True))
    if not url_set:
        # iocextract found nothing — try the manual deobfuscator.
        return self._deobfuscate_m(url)
    _url = url_set.pop()
    logger.debug("Parsing URL: %s to: %s" % (url, _url))
    return _url
def extract_URLs(content):
    """Extract URLs and IPs from *content* via URLExtract and iocextract.

    :param content: text to scan (returns None when content is None)
    :return: combined list of URLs and format-validated IPs

    Fixes:
    - The IP format check used ``or``: an IPv4 (3 dots, 0 colons) and an
      IPv6 (7 colons, 0 dots) each fail one half, so EVERY address was
      rejected as invalid. ``and`` implements the stated intent (valid if
      IPv4-shaped OR IPv6-shaped).
    - The trailing-comma cleanup tracked ``index`` by hand; rewritten with
      ``enumerate`` (the original's increment placement was ambiguous and
      likely skewed after the first match).
    """
    if content is None:
        return None
    print("\n***** Extract URLs *****\n")

    ### Identify URLs in content ###
    extractor = URLExtract()
    extractor_urls = extractor.find_urls(content)
    iocextract_urls = list(iocextract.extract_urls(content, refang=True))
    iocextract_ips = list(iocextract.extract_ips(content, refang=True))

    iocextract_ips_valid = []
    for ip in iocextract_ips:
        # Basic format check:
        #   IPv4: xxx.xxx.xxx.xxx                                (3 dots)
        #   IPv6: xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx        (7 colons)
        if ip.count(".") != 3 and ip.count(":") != 7:
            print("Invalid IP address: " + str(ip))
        else:
            iocextract_ips_valid.append(ip)
    if iocextract_ips:
        print("iocextract.extract_ips method - format validated")
        print(iocextract_ips_valid)

    print("extractor.find method")
    print(extractor_urls)
    print("iocextract.extract_urls method")
    print(iocextract_urls)

    info_to_evaluate = extractor_urls + iocextract_urls + iocextract_ips_valid

    # Occassionally, the functions above return urls with trailing commas.
    # Remove these.
    for index, ioc in enumerate(info_to_evaluate):
        if ioc.endswith(','):
            info_to_evaluate[index] = ioc[:-1]
    print("Removed trailing commas")
    print(info_to_evaluate)
    print("Successfully extracted URLs")
    return info_to_evaluate
def main(inp, out):
    """Collect previously-unseen URLs from *inp* and write them to *out*."""
    for line in inp.readlines():
        for url in iocextract.extract_urls(line, refang=True):
            if url in common.URLs:
                print(url + ' Already in List')
            else:
                common.URLs.append(url)
                print(url + ', ')
    # Emit the accumulated list as a quoted, comma-separated block.
    out.write('\n#####URLS#####\n\n')
    for item in common.URLs:
        out.write('"' + item + '", \n')
def ioc_parse(line):
    """
    Use library that can handle defanged formats for IOCs (Indicators of Compromise)
    """
    # NOTE(review): get_ioc_param appears to return a list whose first two
    # elements are the match's [start, end) offsets in `formatted` — they
    # are used as slice bounds below. Confirm against its definition.
    # After each match the raw matched text is spliced back into
    # `formatted` at those offsets, so later passes see consistent offsets.
    params = []
    formatted = line
    # URLs: extracted with strip=True; the refanged form is appended to
    # the param entry.
    for url in iocextract.extract_urls(formatted, strip=True):
        refanged = iocextract.refang_url(url)
        param = get_ioc_param('url', url, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], url,
                                    formatted[param[1]:])
    # IPv4 addresses: refanged form appended.
    for ip in iocextract.extract_ipv4s(formatted):
        refanged = iocextract.refang_ipv4(ip)
        param = get_ioc_param('ip_address', ip, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], ip,
                                    formatted[param[1]:])
    # IPv6 addresses: no refang step is applied.
    for ip in iocextract.extract_ipv6s(formatted):
        param = get_ioc_param('ip_address', ip, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], ip,
                                    formatted[param[1]:])
    # Email addresses: refanged form appended.
    for email in iocextract.extract_emails(formatted):
        refanged = iocextract.refang_email(email)
        param = get_ioc_param('email', email, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], email,
                                    formatted[param[1]:])
    # Hashes: no refang step.
    for h in iocextract.extract_hashes(formatted):
        param = get_ioc_param('hash', h, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], h,
                                    formatted[param[1]:])
    # YARA rules: no refang step.
    for rule in iocextract.extract_yara_rules(formatted):
        param = get_ioc_param('yara_rule', rule, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], rule,
                                    formatted[param[1]:])
    return formatted, params
def test_b64_url_extraction_bad_pad(self):
    """Base64 URLs are extracted whether the padding is valid or not."""
    good_padding = [
        'aHR0cDovL2V4YW1wbGUuY29t',
        'aHR0cDovL2V4YW1wbGUuY29tIA==',
        'aHR0cDovL2V4YW1wbGUuY29tICA=',
        'aHR0cDovL2V4YW1wbGUuY29tICAg',
    ]
    bad_padding = [
        'aHR0cDovL2V4YW1wbGUuY29t=',
        'aHR0cDovL2V4YW1wbGUuY29tIA=',
        'aHR0cDovL2V4YW1wbGUuY29tICA',
        'aHR0cDovL2V4YW1wbGUuY29tI',
        'aHR0cDovL2V4YW1wbGUuY29tba',
    ]
    for encoded in good_padding + bad_padding:
        self.assertEqual(
            list(iocextract.extract_urls(encoded, refang=True))[0],
            'http://example.com')
def test_defang_unsupported_http(self):
    """Unsupported scheme-defang styles must NOT refang to a single URL."""
    content = "http://example.com"
    # NOTE(review): the bracket row contains "http(" — likely a copy-paste
    # of the paren row's entry where "http[" was intended; confirm intended
    # coverage with the library maintainers before changing the data.
    combinations = [
        ["(http(", "(http)", ")http(", ")http)", "http(", "http)"],
        ["[http[", "[http]", "]http[", "]http]", "http(", "http]"],
        ["{http{", "{http}", "}http{", "}http}", "http{", "http}"],
    ]
    for substitution_type in combinations:
        for defang_style in substitution_type:
            defanged_content = content.replace("http", defang_style)
            result = list(
                iocextract.extract_urls(defanged_content, refang=True))
            self.assertNotEqual(
                len(result), 1,
                "should fail on defanging style: " + defang_style)
def test_defang_unsupported_colon(self):
    """Unsupported ':' defang styles must NOT refang to a single URL."""
    content = "https://www.example.com"
    unsupported_styles = [
        "(:(", "(:)", "):(", "):)", "(:", ":(", "):", ":)",
        "[:[", "[:]", "]:[", "]:]", "[:", ":[", "]:", ":]",
        "{:{", "{:}", "}:{", "}:}", "{:", ":{", "}:", ":}",
    ]
    for defang_style in unsupported_styles:
        defanged_content = content.replace(":", defang_style)
        result = list(
            iocextract.extract_urls(defanged_content, refang=True))
        self.assertNotEqual(
            len(result), 1,
            "should fail on defanging style : " + defang_style)
def test_defang_dot_slash_slash(self):
    """Supported '://' defang styles must refang back to the original."""
    content = "https://example.com"
    # Per the original note: refanging is incorrect on "://}" and "://{",
    # so those styles are deliberately excluded here.
    supported_styles = ["://)", "://(", "://]", "://[", "__", ":\\\\"]
    for defang_style in supported_styles:
        defanged_content = content.replace("://", defang_style)
        result = list(
            iocextract.extract_urls(defanged_content, refang=True))
        self.assertEqual(len(result), 1,
                         "failed defang on: " + defang_style)
        self.assertEqual(result[0], content, "incorrectly refanged")
def CapeReporter(values):
    """Query the Cape sandbox for each input, classified by IOC type.

    :param values: iterable of user-supplied strings
    :return: list of {'Cape Sandbox': report} dicts

    Fix: five near-identical if/elif branches collapsed into one
    data-driven loop. The classification precedence of the original
    (url > ip > md5 > sha1 > sha256) is preserved; inputs matching no
    extractor are skipped, as before.
    """
    cape_val = []
    # (extractor, argType) pairs, in precedence order.
    classifiers = [
        (iocextract.extract_urls, 'url'),
        (iocextract.extract_ipv4s, 'ip'),
        (iocextract.extract_md5_hashes, 'md5'),
        (iocextract.extract_sha1_hashes, 'sha1'),
        (iocextract.extract_sha256_hashes, 'sha256'),
    ]
    for usrInput in values:
        for extract, argType in classifiers:
            matches = list(extract(usrInput))
            if matches:
                # Query on the first match, exactly as the original did.
                for data in allReport(matches[0], argType):
                    cape_val.append({'Cape Sandbox': data})
                break
    return cape_val
def create_group_pulse(input_text):
    """Extract IOCs from *input_text* and publish them as an OTX group pulse."""
    # Timestamped pulse title keeps each submission unique.
    unix_time = str(int(time.time()))
    pulse_title = 'SlackIOCs - ' + unix_time

    # NOTE(review): the API key is hard-coded empty — presumably filled in
    # by the operator before running; confirm.
    API_KEY = ''
    otx = OTXv2(API_KEY)
    group_id = 840

    # Build the indicator list, one extractor per OTX indicator type.
    indicators = []
    extractor_types = [
        (iocextract.extract_urls, 'URL'),
        (iocextract.extract_ips, 'IPv4'),
        (iocextract.extract_sha256_hashes, 'FileHash-SHA256'),
        (iocextract.extract_sha1_hashes, 'FileHash-SHA1'),
        (iocextract.extract_md5_hashes, 'FileHash-MD5'),
        (iocextract.extract_emails, 'EMAIL'),
    ]
    for extract, indicator_type in extractor_types:
        for value in extract(input_text):
            indicators.append({'indicator': value, 'type': indicator_type})

    print('Adding ' + str(indicators))
    response = otx.create_pulse(name=pulse_title,
                                public=True,
                                indicators=indicators,
                                tags=['covid19'],
                                references=[],
                                group_ids=[group_id],
                                tlp='White')
    print('Response: ' + str(response))
def artifacts(self, raw):
    """Build artifact records for every IOC found in *raw*."""
    text = str(raw)
    # (extractor, artifact type) pairs, in the original emission order.
    extractor_types = [
        (iocextract.extract_urls, 'url'),
        (iocextract.extract_ipv4s, 'ip'),
        (iocextract.extract_emails, 'mail'),
        (iocextract.extract_hashes, 'hash'),
    ]
    artifacts = []
    for extract, artifact_type in extractor_types:
        for value in extract(text):
            artifacts.append(self.build_artifact(artifact_type, str(value)))
    return artifacts