def test_email_extract(self):
    """Check extraction of plain email addresses and rejection of malformed ones.

    Every sample is checked bare and after being passed through the
    ``_wrap_spaces``, ``_wrap_tabs`` and ``_wrap_newlines`` helpers.
    Valid samples must come back unchanged as the first extracted item;
    invalid samples must produce no results at all.
    """
    content_list = [
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '[email protected]',
    ]
    # ``assertEqual`` is used throughout: the ``assertEquals`` alias is
    # deprecated and no longer exists in Python 3.12+.
    for content in content_list:
        self.assertEqual(
            list(iocextract.extract_emails(content))[0], content)
        self.assertEqual(
            list(iocextract.extract_emails(_wrap_spaces(content)))[0],
            content)
        self.assertEqual(
            list(iocextract.extract_emails(_wrap_tabs(content)))[0],
            content)
        self.assertEqual(
            list(iocextract.extract_emails(_wrap_newlines(content)))[0],
            content)

    invalid_list = [
        '@a.co',
        'myuser@',
        '@',
        # don't extract non-fqdn emails
        'a@a',
    ]
    for content in invalid_list:
        self.assertEqual(len(list(iocextract.extract_emails(content))), 0)
        self.assertEqual(
            len(list(iocextract.extract_emails(_wrap_spaces(content)))), 0)
        self.assertEqual(
            len(list(iocextract.extract_emails(_wrap_tabs(content)))), 0)
        self.assertEqual(
            len(list(iocextract.extract_emails(_wrap_newlines(content)))), 0)
def extractIOC(path):
    """Run ``strings64.exe`` on *path* and pull IOCs out of its output.

    Args:
        path: File path appended to the ``src\\strings64.exe`` command line.

    Returns:
        A tuple ``(urls, ipv4s, ipv6s, emails)`` of lists. URLs are
        de-duplicated through a set, so their order is not preserved; the
        other three lists keep every match in extraction order.
    """
    extractor = URLExtract()
    # NOTE(review): the command is built by string concatenation; confirm
    # that execute_command is safe for paths containing spaces or shell
    # metacharacters.
    command = 'src\\strings64.exe ' + path
    try:
        out = execute_command(command)
    except Exception:
        # Single best-effort retry; a second failure propagates to the
        # caller. ``Exception`` (not a bare except) so that Ctrl-C and
        # SystemExit are not swallowed into a retry.
        out = execute_command(command)

    # The extractors run over the *repr* of the list of lines, so carriage
    # returns appear as the two characters backslash + "r"; those are
    # stripped from the matches below.
    text = str(out.decode("utf-8").split('\n'))

    extract_url = []
    for url in iocextract.extract_urls(text, refang=True, strip=True):
        found = extractor.find_urls(url)
        if not found:
            # URLExtract did not confirm this candidate; skip it.
            continue
        extract_url.append(str(found[0]).replace("\\r", ""))
    extract_url = list(set(extract_url))

    ipv4 = list(iocextract.extract_ipv4s(text, refang=True))
    ipv6 = list(iocextract.extract_ipv6s(text))
    emails = [
        str(email).replace("\\r", "")
        for email in iocextract.extract_emails(text, refang=True)
    ]
    return (extract_url, ipv4, ipv6, emails)
def test_defang_unsupported_at(self):
    """Unsupported bracket styles around '@' must not refang to one email.

    Each style replaces the '@' in the sample address; extraction with
    ``refang=True`` must then return anything but exactly one result.
    """
    address = "*****@*****.**"
    paren_styles = ["(@(", ")@(", ")@)", "@(", ")@"]
    square_styles = ["[@[", "]@[", "]@]", "@[", "]@"]
    curly_styles = ["{@{", "}@{", "}@}", "@{", "}@"]

    for style in paren_styles + square_styles + curly_styles:
        defanged = address.replace("@", style)
        found = list(iocextract.extract_emails(defanged, refang=True))
        self.assertNotEqual(
            len(found), 1, "should fail on defanging style : " + style)
def test_email_refang(self):
    """Defanged email variants must refang to the same expected address.

    Each sample is checked twice: as the first result of
    ``extract_emails(..., refang=True)`` and as the direct output of
    ``refang_email``.
    """
    expected = '*****@*****.**'
    defanged_samples = (
        'myuser@example[.]com[.]tld',
        'myuser @example[.]com[.]tld',
        'myuser @ example.com.tld',
        'myuser@example(.)com[.tld',
        'myuser@example[.]com.tld',
        'myuser@example [.] com.tld',
        'myuser@example [.] com [.] tld',
        'myuser@example [.] com [.tld',
        'myuser@example [[[ . ])] com [.tld',
        'myuser[@]example [[[ . ])] com [.tld',
        'myuser [ @ ] example [[[ . ])] com [.tld',
        'myuser { @ ) example [[[ . ])] com [.tld',
        'myuser { @ ) example { . ])] com [.tld',
        'myuser { at ) example { . ])] com [.tld',
        'myuser { at ) example { doT ])] com [dot tld',
        'myuser At example DOT com DOT tld',
        'myuser[@]example[.com[.tld]',
    )
    for sample in defanged_samples:
        extracted = list(iocextract.extract_emails(sample, refang=True))
        self.assertEqual(extracted[0], expected)
        self.assertEqual(iocextract.refang_email(sample), expected)
def _sniff_text(text):
    """Collect findings from *text* and return them as a dictionary.

    When ``args.ioc`` is set, iocextract is used and the result may hold
    the keys "urls", "ips", "emails", "hashes" and "rules" (only those with
    at least one hit). Otherwise every pattern in ``regexList`` is applied
    and each key with matches maps to the set of its matches.
    """
    if not args.ioc:
        regex_hits = {}
        for name, pattern in regexList.items():
            matches = set(re.findall(pattern, text))
            if matches:
                regex_hits[name] = matches
        return regex_hits

    print("")
    # Run every extractor first, then keep only the non-empty categories.
    extracted = [
        ("urls", list(iocextract.extract_urls(text))),
        ("ips", list(iocextract.extract_ips(text))),
        ("emails", list(iocextract.extract_emails(text))),
        ("hashes", list(iocextract.extract_hashes(text))),
        ("rules", list(iocextract.extract_yara_rules(text))),
    ]
    return {name: hits for name, hits in extracted if hits}
def _utility_ioc_extractor_function(self, event, *args, **kwargs):
    """Extract IOCs from the ``text_string`` keyword argument.

    The text is stripped of HTML and NFKD-normalized, then scanned for
    IPv4/IPv6 addresses, URLs, email addresses and MD5/SHA-256 hashes.
    Each result list is de-duplicated with first-seen order preserved.

    Keyword Args:
        text_string: Text (possibly HTML) to scan.
        incident_id: Accepted by the function definition but not used here.

    Yields:
        ``FunctionResult(results)`` on success, where ``results`` contains
        the extracted lists and ``was_successful`` set to True; otherwise
        ``FunctionError()`` after the failure has been logged.
    """
    log = logging.getLogger(__name__)  # Establish logging

    def _unique(items):
        # OrderedDict.fromkeys() preserves order and removes duplicates.
        return list(OrderedDict.fromkeys(items))

    results = {"was_successful": False}
    try:
        # Get the function parameters:
        text_string = kwargs.get("text_string")  # text

        # Strip HTML and normalize text
        text_string = unicodedata.normalize(
            "NFKD",
            BeautifulSoup(text_string, "html.parser").get_text(' '))

        # Parse IOCs by type from text_string
        results["ipv4s"] = _unique(
            iocextract.extract_ipv4s(text_string, refang=True))
        results["ipv6s"] = _unique(iocextract.extract_ipv6s(text_string))
        # URLs and domains
        results["urls"] = _unique(
            iocextract.extract_urls(text_string, refang=True))
        # domains only
        results["domains"] = _unique(
            [urlparse(url).netloc for url in results["urls"]])
        results["email_addresses"] = _unique(
            iocextract.extract_emails(text_string, refang=True))
        # domains only
        results["email_domains"] = _unique(
            [email.split('@')[1] for email in results["email_addresses"]])
        results["md5_hashes"] = _unique(
            iocextract.extract_md5_hashes(text_string))
        results["sha256_hashes"] = _unique(
            iocextract.extract_sha256_hashes(text_string))
        results["was_successful"] = True

        # Produce a FunctionResult with the results
        yield FunctionResult(results)
    except Exception:
        # Record the traceback before reporting the failure; previously the
        # error was discarded without any trace.
        log.exception("IOC extraction failed")
        yield FunctionError()
def ioc_parse(line):
    """
    Use library that can handle defanged formats for IOCs (Indicators of Compromise)

    Scans *line* for URLs, IPv4/IPv6 addresses, emails, hashes and YARA
    rules, in that order. Returns ``(formatted, params)`` where ``params``
    holds one ``get_ioc_param`` result per match; URL, IPv4 and email
    entries additionally get the refanged value appended.
    """
    params = []
    formatted = line

    # NOTE(review): entry[0] / entry[1] are presumably the start and end
    # offsets of the match inside ``formatted`` -- verify against
    # get_ioc_param. Each extractor iterates over the string as it was when
    # that loop started; ``formatted`` is rebound after every match.
    for url in iocextract.extract_urls(formatted, strip=True):
        clean_url = iocextract.refang_url(url)
        entry = get_ioc_param('url', url, formatted)
        entry.append(clean_url)
        params.append(entry)
        head, tail = formatted[:entry[0]], formatted[entry[1]:]
        formatted = '{}{}{}'.format(head, url, tail)

    for ip in iocextract.extract_ipv4s(formatted):
        clean_ip = iocextract.refang_ipv4(ip)
        entry = get_ioc_param('ip_address', ip, formatted)
        entry.append(clean_ip)
        params.append(entry)
        head, tail = formatted[:entry[0]], formatted[entry[1]:]
        formatted = '{}{}{}'.format(head, ip, tail)

    for ip in iocextract.extract_ipv6s(formatted):
        entry = get_ioc_param('ip_address', ip, formatted)
        params.append(entry)
        head, tail = formatted[:entry[0]], formatted[entry[1]:]
        formatted = '{}{}{}'.format(head, ip, tail)

    for email in iocextract.extract_emails(formatted):
        clean_email = iocextract.refang_email(email)
        entry = get_ioc_param('email', email, formatted)
        entry.append(clean_email)
        params.append(entry)
        head, tail = formatted[:entry[0]], formatted[entry[1]:]
        formatted = '{}{}{}'.format(head, email, tail)

    for digest in iocextract.extract_hashes(formatted):
        entry = get_ioc_param('hash', digest, formatted)
        params.append(entry)
        head, tail = formatted[:entry[0]], formatted[entry[1]:]
        formatted = '{}{}{}'.format(head, digest, tail)

    for rule in iocextract.extract_yara_rules(formatted):
        entry = get_ioc_param('yara_rule', rule, formatted)
        params.append(entry)
        head, tail = formatted[:entry[0]], formatted[entry[1]:]
        formatted = '{}{}{}'.format(head, rule, tail)

    return formatted, params
def create_group_pulse(input_text):
    """Extract IOCs from *input_text* and publish them as a new OTX pulse.

    The pulse is titled ``SlackIOCs - <unix time>``, is public, tagged
    ``covid19``, uses TLP ``White`` and is attached to group 840. The
    indicator list and the API response are printed.

    Args:
        input_text: Text to scan for URLs, IP addresses, SHA-256/SHA-1/MD5
            hashes and email addresses.
    """
    # Create the pulse title
    pulse_title = 'SlackIOCs - ' + str(int(time.time()))

    # NOTE(review): the API key is empty in this file; it must be supplied
    # for the OTX request to be authorized.
    API_KEY = ''
    otx = OTXv2(API_KEY)
    group_id = 840

    # IPv4 and IPv6 are extracted separately so that each address gets the
    # matching indicator type (the combined extract_ips() yields both
    # families, which previously were all labelled 'IPv4').
    extractors = (
        (iocextract.extract_urls, 'URL'),
        (iocextract.extract_ipv4s, 'IPv4'),
        (iocextract.extract_ipv6s, 'IPv6'),
        (iocextract.extract_sha256_hashes, 'FileHash-SHA256'),
        (iocextract.extract_sha1_hashes, 'FileHash-SHA1'),
        (iocextract.extract_md5_hashes, 'FileHash-MD5'),
        (iocextract.extract_emails, 'EMAIL'),
    )

    # Create a list of indicators
    indicators = [
        {'indicator': value, 'type': indicator_type}
        for extract, indicator_type in extractors
        for value in extract(input_text)
    ]

    print('Adding ' + str(indicators))
    response = otx.create_pulse(name=pulse_title,
                                public=True,
                                indicators=indicators,
                                tags=['covid19'],
                                references=[],
                                group_ids=[group_id],
                                tlp='White')
    print('Response: ' + str(response))
def artifacts(self, raw):
    """Build artifacts for the URLs, IPv4s, emails and hashes found in *raw*.

    *raw* is converted with ``str()`` before extraction. Artifacts are
    returned in the order url, ip, mail, hash, each created through
    ``self.build_artifact(<type>, <value as str>)``.
    """
    text = str(raw)
    # All extractors run before any artifact is built.
    extracted = (
        ('url', list(iocextract.extract_urls(text))),
        ('ip', list(iocextract.extract_ipv4s(text))),
        ('mail', list(iocextract.extract_emails(text))),
        ('hash', list(iocextract.extract_hashes(text))),
    )
    return [
        self.build_artifact(data_type, str(value))
        for data_type, values in extracted
        for value in values
    ]
def test_defang_at(self):
    """Supported bracket styles around '@' must refang to the original email.

    Each style replaces the '@' in the sample address; extraction with
    ``refang=True`` must return exactly one result equal to the sample.
    """
    address = "*****@*****.**"
    paren_styles = ["(@)", "(@", "@)"]
    square_styles = ["[@]", "[@", "@]"]
    curly_styles = ["{@}", "{@", "@}"]

    for style in paren_styles + square_styles + curly_styles:
        defanged = address.replace("@", style)
        found = list(iocextract.extract_emails(defanged, refang=True))
        self.assertEqual(len(found), 1, "failed defang on: " + style)
        self.assertEqual(found[0], address)
def each(self, target):
    """Extract IOCs from the strings of *target* and register them.

    The strings returned by ``_strings(target)`` are joined with spaces and
    scanned for IPs, emails, hashes and YARA rules. Values found in
    ``blacklist`` are dropped; the rest are stored in
    ``self.results['iocs']`` and passed one by one to ``self.add_ioc``.
    Always returns True.
    """
    self.results = {}

    # One space-separated string holding every extracted string.
    joined = ' '.join(_strings(target))

    # URL extraction was commented out in the original and stays disabled.
    candidates = (
        list(iocextract.extract_ips(joined))
        + list(iocextract.extract_emails(joined))
        + list(iocextract.extract_hashes(joined))
        + list(iocextract.extract_yara_rules(joined))
    )
    kept = [ioc for ioc in candidates if ioc not in blacklist]
    self.results['iocs'] = kept

    # Add observables
    for ioc in kept:
        self.add_ioc(ioc)

    # TODO: tag
    return True
def artifacts(self, raw):
    """Return a file artifact, or artifacts for IOCs found in *raw*.

    When ``self.filename`` is truthy, a single "file" artifact for it is
    returned. Otherwise *raw* is converted with ``str()``, escaped double
    quotes are unescaped, and the unique URLs, IPv4s and email addresses
    found are returned as "url", "ip" and "mail" artifacts.
    """
    if self.filename:
        return [self.build_artifact("file", self.filename)]

    text = str(raw).replace('\\"', '"')
    # Sets de-duplicate the matches; all extraction happens before any
    # artifact is built.
    unique_hits = (
        ("url", set(iocextract.extract_urls(text))),
        ("ip", set(iocextract.extract_ipv4s(text))),
        ("mail", set(iocextract.extract_emails(text))),
    )
    return [
        self.build_artifact(data_type, str(value))
        for data_type, values in unique_hits
        for value in values
    ]
def check_clippy(iocs):
    """Poll the clipboard once per second and submit IOCs found in new text.

    Runs forever. Whenever the clipboard holds text that differs from the
    last parsed text, URLs, IPs and emails (refanged) and hashes are
    extracted; if anything was found, the de-duplicated list is scheduled
    for ``win.submit_iocs`` through ``GLib.idle_add``.

    Args:
        iocs: Initial list of IOCs; it is rebound, never mutated in place.
    """
    # (extractor, keyword arguments) in the order they are applied.
    extractors = (
        (extract_urls, {'refang': True}),
        (extract_ips, {'refang': True}),
        (extract_emails, {'refang': True}),
        (extract_hashes, {}),
    )
    last_text = ''
    while True:
        text = clipboard.wait_for_text()
        # If there's text and it has not already been parsed
        if text is not None and text != last_text:
            found_any = False
            for extract, options in extractors:
                matches = iter_check(extract(text, **options))
                if matches is not None:
                    # Concatenate into a new list so the caller's list is
                    # left untouched.
                    iocs = iocs + list(matches)
                    found_any = True
            # NOTE(review): nesting of the reset below and of the
            # last_text update was reconstructed from a flattened source
            # line -- confirm against the original file.
            if found_any:
                GLib.idle_add(win.submit_iocs, list(set(iocs)))
                iocs = []
            last_text = text
        time.sleep(1)
def test_ip_email(self):
    """The sample address must be extracted as exactly one unchanged email."""
    address = "*****@*****.**"
    extracted = list(iocextract.extract_emails(address))
    self.assertEqual(len(extracted), 1)
    self.assertEqual(extracted[0], address)
def test_xmpp(self):
    """Extraction from the sample address must yield exactly one result."""
    address = "*****@*****.**"
    extracted = list(iocextract.extract_emails(address))
    self.assertEqual(len(extracted), 1)
def test_email(self):
    """A plain email address is extracted once and returned unchanged."""
    address = "*****@*****.**"
    extracted = list(iocextract.extract_emails(address))
    self.assertEqual(len(extracted), 1)
    self.assertEqual(extracted[0], address)
def test_email_extract(self):
    """Check extraction of plain and defanged emails without refanging.

    Three groups of samples are checked, each one bare and after being
    passed through ``_wrap_spaces``, ``_wrap_tabs`` and ``_wrap_newlines``:

    * valid samples must come back unchanged as the first result;
    * invalid samples must produce no result;
    * partial samples must yield only the leading defanged address.
    """
    # Identity first, then the three whitespace wrappers, in check order.
    variants = (
        lambda text: text,
        _wrap_spaces,
        _wrap_tabs,
        _wrap_newlines,
    )

    valid_samples = [
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '*****@*****.**',
        '[email protected]',
        'myuser @example[.]com',
        'myuser@ example[.]com',
        'myuser @ example[.]com',
        'myuser @ example [ . ] com',
        'myuser @ example.com',
        'myuser@example [.] com',
        'myuser@example[.]com[.]tld',
        'myuser@example(.)com[.tld',
        'myuser@example[.]com.tld',
        'myuser@example [.] com.tld',
        'myuser@example [.] com [.] tld',
        'myuser@example [.] com [.tld',
        'myuser@example [ . ] com',
        'myuser@example [ . ] com [ .tld',
        'myuser@example [[[[ [ [ [ . )]) com',
        'myuser@example [[[[ [ [ [ dot )]) com',
        'myuser at example [[[[ [ [ [ dot )]) com',
        'myuser at example [ dot ] com',
        'myuser at example[ dot ]com',
        'myuser at example[dot]com',
        'myuser at example [dot] com',
        'myuser [at] example dot com',
        'myuser at example dot com',
        'myuser AT example DOT com',
        'myuser[@]example.com',
        'myuser[@]example[.com',
        'myuser [ at ] example.com',
        'myuser [at] example.com',
        'myuser[at]example.com',
        'myuser[ at ]example.com',
        'myuser/AT/example/DOT/com',
    ]
    for sample in valid_samples:
        for wrap in variants:
            first_hit = list(iocextract.extract_emails(wrap(sample)))[0]
            self.assertEqual(first_hit, sample)

    invalid_samples = [
        '@a.co',
        'myuser@',
        '@',
        # don't extract non-fqdn emails
        'a@a',
        'myuser @ word more words',
        'myuser @ word more words.period',
        'myuser @ words. Sentence',
        # the domain is split by a line break
        'myuser@example . \ncom',
        'myuser@example .]com',
        'myuseratexampledotcom',
        'myuseratexample dot com',
        'myuser at exampledotcom',
    ]
    for sample in invalid_samples:
        for wrap in variants:
            hits = list(iocextract.extract_emails(wrap(sample)))
            self.assertEqual(len(hits), 0)

    expected = 'myuser@example [.] com'
    partial_samples = [
        'myuser@example [.] com. tld',
        'myuser@example [.] com . tld',
        'myuser@example [.] com!!!???',
    ]
    for sample in partial_samples:
        for wrap in variants:
            first_hit = list(iocextract.extract_emails(wrap(sample)))[0]
            self.assertEqual(first_hit, expected)
for filename in os.listdir(path): if(count > maximum - 1): break if(filename in skip_files): continue # Extract text from pdf filepath = os.path.join(path, filename) content = convert_pdf_txt(filepath) # Extract Indicators of Compromise from text, recording time extracted_files[filename] = {} extract_start_time = time.time() extracted_files[filename]["urls"] = list(iocextract.extract_urls(content, refang=True)) extracted_files[filename]["email_addresses"] = list(iocextract.extract_emails(content, refang=True)) extracted_files[filename]["ipv4s"] = list(iocextract.extract_ipv4s(content, refang=True)) extracted_files[filename]["ipv6s"] = list(iocextract.extract_ipv6s(content)) extracted_files[filename]["md5s"] = list(iocextract.extract_md5_hashes(content)) extracted_files[filename]["sha1s"] = list(iocextract.extract_sha1_hashes(content)) extracted_files[filename]["sha256s"] = list(iocextract.extract_sha256_hashes(content)) extracted_files[filename]["sha512s"] = list(iocextract.extract_sha512_hashes(content)) extracted_files[filename]["yara"] = list(iocextract.extract_yara_rules(content)) extract_avg_numerator += time.time() - extract_start_time count += 1 process_end_time = time.time() # add some meta info on process run time extracted_files["meta"] = {