def test_ip_regex_allows_backslash_escape(self):
    """Backslash-escaped dots in defanged IPs must refang to plain dots."""
    defanged_variants = [
        '10.10.10\.10',
        '10.10.10\\\\\\\\.10',
        '10\.10\.10\.10',
        '10\\\\\\\\\.10\\.10\.10',
        '10[.]10(.10\.10',
    ]
    for variant in defanged_variants:
        extracted = list(iocextract.extract_ips(variant, refang=True))
        self.assertEqual(extracted[0], '10.10.10.10')
def _sniff_text(text):
    """Scan *text* for findings and return them as a dict keyed by category.

    When ``args.ioc`` is set, the iocextract extractors are used; otherwise
    the patterns in the module-level ``regexList`` are applied.
    """
    results = {}
    if args.ioc:
        print("")
        # Run every iocextract extractor, keeping only non-empty result lists.
        ioc_findings = {
            "urls": list(iocextract.extract_urls(text)),
            "ips": list(iocextract.extract_ips(text)),
            "emails": list(iocextract.extract_emails(text)),
            "hashes": list(iocextract.extract_hashes(text)),
            "rules": list(iocextract.extract_yara_rules(text)),
        }
        results = {name: found for name, found in ioc_findings.items() if found}
    else:
        # Fall back to the configured regex table; findings are deduplicated sets.
        for name, pattern in regexList.items():
            matches = set(re.findall(pattern, text))
            if matches:
                results[name] = matches
    return results
def extract(filein, fileout):
    """Extract text from the PDF *filein* and write resolved IP info to *fileout*.csv.

    Each IP found in the PDF text is passed to ``resolveIP`` and written as a
    CSV row of (IP, ASN, Country Code); IPs that fail to resolve are skipped.
    """
    # Set up pdfminer text extraction into an in-memory buffer.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'  # 'utf16','utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    # Render every page; `with` guarantees the input handle is closed even if
    # pdfminer raises mid-document (the original leaked it on error).
    with open(filein, mode='rb') as f:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(f):
            interpreter.process_page(page)
    device.close()
    text = retstr.getvalue()
    retstr.close()

    with open(fileout + ".csv", "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["IP", "ASN", "Country Code"])
        for ip in iocextract.extract_ips(text, refang=True):
            print(ip)
            try:
                ans = resolveIP(ip)
            except Exception:
                # BUG FIX: the original still executed writer.writerow(ans)
                # after a failed resolveIP, writing a stale row from a prior
                # iteration (or raising NameError on the first). Now the bad
                # IP is reported and skipped.
                print("An error has occured")
            else:
                writer.writerow(ans)
    # NOTE(review): the redundant file.close() after the `with` block was removed.
    return
def extract_text_obserables(username, text):
    """Pull IP and URL observables for *username* out of a tweet's text.

    Paste-site URLs (ghostbin/pastebin) are expanded into their own
    observables via extract_paste_observables; extraction errors are logged
    and an empty/partial list is returned.
    """
    observables = []
    user_id = '@{0}'.format(username)
    user_url = 'https://twitter.com/{0}'.format(username)
    try:
        # IP observables.
        for ip in iocextract.extract_ips(text, refang=True):
            if validate_ip(ip):
                observables.append(
                    TwitterObservable(user_id, user_url, 'ip', ip))
        # URL observables.
        for url in iocextract.extract_urls(text, refang=True):
            is_paste_site = 'ghostbin.com' in url or 'pastebin.com' in url
            if is_paste_site:
                paste_observables = extract_paste_observables(username, url)
                if len(paste_observables) > 0:
                    observables.extend(paste_observables)
            elif validate_url(url):
                observables.append(
                    TwitterObservable(user_id, user_url, 'url', clean_url(url)))
    except Exception as e:
        logger.warning('Exception parsing text: {0}'.format(e))
    return observables
def test_ip_regex_allows_multiple_brackets(self):
    """Runs of brackets/parens around the dots must still refang cleanly."""
    for fanged in (
        '10.10.10.]]]10',
        '10.10.10.)))10',
        '10.10.10[[[.10',
        '10[[[[.]]]]10[[[.]]10[.10',
        '10(((.]]]]10([[.)10.)10',
    ):
        refanged = list(iocextract.extract_ips(fanged, refang=True))
        self.assertEqual(refanged[0], '10.10.10.10')
def extract_URLs(content):
    """Extract URLs and plausibly-formatted IPs from *content*.

    Combines URLExtract and iocextract results, strips trailing commas, and
    returns the merged list. Returns None when *content* is None (original
    contract preserved).
    """
    if content is not None:
        print("\n***** Extract URLs *****\n")
        # Identify URLs in content with two independent extractors.
        extractor = URLExtract()
        extractor_urls = extractor.find_urls(content)
        iocextract_urls = list(iocextract.extract_urls(content, refang=True))
        iocextract_ips = list(iocextract.extract_ips(content, refang=True))
        iocextract_ips_valid = []
        if len(iocextract_ips) > 0:
            for ip in iocextract_ips:
                # Basic shape check on candidate IPs:
                #   IPv4: xxx.xxx.xxx.xxx (exactly 3 dots)
                #   IPv6: xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx (exactly 7 colons)
                # BUG FIX: the original used `or`, which rejected EVERY address
                # (an IPv4 has no colons, an IPv6 has no dots). An address is
                # only invalid when it matches NEITHER shape.
                if ip.count(".") != 3 and ip.count(":") != 7:
                    print("Invalid IP address: " + str(ip))
                else:
                    iocextract_ips_valid.append(ip)
        print("iocextract.extract_ips method - format validated")
        print(iocextract_ips_valid)
        print("extractor.find method")
        print(extractor_urls)
        print("iocextract.extract_urls method")
        print(iocextract_urls)
        info_to_evaluate = extractor_urls + iocextract_urls + iocextract_ips_valid
        # Occasionally the extractors return URLs with trailing commas; strip them.
        for index, ioc in enumerate(info_to_evaluate):
            if ioc.endswith(','):
                info_to_evaluate[index] = ioc[:-1]
        print("Removed trailing commas")
        print(info_to_evaluate)
        print("Successfully extracted URLs")
        return info_to_evaluate
def main():
    """Parse FILENAME as STIX, mine the first indicator's description for IOCs, and print them."""
    # Parse the input file and convert the package to a plain dict.
    stix_package = STIXPackage.from_xml(FILENAME)
    stix_dict = stix_package.to_dict()
    # Description of the first indicator (suitable for indicator-only packages).
    description = stix_dict["indicators"][0]["description"]

    # Defanged domain / email / URL candidates, e.g. example[.]com.
    raw_iocs = re.findall(
        r'[a-zA-Z0-9-\.]*\[\.?\@?\][a-zA-Z0-9-\.\[\.\@\]]*[-a-zA-Z0-9@:%_\+.~#?&//=]*',
        description)
    print(len(raw_iocs))

    for idx in range(len(raw_iocs)):
        # Refang using the on9strings replacement table.
        for needle, replacement in on9strings.items():
            raw_iocs[idx] = raw_iocs[idx].replace(needle, replacement)
        # Classify the candidate into the email / url / domain bucket.
        candidate = raw_iocs[idx]
        if re.match(r'.*[@]+', candidate):
            iocs['email'].append(candidate)
        elif re.match(r'.*[//].*', candidate):
            iocs['url'].append(candidate)
        elif re.match(r'.*[a-zA-Z]', candidate):
            iocs['domain'].append(candidate)

    # Hashes, YARA rules and IPs come straight from iocextract.
    for hash_extracted in iocextract.extract_hashes(description):
        iocs['hash'].append(hash_extracted)
    for yara_extracted in iocextract.extract_yara_rules(description):
        iocs['yara'].append(yara_extracted)
    for ip_extracted in iocextract.extract_ips(description, refang=True):
        iocs['ip'].append(ip_extracted)

    for key in iocs:
        for item in iocs[key]:
            print(key + ":" + item)
def create_group_pulse(input_text):
    """Extract IOCs from *input_text* and publish them as an OTX group pulse."""
    # Title the pulse with the current unix timestamp.
    unix_time = str(int(time.time()))
    pulse_title = 'SlackIOCs - ' + unix_time
    # NOTE(review): API key is hard-coded empty here — presumably filled in
    # before deployment; confirm it is not expected from config/env.
    API_KEY = ''
    otx = OTXv2(API_KEY)
    group_id = 840

    # Each extractor feeds one OTX indicator type.
    extractor_types = (
        (iocextract.extract_urls, 'URL'),
        (iocextract.extract_ips, 'IPv4'),
        (iocextract.extract_sha256_hashes, 'FileHash-SHA256'),
        (iocextract.extract_sha1_hashes, 'FileHash-SHA1'),
        (iocextract.extract_md5_hashes, 'FileHash-MD5'),
        (iocextract.extract_emails, 'EMAIL'),
    )
    indicators = []
    for extract_fn, indicator_type in extractor_types:
        for value in extract_fn(input_text):
            indicators.append({'indicator': value, 'type': indicator_type})

    print('Adding ' + str(indicators))
    response = otx.create_pulse(name=pulse_title, public=True,
                                indicators=indicators, tags=['covid19'],
                                references=[], group_ids=[group_id],
                                tlp='White')
    print('Response: ' + str(response))
def extract(filein, fileout):
    """Extract text from the PDF *filein*; write IPs, URLs and hashes to *fileout*.txt.

    Output is written latin-1 encoded, one IOC per line, under
    ``=== IP ===`` / ``=== URL ===`` / ``=== Hashes ===`` section headers.
    """
    # Set up pdfminer text extraction into an in-memory buffer.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'  # 'utf16','utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    # Render every page. `with` guarantees the input file is closed even if
    # pdfminer raises mid-document (the original leaked the handle on error).
    with open(filein, mode='rb') as f:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(f):
            interpreter.process_page(page)
    device.close()
    text = retstr.getvalue()
    retstr.close()

    # Write each IOC category; `with` replaces the manual open/close pair so
    # the output handle cannot leak if an extractor raises.
    with open(fileout + ".txt", mode="wb") as fout:
        fout.write(b"=== IP ===\n")
        for ip in iocextract.extract_ips(text, refang=True):
            fout.write(ip.encode("latin-1") + b"\n")
        fout.write(b"=== URL ===\n")
        for url in iocextract.extract_urls(text, refang=True):
            fout.write(url.encode("latin-1") + b"\n")
        fout.write(b"=== Hashes ===\n")
        for _hash in iocextract.extract_hashes(text):
            fout.write(_hash.encode("latin-1") + b"\n")
    return
def each(self, target):
    """Extract IOCs from the strings of *target*, record them, and add observables."""
    self.results = dict()

    # Combine all extracted strings into one space-separated blob.
    target_strings = ' '.join(list(_strings(target)))

    # Run every enabled extractor and pool the findings.
    extractors = (
        iocextract.extract_ips,
        iocextract.extract_emails,
        iocextract.extract_hashes,
        iocextract.extract_yara_rules,
        # iocextract.extract_urls,  # intentionally disabled in the original
    )
    iocs = []
    for extract_fn in extractors:
        iocs.extend(list(extract_fn(target_strings)))

    # Drop blacklisted values in place.
    iocs[:] = (value for value in iocs if value not in blacklist)

    self.results['iocs'] = iocs

    # Add observables
    for ioc in self.results['iocs']:
        self.add_ioc(ioc)

    # TODO: tag
    return True
def check_clippy(iocs):
    """Poll the clipboard forever, accumulating IOCs and pushing batches to the UI."""
    last_text = ''
    while True:
        iocs_found = False
        text = clipboard.wait_for_text()
        # Only parse non-empty clipboard contents we haven't seen already.
        if text is not None and text != last_text:
            # Each extractor result is gated through iter_check; a non-None
            # result means at least one IOC was found.
            extractions = (
                iter_check(extract_urls(text, refang=True)),
                iter_check(extract_ips(text, refang=True)),
                iter_check(extract_emails(text, refang=True)),
                iter_check(extract_hashes(text)),
            )
            for found in extractions:
                if found is not None:
                    iocs = iocs + [item for item in found]
                    iocs_found = True
            if iocs_found:
                # Hand the deduplicated batch to the GTK main loop.
                GLib.idle_add(win.submit_iocs, list(set(iocs)))
                iocs = []
            last_text = text
        time.sleep(1)
def test_ipv6_included_in_ips(self):
    """An IPv6 address should be returned by the generic IP extractor."""
    content = '2001:0db8:85a3:0000:0000:8a2e:0370:7334'
    extracted = list(iocextract.extract_ips(content))
    self.assertEqual(extracted[0], content)
def test_ipv4_included_in_ips(self):
    """An IPv4 address should be returned by the generic IP extractor."""
    content = '127.0.0.1'
    extracted = list(iocextract.extract_ips(content))
    self.assertEqual(extracted[0], content)
def get_ips(content):
    """Return every IP iocextract finds in *content*, materialized as a list."""
    return list(iocextract.extract_ips(content))
def process_element(self, content, reference_link, include_nonobfuscated=False):
    """Take a single source content/url and return a list of Artifacts.

    Extracts URLs/domains, IPs, YARA rules and hashes from *content*, plus a
    generic Task artifact pointing back at *reference_link*.
    """
    # truncate content to a reasonable length for reference_text
    reference_text = content[:TRUNCATE_LENGTH] + (
        '...' if len(content) > TRUNCATE_LENGTH else '')

    artifact_list = []

    # collect URLs and domains
    scraped = iocextract.extract_urls(content)
    for url in scraped:
        # dump anything with ellipses, these get through the regex
        if u'\u2026' in url:
            continue
        artifact = threatingestor.artifacts.URL(
            url, self.name, reference_link=reference_link,
            reference_text=reference_text)
        # dump urls that appear to have the same domain as reference_url
        if artifact.domain() == urlparse(reference_link).netloc:
            continue
        if artifact.is_obfuscated() or include_nonobfuscated:
            # do URL collection
            artifact_list.append(artifact)
            # do domain collection in the same pass
            if artifact.is_domain():
                artifact_list.append(
                    threatingestor.artifacts.Domain(
                        artifact.domain(), self.name,
                        reference_link=reference_link,
                        reference_text=reference_text))

    # collect IPs
    scraped = iocextract.extract_ips(content)
    for ip in scraped:
        artifact = threatingestor.artifacts.IPAddress(
            ip, self.name, reference_link=reference_link,
            reference_text=reference_text)
        try:
            ipaddress = artifact.ipaddress()
            if ipaddress.is_private or ipaddress.is_loopback or ipaddress.is_reserved:
                # don't care
                continue
        except ValueError:
            # invalid IP
            continue
        artifact_list.append(artifact)

    # collect yara rules
    scraped = iocextract.extract_yara_rules(content)
    for rule in scraped:
        artifact_list.append(
            threatingestor.artifacts.YARASignature(
                rule, self.name, reference_link=reference_link,
                reference_text=reference_text))

    # collect hashes
    scraped = iocextract.extract_hashes(content)
    for hash_ in scraped:
        artifact_list.append(
            threatingestor.artifacts.Hash(
                hash_, self.name, reference_link=reference_link,
                reference_text=reference_text))

    # generate generic task
    # BUG FIX: the original line read `= "Manual Task: ..."` with no left-hand
    # side (a SyntaxError); `title` is reconstructed from its use in the
    # Task(...) call below.
    title = "Manual Task: {u}".format(u=reference_link)
    description = 'URL: {u}\nTask autogenerated by ThreatIngestor from source: {s}'.format(
        s=self.name, u=reference_link)
    artifact = threatingestor.artifacts.Task(
        title, self.name, reference_link=reference_link,
        reference_text=description)
    artifact_list.append(artifact)

    return artifact_list
def parse_indicators_from_description_string(self, description_string, title):
    """Extract IOCs (domain/ip/email/hash/url/yara) from a free-text description.

    :param description_string: Text to mine for (possibly defanged) IOCs.
    :param title: Title stored verbatim under the 'title' key.
    :returns: Dict of deduplicated IOC lists keyed by type.
    """
    # BUG FIX: the original dict literal declared 'hash' twice (the second
    # silently overrode the first); the duplicate key is removed. The large
    # block of commented-out dead code duplicating the logic below was also
    # removed.
    iocs = {
        'title': title,
        'domain': [],
        'ip': [],
        'email': [],
        'hash': [],
        'url': [],
        'yara': [],
        'other': []
    }
    # Defang-reversal table.
    on9strings = {'[.]': '.', 'hxxp': 'http', '[@]': '@'}

    # Pattern for defanged domain / email / IP candidates, e.g. example[.]com.
    raw_iocs = re.findall(
        r'[a-zA-Z0-9-\.]*\[\.?\@?\][a-zA-Z0-9-\.\[\.\@\]]*[-a-zA-Z0-9@:%_\+.~#?&//=]*',
        description_string)

    for i in range(len(raw_iocs)):
        # Refang the candidate.
        for on9string in on9strings:
            raw_iocs[i] = raw_iocs[i].replace(on9string, on9strings[on9string])
        # Classify into the email / url / domain bucket, deduplicating as we go.
        if re.match(r'.*[@]+', raw_iocs[i]):
            iocs['email'].append(raw_iocs[i])
            iocs['email'] = list(set(iocs['email']))
        elif re.match(r'.*[//].*', raw_iocs[i]):
            iocs['url'].append(raw_iocs[i])
            iocs['url'] = list(set(iocs['url']))
        elif re.match(r'.*[a-zA-Z]', raw_iocs[i]):
            if re.match("^([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}$", raw_iocs[i]):
                iocs['domain'].append(raw_iocs[i])
                iocs['domain'] = list(set(iocs['domain']))

    # Extract hashes by their plugin.
    for hash_extracted in iocextract.extract_hashes(description_string):
        iocs['hash'].append(hash_extracted)
    iocs['hash'] = list(set(iocs['hash']))

    # Extract Yara rules.
    for yara_extracted in iocextract.extract_yara_rules(description_string):
        iocs['yara'].append(yara_extracted)
    iocs['yara'] = list(set(iocs['yara']))

    # Extract IPs, validating the dotted-quad shape.
    for ip_extracted in iocextract.extract_ips(description_string, refang=True):
        if re.match(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", ip_extracted):
            iocs['ip'].append(ip_extracted)
    iocs['ip'] = list(set(iocs['ip']))

    return iocs
def _parse_indicators_from_stix_description(self, xml_content):
    """Parse a STIX XML document and extract IOCs from its first indicator.

    :param xml_content: STIX XML file path or file-like object accepted by
        STIXPackage.from_xml.
    :returns: Dict of deduplicated IOC lists keyed by type, plus the title.
    """
    # BUG FIX: the original dict literal declared 'hash' twice (the second
    # silently overrode the first); the duplicate key is removed.
    iocs = {
        'title': '',
        'domain': [],
        'ip': [],
        'email': [],
        'hash': [],
        'url': [],
        'yara': [],
        'other': []
    }
    # Defang-reversal table.
    on9strings = {'[.]': '.', 'hxxp': 'http', '[@]': '@'}

    # Parse input file and convert the package to a plain dict.
    stix_package = STIXPackage.from_xml(xml_content)
    stix_dict = stix_package.to_dict()

    # Description and title of the first indicator (indicator-only feeds).
    description = stix_dict["indicators"][0]["description"]
    title = stix_dict["indicators"][0]["title"]
    iocs['title'] = [title]

    # Pattern for defanged domain / email / IP candidates, e.g. example[.]com.
    raw_iocs = re.findall(
        r'[a-zA-Z0-9-\.]*\[\.?\@?\][a-zA-Z0-9-\.\[\.\@\]]*[-a-zA-Z0-9@:%_\+.~#?&//=]*',
        description)

    for i in range(len(raw_iocs)):
        # Refang the candidate.
        for on9string in on9strings:
            raw_iocs[i] = raw_iocs[i].replace(on9string, on9strings[on9string])
        # Classify into the email / url / domain bucket, deduplicating as we go.
        if re.match(r'.*[@]+', raw_iocs[i]):
            iocs['email'].append(raw_iocs[i])
            iocs['email'] = list(set(iocs['email']))
        elif re.match(r'.*[//].*', raw_iocs[i]):
            iocs['url'].append(raw_iocs[i])
            iocs['url'] = list(set(iocs['url']))
        elif re.match(r'.*[a-zA-Z]', raw_iocs[i]):
            if re.match("^([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}$", raw_iocs[i]):
                iocs['domain'].append(raw_iocs[i])
                iocs['domain'] = list(set(iocs['domain']))

    # Extract hashes by their plugin.
    for hash_extracted in iocextract.extract_hashes(description):
        iocs['hash'].append(hash_extracted)
    iocs['hash'] = list(set(iocs['hash']))

    # Extract Yara rules.
    for yara_extracted in iocextract.extract_yara_rules(description):
        iocs['yara'].append(yara_extracted)
    iocs['yara'] = list(set(iocs['yara']))

    # Extract IPs, validating the dotted-quad shape.
    for ip_extracted in iocextract.extract_ips(description, refang=True):
        if re.match(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", ip_extracted):
            iocs['ip'].append(ip_extracted)
    iocs['ip'] = list(set(iocs['ip']))

    return iocs
def process_element(self, content, reference_link, include_nonobfuscated=False):
    """Take a single source content/url and return a list of Artifacts.

    This is the main work block of Source plugins, which handles IOC
    extraction and artifact creation.

    :param content: String content to extract from.
    :param reference_link: Reference link to attach to all artifacts.
    :param include_nonobfuscated: Include non-defanged URLs in output?
    :returns: List of Artifact objects (URL, Domain, IPAddress, YARASignature,
        Hash, and always one trailing Task).
    """
    logger.debug(f"Processing in source '{self.name}'")

    # Truncate content to a reasonable length for reference_text.
    reference_text = content[:TRUNCATE_LENGTH] + (
        '...' if len(content) > TRUNCATE_LENGTH else '')

    # Initialize an empty list and a map of counters to track each artifact type.
    artifact_list = []
    artifact_type_count = {
        'domain': 0,
        'hash': 0,
        'ipaddress': 0,
        'task': 0,
        'url': 0,
        'yarasignature': 0,
    }

    # Collect URLs and domains.
    scraped = itertools.chain(
        iocextract.extract_unencoded_urls(content),
        # Decode encoded URLs, since we can't operate on encoded ones.
        iocextract.extract_encoded_urls(content, refang=True),
    )
    for url in scraped:
        # Dump anything with ellipses, these get through the regex.
        if u'\u2026' in url:
            continue

        artifact = threatingestor.artifacts.URL(
            url, self.name, reference_link=reference_link,
            reference_text=reference_text)

        # Dump URLs that appear to have the same domain as reference_url.
        try:
            if artifact.domain() == urlparse(reference_link).netloc:
                continue
        except ValueError:
            # Error parsing reference_link as a URL. Ignoring.
            pass

        if artifact.is_obfuscated() or include_nonobfuscated:
            # Do URL collection.
            artifact_list.append(artifact)
            artifact_type_count['url'] += 1

            # Do domain collection in the same pass.
            # Note: domains will always be a subset of URLs. There is no
            # domain extraction.
            if artifact.is_domain():
                artifact_list.append(
                    threatingestor.artifacts.Domain(
                        artifact.domain(), self.name,
                        reference_link=reference_link,
                        reference_text=reference_text))
                artifact_type_count['domain'] += 1

    # Collect IPs.
    scraped = iocextract.extract_ips(content)
    for ip in scraped:
        artifact = threatingestor.artifacts.IPAddress(
            ip, self.name, reference_link=reference_link,
            reference_text=reference_text)

        try:
            ipaddress = artifact.ipaddress()
            if ipaddress.is_private or ipaddress.is_loopback or ipaddress.is_reserved:
                # Skip private, loopback, reserved IPs.
                continue
        except ValueError:
            # Skip invalid IPs.
            continue

        artifact_list.append(artifact)
        artifact_type_count['ipaddress'] += 1

    # Collect YARA rules.
    scraped = iocextract.extract_yara_rules(content)
    for rule in scraped:
        artifact = threatingestor.artifacts.YARASignature(
            rule, self.name, reference_link=reference_link,
            reference_text=reference_text)

        artifact_list.append(artifact)
        artifact_type_count['yarasignature'] += 1

    # Collect hashes.
    scraped = iocextract.extract_hashes(content)
    for hash_ in scraped:
        artifact = threatingestor.artifacts.Hash(
            hash_, self.name, reference_link=reference_link,
            reference_text=reference_text)

        artifact_list.append(artifact)
        artifact_type_count['hash'] += 1

    # Generate generic task; always appended, so the list is never empty.
    title = f"Manual Task: {reference_link}"
    description = f"URL: {reference_link}\nTask autogenerated by ThreatIngestor from source: {self.name}"
    artifact = threatingestor.artifacts.Task(
        title, self.name, reference_link=reference_link,
        reference_text=description)
    artifact_list.append(artifact)
    artifact_type_count['task'] += 1

    logger.debug(f"Found {len(artifact_list)} total artifacts")
    logger.debug(f"Type breakdown: {artifact_type_count}")

    return artifact_list