def _sniff_text(text):
    """Scan *text* for IOCs and return a dict of every non-empty finding.

    When the global ``args.ioc`` flag is set, the iocextract library is used
    to pull URLs, IPs, emails, hashes and YARA rules; otherwise each pattern
    in the global ``regexList`` is applied with ``re.findall``.
    """
    results = {}
    if args.ioc:
        print("")
        # Run every iocextract extractor; keep only non-empty result lists.
        extracted = {
            "urls": list(iocextract.extract_urls(text)),
            "ips": list(iocextract.extract_ips(text)),
            "emails": list(iocextract.extract_emails(text)),
            "hashes": list(iocextract.extract_hashes(text)),
            "rules": list(iocextract.extract_yara_rules(text)),
        }
        for label, found in extracted.items():
            if found:
                results[label] = found
    else:
        # Fall back to the configured regex table; findings are de-duplicated.
        for key, pattern in regexList.items():
            findings = set(re.findall(pattern, text))
            if findings:
                results[key] = findings
    return results
def main(inp, out):
    """Collect unique hashes from *inp* into common.Hashes, then dump to *out*.

    Each hash found by iocextract is appended to the shared common.Hashes
    list unless already present; progress is echoed to stdout.
    """
    for record in inp.readlines():
        for digest in iocextract.extract_hashes(record):
            if digest in common.Hashes:
                print(digest + ' Already in List')
            else:
                common.Hashes.append(digest)
                print(digest + ', ')
    # Write the accumulated list as a quoted, comma-separated block.
    out.write('#####HASHES#####\n\n')
    for digest in common.Hashes:
        out.write('"' + digest + '", \n')
def ioc_parse(line):
    """Use library that can handle defanged formats for IOCs (Indicators of Compromise).

    Walks *line* once per IOC type (URLs, IPv4s, IPv6s, emails, hashes, YARA
    rules).  For each hit it collects location metadata from get_ioc_param(),
    optionally appends the refanged value, and re-splices the matched text
    back into the working copy of the line.

    :param line: raw text that may contain defanged IOCs
    :returns: tuple ``(formatted, params)`` — the re-spliced line and the
        list of per-IOC parameter lists
    """
    params = []        # one get_ioc_param() result per IOC found
    formatted = line   # working copy, re-spliced after every match
    for url in iocextract.extract_urls(formatted, strip=True):
        refanged = iocextract.refang_url(url)
        # NOTE(review): param appears to be [start, end, ...] since param[0]
        # and param[1] are used as slice bounds below — confirm against
        # get_ioc_param's definition.
        param = get_ioc_param('url', url, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], url, formatted[param[1]:])
    for ip in iocextract.extract_ipv4s(formatted):
        refanged = iocextract.refang_ipv4(ip)
        param = get_ioc_param('ip_address', ip, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], ip, formatted[param[1]:])
    # IPv6 addresses have no refang step (no defanged IPv6 form is handled).
    for ip in iocextract.extract_ipv6s(formatted):
        param = get_ioc_param('ip_address', ip, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], ip, formatted[param[1]:])
    for email in iocextract.extract_emails(formatted):
        refanged = iocextract.refang_email(email)
        param = get_ioc_param('email', email, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], email, formatted[param[1]:])
    # Hashes and YARA rules are never defanged, so no refang step either.
    for h in iocextract.extract_hashes(formatted):
        param = get_ioc_param('hash', h, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], h, formatted[param[1]:])
    for rule in iocextract.extract_yara_rules(formatted):
        param = get_ioc_param('yara_rule', rule, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], rule, formatted[param[1]:])
    return formatted, params
def test_hash_extract(self):
    """Both extract_hashes and extract_iocs should find all four digests."""
    content = """
    68b329da9893e34099c7d8ad5cb9c940
    adc83b19e793491b1c6ea0fd8b46cd9f32e592fc
    01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
    be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09
    """
    # The sample holds one each of MD5 / SHA-1 / SHA-256 / SHA-512; both
    # extractors must return them in document order, MD5 first.
    for extractor in (iocextract.extract_hashes, iocextract.extract_iocs):
        processed = list(extractor(content))
        self.assertEqual(len(processed), 4)
        self.assertEqual(processed[0], '68b329da9893e34099c7d8ad5cb9c940')
def main():
    """Parse FILENAME as STIX, mine its first indicator description, print IOCs.

    Classification goes into the global ``iocs`` dict: defanged fragments are
    refanged via the global ``on9strings`` map and sorted into email / url /
    domain buckets; hashes, YARA rules and IPs are pulled with iocextract.
    """
    # Parse the input file and flatten the package to a plain dict.
    stix_package = STIXPackage.from_xml(FILENAME)
    stix_dict = stix_package.to_dict()
    # Description of the first indicator (suitable for indicator files only).
    description = stix_dict["indicators"][0]["description"]
    # Pattern for defanged domain / email / IP fragments.
    raw_iocs = re.findall(
        r'[a-zA-Z0-9-\.]*\[\.?\@?\][a-zA-Z0-9-\.\[\.\@\]]*[-a-zA-Z0-9@:%_\+.~#?&//=]*',
        description)
    print(len(raw_iocs))
    for idx in range(len(raw_iocs)):
        # Refang by substituting each "on9" token with its clean form.
        candidate = raw_iocs[idx]
        for fanged in on9strings:
            candidate = candidate.replace(fanged, on9strings[fanged])
        raw_iocs[idx] = candidate
        # Classify the refanged fragment.
        if re.match(r'.*[@]+', candidate):
            iocs['email'].append(candidate)
        elif re.match(r'.*[//].*', candidate):
            iocs['url'].append(candidate)
        elif re.match(r'.*[a-zA-Z]', candidate):
            iocs['domain'].append(candidate)
    # Hashes, YARA rules and (refanged) IPs come from the dedicated extractors.
    iocs['hash'].extend(iocextract.extract_hashes(description))
    iocs['yara'].extend(iocextract.extract_yara_rules(description))
    iocs['ip'].extend(iocextract.extract_ips(description, refang=True))
    for key in iocs:
        for item in iocs[key]:
            print(key + ":" + item)
def artifacts(self, raw):
    """Build url/ip/mail/hash artifacts from the stringified *raw* input.

    Runs the four iocextract extractors in a fixed order and wraps every
    hit via self.build_artifact.
    """
    text = str(raw)
    collected = []
    # (artifact type, extractor) pairs, in the original emission order.
    extractors = (
        ('url', iocextract.extract_urls),
        ('ip', iocextract.extract_ipv4s),
        ('mail', iocextract.extract_emails),
        ('hash', iocextract.extract_hashes),
    )
    for kind, extract in extractors:
        for value in extract(text):
            collected.append(self.build_artifact(kind, str(value)))
    return collected
def extract_text_indicators(username, tweet_id, text):
    """Extract IOC indicators (IPv4, hashes, URLs/domains) from tweet text.

    :param username: Twitter handle the tweet belongs to (without '@').
    :param tweet_id: numeric/string id of the tweet, used to build its URL.
    :param text: raw tweet text to scan.
    :returns: list of TwitterIndicator objects (empty if nothing valid found).
    """
    indicator_list = []
    user_id = '@{0}'.format(username)
    tweet_url = 'https://twitter.com/{0}/status/{1}'.format(username, tweet_id)
    try:
        for ip in iocextract.extract_ipv4s(text, refang=True):
            if is_valid_ip(ip):
                indicator_list.append(
                    TwitterIndicator(user_id, tweet_url, 'IPv4', ip))
        # Fixed: loop variable was named `hash`, shadowing the builtin.
        for hash_value in iocextract.extract_hashes(text):
            hash_type = get_hash_type(hash_value)
            if hash_type:
                indicator_list.append(
                    TwitterIndicator(user_id, tweet_url, hash_type, hash_value))
        for url in iocextract.extract_urls(text, refang=True):
            # Paste sites get a second-level scrape of their content.
            if 'ghostbin.com' in url or 'pastebin.com' in url:
                paste_indicators = extract_paste_indicators(username, url)
                if paste_indicators:
                    indicator_list.extend(paste_indicators)
            url = apply_url_fixes(url)
            if is_valid_url(url):
                indicator_list.append(
                    TwitterIndicator(user_id, tweet_url, 'URL', url))
            elif INCLUDE_DOMAINS and is_valid_domain(url):
                # Bare domains are optional, gated by INCLUDE_DOMAINS.
                indicator_list.append(
                    TwitterIndicator(user_id, tweet_url, 'HOST', url))
    except Exception as ex:
        # Best-effort parsing: log and return whatever was gathered so far.
        LOGGER.warning('Exception parsing text: {0}'.format(ex))
    return indicator_list
def extract(filein, fileout):
    """Extract IPs, URLs and hashes from a PDF and write them to <fileout>.txt.

    :param filein: path to the input PDF.
    :param fileout: output path prefix; ".txt" is appended.

    Fixed: the original leaked the input file, device and output file on any
    exception; resources are now released via ``with`` / ``finally``.
    """
    # Set up extraction of text from the PDF.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'  # 'utf16' also works for some documents
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        with open(filein, mode='rb') as f:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(f):
                interpreter.process_page(page)
    finally:
        device.close()
    text = retstr.getvalue()
    retstr.close()
    # Write each IOC category under its own header, latin-1 encoded as before.
    with open(fileout + ".txt", mode="wb") as fout:
        fout.write(b"=== IP ===\n")
        for ip in iocextract.extract_ips(text, refang=True):
            fout.write(ip.encode("latin-1") + b"\n")
        fout.write(b"=== URL ===\n")
        for url in iocextract.extract_urls(text, refang=True):
            fout.write(url.encode("latin-1") + b"\n")
        fout.write(b"=== Hashes ===\n")
        for _hash in iocextract.extract_hashes(text):
            fout.write(_hash.encode("latin-1") + b"\n")
    return
def each(self, target):
    """Extract IOCs from *target*'s strings, drop blacklisted ones, record them.

    Populates self.results['iocs'] and registers every IOC as an observable
    via self.add_ioc. Always returns True.
    """
    self.results = dict()
    # Combine the extracted strings into one space-separated blob.
    target_strings = ' '.join(_strings(target))
    # Run the extractors in order and filter against the blacklist inline.
    # (URL extraction is intentionally disabled, as in the original.)
    extractors = (
        iocextract.extract_ips,
        iocextract.extract_emails,
        iocextract.extract_hashes,
        iocextract.extract_yara_rules,
    )
    iocs = [
        value
        for extract in extractors
        for value in extract(target_strings)
        if value not in blacklist
    ]
    self.results['iocs'] = iocs
    # Add observables
    for ioc in iocs:
        self.add_ioc(ioc)
    # TODO: tag
    return True
def check_clippy(iocs):
    """Poll the clipboard forever; accumulate new IOCs and push them to the UI.

    Each time the clipboard text changes, URLs, IPs, emails and hashes are
    extracted; when anything new is found the de-duplicated batch is handed
    to the window via GLib.idle_add and the accumulator is reset.
    """
    last_text = ''
    while True:
        found_any = False
        text = clipboard.wait_for_text()
        # Only parse text we have not already seen.
        if text is not None and text != last_text:
            batches = (
                iter_check(extract_urls(text, refang=True)),
                iter_check(extract_ips(text, refang=True)),
                iter_check(extract_emails(text, refang=True)),
                iter_check(extract_hashes(text)),
            )
            for batch in batches:
                if batch is not None:
                    iocs = iocs + list(batch)
                    found_any = True
            if found_any:
                # Hand a de-duplicated snapshot to the GTK main loop.
                GLib.idle_add(win.submit_iocs, list(set(iocs)))
                iocs = []
            last_text = text
        time.sleep(1)
def parse_indicators_from_description_string(self, description_string, title):
    """Extract and classify IOCs from a free-text indicator description.

    :param description_string: description text to mine for IOCs.
    :param title: report title, stored under iocs['title'].
    :returns: dict with keys title/domain/ip/email/hash/url/yara/other; every
        category list is de-duplicated.
    """
    # Fixed: the original dict literal listed 'hash' twice; the large block
    # of commented-out legacy code has been removed.
    iocs = {
        'title': title,
        'domain': [],
        'ip': [],
        'email': [],
        'hash': [],
        'url': [],
        'yara': [],
        'other': []
    }
    # Defanged ("on9") tokens mapped to their canonical forms.
    on9strings = {'[.]': '.', 'hxxp': 'http', '[@]': '@'}
    # Pattern for defanged domain / email / IP / URL fragments.
    raw_iocs = re.findall(
        r'[a-zA-Z0-9-\.]*\[\.?\@?\][a-zA-Z0-9-\.\[\.\@\]]*[-a-zA-Z0-9@:%_\+.~#?&//=]*',
        description_string)
    for i in range(len(raw_iocs)):
        # Refang the fragment.
        for on9string in on9strings:
            raw_iocs[i] = raw_iocs[i].replace(on9string, on9strings[on9string])
        # Classify it into email / url / domain buckets.
        if re.match(r'.*[@]+', raw_iocs[i]):
            iocs['email'].append(raw_iocs[i])
        elif re.match(r'.*[//].*', raw_iocs[i]):
            iocs['url'].append(raw_iocs[i])
        elif re.match(r'.*[a-zA-Z]', raw_iocs[i]):
            # Only accept fragments that look like a bare domain.
            if re.match(r"^([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}$", raw_iocs[i]):
                iocs['domain'].append(raw_iocs[i])
    # De-duplicate once, after the loop (the original rebuilt the set on
    # every append, which was quadratic).
    iocs['email'] = list(set(iocs['email']))
    iocs['url'] = list(set(iocs['url']))
    iocs['domain'] = list(set(iocs['domain']))
    # Extract hashes via the dedicated plugin.
    for hash_extracted in iocextract.extract_hashes(description_string):
        iocs['hash'].append(hash_extracted)
    iocs['hash'] = list(set(iocs['hash']))
    # Extract YARA rules.
    for yara_extracted in iocextract.extract_yara_rules(description_string):
        iocs['yara'].append(yara_extracted)
    iocs['yara'] = list(set(iocs['yara']))
    # Extract IPs (refanged), keeping only dotted-quad-looking values.
    for ip_extracted in iocextract.extract_ips(description_string, refang=True):
        if re.match(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", ip_extracted):
            iocs['ip'].append(ip_extracted)
    iocs['ip'] = list(set(iocs['ip']))
    return iocs
def process_element(self, content, reference_link, include_nonobfuscated=False):
    """Take a single source content/url and return a list of Artifacts.

    :param content: raw text to extract IOCs from
    :param reference_link: source URL attached to every artifact
    :param include_nonobfuscated: also keep URLs that were not defanged
    :returns: list of threatingestor.artifacts.* objects; always ends with a
        generic Task artifact for the reference link
    """
    # truncate content to a reasonable length for reference_text
    reference_text = content[:TRUNCATE_LENGTH] + (
        '...' if len(content) > TRUNCATE_LENGTH else '')
    artifact_list = []
    # collect URLs and domains
    scraped = iocextract.extract_urls(content)
    for url in scraped:
        # dump anything with ellipses, these get through the regex
        if u'\u2026' in url:
            continue
        artifact = threatingestor.artifacts.URL(
            url, self.name, reference_link=reference_link,
            reference_text=reference_text)
        # dump urls that appear to have the same domain as reference_url
        if artifact.domain() == urlparse(reference_link).netloc:
            continue
        if artifact.is_obfuscated() or include_nonobfuscated:
            # do URL collection
            artifact_list.append(artifact)
            # do domain collection in the same pass; domains are therefore
            # always a subset of the collected URLs
            if artifact.is_domain():
                artifact_list.append(
                    threatingestor.artifacts.Domain(
                        artifact.domain(), self.name,
                        reference_link=reference_link,
                        reference_text=reference_text))
    # collect IPs, skipping private/loopback/reserved and unparsable ones
    scraped = iocextract.extract_ips(content)
    for ip in scraped:
        artifact = threatingestor.artifacts.IPAddress(
            ip, self.name, reference_link=reference_link,
            reference_text=reference_text)
        try:
            ipaddress = artifact.ipaddress()
            if ipaddress.is_private or ipaddress.is_loopback or ipaddress.is_reserved:
                # don't care
                continue
        except ValueError:
            # invalid IP
            continue
        artifact_list.append(artifact)
    # collect yara rules
    scraped = iocextract.extract_yara_rules(content)
    for rule in scraped:
        artifact = threatingestor.artifacts.YARASignature(
            rule, self.name, reference_link=reference_link,
            reference_text=reference_text)
        artifact_list.append(artifact)
    # collect hashes
    scraped = iocextract.extract_hashes(content)
    for hash_ in scraped:
        artifact = threatingestor.artifacts.Hash(
            hash_, self.name, reference_link=reference_link,
            reference_text=reference_text)
        artifact_list.append(artifact)
    # generate generic task
    title = "Manual Task: {u}".format(u=reference_link)
    description = 'URL: {u}\nTask autogenerated by ThreatIngestor from source: {s}'.format(
        s=self.name, u=reference_link)
    artifact = threatingestor.artifacts.Task(
        title, self.name, reference_link=reference_link,
        reference_text=description)
    artifact_list.append(artifact)
    return artifact_list
def start(self):
    """Run the configured extraction and/or feed-ingestion workflow.

    Depending on which options are set on self (extract_file / extract_url /
    extract_all / domain / ip / hash / feed), this reads text from a file or
    a rendered web page, extracts IOCs with iocextract, saves new ones to
    the local database, and uploads them via self.uploadIOC(). When feed
    mode is enabled it also ingests the MalwareBazaar and CIRCL MISP feeds.
    """
    self.logging()
    # Extraction
    if self.extract_all is not None \
            or self.ip is not None \
            or self.hash is not None \
            or self.domain is not None \
            or self.extract_file is not None:
        self.logger.info(
            'Checking the type of extraction will be performed.')
        if self.extract_file is not None:
            self.logger.info(
                f'Obtaining IOC from file: {self.extract_file}')
            if os.path.exists(self.extract_file):
                # NOTE(review): openfile is never closed, and if the path is
                # missing all_text/title/file_name stay unbound, so the
                # extraction branches below would raise NameError — confirm.
                openfile = open(self.extract_file, 'r+')
                all_text = openfile.read()
                title = self.extract_file
                file_name = self.extract_file
            else:
                self.logger.error(
                    'The given directory or file was not found.')
        elif self.extract_url is not None:
            # Render the page with the shared webdriver and scrape its text.
            self.logger.info(
                f'Obtaining IOC from WebSite: {self.extract_url}')
            self.driver.get(self.extract_url)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            title = soup.find('title').get_text()
            all_text = self.select_all_text(soup=soup)
            file_name = self.extract_url
        if self.extract_all:
            # Generic pass: every IOC type iocextract recognises.
            self.driver.get(self.baseurl)
            count = 0
            for extract_iocs in iocextract.extract_iocs(all_text):
                # Skip URLs-with-paths and defanged emails ('[at]').
                if '/' not in extract_iocs \
                        and '[at]' not in extract_iocs:
                    # Only save/upload IOCs not already in the database.
                    if len(
                            self.database.compare_ioc(
                                IOC=extract_iocs.replace('[.]', '.'))) == 0:
                        self.database.save_ioc(IOC=extract_iocs.replace(
                            '[.]', '.'),
                            signature=title,
                            tags="Extract from URL",
                            font="Extract",
                            type="IOCS",
                            file_name=file_name)
                        self.uploadIOC(
                            comment=f'IOC extraction: {title}',
                            IOC=extract_iocs.replace('[.]', '.'),
                            count=count,
                            name=extract_iocs.replace('[.]', '.'))
                        count += 1
                    else:
                        self.logger.debug(
                            f'IOC already registered: {extract_iocs}')
        elif self.domain:
            # Domain-only pass (URL extractor, filtered to bare hosts).
            self.driver.get(self.baseurl)
            count = 0
            for extract_urls in iocextract.extract_urls(all_text):
                if '/' not in extract_urls \
                        and '[at]' not in extract_urls:
                    if len(
                            self.database.compare_ioc(
                                IOC=extract_urls.replace('[.]', '.'))) == 0:
                        self.database.save_ioc(IOC=extract_urls.replace(
                            '[.]', '.'),
                            signature=title,
                            tags="Extract from URL",
                            font="Extract",
                            type="Domain",
                            file_name=file_name)
                        self.uploadIOC(
                            comment=f'IOC extraction: {title}',
                            IOC=extract_urls.replace('[.]', '.'),
                            count=count,
                            name=extract_urls.replace('[.]', '.'))
                        count += 1
                    else:
                        self.logger.debug(
                            f'IOC already registered: {extract_urls}')
        elif self.ip:
            # IPv4-only pass.
            self.driver.get(self.baseurl)
            count = 0
            for extract_ipv4s in iocextract.extract_ipv4s(all_text):
                if '/' not in extract_ipv4s \
                        and '[at]' not in extract_ipv4s:
                    if len(
                            self.database.compare_ioc(
                                IOC=extract_ipv4s.replace('[.]', '.'))) == 0:
                        self.database.save_ioc(IOC=extract_ipv4s.replace(
                            '[.]', '.'),
                            signature=title,
                            tags="Extract from URL",
                            font="Extract",
                            type="ipv4",
                            file_name=file_name)
                        self.uploadIOC(
                            comment=f'IOC extraction: {title}',
                            IOC=extract_ipv4s.replace('[.]', '.'),
                            count=count,
                            name=extract_ipv4s.replace('[.]', '.'))
                        count += 1
                    else:
                        self.logger.debug(
                            f'IOC already registered: {extract_ipv4s}')
        elif self.hash:
            # Hash-only pass.
            self.logger.info('Getting only the Hashes from the site.')
            self.driver.get(self.baseurl)
            count = 0
            for extract_hashes in iocextract.extract_hashes(all_text):
                if '/' not in extract_hashes \
                        and '[at]' not in extract_hashes:
                    if len(
                            self.database.compare_ioc(
                                IOC=extract_hashes.replace('[.]', '.'))) == 0:
                        self.database.save_ioc(IOC=extract_hashes.replace(
                            '[.]', '.'),
                            signature=title,
                            tags="Extract from URL",
                            font="Extract",
                            type="Hash",
                            file_name=file_name)
                        self.uploadIOC(
                            comment=f'IOC extraction: {title}',
                            IOC=extract_hashes.replace('[.]', '.'),
                            count=count,
                            name=extract_hashes.replace('[.]', '.'))
                        count += 1
                    else:
                        self.logger.debug(
                            f'IOC already registered: {extract_hashes}')
    if self.feed is not None:
        # MalwareBaazar: ingest SHA-256 hashes from the Bazaar feed.
        count = 0
        for iocs in MalwareBaazar().start:
            if len(self.database.compare_ioc(
                    IOC=iocs['sha256_hash'])) == 0:
                comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format(
                    name=iocs['file_name'],
                    signature=iocs['signature'],
                    tags=iocs['tags'],
                    font='Bazaar')
                self.database.save_ioc(file_name=iocs['file_name'],
                                       IOC=iocs['sha256_hash'],
                                       signature=iocs['signature'],
                                       # Strip quotes/brackets from the tag-list repr.
                                       tags=str(iocs['tags']).replace("'",'') \
                                           .replace('[','') \
                                           .replace(']',''),
                                       font='Bazaar',
                                       type="Hash")
                self.uploadIOC(comment=comment,
                               IOC=iocs['sha256_hash'],
                               count=count,
                               name=iocs['file_name'])
                count += 1
            else:
                self.logger.debug(
                    f"IOC already registered: {iocs['sha256_hash']}")
        # Circl: ingest hash attributes from the CIRCL OSINT MISP feed.
        for feed in MISPFeed(
                url="https://www.circl.lu/doc/misp/feed-osint/").start:
            request = requests.get(feed, headers={
                'User-Agent': 'Mozilla/5.0'
            }).json()
            count = 0
            for iocs in request['Event']['Attribute']:
                if iocs['category'] == 'Payload delivery':
                    # NOTE(review): `and` binds tighter than `or`, so this is
                    # ('.' not in value and len == 32) or (len == 64) — a
                    # value with a dot but length 64 passes. Likely meant
                    # '.' not in value and (len == 32 or len == 64); confirm.
                    if '.' not in iocs['value'] \
                            and len(iocs['value']) == 32 \
                            or len(iocs['value']) == 64:
                        if len(self.database.compare_ioc(
                                IOC=iocs['value'])) == 0:
                            comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format(
                                name=iocs['comment'].split(' ')[0],
                                signature=iocs['category'],
                                tags=iocs['category'],
                                font="Circl")
                            self.database.save_ioc(
                                file_name=iocs['comment'].split(' ')[0],
                                IOC=iocs['value'],
                                signature=iocs['category'],
                                tags=iocs['category'],
                                font="Circl",
                                type="Hash")
                            self.uploadIOC(
                                comment=comment,
                                IOC=iocs['value'],
                                count=count,
                                name=iocs['comment'].split(' ')[0])
                            count += 1
                        else:
                            self.logger.debug(
                                f"IOC already registered: {iocs['value']}")
                elif iocs['category'] == 'External analysis':
                    if 'virustotal' in iocs['value']:
                        # Pull the hash segment out of the VirusTotal URL.
                        # NOTE(review): `hash` shadows the builtin here.
                        hash = iocs['value'].split('/')[4]
                        if len(self.database.compare_ioc(IOC=hash)) == 0:
                            comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format(
                                name=iocs['comment'].split(' ')[0],
                                signature=iocs['category'],
                                tags=iocs['category'],
                                font="Circl")
                            self.database.save_ioc(
                                file_name=iocs['comment'].split(' ')[0],
                                IOC=hash,
                                signature=iocs['category'],
                                tags=iocs['category'],
                                font="Circl",
                                type="Hash")
                            self.uploadIOC(
                                comment=comment,
                                IOC=iocs['value'],
                                count=count,
                                name=iocs['comment'].split(' ')[0])
                            count += 1
                        else:
                            self.logger.debug(
                                f"IOC already registered: {iocs['value']}")
                elif iocs['category'] == 'Artifacts dropped':
                    hash = iocs['value']
                    if len(self.database.compare_ioc(IOC=hash)) == 0:
                        comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format(
                            name=iocs['comment'].split(' ')[0],
                            signature=iocs['category'],
                            tags=iocs['category'],
                            font="Circl")
                        self.database.save_ioc(
                            file_name=iocs['comment'].split(' ')[0],
                            IOC=hash,
                            signature=iocs['category'],
                            tags=iocs['category'],
                            font="Circl",
                            type="Hash")
                        self.uploadIOC(comment=comment,
                                       IOC=iocs['value'],
                                       count=count,
                                       name=iocs['comment'].split(' ')[0])
                        count += 1
                    else:
                        self.logger.debug(
                            f'IOC already registered: {hash}')
def get_hash(content):
    """Return a list of every hash iocextract finds in *content*.

    Bug fix: the original ended with ``return array_hashes.append(hash)``,
    which always returns None (list.append returns None) and referenced the
    ``hash`` builtin instead of any collected value. It now returns the
    accumulated list itself.
    """
    array_hashes = []
    for hash_value in iocextract.extract_hashes(content):
        array_hashes.append(hash_value)
    return array_hashes
def _parse_indicators_from_stix_description(self, xml_content):
    """Parse a STIX package and extract IOCs from its first indicator.

    :param xml_content: path / file-like / XML accepted by STIXPackage.from_xml.
    :returns: dict with keys title/domain/ip/email/hash/url/yara/other; the
        category lists are de-duplicated, title holds a one-element list.
    """
    # Fixed: the original dict literal listed 'hash' twice.
    iocs = {
        'title': '',
        'domain': [],
        'ip': [],
        'email': [],
        'hash': [],
        'url': [],
        'yara': [],
        'other': []
    }
    # Defanged ("on9") tokens mapped to their canonical forms.
    on9strings = {'[.]': '.', 'hxxp': 'http', '[@]': '@'}
    # Parse the input and flatten the package to a plain dict.
    stix_package = STIXPackage.from_xml(xml_content)
    stix_dict = stix_package.to_dict()
    # Description/title of the first indicator (indicator files only).
    description = stix_dict["indicators"][0]["description"]
    title = stix_dict["indicators"][0]["title"]
    iocs['title'] = [title]
    # Pattern for defanged domain / email / IP / URL fragments.
    raw_iocs = re.findall(
        r'[a-zA-Z0-9-\.]*\[\.?\@?\][a-zA-Z0-9-\.\[\.\@\]]*[-a-zA-Z0-9@:%_\+.~#?&//=]*',
        description)
    for i in range(len(raw_iocs)):
        # Refang the fragment.
        for on9string in on9strings:
            raw_iocs[i] = raw_iocs[i].replace(on9string, on9strings[on9string])
        # Classify it into email / url / domain buckets.
        if re.match(r'.*[@]+', raw_iocs[i]):
            iocs['email'].append(raw_iocs[i])
        elif re.match(r'.*[//].*', raw_iocs[i]):
            iocs['url'].append(raw_iocs[i])
        elif re.match(r'.*[a-zA-Z]', raw_iocs[i]):
            # Only accept fragments that look like a bare domain.
            if re.match(r"^([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}$", raw_iocs[i]):
                iocs['domain'].append(raw_iocs[i])
    # De-duplicate once, after the loop (the original rebuilt the set on
    # every append, which was quadratic).
    iocs['email'] = list(set(iocs['email']))
    iocs['url'] = list(set(iocs['url']))
    iocs['domain'] = list(set(iocs['domain']))
    # Extract hashes via the dedicated plugin.
    for hash_extracted in iocextract.extract_hashes(description):
        iocs['hash'].append(hash_extracted)
    iocs['hash'] = list(set(iocs['hash']))
    # Extract YARA rules.
    for yara_extracted in iocextract.extract_yara_rules(description):
        iocs['yara'].append(yara_extracted)
    iocs['yara'] = list(set(iocs['yara']))
    # Extract IPs (refanged), keeping only dotted-quad-looking values.
    for ip_extracted in iocextract.extract_ips(description, refang=True):
        if re.match(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", ip_extracted):
            iocs['ip'].append(ip_extracted)
    iocs['ip'] = list(set(iocs['ip']))
    return iocs
help="File from where to extract the hashes", required=True) args = parser.parse_args() url = "https://www.virustotal.com/api/v3/files/" hash_list = [] with open(args.file, 'r') as f: # for line in f: # hash_list.append(line.split(":")[1]) string = f.read() for hash_extracted in iocextract.extract_hashes(string): hash_list.append(hash_extracted) hash_list = list(dict.fromkeys(hash_list)) print("list size: " + str(len(hash_list))) for hash in hash_list: print("hash: " + hash) headers = {"x-apikey": args.apikey} while True: response = requests.get(url + hash, headers=headers) if response.status_code == 200 or response.status_code == 404: break
def process_element(self, content: str, reference_link: str, include_nonobfuscated: bool = False) -> list:
    """Take a single source content/url and return a list of Artifacts.

    This is the main work block of Source plugins, which handles IOC
    extraction and artifact creation.

    :param content: String content to extract from.
    :param reference_link: Reference link to attach to all artifacts.
    :param include_nonobfuscated: Include non-defanged URLs in output?
    :returns: List of threatingestor.artifacts.* objects; always ends with a
        generic Task artifact for the reference link.
    """
    logger.debug(f"Processing in source '{self.name}'")
    # Truncate content to a reasonable length for reference_text.
    reference_text = content[:TRUNCATE_LENGTH] + (
        '...' if len(content) > TRUNCATE_LENGTH else '')
    # Initialize an empty list and a map of counters to track each artifact type.
    artifact_list = []
    artifact_type_count = {
        'domain': 0,
        'hash': 0,
        'ipaddress': 0,
        'task': 0,
        'url': 0,
        'yarasignature': 0,
    }
    # Collect URLs and domains.
    scraped = itertools.chain(
        iocextract.extract_unencoded_urls(content),
        # Decode encoded URLs, since we can't operate on encoded ones.
        iocextract.extract_encoded_urls(content, refang=True),
    )
    for url in scraped:
        # Dump anything with ellipses, these get through the regex.
        if u'\u2026' in url:
            continue
        artifact = threatingestor.artifacts.URL(
            url, self.name, reference_link=reference_link,
            reference_text=reference_text)
        # Dump URLs that appear to have the same domain as reference_url.
        try:
            if artifact.domain() == urlparse(reference_link).netloc:
                continue
        except ValueError:
            # Error parsing reference_link as a URL. Ignoring.
            pass
        if artifact.is_obfuscated() or include_nonobfuscated:
            # Do URL collection.
            artifact_list.append(artifact)
            artifact_type_count['url'] += 1
            # Do domain collection in the same pass.
            # Note: domains will always be a subset of URLs. There is no
            # domain extraction.
            if artifact.is_domain():
                artifact_list.append(
                    threatingestor.artifacts.Domain(
                        artifact.domain(), self.name,
                        reference_link=reference_link,
                        reference_text=reference_text))
                artifact_type_count['domain'] += 1
    # Collect IPs.
    scraped = iocextract.extract_ips(content)
    for ip in scraped:
        artifact = threatingestor.artifacts.IPAddress(
            ip, self.name, reference_link=reference_link,
            reference_text=reference_text)
        try:
            ipaddress = artifact.ipaddress()
            if ipaddress.is_private or ipaddress.is_loopback or ipaddress.is_reserved:
                # Skip private, loopback, reserved IPs.
                continue
        except ValueError:
            # Skip invalid IPs.
            continue
        artifact_list.append(artifact)
        artifact_type_count['ipaddress'] += 1
    # Collect YARA rules.
    scraped = iocextract.extract_yara_rules(content)
    for rule in scraped:
        artifact = threatingestor.artifacts.YARASignature(
            rule, self.name, reference_link=reference_link,
            reference_text=reference_text)
        artifact_list.append(artifact)
        artifact_type_count['yarasignature'] += 1
    # Collect hashes.
    scraped = iocextract.extract_hashes(content)
    for hash_ in scraped:
        artifact = threatingestor.artifacts.Hash(
            hash_, self.name, reference_link=reference_link,
            reference_text=reference_text)
        artifact_list.append(artifact)
        artifact_type_count['hash'] += 1
    # Generate generic task.
    title = f"Manual Task: {reference_link}"
    description = f"URL: {reference_link}\nTask autogenerated by ThreatIngestor from source: {self.name}"
    artifact = threatingestor.artifacts.Task(
        title, self.name, reference_link=reference_link,
        reference_text=description)
    artifact_list.append(artifact)
    artifact_type_count['task'] += 1
    logger.debug(f"Found {len(artifact_list)} total artifacts")
    logger.debug(f"Type breakdown: {artifact_type_count}")
    return artifact_list