def extract_iocs():
    """Prompt for a file, print its defanged IOCs and save them to disk.

    Reads the path from standard input, runs ``iocextract.extract_iocs``
    over the file contents, prints every indicator in defanged form and
    finally hands the newline-joined list to ``Files.mk_file`` under the
    name ``extract_iocs.txt``.

    Raises:
        OSError: if the given path cannot be opened for reading.
    """
    file = input("File you want to extract IOCs from(full file path): ")
    with open(file, "r") as handle:
        text = handle.read()

    print(f"{green}\nIOCs extracted:\n{reset}")

    iocs = []
    for indicator in iocextract.extract_iocs(text):
        # Defang once and reuse the result for both the report and the
        # console output.
        defanged = iocextract.defang(indicator)
        iocs.append(defanged)
        print(f"{red}{defanged}{reset}")

    Files.mk_file("extract_iocs.txt", "\n".join(iocs))
def test_corpus_results(self):
    """Check ``extract_iocs`` against the expected/unexpected corpora.

    Every line of ``test_data/valid.txt`` must be extracted from
    ``test_data/input.txt`` and no line of ``test_data/invalid.txt`` may
    be.
    """
    def read_text(path):
        # Context manager so the handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(path, 'r') as handle:
            return handle.read()

    in_data = read_text('test_data/input.txt')
    valid_results = read_text('test_data/valid.txt').splitlines()
    invalid_results = read_text('test_data/invalid.txt').splitlines()

    out_data = list(iocextract.extract_iocs(in_data))

    for expected in valid_results:
        self.assertIn(expected, out_data)

    for unexpected in invalid_results:
        self.assertNotIn(unexpected, out_data)
def test_hash_extract(self):
    """Both hash-aware extractors find the four digests in the fixture."""
    content = """ 68b329da9893e34099c7d8ad5cb9c940 adc83b19e793491b1c6ea0fd8b46cd9f32e592fc 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09 """
    first_digest = '68b329da9893e34099c7d8ad5cb9c940'

    # The dedicated hash extractor is checked first, then the catch-all
    # extractor; both must report the same count and leading digest.
    for extractor in (iocextract.extract_hashes, iocextract.extract_iocs):
        found = list(extractor(content))
        self.assertEqual(len(found), 4)
        self.assertEqual(found[0], first_digest)
def test_yara_included_in_iocs(self):
    """A bare YARA rule is the first item returned by ``extract_iocs``."""
    rule_text = 'rule testRule { condition: true }'
    extracted = list(iocextract.extract_iocs(rule_text))
    self.assertEqual(extracted[0], rule_text)
def test_ipv6_included_in_iocs(self):
    """A full IPv6 address is the first item returned by ``extract_iocs``."""
    address = '2001:0db8:85a3:0000:0000:8a2e:0370:7334'
    extracted = list(iocextract.extract_iocs(address))
    self.assertEqual(extracted[0], address)
def test_url_included_in_iocs(self):
    """An http URL is the first item returned by ``extract_iocs``."""
    url = 'http://domain.com/test'
    extracted = list(iocextract.extract_iocs(url))
    self.assertEqual(extracted[0], url)
def test_ipv4_included_in_iocs(self):
    """A dotted-quad IPv4 address is the first item from ``extract_iocs``."""
    address = '127.0.0.1'
    extracted = list(iocextract.extract_iocs(address))
    self.assertEqual(extracted[0], address)
def test_email_included_in_iocs(self):
    """The e-mail fixture is the first item returned by ``extract_iocs``."""
    # NOTE(review): the fixture below looks like a redacted address rather
    # than a real one -- confirm it against the upstream test before
    # relying on this test's result.
    address = '*****@*****.**'
    extracted = list(iocextract.extract_iocs(address))
    self.assertEqual(extracted[0], address)
# Print the unique IOCs found in the 100 most recent tweets of one user.
import re

import iocextract
import tweepy

# Twitter API credentials -- fill these in before running.
consumer_key = ""
consumer_secret = ""
access_token = ""
access_token_secret = ""
# 'extended' makes the API return the untruncated tweet as ``full_text``.
tweet_mode = 'extended'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# NOTE: ``id`` shadows the builtin; the name is kept so anything that
# imports this module keeps working.
id = "user_id_here"
user = api.user_timeline(id, count=100, tweet_mode=tweet_mode)

# Negative lookahead: matches only values that do NOT start with a t.co
# short link or the loopback address. Compiled once, outside the loops.
_KEEP_IOC = re.compile(r"(?!https:\/\/t\.co|127\.0\.0\.1)")

iocList = []
for tweet in user:
    # Set refang=True to get refanged ("unsafe") indicators instead.
    for ioc in iocextract.extract_iocs(str(tweet.full_text), refang=False):
        if _KEEP_IOC.match(str(ioc)):
            iocList.append(ioc)

stripSpace = [x.strip(' ') for x in iocList]
# dict.fromkeys de-duplicates while keeping first-seen order, so the
# output is stable between runs (a set has no guaranteed order).
strippedIOC = list(dict.fromkeys(stripSpace))

for ioc in strippedIOC:
    print(ioc)
def start(self):
    """Run the configured IOC extraction and/or feed import.

    Extraction: when any of ``extract_all``/``ip``/``hash``/``domain``/
    ``extract_file`` is set, text is loaded from ``self.extract_file`` or,
    failing that, from the page at ``self.extract_url``. The first truthy
    mode among ``extract_all``, ``domain``, ``ip`` and ``hash`` selects
    which ``iocextract`` extractor is applied; every indicator not yet in
    ``self.database`` is saved and passed to ``self.uploadIOC``.

    Feeds: when ``self.feed`` is set, SHA-256 hashes from
    ``MalwareBaazar().start`` and hash attributes from the Circl MISP
    OSINT feed are registered the same way.

    If the source file does not exist, or no source was given at all, an
    error is logged and the extraction step is skipped instead of failing
    later on an unbound variable.
    """
    self.logging()

    # Extraction
    if self.extract_all is not None \
            or self.ip is not None \
            or self.hash is not None \
            or self.domain is not None \
            or self.extract_file is not None:
        self.logger.info(
            'Checking the type of extraction will be performed.')

        # Stay None when no text could be loaded, so the step below can
        # be skipped cleanly.
        all_text = None
        title = None
        file_name = None

        if self.extract_file is not None:
            self.logger.info(
                f'Obtaining IOC from file: {self.extract_file}')
            if os.path.exists(self.extract_file):
                # Read-only access is enough; the context manager closes
                # the handle.
                with open(self.extract_file, 'r') as openfile:
                    all_text = openfile.read()
                title = self.extract_file
                file_name = self.extract_file
            else:
                self.logger.error(
                    'The given directory or file was not found.')
        elif self.extract_url is not None:
            self.logger.info(
                f'Obtaining IOC from WebSite: {self.extract_url}')
            self.driver.get(self.extract_url)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            title = soup.find('title').get_text()
            all_text = self.select_all_text(soup=soup)
            file_name = self.extract_url

        if all_text is None:
            requested = (self.extract_all or self.domain or self.ip
                         or self.hash)
            # A missing file was already reported above; only report the
            # case where a mode was requested without any source.
            if requested and self.extract_file is None \
                    and self.extract_url is None:
                self.logger.error(
                    'No file or URL was given to extract IOCs from.')
        elif self.extract_all:
            self._register_extracted(iocextract.extract_iocs, all_text,
                                     "IOCS", title, file_name)
        elif self.domain:
            self._register_extracted(iocextract.extract_urls, all_text,
                                     "Domain", title, file_name)
        elif self.ip:
            self._register_extracted(iocextract.extract_ipv4s, all_text,
                                     "ipv4", title, file_name)
        elif self.hash:
            self.logger.info('Getting only the Hashes from the site.')
            self._register_extracted(iocextract.extract_hashes, all_text,
                                     "Hash", title, file_name)

    if self.feed is not None:
        # MalwareBaazar
        count = 0
        for iocs in MalwareBaazar().start:
            if len(self.database.compare_ioc(
                    IOC=iocs['sha256_hash'])) == 0:
                comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format(
                    name=iocs['file_name'],
                    signature=iocs['signature'],
                    tags=iocs['tags'],
                    font='Bazaar')
                self.database.save_ioc(
                    file_name=iocs['file_name'],
                    IOC=iocs['sha256_hash'],
                    signature=iocs['signature'],
                    # Store the tag list as plain comma-separated text.
                    tags=str(iocs['tags']).replace("'", '')
                    .replace('[', '')
                    .replace(']', ''),
                    font='Bazaar',
                    type="Hash")
                self.uploadIOC(comment=comment,
                               IOC=iocs['sha256_hash'],
                               count=count,
                               name=iocs['file_name'])
                count += 1
            else:
                self.logger.debug(
                    f"IOC already registered: {iocs['sha256_hash']}")

        # Circl
        for feed in MISPFeed(
                url="https://www.circl.lu/doc/misp/feed-osint/").start:
            request = requests.get(feed, headers={
                'User-Agent': 'Mozilla/5.0'
            }).json()
            count = 0
            for iocs in request['Event']['Attribute']:
                value = iocs['value']
                if iocs['category'] == 'Payload delivery':
                    # Only dot-free values of length 32 or 64. The
                    # original `a and b or c` form let any 64-character
                    # value through, even one containing a dot.
                    if '.' not in value and len(value) in (32, 64):
                        count = self._register_circl_attribute(
                            iocs, value, count)
                elif iocs['category'] == 'External analysis':
                    if 'virustotal' in value:
                        # Fifth '/'-separated URL part is stored as the
                        # hash.
                        count = self._register_circl_attribute(
                            iocs, value.split('/')[4], count)
                elif iocs['category'] == 'Artifacts dropped':
                    count = self._register_circl_attribute(
                        iocs, value, count)


def _register_extracted(self, extractor, all_text, ioc_type, title,
                        file_name):
    """Save and upload every new indicator ``extractor`` yields for the text.

    Indicators containing ``/`` or ``[at]`` are skipped; ``[.]`` is turned
    back into ``.`` before the value is compared, stored and uploaded.
    """
    self.driver.get(self.baseurl)
    count = 0
    for candidate in extractor(all_text):
        if '/' in candidate or '[at]' in candidate:
            continue
        ioc = candidate.replace('[.]', '.')
        if len(self.database.compare_ioc(IOC=ioc)) == 0:
            self.database.save_ioc(IOC=ioc,
                                   signature=title,
                                   tags="Extract from URL",
                                   font="Extract",
                                   type=ioc_type,
                                   file_name=file_name)
            self.uploadIOC(comment=f'IOC extraction: {title}',
                           IOC=ioc,
                           count=count,
                           name=ioc)
            count += 1
        else:
            self.logger.debug(f'IOC already registered: {candidate}')


def _register_circl_attribute(self, attribute, file_hash, count):
    """Register one Circl attribute under ``file_hash``; return the new count.

    ``file_hash`` is the value compared against and stored in the
    database. The upload keeps using the attribute's raw ``value``, as
    the original code did.
    """
    if len(self.database.compare_ioc(IOC=file_hash)) == 0:
        name = attribute['comment'].split(' ')[0]
        comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format(
            name=name,
            signature=attribute['category'],
            tags=attribute['category'],
            font="Circl")
        self.database.save_ioc(file_name=name,
                               IOC=file_hash,
                               signature=attribute['category'],
                               tags=attribute['category'],
                               font="Circl",
                               type="Hash")
        # NOTE(review): for 'External analysis' attributes the raw value
        # is the VirusTotal URL, not the hash saved above -- confirm that
        # uploading the URL is intended.
        self.uploadIOC(comment=comment,
                       IOC=attribute['value'],
                       count=count,
                       name=name)
        return count + 1

    self.logger.debug(f"IOC already registered: {attribute['value']}")
    return count
def dump_cti_experts(api, base_dir, test_run=False):
    """Rank Twitter lists and their members by the defanged IOCs they post.

    Every stage caches its output below ``base_dir`` and reuses files that
    already exist:

    1. For each user from ``get_current_user()``, dump the lists returned
       by ``get_user_lists`` to ``users/<user_id>.user.csv``.
    2. Give every list sub-scores (security keywords in name/description,
       member score, subscriber count, owner strength) and let
       ``select_top_lists`` pick the top lists from them.
    3. Dump up to 1000 timeline tweets per top list into ``lists/``.
    4. Count the IOCs that appear only in defanged form in those tweets
       and scale each list's score by an IOC-uniqueness factor; write
       ``lists/list_ioc_rank``.
    5. Collect list members, sum the scores of the lists each belongs to
       and write ``top_users``.
    6. For those users, fetch (or load cached) timeline tweets, weight the
       defanged IOCs by age in days and write ``top_users_final``.

    Args:
        api: client object handed to the ``get_*`` helpers (presumably a
            tweepy API instance -- confirm against the caller).
        base_dir: directory holding the ``users`` and ``lists`` caches
            and the output files.
        test_run: when true, every loop stops after a handful of items.

    Returns:
        A list of ``(screen_name, id)`` tuples ordered by final score,
        best first.
    """
    top_users_res = []
    user_dir = os.path.join(base_dir, 'users')
    # Joined component-wise: a literal backslash is only a path separator
    # on Windows and would create a single oddly named directory elsewhere.
    user_status_dir = os.path.join(base_dir, 'users', 'status')
    list_dir = os.path.join(base_dir, 'lists')
    os.makedirs(user_status_dir, exist_ok=True)  # also creates user_dir
    os.makedirs(list_dir, exist_ok=True)

    # For each CTI expert (in the input list), dump the info of all the
    # lists that the expert is a member of into a file.
    for user, user_id in get_current_user():
        try:
            user_lists_dump_path = os.path.join(user_dir,
                                                user_id + '.user.csv')
            if not os.path.exists(user_lists_dump_path):
                lists = get_user_lists(api, user)
                dump_user_lists(user, lists, user_lists_dump_path)
                time.sleep(2)
            if test_run:
                break
        except Exception as exp:
            # Not every exception carries ``reason``; fall back to the
            # exception itself rather than failing inside the handler.
            print('ERROR {}:{}'.format(user, getattr(exp, 'reason', exp)))

    specific_words = [
        'ioc', 'malware', 'Indicator.?of.?Compromise', 'threat.?hunt',
        'threat.?hunt', 'phishing.?hunt', 'phish.?hunt', 'threat.?int',
        'threat.?research', 'ransomware', 'mal.?doc'
    ]
    generic_words = ['info.?sec', 'cyber.?sec', 'security', 'ransomware']
    specific_regex_rule = re.compile('|'.join(specific_words), re.IGNORECASE)
    generic_regex_rule = re.compile('|'.join(generic_words), re.IGNORECASE)

    # sub_scores: number of relevant words, number_follower/log(number_followers),
    # number_subscriber, owner_strength.
    # The score is a product of the above sub scores; each sub score must be
    # in the range [0, +infinity) with an average of 1, so sub scores above
    # average increase the total score.
    all_lists = {}
    total_sec_word_count = 0
    total_member_score = 0
    total_subscriber_count = 0
    total_owner_strength = 0
    for csv_path in glob.glob(os.path.join(user_dir, "*.user.csv")):
        with open(csv_path, 'r', encoding='utf_8') as input_file:
            reader = csv.reader(input_file)
            # Skip the first row (presumably a header written by
            # dump_user_lists); the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            for counter, row in enumerate(reader, start=1):
                list_key = row[0]
                if list_key not in all_lists:
                    entry = {}
                    all_lists[list_key] = entry
                    entry['id'] = row[0]
                    entry['name'] = row[1]
                    entry['text'] = row[1] + ' ' + row[3]
                    # Specific CTI terms weigh three times as much as
                    # generic security terms.
                    entry['sec_word_count'] = (
                        len(specific_regex_rule.findall(entry['text'])) * 3
                        + len(generic_regex_rule.findall(entry['text'])))
                    total_sec_word_count += entry['sec_word_count']

                    entry['member_count'] = int(row[4])
                    if entry['member_count'] > 1:
                        entry['member_score'] = (
                            entry['member_count']
                            / math.log2(entry['member_count']))
                    else:
                        entry['member_score'] = 0
                    total_member_score += entry['member_score']

                    # +1 so a list without subscribers still has a
                    # non-zero factor.
                    entry['subscriber_count'] = int(row[5]) + 1
                    total_subscriber_count += entry['subscriber_count']

                    entry['owner_screen_name'] = row[9]
                    entry['owner_followers_count'] = int(row[12])
                    entry['owner_friends_count'] = int(row[13])
                    if entry['owner_friends_count'] >= 1:
                        entry['owner_strength'] = math.log2(
                            (entry['owner_followers_count']
                             + entry['owner_friends_count'])
                            / entry['owner_friends_count'])
                    else:
                        entry['owner_strength'] = 0
                    total_owner_strength += entry['owner_strength']
                if test_run and counter > 10:
                    break

    # ``or 1`` avoids ZeroDivisionError when no list was loaded; all
    # averages are then 0.
    list_total = len(all_lists) or 1
    avg_sec_word_count = total_sec_word_count / list_total
    avg_member_score = total_member_score / list_total
    avg_subscriber_count = total_subscriber_count / list_total
    avg_owner_strength = total_owner_strength / list_total

    top_lists = select_top_lists(all_lists, avg_sec_word_count,
                                 avg_member_score, avg_subscriber_count,
                                 avg_owner_strength)

    # Dump the latest 1000 timeline tweets of each top list.
    counter = 0
    for top_list in top_lists:
        try:
            print(top_list[0] + '\t' + top_list[1]['owner_screen_name']
                  + '\t\t' + top_list[1]['name'])
            file_name = (top_list[0] + '---'
                         + top_list[1]['owner_screen_name'] + '---'
                         + top_list[1]['name'].replace('/', '-')
                         + '.dump.list.csv')
            list_tweets_file_path = os.path.join(list_dir, file_name)
            if not os.path.exists(list_tweets_file_path):
                tweets = get_list_timeline(api, top_list[0], 1000)
                dump_list_tweets(top_list[0], tweets, list_tweets_file_path)
            counter += 1
            if test_run:
                if counter > 10:
                    break
            else:
                if counter > 150:
                    break
        except Exception as exp:
            print('ERROR processing tweets of ' + str(top_list[0]))
            print(exp)

    # For each list, count the number of IoCs that appeared in the dump of
    # the latest 1000 timeline tweets.
    top_lists_iocs = {}
    ioc_global_freq = {}
    for dump_path in glob.glob(os.path.join(list_dir, "*.dump.list.csv")):
        name = os.path.basename(dump_path)
        print('processing ' + name)
        dump_list_id = name.split('---')[0]
        list_iocs = top_lists_iocs.setdefault(dump_list_id, set())
        with open(dump_path, 'r', encoding='utf_8') as input_file:
            for line in input_file:
                try:
                    tweet = json.loads(line)
                    for ioc in iocextract.extract_iocs(tweet['text'],
                                                       refang=True):
                        # A refanged value missing from the raw text means
                        # the tweet carried it in defanged form; only those
                        # are counted.
                        if ioc not in tweet['text']:
                            list_iocs.add(ioc)
                            ioc_global_freq[ioc] = \
                                ioc_global_freq.get(ioc, 0) + 1
                except Exception as exp:
                    print('ERROR processing ' + name + ' tweet: ' + line)

    # Calculate the uniqueness score for each of the lists: rarer IOCs
    # contribute more.
    list_ranking = {}
    average_score = 0
    for ranked_id, iocs in top_lists_iocs.items():
        total_score = 0
        for ioc in iocs:
            ioc_count = ioc_global_freq[ioc] + 1
            total_score += 1 / ioc_count
        list_ranking[ranked_id] = total_score
        average_score += total_score
    if list_ranking:
        average_score = average_score / len(list_ranking)

    list_rank_ioc = []
    for top_list in top_lists:
        if top_list[0] in list_ranking:
            if average_score:
                uniqueness = list_ranking[top_list[0]] / average_score
            else:
                # No list produced any IOC: use a neutral factor so the
                # existing score order is kept instead of dividing by zero.
                uniqueness = 1.0
            top_list[1]['ioc_uniqness'] = uniqueness
            top_list[1]['score'] *= uniqueness
            list_rank_ioc.append(top_list[1])

    ranked_list = sorted(list_rank_ioc,
                         key=lambda x: x['score'],
                         reverse=True)
    with open(os.path.join(list_dir, 'list_ioc_rank'), 'w',
              encoding='utf_8') as rank_output:
        for ranked in ranked_list:
            rank_output.write('{},{},{},{}\n'.format(
                ranked['id'], ranked['owner_screen_name'], ranked['name'],
                ranked['score']))

    # Sum, per member, the scores of all ranked lists the member is on.
    member_scores = {}
    for ranked in ranked_list:
        try:
            file_name = (ranked['id'] + '---' + ranked['owner_screen_name']
                         + '---' + ranked['name'].replace('/', '-')
                         + '.members.list.csv')
            print('Getting members of ' + ranked['id'])
            list_members_file_path = os.path.join(list_dir, file_name)
            if not os.path.exists(list_members_file_path):
                members = get_list_members(api, ranked['id'])
                print('List members count ' + str(len(members)))
                dump_list_users(ranked['id'], members,
                                list_members_file_path)
            else:
                # Rebuild lightweight member objects from the cached dump.
                members = []
                with open(list_members_file_path, 'r') as member_file:
                    for line in member_file:
                        member = Dummy()
                        member_json_obj = json.loads(line)
                        member.screen_name = member_json_obj['screen_name']
                        member.id = member_json_obj['id']
                        members.append(member)
            for member in members:
                if member.id not in member_scores:
                    member_scores[member.id] = {
                        'score': 0,
                        'screen_name': member.screen_name,
                        'lists': set()
                    }
                member_scores[member.id]['lists'].add(ranked['id'])
                member_scores[member.id]['score'] += ranked['score']
            print('All members count ' + str(len(member_scores)))
        except Exception as exp:
            # Report the list being processed, not a leftover loop
            # variable from the ranking step.
            print('ERROR getting members ' + ranked['id'])

    member_ranks = sorted(member_scores.items(),
                          key=lambda x: x[1]['score'],
                          reverse=True)
    top_users_path = os.path.join(base_dir, 'top_users')
    with open(top_users_path, 'w', encoding='utf_8',
              newline='') as top_users_output:
        writer = csv.writer(top_users_output)
        writer.writerow(['id', 'screen_name', 'score', 'lists'])
        for member in member_ranks:
            writer.writerow([
                member[0], member[1]['screen_name'], member[1]['score'],
                member[1]['lists']
            ])
    member_ranks = member_ranks[:1000]
    print("Top 1k users before considering users' tweeting history")
    print(member_ranks)

    # For each user in the top_users file, collect the defanged IOCs they
    # tweeted, grouped by age in days.
    count = 0
    user_iocs = {}
    with open(top_users_path, 'r', encoding='utf_8',
              newline='') as top_users_input:
        csv_reader = csv.reader(top_users_input)
        next(csv_reader, None)  # header row written just above
        for row in csv_reader:
            user_id, screen_name, score = row[0], row[1], float(row[2])
            try:
                print(str(count) + " - Getting tweets of " + screen_name)
                user_tweets_file_path = os.path.join(
                    user_status_dir,
                    '{}_{}_tweets.csv'.format(user_id, screen_name))
                time_now = datetime.datetime.now()
                if screen_name not in user_iocs:
                    user_iocs[screen_name] = {
                        'id': user_id,
                        'screen_name': screen_name,
                        'score': score,
                        'days': {}
                    }
                # If we do not have the tweet history of the user, collect
                # the latest 400 of their timeline tweets.
                if not os.path.exists(user_tweets_file_path):
                    all_tweets = get_user_timeline(api, screen_name, 400)
                    # One JSON document per line, no header.
                    with open(user_tweets_file_path, 'w',
                              encoding='utf_8') as output_file:
                        for status in all_tweets:
                            output_file.write(
                                json.dumps(status._json) + '\n')
                else:
                    all_tweets = []
                    with open(user_tweets_file_path, 'r',
                              encoding='utf_8') as input_file:
                        # The writer above emits no header, so every line
                        # is read; a non-JSON line is reported and skipped.
                        for line in input_file:
                            try:
                                all_tweets.append(json.loads(line))
                            except Exception as exp:
                                print("Error loading tweets in "
                                      + user_tweets_file_path)
                for tweet in all_tweets:
                    # Freshly fetched tweets expose attributes; cached
                    # tweets are plain dicts.
                    if hasattr(tweet, 'text'):
                        text = tweet.text
                    else:
                        text = tweet['text']
                    if hasattr(tweet, 'created_at'):
                        created_at = tweet.created_at
                    else:
                        created_at = parse(tweet['created_at'])
                        created_at = created_at.replace(tzinfo=None)
                    for ioc in iocextract.extract_iocs(text, refang=True):
                        if ioc not in text:
                            day_diff = max((time_now - created_at).days, 0)
                            user_iocs[screen_name]['days'].setdefault(
                                day_diff, set()).add(ioc)
                count += 1
            except Exception as exp:
                print('Error getting statuses of ' + screen_name)
            if test_run:
                if count > 20:
                    break
            else:
                if count > 5000:
                    break
            if count % 50 == 0:
                print("\n\n\n\ncurrent number " + str(count) + '\n\n\n\n')

    # IOC score: IOCs per day, discounted by the cube root of their age,
    # plus 1 so users without IOCs keep a non-zero factor.
    avg_ioc_score = 0
    for details in user_iocs.values():
        ioc_score = 0
        for day, iocs in details['days'].items():
            ioc_score += len(iocs) / ((int(day) + 1)**(1 / 3))
        ioc_score += 1
        details['ioc_score'] = ioc_score
        avg_ioc_score += ioc_score
    if user_iocs:
        avg_ioc_score = avg_ioc_score / len(user_iocs)
        for details in user_iocs.values():
            details['ioc_score'] /= avg_ioc_score
            details['total_score'] = details['ioc_score'] * details['score']

    final_user_rank = sorted(user_iocs.items(),
                             key=lambda x: x[1]['total_score'],
                             reverse=True)
    with open(os.path.join(base_dir, 'top_users_final'), 'w',
              encoding='utf_8', newline='') as top_users_output:
        writer = csv.writer(top_users_output)
        writer.writerow([
            'id', 'screen_name', 'score', 'ioc_score', 'final_score', 'days'
        ])
        for screen_name, details in final_user_rank:
            writer.writerow([
                details['id'], details['screen_name'], details['score'],
                details['ioc_score'], details['total_score'],
                json.dumps({x: len(y) for x, y in details['days'].items()})
            ])
            top_users_res.append((details['screen_name'], details['id']))
    return top_users_res