def update_hash_prefix_cache():
    active = get_active()
    if active and active['ctime'] and active['mtime'] and min(
            active['ctime'], active['mtime']) >= (time.time() - (30 * 60)):
        # no need to update, active DB exists and is recent
        logger.info('active database is fresh')
        inactive = get_inactive()
        # remove inactive database if it exists to free up disk space
        remove_inactive(inactive)
    else:
        # we need to update the inactive DB, so get its info and delete it
        inactive = get_inactive()
        remove_inactive(inactive)
        # download to temporary file name
        tmp_file = inactive['name'] + '.tmp'
        logger.info('downloading database to ' + tmp_file)
        sbl = SafeBrowsingList(gsb_api_key, tmp_file, True)
        sbl.update_hash_prefix_cache()
        logger.info('finished creating ' + tmp_file)
        # rename to inactive file name
        if path.isfile(tmp_file + JOURNAL):
            rename(tmp_file + JOURNAL, inactive['name'] + JOURNAL)
            logger.info('renamed ' + tmp_file + JOURNAL + ' to ' +
                        inactive['name'] + JOURNAL)
        rename(tmp_file, inactive['name'])
        logger.info('renamed ' + tmp_file + ' to ' + inactive['name'])
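The function above relies on a few helpers and constants (get_active, get_inactive, remove_inactive, JOURNAL) that are not shown. A minimal sketch of what they could look like, with the file locations, names, and journal suffix purely assumed:

# Hypothetical reconstructions of the helpers used by the snippet above;
# they are not part of gglsbl, and the paths here are assumptions.
import os
from os import path, remove

JOURNAL = '-journal'       # assumed SQLite journal sidecar suffix
DB_DIR = '/var/lib/gsb'    # assumed directory holding both database copies


def _describe(name):
    # Return the file name plus its ctime/mtime, or None timestamps if missing.
    if path.isfile(name):
        st = os.stat(name)
        return {'name': name, 'ctime': st.st_ctime, 'mtime': st.st_mtime}
    return {'name': name, 'ctime': None, 'mtime': None}


def get_active():
    return _describe(path.join(DB_DIR, 'gsb_active.db'))


def get_inactive():
    return _describe(path.join(DB_DIR, 'gsb_inactive.db'))


def remove_inactive(inactive):
    # Delete the inactive database and its journal file if they exist.
    for f in (inactive['name'], inactive['name'] + JOURNAL):
        if path.isfile(f):
            remove(f)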
def update_hash_prefix_cache():
    logger.info('opening database at ' + dbfile)
    sbl = SafeBrowsingList(gsb_api_key, dbfile, True)
    logger.info('updating database at ' + dbfile)
    sbl.update_hash_prefix_cache()
    logger.info('checkpointing database at ' + dbfile)
    with sbl.storage.get_cursor() as dbc:
        dbc.execute('PRAGMA wal_checkpoint(FULL)')
    sbl.storage.db.commit()
    logger.info('all done!')
class SafeBrowsing(object): TYPE = "GoogleSBCheck" CP_FMT = '{scheme}://{netloc}/{path}' def __init__(self, name=None, api_key=None, db_path='/tmp/gsb_4.db', update_hash_prefix_cache=False): self.api_key = api_key self.db_path = db_path self.sbl = SafeBrowsingList(api_key, db_path=db_path) self.update_hash_prefix_cache = update_hash_prefix_cache try: os.stat(db_path) except: self.update_hash_prefix_cache = True if self.update_hash_prefix_cache: # this may take a while so be patient (over 1600MB of data) self.sbl.update_hash_prefix_cache() def is_blacklisted(self, url): return not SafeBrowsing.thread_safe_lookup(url) is None def lookup_url(self, url): up = urlparse(url) cp = self.CP_FMT.format(**{ 'scheme': up.scheme, 'netloc': up.netloc, 'path': up.path }).strip('/') + '/' return self.sbl.lookup_url(cp) def handle_domain(self, domain): return self.handle_domains([ domain, ]) def handle_domains(self, domains): results = {} for domain in domains: t = "https://" + domain u = "http://" + domain results[domain] = False if self.lookup_url(t) or self.lookup_url(u): results[domain] = True continue return results
class SafeBrowsing(object):
    def __init__(self, api_key, db_path=LINUX_DFT_PATH,
                 update_hash_prefix_cache=False):
        global API_KEY, DB_PATH
        API_KEY = api_key
        DB_PATH = db_path
        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except OSError:
            self.update_hash_prefix_cache = True
        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()

    def is_blacklisted(self, url):
        return SafeBrowsing.thread_safe_lookup(url) is not None

    def lookup_url(self, url):
        # cp_fmt = '{scheme}://{netloc}/{path}'
        # up = URLPARSE(url)
        # cp = cp_fmt.format(**{'scheme': up.scheme, 'netloc': up.netloc, 'path': up.path}).strip('/') + '/'
        return self.sbl.lookup_url(url)

    @classmethod
    def init(cls, api_key):
        return SafeBrowsing(api_key)

    @staticmethod
    def set_global(api_key, db_path='/tmp/gsb_4.db'):
        global SB_CHECK, API_KEY, DB_PATH
        API_KEY = api_key
        DB_PATH = db_path
        SB_CHECK = SafeBrowsing(api_key, db_path=db_path)

    @staticmethod
    def thread_safe_lookup(url):
        sbl = SafeBrowsing(API_KEY, db_path=DB_PATH)
        return sbl.lookup_url(url)
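Because thread_safe_lookup builds a fresh instance from the module-level API_KEY and DB_PATH globals, set_global (or the constructor) must run first. An illustrative sketch, with the key and URL as placeholders:

# Illustrative usage of the globals set up by the class above.
SafeBrowsing.set_global('YOUR_GSB_API_KEY', db_path='/tmp/gsb_4.db')
threats = SafeBrowsing.thread_safe_lookup('http://example.com/')
if threats:
    print('matched threat lists:', threats)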
class Security:
    def __init__(self):
        self.sbl = SafeBrowsingList(GoogleConfig.SAFEBROWSINGAPIKEY)
        self.sbl.update_hash_prefix_cache()

    def validate_referer(self, url):
        threat_list = self.sbl.lookup_url(url)
        if threat_list is None:
            return None
        return threat_list

    def get_referer(self):
        referer = request.referrer
        if not referer:
            return None
        return referer

    @staticmethod
    def is_safe_url(url):
        ref_url = urlparse(request.host_url)
        test_url = urlparse(urljoin(request.host_url, url))
        return test_url.scheme in ('http', 'https') and \
            ref_url.netloc == test_url.netloc
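A hypothetical Flask view wiring the Security helper above into a request handler; the route name and responses are illustrative, not part of the original code:

# Hypothetical Flask view using the Security helper defined above.
from flask import Flask, abort

app = Flask(__name__)
security = Security()  # note: updates the hash prefix cache at startup


@app.route('/landing')
def landing():
    referer = security.get_referer()
    if referer and security.validate_referer(referer):
        # the referring URL matched a Safe Browsing threat list
        abort(403)
    return 'ok'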
mainStart = time.time()

# update GSB dataset
start = time.time()
print "Updating local GSB dataset..."
print datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
try:
    sbl = SafeBrowsingList(config['gsb-api']['key'])
    con = mdb.connect(config['mysql']['host'], config['mysql']['username'],
                      config['mysql']['password'], config['mysql']['database_5'],
                      charset='utf8')
    con.autocommit(True)
    cur = con.cursor()

    sbl.update_hash_prefix_cache()
    # hash_prefixes = sbl.get_all_hash_prefixes()  # from my modified version

    gglsbl_db = "/tmp/gsb_v4.db"
    sql_db = sqlite3.connect(gglsbl_db)
    cursor = sql_db.cursor()
    cursor.execute('''SELECT HEX(value) from hash_prefix''')  # get all hash prefixes
    # cursor.execute('''SELECT value from full_hash''')  # get all full hashes
    all_rows = cursor.fetchall()

    gsb_url_hash_prefix_dict = {}
    for url_hash_prefix in all_rows:
        gsb_url_hash_prefix_dict[url_hash_prefix] = True

    sql = "INSERT INTO gsb_update_log_5(num_urls_gglsbl4, num_unique_urls_gglsbl4) VALUES(%s, %s)"
    cur.execute(sql, (len(all_rows), len(gsb_url_hash_prefix_dict)))
    def worker(self, id, queue):
        with open(r'config\gglsbl.auth', 'r') as auth_file:
            gglsbl_key = auth_file.read().strip()
        sbl = SafeBrowsingList(gglsbl_key, db_path=r"dataset\google_safe_browisng_db")
        # sbl.update_hash_prefix_cache()
        turn = True
        while True:
            # Update Google SBL database every 12 hours at time X (e.g. 3 AM and 3 PM)
            hour = datetime.datetime.today().hour
            if hour % 12 == 3 and turn:
                sbl.update_hash_prefix_cache()
                turn = False
            elif hour % 12 != 3:
                turn = True
            today = get_date()
            with open(os.path.join('results', today + '.ioc.csv'), 'a+',
                      encoding='utf_8') as output_file:
                tweet = queue.get()
                try:
                    if hasattr(tweet, 'retweeted_status') and hasattr(
                            tweet.retweeted_status, 'extended_tweet'
                    ) and 'full_text' in tweet.retweeted_status.extended_tweet:
                        text = tweet.retweeted_status.extended_tweet['full_text']
                    elif hasattr(tweet, 'extended_tweet') and 'full_text' in tweet.extended_tweet:
                        text = tweet.extended_tweet['full_text']
                    elif not hasattr(tweet, 'text'):
                        text = tweet['text']
                    else:
                        text = tweet.text

                    if hasattr(tweet, 'retweeted_status'):
                        if hasattr(tweet.retweeted_status, 'extended_tweet'):
                            final_urls = tweet.retweeted_status.extended_tweet['entities']['urls']
                        else:
                            final_urls = tweet.retweeted_status.entities['urls']
                    else:
                        if hasattr(tweet, 'extended_tweet'):
                            final_urls = tweet.extended_tweet['entities']['urls']
                        else:
                            final_urls = tweet.entities['urls']

                    for final_url in final_urls:
                        # If a pastebin URL, get the raw content and append it to the tweet content
                        if final_url['expanded_url'].startswith('https://pastebin.com/'):
                            pastebin = final_url['expanded_url']
                            if 'raw' not in pastebin:
                                pastebin = pastebin.replace('https://pastebin.com/',
                                                            'https://pastebin.com/raw/')
                            req = requests.get(pastebin)
                            text += '\n' + req.text

                    user_type = 'top'
                    if tweet.user.id_str in self.rand_users:
                        user_type = 'rand'

                    print("###########################$$$$$$$$$$$$$$$$$$$$$$$$$$$")
                    print(text)

                    # classifier must be retrained with new data
                    # vector = vectorize(text, self.wordlist)
                    # vector.append(len(tweet.entities['hashtags']))
                    # vector.append(len(tweet.entities['user_mentions']))
                    # vector = numpy.array(vector).reshape(1, -1)
                    # estimates = []
                    # for i in range(number_of_classifiers):
                    #     y_estimate = self.classifiers[i].predict(vector)
                    #     estimates.append(y_estimate)
                    # vote = statistics.mode([x[0] for x in estimates])
                    # print("Prediction: " + vote)

                    ips = list(iocextract.extract_ips(text, refang=True))
                    for ip in ips:
                        if ip not in text:
                            output_file.write('{},{},{},{},{},ip,{}\n'.format(
                                tweet.id, tweet.created_at, user_type,
                                tweet.user.id_str, tweet.user.screen_name, ip))

                    urls = list(iocextract.extract_urls(text, refang=True))
                    for url in urls:
                        if url not in text:
                            result = sbl.lookup_url(url.rstrip('.'))
                            if result is not None:
                                output_file.write('{},{},{},{},{},url,{},{}\n'.format(
                                    tweet.id, tweet.created_at, user_type,
                                    tweet.user.id_str, tweet.user.screen_name,
                                    url.rstrip('.'), result))
                            else:
                                output_file.write('{},{},{},{},{},url,{},benign\n'.format(
                                    tweet.id, tweet.created_at, user_type,
                                    tweet.user.id_str, tweet.user.screen_name,
                                    url.rstrip('.')))

                    emails = list(iocextract.extract_emails(text, refang=True))
                    for email in emails:
                        if email not in text:
                            output_file.write('{},{},{},{},{},email,{}\n'.format(
                                tweet.id, tweet.created_at, user_type,
                                tweet.user.id_str, tweet.user.screen_name, email))

                    hashes = list(iocextract.extract_hashes(text))
                    for hash in hashes:
                        output_file.write('{},{},{},{},{},hash,{}\n'.format(
                            tweet.id, tweet.created_at, user_type,
                            tweet.user.id_str, tweet.user.screen_name, hash))
                except Exception as exp:
                    print(exp)
                queue.task_done()
class URLMonitor(Plugin):
    blacklist = []
    moderators = []
    sbl = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moderators = self.plugin_config['MODERATORS']

        # Initialize Safe Browsing API
        if self.plugin_config['GOOGLE_SAFE_BROWSING']:
            self.sbl = SafeBrowsingList(
                self.plugin_config['GOOGLE_SAFE_BROWSING_API_KEY'])
            self.sbl.update_hash_prefix_cache()

        # Populate blacklist from URLs
        for url in self.plugin_config['BLACKLISTS']:
            url = url.strip()
            if url.endswith('.json'):
                r = requests.get(url)
                # Assuming MEW List format
                for item in r.json():
                    self.blacklist.append(item['id'])
            elif url.endswith('.csv'):
                print('csv not implemented')  # TODO
            else:
                print('txt not implemented')  # TODO

        print(self.__class__.__name__, 'initialized')

    def process_message(self, data):
        # print(data)
        chan = data['channel']
        # Private (Groups) or Public Channels
        if chan.startswith('C') or chan.startswith('G'):
            text = data['text']

            # Find all URLs in the message text, extract the host and compare it
            # against the blacklist and Google Safe Browsing
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                text)

            def alert(url):
                # TODO flag user
                # TODO early warning system
                self.slack_client.api_call(
                    'chat.postMessage',
                    channel=self.plugin_config['MODERATE_CHAN'],
                    text=' '.join(self.moderators) + ' ' + text)
                # TODO can probably use outputs for this
                if len(self.plugin_config.WARNING_MESSAGE):
                    self.outputs.append(
                        [data['channel'], self.plugin_config.WARNING_MESSAGE])

            for u in urls:
                o = urlparse(u)
                host = re.split(r":\d{,4}", o.netloc)[0]
                # Check blacklist
                if host in self.blacklist:
                    alert(u)
                    break
                # Check Google Safe Browsing
                elif self.sbl and self.sbl.lookup_url(u):
                    alert(u)
                    break
def updateCache():
    sbl = SafeBrowsingList(GOOGLE_SAFEBROWSE_API_KEY, db_path="/opt/crawler/gsb_v3.db")
    sbl.update_hash_prefix_cache()
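An illustrative follow-up sketch showing a lookup against the same database once updateCache() has run; the checkUrl helper is not part of the original code, and the URL is the page Google commonly documents for testing Safe Browsing matches:

# Illustrative helper: query the database populated by updateCache() above.
def checkUrl(url):
    sbl = SafeBrowsingList(GOOGLE_SAFEBROWSE_API_KEY, db_path="/opt/crawler/gsb_v3.db")
    return sbl.lookup_url(url)

updateCache()
print(checkUrl('http://malware.testing.google.test/testing/malware/'))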