def test_module(self):
    # TODO: delete item
    item_id = 'tests/2021/01/01/global.gz'
    item = Item.Item(item_id)
    item.delete()

    item_content = b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'
    item_content_1 = b64encode(gzip.compress(item_content)).decode()
    item_content_2 = b64encode(gzip.compress(item_content + b' more text ...')).decode()
    message = f'{item_id} {item_content_1}'

    # Test new item
    result = self.module_obj.compute(message, r_result=True)
    print(f'test new item: {result}')
    self.assertEqual(result, item_id)

    # Test duplicate
    result = self.module_obj.compute(message, r_result=True)
    print(f'test duplicate {result}')
    self.assertIsNone(result)

    # Test same id with != content
    item = Item.Item('tests/2021/01/01/global_831875da824fc86ab5cc0e835755b520.gz')
    item.delete()
    message = f'{item_id} {item_content_2}'
    result = self.module_obj.compute(message, r_result=True)
    print(f'test same id with != content: {result}')
    self.assertIn(item_id[:-3], result)
    self.assertNotEqual(result, item_id)
def compute(self, message):
    obj_id = Item.get_item_id(message)

    # Extract info from message
    content = Item.get_item_content(obj_id)
    date = Item.get_item_date(obj_id)

    for decoder in self.decoder_order:  # add threshold and size limit
        # max execution time on regex
        signal.alarm(decoder['max_execution_time'])
        try:
            encoded_list = decoder['regex'].findall(content)
        except TimeoutException:
            encoded_list = []
            self.process.incr_module_timeout_statistic()  # add encoder type
            self.redis_logger.debug(f"{obj_id} processing timeout")
            continue
        else:
            signal.alarm(0)

        if encoded_list:
            content = self.decode_string(content, message, date, encoded_list,
                                         decoder['name'], decoder['encoded_min_size'])
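# Illustrative sketch (not part of the original module): compute() above relies on a
# SIGALRM-based timeout set up elsewhere in the module, roughly as follows. The names
# TimeoutException and timeout_handler are assumptions used for illustration only.
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    # raised inside the long-running regex call when the alarm set by signal.alarm() fires
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)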
def check_all_iban(l_iban, obj_id):
    nb_valid_iban = 0
    for iban in l_iban:
        iban = iban[0] + iban[1] + iban[2]
        iban = ''.join(e for e in iban if e.isalnum())
        # iban = iban.upper()
        res = iban_regex_verify.findall(iban)
        date = datetime.datetime.now().strftime("%Y%m")
        if res:
            print('checking ' + iban)
            if is_valid_iban(iban):
                print('------')
                nb_valid_iban = nb_valid_iban + 1
                server_statistics.hincrby('iban_by_country:' + date, iban[0:2], 1)

    if nb_valid_iban > 0:
        to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id),
                                           Item.get_item_date(obj_id),
                                           Item.get_basename(obj_id))
        publisher.warning('{}Checked found {} IBAN;{}'.format(to_print, nb_valid_iban, obj_id))
        msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
        p.populate_set_out(msg, 'Tags')

        # Send to duplicate
        p.populate_set_out(obj_id, 'Duplicate')
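# Illustrative sketch: is_valid_iban() used above is not shown in this excerpt. A standard
# ISO 13616 mod-97 check looks roughly like this; the helper name and the exact behaviour
# of the real function are assumptions made for illustration.
def is_valid_iban_sketch(iban):
    # move the first four characters to the end, map letters to numbers (A=10 ... Z=35),
    # then check that the resulting integer equals 1 modulo 97
    rearranged = (iban[4:] + iban[:4]).upper()
    digits = ''.join(str(int(c, 36)) for c in rearranged)
    return int(digits) % 97 == 1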
def compute(self, item_id):
    # refresh Tracked term
    if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
        self.list_tracked_words = Term.get_tracked_words_list()
        self.last_refresh_word = time.time()
        self.redis_logger.debug('Tracked word refreshed')

    if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
        self.set_tracked_words_list = Term.get_set_tracked_words_list()
        self.last_refresh_set = time.time()
        self.redis_logger.debug('Tracked set refreshed')

    # Cast message as Item
    item_date = Item.get_item_date(item_id)
    item_content = Item.get_item_content(item_id)

    signal.alarm(self.max_execution_time)

    dict_words_freq = None
    try:
        dict_words_freq = Term.get_text_word_frequency(item_content)
    except TimeoutException:
        self.redis_logger.warning("{0} processing timeout".format(item_id))
    else:
        signal.alarm(0)

    if dict_words_freq:
        # create token statistics
        # for word in dict_words_freq:
        #     Term.create_token_statistics(item_date, word, dict_words_freq[word])

        # check solo words
        for word in self.list_tracked_words:
            if word in dict_words_freq:
                self.new_term_found(word, 'word', item_id, item_date)

        # check words set
        for elem in self.set_tracked_words_list:
            list_words = elem[0]
            nb_words_threshold = elem[1]
            word_set = elem[2]
            nb_uniq_word = 0

            for word in list_words:
                if word in dict_words_freq:
                    nb_uniq_word += 1
            if nb_uniq_word >= nb_words_threshold:
                self.new_term_found(word_set, 'set', item_id, item_date)
def report_items() -> None:
    """Print out all the item IDs used, with subtypes."""
    from packages import Item
    with get_report_file('items.txt').open('w') as f:
        for item in sorted(Item.all(), key=lambda it: it.id):
            for vers_name, version in item.versions.items():
                if len(item.versions) == 1:
                    f.write(f'- <{item.id}>\n')
                else:
                    f.write(f'- <{item.id}:{vers_name}>\n')
                variant_to_id = defaultdict(list)
                for sty_id, variant in version.styles.items():
                    variant_to_id[variant].append(sty_id)
                for variant, style_ids in variant_to_id.items():
                    f.write(f'\t- [ ] {", ".join(sorted(style_ids))}:\n'
                            f'\t {variant.source}\n')
if message is None:
    publisher.debug("{} queue is empty, waiting".format(config_section))
    time.sleep(1)
    continue

print(message)

paste = Paste.Paste(message)
date = str(paste._get_p_date())
content = paste.get_p_content()
content = remove_html(content)

extract_all_id(message, content, regex_pgp_public_blocs)
extract_all_id(message, content, regex_pgp_signature)
extract_all_id(message, content, regex_pgp_message)

item_date = Item.get_item_date(message)

for key_id in set_key:
    print(key_id)
    Pgp.pgp.save_item_correlation('key', key_id, message, item_date)

for name_id in set_name:
    print(name_id)
    Pgp.pgp.save_item_correlation('name', name_id, message, item_date)

for mail_id in set_mail:
    print(mail_id)
    Pgp.pgp.save_item_correlation('mail', mail_id, message, item_date)
publisher.channel = "Script"
publisher.info("Script TermTrackerMod started")

config_section = 'TermTrackerMod'

p = Process(config_section)
max_execution_time = p.config.getint(config_section, "max_execution_time")

full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

while True:
    item_id = p.get_from_set()

    if item_id is not None:
        item_date = Item.get_item_date(item_id)
        item_content = Item.get_item_content(item_id)

        signal.alarm(max_execution_time)
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            print("{0} processing timeout".format(item_id))
            continue
        else:
            signal.alarm(0)

        # create token statistics
        for word in dict_words_freq:
            Term.create_token_statistics(item_date, word, dict_words_freq[word])

if message is None:
    publisher.debug(
        "{} queue is empty, waiting 10s".format(config_section))
    time.sleep(10)
    continue
else:
    tag, path = message.split(';')

    # add the tag to the tags word_list
    res = server.sadd('list_tags', tag)
    if res == 1:
        print("new tags added : {}".format(tag))

    # add the path to the tag set
    # curr_date = datetime.date.today().strftime("%Y%m%d")
    item_date = get_item_date(path)
    res = server.sadd('{}:{}'.format(tag, item_date), path)
    if res == 1:
        print("new paste: {}".format(path))
        print(" tagged: {}".format(tag))

    set_tag_metadata(tag, item_date)
    server_metadata.sadd('tag:{}'.format(path), tag)

    # Domain Object
    if Item.is_crawled(path) and tag != 'infoleak:submission="crawler"':
        domain = Item.get_item_domain(path)
        server_metadata.sadd('tag:{}'.format(domain), tag)
        server.sadd('domain:{}:{}'.format(tag, item_date), domain)

    curr_date = datetime.date.today().strftime("%Y%m%d")
    server.hincrby('daily_tags:{}'.format(item_date), tag, 1)

    p.populate_set_out(message, 'MISP_The_Hive_feeder')
publisher.info("BankAccount started")

# iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
iban_regex = re.compile(r'\b([A-Za-z]{2}[ \-]?[0-9]{2})(?=(?:[ \-]?[A-Za-z0-9]){9,30})((?:[ \-]?[A-Za-z0-9]{3,5}){2,6})([ \-]?[A-Za-z0-9]{1,3})\b')
iban_regex_verify = re.compile(r'^([A-Z]{2})([0-9]{2})([A-Z0-9]{9,30})$')

while True:
    message = p.get_from_set()

    if message is not None:
        obj_id = Item.get_item_id(message)

        content = Item.get_item_content(obj_id)

        signal.alarm(max_execution_time)
        try:
            l_iban = iban_regex.findall(content)
        except TimeoutException:
            print("{0} processing timeout".format(obj_id))
            continue
        else:
            signal.alarm(0)

        if len(l_iban) > 0:
            check_all_iban(l_iban, obj_id)
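# Usage illustration (the example string is assumed, not taken from the original code):
# iban_regex.findall() returns 3-tuples of capture groups, which check_all_iban()
# concatenates and strips back into a single candidate IBAN before validating it.
#
#   >>> iban_regex.findall('acc: DE89 3704 0044 0532 0130 00')
#   [('DE89', ' 3704 0044 0532 0130', ' 00')]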
if message is None:
    publisher.debug("{} queue is empty, waiting 10s".format(config_section))
    time.sleep(10)
    continue
else:
    tag, path = message.split(';')

    # add the tag to the tags word_list
    res = server.sadd('list_tags', tag)
    if res == 1:
        print("new tags added : {}".format(tag))

    # add the path to the tag set
    # curr_date = datetime.date.today().strftime("%Y%m%d")
    item_date = get_item_date(path)
    res = server.sadd('{}:{}'.format(tag, item_date), path)
    if res == 1:
        print("new paste: {}".format(path))
        print(" tagged: {}".format(tag))

    set_tag_metadata(tag, item_date)
    server_metadata.sadd('tag:{}'.format(path), tag)

    # Domain Object
    if Item.is_crawled(path):
        domain = Item.get_item_domain(path)
        server_metadata.sadd('tag:{}'.format(domain), tag)
        server.sadd('domain:{}:{}'.format(tag, item_date), domain)

    curr_date = datetime.date.today().strftime("%Y%m%d")
    server.hincrby('daily_tags:{}'.format(item_date), tag, 1)

    p.populate_set_out(message, 'MISP_The_Hive_feeder')
def save_hash(decoder_name, message, date, decoded):
    print(decoder_name)
    type = magic.from_buffer(decoded, mime=True)
    hash = sha1(decoded).hexdigest()
    print(hash)

    data = {}
    data['name'] = hash
    data['date'] = datetime.datetime.now().strftime("%d/%m/%y")
    data['origin'] = message
    data['estimated type'] = type
    json_data = json.dumps(data)

    date_paste = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])
    date_key = date[0:4] + date[4:6] + date[6:8]

    serv_metadata.incrby(decoder_name + '_decoded:' + date_key, 1)
    serv_metadata.zincrby('hash_date:' + date_key, hash, 1)
    serv_metadata.zincrby(decoder_name + '_date:' + date_key, hash, 1)

    # first time we see this hash
    if not serv_metadata.hexists('metadata_hash:' + hash, 'estimated_type'):
        serv_metadata.hset('metadata_hash:' + hash, 'first_seen', date_paste)
        serv_metadata.hset('metadata_hash:' + hash, 'last_seen', date_paste)
    else:
        serv_metadata.hset('metadata_hash:' + hash, 'last_seen', date_paste)

    # first time we see this hash (all encoding) on this paste
    if serv_metadata.zscore('nb_seen_hash:' + hash, message) is None:
        serv_metadata.hincrby('metadata_hash:' + hash, 'nb_seen_in_all_pastes', 1)
        serv_metadata.sadd('hash_paste:' + message, hash)  # paste - hash map

        # create hash metadata
        serv_metadata.hset('metadata_hash:' + hash, 'estimated_type', type)
        serv_metadata.sadd('hash_all_type', type)

    # first time we see this hash encoding on this paste
    if serv_metadata.zscore(decoder_name + '_hash:' + hash, message) is None:
        print('first ' + decoder_name)

        serv_metadata.sadd(decoder_name + '_paste:' + message, hash)  # paste - hash map

        # create hash metadata
        serv_metadata.sadd('hash_' + decoder_name + '_all_type', type)

    # first time we see this hash today
    # if serv_metadata.zscore('hash_date:' + date_key, hash) is None:
    #     serv_metadata.zincrby('hash_type:' + type, date_key, 1)

    # first time we see this hash encoding today
    if serv_metadata.zscore(decoder_name + '_date:' + date_key, hash) is None:
        serv_metadata.zincrby(decoder_name + '_type:' + type, date_key, 1)

    save_hash_on_disk(decoded, type, hash, json_data)
    print('found {} '.format(type))

    serv_metadata.hincrby('metadata_hash:' + hash, decoder_name + '_decoder', 1)
    serv_metadata.zincrby(decoder_name + '_type:' + type, date_key, 1)

    serv_metadata.zincrby('nb_seen_hash:' + hash, message, 1)  # hash - paste map
    serv_metadata.zincrby(decoder_name + '_hash:' + hash, message, 1)  # number of b64 on this paste

    # Domain Object
    if Item.is_crawled(message):
        domain = Item.get_item_domain(message)
        serv_metadata.sadd('hash_domain:{}'.format(domain), hash)  # domain - hash map
        serv_metadata.sadd('domain_hash:{}'.format(hash), domain)  # hash - domain map
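# Illustrative sketch (hypothetical, not the original helper): save_hash_on_disk() used in
# save_hash() above is not shown in this excerpt. A plausible shape, assuming a local
# HASHS directory layout, is to store the decoded payload and its JSON metadata under a
# path derived from the mime type and the hash.
import os

def save_hash_on_disk_sketch(decoded, mime_type, hash, json_data, base_dir='HASHS'):
    directory = os.path.join(base_dir, mime_type, hash[0:2])
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, hash), 'wb') as f:
        f.write(decoded)  # raw decoded payload
    with open(os.path.join(directory, hash + '.json'), 'w') as f:
        f.write(json_data)  # metadata built in save_hash()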
import sys
import cld3
import time

from packages import Item
from lib import Domain

from pubsublogger import publisher
from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Languages'

    # Setup the I/O queues
    p = Process(config_section)

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        item_id = Item.get_item_id(message)
        if Item.is_crawled(item_id):
            domain = Item.get_item_domain(item_id)
            Domain.add_domain_languages_by_item_id(domain, item_id)