def test_module(self):
        # TODO: delete item
        item_id = 'tests/2021/01/01/global.gz'
        item = Item.Item(item_id)
        item.delete()

        item_content = b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'
        item_content_1 = b64encode(gzip.compress(item_content)).decode()
        item_content_2 = b64encode(
            gzip.compress(item_content + b' more text ...')).decode()
        message = f'{item_id} {item_content_1}'

        # Test new item
        result = self.module_obj.compute(message, r_result=True)
        print(f'test new item: {result}')
        self.assertEqual(result, item_id)

        # Test duplicate
        result = self.module_obj.compute(message, r_result=True)
        print(f'test duplicate {result}')
        self.assertIsNone(result)

        # Test same id with != content
        item = Item.Item(
            'tests/2021/01/01/global_831875da824fc86ab5cc0e835755b520.gz')
        item.delete()
        message = f'{item_id} {item_content_2}'
        result = self.module_obj.compute(message, r_result=True)
        print(f'test same id with != content: {result}')
        self.assertIn(item_id[:-3], result)
        self.assertNotEqual(result, item_id)
Example #2
    def compute(self, message):
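        # Scan the item content with each configured decoder's regex (under a
        # per-decoder timeout) and decode any encoded strings that are found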

        obj_id = Item.get_item_id(message)

        # Extract info from message
        content = Item.get_item_content(obj_id)
        date = Item.get_item_date(obj_id)

        for decoder in self.decoder_order:  # add threshold and size limit
            # max execution time on regex
            signal.alarm(decoder['max_execution_time'])

            try:
                encoded_list = decoder['regex'].findall(content)
            except TimeoutException:
                encoded_list = []
                self.process.incr_module_timeout_statistic()  # add encoder type
                self.redis_logger.debug(f"{obj_id} processing timeout")
                continue
            else:
                signal.alarm(0)

                if (len(encoded_list) > 0):
                    content = self.decode_string(content, message, date,
                                                 encoded_list, decoder['name'],
                                                 decoder['encoded_min_size'])
def check_all_iban(l_iban, obj_id):
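    """Validate candidate IBANs, update per-country statistics and tag the item if any are valid."""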
    nb_valid_iban = 0
    for iban in l_iban:
        iban = iban[0] + iban[1] + iban[2]
        iban = ''.join(e for e in iban if e.isalnum())
        iban = iban.upper()  # normalize case so the verify regex can match lowercase candidates
        res = iban_regex_verify.findall(iban)
        date = datetime.datetime.now().strftime("%Y%m")
        if res:
            print('checking ' + iban)
            if is_valid_iban(iban):
                print('------')
                nb_valid_iban = nb_valid_iban + 1
                server_statistics.hincrby('iban_by_country:' + date, iban[0:2],
                                          1)

    if (nb_valid_iban > 0):
        to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id),
                                           Item.get_item_date(obj_id),
                                           Item.get_basename(obj_id))
        publisher.warning('{}Checked found {} IBAN;{}'.format(
            to_print, nb_valid_iban, obj_id))
        msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
        p.populate_set_out(msg, 'Tags')

        #Send to duplicate
        p.populate_set_out(obj_id, 'Duplicate')
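
# Note: the example above relies on an is_valid_iban() helper that is not shown here.
# A minimal sketch of such a helper, assuming the standard ISO 13616 mod-97 check
# (the name and exact behaviour of the module's real helper are assumptions):
def is_valid_iban(iban):
    # Normalize: strip separators and uppercase
    iban = iban.replace(' ', '').replace('-', '').upper()
    if not 15 <= len(iban) <= 34:
        return False
    # Move the country code and check digits to the end,
    # then map letters to numbers (A=10 ... Z=35)
    rearranged = iban[4:] + iban[:4]
    try:
        numeric = int(''.join(str(int(c, 36)) for c in rearranged))
    except ValueError:
        return False
    return numeric % 97 == 1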
    def compute(self, item_id):
        # refresh Tracked term
        if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type(
                'word'):
            self.list_tracked_words = Term.get_tracked_words_list()
            self.last_refresh_word = time.time()
            self.redis_logger.debug('Tracked word refreshed')

        if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type(
                'set'):
            self.set_tracked_words_list = Term.get_set_tracked_words_list()
            self.last_refresh_set = time.time()
            self.redis_logger.debug('Tracked set refreshed')

        # Cast message as Item
        item_date = Item.get_item_date(item_id)
        item_content = Item.get_item_content(item_id)

        signal.alarm(self.max_execution_time)

        dict_words_freq = None
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            self.redis_logger.warning("{0} processing timeout".format(item_id))
        else:
            signal.alarm(0)

        if dict_words_freq:
            # create token statistics
            #for word in dict_words_freq:
            #    Term.create_token_statistics(item_date, word, dict_words_freq[word])

            # check solo words
            for word in self.list_tracked_words:
                if word in dict_words_freq:
                    self.new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in self.set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]
                nb_uniq_word = 0

                for word in list_words:
                    if word in dict_words_freq:
                        nb_uniq_word += 1
                if nb_uniq_word >= nb_words_threshold:
                    self.new_term_found(word_set, 'set', item_id, item_date)
Example #5
def report_items() -> None:
    """Print out all the item IDs used, with subtypes."""
    from packages import Item
    with get_report_file('items.txt').open('w') as f:
        for item in sorted(Item.all(), key=lambda it: it.id):
            for vers_name, version in item.versions.items():
                if len(item.versions) == 1:
                    f.write(f'- <{item.id}>\n')
                else:
                    f.write(f'- <{item.id}:{vers_name}>\n')

                variant_to_id = defaultdict(list)
                for sty_id, variant in version.styles.items():
                    variant_to_id[variant].append(sty_id)

                for variant, style_ids in variant_to_id.items():
                    f.write(f'\t- [ ] {", ".join(sorted(style_ids))}:\n'
                            f'\t  {variant.source}\n')
Example #6
            if message is None:
                publisher.debug("{} queue is empty, waiting".format(config_section))
                time.sleep(1)
                continue

            print(message)
            paste = Paste.Paste(message)

            date = str(paste._get_p_date())
            content = paste.get_p_content()
            content = remove_html(content)


            extract_all_id(message, content, regex_pgp_public_blocs)
            extract_all_id(message, content, regex_pgp_signature)
            extract_all_id(message, content, regex_pgp_message)

            item_date = Item.get_item_date(message)

            for key_id in set_key:
                print(key_id)
                Pgp.pgp.save_item_correlation('key', key_id, message, item_date)

            for name_id in set_name:
                print(name_id)
                Pgp.pgp.save_item_correlation('name', name_id, message, item_date)

            for mail_id in set_mail:
                print(mail_id)
                Pgp.pgp.save_item_correlation('mail', mail_id, message, item_date)
Example #7
    publisher.channel = "Script"
    publisher.info("Script TermTrackerMod started")

    config_section = 'TermTrackerMod'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            signal.alarm(max_execution_time)
            try:
                dict_words_freq = Term.get_text_word_frequency(item_content)
            except TimeoutException:
                print("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)

            # create token statistics
            for word in dict_words_freq:
                Term.create_token_statistics(item_date, word,
                                             dict_words_freq[word])
Example #8
                "{} queue is empty, waiting 10s".format(config_section))
            time.sleep(10)
            continue

        else:
            tag, path = message.split(';')
            # add the tag to the tags word_list
            res = server.sadd('list_tags', tag)
            if res == 1:
                print("new tags added : {}".format(tag))
            # add the path to the tag set
            #curr_date = datetime.date.today().strftime("%Y%m%d")
            item_date = get_item_date(path)
            res = server.sadd('{}:{}'.format(tag, item_date), path)
            if res == 1:
                print("new paste: {}".format(path))
                print("   tagged: {}".format(tag))
                set_tag_metadata(tag, item_date)
            server_metadata.sadd('tag:{}'.format(path), tag)

            # Domain Object
            if Item.is_crawled(
                    path) and tag != 'infoleak:submission="crawler"':
                domain = Item.get_item_domain(path)
                server_metadata.sadd('tag:{}'.format(domain), tag)
                server.sadd('domain:{}:{}'.format(tag, item_date), domain)

            curr_date = datetime.date.today().strftime("%Y%m%d")
            server.hincrby('daily_tags:{}'.format(item_date), tag, 1)
            p.populate_set_out(message, 'MISP_The_Hive_feeder')
    publisher.info("BankAccount started")

    #iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
    iban_regex = re.compile(
        r'\b([A-Za-z]{2}[ \-]?[0-9]{2})(?=(?:[ \-]?[A-Za-z0-9]){9,30})((?:[ \-]?[A-Za-z0-9]{3,5}){2,6})([ \-]?[A-Za-z0-9]{1,3})\b'
    )
    iban_regex_verify = re.compile(r'^([A-Z]{2})([0-9]{2})([A-Z0-9]{9,30})$')
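    # iban_regex finds loose candidates (separators allowed); iban_regex_verify
    # re-checks the normalized, uppercased string before the checksum validation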

    while True:

        message = p.get_from_set()

        if message is not None:

            obj_id = Item.get_item_id(message)

            content = Item.get_item_content(obj_id)

            signal.alarm(max_execution_time)
            try:
                l_iban = iban_regex.findall(content)
            except TimeoutException:
                print("{0} processing timeout".format(obj_id))
                continue
            else:
                signal.alarm(0)

            if (len(l_iban) > 0):
                check_all_iban(l_iban, obj_id)
Example #10
        if message is None:
            publisher.debug("{} queue is empty, waiting 10s".format(config_section))
            time.sleep(10)
            continue

        else:
            tag, path = message.split(';')
            # add the tag to the tags word_list
            res = server.sadd('list_tags', tag)
            if res == 1:
                print("new tags added : {}".format(tag))
            # add the path to the tag set
            #curr_date = datetime.date.today().strftime("%Y%m%d")
            item_date = get_item_date(path)
            res = server.sadd('{}:{}'.format(tag, item_date), path)
            if res == 1:
                print("new paste: {}".format(path))
                print("   tagged: {}".format(tag))
                set_tag_metadata(tag, item_date)
            server_metadata.sadd('tag:{}'.format(path), tag)

            # Domain Object
            if Item.is_crawled(path):
                domain = Item.get_item_domain(path)
                server_metadata.sadd('tag:{}'.format(domain), tag)
                server.sadd('domain:{}:{}'.format(tag, item_date), domain)

            curr_date = datetime.date.today().strftime("%Y%m%d")
            server.hincrby('daily_tags:{}'.format(item_date), tag, 1)
            p.populate_set_out(message, 'MISP_The_Hive_feeder')
Example #11
def save_hash(decoder_name, message, date, decoded):
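    """Record a decoded blob in Redis (sha1, estimated type, first/last seen, paste and domain mappings)
    and write it to disk the first time this encoding of the hash is seen on the paste."""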
    print(decoder_name)
    type = magic.from_buffer(decoded, mime=True)
    hash = sha1(decoded).hexdigest()
    print(hash)

    data = {}
    data['name'] = hash
    data['date'] = datetime.datetime.now().strftime("%d/%m/%y")
    data['origin'] = message
    data['estimated type'] = type
    json_data = json.dumps(data)

    date_paste = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])
    date_key = date[0:4] + date[4:6] + date[6:8]

    serv_metadata.incrby(decoder_name+'_decoded:'+date_key, 1)
    serv_metadata.zincrby('hash_date:'+date_key, hash, 1)
    serv_metadata.zincrby(decoder_name+'_date:'+date_key, hash, 1)

    # first time we see this hash
    if not serv_metadata.hexists('metadata_hash:'+hash, 'estimated_type'):
        serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_paste)
        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste)
    else:
        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste)

    # first time we see this hash (all encoding) on this paste
    if serv_metadata.zscore('nb_seen_hash:'+hash, message) is None:
        serv_metadata.hincrby('metadata_hash:'+hash, 'nb_seen_in_all_pastes', 1)
        serv_metadata.sadd('hash_paste:'+message, hash) # paste - hash map
        # create hash metadata
        serv_metadata.hset('metadata_hash:'+hash, 'estimated_type', type)
        serv_metadata.sadd('hash_all_type', type)

    # first time we see this hash encoding on this paste
    if serv_metadata.zscore(decoder_name+'_hash:'+hash, message) is None:
        print('first '+decoder_name)

        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # paste - hash map

        # create hash metadata
        serv_metadata.sadd('hash_'+ decoder_name +'_all_type', type)

        # first time we see this hash today
        #if serv_metadata.zscore('hash_date:'+date_key, hash) is None:
        #    serv_metadata.zincrby('hash_type:'+type, date_key, 1)

        # first time we see this hash encoding today
        if serv_metadata.zscore(decoder_name+'_date:'+date_key, hash) is None:
            serv_metadata.zincrby(decoder_name+'_type:'+type, date_key, 1)

        save_hash_on_disk(decoded, type, hash, json_data)
        print('found {} '.format(type))

    serv_metadata.hincrby('metadata_hash:'+hash, decoder_name+'_decoder', 1)

    serv_metadata.zincrby(decoder_name+'_type:'+type, date_key, 1)

    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - paste map
    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this paste

    # Domain Object
    if Item.is_crawled(message):
        domain = Item.get_item_domain(message)
        serv_metadata.sadd('hash_domain:{}'.format(domain), hash) # domain - hash map
        serv_metadata.sadd('domain_hash:{}'.format(hash), domain) # hash - domain map
Example #12
import sys
import cld3
import time

from packages import Item
from lib import Domain

from pubsublogger import publisher
from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'
    # Section name in bin/packages/modules.cfg
    config_section = 'Languages'
    # Setup the I/O queues
    p = Process(config_section)

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug(
                "{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        item_id = Item.get_item_id(message)
        if Item.is_crawled(item_id):
            domain = Item.get_item_domain(item_id)
            Domain.add_domain_languages_by_item_id(domain, item_id)