Example #1
"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            for word, score in paste._get_top_words().items():
                if len(word) >= 4:
                    msg = '{} {} {}'.format(paste.p_path, word, score)
                    p.populate_set_out(msg)
        else:
            publisher.debug("Tokeniser is idling 10s")
            time.sleep(10)
            print("sleeping")
Example #2
            time.sleep(1)
            continue
        # Creating the full filepath
        filename = os.path.join(os.environ['AIL_HOME'],
                                p.config.get("Directories", "pastes"), paste)

        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        decoded = base64.standard_b64decode(gzip64encoded)

        with open(filename, 'wb') as f:
            f.write(decoded)
        '''try:
            decoded2 = gunzip_bytes_obj(decoded)
        except:
            decoded2 =''

        type = magic.from_buffer(decoded2, mime=True)

        if type not in ('text/x-c++', 'text/html', 'text/x-c', 'text/x-python', 'text/x-php', 'application/xml', 'text/x-shellscript', 'text/plain', 'text/x-diff', 'text/x-ruby'):

            print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
            print(filename)
            print(type)
            print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
            '''
        p.populate_set_out(filename)
        processed_paste += 1
Example #3
            message += ' Related websites: {}'.format((', '.join(sites_set)))

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source,
                                                      paste.p_date,
                                                      paste.p_name, message,
                                                      paste.p_rel_path)

        print('\n '.join(creds))

        #num of creds above threshold, publish an alert
        if len(creds) > criticalNumberToAlert:
            print("========> Found more than 10 credentials in this file : {}".
                  format(filepath))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')

            msg = 'infoleak:automatic-detection="credential";{}'.format(
                filepath)
            p.populate_set_out(msg, 'Tags')

            #Put in form, count occurrences, then send to moduleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites.keys():
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1
Example #4
        sites_set = set(re.findall(regex_web, content))

        message = 'Checked {} credentials found.'.format(len(creds))
        if sites_set:
            message += ' Related websites: {}'.format( (', '.join(sites_set)) )

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path)

        print('\n '.join(creds))

        #num of creds above threshold, publish an alert
        if len(creds) > criticalNumberToAlert:
            print("========> Found more than 10 credentials in this file : {}".format( filepath ))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')

            msg = 'infoleak:automatic-detection="credential";{}'.format(filepath)
            p.populate_set_out(msg, 'Tags')

            #Put in form, count occurrences, then send to moduleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites.keys():
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1

            for url in sites:
Example #5
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, url, domain, original_paste, super_father,
                     *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [domain]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(
                self.p.config.get("Directories", "crawled"), date)

            self.crawled_paste_filemame = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "pastes"),
                self.p.config.get("Directories", "crawled"), date)

            self.crawled_screenshot = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"), date)

        def start_requests(self):
            yield SplashRequest(self.start_urls,
                                self.parse,
                                errback=self.errback_catcher,
                                endpoint='render.json',
                                meta={'father': self.original_paste},
                                args={
                                    'html': 1,
                                    'wait': 10,
                                    'render_all': 1,
                                    'har': 1,
                                    'png': 1
                                })

        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if error_log['info']['text'] == 'Connection to proxy refused':
                    print('Connection to proxy refused')
            else:

                #avoid filename too big
                if len(self.domains[0]) > 215:
                    UUID = self.domains[0][-215:] + str(uuid.uuid4())
                else:
                    UUID = self.domains[0] + str(uuid.uuid4())
                filename_paste = os.path.join(self.crawled_paste_filemame,
                                              UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_screenshot = os.path.join(self.crawled_screenshot,
                                                   UUID + '.png')

                # save new paste on disk
                if self.save_crawled_paste(filename_paste,
                                           response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd(
                        '{}_up:{}'.format(self.type, self.full_date),
                        self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type),
                                           self.domains[0])
                    self.r_serv_onion.sadd(
                        'month_{}_up:{}'.format(self.type, self.date_month),
                        self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(
                            self.type, self.domains[0])):
                        self.r_serv_onion.hset(
                            '{}_metadata:{}'.format(self.type,
                                                    self.domains[0]),
                            'first_seen', self.full_date)
                    self.r_serv_onion.hset(
                        '{}_metadata:{}'.format(self.type, self.domains[0]),
                        'last_seen', self.full_date)

                    #create paste metadata
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'super_father',
                        self.super_father)
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'father',
                        response.meta['father'])
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'domain',
                        self.domains[0])
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'real_link',
                        response.url)

                    self.r_serv_metadata.sadd(
                        'paste_children:' + response.meta['father'],
                        filename_paste)

                    dirname = os.path.dirname(filename_screenshot)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)

                    # a base64 string of n characters decodes to roughly 3*n/4 bytes
                    size_screenshot = (len(response.data['png']) * 3) / 4

                    if size_screenshot < 5000000:  #bytes
                        with open(filename_screenshot, 'wb') as f:
                            f.write(
                                base64.standard_b64decode(
                                    response.data['png'].encode()))

                    with open(filename_screenshot + 'har.txt', 'wb') as f:
                        f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste},
                            args={
                                'html': 1,
                                'png': 1,
                                'render_all': 1,
                                'har': 1,
                                'wait': 10
                            })

        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                father = request.meta['father']

                self.logger.error(
                    'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                    url)
                time.sleep(10)
                yield SplashRequest(url,
                                    self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='render.json',
                                    meta={'father': father},
                                    args={
                                        'html': 1,
                                        'png': 1,
                                        'render_all': 1,
                                        'har': 1,
                                        'wait': 10
                                    })

            else:
                print('failure')
                #print(failure)
                print(failure.type)
                #print(failure.request.meta['item'])
            '''
            #if isinstance(failure.value, HttpError):
            elif failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
            '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(
                    filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder",
                                           "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
Example #6
                    r_serv2, PST.get_regex(email_regex))

                if MX_values[0] >= 1:

                    PST.__setattr__(channel, MX_values)
                    PST.save_attribute_redis(channel, (MX_values[0],
                                             list(MX_values[1])))

                    pprint.pprint(MX_values)
                    to_print = 'Mails;{};{};{};Checked {} e-mail(s)'.\
                        format(PST.p_source, PST.p_date, PST.p_name,
                               MX_values[0])
                    if MX_values[0] > is_critical:
                        publisher.warning(to_print)
                        #Send to duplicate
                        p.populate_set_out(filename, 'Duplicate')
                        
                    else:
                        publisher.info(to_print)
                #Send to ModuleStats 
                for mail in MX_values[1]:
                    print('mail;{};{};{}'.format(1, mail, PST.p_date))
                    p.populate_set_out('mail;{};{};{}'.format(1, mail, PST.p_date), 'ModuleStats')
                    p.populate_set_out('mail;{}'.format(filename), 'BrowseWarningPaste')

            prec_filename = filename

        else:
            publisher.debug("Script Mails is Idling 10s")
            print('Sleeping')
            time.sleep(10)
Example #7
            all_cards = re.findall(regex, content)
            if len(all_cards) > 0:
                print('All matching', all_cards)
                creditcard_set = set([])

                for card in all_cards:
                    clean_card = re.sub('[^0-9]', '', card)
                    if lib_refine.is_luhn_valid(clean_card):
                        print(clean_card, 'is valid')
                        creditcard_set.add(clean_card)

                paste.__setattr__(channel, creditcard_set)
                paste.save_attribute_redis(channel, creditcard_set)

                pprint.pprint(creditcard_set)
                to_print = 'CreditCard;{};{};{};'.format(
                    paste.p_source, paste.p_date, paste.p_name)
                if (len(creditcard_set) > 0):
                    publisher.warning('{}Checked {} valid number(s);{}'.format(
                        to_print, len(creditcard_set), paste.p_path))
                    #Send to duplicate
                    p.populate_set_out(filename, 'Duplicate')
                    #send to Browse_warning_paste
                    p.populate_set_out('creditcard;{}'.format(filename), 'BrowseWarningPaste')
                else:
                    publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path))
        else:
            publisher.debug("Script creditcard is idling 10s")
            time.sleep(10)

Example #8
                creditcard_set = set([])

                for card in all_cards:
                    clean_card = re.sub('[^0-9]', '', card)
                    if lib_refine.is_luhn_valid(clean_card):
                        print(clean_card, 'is valid')
                        creditcard_set.add(clean_card)

                paste.__setattr__(channel, creditcard_set)
                paste.save_attribute_redis(channel, creditcard_set)

                pprint.pprint(creditcard_set)
                to_print = 'CreditCard;{};{};{};'.format(
                    paste.p_source, paste.p_date, paste.p_name)
                if (len(creditcard_set) > 0):
                    publisher.warning('{}Checked {} valid number(s);{}'.format(
                        to_print, len(creditcard_set), paste.p_rel_path))
                    print('{}Checked {} valid number(s);{}'.format(
                        to_print, len(creditcard_set), paste.p_rel_path))
                    #Send to duplicate
                    p.populate_set_out(filename, 'Duplicate')

                    msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename)
                    p.populate_set_out(msg, 'Tags')
                else:
                    publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_rel_path))
        else:
            publisher.debug("Script creditcard is idling 10s")
            time.sleep(10)
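
For reference, lib_refine.is_luhn_valid used in the two credit-card examples above presumably applies the standard Luhn checksum to the cleaned digit string. A minimal, self-contained sketch of such a check (an illustration, not the AIL implementation):

def is_luhn_valid(card_number):
    """Return True if the digit string passes the Luhn checksum."""
    digits = [int(d) for d in card_number]
    checksum = 0
    # starting from the rightmost digit, double every second digit,
    # subtracting 9 whenever the doubled value exceeds 9
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:
            d = d * 2
            if d > 9:
                d -= 9
        checksum += d
    return checksum % 10 == 0

# e.g. is_luhn_valid('79927398713') is True, is_luhn_valid('79927398710') is False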
Example #9
    delta = date_to - date_from  # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append(date.strftime('%Y%m%d'))
    return l_date


config_section = 'Keys'
p = Process(config_section)

r_tags = redis.StrictRedis(host=p.config.get("ARDB_Tags", "host"),
                           port=p.config.getint("ARDB_Tags", "port"),
                           db=p.config.getint("ARDB_Tags", "db"),
                           decode_responses=True)

tag = 'infoleak:automatic-detection="pgp-message"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)

# get all tagged items
for date in l_dates:
    daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date))

    for item in daily_tagged_items:
        p.populate_set_out(item, 'PgpDump')
Example #10
    delta = date_to - date_from # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append( date.strftime('%Y%m%d') )
    return l_date

config_section = 'Keys'
p = Process(config_section)

r_tags = redis.StrictRedis(
    host=p.config.get("ARDB_Tags", "host"),
    port=p.config.getint("ARDB_Tags", "port"),
    db=p.config.getint("ARDB_Tags", "db"),
    decode_responses=True)

tag = 'infoleak:automatic-detection="pgp-message"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)

# get all tagged items
for date in l_dates:
    daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date))

    for item in daily_tagged_items:
        p.populate_set_out(item, 'PgpDump')
Example #11
                    # Send a notification only when the member is in the set
                    if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name):

                        # create mail body
                        mail_body = ("AIL Framework,\n"
                                    "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n"
                                    ''+full_paste_url + filename)

                        # Send to every associated email address
                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                            sendEmailNotification(email, 'Term', mail_body)

                    # tag paste
                    for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]):
                        msg = '{};{}'.format(tag, filename)
                        p.populate_set_out(msg, 'Tags')

                    print(the_set, "matched in", filename)
                    set_name = 'set_' + dico_setname_to_redis[the_set]
                    new_to_the_set = server_term.sadd(set_name, filename)
                    new_to_the_set = True if new_to_the_set == 1 else False

                    #consider the num of occurrence of this set
                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))

                    # FIXME - avoid using per paste as a set is checked over the entire paste
                    #1 term per paste
                    if new_to_the_set:
                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
                server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
Example #12
        sites = re.findall(regex_web, content)  # Used to count occurrences
        sites_set = set(re.findall(regex_web, content))

        message = 'Checked {} credentials found.'.format(len(creds))
        if sites_set:
            message += ' Related websites: {}'.format(', '.join(sites_set))

        to_print = 'Credential;{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message)

        print('\n '.join(creds))

        if len(creds) > critical:
            print("========> Found more than 10 credentials in this file : {}".format(filepath))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')
            #Send to BrowseWarningPaste
            p.populate_set_out('credential;{}'.format(filepath), 'BrowseWarningPaste')
            
            #Put in form, count occurrences, then send to moduleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites.keys():
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1

            for url in sites:
                faup.decode(url)
Example #13
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.arg_crawler = {  'html': crawler_options['html'],
                                  'wait': 10,
                                  'render_all': 1,
                                  'har': crawler_options['har'],
                                  'png': crawler_options['png']}

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                errback=self.errback_catcher,
                endpoint='render.json',
                meta={'father': self.original_item, 'root_key': None},
                args=self.arg_crawler
            )

        def parse(self,response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if error_log['info']['text'] == 'Connection to proxy refused':
                    print('Connection to proxy refused')
            else:

                #avoid filename too big
                if len(self.domains[0]) > 215:
                    UUID = self.domains[0][-215:]+str(uuid.uuid4())
                else:
                    UUID = self.domains[0]+str(uuid.uuid4())
                filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_har = os.path.join(self.crawled_har, UUID)

                # # TODO: modify me
                # save new paste on disk
                if self.save_crawled_paste(relative_filename_paste, response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)

                    # create root_key
                    if self.root_key is None:
                        self.root_key = relative_filename_paste
                        # Create/Update crawler history
                        self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
                        # Update domain port number
                        all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
                        if all_domain_ports:
                            all_domain_ports = all_domain_ports.split(';')
                        else:
                            all_domain_ports = []
                        if self.port not in all_domain_ports:
                            all_domain_ports.append(self.port)
                            self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

                    #create paste metadata
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)

                    self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)

                    if 'png' in response.data:
                        # a base64 string of n characters decodes to roughly 3*n/4 bytes
                        size_screenshot = (len(response.data['png']) * 3) / 4

                        if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto
                            image_content = base64.standard_b64decode(response.data['png'].encode())
                            hash = sha256(image_content).hexdigest()
                            img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
                            filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
                            dirname = os.path.dirname(filename_img)
                            if not os.path.exists(dirname):
                                os.makedirs(dirname)
                            if not os.path.exists(filename_img):
                                with open(filename_img, 'wb') as f:
                                    f.write(image_content)
                            # add item metadata
                            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
                            # add sha256 metadata
                            self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste)

                    if 'har' in response.data:
                        dirname = os.path.dirname(filename_har)
                        if not os.path.exists(dirname):
                            os.makedirs(dirname)
                        with open(filename_har+'.json', 'wb') as f:
                            f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
                            args=self.arg_crawler
                        )

        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                father = request.meta['father']

                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                # 'root_key' was attached to the request meta when it was created;
                # there is no response object in this errback, so read it from the request
                root_key = request.meta.get('root_key', None)
                yield SplashRequest(
                    url,
                    self.parse,
                    errback=self.errback_catcher,
                    endpoint='render.json',
                    meta={'father': father, 'root_key': root_key},
                    args=self.arg_crawler
                )

            else:
                print('failure')
                #print(failure)
                print(failure.type)
                #print(failure.request.meta['item'])

            '''
            #if isinstance(failure.value, HttpError):
            elif failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
            '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
Example #14
        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        decoded = base64.standard_b64decode(gzip64encoded)

        with open(filename, 'wb') as f:
            f.write(decoded)
        '''try:
            decoded2 = gunzip_bytes_obj(decoded)
        except:
            decoded2 =''

        type = magic.from_buffer(decoded2, mime=True)

        if type not in ('text/x-c++', 'text/html', 'text/x-c', 'text/x-python', 'text/x-php', 'application/xml', 'text/x-shellscript', 'text/plain', 'text/x-diff', 'text/x-ruby'):

            print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
            print(filename)
            print(type)
            print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
        '''

        # remove PASTES_FOLDER from item path (crawled item + submited)
        if PASTES_FOLDERS in paste:
            paste = paste.replace(PASTES_FOLDERS, '', 1)

        p.populate_set_out(paste)
        processed_paste+=1
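
The crawler examples (#5 and #13) build the relay message by gzip-compressing the crawled content and base64-encoding it, while the Global-side examples (#2 and #14) base64-decode it before writing the still-gzipped item to disk. A minimal round-trip sketch of that encoding, outside any AIL queue plumbing:

import base64
import gzip

content = 'raw paste content'

# sender side (see save_crawled_paste above): gzip, then base64-encode to a str
gzip64encoded = base64.standard_b64encode(gzip.compress(content.encode())).decode()

# receiver side (see the Global snippets): base64-decode, then gunzip back to the text
decoded = base64.standard_b64decode(gzip64encoded)
assert gzip.decompress(decoded).decode() == content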
Example #15
                MX_values = lib_refine.checking_MX_record(
                    r_serv2, PST.get_regex(email_regex), addr_dns)

                if MX_values[0] >= 1:

                    PST.__setattr__(channel, MX_values)
                    PST.save_attribute_redis(channel, (MX_values[0],
                                             list(MX_values[1])))

                    to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\
                        format(PST.p_source, PST.p_date, PST.p_name,
                               MX_values[0], PST.p_path)
                    if MX_values[0] > is_critical:
                        publisher.warning(to_print)
                        #Send to duplicate
                        p.populate_set_out(filename, 'Duplicate')
                        p.populate_set_out('mail;{}'.format(filename), 'alertHandler')

                        msg = 'infoleak:automatic-detection="mail";{}'.format(filename)
                        p.populate_set_out(msg, 'Tags')

                        #create country statistics
                        date = datetime.datetime.now().strftime("%Y%m")
                        for mail in MX_values[1]:
                            print('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date))
                            p.populate_set_out('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date), 'ModuleStats')

                            faup.decode(mail)
                            tld = faup.get()['tld']
                            server_statistics.hincrby('mail_by_tld:'+date, tld, MX_values[1][mail])
Example #16
File: Mail.py  Project: xuezs/AIL-framework
                    r_serv2, PST.get_regex(email_regex))

                if MX_values[0] >= 1:

                    PST.__setattr__(channel, MX_values)
                    PST.save_attribute_redis(
                        channel, (MX_values[0], list(MX_values[1])))

                    pprint.pprint(MX_values)
                    to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\
                        format(PST.p_source, PST.p_date, PST.p_name,
                               MX_values[0], PST.p_path)
                    if MX_values[0] > is_critical:
                        publisher.warning(to_print)
                        #Send to duplicate
                        p.populate_set_out(filename, 'Duplicate')
                        p.populate_set_out('mail;{}'.format(filename),
                                           'alertHandler')

                    else:
                        publisher.info(to_print)
                #Send to ModuleStats
                for mail in MX_values[1]:
                    print('mail;{};{};{}'.format(1, mail, PST.p_date))
                    p.populate_set_out(
                        'mail;{};{};{}'.format(1, mail, PST.p_date),
                        'ModuleStats')

            prec_filename = filename

        else:
Example #17
                                            'onion_metadata:{}'.format(domain),
                                            'first_seen'):
                                        r_onion.sadd(
                                            'onion_crawler_priority_queue',
                                            msg)
                                        print('send to priority queue')
                                    else:
                                        r_onion.sadd('onion_crawler_queue',
                                                     msg)
                                #p.populate_set_out(msg, 'Crawler')

                    else:
                        for url in fetch(p, r_cache, urls, domains_list, path):
                            publisher.info('{}Checked {};{}'.format(
                                to_print, url, PST.p_path))
                            p.populate_set_out('onion;{}'.format(PST.p_path),
                                               'alertHandler')

                            msg = 'infoleak:automatic-detection="onion";{}'.format(
                                PST.p_path)
                            p.populate_set_out(msg, 'Tags')
                else:
                    publisher.info('{}Onion related;{}'.format(
                        to_print, PST.p_path))

            prec_filename = filename
        else:
            publisher.debug("Script url is Idling 10s")
            #print('Sleeping')
            time.sleep(10)
Example #18
class AbstractModule(ABC):
    """
    Abstract Module class
    """

    def __init__(self, module_name=None, queue_name=None, logger_channel='Script'):
        """
        Init Module
        module_name: str; set the module name if different from the instance ClassName
        queue_name: str; set the queue name if different from the instance ClassName
        logger_channel: str; set the logger channel name, 'Script' by default
        """
        # Module name if provided else instance className
        self.module_name = module_name if module_name else self._module_name()

        # Queue name if provided else instance className
        self.queue_name = queue_name if queue_name else self._module_name()

        # Init Redis Logger
        self.redis_logger = publisher

        # Port of the redis instance used by pubsublogger
        self.redis_logger.port = 6380

        # Channel name to publish logs
        # # TODO: refactor logging
        # If provided could be a namespaced channel like script:<ModuleName>
        self.redis_logger.channel = logger_channel


        # Run module endlessly
        self.proceed = True

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 10

        # Setup the I/O queues
        self.process = Process(self.queue_name)

    def get_message(self):
        """
        Get message from the Redis Queue (QueueIn)
        Input message can change between modules
        ex: '<item id>'
        """
        return self.process.get_from_set()

    def send_message_to_queue(self, message, queue_name=None):
        """
        Send message to queue
        :param message: message to send in queue
        :param queue_name: queue or module name

        ex: send_message_to_queue(item_id, 'Global')
        """
        self.process.populate_set_out(message, queue_name)

    def run(self):
        """
        Run Module endless process
        """

        # Endless loop processing messages from the input queue
        while self.proceed:
            # Get one message (ex:item id) from the Redis Queue (QueueIn)
            message = self.get_message()

            if message:
                try:
                    # Module processing with the message from the queue
                    self.compute(message)
                except Exception as err:
                    trace = traceback.format_tb(err.__traceback__)
                    self.redis_logger.critical(f"Error in module {self.module_name}: {err}")
                    self.redis_logger.critical(f"Module {self.module_name} input message: {message}")
                    self.redis_logger.critical(trace)
                    print()
                    print(f"ERROR: {err}")
                    print(f'MESSAGE: {message}')
                    print('TRACEBACK:')
                    for line in trace:
                        print(line)

            else:
                self.computeNone()
                # Wait before next process
                self.redis_logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
                time.sleep(self.pending_seconds)


    def _module_name(self):
        """
        Returns the instance class name (i.e. the Module Name)
        """
        return self.__class__.__name__


    @abstractmethod
    def compute(self, message):
        """
        Main method of the Module to implement
        """
        pass


    def computeNone(self):
        """
        Method of the Module when there is no message
        """
        pass
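
As a usage illustration of the AbstractModule pattern above, a minimal concrete module could look like the sketch below. The class name, the 1-second idle time, and the matching queue section in the AIL configuration are all hypothetical assumptions:

# assumes the AbstractModule class defined above is importable in this scope

class Dummy(AbstractModule):
    """Minimal example module: logs every item id it receives."""

    def __init__(self):
        super().__init__()
        # poll again after 1s instead of the default 10s when the queue is empty
        self.pending_seconds = 1

    def compute(self, message):
        # 'message' is whatever the input queue carries, e.g. an item id
        self.redis_logger.info(f'{self.module_name} received: {message}')


if __name__ == '__main__':
    module = Dummy()
    module.run()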
Example #19
                                        type_hidden_service, domain),
                                    'first_seen', date)
                            r_onion.hset(
                                '{}_metadata:{}'.format(
                                    type_hidden_service, domain), 'last_seen',
                                date)
                        else:
                            #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
                            if r_onion.sismember(
                                    'month_{}_up:{}'.format(
                                        type_hidden_service, date_month),
                                    domain) and r_serv_metadata.exists(
                                        'paste_children:' + paste):
                                msg = 'infoleak:automatic-detection="{}";{}'.format(
                                    type_hidden_service, paste)
                                p.populate_set_out(msg, 'Tags')

                        # last check
                        r_onion.hset(
                            '{}_metadata:{}'.format(type_hidden_service,
                                                    domain), 'last_check',
                            date)

                        # last_father
                        r_onion.hset(
                            '{}_metadata:{}'.format(type_hidden_service,
                                                    domain), 'paste_parent',
                            paste)

                        # add onion screenshot history
                        # add crawled days
Example #20
                to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
                                                    PST.p_name)
                if len(domains_list) > 0:

                    publisher.warning('{}Detected {} .onion(s);{}'.format(
                        to_print, len(domains_list), PST.p_path))
                    now = datetime.datetime.now()
                    path = os.path.join(
                        'onions',
                        str(now.year).zfill(4),
                        str(now.month).zfill(2),
                        str(now.day).zfill(2),
                        str(int(time.mktime(now.utctimetuple()))))
                    to_print = 'Onion;{};{};{};'.format(
                        PST.p_source, PST.p_date, PST.p_name)
                    for url in fetch(p, r_cache, urls, domains_list, path):
                        publisher.warning('{}Checked {};{}'.format(
                            to_print, url, PST.p_path))
                        p.populate_set_out('onion;{}'.format(PST.p_path),
                                           'BrowseWarningPaste')
                else:
                    publisher.info('{}Onion related;{}'.format(
                        to_print, PST.p_path))

            prec_filename = filename
        else:
            publisher.debug("Script url is Idling 10s")
            print('Sleeping')
            time.sleep(10)
        message = p.get_from_set()
Example #21
            message += ' Related websites: {}'.format((', '.join(sites_set)))

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source,
                                                      paste.p_date,
                                                      paste.p_name, message,
                                                      paste.p_path)

        print('\n '.join(creds))

        #num of creds above threshold, publish an alert
        if len(creds) > criticalNumberToAlert:
            print("========> Found more than 10 credentials in this file : {}".
                  format(filepath))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')
            #Send to alertHandler
            msg = 'credential;{}'.format(filepath)
            p.populate_set_out(msg, 'alertHandler')

            #Put in form, count occurrences, then send to moduleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites.keys():
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1

            for url in sites:
Example #22
    delta = date_to - date_from # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append( date.strftime('%Y%m%d') )
    return l_date

config_section = 'Global'
p = Process(config_section)

r_tags = redis.StrictRedis(
    host=p.config.get("ARDB_Tags", "host"),
    port=p.config.getint("ARDB_Tags", "port"),
    db=p.config.getint("ARDB_Tags", "db"),
    decode_responses=True)

tag = 'infoleak:automatic-detection="bitcoin-address"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)

# get all tagged items
for date in l_dates:
    daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date))

    for item in daily_tagged_items:
        p.populate_set_out(item)
Example #23
File: Categ.py  Project: Mrnmap/ALLInfo
        bname = os.path.basename(filename)
        tmp_dict[bname] = []
        with open(os.path.join(args.d, filename), 'r') as f:
            patterns = [r'%s' % (re.escape(s.strip())) for s in f]
            tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)

    prec_filename = None

    while True:
        filename = p.get_from_set()
        if filename is None:
            publisher.debug("Script Categ is Idling 10s")
            print('Sleeping')
            time.sleep(10)
            continue

        paste = Paste.Paste(filename)
        content = paste.get_p_content()

        for categ, pattern in tmp_dict.items():
            found = set(re.findall(pattern, content))
            if len(found) >= matchingThreshold:
                msg = '{} {}'.format(paste.p_rel_path, len(found))

                print(msg, categ)
                p.populate_set_out(msg, categ)

                publisher.info('Categ;{};{};{};Detected {} as {};{}'.format(
                    paste.p_source, paste.p_date, paste.p_name, len(found),
                    categ, paste.p_rel_path))
Example #24
    # FUNCTIONS #
    tmp_string = ("Lines script Subscribed to channel {} and Start to publish "
                  "on channel Longlines, Shortlines")

    publisher.info(tmp_string)

    while True:
        try:
            message = p.get_from_set()
            print(message)
            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Lines script is idling 10s")
                time.sleep(10)
                continue

            # FIXME do it in the paste class
            lines_infos = PST.get_lines_info()
            PST.save_attribute_redis("p_nb_lines", lines_infos[0])
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
            PST.store.sadd("Pastes_Objects", PST.p_path)
            if lines_infos[1] < args.max:
                p.populate_set_out(PST.p_path, 'LinesShort')
            else:
                p.populate_set_out(PST.p_path, 'LinesLong')
        except IOError:
            print("CRC Checksum Error on:", PST.p_path)
Example #25
                            domain = re.findall(url_regex, url)
                            if len(domain) > 0:
                                domain = domain[0][4]
                            else:
                                continue

                            if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
                                if not r_onion.sismember('onion_domain_crawler_queue', domain):
                                    print('send to onion crawler')
                                    r_onion.sadd('onion_domain_crawler_queue', domain)
                                    msg = '{};{}'.format(url,PST.p_path)
                                    r_onion.sadd('onion_crawler_queue', msg)
                                #p.populate_set_out(msg, 'Crawler')

                    else:
                        for url in fetch(p, r_cache, urls, domains_list, path):
                            publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
                            p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')

                            msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
                            p.populate_set_out(msg, 'Tags')
                else:
                    publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))

            prec_filename = filename
        else:
            publisher.debug("Script url is Idling 10s")
            #print('Sleeping')
            time.sleep(10)
        message = p.get_from_set()
Example #26
0
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, crawler_options, date, requested_mode, url,
                     domain, port, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4],
                                         date['date_day'][4:6],
                                         date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.arg_crawler = {
                'html': crawler_options['html'],
                'wait': 10,
                'render_all': 1,
                'har': crawler_options['har'],
                'png': crawler_options['png']
            }

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(
                self.p.config.get("Directories", "crawled"), date_str)

            self.crawled_paste_filemame = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "pastes"),
                self.p.config.get("Directories", "crawled"), date_str)

            self.crawled_har = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"),
                date_str)
            self.crawled_screenshot = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"))

        def start_requests(self):
            yield SplashRequest(self.start_urls,
                                self.parse,
                                errback=self.errback_catcher,
                                endpoint='render.json',
                                meta={
                                    'father': self.original_item,
                                    'root_key': None
                                },
                                args=self.arg_crawler)

        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if error_log['info']['text'] == 'Connection to proxy refused':
                    print('Connection to proxy refused')
            else:

                #avoid filename too big
                if len(self.domains[0]) > 215:
                    UUID = self.domains[0][-215:] + str(uuid.uuid4())
                else:
                    UUID = self.domains[0] + str(uuid.uuid4())
                filename_paste_full = os.path.join(self.crawled_paste_filemame,
                                                   UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_har = os.path.join(self.crawled_har, UUID)

                # # TODO: modify me
                # save new paste on disk
                if self.save_crawled_paste(relative_filename_paste,
                                           response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd(
                        '{}_up:{}'.format(self.type, self.full_date),
                        self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type),
                                           self.domains[0])
                    self.r_serv_onion.sadd(
                        'month_{}_up:{}'.format(self.type, self.date_month),
                        self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(
                            self.type, self.domains[0])):
                        self.r_serv_onion.hset(
                            '{}_metadata:{}'.format(self.type,
                                                    self.domains[0]),
                            'first_seen', self.full_date)

                    # create root_key
                    if self.root_key is None:
                        self.root_key = relative_filename_paste
                        # Create/Update crawler history
                        self.r_serv_onion.zadd(
                            'crawler_history_{}:{}:{}'.format(
                                self.type, self.domains[0], self.port),
                            self.date_epoch, self.root_key)
                        # Update domain port number
                        all_domain_ports = self.r_serv_onion.hget(
                            '{}_metadata:{}'.format(self.type,
                                                    self.domains[0]), 'ports')
                        if all_domain_ports:
                            all_domain_ports = all_domain_ports.split(';')
                        else:
                            all_domain_ports = []
                        if self.port not in all_domain_ports:
                            all_domain_ports.append(self.port)
                            self.r_serv_onion.hset(
                                '{}_metadata:{}'.format(
                                    self.type, self.domains[0]), 'ports',
                                ';'.join(all_domain_ports))

                    #create paste metadata
                    self.r_serv_metadata.hset(
                        'paste_metadata:{}'.format(relative_filename_paste),
                        'super_father', self.root_key)
                    self.r_serv_metadata.hset(
                        'paste_metadata:{}'.format(relative_filename_paste),
                        'father', response.meta['father'])
                    self.r_serv_metadata.hset(
                        'paste_metadata:{}'.format(relative_filename_paste),
                        'domain', '{}:{}'.format(self.domains[0], self.port))
                    self.r_serv_metadata.hset(
                        'paste_metadata:{}'.format(relative_filename_paste),
                        'real_link', response.url)

                    self.r_serv_metadata.sadd(
                        'paste_children:' + response.meta['father'],
                        relative_filename_paste)

                    if 'png' in response.data:
                        size_screenshot = (len(response.data['png']) * 3) / 4

                        if size_screenshot < 5000000 or self.requested_mode:  #bytes or manual/auto
                            image_content = base64.standard_b64decode(
                                response.data['png'].encode())
                            hash = sha256(image_content).hexdigest()
                            img_dir_path = os.path.join(
                                hash[0:2], hash[2:4], hash[4:6], hash[6:8],
                                hash[8:10], hash[10:12])
                            filename_img = os.path.join(
                                self.crawled_screenshot, 'screenshot',
                                img_dir_path, hash[12:] + '.png')
                            dirname = os.path.dirname(filename_img)
                            if not os.path.exists(dirname):
                                os.makedirs(dirname)
                            if not os.path.exists(filename_img):
                                with open(filename_img, 'wb') as f:
                                    f.write(image_content)
                            # add item metadata
                            self.r_serv_metadata.hset(
                                'paste_metadata:{}'.format(
                                    relative_filename_paste), 'screenshot',
                                hash)
                            # add sha256 metadata
                            self.r_serv_onion.sadd(
                                'screenshot:{}'.format(hash),
                                relative_filename_paste)

                    if 'har' in response.data:
                        dirname = os.path.dirname(filename_har)
                        if not os.path.exists(dirname):
                            os.makedirs(dirname)
                        with open(filename_har + '.json', 'wb') as f:
                            f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(link.url,
                                            self.parse,
                                            errback=self.errback_catcher,
                                            endpoint='render.json',
                                            meta={
                                                'father':
                                                relative_filename_paste,
                                                'root_key':
                                                response.meta['root_key']
                                            },
                                            args=self.arg_crawler)

        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                father = request.meta['father']

                self.logger.error(
                    'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                    url)
                time.sleep(10)
                # 'response' is not defined in this errback; recover root_key from the failed request's meta
                response_root_key = request.meta.get('root_key')
                yield SplashRequest(url,
                                    self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='render.json',
                                    meta={
                                        'father': father,
                                        'root_key': response_root_key
                                    },
                                    args=self.arg_crawler)

            else:
                print('failure')
                #print(failure)
                print(failure.type)
                #print(failure.request.meta['item'])
            '''
            #if isinstance(failure.value, HttpError):
            elif failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
            '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exist in submitted pastes'.format(
                    filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder",
                                           "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
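save_crawled_paste() above relays the page body as gzip-compressed, base64-encoded text on the Mixer queue. A standalone round-trip sketch of just that encoding step (the HTML string is a placeholder):

import base64
import gzip

html = '<html><body>example page</body></html>'

# Encode as save_crawled_paste() does before relaying the paste.
gzip64encoded = base64.standard_b64encode(gzip.compress(html.encode())).decode()

# Decode as the receiving side would before writing the paste to disk.
decoded = gzip.decompress(base64.standard_b64decode(gzip64encoded)).decode()
assert decoded == html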
Example #27
0
    url_regex = "("+uri_scheme+")\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"

    while True:
        if message is not None:
            filename, score = message.split()

            if prec_filename is None or filename != prec_filename:
                domains_list = []
                PST = Paste.Paste(filename)
                client = ip2asn()
                for x in PST.get_regex(url_regex):
                    matching_url = re.search(url_regex, PST.get_p_content())
                    url = matching_url.group(0)

                    to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                    p.populate_set_out(to_send, 'Url')

                    faup.decode(url)
                    domain = faup.get_domain()
                    subdomain = faup.get_subdomain()
                    f1 = None

                    domains_list.append(domain)

                    publisher.debug('{} Published'.format(url))

                    if f1 == "onion":
                        print domain

                    hostl = unicode(avoidNone(subdomain)+avoidNone(domain))
                    try:
Example #28
0
            all_cards = re.findall(regex, content)
            if len(all_cards) > 0:
                print 'All matching', all_cards
                creditcard_set = set([])

                for card in all_cards:
                    clean_card = re.sub('[^0-9]', '', card)
                    if lib_refine.is_luhn_valid(clean_card):
                        print clean_card, 'is valid'
                        creditcard_set.add(clean_card)

                paste.__setattr__(channel, creditcard_set)
                paste.save_attribute_redis(channel, creditcard_set)

                pprint.pprint(creditcard_set)
                to_print = 'CreditCard;{};{};{};'.format(
                    paste.p_source, paste.p_date, paste.p_name)
                if (len(creditcard_set) > 0):
                    publisher.warning('{}Checked {} valid number(s)'.format(
                        to_print, len(creditcard_set)))
                    #Send to duplicate
                    p.populate_set_out(filename, 'Duplicate')
                    #send to Browse_warning_paste
                    p.populate_set_out('creditcard;{}'.format(filename), 'BrowseWarningPaste')
                else:
                    publisher.info('{}CreditCard related'.format(to_print))
        else:
            publisher.debug("Script creditcard is idling 1m")
            time.sleep(10)
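The credit-card snippet above keeps only numbers that pass lib_refine.is_luhn_valid(). A standalone Luhn checksum check, equivalent in spirit to that helper (not the project's implementation), could look like:

def is_luhn_valid(card_number):
    """Return True if the digit string passes the Luhn checksum."""
    digits = [int(d) for d in str(card_number)]
    # Double every second digit from the right (excluding the check digit),
    # subtracting 9 whenever the doubled value exceeds 9.
    for i in range(len(digits) - 2, -1, -2):
        digits[i] *= 2
        if digits[i] > 9:
            digits[i] -= 9
    return sum(digits) % 10 == 0

assert is_luhn_valid('4111111111111111')       # well-known test number, valid
assert not is_luhn_valid('4111111111111112')   # last digit changed, invalid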

Example #29
0
    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
            on channel Longlines, Shortlines"
    publisher.info(tmp_string)

    while True:
        try:
            message = p.get_from_set()
            print(message)
            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Tokeniser is idling 10s")
                time.sleep(10)
                continue

            # FIXME do it in the paste class
            lines_infos = PST.get_lines_info()
            PST.save_attribute_redis("p_nb_lines", lines_infos[0])
            PST.save_attribute_redis("p_max_length_line", lines_infos[1])

            # FIXME Not used.
            PST.store.sadd("Pastes_Objects", PST.p_rel_path)
            print(PST.p_rel_path)
            if lines_infos[1] < args.max:
                p.populate_set_out(PST.p_rel_path, 'LinesShort')
            else:
                p.populate_set_out(PST.p_rel_path, 'LinesLong')
        except IOError:
            print("CRC Checksum Error on : ", PST.p_rel_path)
Example #30
0
    # Sent to the logging a description of the module
    publisher.info("Tags module started")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            publisher.debug("{} queue is empty, waiting 10s".format(config_section))
            time.sleep(10)
            continue

        else:
            tag, path = message.split(';')
            # add the tag to the tags word_list
            res = server.sadd('list_tags', tag)
            if res == 1:
                print("new tags added : {}".format(tag))
            # add the path to the tag set
            res = server.sadd(tag, path)
            if res == 1:
                print("new paste: {}".format(path))
                print("   tagged: {}".format(tag))
            server_metadata.sadd('tag:'+path, tag)

            curr_date = datetime.date.today()
            serv_statistics.hincrby(curr_date.strftime("%Y%m%d"),'paste_tagged:'+tag, 1)
            p.populate_set_out(message, 'MISP_The_Hive_feeder')
Example #31
0
                                                     new_file_md5)
                    else:
                        filename = '{}_{}'.format(filename, new_file_md5)

                    # continue if new file already exist
                    if os.path.isfile(filename):
                        print('ignore duplicated file')
                        continue

                    print('new file: {}'.format(filename))
                # ignore duplicate
                else:
                    print('ignore duplicated file')
                    continue

            # create subdir
            dirname = os.path.dirname(filename)
            if not os.path.exists(dirname):
                os.makedirs(dirname)

            with open(filename, 'wb') as f:
                f.write(decoded)

            paste = filename
            # remove PASTES_FOLDER from
            if PASTES_FOLDERS in paste:
                paste = paste.replace(PASTES_FOLDERS, '', 1)

            p.populate_set_out(paste)
            processed_paste += 1
Example #32
0
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [domain]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date )

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                #errback=self.errback_catcher,
                endpoint='render.json',
                meta={'father': self.original_paste},
                args={  'html': 1,
                        'wait': 10,
                        'render_all': 1,
                        'har': 1,
                        'png': 1}
            )

        def parse(self,response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if(error_log['info']['text'] == 'Connection to proxy refused'):
                    print('Connection to proxy refused')
            else:

                UUID = self.domains[0]+str(uuid.uuid4())
                filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

                # save new paste on disk
                if self.save_crawled_paste(filename_paste, response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)

                    #create paste metadata
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)

                    self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)

                    dirname = os.path.dirname(filename_screenshot)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)

                    size_screenshot = (len(response.data['png'])*3) /4

                    if size_screenshot < 5000000: #bytes
                        with open(filename_screenshot, 'wb') as f:
                            f.write(base64.standard_b64decode(response.data['png'].encode()))

                    with open(filename_screenshot+'har.txt', 'wb') as f:
                        f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            #errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste},
                            args={  'html': 1,
                                    'png': 1,
                                    'render_all': 1,
                                    'har': 1,
                                    'wait': 10}
                        )

        '''
        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))
            print('failure')
            #print(failure)
            print(failure.type)
            #print(failure.request.meta['item'])

            #if isinstance(failure.value, HttpError):
            if failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
        '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exist in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
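Both spider versions cap screenshots near 5 MB by estimating the decoded size as (len(b64) * 3) / 4 before actually decoding. A quick check of that estimate (the 1 MB payload is arbitrary):

import base64
import os

raw = os.urandom(1000000)               # 1 MB of arbitrary bytes
b64 = base64.standard_b64encode(raw)

estimated = (len(b64) * 3) / 4          # the spiders' pre-decode size estimate
print(len(raw), estimated)              # 1000000 vs 1000002.0 (padding overshoots by at most 2 bytes)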
Example #33
0
    delta = date_to - date_from  # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append(date.strftime('%Y%m%d'))
    return l_date


config_section = 'Global'
p = Process(config_section)

r_tags = redis.StrictRedis(host=p.config.get("ARDB_Tags", "host"),
                           port=p.config.getint("ARDB_Tags", "port"),
                           db=p.config.getint("ARDB_Tags", "db"),
                           decode_responses=True)

tag = 'infoleak:automatic-detection="bitcoin-address"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)

# get all tagged items
for date in l_dates:
    daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date))

    for item in daily_tagged_items:
        p.populate_set_out(item)
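Example #33 above opens with the tail of a date-range helper (substract_date) used to walk every day between a tag's first_seen and last_seen. A self-contained sketch of such a helper, assuming YYYYMMDD strings as input:

import datetime

def substract_date(date_from, date_to):
    """Return every day between two YYYYMMDD strings, inclusive, as YYYYMMDD strings."""
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from  # timedelta
    return [(date_from + datetime.timedelta(i)).strftime('%Y%m%d')
            for i in range(delta.days + 1)]

print(substract_date('20190228', '20190302'))
# ['20190228', '20190301', '20190302']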
Example #34
0
    url_regex = "(http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"

    while True:
        if message is not None:
            filename, score = message.split()

            if prec_filename is None or filename != prec_filename:
                domains_list = []
                PST = Paste.Paste(filename)
                client = ip2asn()
                for x in PST.get_regex(url_regex):
                    scheme, credential, subdomain, domain, host, tld, \
                        port, resource_path, query_string, f1, f2, f3, \
                        f4 = x
                    domains_list.append(domain)
                    p.populate_set_out(x, 'Url')
                    publisher.debug('{} Published'.format(x))

                    if f1 == "onion":
                        print domain

                    hostl = unicode(subdomain + domain)
                    try:
                        socket.setdefaulttimeout(2)
                        ip = socket.gethostbyname(unicode(hostl))
                    except:
                        # If the resolver is not giving any IPv4 address,
                        # ASN/CC lookup is skip.
                        continue

                    try:
Example #35
0
File: Mixer.py Project: Mrnmap/ALLInfo
                relay_message = "{0} {1}".format(paste_name, gzip64encoded)
                #relay_message = b" ".join( [paste_name, gzip64encoded] )

                digest = hashlib.sha1(gzip64encoded.encode('utf8')).hexdigest()

                # Avoid any duplicate coming from any sources
                if operation_mode == 1:
                    if server.exists(digest):  # Content already exists
                        #STATS
                        duplicated_paste_per_feeder[feeder_name] += 1
                    else:  # New content

                        # populate Global OR populate another set based on the feeder_name
                        if feeder_name in FEED_QUEUE_MAPPING:
                            p.populate_set_out(relay_message,
                                               FEED_QUEUE_MAPPING[feeder_name])
                        else:
                            p.populate_set_out(relay_message, 'Mixer')

                    server.sadd(digest, feeder_name)
                    server.expire(digest, ttl_key)

                # Keep duplicate coming from different sources
                elif operation_mode == 2:
                    # Filter to avoid duplicate
                    content = server.get('HASH_' + paste_name)
                    if content is None:
                        # New content
                        # Store in redis for filtering
                        server.set('HASH_' + paste_name, digest)
                        server.sadd(paste_name, feeder_name)
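In operation_mode == 1 above, the Mixer deduplicates feeds by hashing the relayed payload and keeping the digest in Redis for ttl_key seconds. A minimal sketch of that check (the host, db and TTL values are assumptions):

import hashlib
import redis

server = redis.StrictRedis(host='localhost', port=6379, db=1, decode_responses=True)
ttl_key = 86400  # example TTL; the real Mixer reads it from its config

def is_duplicate(paste_payload, feeder_name):
    """Return True if this exact payload was already relayed by any feeder."""
    digest = hashlib.sha1(paste_payload.encode('utf8')).hexdigest()
    already_seen = server.exists(digest)
    # Record which feeder sent it and refresh the digest's TTL either way.
    server.sadd(digest, feeder_name)
    server.expire(digest, ttl_key)
    return bool(already_seen)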
Example #36
0
                    r_serv2, PST.get_regex(email_regex))

                if MX_values[0] >= 1:

                    PST.__setattr__(channel, MX_values)
                    PST.save_attribute_redis(
                        channel, (MX_values[0], list(MX_values[1])))

                    pprint.pprint(MX_values)
                    to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\
                        format(PST.p_source, PST.p_date, PST.p_name,
                               MX_values[0], PST.p_path)
                    if MX_values[0] > is_critical:
                        publisher.warning(to_print)
                        #Send to duplicate
                        p.populate_set_out(filename, 'Duplicate')
                        p.populate_set_out('mail;{}'.format(filename),
                                           'BrowseWarningPaste')

                    else:
                        publisher.info(to_print)
                #Send to ModuleStats
                for mail in MX_values[1]:
                    print 'mail;{};{};{}'.format(1, mail, PST.p_date)
                    p.populate_set_out(
                        'mail;{};{};{}'.format(1, mail, PST.p_date),
                        'ModuleStats')

            prec_filename = filename

        else:
Example #37
0
                                    int(1)))

            # Add in set only if term is not in the blacklist
            if low_word not in server_term.smembers(BlackListTermsSet_Name):
                #consider the num of occurence of this term
                server_term.zincrby(curr_set, low_word, float(score))
                #1 term per paste
                server_term.zincrby("per_paste_" + curr_set, low_word,
                                    float(1))

            #Add more info for tracked terms
            check_if_tracked_term(low_word, filename)

            #send to RegexForTermsFrequency
            to_send = "{} {} {}".format(filename, timestamp, word)
            p.populate_set_out(to_send, 'RegexForTermsFrequency')

        else:

            if generate_new_graph:
                generate_new_graph = False
                print('Building graph')
                today = datetime.date.today()
                year = today.year
                month = today.month

                lib_words.create_curve_with_word_file(r_serv1, csv_path,
                                                      wordfile_path, year,
                                                      month)

            publisher.debug("Script Curve is Idling")
Example #38
0
def main():
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'DomClassifier'

    p = Process(config_section)
    addr_dns = p.config.get("DomClassifier", "dns")

    publisher.info("""ZMQ DomainClassifier is Running""")

    c = DomainClassifier.domainclassifier.Extract(rawtext="",
                                                  nameservers=[addr_dns])

    cc = p.config.get("DomClassifier", "cc")
    cc_tld = p.config.get("DomClassifier", "cc_tld")

    while True:
        try:
            item_id = p.get_from_set()

            if item_id is None:
                publisher.debug("Script DomClassifier is idling 1s")
                time.sleep(1)
                continue

            item_content = item_basic.get_item_content(item_id)
            mimetype = item_basic.get_item_mimetype(item_id)
            item_basename = item_basic.get_basename(item_id)
            item_source = item_basic.get_source(item_id)
            item_date = item_basic.get_item_date(item_id)

            if mimetype.split('/')[0] == "text":
                c.text(rawtext=item_content)
                c.potentialdomain()
                c.validdomain(passive_dns=True, extended=False)
                print(c.vdomain)

                if c.vdomain and d4.is_passive_dns_enabled():
                    for dns_record in c.vdomain:
                        p.populate_set_out(dns_record)

                localizeddomains = c.include(expression=cc_tld)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning(
                        f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {cc_tld};{item_id}"
                    )
                localizeddomains = c.localizedomain(cc=cc)

                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning(
                        f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {cc};{item_id}"
                    )

        except IOError:
            print("CRC Checksum Failed on :", item_id)
            publisher.error(
                f"Duplicate;{item_source};{item_date};{item_basename};CRC Checksum Failed"
            )
Example #39
0
File: Tags.py Project: wubic/ail-framework
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Tags'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("Tags module started")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            publisher.debug(
                "{} queue is empty, waiting 10s".format(config_section))
            time.sleep(10)
            continue

        else:
            print(message)
            tag, item_id = message.split(';')

            Tag.add_tag("item", tag, item_id)

            p.populate_set_out(message, 'MISP_The_Hive_feeder')
Example #40
0
if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = '<section name>'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("<description of the module>")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
        something_has_been_done = do_something(message)

        # (Optional) Send that thing to the next queue
        p.populate_set_out(something_has_been_done)
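The skeleton above is the generic AIL module template; do_something() is the placeholder where the real processing goes. A hypothetical filler, just to make the flow concrete:

def do_something(message):
    # Hypothetical processing step: a real module would load the item,
    # run its detection logic and build the message for the next queue.
    return 'processed;{}'.format(message)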
Example #41
0
        bname = os.path.basename(filename)
        tmp_dict[bname] = []
        with open(os.path.join(args.d, filename), 'r') as f:
            patterns = [r'%s' % re.escape(s.strip()) for s in f]
            tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)

    prec_filename = None

    while True:
        filename = p.get_from_set()
        if filename is None:
            publisher.debug("Script Categ is Idling 10s")
            print 'Sleeping'
            time.sleep(10)
            continue

        paste = Paste.Paste(filename)
        content = paste.get_p_content()

        for categ, pattern in tmp_dict.items():
            found = set(re.findall(pattern, content))
            if len(found) > 0:
                msg = '{} {}'.format(paste.p_path, len(found))
                print msg, categ
                p.populate_set_out(msg, categ)

                publisher.info(
                    'Categ;{};{};{};Detected {} as {}'.format(
                        paste.p_source, paste.p_date, paste.p_name,
                        len(found), categ))
Example #42
0
        sites_set = set(re.findall(regex_web, content))

        message = 'Checked {} credentials found.'.format(len(creds))
        if sites_set:
            message += ' Related websites: {}'.format(', '.join(sites_set))

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path)

        print('\n '.join(creds))

        #num of creds above tresh, publish an alert
        if len(creds) > criticalNumberToAlert:
            print("========> Found more than 10 credentials in this file : {}".format(filepath))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')
            #Send to alertHandler
            p.populate_set_out('credential;{}'.format(filepath), 'alertHandler')
            
            #Put in form, count occurences, then send to moduleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites.keys():
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1

            for url in sites:
                faup.decode(url)
Example #43
0
        if sites_set:
            message += ' Related websites: {}'.format(', '.join(sites_set))

        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source,
                                                      paste.p_date,
                                                      paste.p_name, message,
                                                      paste.p_path)

        print('\n '.join(creds))

        if len(creds) > critical:
            print("========> Found more than 10 credentials in this file : {}".
                  format(filepath))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(filepath, 'Duplicate')
            #Send to BrowseWarningPaste
            p.populate_set_out('credential;{}'.format(filepath),
                               'BrowseWarningPaste')

            #Put in form, count occurences, then send to moduleStats
            creds_sites = {}
            site_occurence = re.findall(regex_site_for_stats, content)
            for site in site_occurence:
                site_domain = site[1:-1]
                if site_domain in creds_sites.keys():
                    creds_sites[site_domain] += 1
                else:
                    creds_sites[site_domain] = 1

            for url in sites:
Example #44
0
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'PreProcessFeed'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("<description of the module>")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            publisher.debug(
                "{} queue is empty, waiting".format(config_section))
            print("queue empty")
            time.sleep(1)
            continue

        # Do something with the message from the queue
        new_message = do_something(message)

        # (Optional) Send that thing to the next queue
        p.populate_set_out(new_message)
Example #45
0
    url_regex = "(http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"

    while True:
        if message is not None:
            filename, score = message.split()

            if prec_filename is None or filename != prec_filename:
                domains_list = []
                PST = Paste.Paste(filename)
                client = ip2asn()
                for x in PST.get_regex(url_regex):
                    scheme, credential, subdomain, domain, host, tld, \
                        port, resource_path, query_string, f1, f2, f3, \
                        f4 = x
                    domains_list.append(domain)
                    p.populate_set_out(x, 'Url')
                    publisher.debug('{} Published'.format(x))

                    if f1 == "onion":
                        print domain

                    hostl = unicode(subdomain+domain)
                    try:
                        socket.setdefaulttimeout(2)
                        ip = socket.gethostbyname(unicode(hostl))
                    except:
                        # If the resolver is not giving any IPv4 address,
                        # ASN/CC lookup is skip.
                        continue

                    try:
Example #46
0
    # LOGGING #
    publisher.info("Feed Script started to receive & publish.")

    while True:

        message = p.get_from_set()
        # Recovering the streamed message informations.
        if message is not None:
            splitted = message.split()
            if len(splitted) == 2:
                paste, gzip64encoded = splitted
            else:
                # TODO Store the name of the empty paste inside a Redis-list.
                print "Empty Paste: not processed"
                publisher.debug("Empty Paste: {0} not processed".format(paste))
                continue
        else:
            print "Empty Queues: Waiting..."
            time.sleep(1)
            continue
        # Creating the full filepath
        filename = os.path.join(os.environ["AIL_HOME"], p.config.get("Directories", "pastes"), paste)
        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        with open(filename, "wb") as f:
            f.write(base64.standard_b64decode(gzip64encoded))
        p.populate_set_out(filename)
Example #47
0
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, splash_url, type, crawler_options, date,
                     requested_mode, url, domain, port, cookies, original_item,
                     *args, **kwargs):
            self.splash_url = splash_url
            self.domain_type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4],
                                         date['date_day'][4:6],
                                         date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.png = crawler_options['png']
            self.har = crawler_options['har']
            self.cookies = cookies

            config_section = 'Crawler'
            self.p = Process(config_section)
            self.item_dir = os.path.join(
                self.p.config.get("Directories", "crawled"), date_str)

            config_loader = ConfigLoader.ConfigLoader()
            self.har_dir = os.path.join(
                config_loader.get_files_directory('har'), date_str)
            config_loader = None

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.root_key = None

        def build_request_arg(self, cookies):
            return {
                'wait': 10,
                'resource_timeout':
                30,  # /!\ Weird behaviour if timeout < resource_timeout /!\
                'timeout': 30,
                'cookies': cookies,
                'lua_source': script_cookie
            }

        def start_requests(self):
            l_cookies = self.build_request_arg(self.cookies)
            yield SplashRequest(self.start_urls,
                                self.parse,
                                errback=self.errback_catcher,
                                endpoint='execute',
                                meta={
                                    'father': self.original_item,
                                    'current_url': self.start_urls
                                },
                                args=l_cookies)

        # # TODO: remove duplicate and anchor
        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # no response
                #print('504 detected')
                pass

            # LUA ERROR # # TODO: print/display errors
            elif 'error' in response.data:
                if (response.data['error'] == 'network99'):
                    ## splash restart ##
                    # scrapy exposes the originating request's meta via response.meta
                    error_retry = response.meta.get('error_retry', 0)
                    if error_retry < 3:
                        error_retry += 1
                        url = response.meta['current_url']
                        father = response.meta['father']

                        self.logger.error(
                            'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                            url)
                        time.sleep(10)
                        yield SplashRequest(url,
                                            self.parse,
                                            errback=self.errback_catcher,
                                            endpoint='execute',
                                            cache_args=['lua_source'],
                                            meta={
                                                'father': father,
                                                'current_url': url,
                                                'error_retry': error_retry
                                            },
                                            args=self.build_request_arg(
                                                response.cookiejar))
                    else:
                        print('Connection to proxy refused')
                else:
                    print(response.data['error'])

            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                print(error_log)
            #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
            #    pass # ignore response
            else:

                item_id = crawlers.create_item_id(self.item_dir,
                                                  self.domains[0])
                self.save_crawled_item(item_id, response.data['html'])
                crawlers.create_item_metadata(item_id, self.domains[0],
                                              response.data['last_url'],
                                              self.port,
                                              response.meta['father'])

                if self.root_key is None:
                    self.root_key = item_id
                    crawlers.add_domain_root_item(item_id, self.domain_type,
                                                  self.domains[0],
                                                  self.date_epoch, self.port)
                    crawlers.create_domain_metadata(self.domain_type,
                                                    self.domains[0], self.port,
                                                    self.full_date,
                                                    self.date_month)

                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                # SCREENSHOT
                if 'png' in response.data and self.png:
                    sha256_string = Screenshot.save_crawled_screeshot(
                        response.data['png'],
                        5000000,
                        f_save=self.requested_mode)
                    if sha256_string:
                        Screenshot.save_item_relationship(
                            sha256_string, item_id)
                        Screenshot.save_domain_relationship(
                            sha256_string, self.domains[0])
                # HAR
                if 'har' in response.data and self.har:
                    crawlers.save_har(self.har_dir, item_id,
                                      response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(link.url,
                                        self.parse,
                                        errback=self.errback_catcher,
                                        endpoint='execute',
                                        meta={
                                            'father': item_id,
                                            'current_url': link.url
                                        },
                                        args=l_cookies)

        def errback_catcher(self, failure):
            # catch and log all errback failures
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                ## DEBUG ##
                self.logger.error(failure.request)
                if failure.value.response:
                    self.logger.error(failure.value.response)
                ## ----- ##

                # Extract request metadata
                url = failure.request.meta['current_url']
                father = failure.request.meta['father']
                l_cookies = self.build_request_arg(
                    failure.request.meta['splash']['args']['cookies'])

                # Check whether the Splash instance is reachable (it may be restarting)
                if not crawlers.is_splash_reachable(self.splash_url):
                    self.logger.error(
                        'Splash, ResponseNeverReceived for %s, retry in 30s ...',
                        url)
                    time.sleep(30)

                yield SplashRequest(url,
                                    self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='execute',
                                    meta={
                                        'father': father,
                                        'current_url': url
                                    },
                                    args=l_cookies)

            else:
                self.logger.error(failure.type)
                self.logger.error(failure.getErrorMessage())

        def save_crawled_item(self, item_id, item_content):
            gzip64encoded = crawlers.save_crawled_item(item_id, item_content)

            # Send the item (paste) to the Global queue via the Mixer
            relay_message = "{0} {1}".format(item_id, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase the number of pastes counted for this feeder
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder",
                                           "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(item_id)
            self.p.populate_set_out(msg, 'Tags')
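Note: build_request_arg() is called throughout this example but is not part of the fragment. A minimal sketch of what such a helper could return is shown below; apart from 'cookies', the field names are standard Splash arguments and the lua_script parameter is an assumption, not taken from the fragment.

def build_request_arg(cookies, lua_script):
    # Hypothetical helper: bundle the cookies collected from previous
    # responses with basic timeouts and the Lua script required by the
    # Splash 'execute' endpoint.
    return {'wait': 10,
            'resource_timeout': 30,
            'timeout': 30,
            'cookies': cookies,
            'lua_source': lua_script}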
Example #48
0
                relay_message = "{0} {1}".format(paste_name, gzip64encoded)
                #relay_message = b" ".join( [paste_name, gzip64encoded] )

                digest = hashlib.sha1(gzip64encoded.encode('utf8')).hexdigest()

                # Avoid any duplicate coming from any source
                if operation_mode == 1:
                    if server.exists(digest): # Content already exists
                        #STATS
                        duplicated_paste_per_feeder[feeder_name] += 1
                    else: # New content

                        # populate Global OR populate another set based on the feeder_name
                        if feeder_name in FEED_QUEUE_MAPPING:
                            p.populate_set_out(relay_message, FEED_QUEUE_MAPPING[feeder_name])
                        else:
                            p.populate_set_out(relay_message, 'Mixer')

                    server.sadd(digest, feeder_name)
                    server.expire(digest, ttl_key)


                # Keep duplicate coming from different sources
                elif operation_mode == 2:
                    # Filter to avoid duplicate
                    content = server.get('HASH_'+paste_name)
                    if content is None:
                        # New content
                        # Store in redis for filtering
                        server.set('HASH_'+paste_name, digest)
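The fragment above drops exact duplicates by using a SHA-1 digest of the gzip+base64 payload as a Redis key shared across feeders. A self-contained sketch of that idea, assuming a local Redis instance and the redis-py client (names here are illustrative, not taken from the module):

import hashlib

import redis

server = redis.StrictRedis(host='localhost', port=6379, db=0)
ttl_key = 86400  # keep digests for one day

def is_duplicate(gzip64encoded, feeder_name):
    # Identical payloads hash to the same key, whichever feeder sent them.
    digest = hashlib.sha1(gzip64encoded.encode('utf8')).hexdigest()
    seen = server.exists(digest)
    # Remember which feeder(s) submitted this content and refresh the TTL.
    server.sadd(digest, feeder_name)
    server.expire(digest, ttl_key)
    return bool(seen)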
Example #49
0
        message = 'Checked {} credentials found.'.format(len(all_credentials))
        if all_sites:
            message += ' Related websites: {}'.format((', '.join(all_sites)))
        print(message)

        to_print = 'Credential;{};{};{};{};{}'.format(
            Item.get_source(item_id), Item.get_item_date(item_id),
            Item.get_item_basename(item_id), message, item_id)

        # number of credentials above the threshold, publish an alert
        if len(all_credentials) > criticalNumberToAlert:
            print("========> Found more than {} credentials in this file : {}".
                  format(criticalNumberToAlert, item_id))
            publisher.warning(to_print)
            #Send to duplicate
            p.populate_set_out(item_id, 'Duplicate')

            msg = 'infoleak:automatic-detection="credential";{}'.format(
                item_id)
            p.populate_set_out(msg, 'Tags')

            site_occurence = regex_helper.regex_findall(
                module_name,
                redis_cache_key,
                regex_site_for_stats,
                item_id,
                item_content,
                max_time=max_execution_time,
                r_set=False)

            creds_sites = {}
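The fragment stops right after creds_sites is initialised; in the full module each matched site is then tallied per domain. A minimal sketch of that counting step (collections.Counter is used here for brevity; stripping one character from each side mirrors the site[1:-1] pattern used elsewhere in this document):

from collections import Counter

def count_credential_sites(site_occurence):
    # Each regex match keeps one delimiter character on each side,
    # so strip them before counting occurrences per domain.
    return Counter(site[1:-1] for site in site_occurence)

# Example output: Counter({'example.com': 2, 'test.org': 1})
print(count_credential_sites(['(example.com)', '(example.com)', '(test.org)']))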
Example #50
0
                    clean_card = re.sub('[^0-9]', '', card)
                    if lib_refine.is_luhn_valid(clean_card):
                        print(clean_card, 'is valid')
                        creditcard_set.add(clean_card)

                paste.__setattr__(channel, creditcard_set)
                paste.save_attribute_redis(channel, creditcard_set)

                pprint.pprint(creditcard_set)
                to_print = 'CreditCard;{};{};{};'.format(
                    paste.p_source, paste.p_date, paste.p_name)
                if len(creditcard_set) > 0:
                    publisher.warning('{}Checked {} valid number(s);{}'.format(
                        to_print, len(creditcard_set), paste.p_path))
                    print('{}Checked {} valid number(s);{}'.format(
                        to_print, len(creditcard_set), paste.p_path))
                    #Send to duplicate
                    p.populate_set_out(filename, 'Duplicate')
                    # send an alert (alertHandler queue)
                    msg = 'creditcard;{}'.format(filename)
                    p.populate_set_out(msg, 'alertHandler')

                    msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename)
                    p.populate_set_out(msg, 'Tags')
                else:
                    publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path))
        else:
            publisher.debug("Script creditcard is idling 1m")
            time.sleep(10)
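lib_refine.is_luhn_valid() is called above but its implementation is not shown. A standalone Luhn checksum that could serve the same purpose (an illustrative sketch, not the lib_refine code):

def is_luhn_valid(card_number):
    # Luhn checksum: double every second digit from the right, subtract 9
    # from any result above 9, and require the total to be divisible by 10.
    digits = [int(d) for d in str(card_number)]
    checksum = 0
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        checksum += d
    return checksum % 10 == 0

print(is_luhn_valid('4111111111111111'))  # classic Luhn-valid test number -> True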