def __init__(self, module_name=None, queue_name=None, logger_channel='Script'):
    """
    Init Module
    module_name: str; set the module name if different from the instance ClassName
    queue_name: str; set the queue name if different from the instance ClassName
    logger_channel: str; set the logger channel name, 'Script' by default
    """
    # Module name if provided else instance className
    self.module_name = module_name if module_name else self._module_name()

    # Queue name if provided else instance className
    self.queue_name = queue_name if queue_name else self._module_name()

    # Init Redis Logger
    self.redis_logger = publisher

    # Port of the redis instance used by pubsublogger
    self.redis_logger.port = 6380

    # Channel name to publish logs
    # # TODO: refactor logging
    # If provided, it could be a namespaced channel like script:<ModuleName>
    self.redis_logger.channel = logger_channel

    # Run module endlessly
    self.proceed = True

    # Waiting time in seconds between two processed messages
    self.pending_seconds = 10

    # Setup the I/O queues
    self.process = Process(self.queue_name)
def __init__(self, module_name=None, queue_name=None):
    """
    Init Module
    module_name: str; set the module name if different from the instance ClassName
    queue_name: str; set the queue name if different from the instance ClassName
    """
    # Module name if provided else instance className
    self.module_name = module_name if module_name else self._module_name()

    # Queue name if provided else instance className
    self.queue_name = queue_name if queue_name else self._module_name()

    # Init Redis Logger
    self.redis_logger = publisher

    # Port of the redis instance used by pubsublogger
    self.redis_logger.port = 6380

    # Channel name to publish logs
    self.redis_logger.channel = 'Script'
    # TODO modify the generic 'Script' channel to a namespaced channel like:
    # publish module logs to the script:<ModuleName> channel
    # self.redis_logger.channel = 'script:%s' % (self.module_name)

    # Run module endlessly
    self.proceed = True

    # Waiting time in seconds between two processed messages
    self.pending_seconds = 10

    # Setup the I/O queues
    self.process = Process(self.queue_name)
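# A minimal, self-contained sketch (not the real AIL AbstractModule) of the
# "name defaults to the class name" pattern used in the two __init__ above.
# DemoModule, its _module_name() helper and the Phone subclass are hypothetical
# names used only for illustration.
class DemoModule:
    def __init__(self, module_name=None, queue_name=None, logger_channel='Script'):
        # Fall back to the concrete class name when no explicit name is given
        self.module_name = module_name if module_name else self._module_name()
        self.queue_name = queue_name if queue_name else self._module_name()
        self.logger_channel = logger_channel

    def _module_name(self):
        # Name of the instantiated (sub)class, e.g. 'Phone'
        return self.__class__.__name__


class Phone(DemoModule):
    pass


if __name__ == '__main__':
    m = Phone()
    print(m.module_name, m.queue_name)   # Phone Phone
    n = Phone(queue_name='Phone2')
    print(n.module_name, n.queue_name)   # Phone Phone2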
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
    self.domain_type = type
    self.requested_mode = requested_mode
    self.original_item = original_item
    self.root_key = None
    self.start_urls = url
    self.domains = [domain]
    self.port = str(port)
    date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
    self.full_date = date['date_day']
    self.date_month = date['date_month']
    self.date_epoch = int(date['epoch'])

    self.png = crawler_options['png']
    self.har = crawler_options['har']
    self.cookies = cookies

    config_section = 'Crawler'
    self.p = Process(config_section)
    self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str)
    self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str)

    self.r_serv_log_submit = redis.StrictRedis(
        host=self.p.config.get("Redis_Log_submit", "host"),
        port=self.p.config.getint("Redis_Log_submit", "port"),
        db=self.p.config.getint("Redis_Log_submit", "db"),
        decode_responses=True)

    self.root_key = None
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): self.type = type self.requested_mode = requested_mode self.original_item = original_item self.root_key = None self.start_urls = url self.domains = [domain] self.port = str(port) date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) self.full_date = date['date_day'] self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) # # TODO: timeout in config self.arg_crawler = { 'html': crawler_options['html'], 'wait': 10, 'render_all': 1, 'timeout': 30, 'har': crawler_options['har'], 'png': crawler_options['png']} config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date_str ) self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
def main():
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'DomClassifier'

    p = Process(config_section)
    addr_dns = p.config.get("DomClassifier", "dns")

    publisher.info("""ZMQ DomainClassifier is Running""")

    c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns])

    cc = p.config.get("DomClassifier", "cc")
    cc_tld = p.config.get("DomClassifier", "cc_tld")

    while True:
        try:
            message = p.get_from_set()

            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Script DomClassifier is idling 1s")
                time.sleep(1)
                continue

            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()

            if mimetype == "text/plain":
                c.text(rawtext=paste)
                c.potentialdomain()
                c.validdomain(rtype=['A'], extended=True)
                localizeddomains = c.include(expression=cc_tld)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path))
                localizeddomains = c.localizedomain(cc=cc)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path))
        except IOError:
            print("CRC Checksum Failed on :", PST.p_path)
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))
def __init__(self, type, url, domain, original_paste, super_father, *args, **kwargs): self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join( self.p.config.get("Directories", "crawled"), date) self.crawled_paste_filemame = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date) self.crawled_screenshot = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date)
def setUp(self):
    self.paste = Paste('../samples/2018/01/01/keys_certificat_sample.gz')

    # Section name in bin/packages/modules.cfg
    self.config_section = 'Keys'

    # Setup the I/O queues
    p = Process(self.config_section)
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): self.type = type self.requested_mode = requested_mode self.original_item = original_item self.root_key = None self.start_urls = url self.domains = [domain] self.port = str(port) date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) self.full_date = date['date_day'] self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) self.arg_crawler = { 'html': crawler_options['html'], 'wait': 10, 'render_all': 1, 'har': crawler_options['har'], 'png': crawler_options['png']} config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date_str ) self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
def search_phone(message):
    paste = Paste.Paste(message)
    content = paste.get_p_content()
    # regex to find phone numbers; may raise many false positives and still needs optimization
    reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
    # list of the regex results in the Paste, may be empty
    results = reg_phone.findall(content)

    # if the list is greater than 4, we consider the Paste may contain a list of phone numbers
    if len(results) > 4:
        print(results)
        publisher.warning('{} contains PID (phone numbers)'.format(paste.p_name))


if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Phone'

    # Setup the I/O queues
    p = Process(config_section)

    # Send to the logging system a description of the module
    publisher.info("Run Phone module")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
        search_phone(message)
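# Hedged, standalone illustration of the phone-number regex used by the Phone
# module above. The sample text is made up; the "> 4 matches" check mirrors the
# module's heuristic for flagging a paste as a list of phone numbers.
import re

reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')

sample = """
Contact: 0612345678 or 01 23 45 67 89
Backup lines: 06-12-34-56-78, 07 11 22 33 44
Intl: +33612345678
"""

matches = reg_phone.findall(sample)
print(len(matches))   # expected: 5 matches in this sample
if len(matches) > 4:
    print('would be flagged as a list of phone numbers')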
def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs): self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date ) self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date ) self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )
import os
import argparse
import time
import re
from pubsublogger import publisher
from packages import Paste
from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Categ'

    p = Process(config_section)
    matchingThreshold = p.config.getint("Categ", "matchingThreshold")

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(description='Start Categ module on files.')

    parser.add_argument(
        '-d', type=str, default="../files/",
        help='Path to the directory containing the category files.',
        action='store')

    args = parser.parse_args()
#!/usr/bin/env python2
# -*-coding:UTF-8 -*

import time
import sys
from packages import Paste
from pubsublogger import publisher
from Helper import Process
import re
from pyfaup.faup import Faup

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    faup = Faup()

    critical = 8

    regex_web = "((?:https?:\/\/)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
    regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("Script Credential is Idling 10s")
            time.sleep(10)
            continue
# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Web'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # Country to log as critical
    cc_critical = p.config.get("Url", "cc_critical")

    # FUNCTIONS #
    publisher.info("Script URL subscribed to channel web_categ")

    # FIXME For retro compatibility
    channel = 'web_categ'
sys.path.append(os.environ['AIL_BIN']) from Helper import Process def substract_date(date_from, date_to): date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8])) date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8])) delta = date_to - date_from # timedelta l_date = [] for i in range(delta.days + 1): date = date_from + datetime.timedelta(i) l_date.append( date.strftime('%Y%m%d') ) return l_date config_section = 'Global' p = Process(config_section) r_tags = redis.StrictRedis( host=p.config.get("ARDB_Tags", "host"), port=p.config.getint("ARDB_Tags", "port"), db=p.config.getint("ARDB_Tags", "db"), decode_responses=True) tag = 'infoleak:automatic-detection="bitcoin-address"' # get tag first/last seen first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen') last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen') l_dates = substract_date(first_seen, last_seen)
# split the username on special characters or on case change; distinguish tokens starting with an uppercase letter
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"
REDIS_KEY_NUM_USERNAME = '******'
REDIS_KEY_NUM_PATH = 'uniqNumForUsername'
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold")

    faup = Faup()
    server_cred = redis.StrictRedis(
        host=p.config.get("ARDB_TermCred", "host"),
        port=p.config.get("ARDB_TermCred", "port"),
        db=p.config.get("ARDB_TermCred", "db"),
        decode_responses=True)

    criticalNumberToAlert = p.config.getint("Credential", "criticalNumberToAlert")
    minTopPassList = p.config.getint("Credential", "minTopPassList")
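# Hedged illustration of how the REGEX_CRED tokenizer above splits a username
# into lowercase runs, uppercase runs, capitalized words and digit runs.
# The sample usernames are made up.
import re

REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"

print(re.findall(REGEX_CRED, "JohnDoe99"))      # ['John', 'Doe', '99']
print(re.findall(REGEX_CRED, "ADMINuser2020"))  # ['ADMIN', 'user', '2020']
print(re.findall(REGEX_CRED, "j.doe_42"))       # ['j', 'doe', '42']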
    return valid_mxdomain


def extract_all_emails(queue, item_content):
    queue.put(re.findall(email_regex, item_content))


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Mail'

    faup = Faup()

    p = Process(config_section)
    publisher.info("Mails module started")

    # Number of mails needed before tagging
    mail_threshold = 10

    max_execution_time = 30

    email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"

    q = Queue()

    while True:
        message = p.get_from_set()
sys.path.append(os.environ['AIL_BIN']) from Helper import Process def substract_date(date_from, date_to): date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8])) date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8])) delta = date_to - date_from # timedelta l_date = [] for i in range(delta.days + 1): date = date_from + datetime.timedelta(i) l_date.append( date.strftime('%Y%m%d') ) return l_date config_section = 'Keys' p = Process(config_section) r_tags = redis.StrictRedis( host=p.config.get("ARDB_Tags", "host"), port=p.config.getint("ARDB_Tags", "port"), db=p.config.getint("ARDB_Tags", "db"), decode_responses=True) tag = 'infoleak:automatic-detection="pgp-message"' # get tag first/last seen first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen') last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen') l_dates = substract_date(first_seen, last_seen)
def getValueOverRange(word, startDate, num_day):
    to_return = 0
    for timestamp in range(startDate, startDate - num_day * oneDay, -oneDay):
        value = server_term.hget(timestamp, word)
        to_return += int(value) if value is not None else 0
    return to_return


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = 'Curve'
    p = Process(config_section)

    # REDIS #
    r_serv1 = redis.StrictRedis(
        host=p.config.get("ARDB_Curve", "host"),
        port=p.config.get("ARDB_Curve", "port"),
        db=p.config.get("ARDB_Curve", "db"),
        decode_responses=True)

    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script Curve started")
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
import time
from packages import Paste
from pubsublogger import publisher
from Helper import Process
import re

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Release"
    p = Process(config_section)
    publisher.info("Release scripts to find release names")

    movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+"
    tv = "[a-zA-Z0-9.]+\.S[0-9]{2}E[0-9]{2}.[a-zA-Z0-9.]+\.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"
    xxx = "[a-zA-Z0-9._]+.XXX.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"

    regexs = [movie, tv, xxx]

    regex = '|'.join(regexs)

    while True:
        filepath = p.get_from_set()
        if filepath is None:
            publisher.debug("Script Release is Idling 10s")
            print('Sleeping')
            time.sleep(10)
            continue

        paste = Paste.Paste(filepath)
def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2]  # remove the trailing ", "
    quoted_tab += "]"
    return str(quoted_tab)


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # get the dico and matching percent
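# Hedged usage illustration of add_quote_inside_tab() above: it turns a
# bracketed, comma-separated string into the same list with each element
# single-quoted. Re-declared here only so the example runs on its own.
def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        quoted_tab += "'{}', ".format(elem.strip())
    quoted_tab = quoted_tab[:-2]  # drop the trailing ", "
    quoted_tab += "]"
    return quoted_tab


print(add_quote_inside_tab("[leak, dump, database]"))
# -> ['leak', 'dump', 'database']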
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs): self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date ) self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date ) self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date ) def start_requests(self): yield SplashRequest( self.start_urls, self.parse, #errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_paste}, args={ 'html': 1, 'wait': 10, 'render_all': 1, 'har': 1, 'png': 1} ) def parse(self,response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if(error_log['info']['text'] == 'Connection to proxy refused'): print('Connection to proxy refused') else: UUID = self.domains[0]+str(uuid.uuid4()) filename_paste = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) #create paste metadata self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father']) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0]) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) size_screenshot = (len(response.data['png'])*3) /4 if size_screenshot < 5000000: #bytes with open(filename_screenshot, 'wb') as f: f.write(base64.standard_b64decode(response.data['png'].encode())) with open(filename_screenshot+'har.txt', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): yield SplashRequest( link.url, self.parse, #errback=self.errback_catcher, endpoint='render.json', meta={'father': relative_filename_paste}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'har': 1, 'wait': 10} ) ''' def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) print('failure') #print(failure) print(failure.type) #print(failure.request.meta['item']) #if isinstance(failure.value, HttpError): if failure.check(HttpError): # you can get the response response = failure.value.response print('HttpError') self.logger.error('HttpError on %s', response.url) #elif isinstance(failure.value, DNSLookupError): elif failure.check(DNSLookupError): # this is the original request request = failure.request print(DNSLookupError) print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) ''' def save_crawled_paste(self, filename, content): if os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format(filename)) return False try: gzipencoded = gzip.compress(content.encode()) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: print("file error: {}".format(filename)) return False # send paste to Global relay_message = "{0} {1}".format(filename, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name 
self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(filename) self.p.populate_set_out(msg, 'Tags') return True
from pubsublogger import publisher
from Helper import Process
from packages import Paste

if __name__ == '__main__':

    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Tags'

    # Setup the I/O queues
    p = Process(config_section)

    server = redis.StrictRedis(
        host=p.config.get("ARDB_Tags", "host"),
        port=p.config.get("ARDB_Tags", "port"),
        db=p.config.get("ARDB_Tags", "db"),
        decode_responses=True)

    server_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.get("ARDB_Metadata", "port"),
        db=p.config.get("ARDB_Metadata", "db"),
        decode_responses=True)

    serv_statistics = redis.StrictRedis(
        host=p.config.get('ARDB_Statistics', 'host'),
import time
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
import re

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = 'CreditCards'
    p = Process(config_section)

    # FUNCTIONS #
    publisher.info("Creditcard script subscribed to channel creditcard_categ")

    creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?"

    # FIXME For retro compatibility
    channel = 'creditcard_categ'

    # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
    cards = [
        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit MasterCard
        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
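# Hedged illustration of the VISA pattern from the card regex list above, run
# against the well-known 4111... test number (not a real card).
import re

visa_regex = r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b'

text = "test numbers: 4111 1111 1111 1111 and 4111111111111111"
print(re.findall(visa_regex, text))
# -> ['4111 1111 1111 1111', '4111111111111111']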
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, type, url, domain, original_paste, super_father, *args, **kwargs): self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join( self.p.config.get("Directories", "crawled"), date) self.crawled_paste_filemame = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date) self.crawled_screenshot = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date) def start_requests(self): yield SplashRequest(self.start_urls, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_paste}, args={ 'html': 1, 'wait': 10, 'render_all': 1, 'har': 1, 'png': 1 }) def parse(self, response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if (error_log['info']['text'] == 'Connection to proxy refused' ): print('Connection to proxy refused') else: #avoid filename too big if len(self.domains[0]) > 215: UUID = self.domains[0][-215:] + str(uuid.uuid4()) else: UUID = self.domains[0] + str(uuid.uuid4()) filename_paste = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_screenshot = os.path.join(self.crawled_screenshot, UUID + '.png') # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd( '{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd( 'month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format( self.type, self.domains[0])): self.r_serv_onion.hset( '{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) self.r_serv_onion.hset( '{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) #create paste metadata self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'super_father', self.super_father) self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'father', response.meta['father']) self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'domain', self.domains[0]) self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'real_link', response.url) self.r_serv_metadata.sadd( 'paste_children:' + response.meta['father'], filename_paste) dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) size_screenshot = (len(response.data['png']) * 3) / 4 if size_screenshot < 5000000: #bytes with open(filename_screenshot, 'wb') as f: f.write( base64.standard_b64decode( response.data['png'].encode())) with open(filename_screenshot + 'har.txt', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): yield SplashRequest( link.url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': relative_filename_paste}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'har': 1, 'wait': 10 }) def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) if failure.check(ResponseNeverReceived): request = failure.request url = request.meta['splash']['args']['url'] father = request.meta['father'] self.logger.error( 'Splash, ResponseNeverReceived for %s, retry in 10s ...', url) time.sleep(10) yield SplashRequest(url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': father}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'har': 1, 'wait': 10 }) else: print('failure') #print(failure) print(failure.type) #print(failure.request.meta['item']) ''' #if isinstance(failure.value, HttpError): elif failure.check(HttpError): # you can get the response response = failure.value.response print('HttpError') self.logger.error('HttpError on %s', response.url) #elif isinstance(failure.value, DNSLookupError): elif failure.check(DNSLookupError): # this is the original request request = failure.request print(DNSLookupError) print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) ''' def save_crawled_paste(self, filename, content): if 
os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format( filename)) return False try: gzipencoded = gzip.compress(content.encode()) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: print("file error: {}".format(filename)) return False # send paste to Global relay_message = "{0} {1}".format(filename, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(filename) self.p.populate_set_out(msg, 'Tags') return True
from pubsublogger import publisher
from Helper import Process
from packages import Tag

if __name__ == '__main__':

    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Tags'

    # Setup the I/O queues
    p = Process(config_section)

    # Send to the logging system a description of the module
    publisher.info("Tags module started")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            publisher.debug("{} queue is empty, waiting 10s".format(config_section))
            time.sleep(10)
            continue
# split the username on special characters or on case change; distinguish tokens starting with an uppercase letter
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"
REDIS_KEY_NUM_USERNAME = '******'
REDIS_KEY_NUM_PATH = 'uniqNumForUsername'
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    module_name = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    faup = Faup()

    regex_web = "((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    #regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
    regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
    regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"

    redis_cache_key = regex_helper.generate_redis_cache_key(module_name)

    while True:
        message = p.get_from_set()
        if message is None:
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'SentimentAnalysis'

    # Setup the I/O queues
    p = Process(config_section)

    # Send to the logging system a description of the module
    publisher.info("<description of the module>")

    # REDIS_LEVEL_DB #
    server = redis.StrictRedis(
        host=p.config.get("ARDB_Sentiment", "host"),
        port=p.config.get("ARDB_Sentiment", "port"),
        db=p.config.get("ARDB_Sentiment", "db"),
        decode_responses=True)

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
from packages import lib_refine
from pubsublogger import publisher
from pyfaup.faup import Faup

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Mail'

    faup = Faup()

    p = Process(config_section)
    addr_dns = p.config.get("Mail", "dns")

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    # ARDB #
    server_statistics = redis.StrictRedis(
        host=p.config.get("ARDB_Statistics", "host"),
        port=p.config.getint("ARDB_Statistics", "port"),
        db=p.config.getint("ARDB_Statistics", "db"),
        decode_responses=True)
    'dash': {
        'name': 'dash',
        # e.g. XmNfXq2kDmrNBTiDTofohRemwGur1WmgTT
        'regex': r'\b(?<![+/=])X[A-Za-z0-9]{33}(?![+/=])\b',
        'max_execution_time': default_max_execution_time,
        'tag': 'infoleak:automatic-detection="dash-address"',
    }
}

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Bitcoin'

    # Setup the I/O queues
    p = Process(config_section)

    # Send to the logging system a description of the module
    publisher.info("Run Cryptocurrency module")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        item_id = p.get_from_set()
        if item_id is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
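# Hedged illustration of the dash-address pattern defined above, using the
# sample address already quoted in its comment.
import re

dash_regex = r'\b(?<![+/=])X[A-Za-z0-9]{33}(?![+/=])\b'

text = "donation wallet: XmNfXq2kDmrNBTiDTofohRemwGur1WmgTT (dash)"
print(re.findall(dash_regex, text))
# -> ['XmNfXq2kDmrNBTiDTofohRemwGur1WmgTT']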
def substract_date(date_from, date_to):
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from  # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append(date.strftime('%Y%m%d'))
    return l_date


config_section = 'Global'
p = Process(config_section)

r_tags = redis.StrictRedis(
    host=p.config.get("ARDB_Tags", "host"),
    port=p.config.getint("ARDB_Tags", "port"),
    db=p.config.getint("ARDB_Tags", "db"),
    decode_responses=True)

tag = 'infoleak:automatic-detection="bitcoin-address"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)

# get all tagged items
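# Hedged, standalone illustration of substract_date(): it expands an inclusive
# YYYYMMDD range into the list of days between a tag's first_seen and last_seen.
# The sample dates are made up.
import datetime


def substract_date(date_from, date_to):
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from
    return [(date_from + datetime.timedelta(i)).strftime('%Y%m%d') for i in range(delta.days + 1)]


print(substract_date('20181230', '20190102'))
# -> ['20181230', '20181231', '20190101', '20190102']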
import time from packages import Paste from packages import lib_refine from pubsublogger import publisher import re import sys from Helper import Process if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" config_section = 'CreditCards' p = Process(config_section) # FUNCTIONS # publisher.info("Creditcard script subscribed to channel creditcard_categ") creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?" # FIXME For retro compatibility channel = 'creditcard_categ' # Source: http://www.richardsramblings.com/regex/credit-card-numbers/ cards = [ r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # 16-digit VISA, with separators r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # 16 digits MasterCard r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # Discover Card
            failed.append(url)
            print('Failed at downloading', url)
            print(process.stdout.read())
    print('Failed:', len(failed), 'Downloaded:', len(downloaded))


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    torclient_host = '127.0.0.1'
    torclient_port = 9050

    config_section = 'Onion'

    p = Process(config_section)
    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script subscribed to channel onion_categ")

    # FIXME For retro compatibility
    channel = 'onion_categ'
def timeout_handler(signum, frame):
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)

'''
This module takes its input from the global module.
It applies some regexes and publishes matched content.
'''

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Release"
    p = Process(config_section)
    max_execution_time = p.config.getint("Curve", "max_execution_time")
    publisher.info("Release scripts to find release names")

    movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+"
    tv = "[a-zA-Z0-9.]+\.S[0-9]{2}E[0-9]{2}.[a-zA-Z0-9.]+\.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"
    xxx = "[a-zA-Z0-9._]+.XXX.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"

    regexs = [movie, tv, xxx]

    regex = '|'.join(regexs)

    while True:
        signal.alarm(max_execution_time)
        filepath = p.get_from_set()
        if filepath is None:
            publisher.debug("Script Release is Idling 10s")
from Helper import Process

# CONFIG #
refresh_time = 30
FEED_QUEUE_MAPPING = {"feeder2": "preProcess1"}  # Map a feeder name to a pre-processing module

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Mixer'

    p = Process(config_section)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # REDIS #
    server = redis.StrictRedis(
        host=cfg.get("Redis_Mixer_Cache", "host"),
        port=cfg.getint("Redis_Mixer_Cache", "port"),
        db=cfg.getint("Redis_Mixer_Cache", "db"),
        decode_responses=True)
    msg = 'infoleak:automatic-detection="'+decoder_name+'";{}'.format(message)
    p.populate_set_out(msg, 'Tags')


if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Decoder'

    # Setup the I/O queues
    p = Process(config_section)

    serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    # Send to the logging system a description of the module
    publisher.info("Decoder started")

    regex_binary = '[0-1]{40,}'
    #regex_hex = '(0[xX])?[A-Fa-f0-9]{40,}'
    regex_hex = '[A-Fa-f0-9]{40,}'
    regex_base64 = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)'
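# Hedged illustration of the base64 pattern above: build a long base64 blob,
# locate it with the regex, and decode it back. The surrounding text and the
# payload are made up.
import re
import base64

regex_base64 = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)'

blob = base64.b64encode(b'secret payload hidden inside a paste!').decode()
text = 'some paste content ' + blob + ' more content'

for match in re.findall(regex_base64, text):
    print(base64.b64decode(match))
# -> b'secret payload hidden inside a paste!'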
import pprint
import time
import dns.exception
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Mail'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # FUNCTIONS #
    publisher.info("Subscribed to channel mails_categ")

    # FIXME For retro compatibility
    channel = 'mails_categ'

    message = p.get_from_set()
    prec_filename = None
def run(config_section):
    p = Process(config_section)
    if not p.publish():
        print(config_section, 'has no publisher.')
print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port') exit(1) type_hidden_service = sys.argv[1] splash_port = sys.argv[2] publisher.port = 6380 publisher.channel = "Script" publisher.info("Script Crawler started") config_section = 'Crawler' # Setup the I/O queues p = Process(config_section) url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" re.compile(url_onion) url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" re.compile(url_i2p) if type_hidden_service == 'onion': regex_hidden_service = url_onion splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"), splash_port) elif type_hidden_service == 'i2p': regex_hidden_service = url_i2p splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_i2p"), splash_port) elif type_hidden_service == 'regular':
def do_something(message):
    return None


if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = '<section name>'

    # Setup the I/O queues
    p = Process(config_section)

    # Send to the logging system a description of the module
    publisher.info("<description of the module>")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
        something_has_been_done = do_something(message)
import signal


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)

        if message is not None:
            paste = Paste.Paste(message)
            signal.alarm(5)
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
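# Hedged, standalone sketch of the SIGALRM timeout pattern used by the Tokenize
# module: guard a potentially slow step with signal.alarm() and cancel the alarm
# when the work finishes (Unix only). The slow_tokenize() function is made up.
import signal
import time


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)


def slow_tokenize(text):
    time.sleep(2)            # stand-in for an expensive tokenization step
    return text.split()


signal.alarm(5)              # raise TimeoutException after 5 seconds
try:
    print(slow_tokenize("some paste content to tokenize"))
except TimeoutException:
    print("tokenization timed out")
finally:
    signal.alarm(0)          # cancel any pending alarm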
import os import argparse import time import re from pubsublogger import publisher from packages import Paste from Helper import Process if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" config_section = 'Categ' p = Process(config_section) # SCRIPT PARSER # parser = argparse.ArgumentParser(description='Start Categ module on files.') parser.add_argument( '-d', type=str, default="../files/", help='Path to the directory containing the category files.', action='store') args = parser.parse_args() # FUNCTIONS # publisher.info("Script Categ started") categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential']
""" import base64 import os import time from pubsublogger import publisher from Helper import Process if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" config_section = "Global" p = Process(config_section) # LOGGING # publisher.info("Feed Script started to receive & publish.") while True: message = p.get_from_set() # Recovering the streamed message informations. if message is not None: splitted = message.split() if len(splitted) == 2: paste, gzip64encoded = splitted else: # TODO Store the name of the empty paste inside a Redis-list. print "Empty Paste: not processed"
        exit(1)
##################################################
    #mode = sys.argv[1]
    splash_port = sys.argv[1]

    rotation_mode = deque(['onion', 'regular'])
    default_proto_map = {'http': 80, 'https': 443}
    ######################################################## add ftp ???

    publisher.port = 6380
    publisher.channel = "Script"

    publisher.info("Script Crawler started")

    config_section = 'Crawler'

    # Setup the I/O queues
    p = Process(config_section)

    splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url"), splash_port)
    print('splash url: {}'.format(splash_url))

    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))

    r_serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
import configparser from Helper import Process # CONFIG # refresh_time = 30 FEED_QUEUE_MAPPING = { "feeder2": "preProcess1" } # Map a feeder name to a pre-processing module if __name__ == '__main__': publisher.port = 6380 publisher.channel = 'Script' config_section = 'Mixer' p = Process(config_section) configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') if not os.path.exists(configfile): raise Exception('Unable to find the configuration file. \ Did you set environment variables? \ Or activate the virtualenv.') cfg = configparser.ConfigParser() cfg.read(configfile) # REDIS # server = redis.StrictRedis( host=cfg.get("Redis_Mixer_Cache", "host"), port=cfg.getint("Redis_Mixer_Cache", "port"), db=cfg.getint("Redis_Mixer_Cache", "db"),
def test_Process_Constructor_using_key_module(self):
    conf_section = 'Keys'
    process = Process(conf_section)
    self.assertEqual(process.subscriber_name, 'Keys')
#split username with spec. char or with upper case, distinguish start with upper REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+" REDIS_KEY_NUM_USERNAME = '******' REDIS_KEY_NUM_PATH = 'uniqNumForUsername' REDIS_KEY_ALL_CRED_SET = 'AllCredentials' REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev' REDIS_KEY_ALL_PATH_SET = 'AllPath' REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev' REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping' if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" config_section = "Credential" p = Process(config_section) publisher.info("Find credentials") minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold") faup = Faup() server_cred = redis.StrictRedis( host=p.config.get("ARDB_TermCred", "host"), port=p.config.get("ARDB_TermCred", "port"), db=p.config.get("ARDB_TermCred", "db"), decode_responses=True) server_statistics = redis.StrictRedis( host=p.config.get("ARDB_Statistics", "host"), port=p.config.getint("ARDB_Statistics", "port"), db=p.config.getint("ARDB_Statistics", "db"),
from Helper import Process


# Used to prevent concat with empty fields due to url parsing
def avoidNone(a_string):
    if a_string is None:
        return ""
    else:
        return a_string


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Web'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # Protocol file path
    protocolsfile_path = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "protocolsfile"))

    # Country to log as critical
    cc_critical = p.config.get("Url", "cc_critical")

    # FUNCTIONS #
from pubsublogger import publisher
from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID
import os

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Indexer'

    p = Process(config_section)

    # Indexer configuration - index dir and schema setup
    indexpath = os.path.join(os.environ['AIL_HOME'], p.config.get("Indexer", "path"))
    indexertype = p.config.get("Indexer", "type")
    if indexertype == "whoosh":
        schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT)
        if not os.path.exists(indexpath):
            os.mkdir(indexpath)
        if not exists_in(indexpath):
            ix = create_in(indexpath, schema)
        else:
            ix = open_dir(indexpath)
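# Hedged sketch of how a paste could be indexed and searched with the Whoosh
# schema set up above (title/path/content). The temporary directory, the sample
# document and its path are made up for the demonstration.
import tempfile

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT)

indexpath = tempfile.mkdtemp()
ix = create_in(indexpath, schema)

writer = ix.writer()
writer.add_document(title="sample paste", path="archive/2023/01/01/sample.gz",
                    content="leaked credential user@example.com:password")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("credential")
    for hit in searcher.search(query):
        print(hit['path'])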
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.

"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Lines'
    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='This script is a part of the Analysis Information \
Leak framework.')

    parser.add_argument(
        '-max', type=int, default=500,
        help='The limit between "short lines" and "long lines"',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
class TorSplashSpider(Spider):
    name = 'TorSplashSpider'

    def __init__(self, splash_url, type, crawler_options, date, requested_mode,
                 url, domain, port, cookies, original_item, *args, **kwargs):
        self.splash_url = splash_url
        self.domain_type = type
        self.requested_mode = requested_mode
        self.original_item = original_item
        self.root_key = None
        self.start_urls = url
        self.domains = [domain]
        self.port = str(port)
        date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
        self.full_date = date['date_day']
        self.date_month = date['date_month']
        self.date_epoch = int(date['epoch'])

        self.png = crawler_options['png']
        self.har = crawler_options['har']
        self.cookies = cookies

        config_section = 'Crawler'
        self.p = Process(config_section)
        self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str)

        config_loader = ConfigLoader.ConfigLoader()
        self.har_dir = os.path.join(config_loader.get_files_directory('har'), date_str)
        config_loader = None

        self.r_serv_log_submit = redis.StrictRedis(
            host=self.p.config.get("Redis_Log_submit", "host"),
            port=self.p.config.getint("Redis_Log_submit", "port"),
            db=self.p.config.getint("Redis_Log_submit", "db"),
            decode_responses=True)

    def build_request_arg(self, cookies):
        return {'wait': 10,
                'resource_timeout': 30,  # /!\ Weird behaviour if timeout < resource_timeout /!\
                'timeout': 30,
                'cookies': cookies,
                'lua_source': script_cookie}

    def start_requests(self):
        l_cookies = self.build_request_arg(self.cookies)
        yield SplashRequest(
            self.start_urls,
            self.parse,
            errback=self.errback_catcher,
            endpoint='execute',
            meta={'father': self.original_item, 'current_url': self.start_urls},
            args=l_cookies)

    # # TODO: remove duplicate and anchor
    def parse(self, response):
        #print(response.headers)
        #print(response.status)
        if response.status == 504:
            # no response
            #print('504 detected')
            pass

        # LUA ERROR # # TODO: print/display errors
        elif 'error' in response.data:
            if response.data['error'] == 'network99':
                ## splash restart ##
                error_retry = response.meta.get('error_retry', 0)
                if error_retry < 3:
                    error_retry += 1
                    url = response.meta['current_url']
                    father = response.meta['father']

                    self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                    time.sleep(10)
                    yield SplashRequest(
                        url,
                        self.parse,
                        errback=self.errback_catcher,
                        endpoint='execute',
                        cache_args=['lua_source'],
                        meta={'father': father, 'current_url': url, 'error_retry': error_retry},
                        args=self.build_request_arg(response.cookiejar))
                else:
                    print('Connection to proxy refused')
            else:
                print(response.data['error'])

        elif response.status != 200:
            print('other response: {}'.format(response.status))
            # detect connection to proxy refused
            error_log = json.loads(response.body.decode())
            print(error_log)
        #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
        #    pass # ignore response
        else:
            item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
            self.save_crawled_item(item_id, response.data['html'])
            crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])

            if self.root_key is None:
                self.root_key = item_id
                crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
                crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)

            if 'cookies' in response.data:
                all_cookies = response.data['cookies']
            else:
                all_cookies = []

            # SCREENSHOT
            if 'png' in response.data and self.png:
                sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
                if sha256_string:
                    Screenshot.save_item_relationship(sha256_string, item_id)
                    Screenshot.save_domain_relationship(sha256_string, self.domains[0])

            # HAR
            if 'har' in response.data and self.har:
                crawlers.save_har(self.har_dir, item_id, response.data['har'])

            le = LinkExtractor(allow_domains=self.domains, unique=True)
            for link in le.extract_links(response):
                l_cookies = self.build_request_arg(all_cookies)
                yield SplashRequest(
                    link.url,
                    self.parse,
                    errback=self.errback_catcher,
                    endpoint='execute',
                    meta={'father': item_id, 'current_url': link.url},
                    args=l_cookies)

    def errback_catcher(self, failure):
        # catch all errback failures
        self.logger.error(repr(failure))

        if failure.check(ResponseNeverReceived):
            ## DEBUG ##
            self.logger.error(failure.request)
            if failure.value.response:
                self.logger.error(failure.value.response)
            ## ----- ##

            # Extract request metadata
            url = failure.request.meta['current_url']
            father = failure.request.meta['father']
            l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies'])

            # Check if Splash restarted
            if not crawlers.is_splash_reachable(self.splash_url):
                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 30s ...', url)
                time.sleep(30)

            yield SplashRequest(
                url,
                self.parse,
                errback=self.errback_catcher,
                endpoint='execute',
                meta={'father': father, 'current_url': url},
                args=l_cookies)
        else:
            self.logger.error(failure.type)
            self.logger.error(failure.getErrorMessage())

    def save_crawled_item(self, item_id, item_content):
        gzip64encoded = crawlers.save_crawled_item(item_id, item_content)

        # Send item to queue
        # send paste to Global
        relay_message = "{0} {1}".format(item_id, gzip64encoded)
        self.p.populate_set_out(relay_message, 'Mixer')

        # increase nb of paste by feeder name
        self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

        # tag crawled paste
        msg = 'infoleak:submission="crawler";{}'.format(item_id)
        self.p.populate_set_out(msg, 'Tags')
*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            for word, score in paste._get_top_words().items():
                if len(word) >= 4:
                    msg = '{} {} {}'.format(paste.p_path, word, score)
                    p.populate_set_out(msg)
        else:
            publisher.debug("Tokeniser is idling 10s")
    # LOG: CONFIGURE PUBLISHER
    # ----------------------------------------------------
    publisher.port = 6380
    publisher.channel = 'Script'

    # REDIS QUEUE: CONFIGURE ACCESS TO MESSAGES QUEUE
    # ----------------------------------------------------
    # Section name in bin/packages/modules.cfg
    config_section = 'TwitterAnalyzer'
    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logger
    publisher.info("Script Twitter Sentiment Analysis started")

    # DATABASES: CONFIGURE DATABASES
    # ----------------------------------------------------
    # DB FOR TWITTER ANALYSIS
    serverTA = redis.StrictRedis(host="localhost", port=6382, db=10, decode_responses=True)
    serverTT = redis.StrictRedis(host="localhost", port=6382, db=11, decode_responses=True)
    #serverTA = redis.StrictRedis(
    #    host=p.config.get("ARDB_TwitterAnalyzer", "host"),
    #    port=p.config.get("ARDB_TwitterAnalyzer", "port"),
def rreplace(s, old, new, occurrence):
    li = s.rsplit(old, occurrence)
    return new.join(li)


if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'
    processed_paste = 0
    time_1 = time.time()

    config_section = 'Global'

    p = Process(config_section)

    # Get and sanitize the PASTE DIRECTORY
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
    PASTES_FOLDERS = PASTES_FOLDER + '/'
    PASTES_FOLDERS = os.path.join(os.path.realpath(PASTES_FOLDERS), '')

    # LOGGING #
    publisher.info("Feed Script started to receive & publish.")

    while True:
        message = p.get_from_set()
        # Recovering the streamed message information.
        if message is not None:
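# A short illustration (not part of the module) of the two helpers above:
# rreplace() replaces only the last `occurrence` matches, and joining a
# realpath with '' guarantees a trailing separator, which makes prefix checks
# on paste paths unambiguous. The sample paths are made up.
import os

print(rreplace('2019/01/01/paste.gz.gz', '.gz', '', 1))  # '2019/01/01/paste.gz'

sanitized = os.path.join(os.path.realpath('/tmp/ail/PASTES'), '')
print(sanitized)  # normalized path with a guaranteed trailing separator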
import pprint
import time
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
import re

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = 'CreditCards'
    p = Process(config_section)

    # FUNCTIONS #
    publisher.info("Creditcard script subscribed to channel creditcard_categ")

    creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?"

    # FIXME For retro compatibility
    channel = 'creditcard_categ'

    # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
    cards = [
        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit MasterCard
        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
        r'\b35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Japan Credit Bureau (JCB)
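# A minimal illustration (not the module's detection loop) of how one of the
# card patterns above is applied to raw paste text: candidates are extracted
# with re.findall() and separators are stripped before any further validation.
# The sample text and number are fake test values.
import re

visa_regex = r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b'
sample_text = "dump: 4242 4242 4242 4242 end"
for match in re.findall(visa_regex, sample_text):
    print(re.sub('[^0-9]', '', match))  # '4242424242424242'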
class TorSplashSpider(Spider):
    name = 'TorSplashSpider'

    def __init__(self, type, crawler_options, date, requested_mode,
                 url, domain, port, original_item, *args, **kwargs):
        self.type = type
        self.requested_mode = requested_mode
        self.original_item = original_item
        self.root_key = None
        self.start_urls = url
        self.domains = [domain]
        self.port = str(port)
        date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
        self.full_date = date['date_day']
        self.date_month = date['date_month']
        self.date_epoch = int(date['epoch'])

        self.arg_crawler = {'html': crawler_options['html'],
                            'wait': 10,
                            'render_all': 1,
                            'har': crawler_options['har'],
                            'png': crawler_options['png']}

        config_section = 'Crawler'
        self.p = Process(config_section)

        self.r_cache = redis.StrictRedis(
            host=self.p.config.get("Redis_Cache", "host"),
            port=self.p.config.getint("Redis_Cache", "port"),
            db=self.p.config.getint("Redis_Cache", "db"),
            decode_responses=True)

        self.r_serv_log_submit = redis.StrictRedis(
            host=self.p.config.get("Redis_Log_submit", "host"),
            port=self.p.config.getint("Redis_Log_submit", "port"),
            db=self.p.config.getint("Redis_Log_submit", "db"),
            decode_responses=True)

        self.r_serv_metadata = redis.StrictRedis(
            host=self.p.config.get("ARDB_Metadata", "host"),
            port=self.p.config.getint("ARDB_Metadata", "port"),
            db=self.p.config.getint("ARDB_Metadata", "db"),
            decode_responses=True)

        self.r_serv_onion = redis.StrictRedis(
            host=self.p.config.get("ARDB_Onion", "host"),
            port=self.p.config.getint("ARDB_Onion", "port"),
            db=self.p.config.getint("ARDB_Onion", "db"),
            decode_responses=True)

        self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str)

        self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'],
                                                   self.p.config.get("Directories", "pastes"),
                                                   self.p.config.get("Directories", "crawled"),
                                                   date_str)

        self.crawled_har = os.path.join(os.environ['AIL_HOME'],
                                        self.p.config.get("Directories", "crawled_screenshot"),
                                        date_str)

        self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'],
                                               self.p.config.get("Directories", "crawled_screenshot"))

    def start_requests(self):
        yield SplashRequest(
            self.start_urls,
            self.parse,
            errback=self.errback_catcher,
            endpoint='render.json',
            meta={'father': self.original_item, 'root_key': None},
            args=self.arg_crawler)

    def parse(self, response):
        #print(response.headers)
        #print(response.status)
        if response.status == 504:
            # down ?
            print('504 detected')

        elif response.status != 200:
            print('other response: {}'.format(response.status))
            #print(error_log)
            # detect connection to proxy refused
            error_log = json.loads(response.body.decode())
            if error_log['info']['text'] == 'Connection to proxy refused':
                print('Connection to proxy refused')
        else:
            # avoid too long filenames
            if len(self.domains[0]) > 215:
                UUID = self.domains[0][-215:] + str(uuid.uuid4())
            else:
                UUID = self.domains[0] + str(uuid.uuid4())
            filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
            relative_filename_paste = os.path.join(self.crawler_path, UUID)
            filename_har = os.path.join(self.crawled_har, UUID)

            # # TODO: modify me
            # save new paste on disk
            if self.save_crawled_paste(relative_filename_paste, response.data['html']):

                # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
                #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])

                # create onion metadata
                if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)

                # create root_key
                if self.root_key is None:
                    self.root_key = relative_filename_paste
                    # Create/Update crawler history
                    self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
                    # Update domain port number
                    all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
                    if all_domain_ports:
                        all_domain_ports = all_domain_ports.split(';')
                    else:
                        all_domain_ports = []
                    if self.port not in all_domain_ports:
                        all_domain_ports.append(self.port)
                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

                # create paste metadata
                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)

                self.r_serv_metadata.sadd('paste_children:' + response.meta['father'], relative_filename_paste)

                if 'png' in response.data:
                    size_screenshot = (len(response.data['png'])*3) / 4
                    if size_screenshot < 5000000 or self.requested_mode:  # bytes or manual/auto
                        image_content = base64.standard_b64decode(response.data['png'].encode())
                        hash = sha256(image_content).hexdigest()
                        img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
                        filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] + '.png')
                        dirname = os.path.dirname(filename_img)
                        if not os.path.exists(dirname):
                            os.makedirs(dirname)
                        if not os.path.exists(filename_img):
                            with open(filename_img, 'wb') as f:
                                f.write(image_content)
                        # add item metadata
                        self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
                        # add sha256 metadata
                        self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste)

                if 'har' in response.data:
                    dirname = os.path.dirname(filename_har)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    with open(filename_har + '.json', 'wb') as f:
                        f.write(json.dumps(response.data['har']).encode())

                # save external links in set
                #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                #for link in lext.extract_links(response):
                #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        errback=self.errback_catcher,
                        endpoint='render.json',
                        meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
                        args=self.arg_crawler)

    def errback_catcher(self, failure):
        # catch all errback failures
        self.logger.error(repr(failure))

        if failure.check(ResponseNeverReceived):
            request = failure.request
            url = request.meta['splash']['args']['url']
            father = request.meta['father']

            self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
            time.sleep(10)

            # the failed request never produced a response, so the root_key
            # has to be recovered from the request metadata
            root_key = request.meta.get('root_key')

            yield SplashRequest(
                url,
                self.parse,
                errback=self.errback_catcher,
                endpoint='render.json',
                meta={'father': father, 'root_key': root_key},
                args=self.arg_crawler)
        else:
            print('failure')
            #print(failure)
            print(failure.type)
            #print(failure.request.meta['item'])

        '''
        #if isinstance(failure.value, HttpError):
        elif failure.check(HttpError):
            # you can get the response
            response = failure.value.response
            print('HttpError')
            self.logger.error('HttpError on %s', response.url)

        #elif isinstance(failure.value, DNSLookupError):
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            print(DNSLookupError)
            print('DNSLookupError')
            self.logger.error('DNSLookupError on %s', request.url)

        #elif isinstance(failure.value, TimeoutError):
        elif failure.check(TimeoutError):
            request = failure.request
            print('TimeoutError')
            print(TimeoutError)
            self.logger.error('TimeoutError on %s', request.url)
        '''

    def save_crawled_paste(self, filename, content):
        if os.path.isfile(filename):
            print('File: {} already exists in submitted pastes'.format(filename))
            return False

        try:
            gzipencoded = gzip.compress(content.encode())
            gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
        except Exception:
            print("file error: {}".format(filename))
            return False

        # send paste to Global
        relay_message = "{0} {1}".format(filename, gzip64encoded)
        self.p.populate_set_out(relay_message, 'Mixer')

        # increase nb of paste by feeder name
        self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

        # tag crawled paste
        msg = 'infoleak:submission="crawler";{}'.format(filename)
        self.p.populate_set_out(msg, 'Tags')

        return True
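# A small standalone sketch (not part of the spider) of the screenshot storage
# scheme used above: the PNG is content-addressed by its SHA-256 digest and
# sharded into two-character sub-directories to keep directory sizes small.
# The base directory and sample bytes are made up.
import os
from hashlib import sha256

def screenshot_path(base_dir, image_content):
    digest = sha256(image_content).hexdigest()
    dir_path = os.path.join(base_dir, 'screenshot',
                            digest[0:2], digest[2:4], digest[4:6],
                            digest[6:8], digest[8:10], digest[10:12])
    return os.path.join(dir_path, digest[12:] + '.png')

print(screenshot_path('/tmp/ail_screenshots', b'\x89PNG fake image bytes'))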