Code Example #1
    def __init__(self, module_name=None, queue_name=None, logger_channel='Script'):
        """
        Init Module
        module_name: str; set the module name if different from the instance ClassName
        queue_name: str; set the queue name if different from the instance ClassName
        logger_channel: str; set the logger channel name, 'Script' by default
        """
        # Module name if provided else instance className
        self.module_name = module_name if module_name else self._module_name()

        # Queue name if provided else instance className
        self.queue_name = queue_name if queue_name else self._module_name()

        # Init Redis Logger
        self.redis_logger = publisher

        # Port of the redis instance used by pubsublogger
        self.redis_logger.port = 6380

        # Channel name to publish logs
        # # TODO: refactor logging
        # If provided could be a namespaced channel like script:<ModuleName>
        self.redis_logger.channel = logger_channel


        # Run module endlessly
        self.proceed = True

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 10

        # Setup the I/O queues
        self.process = Process(self.queue_name)
Code Example #2
    def __init__(self, module_name=None, queue_name=None):
        """
        Init Module
        module_name: str; set the module name if different from the instance ClassName
        queue_name: str; set the queue name if different from the instance ClassName
        """
        # Module name if provided else instance className
        self.module_name = module_name if module_name else self._module_name()

        # Queue name if provided else instance className
        self.queue_name = queue_name if queue_name else self._module_name()

        # Init Redis Logger
        self.redis_logger = publisher
        # Port of the redis instance used by pubsublogger
        self.redis_logger.port = 6380
        # Channel name to publish logs
        self.redis_logger.channel = 'Script'
        # TODO modify generic channel Script to a namespaced channel like:
        # publish module logs to script:<ModuleName> channel
        # self.redis_logger.channel = 'script:%s'%(self.module_name)

        # Run module endlessly
        self.proceed = True

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 10

        # Setup the I/O queues
        self.process = Process(self.queue_name)
Code Example #3
        def __init__(self, type, crawler_options, date, requested_mode, url,
                     domain, port, cookies, original_item, *args, **kwargs):
            self.domain_type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4],
                                         date['date_day'][4:6],
                                         date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.png = crawler_options['png']
            self.har = crawler_options['har']
            self.cookies = cookies

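            # Load the AIL 'Crawler' config section and resolve the output directories for crawled items and HAR captures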
            config_section = 'Crawler'
            self.p = Process(config_section)
            self.item_dir = os.path.join(
                self.p.config.get("Directories", "crawled"), date_str)
            self.har_dir = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"),
                date_str)
            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.root_key = None
Code Example #4
        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            # # TODO: timeout in config
            self.arg_crawler = {  'html': crawler_options['html'],
                                  'wait': 10,
                                  'render_all': 1,
                                  'timeout': 30,
                                  'har': crawler_options['har'],
                                  'png': crawler_options['png']}

            config_section = 'Crawler'
            self.p = Process(config_section)

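            # Redis / ARDB connections used by the crawler: cache, submission log, paste metadata and onion metadata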
            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
Code Example #5
def main():
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'DomClassifier'

    p = Process(config_section)
    addr_dns = p.config.get("DomClassifier", "dns")

    publisher.info("""ZMQ DomainClassifier is Running""")

    c = DomainClassifier.domainclassifier.Extract(rawtext="",
                                                  nameservers=[addr_dns])

    cc = p.config.get("DomClassifier", "cc")
    cc_tld = p.config.get("DomClassifier", "cc_tld")

    while True:
        try:
            message = p.get_from_set()

            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Script DomClassifier is idling 1s")
                time.sleep(1)
                continue
            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()

            if mimetype == "text/plain":
                c.text(rawtext=paste)
                c.potentialdomain()
                c.validdomain(rtype=['A'], extended=True)
                localizeddomains = c.include(expression=cc_tld)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning(
                        'DomainC;{};{};{};Checked {} located in {};{}'.format(
                            PST.p_source, PST.p_date, PST.p_name,
                            localizeddomains, cc_tld, PST.p_path))
                localizeddomains = c.localizedomain(cc=cc)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning(
                        'DomainC;{};{};{};Checked {} located in {};{}'.format(
                            PST.p_source, PST.p_date, PST.p_name,
                            localizeddomains, cc, PST.p_path))
        except IOError:
            print("CRC Checksum Failed on :", PST.p_path)
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))
Code Example #6
        def __init__(self, type, url, domain, original_paste, super_father,
                     *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [domain]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(
                self.p.config.get("Directories", "crawled"), date)

            self.crawled_paste_filemame = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "pastes"),
                self.p.config.get("Directories", "crawled"), date)

            self.crawled_screenshot = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"), date)
Code Example #7
def main():
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'DomClassifier'

    p = Process(config_section)
    addr_dns = p.config.get("DomClassifier", "dns")

    publisher.info("""ZMQ DomainClassifier is Running""")

    c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns])

    cc = p.config.get("DomClassifier", "cc")
    cc_tld = p.config.get("DomClassifier", "cc_tld")

    while True:
        try:
            message = p.get_from_set()

            if message is not None:
                PST = Paste.Paste(message)
            else:
                publisher.debug("Script DomClassifier is idling 1s")
                time.sleep(1)
                continue
            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()

            if mimetype == "text/plain":
                c.text(rawtext=paste)
                c.potentialdomain()
                c.validdomain(rtype=['A'], extended=True)
                localizeddomains = c.include(expression=cc_tld)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc_tld, PST.p_path))
                localizeddomains = c.localizedomain(cc=cc)
                if localizeddomains:
                    print(localizeddomains)
                    publisher.warning('DomainC;{};{};{};Checked {} located in {};{}'.format(
                        PST.p_source, PST.p_date, PST.p_name, localizeddomains, cc, PST.p_path))
        except IOError:
            print("CRC Checksum Failed on :", PST.p_path)
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))
Code Example #8
File: testKeys.py Project: Mrnmap/ALLInfo
    def setUp(self):
        self.paste = Paste('../samples/2018/01/01/keys_certificat_sample.gz')

        # Section name in bin/packages/modules.cfg
        self.config_section = 'Keys'

        # Setup the I/O queues
        p = Process(self.config_section)
Code Example #9
        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.arg_crawler = {  'html': crawler_options['html'],
                                  'wait': 10,
                                  'render_all': 1,
                                  'har': crawler_options['har'],
                                  'png': crawler_options['png']}

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
Code Example #10
import re
import time

from packages import Paste
from pubsublogger import publisher
from Helper import Process


def search_phone(message):
    paste = Paste.Paste(message)
    content = paste.get_p_content()
    # regex to find phone numbers; may raise many false positives (optimization / an upgrade is needed)
    reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})')
    # list of the regex results in the Paste, may be null
    results = reg_phone.findall(content)

    # if the list is greater than 4, we consider the Paste may contain a list of phone numbers
    if len(results) > 4:
        print(results)
        publisher.warning('{} contains PID (phone numbers)'.format(paste.p_name))


if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Phone'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("Run Phone module")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
        search_phone(message)
Code Example #11
        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [domain]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date )

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )
Code Example #12
File: Categ.py Project: Mrnmap/ALLInfo
import os
import argparse
import time
import re
from pubsublogger import publisher
from packages import Paste

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Categ'

    p = Process(config_section)
    matchingThreshold = p.config.getint("Categ", "matchingThreshold")

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='Start Categ module on files.')

    parser.add_argument(
        '-d',
        type=str,
        default="../files/",
        help='Path to the directory containing the category files.',
        action='store')

    args = parser.parse_args()
Code Example #13
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
import time
import sys
from packages import Paste
from pubsublogger import publisher
from Helper import Process
import re
from pyfaup.faup import Faup

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    faup = Faup()

    critical = 8

    regex_web = "((?:https?:\/\/)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
    regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"
    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("Script Credential is Idling 10s")
            time.sleep(10)
            continue
Code Example #14
File: Url.py Project: MaximeStor/AIL-framework
# Country and ASN lookup
from cymru.ip2asn.dns import DNSClient as ip2asn
import socket
import pycountry
import ipaddress

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Web'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # Country to log as critical
    cc_critical = p.config.get("Url", "cc_critical")

    # FUNCTIONS #
    publisher.info("Script URL subscribed to channel web_categ")

    # FIXME For retro compatibility
    channel = 'web_categ'
Code Example #15
import os
import sys
import datetime

import redis

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

def substract_date(date_from, date_to):
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append( date.strftime('%Y%m%d') )
    return l_date

config_section = 'Global'
p = Process(config_section)

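# ARDB_Tags connection: stores per-tag metadata such as first_seen / last_seen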
r_tags = redis.StrictRedis(
    host=p.config.get("ARDB_Tags", "host"),
    port=p.config.getint("ARDB_Tags", "port"),
    db=p.config.getint("ARDB_Tags", "db"),
    decode_responses=True)

tag = 'infoleak:automatic-detection="bitcoin-address"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)
Code Example #16
# Split usernames on special characters or upper-case transitions; distinguish tokens that start with an upper-case letter
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"
REDIS_KEY_NUM_USERNAME = '******'
REDIS_KEY_NUM_PATH = 'uniqNumForUsername'
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    minimumLengthThreshold = p.config.getint("Credential",
                                             "minimumLengthThreshold")

    faup = Faup()
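    # ARDB_TermCred connection: backs the credential/username sets and the cred-to-path mapping keyed by the REDIS_KEY_* constants above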
    server_cred = redis.StrictRedis(host=p.config.get("ARDB_TermCred", "host"),
                                    port=p.config.get("ARDB_TermCred", "port"),
                                    db=p.config.get("ARDB_TermCred", "db"),
                                    decode_responses=True)

    criticalNumberToAlert = p.config.getint("Credential",
                                            "criticalNumberToAlert")
    minTopPassList = p.config.getint("Credential", "minTopPassList")
Code Example #17
File: Credential.py Project: Rafiot/AIL-framework
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
import time
import sys
from packages import Paste
from pubsublogger import publisher
from Helper import Process
import re
from pyfaup.faup import Faup

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    faup = Faup()

    critical = 8

    regex_web = "((?:https?:\/\/)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
    regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"
    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("Script Credential is Idling 10s")
            time.sleep(10)
            continue
Code Example #18
    return valid_mxdomain


def extract_all_emails(queue, item_content):
    queue.put(re.findall(email_regex, item_content))


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Mail'

    faup = Faup()

    p = Process(config_section)

    publisher.info("Mails module started")

    # Numbers of Mails needed to Tags
    mail_threshold = 10

    max_execution_time = 30

    email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}"

    q = Queue()

    while True:
        message = p.get_from_set()
Code Example #19
import os
import sys
import datetime

import redis

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

def substract_date(date_from, date_to):
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append( date.strftime('%Y%m%d') )
    return l_date

config_section = 'Keys'
p = Process(config_section)

r_tags = redis.StrictRedis(
    host=p.config.get("ARDB_Tags", "host"),
    port=p.config.getint("ARDB_Tags", "port"),
    db=p.config.getint("ARDB_Tags", "db"),
    decode_responses=True)

tag = 'infoleak:automatic-detection="pgp-message"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)
Code Example #20

def getValueOverRange(word, startDate, num_day):
    to_return = 0
    for timestamp in range(startDate, startDate - num_day * oneDay, -oneDay):
        value = server_term.hget(timestamp, word)
        to_return += int(value) if value is not None else 0
    return to_return


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Curve'
    p = Process(config_section)

    # REDIS #
    r_serv1 = redis.StrictRedis(host=p.config.get("ARDB_Curve", "host"),
                                port=p.config.get("ARDB_Curve", "port"),
                                db=p.config.get("ARDB_Curve", "db"),
                                decode_responses=True)

    server_term = redis.StrictRedis(host=p.config.get("ARDB_TermFreq", "host"),
                                    port=p.config.get("ARDB_TermFreq", "port"),
                                    db=p.config.get("ARDB_TermFreq", "db"),
                                    decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script Curve started")
Code Example #21
File: Release.py Project: Rafiot/AIL-framework
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
import time
from packages import Paste
from pubsublogger import publisher
from Helper import Process
import re

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Release"
    p = Process(config_section)
    publisher.info("Release scripts to find release names")

    movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+"
    tv = "[a-zA-Z0-9.]+\.S[0-9]{2}E[0-9]{2}.[a-zA-Z0-9.]+\.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"
    xxx = "[a-zA-Z0-9._]+.XXX.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"

    regexs = [movie, tv, xxx]

    regex = '|'.join(regexs)
    while True:
        filepath = p.get_from_set()
        if filepath is None:
            publisher.debug("Script Release is Idling 10s")
            print 'Sleeping'
            time.sleep(10)
            continue

        paste = Paste.Paste(filepath)
Code Example #22
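# Re-quote the elements of a stringified list, e.g. "[a, b, c]" -> "['a', 'b', 'c']"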
def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.lstrip().strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2] #remove trailing ,
    quoted_tab += "]"
    return str(quoted_tab)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    #get the dico and matching percent
Code Example #23
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [domain]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date )

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                #errback=self.errback_catcher,
                endpoint='render.json',
                meta={'father': self.original_paste},
                args={  'html': 1,
                        'wait': 10,
                        'render_all': 1,
                        'har': 1,
                        'png': 1}
            )

        def parse(self,response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if(error_log['info']['text'] == 'Connection to proxy refused'):
                    print('Connection to proxy refused')
            else:

                UUID = self.domains[0]+str(uuid.uuid4())
                filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

                # save new paste on disk
                if self.save_crawled_paste(filename_paste, response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)

                    #create paste metadata
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)

                    self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)

                    dirname = os.path.dirname(filename_screenshot)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)

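                    # base64 inflates data by ~4/3, so len * 3 / 4 approximates the decoded screenshot size in bytes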
                    size_screenshot = (len(response.data['png'])*3) /4

                    if size_screenshot < 5000000: #bytes
                        with open(filename_screenshot, 'wb') as f:
                            f.write(base64.standard_b64decode(response.data['png'].encode()))

                    with open(filename_screenshot+'har.txt', 'wb') as f:
                        f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

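                    # follow internal links (same domain only) and crawl them through Splash as well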
                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            #errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste},
                            args={  'html': 1,
                                    'png': 1,
                                    'render_all': 1,
                                    'har': 1,
                                    'wait': 10}
                        )

        '''
        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))
            print('failure')
            #print(failure)
            print(failure.type)
            #print(failure.request.meta['item'])

            #if isinstance(failure.value, HttpError):
            if failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
        '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exist in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
Code Example #24
File: Tags.py Project: mokaddem/AIL-framework
from pubsublogger import publisher
from Helper import Process
from packages import Paste

if __name__ == '__main__':

    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Tags'

    # Setup the I/O queues
    p = Process(config_section)

    server = redis.StrictRedis(
                host=p.config.get("ARDB_Tags", "host"),
                port=p.config.get("ARDB_Tags", "port"),
                db=p.config.get("ARDB_Tags", "db"),
                decode_responses=True)

    server_metadata = redis.StrictRedis(
                host=p.config.get("ARDB_Metadata", "host"),
                port=p.config.get("ARDB_Metadata", "port"),
                db=p.config.get("ARDB_Metadata", "db"),
                decode_responses=True)

    serv_statistics = redis.StrictRedis(
        host=p.config.get('ARDB_Statistics', 'host'),
Code Example #25
File: CreditCards.py Project: Rafiot/AIL-framework
import time
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
import re


from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'CreditCards'

    p = Process(config_section)

    # FUNCTIONS #
    publisher.info("Creditcard script subscribed to channel creditcard_categ")


    creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?"

    # FIXME For retro compatibility
    channel = 'creditcard_categ'

    # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
    cards = [
        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16 digits MasterCard
        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
Code Example #26
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, url, domain, original_paste, super_father,
                     *args, **kwargs):
            self.type = type
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [domain]
            date = datetime.datetime.now().strftime("%Y/%m/%d")
            self.full_date = datetime.datetime.now().strftime("%Y%m%d")
            self.date_month = datetime.datetime.now().strftime("%Y%m")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(
                self.p.config.get("Directories", "crawled"), date)

            self.crawled_paste_filemame = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "pastes"),
                self.p.config.get("Directories", "crawled"), date)

            self.crawled_screenshot = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"), date)

        def start_requests(self):
            yield SplashRequest(self.start_urls,
                                self.parse,
                                errback=self.errback_catcher,
                                endpoint='render.json',
                                meta={'father': self.original_paste},
                                args={
                                    'html': 1,
                                    'wait': 10,
                                    'render_all': 1,
                                    'har': 1,
                                    'png': 1
                                })

        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if (error_log['info']['text'] == 'Connection to proxy refused'
                    ):
                    print('Connection to proxy refused')
            else:

                #avoid filename too big
                if len(self.domains[0]) > 215:
                    UUID = self.domains[0][-215:] + str(uuid.uuid4())
                else:
                    UUID = self.domains[0] + str(uuid.uuid4())
                filename_paste = os.path.join(self.crawled_paste_filemame,
                                              UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_screenshot = os.path.join(self.crawled_screenshot,
                                                   UUID + '.png')

                # save new paste on disk
                if self.save_crawled_paste(filename_paste,
                                           response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd(
                        '{}_up:{}'.format(self.type, self.full_date),
                        self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type),
                                           self.domains[0])
                    self.r_serv_onion.sadd(
                        'month_{}_up:{}'.format(self.type, self.date_month),
                        self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(
                            self.type, self.domains[0])):
                        self.r_serv_onion.hset(
                            '{}_metadata:{}'.format(self.type,
                                                    self.domains[0]),
                            'first_seen', self.full_date)
                    self.r_serv_onion.hset(
                        '{}_metadata:{}'.format(self.type, self.domains[0]),
                        'last_seen', self.full_date)

                    #create paste metadata
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'super_father',
                        self.super_father)
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'father',
                        response.meta['father'])
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'domain',
                        self.domains[0])
                    self.r_serv_metadata.hset(
                        'paste_metadata:' + filename_paste, 'real_link',
                        response.url)

                    self.r_serv_metadata.sadd(
                        'paste_children:' + response.meta['father'],
                        filename_paste)

                    dirname = os.path.dirname(filename_screenshot)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)

                    size_screenshot = (len(response.data['png']) * 3) / 4

                    if size_screenshot < 5000000:  #bytes
                        with open(filename_screenshot, 'wb') as f:
                            f.write(
                                base64.standard_b64decode(
                                    response.data['png'].encode()))

                    with open(filename_screenshot + 'har.txt', 'wb') as f:
                        f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste},
                            args={
                                'html': 1,
                                'png': 1,
                                'render_all': 1,
                                'har': 1,
                                'wait': 10
                            })

        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))

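            # Splash dropped the connection: wait 10s and re-issue the same request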
            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                father = request.meta['father']

                self.logger.error(
                    'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                    url)
                time.sleep(10)
                yield SplashRequest(url,
                                    self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='render.json',
                                    meta={'father': father},
                                    args={
                                        'html': 1,
                                        'png': 1,
                                        'render_all': 1,
                                        'har': 1,
                                        'wait': 10
                                    })

            else:
                print('failure')
                #print(failure)
                print(failure.type)
                #print(failure.request.meta['item'])
            '''
            #if isinstance(failure.value, HttpError):
            elif failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
            '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exist in submitted pastes'.format(
                    filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder",
                                           "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True
Code Example #27
File: Tags.py Project: wubic/ail-framework
from pubsublogger import publisher
from Helper import Process
from packages import Tag

if __name__ == '__main__':

    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Tags'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("Tags module started")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            publisher.debug(
                "{} queue is empty, waiting 10s".format(config_section))
            time.sleep(10)
            continue
Code Example #28
# Split usernames on special characters or upper-case transitions; distinguish tokens that start with an upper-case letter
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"
REDIS_KEY_NUM_USERNAME = '******'
REDIS_KEY_NUM_PATH = 'uniqNumForUsername'
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    module_name = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    faup = Faup()

    regex_web = "((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    #regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
    regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
    regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"

    redis_cache_key = regex_helper.generate_redis_cache_key(module_name)

    while True:
        message = p.get_from_set()

        if message is None:
Code Example #29
def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'SentimentAnalysis'

    # Setup the I/O queues
    p = Process(config_section)

    # Sent to the logging a description of the module
    publisher.info("<description of the module>")

    # REDIS_LEVEL_DB #
    server = redis.StrictRedis(
        host=p.config.get("ARDB_Sentiment", "host"),
        port=p.config.get("ARDB_Sentiment", "port"),
        db=p.config.get("ARDB_Sentiment", "db"),
        decode_responses=True)

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
Code Example #30
File: Mail.py Project: mokaddem/AIL-framework
from packages import lib_refine
from pubsublogger import publisher

from pyfaup.faup import Faup

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Mail'

    faup = Faup()

    p = Process(config_section)
    addr_dns = p.config.get("Mail", "dns")

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)
    # ARDB #
    server_statistics = redis.StrictRedis(
        host=p.config.get("ARDB_Statistics", "host"),
        port=p.config.getint("ARDB_Statistics", "port"),
        db=p.config.getint("ARDB_Statistics", "db"),
        decode_responses=True)
Code example #31
File: Cryptocurrencies.py  Project: Mrnmap/ALLInfo
    'dash': {
        'name': 'dash',  # e.g. XmNfXq2kDmrNBTiDTofohRemwGur1WmgTT
        'regex': r'\b(?<![+/=])X[A-Za-z0-9]{33}(?![+/=])\b',
        'max_execution_time': default_max_execution_time,
        'tag': 'infoleak:automatic-detection="dash-address"',
    }
}

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Bitcoin'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logging system
    publisher.info("Run Cryptocurrency module")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        item_id = p.get_from_set()
        if item_id is None:
            publisher.debug(
                "{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
Code example #32
def substract_date(date_from, date_to):
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]),
                              int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]),
                            int(date_to[6:8]))
    delta = date_to - date_from  # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append(date.strftime('%Y%m%d'))
    return l_date
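# Illustrative usage (hypothetical dates): substract_date('20200101', '20200103')
# returns ['20200101', '20200102', '20200103'].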


config_section = 'Global'
p = Process(config_section)

r_tags = redis.StrictRedis(host=p.config.get("ARDB_Tags", "host"),
                           port=p.config.getint("ARDB_Tags", "port"),
                           db=p.config.getint("ARDB_Tags", "db"),
                           decode_responses=True)

tag = 'infoleak:automatic-detection="bitcoin-address"'

# get tag first/last seen
first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen')
last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen')

l_dates = substract_date(first_seen, last_seen)

# get all tagged items
Code example #33
File: CreditCards.py  Project: CIRCL/AIL-framework
import time
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
import re
import sys

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'CreditCards'

    p = Process(config_section)

    # FUNCTIONS #
    publisher.info("Creditcard script subscribed to channel creditcard_categ")


    creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?"
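    # Descriptive note: this pattern matches bare 13- or 16-digit numbers starting
    # with 4 (VISA-style), without separators; the 'cards' patterns below handle separators.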

    # FIXME For retro compatibility
    channel = 'creditcard_categ'

    # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
    cards = [
        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16 digits MasterCard
        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
Code example #34
            failed.append(url)
            print('Failed at downloading', url)
            print(process.stdout.read())
    print('Failed:', len(failed), 'Downloaded:', len(downloaded))


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    torclient_host = '127.0.0.1'
    torclient_port = 9050

    config_section = 'Onion'

    p = Process(config_section)
    r_cache = redis.StrictRedis(host=p.config.get("Redis_Cache", "host"),
                                port=p.config.getint("Redis_Cache", "port"),
                                db=p.config.getint("Redis_Cache", "db"),
                                decode_responses=True)

    r_onion = redis.StrictRedis(host=p.config.get("ARDB_Onion", "host"),
                                port=p.config.getint("ARDB_Onion", "port"),
                                db=p.config.getint("ARDB_Onion", "db"),
                                decode_responses=True)

    # FUNCTIONS #
    publisher.info("Script subscribed to channel onion_categ")

    # FIXME For retro compatibility
    channel = 'onion_categ'
Code example #35
File: Release.py  Project: mokaddem/AIL-framework
def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

'''
This module takes its input from the global module.
It applies some regexes and publishes the matched content.
'''

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Release"
    p = Process(config_section)
    max_execution_time = p.config.getint("Curve", "max_execution_time")
    publisher.info("Release scripts to find release names")

    movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+"
    tv = "[a-zA-Z0-9.]+\.S[0-9]{2}E[0-9]{2}.[a-zA-Z0-9.]+\.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"
    xxx = "[a-zA-Z0-9._]+.XXX.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"
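    # Illustrative (hypothetical release name): the movie pattern would match
    # "Some.Movie.2019.1080p.x264-GROUP".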

    regexs = [movie, tv, xxx]

    regex = '|'.join(regexs)
    while True:
        signal.alarm(max_execution_time)
        filepath = p.get_from_set()
        if filepath is None:
            publisher.debug("Script Release is Idling 10s")
Code example #36
File: Mixer.py  Project: Mrnmap/ALLInfo
from Helper import Process

# CONFIG #
refresh_time = 30
FEED_QUEUE_MAPPING = {
    "feeder2": "preProcess1"
}  # Map a feeder name to a pre-processing module

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Mixer'

    p = Process(config_section)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # REDIS #
    server = redis.StrictRedis(host=cfg.get("Redis_Mixer_Cache", "host"),
                               port=cfg.getint("Redis_Mixer_Cache", "port"),
                               db=cfg.getint("Redis_Mixer_Cache", "db"),
                               decode_responses=True)
Code example #37
File: Decoder.py  Project: CIRCL/AIL-framework
    msg = 'infoleak:automatic-detection="'+decoder_name+'";{}'.format(message)
    p.populate_set_out(msg, 'Tags')


if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Decoder'

    # Setup the I/O queues
    p = Process(config_section)

    serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    # Send a description of the module to the logging system
    publisher.info("Decoder started")

    regex_binary = '[0-1]{40,}'
    #regex_hex = '(0[xX])?[A-Fa-f0-9]{40,}'
    regex_hex = '[A-Fa-f0-9]{40,}'
    regex_base64 = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)'
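    # Minimal illustrative sketch (not part of the original module): applying the
    # base64 pattern above to a hypothetical string; 'aGVsbG8gd29ybGQ=' is the
    # base64 encoding of "hello world".
    #   re.findall(regex_base64, 'dump: aGVsbG8gd29ybGQ= end')  ->  ['aGVsbG8gd29ybGQ=']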
Code example #38
File: Mail.py  Project: Rafiot/AIL-framework
import pprint
import time
import dns.exception
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Mail'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # FUNCTIONS #
    publisher.info("Suscribed to channel mails_categ")

    # FIXME For retro compatibility
    channel = 'mails_categ'

    message = p.get_from_set()
    prec_filename = None
Code example #39
File: QueueOut.py  Project: Rafiot/AIL-framework
def run(config_section):
    p = Process(config_section)
    if not p.publish():
        print(config_section, 'has no publisher.')
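# Illustrative usage (section name hypothetical): run('Global') prints
# "Global has no publisher." when Process('Global').publish() returns a falsy value.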
Code example #40
        print('usage:', 'Crawler.py',
              'type_hidden_service (onion or i2p or regular)', 'splash_port')
        exit(1)

    type_hidden_service = sys.argv[1]
    splash_port = sys.argv[2]

    publisher.port = 6380
    publisher.channel = "Script"

    publisher.info("Script Crawler started")

    config_section = 'Crawler'

    # Setup the I/O queues
    p = Process(config_section)

    url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
    re.compile(url_onion)
    url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
    re.compile(url_i2p)

    if type_hidden_service == 'onion':
        regex_hidden_service = url_onion
        splash_url = '{}:{}'.format(
            p.config.get("Crawler", "splash_url_onion"), splash_port)
    elif type_hidden_service == 'i2p':
        regex_hidden_service = url_i2p
        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_i2p"),
                                    splash_port)
    elif type_hidden_service == 'regular':
Code example #41
File: template.py  Project: Rafiot/AIL-framework
def do_something(message):
    return None

if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = '<section name>'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logging system
    publisher.info("<description of the module>")

    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Do something with the message from the queue
        something_has_been_done = do_something(message)
Code example #42
import time
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
import re


from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'CreditCards'

    p = Process(config_section)

    # FUNCTIONS #
    publisher.info("Creditcard script subscribed to channel creditcard_categ")


    creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?"

    # FIXME For retro compatibility
    channel = 'creditcard_categ'

    # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
    cards = [
        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16 digits MasterCard
        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
Code example #43
File: Tokenize.py  Project: CIRCL/AIL-framework
import signal

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)
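# Descriptive note: the SIGALRM handler above bounds the tokenisation below;
# signal.alarm(5) arms a 5-second timer and TimeoutException is raised if
# _get_top_words() runs past it.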

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            signal.alarm(5)
            try:
                for word, score in paste._get_top_words().items():
                    if len(word) >= 4:
                        msg = '{} {} {}'.format(paste.p_rel_path, word, score)
                        p.populate_set_out(msg)
Code example #44
File: Categ.py  Project: MaximeStor/AIL-framework
import os
import argparse
import time
import re
from pubsublogger import publisher
from packages import Paste

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Categ'

    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(description='Start Categ module on files.')

    parser.add_argument(
        '-d', type=str, default="../files/",
        help='Path to the directory containing the category files.',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    publisher.info("Script Categ started")

    categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential']
Code example #45
File: Global.py  Project: tonirss/AIL-framework
"""
import base64
import os
import time
from pubsublogger import publisher

from Helper import Process


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = "Global"

    p = Process(config_section)

    # LOGGING #
    publisher.info("Feed Script started to receive & publish.")

    while True:

        message = p.get_from_set()
        # Recover the streamed message information.
        if message is not None:
            splitted = message.split()
            if len(splitted) == 2:
                paste, gzip64encoded = splitted
            else:
                # TODO Store the name of the empty paste inside a Redis-list.
                print "Empty Paste: not processed"
Code example #46
        exit(1)
##################################################
#mode = sys.argv[1]
    splash_port = sys.argv[1]

    rotation_mode = deque(['onion', 'regular'])
    default_proto_map = {'http': 80, 'https': 443}
    ######################################################## add ftp ???

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script Crawler started")
    config_section = 'Crawler'

    # Setup the I/O queues
    p = Process(config_section)

    splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url"),
                                splash_port)
    print('splash url: {}'.format(splash_url))

    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "pastes"))

    r_serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_cache = redis.StrictRedis(host=p.config.get("Redis_Cache", "host"),
Code example #47
File: Mixer.py  Project: CIRCL/AIL-framework
import configparser

from Helper import Process


# CONFIG #
refresh_time = 30
FEED_QUEUE_MAPPING = { "feeder2": "preProcess1" } # Map a feeder name to a pre-processing module

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Mixer'

    p = Process(config_section)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # REDIS #
    server = redis.StrictRedis(
        host=cfg.get("Redis_Mixer_Cache", "host"),
        port=cfg.getint("Redis_Mixer_Cache", "port"),
        db=cfg.getint("Redis_Mixer_Cache", "db"),
Code example #48
File: Release.py  Project: xme/AIL-framework
def timeout_handler(signum, frame):
    raise TimeoutException


signal.signal(signal.SIGALRM, timeout_handler)
'''
This module takes its input from the global module.
It applies some regexes and publishes the matched content.
'''

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Release"
    p = Process(config_section)
    max_execution_time = p.config.getint("Curve", "max_execution_time")
    publisher.info("Release scripts to find release names")

    movie = "[a-zA-Z0-9.]+\.[0-9]{4}.[a-zA-Z0-9.]+\-[a-zA-Z]+"
    tv = "[a-zA-Z0-9.]+\.S[0-9]{2}E[0-9]{2}.[a-zA-Z0-9.]+\.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"
    xxx = "[a-zA-Z0-9._]+.XXX.[a-zA-Z0-9.]+\-[a-zA-Z0-9]+"

    regexs = [movie, tv, xxx]

    regex = '|'.join(regexs)
    while True:
        signal.alarm(max_execution_time)
        filepath = p.get_from_set()
        if filepath is None:
            publisher.debug("Script Release is Idling 10s")
Code example #49
File: testHelper.py  Project: Mrnmap/ALLInfo
    def test_Process_Constructor_using_key_module(self):

        conf_section = 'Keys'
        process = Process(conf_section)
        self.assertEqual(process.subscriber_name, 'Keys')
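        # Descriptive note: this test only checks that Process() records the given
        # config section name ('Keys') as its subscriber_name; any standard unittest
        # runner can execute it.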
Code example #50
File: Credential.py  Project: CIRCL/AIL-framework
# Split usernames on special characters or upper-case boundaries; tokens starting with an upper-case letter are kept distinct
REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"
REDIS_KEY_NUM_USERNAME = '******'
REDIS_KEY_NUM_PATH = 'uniqNumForUsername'
REDIS_KEY_ALL_CRED_SET = 'AllCredentials'
REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev'
REDIS_KEY_ALL_PATH_SET = 'AllPath'
REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev'
REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping'

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"
    config_section = "Credential"
    p = Process(config_section)
    publisher.info("Find credentials")

    minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold")

    faup = Faup()
    server_cred = redis.StrictRedis(
        host=p.config.get("ARDB_TermCred", "host"),
        port=p.config.get("ARDB_TermCred", "port"),
        db=p.config.get("ARDB_TermCred", "db"),
        decode_responses=True)

    server_statistics = redis.StrictRedis(
        host=p.config.get("ARDB_Statistics", "host"),
        port=p.config.getint("ARDB_Statistics", "port"),
        db=p.config.getint("ARDB_Statistics", "db"),
Code example #51
File: Web.py  Project: stedeluxe/AIL-framework
from Helper import Process

# Used to prevent concat with empty fields due to url parsing
def avoidNone(a_string):
    if a_string is None:
        return ""
    else:
        return a_string

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Web'

    p = Process(config_section)

    # REDIS #
    r_serv2 = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"))

    # Protocol file path
    protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                         p.config.get("Directories", "protocolsfile"))

    # Country to log as critical
    cc_critical = p.config.get("Url", "cc_critical")

    # FUNCTIONS #
Code example #52
File: Indexer.py  Project: marcoramilli/AIL-framework
from pubsublogger import publisher

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID
import os

from Helper import Process


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Indexer'

    p = Process(config_section)

    # Indexer configuration - index dir and schema setup
    indexpath = os.path.join(os.environ['AIL_HOME'],
                             p.config.get("Indexer", "path"))
    indexertype = p.config.get("Indexer", "type")
    if indexertype == "whoosh":
        schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
                                                         unique=True),
                        content=TEXT)
        if not os.path.exists(indexpath):
            os.mkdir(indexpath)
        if not exists_in(indexpath):
            ix = create_in(indexpath, schema)
        else:
            ix = open_dir(indexpath)
Code example #53
File: Lines.py  Project: CIRCL/AIL-framework
*Need the ZMQ_PubSub_Line_Q Module running to be able to work properly.

"""
import argparse
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'

    config_section = 'Lines'
    p = Process(config_section)

    # SCRIPT PARSER #
    parser = argparse.ArgumentParser(
        description='This script is a part of the Analysis Information \
                Leak framework.')

    parser.add_argument(
        '-max', type=int, default=500,
        help='The limit between "short lines" and "long lines"',
        action='store')

    args = parser.parse_args()

    # FUNCTIONS #
    tmp_string = "Lines script Subscribed to channel {} and Start to publish \
Code example #54
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, splash_url, type, crawler_options, date,
                     requested_mode, url, domain, port, cookies, original_item,
                     *args, **kwargs):
            self.splash_url = splash_url
            self.domain_type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4],
                                         date['date_day'][4:6],
                                         date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.png = crawler_options['png']
            self.har = crawler_options['har']
            self.cookies = cookies

            config_section = 'Crawler'
            self.p = Process(config_section)
            self.item_dir = os.path.join(
                self.p.config.get("Directories", "crawled"), date_str)

            config_loader = ConfigLoader.ConfigLoader()
            self.har_dir = os.path.join(
                config_loader.get_files_directory('har'), date_str)
            config_loader = None

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.root_key = None

        def build_request_arg(self, cookies):
            return {
                'wait': 10,
                'resource_timeout':
                30,  # /!\ Weird behaviour if timeout < resource_timeout /!\
                'timeout': 30,
                'cookies': cookies,
                'lua_source': script_cookie
            }

        def start_requests(self):
            l_cookies = self.build_request_arg(self.cookies)
            yield SplashRequest(self.start_urls,
                                self.parse,
                                errback=self.errback_catcher,
                                endpoint='execute',
                                meta={
                                    'father': self.original_item,
                                    'current_url': self.start_urls
                                },
                                args=l_cookies)

        # # TODO: remove duplicate and anchor
        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # no response
                #print('504 detected')
                pass

            # LUA ERROR # # TODO: print/display errors
            elif 'error' in response.data:
                if (response.data['error'] == 'network99'):
                    ## splash restart ##
                    # 'request' is not defined in parse(); retry metadata lives on the response
                    error_retry = response.meta.get('error_retry', 0)
                    if error_retry < 3:
                        error_retry += 1
                        url = response.meta['current_url']
                        father = response.meta['father']

                        self.logger.error(
                            'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                            url)
                        time.sleep(10)
                        yield SplashRequest(url,
                                            self.parse,
                                            errback=self.errback_catcher,
                                            endpoint='execute',
                                            cache_args=['lua_source'],
                                            meta={
                                                'father': father,
                                                'current_url': url,
                                                'error_retry': error_retry
                                            },
                                            args=self.build_request_arg(
                                                response.cookiejar))
                    else:
                        print('Connection to proxy refused')
                else:
                    print(response.data['error'])

            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                print(error_log)
            #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
            #    pass # ignore response
            else:

                item_id = crawlers.create_item_id(self.item_dir,
                                                  self.domains[0])
                self.save_crawled_item(item_id, response.data['html'])
                crawlers.create_item_metadata(item_id, self.domains[0],
                                              response.data['last_url'],
                                              self.port,
                                              response.meta['father'])

                if self.root_key is None:
                    self.root_key = item_id
                    crawlers.add_domain_root_item(item_id, self.domain_type,
                                                  self.domains[0],
                                                  self.date_epoch, self.port)
                    crawlers.create_domain_metadata(self.domain_type,
                                                    self.domains[0], self.port,
                                                    self.full_date,
                                                    self.date_month)

                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                # SCREENSHOT
                if 'png' in response.data and self.png:
                    sha256_string = Screenshot.save_crawled_screeshot(
                        response.data['png'],
                        5000000,
                        f_save=self.requested_mode)
                    if sha256_string:
                        Screenshot.save_item_relationship(
                            sha256_string, item_id)
                        Screenshot.save_domain_relationship(
                            sha256_string, self.domains[0])
                # HAR
                if 'har' in response.data and self.har:
                    crawlers.save_har(self.har_dir, item_id,
                                      response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(link.url,
                                        self.parse,
                                        errback=self.errback_catcher,
                                        endpoint='execute',
                                        meta={
                                            'father': item_id,
                                            'current_url': link.url
                                        },
                                        args=l_cookies)

        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                ## DEBUG ##
                self.logger.error(failure.request)
                if failure.value.response:
                    self.logger.error(failure.value.response)
                ## ----- ##

                # Extract request metadata
                url = failure.request.meta['current_url']
                father = failure.request.meta['father']
                l_cookies = self.build_request_arg(
                    failure.request.meta['splash']['args']['cookies'])

                # Check if Splash restarted
                if not crawlers.is_splash_reachable(self.splash_url):
                    self.logger.error(
                        'Splash, ResponseNeverReceived for %s, retry in 30s ...',
                        url)
                    time.sleep(30)

                yield SplashRequest(url,
                                    self.parse,
                                    errback=self.errback_catcher,
                                    endpoint='execute',
                                    meta={
                                        'father': father,
                                        'current_url': url
                                    },
                                    args=l_cookies)

            else:
                self.logger.error(failure.type)
                self.logger.error(failure.getErrorMessage())

        def save_crawled_item(self, item_id, item_content):
            gzip64encoded = crawlers.save_crawled_item(item_id, item_content)

            # Send item to queue
            # send paste to Global
            relay_message = "{0} {1}".format(item_id, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder",
                                           "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(item_id)
            self.p.populate_set_out(msg, 'Tags')
Code example #55
File: Tokenize.py  Project: Rafiot/AIL-framework
*Need running Redis instances. (Redis)
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Tokenize'
    p = Process(config_section)

    # LOGGING #
    publisher.info("Tokeniser started")

    while True:
        message = p.get_from_set()
        print(message)
        if message is not None:
            paste = Paste.Paste(message)
            for word, score in paste._get_top_words().items():
                if len(word) >= 4:
                    msg = '{} {} {}'.format(paste.p_path, word, score)
                    p.populate_set_out(msg)
        else:
            publisher.debug("Tokeniser is idling 10s")
Code example #56
    # LOG: CONFIGURE PUBLISHER
    # ----------------------------------------------------

    publisher.port = 6380
    publisher.channel = 'Script'


    # REDIS QUEUE: CONFIGURE ACCESS TO MESSAGES QUEUE
    # ----------------------------------------------------

    # Section name in bin/packages/modules.cfg
    config_section = 'TwitterAnalyzer'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logging system
    publisher.info("Script Twitter Sentiment Analysis started")


    # DATABASES: CONFIGURE DATABASES
    # ----------------------------------------------------

    # DB FOR TWITTER ANALYSIS
    serverTA = redis.StrictRedis(host="localhost",port="6382",db=10,decode_responses=True)
    serverTT = redis.StrictRedis(host="localhost",port="6382",db=11,decode_responses=True)

    #serverTA = redis.StrictRedis(
    #	host=p.config.get("ARDB_TwitterAnalyzer", "host"),
    #	port=p.config.get("ARDB_TwitterAnalyzer", "port"),
Code example #57

def rreplace(s, old, new, occurrence):
    li = s.rsplit(old, occurrence)
    return new.join(li)
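# Illustrative usage (hypothetical arguments): rreplace('a.b.c', '.', '/', 1) returns 'a.b/c'.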


if __name__ == '__main__':
    publisher.port = 6380
    publisher.channel = 'Script'
    processed_paste = 0
    time_1 = time.time()

    config_section = 'Global'

    p = Process(config_section)

    # get and sanitize the PASTE directory
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "pastes"))
    PASTES_FOLDERS = PASTES_FOLDER + '/'
    PASTES_FOLDERS = os.path.join(os.path.realpath(PASTES_FOLDERS), '')

    # LOGGING #
    publisher.info("Feed Script started to receive & publish.")

    while True:

        message = p.get_from_set()
        # Recover the streamed message information.
        if message is not None:
Code example #58
File: CreditCard.py  Project: cs24/AIL-framework
import pprint
import time
from packages import Paste
from packages import lib_refine
from pubsublogger import publisher
import re

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'CreditCards'

    p = Process(config_section)

    # FUNCTIONS #
    publisher.info("Creditcard script subscribed to channel creditcard_categ")

    creditcard_regex = "4[0-9]{12}(?:[0-9]{3})?"

    # FIXME For retro compatibility
    channel = 'creditcard_categ'

    # Source: http://www.richardsramblings.com/regex/credit-card-numbers/
    cards = [
        r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16-digit VISA, with separators
        r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # 16 digits MasterCard
        r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Discover Card
        r'\b35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b',  # Japan Credit Bureau (JCB)
Code example #59
    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.arg_crawler = {  'html': crawler_options['html'],
                                  'wait': 10,
                                  'render_all': 1,
                                  'har': crawler_options['har'],
                                  'png': crawler_options['png']}

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.r_serv_onion = redis.StrictRedis(
                host=self.p.config.get("ARDB_Onion", "host"),
                port=self.p.config.getint("ARDB_Onion", "port"),
                db=self.p.config.getint("ARDB_Onion", "db"),
                decode_responses=True)

            self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                            self.p.config.get("Directories", "crawled"), date_str )

            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                errback=self.errback_catcher,
                endpoint='render.json',
                meta={'father': self.original_item, 'root_key': None},
                args=self.arg_crawler
            )

        def parse(self,response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # down ?
                print('504 detected')
            elif response.status != 200:
                print('other response: {}'.format(response.status))
                #print(error_log)
                #detect connection to proxy refused
                error_log = (json.loads(response.body.decode()))
                if(error_log['info']['text'] == 'Connection to proxy refused'):
                    print('Connection to proxy refused')
            else:

                #avoid filename too big
                if len(self.domains[0]) > 215:
                    UUID = self.domains[0][-215:]+str(uuid.uuid4())
                else:
                    UUID = self.domains[0]+str(uuid.uuid4())
                filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
                relative_filename_paste = os.path.join(self.crawler_path, UUID)
                filename_har = os.path.join(self.crawled_har, UUID)

                # # TODO: modify me
                # save new paste on disk
                if self.save_crawled_paste(relative_filename_paste, response.data['html']):

                    # add this paste to the domain crawled set # TODO: # FIXME:  put this on cache ?
                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)

                    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                    self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
                    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])

                    # create onion metadata
                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)

                    # create root_key
                    if self.root_key is None:
                        self.root_key = relative_filename_paste
                        # Create/Update crawler history
                        self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
                        # Update domain port number
                        all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
                        if all_domain_ports:
                            all_domain_ports = all_domain_ports.split(';')
                        else:
                            all_domain_ports = []
                        if self.port not in all_domain_ports:
                            all_domain_ports.append(self.port)
                            self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

                    #create paste metadata
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)

                    self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)

                    if 'png' in response.data:
                        size_screenshot = (len(response.data['png'])*3) /4

                        if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto
                            image_content = base64.standard_b64decode(response.data['png'].encode())
                            hash = sha256(image_content).hexdigest()
                            img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
                            filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
                            dirname = os.path.dirname(filename_img)
                            if not os.path.exists(dirname):
                                os.makedirs(dirname)
                            if not os.path.exists(filename_img):
                                with open(filename_img, 'wb') as f:
                                    f.write(image_content)
                            # add item metadata
                            self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
                            # add sha256 metadata
                            self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste)

                    if 'har' in response.data:
                        dirname = os.path.dirname(filename_har)
                        if not os.path.exists(dirname):
                            os.makedirs(dirname)
                        with open(filename_har+'.json', 'wb') as f:
                            f.write(json.dumps(response.data['har']).encode())

                    # save external links in set
                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    #for link in lext.extract_links(response):
                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)

                    le = LinkExtractor(allow_domains=self.domains, unique=True)
                    for link in le.extract_links(response):
                        yield SplashRequest(
                            link.url,
                            self.parse,
                            errback=self.errback_catcher,
                            endpoint='render.json',
                            meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
                            args=self.arg_crawler
                        )

        def errback_catcher(self, failure):
            # catch all errback failures,
            self.logger.error(repr(failure))

            if failure.check(ResponseNeverReceived):
                request = failure.request
                url = request.meta['splash']['args']['url']
                father = request.meta['father']

                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                time.sleep(10)
                # 'response' is not defined in an errback; recover root_key from the failed request
                response_root_key = request.meta.get('root_key', None)
                yield SplashRequest(
                    url,
                    self.parse,
                    errback=self.errback_catcher,
                    endpoint='render.json',
                    meta={'father': father, 'root_key': response_root_key},
                    args=self.arg_crawler
                )

            else:
                print('failure')
                #print(failure)
                print(failure.type)
                #print(failure.request.meta['item'])

            '''
            #if isinstance(failure.value, HttpError):
            elif failure.check(HttpError):
                # you can get the response
                response = failure.value.response
                print('HttpError')
                self.logger.error('HttpError on %s', response.url)

            #elif isinstance(failure.value, DNSLookupError):
            elif failure.check(DNSLookupError):
                # this is the original request
                request = failure.request
                print(DNSLookupError)
                print('DNSLookupError')
                self.logger.error('DNSLookupError on %s', request.url)

            #elif isinstance(failure.value, TimeoutError):
            elif failure.check(TimeoutError):
                request = failure.request
                print('TimeoutError')
                print(TimeoutError)
                self.logger.error('TimeoutError on %s', request.url)
            '''

        def save_crawled_paste(self, filename, content):

            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True