Example 1
def initialize_url_filter():
    global url_filter
    # initialize the crawlphish url filter
    url_filter = CrawlphishURLFilter()
    # TODO schedule tasks to reload lists
    url_filter.load()
    logging.debug("url filter loaded")
Example 2
    def execute_analysis(self, url):

        if not self._initialized:
            # used to decide what URLs to actually crawl
            self.url_filter = CrawlphishURLFilter()

            # a whitelist of sites we'll always crawl
            self.watch_file(self.url_filter.whitelist_path,
                            self.url_filter.load_whitelist)
            self.watch_file(self.url_filter.blacklist_path,
                            self.url_filter.load_blacklist)
            self.watch_file(self.url_filter.regex_path,
                            self.url_filter.load_path_regexes)

            self._initialized = True

        analysis = self.create_analysis(url)
        # are we able to download it?
        analysis.downloaded = False
        # if not, why?
        #analysis.error_reason = None

        # is this URL crawlable?
        filter_result = self.url_filter.filter(url.value)
        analysis.filtered_status = filter_result.filtered
        analysis.filtered_status_reason = filter_result.reason

        if analysis.filtered_status:
            logging.debug("{} is not crawlable: {}".format(
                url.value, analysis.filtered_status_reason))
            return False

        parsed_url = filter_result.parsed_url
        if parsed_url is None:
            logging.debug("unable to parse url {}".format(url.value))
            return False

        formatted_url = urlunparse(parsed_url)

        # update brocess if we're configured to do so
        if self.update_brocess and parsed_url.hostname and not is_ipv4(
                parsed_url.hostname):
            logging.debug(
                "updating brocess with crawlphish request for {}".format(
                    parsed_url.hostname))
            add_httplog(parsed_url.hostname)

        # what proxies are we going to use to attempt to download the url?
        # these are attempted in the order specified in the configuration setting
        proxy_configs = []
        for name in self.proxies.split(','):
            if name == 'GLOBAL':
                proxy_configs.append((name, saq.PROXIES))
            else:
                proxy_configs.append((name, saq.OTHER_PROXIES[name]))

        proxy_result = None

        for index, proxy_config in enumerate(proxy_configs):
            proxy_name, proxy_config = proxy_config

            proxy_result = CloudphishProxyResult()
            proxy_result.proxy_name = proxy_name
            analysis.proxies.append(proxy_name)
            analysis.proxy_results[proxy_name] = proxy_result
            session = requests.Session()
            session.proxies = proxy_config

            try:
                logging.info("requesting url {} via {}".format(
                    formatted_url, proxy_name))
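                # stream=True defers the body transfer so the size cap below can be enforced;
                # verify=False skips TLS certificate validation for this request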
                response = session.request('GET',
                                           formatted_url,
                                           headers=self.headers,
                                           timeout=self.timeout,
                                           allow_redirects=True,
                                           verify=False,
                                           stream=True)

                proxy_result.status_code = response.status_code
                proxy_result.status_code_reason = response.reason
                logging.info("url request result {} ({}) for {}".format(
                    response.status_code, response.reason, formatted_url))

                for header in response.headers.keys():
                    proxy_result.headers[header] = response.headers[header]

                for part in response.history:
                    proxy_result.history.append(part.url)

                # did we get an error code?
                if response.status_code // 100 in (4, 5):
                    proxy_result.error_reason = '({}) {}'.format(
                        proxy_result.status_code,
                        proxy_result.status_code_reason)
                    continue

                # all is well -- break out and download the content
                break

            except requests.Timeout as e:
                proxy_result.error_reason = "request timed out"
                continue
            except Exception as e:
                proxy_result.error_reason = str(e)
                #report_exception()
                continue

            # we should never get here
            logging.error("executed invalid branch?")
            break

        # did we successfully start a download?
        if proxy_result.error_reason is not None:
            logging.info("unable to download {}: {}".format(
                formatted_url, proxy_result.error_reason))
            return True

        path_components = [x for x in parsed_url.path.split('/') if x.strip()]

        # need to figure out what to call it
        file_name = None
        # content-disposition header is the official way
        if 'content-disposition' in response.headers:
            file_name = response.headers['content-disposition']
            # we could potentially see something like this here: attachment; filename="blah..."
            content_file_match = re.search(
                r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?',
                response.headers['content-disposition'])
            if content_file_match:
                file_name = content_file_match.group('real_filename')

                # handle rfc5987 which allows utf-8 encoding and url-encoding
                if file_name.lower().startswith("utf-8''"):
                    file_name = file_name[7:]
                    file_name = urllib.parse.unquote(file_name)

        # otherwise we use the last element of the path
        if not file_name and parsed_url.path and not parsed_url.path.endswith(
                '/'):
            file_name = path_components[-1]

        # default if we can't figure it out
        if not file_name:
            file_name = 'unknown.crawlphish'

        # truncate if too long
        if len(file_name) > self.max_file_name_length:
            file_name = file_name[len(file_name) - self.max_file_name_length:]

        # replace invalid filesystem characters
        file_name = secure_filename(file_name)

        # make the crawlphish dir
        dest_dir = os.path.join(self.root.storage_dir, 'crawlphish')
        try:
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
        except Exception as e:
            logging.error("unable to create directory {}: {}".format(
                dest_dir, e))
        file_path = os.path.join(dest_dir, file_name)

        # prevent file path collision
        if os.path.isfile(file_path):
            duplicate_count = 1
            file_path = os.path.join(
                dest_dir, "{}_{}".format(duplicate_count, file_name))
            while os.path.isfile(file_path):
                duplicate_count = duplicate_count + 1
                file_path = os.path.join(
                    dest_dir, "{}_{}".format(duplicate_count, file_name))

        # download the results up to the limit
        try:
            bytes_downloaded = 0
            with open(file_path, 'wb') as fp:
                for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                    bytes_downloaded += len(chunk)
                    fp.write(chunk)

                    if bytes_downloaded >= self.max_download_size:
                        logging.debug(
                            "exceeded max download size for {}".format(url))
                        # stop reading once the cap is hit; keep what was downloaded so far
                        response.close()
                        break

            logging.debug("downloaded {} bytes for {}".format(
                bytes_downloaded, file_path))

        except Exception as e:
            analysis.downloaded = False
            proxy_result.error_reason = "data transfer interrupted: {}".format(
                e)
            logging.debug("url {} transfer failed: {}".format(url, e))
            return True

        # record all the details of the transaction
        analysis.downloaded = True
        analysis.file_name = file_name
        analysis.requested_url = formatted_url
        analysis.final_url = response.url

        # if the final url is different than the original url, record that url as an observable
        final_url = None
        if analysis.final_url and analysis.final_url != url.value:
            final_url = analysis.add_observable(F_URL,
                                                analysis.final_url,
                                                o_time=url.time)
            if final_url:
                final_url.add_tag('redirection_target')
                final_url.add_relationship(R_REDIRECTED_FROM, url)

        #if len(response.history) > 1:
        #url.add_tag('redirection')

        # and add the file for processing
        download = analysis.add_observable(
            F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
        if download:
            download.add_relationship(R_DOWNLOADED_FROM,
                                      final_url if final_url else url)
            # only extract if non-error http response
            if 200 <= response.status_code <= 299:
                download.add_directive(DIRECTIVE_EXTRACT_URLS)

        return True
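The Content-Disposition handling above is compact but subtle: the header value may carry a quoted or unquoted filename=, or the RFC 5987 filename*=utf-8''... form. The standalone helper below restates that logic as an illustrative sketch; the function name and the default file name are hypothetical, not part of the module.

import re
import urllib.parse

def filename_from_content_disposition(value, default='unknown.crawlphish'):
    """Best-effort extraction of a file name from a Content-Disposition header value."""
    match = re.search(r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?', value)
    if not match:
        return default

    file_name = match.group('real_filename')

    # rfc5987 allows a charset prefix plus percent-encoding, e.g. filename*=utf-8''na%C3%AFve.pdf
    if file_name.lower().startswith("utf-8''"):
        file_name = urllib.parse.unquote(file_name[len("utf-8''"):])

    return file_name

# both the classic and the rfc5987 forms resolve to a usable name
assert filename_from_content_disposition('attachment; filename="report.pdf"') == 'report.pdf'
assert filename_from_content_disposition("attachment; filename*=utf-8''na%C3%AFve.pdf") == 'naïve.pdf'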
Example 3
class CrawlphishAnalyzer(AnalysisModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.headers = {'User-Agent': self.config['user-agent']}

        self._initialized = False

    def verify_environment(self):
        self.verify_config_exists('whitelist_path')
        self.verify_path_exists(self.config['whitelist_path'])
        self.verify_config_exists('regex_path')
        self.verify_path_exists(self.config['regex_path'])
        self.verify_config_exists('blacklist_path')
        self.verify_path_exists(self.config['blacklist_path'])
        self.verify_config_exists('uncommon_network_threshold')
        self.verify_config_exists('user-agent')
        self.verify_config_exists('timeout')
        self.verify_config_exists('max_download_size')
        self.verify_config_exists('cooldown_period')
        self.verify_config_exists('update_brocess')
        self.verify_config_exists('proxies')

        for name in self.config['proxies'].split(','):
            if name == 'GLOBAL':
                continue

            if 'proxy_{}'.format(name) not in saq.CONFIG:
                logging.critical(
                    "invalid proxy name {} in crawlphish config".format(name))

    @property
    def whitelist_path(self):
        return self.url_filter.whitelist_path

    @property
    def regex_path(self):
        return self.url_filter.regex_path

    @property
    def blacklist_path(self):
        return self.url_filter.blacklist_path

    @property
    def uncommon_network_threshold(self):
        """How many connections decides that a given fqdn or ip address is an "uncommon network"? """
        return self.config.getint('uncommon_network_threshold')

    @property
    def user_agent(self):
        return self.config['user-agent']

    @property
    def timeout(self):
        """How long to wait for an HTTP request to time out (in seconds)."""
        return self.config.getint('timeout')

    @property
    def max_download_size(self):
        """Maximum download size (in MB)."""
        return self.config.getint('max_download_size') * 1024 * 1024

    @property
    def update_brocess(self):
        """Are we updating brocess when we make a request?"""
        return self.config.getboolean('update_brocess')

    @property
    def proxies(self):
        """The list of proxies we'll use to download URLs, attepted in order."""
        return self.config['proxies']

    @property
    def generated_analysis_type(self):
        return CrawlphishAnalysisV2

    @property
    def valid_observable_types(self):
        return F_URL

    @property
    def required_directives(self):
        return [DIRECTIVE_CRAWL]

    def execute_analysis(self, url):

        if not self._initialized:
            # used to decide what URLs to actually crawl
            self.url_filter = CrawlphishURLFilter()

            # a whitelist of sites we'll always crawl
            self.watch_file(self.url_filter.whitelist_path,
                            self.url_filter.load_whitelist)
            self.watch_file(self.url_filter.blacklist_path,
                            self.url_filter.load_blacklist)
            self.watch_file(self.url_filter.regex_path,
                            self.url_filter.load_path_regexes)

            self._initialized = True

        analysis = self.create_analysis(url)
        # are we able to download it?
        analysis.downloaded = False
        # if not, why?
        #analysis.error_reason = None

        # is this URL crawlable?
        filter_result = self.url_filter.filter(url.value)
        analysis.filtered_status = filter_result.filtered
        analysis.filtered_status_reason = filter_result.reason

        if analysis.filtered_status:
            logging.debug("{} is not crawlable: {}".format(
                url.value, analysis.filtered_status_reason))
            return False

        parsed_url = filter_result.parsed_url
        if parsed_url is None:
            logging.debug("unable to parse url {}".format(url.value))
            return False

        formatted_url = urlunparse(parsed_url)

        # update brocess if we're configured to do so
        if self.update_brocess and parsed_url.hostname and not is_ipv4(
                parsed_url.hostname):
            logging.debug(
                "updating brocess with crawlphish request for {}".format(
                    parsed_url.hostname))
            add_httplog(parsed_url.hostname)

        # what proxies are we going to use to attempt to download the url?
        # these are attempted in the order specified in the configuration setting
        proxy_configs = []
        for name in self.proxies.split(','):
            if name == 'GLOBAL':
                proxy_configs.append((name, saq.PROXIES))
            else:
                proxy_configs.append((name, saq.OTHER_PROXIES[name]))

        proxy_result = None

        for index, proxy_config in enumerate(proxy_configs):
            proxy_name, proxy_config = proxy_config

            proxy_result = CloudphishProxyResult()
            proxy_result.proxy_name = proxy_name
            analysis.proxies.append(proxy_name)
            analysis.proxy_results[proxy_name] = proxy_result
            session = requests.Session()
            session.proxies = proxy_config

            try:
                logging.info("requesting url {} via {}".format(
                    formatted_url, proxy_name))
                response = session.request('GET',
                                           formatted_url,
                                           headers=self.headers,
                                           timeout=self.timeout,
                                           allow_redirects=True,
                                           verify=False,
                                           stream=True)

                proxy_result.status_code = response.status_code
                proxy_result.status_code_reason = response.reason
                logging.info("url request result {} ({}) for {}".format(
                    response.status_code, response.reason, formatted_url))

                for header in response.headers.keys():
                    proxy_result.headers[header] = response.headers[header]

                for part in response.history:
                    proxy_result.history.append(part.url)

                # did we get an error code?
                if response.status_code // 100 in (4, 5):
                    proxy_result.error_reason = '({}) {}'.format(
                        proxy_result.status_code,
                        proxy_result.status_code_reason)
                    continue

                # all is well -- break out and download the content
                break

            except requests.Timeout as e:
                proxy_result.error_reason = "request timed out"
                continue
            except Exception as e:
                proxy_result.error_reason = str(e)
                #report_exception()
                continue

            # we should never get here
            logging.error("executed invalid branch?")
            break

        # did we successfully start a download?
        if proxy_result.error_reason is not None:
            logging.info("unable to download {}: {}".format(
                formatted_url, proxy_result.error_reason))
            return True

        # for each url we download we use a file name inside a directory with the following format
        # parsed_url.hostname_N
        # where N is an integer starting at 0 and increments each time
        # we store the next number to use in the state for this module
        if not self.state:
            self.state = {}
            # next integer to use for unknown.crawlphish files
            self.state['next_unknown'] = 0
            # key = parsed_url.hostname, value = next integer to use
            self.state['host_counts'] = {}

        path_components = [x for x in parsed_url.path.split('/') if x.strip()]

        # need to figure out what to call it
        file_name = None
        # content-disposition header is the official way
        if 'content-disposition' in response.headers:
            file_name = response.headers['content-disposition']
            # we could potentially see something like this here: attachment; filename="blah..."
            content_file_match = re.search(
                'attachment; filename="?(?P<real_filename>[^"]+)"?',
                response.headers['content-disposition'])
            if content_file_match:
                file_name = content_file_match.group('real_filename')
                # replace any / or .. sequences with _ and collapse repeated underscores
                file_name = re.sub(r'/', '_', file_name)
                file_name = re.sub(r'\.\.', '_', file_name)
                file_name = re.sub(r'_+', '_', file_name)

        # otherwise we use the last element of the path
        if not file_name and parsed_url.path and not parsed_url.path.endswith(
                '/'):
            file_name = path_components[-1]

        # default if we can't figure it out
        if not file_name:
            file_name = 'unknown_{}.crawlphish'.format(
                self.state['next_unknown'])
            self.state['next_unknown'] += 1

        hostname = parsed_url.hostname
        if not hostname:
            hostname = 'unknown_host'

        if hostname not in self.state['host_counts']:
            self.state['host_counts'][hostname] = 0

        dest_dir = os.path.join(
            self.root.storage_dir, 'crawlphish',
            '{}_{}'.format(hostname, self.state['host_counts'][hostname]))
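        # bump the per-host counter so the next URL from this host lands in its own hostname_N directory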
        self.state['host_counts'][hostname] += 1

        try:
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
        except Exception as e:
            logging.error("unable to create directory {}: {}".format(
                dest_dir, e))

        file_path = os.path.join(dest_dir, file_name)

        # download the results up to the limit
        try:
            bytes_downloaded = 0
            with open(file_path, 'wb') as fp:
                for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                    bytes_downloaded += len(chunk)
                    fp.write(chunk)

                    if bytes_downloaded >= self.max_download_size:
                        logging.debug(
                            "exceeded max download size for {}".format(url))
                        # stop reading once the cap is hit; keep what was downloaded so far
                        response.close()
                        break

            logging.debug("downloaded {} bytes for {}".format(
                bytes_downloaded, file_path))

        except Exception as e:
            analysis.downloaded = False
            proxy_result.error_reason = "data transfer interrupted: {}".format(
                e)
            logging.debug("url {} transfer failed: {}".format(url, e))
            return True

        # record all the details of the transaction
        analysis.downloaded = True
        analysis.file_name = file_name
        analysis.requested_url = formatted_url
        analysis.final_url = response.url

        # if the final url is different than the original url, record that url as an observable
        final_url = None
        if analysis.final_url and analysis.final_url != url.value:
            final_url = analysis.add_observable(F_URL,
                                                analysis.final_url,
                                                o_time=url.time)
            if final_url:
                final_url.add_tag('redirection_target')
                final_url.add_relationship(R_REDIRECTED_FROM, url)

        #if len(response.history) > 1:
        #url.add_tag('redirection')

        # and add the file for processing
        download = analysis.add_observable(
            F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
        if download:
            download.add_relationship(R_DOWNLOADED_FROM,
                                      final_url if final_url else url)
            # only extract if non-error http response
            if 200 <= response.status_code <= 299:
                download.add_directive(DIRECTIVE_EXTRACT_URLS)

        return True
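The comments in the method above describe the destination-directory scheme: each download goes into crawlphish/hostname_N, where N is a per-host counter kept in module state. The sketch below isolates just that bookkeeping; the helper name and the example paths are hypothetical, not part of the module.

import os

def next_dest_dir(state, storage_dir, hostname):
    """Return the next crawlphish destination directory for a host, updating state in place."""
    if 'host_counts' not in state:
        state['host_counts'] = {}

    count = state['host_counts'].get(hostname, 0)
    state['host_counts'][hostname] = count + 1
    return os.path.join(storage_dir, 'crawlphish', '{}_{}'.format(hostname, count))

# repeated downloads from the same host get distinct directories
state = {}
print(next_dest_dir(state, '/tmp/storage', 'example.com'))   # /tmp/storage/crawlphish/example.com_0
print(next_dest_dir(state, '/tmp/storage', 'example.com'))   # /tmp/storage/crawlphish/example.com_1
print(next_dest_dir(state, '/tmp/storage', 'unknown_host'))  # /tmp/storage/crawlphish/unknown_host_0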