Example #1
def initialize_url_filter():
    global url_filter
    # initialize the crawlphish url filter
    url_filter = CrawlphishURLFilter()
    # TODO schedule tasks to reload lists
    url_filter.load()
    logging.debug("url filter loaded")
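The TODO above leaves the list-reload scheduling open. Below is a minimal sketch of one way to handle it, assuming only that the filter object exposes a load() method; the DummyFilter stub and schedule_reload helper are illustrative and not part of the crawlphish module.

import logging
import threading


class DummyFilter:
    # stand-in for CrawlphishURLFilter; only the load() method is assumed here
    def load(self):
        logging.debug("url filter lists reloaded")


def schedule_reload(url_filter, interval=300.0):
    # reload the filter lists every `interval` seconds on a daemon timer
    def _reload():
        url_filter.load()
        schedule_reload(url_filter, interval)  # re-arm for the next cycle

    timer = threading.Timer(interval, _reload)
    timer.daemon = True
    timer.start()
    return timer


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    url_filter = DummyFilter()
    url_filter.load()
    schedule_reload(url_filter, interval=5.0)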
Example #2
    def execute_analysis(self, url):

        if not self._initialized:
            # used to decide what URLs to actually crawl
            self.url_filter = CrawlphishURLFilter()

            # a whitelist of sites we'll always crawl
            self.watch_file(self.url_filter.whitelist_path,
                            self.url_filter.load_whitelist)
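            # a blacklist of sites we'll never crawl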
            self.watch_file(self.url_filter.blacklist_path,
                            self.url_filter.load_blacklist)
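            # regular expressions applied to URL paths by the filter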
            self.watch_file(self.url_filter.regex_path,
                            self.url_filter.load_path_regexes)

            self._initialized = True

        analysis = self.create_analysis(url)
        # are we able to download it?
        analysis.downloaded = False
        # if not, why?
        #analysis.error_reason = None

        # is this URL crawlable?
        filter_result = self.url_filter.filter(url.value)
        analysis.filtered_status = filter_result.filtered
        analysis.filtered_status_reason = filter_result.reason

        if analysis.filtered_status:
            logging.debug("{} is not crawlable: {}".format(
                url.value, analysis.filtered_status_reason))
            return False

        parsed_url = filter_result.parsed_url
        if parsed_url is None:
            logging.debug("unable to parse url {}".format(url.value))
            return False

        formatted_url = urlunparse(parsed_url)

        # update brocess if we're configured to do so
        if self.update_brocess and parsed_url.hostname and not is_ipv4(
                parsed_url.hostname):
            logging.debug(
                "updating brocess with crawlphish request for {}".format(
                    parsed_url.hostname))
            add_httplog(parsed_url.hostname)

        # what proxies are we going to use to attempt to download the url?
        # these are attempted in the order specified in the configuration setting
        proxy_configs = []
        for name in self.proxies.split(','):
            if name == 'GLOBAL':
                proxy_configs.append((name, saq.PROXIES))
            else:
                proxy_configs.append((name, saq.OTHER_PROXIES[name]))

        proxy_result = None

        for proxy_name, proxy_config in proxy_configs:

            proxy_result = CloudphishProxyResult()
            proxy_result.proxy_name = proxy_name
            analysis.proxies.append(proxy_name)
            analysis.proxy_results[proxy_name] = proxy_result
            session = requests.Session()
            session.proxies = proxy_config

            try:
                logging.info("requesting url {} via {}".format(
                    formatted_url, proxy_name))
                response = session.request('GET',
                                           formatted_url,
                                           headers=self.headers,
                                           timeout=self.timeout,
                                           allow_redirects=True,
                                           verify=False,
                                           stream=True)

                proxy_result.status_code = response.status_code
                proxy_result.status_code_reason = response.reason
                logging.info("url request result {} ({}) for {}".format(
                    response.status_code, response.reason, formatted_url))

                for header in response.headers.keys():
                    proxy_result.headers[header] = response.headers[header]

                for part in response.history:
                    proxy_result.history.append(part.url)

                # did we get a 4xx or 5xx error response?
                if response.status_code // 100 in (4, 5):
                    proxy_result.error_reason = '({}) {}'.format(
                        proxy_result.status_code,
                        proxy_result.status_code_reason)
                    continue

                # all is well -- break out and download the content
                break

            except requests.Timeout as e:
                proxy_result.error_reason = "request timed out"
                continue
            except Exception as e:
                proxy_result.error_reason = str(e)
                #report_exception()
                continue

            # we should never get here
            logging.error("executed invalid branch?")
            break

        # did we successfully start a download?
        if proxy_result.error_reason is not None:
            logging.info("unable to download {}: {}".format(
                formatted_url, proxy_result.error_reason))
            return True

        path_components = [x for x in parsed_url.path.split('/') if x.strip()]

        # need to figure out what to call it
        file_name = None
        # content-disposition header is the official way
        if 'content-disposition' in response.headers:
            file_name = response.headers['content-disposition']
            # we could potentially see something like: attachment; filename="blah..."
            content_file_match = re.search(
                r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?',
                response.headers['content-disposition'])
            if content_file_match:
                file_name = content_file_match.group('real_filename')

                # handle rfc5987 which allows utf-8 encoding and url-encoding
                if file_name.lower().startswith("utf-8''"):
                    file_name = file_name[7:]
                    file_name = urllib.parse.unquote(file_name)

        # otherwise we use the last element of the path
        if not file_name and parsed_url.path and not parsed_url.path.endswith(
                '/'):
            file_name = path_components[-1]

        # default if we can't figure it out
        if not file_name:
            file_name = 'unknown.crawlphish'

        # truncate if too long, keeping the tail so any file extension survives
        if len(file_name) > self.max_file_name_length:
            file_name = file_name[len(file_name) - self.max_file_name_length:]

        # replace invalid filesystem characters
        file_name = secure_filename(file_name)

        # make the crawlphish dir
        dest_dir = os.path.join(self.root.storage_dir, 'crawlphish')
        try:
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
        except Exception as e:
            logging.error("unable to create directory {}: {}".format(
                dest_dir, e))
        file_path = os.path.join(dest_dir, file_name)

        # prevent file path collision
        if os.path.isfile(file_path):
            duplicate_count = 1
            file_path = os.path.join(
                dest_dir, "{}_{}".format(duplicate_count, file_name))
            while os.path.isfile(file_path):
                duplicate_count = duplicate_count + 1
                file_path = os.path.join(
                    dest_dir, "{}_{}".format(duplicate_count, file_name))

        # download the results up to the limit
        try:
            bytes_downloaded = 0
            with open(file_path, 'wb') as fp:
                for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                    bytes_downloaded += len(chunk)
                    fp.write(chunk)

                    if bytes_downloaded >= self.max_download_size:
                        logging.debug(
                            "exceeded max download size for {}".format(url))
                        response.close()
                        # stop reading once the cap is reached
                        break

            logging.debug("downloaded {} bytes for {}".format(
                bytes_downloaded, file_path))

        except Exception as e:
            analysis.downloaded = False
            proxy_result.error_reason = "data transfer interrupted: {}".format(
                e)
            logging.debug("url {} transfer failed: {}".format(url, e))
            return True

        # record all the details of the transaction
        analysis.downloaded = True
        analysis.file_name = file_name
        analysis.requested_url = formatted_url
        analysis.final_url = response.url

        # if the final url is different than the original url, record that url as an observable
        final_url = None
        if analysis.final_url and analysis.final_url != url.value:
            final_url = analysis.add_observable(F_URL,
                                                analysis.final_url,
                                                o_time=url.time)
            if final_url:
                final_url.add_tag('redirection_target')
                final_url.add_relationship(R_REDIRECTED_FROM, url)

        #if len(response.history) > 1:
        #url.add_tag('redirection')

        # and add the file for processing
        download = analysis.add_observable(
            F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
        if download:
            download.add_relationship(R_DOWNLOADED_FROM,
                                      final_url if final_url else url)
            # only extract if non-error http response
            if response.status_code >= 200 and response.status_code <= 299:
                download.add_directive(DIRECTIVE_EXTRACT_URLS)

        return True
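As a side note, the Content-Disposition handling above (including the RFC 5987 filename*=utf-8'' form) can be exercised on its own. Here is a minimal sketch using only the standard library; the extract_filename helper is hypothetical and mirrors the logic in the analyzer rather than being part of it.

import re
import urllib.parse


def extract_filename(content_disposition):
    # pull a usable file name out of a Content-Disposition header value;
    # handles both filename="..." and the RFC 5987 filename*=utf-8''... form
    match = re.search(
        r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?',
        content_disposition)
    if not match:
        return None

    file_name = match.group('real_filename')

    # RFC 5987: the value may be utf-8'' followed by percent-encoded bytes
    if file_name.lower().startswith("utf-8''"):
        file_name = urllib.parse.unquote(file_name[7:])

    return file_name


# quick checks
assert extract_filename('attachment; filename="report.pdf"') == 'report.pdf'
assert extract_filename("attachment; filename*=utf-8''na%C3%AFve.txt") == 'naïve.txt'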