Example #1
    def load_blacklist(self):
        logging.debug("loading blacklist from {}".format(self.blacklist_path))
        blacklisted_fqdn = []
        blacklisted_cidr = []

        try:
            with open(self.blacklist_path, 'r') as fp:
                for line in fp:
                    line = line.strip()

                    # skip comments
                    if line.startswith('#'):
                        continue

                    # skip blank lines
                    if line == '':
                        continue

                    if is_ipv4(line):
                        blacklisted_cidr.append(IPv4Network(add_netmask(line)))
                    else:
                        blacklisted_fqdn.append(line)

            self.blacklisted_cidr = blacklisted_cidr
            self.blacklisted_fqdn = blacklisted_fqdn
            logging.debug("loaded {} cidr {} fqdn blacklisted items".format(
                           len(self.blacklisted_cidr),
                           len(self.blacklisted_fqdn)))

        except Exception as e:
            logging.error("unable to load blacklist {}: {}".format(self.blacklist_path, e))
            report_exception()
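
The is_ipv4 and add_netmask helpers used here are not part of the excerpt. A minimal sketch of what they might look like, under the assumption that blacklist entries are either bare IPv4 addresses or CIDR blocks; the real implementations may differ:

import re

# Hypothetical helpers (assumed, not shown in the source excerpt).
IPV4_REGEX = re.compile(r'^\d{1,3}(\.\d{1,3}){3}(/\d{1,2})?$')

def is_ipv4(value):
    """Returns True if value looks like a dotted-quad IPv4 address, optionally with a /prefix."""
    return value is not None and IPV4_REGEX.match(value) is not None

def add_netmask(value):
    """Appends /32 when the value carries no explicit prefix length."""
    return value if '/' in value else '{}/32'.format(value)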
Example #2
    def filter(self, url):
        """Returns True if the given URL should be filtered (not crawled).  Check the reason property
           the reason the url is filtered."""
        result = FilterResult()
        result.filtered = False
        result.reason = REASON_UNKNOWN

        result.parsed_url = process_url(url)
        if not result.parsed_url:
            logging.debug("unable to process url {}".format(url))
            result.reason = REASON_ERROR
            return result

        logging.debug("analyzing scheme {} netloc {} hostname {} path {} params {} query {} fragment {}".format(
                      result.parsed_url.scheme,
                      result.parsed_url.netloc,
                      result.parsed_url.hostname,
                      result.parsed_url.path,
                      result.parsed_url.params,
                      result.parsed_url.query,
                      result.parsed_url.fragment))

        if self.is_whitelisted(result.parsed_url.hostname):
            result.reason = REASON_WHITELISTED
            result.filtered = False
            return result

        if self.is_blacklisted(result.parsed_url.hostname):
            result.reason = REASON_BLACKLISTED
            result.filtered = True
            return result

        # if the URL is just to an IP address then we crawl that no matter what
        if is_ipv4(result.parsed_url.hostname):
            result.reason = REASON_DIRECT_IPV4
            result.filtered = False
            return result

        if result.parsed_url.path:
            if self.matches_path_regex(result.parsed_url.path):
                result.reason = REASON_WHITELISTED
                result.filtered = False
                return result
            
        if self.is_in_intel_db(result.parsed_url):
            result.reason = REASON_CRITS
            result.filtered = False
            return result

        if not self.is_uncommon_network(result.parsed_url.hostname):
            result.reason = REASON_COMMON_NETWORK
            result.filtered = True
            return result

        result.filtered = False
        result.reason = REASON_OK
        return result
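
A hypothetical usage sketch of the filter, assuming the whitelist, blacklist, and path regexes have been loaded as shown in the surrounding examples:

import logging

# Hypothetical usage -- CrawlphishURLFilter and the load_* methods appear elsewhere in this module.
url_filter = CrawlphishURLFilter()
url_filter.load_whitelist()
url_filter.load_blacklist()
url_filter.load_path_regexes()

result = url_filter.filter('http://example.com/some/path')
if result.filtered:
    logging.info("skipping crawl: {}".format(result.reason))
else:
    logging.info("crawling ({})".format(result.reason))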
Example #3
    def is_whitelisted(self, value):
        if is_ipv4(value):
            for cidr in self.whitelisted_cidr:
                if IPv4Address(value) in cidr:
                    logging.debug("{} matches whitelisted cidr {}".format(value, cidr))
                    return True

            return False

        for dst in self.whitelisted_fqdn:
            if is_subdomain(value, dst):
                logging.debug("{} matches whitelisted fqdn {}".format(value, dst))
                return True

        return False
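
The is_subdomain helper is assumed here; a minimal sketch that matches when the hostname equals the listed FQDN or ends with it as a dot-separated suffix:

def is_subdomain(hostname, fqdn):
    """Hypothetical helper: True if hostname equals fqdn or is a subdomain of it."""
    hostname = hostname.lower().rstrip('.')
    fqdn = fqdn.lower().rstrip('.')
    return hostname == fqdn or hostname.endswith('.' + fqdn)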
Example #4
    def is_blacklisted(self, value):
        if is_ipv4(value):
            for cidr in self.blacklisted_cidr:
                try:
                    if IPv4Address(value) in cidr:
                        logging.debug("{} matches blacklisted cidr {}".format(value, cidr))
                        return True
                except Exception as e:
                    logging.error("failed to compare {} to {}: {}".format(value, cidr, e))
                    report_exception()

            return False

        for dst in self.blacklisted_fqdn:
            if is_subdomain(value, dst):
                logging.debug("{} matches blacklisted fqdn {}".format(value, dst))
                return True

        return False
Example #5
    def is_in_cache_db(self, value, cache_path):
        """Is this URL in crits?  value is the result of calling process_url on a URL."""
        assert isinstance(value, ParseResult)

        with sqlite3.connect('file:{}?mode=ro'.format(cache_path), uri=True) as db:
            db_cursor = db.cursor()
            row = None

            # check ipv4
            if is_ipv4(value.hostname):
                db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = ?", 
                                 (CRITS_IPV4, value.hostname))

                row = db_cursor.fetchone()
                if row:
                    logging.debug("{} matched ipv4 indicator {}".format(value.hostname, row[0]))
                    return True
            else:
                # check fqdn
                for partial_fqdn in iterate_fqdn_parts(value.hostname):
                    #logging.debug("checking crits for {}".format(partial_fqdn))
                    db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = ?",
                                     (CRITS_FQDN, partial_fqdn.lower()))

                    row = db_cursor.fetchone()
                    if row:
                        logging.debug("{} matched fqdn indicator {}".format(partial_fqdn, row[0]))
                        return True
                        
            # check full url
            db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = LOWER(?)",
                             (CRITS_URL, value.geturl()))

            row = db_cursor.fetchone()
            if row:
                logging.debug("{} matched url indicator{}".format(value.geturl(), row[0]))
                return True

            # check url path
            path = urlunparse(('', '', value.path, value.params, value.query, value.fragment))
            if path:
                db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = LOWER(?)",
                                 (CRITS_URL_PATH, path))

                row = db_cursor.fetchone()
                if row:
                    logging.debug("{} matched url_path indicator {}".format(value.path, row[0]))
                    return True

            # check url file name
            if value.path:
                if not value.path.endswith('/'):
                    file_name = value.path.split('/')[-1]
                    db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = LOWER(?)",
                                     (CRITS_FILE_NAME, file_name))

                    row = db_cursor.fetchone()
                    if row:
                        logging.debug("{} matched file_name indicator {}".format(file_name, row[0]))
                        return True

            return False
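
iterate_fqdn_parts is not shown; one plausible sketch yields increasingly broad suffixes of the hostname, most specific first, stopping before the bare TLD:

def iterate_fqdn_parts(fqdn):
    """Hypothetical helper: 'a.b.example.com' yields 'a.b.example.com', 'b.example.com', 'example.com'."""
    parts = fqdn.split('.')
    for index in range(len(parts) - 1):
        yield '.'.join(parts[index:])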
Example #6
    def execute_analysis(self, url):

        if not self._initialized:
            # used to decide what URLs to actually crawl
            self.url_filter = CrawlphishURLFilter()

            # a whitelist of sites we'll always crawl
            self.watch_file(self.url_filter.whitelist_path,
                            self.url_filter.load_whitelist)
            self.watch_file(self.url_filter.blacklist_path,
                            self.url_filter.load_blacklist)
            self.watch_file(self.url_filter.regex_path,
                            self.url_filter.load_path_regexes)

            self._initialized = True

        analysis = self.create_analysis(url)
        # are we able to download it?
        analysis.downloaded = False
        # if not, why?
        #analysis.error_reason = None

        # is this URL crawlable?
        filter_result = self.url_filter.filter(url.value)
        analysis.filtered_status = filter_result.filtered
        analysis.filtered_status_reason = filter_result.reason

        if analysis.filtered_status:
            logging.debug("{} is not crawlable: {}".format(
                url.value, analysis.filtered_status_reason))
            return False

        parsed_url = filter_result.parsed_url
        if parsed_url is None:
            logging.debug("unable to parse url {}".format(url.value))
            return False

        formatted_url = urlunparse(parsed_url)

        # update brocess if we're configured to do so
        if self.update_brocess and parsed_url.hostname and not is_ipv4(
                parsed_url.hostname):
            logging.debug(
                "updating brocess with crawlphish request for {}".format(
                    parsed_url.hostname))
            add_httplog(parsed_url.hostname)

        # what proxies are we going to use to attempt to download the url?
        # these are attempted in the order specified in the configuration setting
        proxy_configs = []
        for name in self.proxies.split(','):
            if name == 'GLOBAL':
                proxy_configs.append((name, saq.PROXIES))
            else:
                proxy_configs.append((name, saq.OTHER_PROXIES[name]))

        proxy_result = None

        for index, proxy_config in enumerate(proxy_configs):
            proxy_name, proxy_config = proxy_config

            proxy_result = CloudphishProxyResult()
            proxy_result.proxy_name = proxy_name
            analysis.proxies.append(proxy_name)
            analysis.proxy_results[proxy_name] = proxy_result
            session = requests.Session()
            session.proxies = proxy_config

            try:
                logging.info("requesting url {} via {}".format(
                    formatted_url, proxy_name))
                response = session.request('GET',
                                           formatted_url,
                                           headers=self.headers,
                                           timeout=self.timeout,
                                           allow_redirects=True,
                                           verify=False,
                                           stream=True)

                proxy_result.status_code = response.status_code
                proxy_result.status_code_reason = response.reason
                logging.info("url request result {} ({}) for {}".format(
                    response.status_code, response.reason, formatted_url))

                for header in response.headers.keys():
                    proxy_result.headers[header] = response.headers[header]

                for part in response.history:
                    proxy_result.history.append(part.url)

                # did we get an error code?
                if math.floor(response.status_code / 100) in [4, 5]:
                    proxy_result.error_reason = '({}) {}'.format(
                        proxy_result.status_code,
                        proxy_result.status_code_reason)
                    continue

                # all is well -- break out and download the content
                break

            except requests.Timeout as e:
                proxy_result.error_reason = "request timed out"
                continue
            except Exception as e:
                proxy_result.error_reason = str(e)
                #report_exception()
                continue

            # we should never get here
            logging.error("executed invalid branch?")
            break

        # did we successfully start a download?
        if proxy_result.error_reason is not None:
            logging.info("unable to download {}: {}".format(
                formatted_url, proxy_result.error_reason))
            return True

        path_components = [x for x in parsed_url.path.split('/') if x.strip()]

        # need to figure out what to call it
        file_name = None
        # content-disposition header is the official way
        if 'content-disposition' in response.headers:
            file_name = response.headers['content-disposition']
            # we could potentially see something like: attachment; filename="blah..."
            content_file_match = re.search(
                r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?',
                response.headers['content-disposition'])
            if content_file_match:
                file_name = content_file_match.group('real_filename')

                # handle rfc5987 which allows utf-8 encoding and url-encoding
                if file_name.lower().startswith("utf-8''"):
                    file_name = file_name[7:]
                    file_name = urllib.parse.unquote(file_name)

        # otherwise we use the last element of the path
        if not file_name and parsed_url.path and not parsed_url.path.endswith(
                '/'):
            file_name = path_components[-1]

        # default if we can't figure it out
        if not file_name:
            file_name = 'unknown.crawlphish'

        # truncate if too long
        if len(file_name) > self.max_file_name_length:
            file_name = file_name[len(file_name) - self.max_file_name_length:]

        # replace invalid filesystem characters
        file_name = secure_filename(file_name)

        # make the crawlphish dir
        dest_dir = os.path.join(self.root.storage_dir, 'crawlphish')
        try:
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
        except Exception as e:
            logging.error("unable to create directory {}: {}".format(
                dest_dir, e))
        file_path = os.path.join(dest_dir, file_name)

        # prevent file path collision
        if os.path.isfile(file_path):
            duplicate_count = 1
            file_path = os.path.join(
                dest_dir, "{}_{}".format(duplicate_count, file_name))
            while os.path.isfile(file_path):
                duplicate_count = duplicate_count + 1
                file_path = os.path.join(
                    dest_dir, "{}_{}".format(duplicate_count, file_name))

        # download the results up to the limit
        try:
            bytes_downloaded = 0
            with open(file_path, 'wb') as fp:
                for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                    bytes_downloaded += len(chunk)
                    fp.write(chunk)

                    if bytes_downloaded >= self.max_download_size:
                        logging.debug(
                            "exceeded max download size for {}".format(url))
                        response.close()
                        break

            logging.debug("downloaded {} bytes for {}".format(
                bytes_downloaded, file_path))

        except Exception as e:
            analysis.downloaded = False
            proxy_result.error_reason = "data transfer interrupted: {}".format(
                e)
            logging.debug("url {} transfer failed: {}".format(url, e))
            return True

        # record all the details of the transaction
        analysis.downloaded = True
        analysis.file_name = file_name
        analysis.requested_url = formatted_url
        analysis.final_url = response.url

        # if the final url is different than the original url, record that url as an observable
        final_url = None
        if analysis.final_url and analysis.final_url != url.value:
            final_url = analysis.add_observable(F_URL,
                                                analysis.final_url,
                                                o_time=url.time)
            if final_url:
                final_url.add_tag('redirection_target')
                final_url.add_relationship(R_REDIRECTED_FROM, url)

        #if len(response.history) > 1:
        #url.add_tag('redirection')

        # and add the file for processing
        download = analysis.add_observable(
            F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
        if download:
            download.add_relationship(R_DOWNLOADED_FROM,
                                      final_url if final_url else url)
            # only extract if non-error http response
            if response.status_code >= 200 and response.status_code <= 299:
                download.add_directive(DIRECTIVE_EXTRACT_URLS)

        return True
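
The Content-Disposition handling above is easy to get wrong; a hypothetical standalone version of the same logic, useful for exercising it in isolation:

import re
import urllib.parse

def filename_from_content_disposition(value):
    """Hypothetical helper mirroring the logic above: pulls a filename out of a
       Content-Disposition header value, handling RFC 5987 UTF-8 percent-encoding."""
    match = re.search(r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?', value)
    if match is None:
        return None
    file_name = match.group('real_filename')
    if file_name.lower().startswith("utf-8''"):
        file_name = urllib.parse.unquote(file_name[7:])
    return file_name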
Example #7
    def execute_analysis(self, url):
        from saq.modules.cloudphish import CloudphishAnalysis
        from saq.cloudphish import SCAN_RESULT_ERROR, SCAN_RESULT_PASS

        cloudphish_analysis = self.wait_for_analysis(url, CloudphishAnalysis)
        if not cloudphish_analysis:
            return False

        # is this a URL to an IP address to a single file in the root directory?
        # example: http://220.218.70.160/sec.hta

        try:
            parsed_url = urlparse(url.value)
        except Exception as e:
            logging.debug("unable to parse url {}: {}".format(url.value, e))
            return False

        # define what is considered suspicious to find in the root dir
        def _susp_file(path):
            for ext in [
                    'doc', 'docx', 'docm', 'xls', 'xlsx', 'xlsm', 'ppt',
                    'pptx', 'pptm', 'pdf', 'js', 'vbs', 'jse', 'exe', 'swf',
                    'jar', 'lnk', 'ps1', 'rtf', 'chm', 'bat', 'scr', 'hta',
                    'cab', 'pif', 'au3', 'a3x', 'eps', 'xla', 'pps',
                    'dot', 'dotm', 'pub', 'wsf', 'cmd', 'ps', 'vbe', 'wsc'
            ]:
                if path.lower().endswith('.{}'.format(ext)):
                    return True

            return False

        analysis = self.create_analysis(url)

        if parsed_url.hostname and parsed_url.path:
            if is_ipv4(parsed_url.hostname) and SINGLE_FILE_REGEX.match(
                    parsed_url.path):
                # ignore a link to a URL in the local network (common for companies to do locally)
                if not any([
                        parsed_url.hostname in cidr
                        for cidr in saq.MANAGED_NETWORKS
                ]):
                    # and then the file extension must end in something suspicious
                    if _susp_file(parsed_url.path):
                        analysis.details = True
                        url.add_detection_point(
                            "URL to ipv4 to suspicious file in root directory")
                        url.add_directive(DIRECTIVE_FORCE_DOWNLOAD)

        # is the URL to an actual internet host?
        if parsed_url.hostname and '.' in parsed_url.hostname:
            # did this URL come from a stream file from an office document?
            stream_file = search_down(
                url, lambda x: isinstance(x, Observable) and x.type == F_FILE
                and '.officeparser/stream' in x.value)
            if stream_file:
                # what did cloudphish think of this url?
                if cloudphish_analysis.analysis_result not in [
                        SCAN_RESULT_ERROR, SCAN_RESULT_PASS
                ]:
                    #analysis.details = True
                    #url.add_detection_point("uncommon URL in ole stream file")
                    url.add_directive(DIRECTIVE_FORCE_DOWNLOAD)

        return True
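
SINGLE_FILE_REGEX is referenced but not defined in this excerpt; a plausible definition, assuming the intent is a single file name sitting directly in the root directory (as in the http://220.218.70.160/sec.hta example above):

import re

# Hypothetical definition: matches paths like '/sec.hta' -- exactly one path component under the root.
SINGLE_FILE_REGEX = re.compile(r'^/[^/]+$')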