def parse_process_tree_urls(self):
    self.logger.debug('Looking for URLs in process tree')
    urls = []
    for process in self.processes:
        # Append to the running list so URLs from every process are kept
        # instead of being overwritten on each loop iteration.
        urls += find_urls(process['command'])
        urls += find_urls(process['decoded_command'])
    return urls
def test_find_urls_binary():
    with open(f"{files_dir}/hello.bin", "rb") as f:
        blob = f.read()

    expected_urls = {"http://domain.com"}

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_csv_lookalike():
    with open(f"{files_dir}/csv_lookalike.txt", "rb") as f:
        blob = f.read()

    expected_urls = {"https://example.com"}

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_html():
    with open(f"{files_dir}/test.html", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://domain.com",
        "http://domain.com/action",
        "http://domain.com/background",
        "http://domain.com/css",
        "http://domain.com/href",
        "http://domain.com/meta",
        "http://domain.com/src",
        "http://domain.com/xmlns",
        "http://domain3.com",
        "http://faß.de/re.php",
        "http://domain2.com/image-small.png",
        "http://domain2.com/image-medium.png",
        "http://domain2.com/image-large.png",
        "http://domain4.com/index.php#thing=http://domain5.com",
        "http://domain5.com",
        "https://domain6.com",
        "https://domain.com/?a=1234",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_looks_like_html():
    with open(f"{files_dir}/looks_like_html.xml", "rb") as f:
        blob = f.read()

    expected_urls = {"https://example.com"}

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_csv():
    with open(f"{files_dir}/test.csv", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://domain.com",
        "http://domain2.com",
        "http://domain3.com"
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_domain_as_url():
    with open(f"{files_dir}/domain_as_url.txt", "rb") as f:
        blob = f.read()

    expected_urls = {
        "https://somefakesite.com",
        "https://somefakesite.com/index.html",
        "https://somefakesite2.com"
    }

    assert urlfinderlib.find_urls(blob, domain_as_url=True) == expected_urls


def test_find_urls_pdf():
    with open(f"{files_dir}/test.pdfparser", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://en.wikipedia.org/wiki/MIT_License",
        "http://domain.com",
        "http://domain.com/(test/123"
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_ical():
    with open(f"{files_dir}/test.ical", "rb") as f:
        blob = f.read()

    expected_urls = {
        "https://thisisjustatest.com",
        "https://thisisjustatest2.com",
        "https://thisisjustatest3.com",
        "https://thisisjustatest4.com",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_base_url_malformed():
    with open(f"{files_dir}/base_url_malformed.html", "rb") as f:
        blob = f.read()

    expected_urls = {
        "https://t.co",
        "https://t.co/asdf1234?amp=1",
        "https://t.co/images/stickman.gif",
        "https://www.w3schools.com/images/stickman.gif",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_xml():
    with open(f"{files_dir}/sharedStrings.xml", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "https://domain.com/test",
        "http://domain2.com",
        "http://www.w3.org/XML/1998/namespace",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_text_xml():
    with open(f"{files_dir}/text.xml", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://schemas.microsoft.com/office/drawing/2010/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/markup-compatibility/2006",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls
def setup(self, alert_uuids=[], manual_indicators=[], force=False):
    """ Parse everything in the event directory to build the event.json """

    if alert_uuids:
        # Make sure the event directory exists.
        if not os.path.exists(self.path):
            #os.makedirs(self.path, mode=0o770)
            os.makedirs(self.path)
            self.logger.debug('Created event directory: {}'.format(self.path))

        # Make sure the collect directory exists.
        collect_path = os.path.join(self.path, 'collect')
        if not os.path.exists(collect_path):
            #os.makedirs(collect_path, mode=0o770)
            os.makedirs(collect_path)
            self.logger.debug('Created collect directory: {}'.format(collect_path))

        # Figure out which alerts are new to the event.
        existing_alert_paths = [f['path'] for f in self.json['files'] if f['category'] == 'ace_alert']
        new_alert_uuids = []
        for alert_uuid in alert_uuids:
            if not any(alert_uuid in existing_path for existing_path in existing_alert_paths):
                new_alert_uuids.append(alert_uuid)

        # Download any new alerts into the event directory.
        # NOTE: THIS IS A TEMPORARY HACK UNTIL ACE CAN HANDLE THE API REDIRECTS!
        for alert_uuid in new_alert_uuids:
            alert_path = os.path.join(self.path, alert_uuid)
            if not os.path.exists(alert_path):
                for ace_server in config['ace']['extra_ace_servers']:
                    try:
                        ace_api.set_default_remote_host(ace_server)
                        ace_api.download(alert_uuid, alert_path)

                        # Reset the API server to the default one.
                        ace_api.set_default_remote_host(config['ace']['ace_api_server'])
                        break
                    except Exception as e:
                        if 'BAD REQUEST for url' in str(e):
                            try:
                                shutil.rmtree(alert_path)
                            except:
                                self.logger.exception('There was an error trying to delete alert from event: {}'.format(alert_path))
                        else:
                            self.logger.exception('Unable to download ACE alert!')

                self.changed = True

        # Figure out if any alerts were removed from the event.
        for a in existing_alert_paths:
            uuid = a.split('/')[-2]
            if not any(uuid in alert_uuid for alert_uuid in alert_uuids):
                self.logger.warning('Alert has been removed from the ACE event: {}'.format(uuid))
                try:
                    shutil.rmtree(os.path.dirname(a))
                except:
                    self.logger.exception('There was an error trying to delete alert from event: {}'.format(os.path.dirname(a)))

    # Identify the files in the event directory.
    self.logger.debug('Starting to walk event directory.')
    files = self.walk_event_directory()
    self.logger.debug('Finished walking event directory.')

    # Cross reference the critical files with what is stored in the JSON
    # to see if there have been any changes. Changes to critical files are
    # what require the event to be reprocessed.
    if force or self.has_critical_file_changed(files):
        self.logger.info('A critical file has changed or we are forcing an update. Reprocessing the event.')

        # Get rid of the old event campaign and tags.
        self.json['campaign'] = {}
        self.json['tags'] = []

        # Flag the event as changed.
        self.changed = True

        # Save the current version of the files to the JSON.
        self.json['files'] = files

        # Fill in the missing non-critical file MD5 hashes. We don't hash every
        # file as we originally walk through the event directory to save time.
        # We only need to hash the critical files when walking the event directory
        # so that we can make sure that we do not have any duplicates and so we
        # can also tell if any of them have changed. This makes things much faster
        # if nothing has changed with the event.
        for f in self.json['files']:
            if not f['critical']:
                f['md5'] = self.calculate_md5(f['path'])

        # Get the latest whitelist from SIP.
        whitelist = EventWhitelist(sip=self.sip)

        # Parse the ACE alerts.
        ace_alerts = self.parse_ace_alerts()
        self.json['ace_alerts'] = [ace_alert.json for ace_alert in ace_alerts]

        # Gather up the unique screenshots found in the ACE alerts. Right now these are just from Cloudphish.
        screenshot_dict = {}
        for ace_alert in ace_alerts:
            for screenshot in ace_alert.screenshots:
                # Skip this screenshot if it is an HTML e-mail body. The HTML e-mail screenshots
                # are handled within the EmailParser class.
                if 'text_html' in screenshot:
                    continue

                # Find the MD5 hash of this screenshot.
                for f in self.json['files']:
                    if f['path'] == screenshot:
                        screenshot_dict[f['md5']] = f['path']

        # Symlink to the screenshots so we can ensure they have unique file names.
        unique_screenshots = []
        for md5 in screenshot_dict:
            new_name = 'ace_screenshot_{}.png'.format(md5)
            new_path = os.path.join(self.path, new_name)
            unique_screenshots.append(new_path)
            try:
                os.symlink(screenshot_dict[md5], new_path)
                self.logger.debug('Symlinked to ACE screenshot: {}'.format(screenshot_dict[md5]))
            except FileExistsError:
                pass

        # Save the screenshots to the event JSON.
        self.json['ace_screenshots'] = sorted(unique_screenshots)

        # Parse the emails and make their indicators.
        emails = self.parse_emails(whitelist)
        self.json['emails'] = [email.json for email in emails if email.received_time and email.message_id]

        # Symlink to the screenshots in the emails so we can ensure they have unique file names.
        for email in self.json['emails']:
            # Locate the MD5 of this email.
            for f in self.json['files']:
                if f['path'] == email['path']:
                    # Symlink (rename) each screenshot in this email.
                    new_screenshot_paths = []
                    for screenshot in email['screenshots']:
                        new_name = 'email_screenshot_{}_{}'.format(f['md5'], os.path.basename(screenshot))
                        new_path = os.path.join(os.path.dirname(email['path']), new_name)
                        new_screenshot_paths.append(new_path)
                        try:
                            os.symlink(screenshot, new_path)
                            self.logger.debug('Symlinked to email screenshot: {}'.format(screenshot))
                        except FileExistsError:
                            pass

                    # Replace the screenshot paths in the JSON with the new paths.
                    email['screenshots'] = sorted(new_screenshot_paths)

        # Parse the sandbox reports and make their indicators.
        sandbox_reports = self.parse_sandbox_reports(whitelist)
        self.json['sandbox'] = [sandbox_report.json for sandbox_report in sandbox_reports]

        # The EmailParser objects do not know where the attachments are located within the event directory.
        # This is a handy piece of information for various processes later.
        for email in self.json['emails']:
            for attachment in email['attachments']:
                attachment['event_path'] = ''

                # Try to locate the file with the same MD5.
                for f in self.json['files']:
                    if f['md5'] == attachment['md5']:
                        # Inject the event directory path into the e-mail JSON.
                        attachment['event_path'] = f['path']

        # The sandbox reports do not know where the sample is located within the event directory. That
        # is a handy piece of information for various processes later.
        # Also fix the filename in the sandbox reports. VxStream likes to name it after the SHA256 hash
        # and does not appear to include the actual filename anywhere in its JSON report.
        for report in self.json['sandbox']:
            report['event_path'] = ''

            # Try to locate the file with the same MD5.
            for f in self.json['files']:
                if f['md5'] == report['md5']:
                    # Overwrite the filename in the sandbox report.
                    report['filename'] = os.path.basename(f['path'])

                    # Inject the event directory path into the sandbox report.
                    report['event_path'] = f['path']

                # Loop over any dropped files in this sandbox report to inject the event path.
                for dropped_file in report['dropped_files']:
                    if f['md5'] == dropped_file['md5']:
                        # Inject the event directory path into the dropped file JSON.
                        dropped_file['event_path'] = f['path']

        # Gather up the indicators.
        all_indicators = []
        self.json['indicators'] = []

        # Loop over all of the HTML files in the event and pull out the URLs.
        self.logger.debug('Gathering URLs from HTML files in the event.')
        for html_file in [f['path'] for f in self.json['files'] if f['category'] == 'html']:
            # Store the unique URLs we find.
            unique_urls = set()

            # Open and read the contents of the file.
            with open(html_file, 'rb') as f:
                urls = find_urls(f.read())

                # Add the unique URLs to the list.
                for url in urls:
                    unique_urls.add(url)

            # Create indicators for the URLs.
            indicators = make_url_indicators(unique_urls)
            for indicator in indicators:
                indicator.path = html_file
                indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                all_indicators.append(indicator)

        # Gather up the indicators from the ACE alerts.
        self.logger.debug('Checking ACE alert indicators against whitelist.')
        for ace_alert in ace_alerts:
            for indicator in ace_alert.indicators:
                indicator.path = ace_alert.path
                indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                all_indicators.append(indicator)

        # Gather up the indicators from the emails.
        self.logger.debug('Checking email indicators against whitelist.')
        for email in emails:
            for indicator in email.indicators:
                indicator.path = email.path
                indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                all_indicators.append(indicator)

        # Gather up the indicators from the sandbox reports.
        self.logger.debug('Checking sandbox indicators against whitelist.')
        for sandbox_report in sandbox_reports:
            # Try to find the path to the actual sandboxed sample instead of the JSON report.
            matching_samples = [f for f in self.json['files'] if f['md5'] == sandbox_report.md5]
            if matching_samples:
                for indicator in sandbox_report.indicators:
                    indicator.path = matching_samples[0]['path']
                    indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                    all_indicators.append(indicator)
            else:
                self.logger.warning('Could not find matching sample for indicators: "{}" "{}"'.format(sandbox_report.filename, sandbox_report.md5))

        # Merge all of the indicators.
        merged_indicators = merge_indicators(all_indicators)

        # Add the merged indicators to the event JSON.
        for merged_ind in merged_indicators:
            self.json['indicators'].append(merged_ind.json)

        # Gather up any manual indicators we were given (from the refresh wiki function).
        # These are not Indicator objects, so we do not add the .json form to the list.
        for indicator in manual_indicators:
            self.logger.debug('Adding manual indicator to JSON: {} - {}'.format(indicator['type'], indicator['value']))
            indicator['path'] = ''

            # We want to allow the Manual Indicators section to bypass the whitelist.
            indicator['whitelisted'] = False
            self.json['indicators'].append(indicator)

        """
        # Loop over any CSS URLs we found to try and find even more URLs.
        for css_url in [i['value'] for i in self.json['indicators'] if i['type'] == 'URI - URL' and '.css' in i['value']]:
            # Download the CSS content and find any URLs inside it.
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'}
                css = requests.get(css_url, headers=headers).content
                css_urls = find_urls(css, base_url=css_url)
            except:
                self.logger.exception('Unable to download the CSS: {}'.format(css_url))
        """

        # Add some items to the event tags.
        if emails:
            self.json['tags'].append('phish')

        self.json['tags'] = sorted(list(set(self.json['tags'])))

        # Create the event package.
        if config['package']['enabled']:
            self.package_event()
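# --- Hedged aside, not part of the original source ---------------------------
# setup() above fills in non-critical file hashes with self.calculate_md5(),
# which is not shown in this excerpt. A typical chunked implementation might
# look like the sketch below; the standalone name, signature, and chunk size
# are assumptions for illustration only.
import hashlib


def calculate_md5(path, chunk_size=65536):
    """Hash a file in chunks so large event files are not read into memory at once."""
    md5_hasher = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5_hasher.update(chunk)
    return md5_hasher.hexdigest()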
def dedup_reports(report_list, whitelist):
    """ Merge a list of BaseSandboxParser subclass objects to make a single generic report. """

    logger = logging.getLogger()
    logger.debug('Deduping sandbox report list')

    # Create the new generic report.
    dedup_report = BaseSandboxParser()

    for report in report_list:
        dedup_report.sandbox_urls += report.sandbox_urls

        if report.filename and not report.filename == 'sample':
            dedup_report.filename = report.filename

        if report.original_filename:
            dedup_report.original_filename = report.original_filename
            dedup_report.indicators.append(Indicator('Windows - FileName', dedup_report.original_filename, tags=['sandboxed_sample']))

        if report.md5:
            dedup_report.md5 = report.md5
            dedup_report.indicators.append(Indicator('Hash - MD5', dedup_report.md5, tags=['sandboxed_sample']))

        if report.sha1:
            dedup_report.sha1 = report.sha1
            dedup_report.indicators.append(Indicator('Hash - SHA1', dedup_report.sha1, tags=['sandboxed_sample']))

        if report.sha256:
            dedup_report.sha256 = report.sha256
            dedup_report.indicators.append(Indicator('Hash - SHA256', dedup_report.sha256, tags=['sandboxed_sample']))

        if report.sha512:
            dedup_report.sha512 = report.sha512

        if report.ssdeep:
            dedup_report.ssdeep = report.ssdeep
            dedup_report.indicators.append(Indicator('Hash - SSDEEP', dedup_report.ssdeep, tags=['sandboxed_sample']))

        dedup_report.malware_family += report.malware_family

        # Dedup the contacted hosts.
        for host in report.contacted_hosts:
            if host not in dedup_report.contacted_hosts:
                dedup_report.contacted_hosts.append(host)
                tags = ['contacted_host']
                if host['protocol'] and host['port']:
                    tags.append('{} {}'.format(host['protocol'], host['port']))
                elif host['protocol']:
                    tags.append(host['protocol'])

                # For now we consider ALL contacted hosts to be benign, so no need to check the whitelist. <- XXX UPDATE WHY ?!
                #dedup_report.indicators.append(Indicator('Address - ipv4-addr', host['ipv4'], status='Informational', tags=tags))
                dedup_report.indicators.append(Indicator('Address - ipv4-addr', host['ipv4'], tags=tags))

        # Suricata
        for suricata_alert in report.suricata_alerts:
            if suricata_alert not in dedup_report.suricata_alerts:
                dedup_report.suricata_alerts.append(suricata_alert)

        # Dedup modified files
        for file in report.modified_files:
            if file not in dedup_report.modified_files:
                dedup_report.modified_files.append(file)

        # Dedup the dropped files.
        for file in report.dropped_files:
            # Dropped files are harder than the other items to properly whitelist, so we will
            # initially restrict them to certain file names or file types that we care about.
            if any(name in file['filename'].lower() for name in dedup_report.good_dropped_file_names) or any(t in file['type'] for t in dedup_report.good_dropped_file_types):
                if file not in dedup_report.dropped_files:
                    dedup_report.dropped_files.append(file)

                    # If any part of the dropped file is whitelisted, make sure we mark all parts as whitelisted.
                    if whitelist.is_dropped_file_whitelisted(file):
                        status = 'Whitelisted'
                        file['status'] = 'Whitelisted'
                    else:
                        status = 'New'

                    dedup_report.indicators.append(Indicator('Windows - FileName', file['filename'], status=status, tags=['dropped_file']))
                    dedup_report.indicators.append(Indicator('Hash - MD5', file['md5'], status=status, tags=['dropped_file'], relationships=[file['sha1'], file['sha256']]))
                    dedup_report.indicators.append(Indicator('Hash - SHA1', file['sha1'], status=status, tags=['dropped_file'], relationships=[file['md5'], file['sha256']]))
                    dedup_report.indicators.append(Indicator('Hash - SHA256', file['sha256'], status=status, tags=['dropped_file'], relationships=[file['md5'], file['sha1']]))

        # Dedup the HTTP requests.
        for request in report.http_requests:
            if request not in dedup_report.http_requests:
                dedup_report.http_requests.append(request)
                dedup_report.indicators += make_url_indicators([request['url']], tags=['http_request', request['method']])

        # Dedup the DNS requests.
        for request in report.dns_requests:
            if request not in dedup_report.dns_requests:
                dedup_report.dns_requests.append(request)

                # If any part of the DNS request is whitelisted, make sure we mark all parts as whitelisted.
                if whitelist.is_dns_request_whitelisted(request):
                    status = 'Whitelisted'
                else:
                    status = 'New'

                # For now we consider ALL request IP addresses to be benign, so no need to check the whitelist.
                dedup_report.indicators.append(Indicator('URI - Domain Name', request['request'], tags=['dns_request']))
                try:
                    ipaddress.ip_address(request['answer'])
                    dedup_report.indicators.append(Indicator('Address - ipv4-addr', request['answer'], tags=['dns_response'], status='Informational', relationships=[request['request']]))
                except:
                    pass

        # Dedup the memory strings.
        dedup_report.memory_strings += report.memory_strings
        dedup_report.memory_strings = sorted(list(set(dedup_report.memory_strings)))

        # Dedup the memory URLs.
        dedup_report.memory_urls += report.memory_urls
        dedup_report.memory_urls = list(set(dedup_report.memory_urls))
        dedup_report.memory_urls = [u for u in dedup_report.memory_urls if RegexHelpers.is_url(u)]
        dedup_report.indicators += make_url_indicators(dedup_report.memory_urls, tags=['url_in_memory'])

        # Dedup the strings URLs.
        dedup_report.strings_urls += report.strings_urls
        dedup_report.strings_urls = list(set(dedup_report.strings_urls))
        dedup_report.strings_urls = [u for u in dedup_report.strings_urls if RegexHelpers.is_url(u)]
        dedup_report.indicators += make_url_indicators(dedup_report.strings_urls, tags=['url_in_strings'])

        # Dedup the mutexes.
        dedup_report.mutexes += report.mutexes
        dedup_report.mutexes = list(set(dedup_report.mutexes))

        # Dedup the resolved APIs.
        dedup_report.resolved_apis += report.resolved_apis
        dedup_report.resolved_apis = list(set(dedup_report.resolved_apis))

        # Dedup the created services.
        dedup_report.created_services += report.created_services
        dedup_report.created_services = list(set(dedup_report.created_services))

        # Dedup the started services.
        dedup_report.started_services += report.started_services
        dedup_report.started_services = list(set(dedup_report.started_services))

        # Add the process tree as-is.
        dedup_report.process_trees.append(report.process_tree)

        # Try to decode base64 chunks in the process tree.
        process_tree_decoded = report.process_tree
        for chunk in report.process_tree.split():
            try:
                decoded_chunk = base64.b64decode(chunk).decode('utf-8')
                if '\x00' in decoded_chunk:
                    decoded_chunk = base64.b64decode(chunk).decode('utf-16')
                process_tree_decoded = process_tree_decoded.replace(chunk, decoded_chunk)
            except:
                pass
        dedup_report.process_trees_decoded.append(process_tree_decoded)

        # Remove ` backtick and other basic Powershell obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'powershell' in decoded_process_tree.lower():
                new_trees.append(decoded_process_tree.replace('`', ''))
        dedup_report.process_trees_decoded += new_trees

        # Remove Powershell string formatter obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            formatter_pattern = re.compile(r'(\([\'\"](({(\d+)})+)[\'\"]\s*\-f\s*(([\'\"][^\'\"]+[\'\"],*)+)\))', re.IGNORECASE)
            results = formatter_pattern.findall(decoded_process_tree)
            if results:
                for result in results:
                    """
                    ('("{0}{1}"-f\'JDxA\',\'QDc\')', '{0}{1}', '{1}', '1', "'JDxA','QDc'", "'QDc'")
                    """
                    full_match = result[0]
                    order = result[1][1:-1]  # 0}{1
                    items = result[4]  # "'JDxA','QDc'"

                    order_list = order.split('}{')
                    order_ints = [int(x) for x in order_list]

                    items_list = [i.replace('\'', '').replace('"', '') for i in items.split(',')]

                    if len(order_ints) == len(items_list):
                        deobfuscated_string = ''
                        for i in order_ints:
                            deobfuscated_string += items_list[i]
                        decoded_process_tree = decoded_process_tree.replace(full_match, deobfuscated_string)

                new_trees.append(decoded_process_tree)
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .split() obfuscation (used by Emotet and others)
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'split' in decoded_process_tree.lower():
                try:
                    split_char_pattern = re.compile(r'\.[\'\"]*split[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)', re.IGNORECASE)
                    try:
                        split_char = str(split_char_pattern.search(decoded_process_tree).group(1))
                    except AttributeError:
                        split_char = None

                    if split_char:
                        new_process_tree_decoded = ' '.join(decoded_process_tree.split(split_char))
                        new_process_tree_decoded = new_process_tree_decoded.replace("'+'", '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('"+"', '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\'', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\"', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('. ', ' ')
                        new_trees.append(new_process_tree_decoded)
                except:
                    logger.exception('Could not find process tree split() character.')
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .invoke() obfuscation (used by Emotet and others)
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'invoke' in decoded_process_tree.lower():
                try:
                    split_char_pattern = re.compile(r'\.[\'\"]*invoke[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)', re.IGNORECASE)
                    try:
                        split_char = str(split_char_pattern.search(decoded_process_tree).group(1))
                    except AttributeError:
                        split_char = None

                    if split_char:
                        new_process_tree_decoded = ' '.join(decoded_process_tree.split(split_char))
                        new_process_tree_decoded = new_process_tree_decoded.replace("'+'", '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('"+"', '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\'', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\"', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('. ', ' ')
                        new_trees.append(new_process_tree_decoded)
                except:
                    logger.exception('Could not find process tree invoke() character.')
        dedup_report.process_trees_decoded += new_trees

        # Dedup the process tree URLs. Start by just adding the URLs from each report.
        dedup_report.process_tree_urls += report.process_tree_urls

        # Find the URLs in each decoded process tree.
        for decoded_tree in dedup_report.process_trees_decoded:
            urls = find_urls(decoded_tree)

            # Remove any URL that has these URLs as substrings, since it's probably a bogus
            # URL from the original, non-decoded process tree.
            for u in report.process_tree_urls:
                if any(decoded_url in u for decoded_url in urls):
                    try:
                        dedup_report.process_tree_urls.remove(u)
                        logger.debug('Removing bad process tree URL: {}'.format(u))
                    except:
                        pass

            dedup_report.process_tree_urls += urls

        dedup_report.process_tree_urls = list(set(dedup_report.process_tree_urls))
        dedup_report.process_tree_urls = [u for u in dedup_report.process_tree_urls if RegexHelpers.is_url(u)]
        dedup_report.indicators += make_url_indicators(dedup_report.process_tree_urls, tags=['url_in_process_tree'])

        # Add the screenshot URLs as-is.
        if report.screenshot_path:
            dedup_report.screenshot_paths.append(report.screenshot_path)

    return dedup_report
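# --- Hedged aside, not part of the original source ---------------------------
# Standalone illustration of the PowerShell format-operator deobfuscation step
# inside dedup_reports() above. It reuses the same regex and reordering logic;
# the function name is made up, and the sample command matches the example
# shown in the inline docstring.
import re


def deobfuscate_format_operator(command):
    formatter_pattern = re.compile(r'(\([\'\"](({(\d+)})+)[\'\"]\s*\-f\s*(([\'\"][^\'\"]+[\'\"],*)+)\))', re.IGNORECASE)
    for result in formatter_pattern.findall(command):
        full_match = result[0]                                      # ("{0}{1}"-f'JDxA','QDc')
        order_ints = [int(x) for x in result[1][1:-1].split('}{')]  # [0, 1]
        items_list = [i.replace('\'', '').replace('"', '') for i in result[4].split(',')]
        if len(order_ints) == len(items_list):
            command = command.replace(full_match, ''.join(items_list[i] for i in order_ints))
    return command


assert deobfuscate_format_operator('powershell ("{0}{1}"-f\'JDxA\',\'QDc\')') == 'powershell JDxAQDc'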
def _parse_attachment(self, message_part, charset):
    part_items = message_part.items()
    for tup in part_items:
        for value in tup:
            if 'attachment' in value:
                file_data = message_part.get_payload()

                attachment_dict = {}
                if message_part.get('Content-Transfer-Encoding', None) == 'base64':
                    file_data_b64 = file_data.replace('\n', '')
                    # For some reason, sometimes the attachments don't have the proper
                    # padding. Add a couple "==" on the end for good measure. This doesn't
                    # seem to harm correctly encoded attachments.
                    file_data_decoded = base64.b64decode(file_data_b64 + '==')

                    # Try and get strings out of the attachment.
                    strings_list = RegexHelpers.find_strings(file_data_decoded)
                    strings = ' '.join(strings_list)

                    # Look for any URLs that were in the strings.
                    strings_urls = find_urls(strings)
                    attachment_dict['strings_urls'] = strings_urls
                elif message_part.get_content_type() == 'text/html':
                    file_data_decoded = message_part.get_payload(decode=True).decode(charset).encode('utf-8')
                else:
                    file_data_decoded = file_data

                try:
                    md5_hasher = hashlib.md5()
                    md5_hasher.update(file_data_decoded)
                    md5_hash = md5_hasher.hexdigest()
                except TypeError:
                    md5_hash = ''

                try:
                    sha256_hasher = hashlib.sha256()
                    sha256_hasher.update(file_data_decoded)
                    sha256_hash = sha256_hasher.hexdigest()
                except TypeError:
                    sha256_hash = ''

                attachment_dict['content_type'] = message_part.get_content_type()
                attachment_dict['size'] = len(file_data_decoded)
                attachment_dict['md5'] = md5_hash
                attachment_dict['sha256'] = sha256_hash
                attachment_dict['name'] = ''
                attachment_dict['create_date'] = ''
                attachment_dict['mod_date'] = ''
                attachment_dict['read_date'] = ''

                # Find the attachment name. Normally this follows a specific format
                # and is called 'filename=', but recently some have shown up in
                # different locations and are just called 'name='... Hence the old
                # code was replaced with a regex that accounts for either name in
                # any location in the message part.
                attachment_name_pattern = re.compile(r'(file)?name="?([^"]+)"?')
                for tup in part_items:
                    for item in tup:
                        item_lines = item.splitlines()
                        for item_line in item_lines:
                            attachment_name = attachment_name_pattern.search(item_line)
                            if attachment_name:
                                attachment_dict['name'] = RegexHelpers.decode_utf_b64_string(attachment_name.groups()[1])
                                if attachment_dict['name'].endswith(';'):
                                    attachment_dict['name'] = attachment_dict['name'][:-1]

                # Make the attachment indicators.
                self.indicators.append(Indicator('Windows - FileName', attachment_dict['name'], tags=['attachment']))
                self.indicators.append(Indicator('Hash - MD5', attachment_dict['md5'], tags=['attachment']))
                self.indicators.append(Indicator('Hash - SHA256', attachment_dict['sha256'], tags=['attachment']))

                return attachment_dict

    return None
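# --- Hedged aside, not part of the original source ---------------------------
# _parse_attachment() above appends "==" before base64-decoding because some
# attachments arrive with their padding stripped. In its default non-strict
# mode, base64.b64decode() ignores surplus padding but raises on missing
# padding, which is why the blanket "==" repairs broken data without harming
# correctly encoded data. The sample payload below is made up.
import base64

encoded = base64.b64encode(b'hello').decode()   # 'aGVsbG8=' (correctly padded)
stripped = encoded.rstrip('=')                  # 'aGVsbG8'  (padding lost in transit)

assert base64.b64decode(stripped + '==') == b'hello'   # repaired
assert base64.b64decode(encoded + '==') == b'hello'    # surplus padding ignored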
def parse_strings_urls(self):
    self.logger.debug('Looking for URLs in strings')
    return find_urls(self.parse_strings())
def test_find_urls_in_text_like_html():
    blob = b"""<meta http-equiv="refresh" content="0; URL=https://blah.com/one/two">"""

    assert urlfinderlib.find_urls(blob) == {"https://blah.com/one/two"}


def test_find_urls_ooxml():
    with open(f"{files_dir}/test.ooxml", "rb") as f:
        blob = f.read()

    assert urlfinderlib.find_urls(blob) == set()


def test_find_urls_rfc822():
    with open(f"{files_dir}/email.rfc822", "rb") as f:
        blob = f.read()

    assert urlfinderlib.find_urls(blob) == set()


def test_find_urls_text():
    assert urlfinderlib.find_urls("test") == set()
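# --- Hedged aside, not part of the original source ---------------------------
# The tests above reference `files_dir` and `urlfinderlib` without showing
# their setup. A typical arrangement would resemble the sketch below; the
# "files" directory name is an assumption for illustration only.
import os

import urlfinderlib

files_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'files')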
def __init__(self, smtp_path, whitelist):
    # Initiate logging.
    self.logger = logging.getLogger()

    # Save the whitelist.
    self.whitelist = whitelist

    # Items we parse out of the email.
    self.ace_url = ''
    self.attachments = []
    self.body = ''
    self.cc_addresses = []
    self.envelope_from = ''
    self.envelope_to = ''
    self.from_address = ''
    self.headers = ''
    self.html = ''
    self.indicators = []
    self.message_id = ''
    self.original_recipient = ''
    self.path = smtp_path
    self.received = ''
    self.received_time = ''
    self.remediated = False
    self.reply_to = ''
    self.return_path = ''
    self.screenshots = []
    self.subject = ''
    self.subject_decoded = ''
    self.to_addresses = []
    self.urls = []
    self.x_auth_id = ''
    self.x_mailer = ''
    self.x_original_sender = ''
    self.x_originating_ip = ''
    self.x_sender = ''
    self.x_sender_id = ''
    self.x_sender_ip = ''

    # Build the URL to the ACE alert.
    ace_uuid_pattern = re.compile(r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})')
    match = ace_uuid_pattern.search(self.path)
    if match:
        self.ace_url = '{}{}'.format(config['ace']['ace_alert_url'], match.group(1))

    with open(self.path, encoding='utf-8', errors='ignore') as s:
        smtp_stream = s.read().splitlines()

    # Locate any screenshots for this email.
    email_dir = os.path.dirname(self.path)
    files = os.listdir(email_dir)
    for f in files:
        if 'text_html' in f and f.endswith('.png') and not f.startswith('email_screenshot'):
            self.logger.debug('Found email screenshot: {}'.format(os.path.join(email_dir, f)))
            self.screenshots.append(os.path.join(email_dir, f))

    # Find the envelope from/to addresses. This will only work if given an
    # "smtp.stream" file, since otherwise the SMTP commands will not exist.
    envelope_address_pattern = re.compile(r'.*<(.*)>.*')
    for line in smtp_stream:
        if line.startswith('MAIL FROM:'):
            try:
                self.envelope_from = envelope_address_pattern.match(line).group(1)
            except:
                self.logger.exception('Unable to parse envelope from.')
        if line.startswith('RCPT TO:'):
            try:
                self.envelope_to = envelope_address_pattern.match(line).group(1)
            except:
                self.logger.exception('Unable to parse envelope to.')

    # Just in case we are dealing with an "smtp.stream" file that still has
    # the SMTP commands above the actual e-mail, we need to strip those out.
    # This will remove all lines prior to the Received: headers so that the
    # email.parser can properly parse out the e-mail. If we were given an
    # "smtp.email" type of file with the SMTP commands already removed, this
    # should not affect anything. This is legacy code at this point.
    try:
        while not smtp_stream[0].startswith('Received:'):
            smtp_stream.pop(0)
    except IndexError:
        smtp_stream = []

    # Join the header lines into a single string.
    self.email_text = '\n'.join(smtp_stream)

    # Create the e-mail object.
    email_obj = email.message_from_string(self.email_text)

    # We want to try and parse an embedded/attached e-mail if there is one.
    # Walk the full e-mail's parts.
    for part in email_obj.walk():
        # Check whether the part looks like a valid e-mail.
        if part.get_content_type() == 'message/rfc822':
            # Split the part lines into a list.
            part_text = str(part).splitlines()
            if any('Received:' in line for line in part_text):
                # Make sure our part starts with the Received: headers.
                while not part_text[0].startswith('Received:'):
                    part_text.pop(0)
                part_text = '\n'.join(part_text)

                # Make the new e-mail object.
                email_obj = email.message_from_string(part_text)

    # Parse the e-mail object for its content.
    parsed_email = self._parse_content(email_obj)

    # Now that we have the e-mail object, parse out some of the interesting parts.
    self.headers = self._get_all_headers_string(email_obj)
    self.received = self.get_header(email_obj, 'received')

    # Get the e-mail's plaintext body, HTML body, and the visible text from the HTML.
    self.body = parsed_email['body']
    self.html = parsed_email['html']

    # Get any e-mail attachments.
    self.attachments = parsed_email['attachments']

    # From address
    try:
        self.from_address = self._get_address_list(email_obj, 'from')[0][1]
        self.indicators.append(Indicator('Email - Address', self.from_address, tags=['from_address']))
    except:
        pass

    # From domain
    try:
        self.indicators.append(Indicator('URI - Domain Name', self.from_address.split('@')[1], tags=['from_domain']))
    except:
        pass

    # Reply-To address
    try:
        self.reply_to = self._get_address_list(email_obj, 'reply-to')[0][1]
        self.indicators.append(Indicator('Email - Address', self.reply_to, tags=['reply_to']))
    except:
        pass

    # X-Sender address
    try:
        self.x_sender = self._get_address_list(email_obj, 'X-Sender')[0][1]
        self.indicators.append(Indicator('Email - Address', self.x_sender, tags=['x_sender']))
    except:
        pass

    # X-Sender-Id address
    try:
        self.x_sender_id = self._get_address_list(email_obj, 'X-Sender-Id')[0][1]
        self.indicators.append(Indicator('Email - Address', self.x_sender_id, tags=['x_sender_id']))
    except:
        pass

    # X-Auth-Id address
    try:
        self.x_auth_id = self._get_address_list(email_obj, 'X-Auth-ID')[0][1]
        self.indicators.append(Indicator('Email - Address', self.x_auth_id, tags=['x_auth_id']))
    except:
        pass

    # Return-Path address (note: the header name is hyphenated)
    try:
        self.return_path = self._get_address_list(email_obj, 'return-path')[0][1]
        self.indicators.append(Indicator('Email - Address', self.return_path, tags=['return_path']))
    except:
        pass

    # X-MS-Exchange-Organization-OriginalEnvelopeRecipients address
    try:
        self.original_recipient = self._get_address_list(email_obj, 'X-MS-Exchange-Organization-OriginalEnvelopeRecipients')[0][1].lower()
        self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
    except:
        pass

    # If the original_recipient was not found, check if this is a POTENTIAL PHISH e-mail and use the from address.
    if not self.original_recipient and 'Subject: [POTENTIAL PHISH]' in self.email_text:
        try:
            temp_email_obj = email.message_from_string(self.email_text)
            self.original_recipient = self._get_address_list(temp_email_obj, 'from')[0][1]
            self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
        except:
            self.logger.exception('Error parsing original recipient from POTENTIAL PHISH e-mail.')

    # Subject
    try:
        self.subject = ''.join(self.get_header(email_obj, 'subject')[0].splitlines())
        self.indicators.append(Indicator('Email - Subject', self.subject))
    except:
        pass

    # Decoded subject
    try:
        self.subject_decoded = ''.join(str(make_header(decode_header(self.get_header(email_obj, 'subject')[0]))).splitlines())
        self.indicators.append(Indicator('Email - Subject', self.subject_decoded))
    except:
        pass

    # To addresses
    self.to_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'to')]

    # CC addresses
    self.cc_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'cc')]

    # Message-Id
    try:
        self.message_id = self.get_header(email_obj, 'message-id')[0]
        self.indicators.append(Indicator('Email Message ID', self.message_id, status='Informational'))
    except:
        pass

    # X-Mailer
    try:
        self.x_mailer = self.get_header(email_obj, 'x-mailer')[0]
        self.indicators.append(Indicator('Email - Xmailer', self.x_mailer, status='Informational'))
    except:
        pass

    # X-Original-Sender address
    try:
        self.x_original_sender = self.get_header(email_obj, 'x-original-sender')[0]
        self.indicators.append(Indicator('Email - Address', self.x_original_sender, tags=['x_original_sender']))
    except:
        pass

    # X-Originating-Ip
    try:
        x_originating_ip = self.get_header(email_obj, 'x-originating-ip')[0]
        # Sometimes this field is in the form: [1.1.1.1]
        # Make sure we remove any non-IP characters.
        ip = RegexHelpers.find_ip_addresses(x_originating_ip)
        if ip:
            self.x_originating_ip = ip[0]
            self.indicators.append(Indicator('Address - ipv4-addr', self.x_originating_ip, tags=['x_originating_ip']))
    except:
        pass

    # X-Sender-Ip
    try:
        x_sender_ip = self.get_header(email_obj, 'x-sender-ip')[0]
        # Make sure like the X-Originating-IP that we only
        # get the IP address and no other characters.
        ip = RegexHelpers.find_ip_addresses(x_sender_ip)
        if ip:
            self.x_sender_ip = ip[0]
            self.indicators.append(Indicator('Address - ipv4-addr', self.x_sender_ip, tags=['x_sender_ip']))
    except:
        pass

    self.received_time = self._get_received_time(email_obj)
    if not self.received_time:
        self.received_time = self._get_date_time()

    # Find any URLs in the plaintext body.
    text_urls = find_urls(self.body)

    # Find any URLs in the HTML body.
    html_urls = find_urls(self.html)

    # Get any strings URLs.
    strings_urls = []
    """
    for file in self.attachments:
        try:
            strings_urls += file['strings_urls']
        except:
            pass
    """

    # Try and remove any URLs that look like partial versions of other URLs.
    all_urls = text_urls + html_urls + strings_urls
    unique_urls = set()
    for u in all_urls:
        if not any(other_url.startswith(u) and other_url != u for other_url in all_urls):
            unique_urls.add(u)

    # Get rid of any invalid URLs.
    self.urls = [u for u in unique_urls if is_valid(u)]

    # Make indicators for the URLs.
    self.indicators += make_url_indicators(self.urls)

    # Get rid of any invalid indicators.
    self.indicators = [i for i in self.indicators if i.value]

    # Add any extra tags to each indicator.
    for i in self.indicators:
        i.tags.append('phish')
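# --- Hedged aside, not part of the original source ---------------------------
# Standalone illustration of the "partial URL" filtering done near the end of
# __init__ above: any URL that is a strict prefix of another collected URL is
# treated as a truncated duplicate and dropped. The sample URLs are made up.
all_urls = [
    'http://example.com/path',
    'http://example.com/path/page.html',    # the longer form is kept
    'http://other.example.net/index.php',
]

unique_urls = set()
for u in all_urls:
    if not any(other_url.startswith(u) and other_url != u for other_url in all_urls):
        unique_urls.add(u)

assert unique_urls == {'http://example.com/path/page.html', 'http://other.example.net/index.php'}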