Ejemplo n.º 1
0
    def run(self):
        """Entry point for the analyzer.

        Streams every event that has a ``url`` or ``domain`` field, derives
        the domain from the URL when the ``domain`` field is empty, and then
        annotates each collected event with an emoji, a human readable
        message and a ``domain_count`` attribute.

        Returns:
            String with summary of the analyzer result
        """
        query = (
            '{"query": { "bool": { "should": [ '
            '{ "exists" : { "field" : "url" }}, '
            '{ "exists" : { "field" : "domain" }} ] } } }')

        return_fields = ['domain', 'url']

        events = self.event_stream(
            '', query_dsl=query, return_fields=return_fields)

        # Maps each domain to the list of events it was seen in.
        domains = {}
        domain_counter = collections.Counter()
        tld_counter = collections.Counter()

        for event in events:
            domain = event.source.get('domain')

            if not domain:
                # Fall back to extracting the domain from the URL field.
                url = event.source.get('url')
                if not url:
                    continue
                domain = utils.get_domain_from_url(url)
                event.add_attributes({'domain': domain})

            if not domain:
                continue

            domain_counter[domain] += 1
            domains.setdefault(domain, []).append(event)

            # Despite the name, this keeps the last two dot-separated labels
            # of the domain (e.g. "example.com" for "www.example.com").
            tld = '.'.join(domain.split('.')[-2:])
            tld_counter[tld] += 1

        satellite_emoji = emojis.get_emoji('SATELLITE')
        # Counter.items() works on both Python 2 and Python 3, whereas
        # iteritems() only exists on Python 2.
        for domain, count in domain_counter.items():
            emojis_to_add = [satellite_emoji]
            text = '{0:s} seen {1:d} times'.format(domain, count)

            for event in domains.get(domain, []):
                event.add_emojis(emojis_to_add)
                event.add_human_readable(text, self.NAME, append=False)
                event.add_attributes({'domain_count': count})

        return (
            '{0:d} domains discovered with {1:d} TLDs.').format(
                len(domains), len(tld_counter))
Ejemplo n.º 2
0
    def run(self):
        """Run the domain analyzer over the event stream.

        Events carrying a ``domain`` or ``url`` field are grouped per domain.
        Every grouped event then receives tags, an emoji, a human readable
        message and new attributes before it is committed.

        Returns:
            String with summary of the analyzer result
        """
        dsl = (
            '{"query": { "bool": { "should": [ '
            '{ "exists" : { "field" : "url" }}, '
            '{ "exists" : { "field" : "domain" }} ] } } }'
        )
        wanted_fields = ['domain', 'url']
        stream = self.event_stream(
            '', query_dsl=dsl, return_fields=wanted_fields)

        events_by_domain = {}
        hits_per_domain = collections.Counter()
        hits_per_tld = collections.Counter()
        hits_per_cdn = collections.Counter()

        for event in stream:
            hostname = event.source.get('domain')
            if not hostname:
                # No explicit domain: try to derive one from the URL.
                link = event.source.get('url')
                if link:
                    hostname = utils.get_domain_from_url(link)
            if not hostname:
                continue

            hits_per_domain[hostname] += 1
            events_by_domain.setdefault(hostname, []).append(event)

            # Key on the last two dot-separated labels of the domain.
            hits_per_tld['.'.join(hostname.split('.')[-2:])] += 1

        satellite = emojis.get_emoji('SATELLITE')
        for hostname, hits in hits_per_domain.items():
            emoji_list = [satellite]
            message = '{0:s} seen {1:d} times'.format(hostname, hits)

            provider = utils.get_cdn_provider(hostname)
            tag_list = []
            if provider:
                tag_list.append('known-cdn')
                hits_per_cdn[provider] += 1

            for event in events_by_domain.get(hostname, []):
                event.add_tags(tag_list)
                event.add_emojis(emoji_list)
                event.add_human_readable(message, self.NAME, append=False)

                # Build a fresh attribute dict for every event.
                attributes = {'domain': hostname, 'domain_count': hits}
                if provider:
                    attributes['cdn_provider'] = provider
                event.add_attributes(attributes)

                # Persist the changes made to this event.
                event.commit()

        summary = ('{0:d} domains discovered ({1:d} TLDs) and {2:d} known '
                   'CDN networks found.')
        return summary.format(
            len(events_by_domain), len(hits_per_tld), len(hits_per_cdn))
Ejemplo n.º 3
0
 def test_get_domain_from_url(self):
     """Check that the host name is extracted from a full URL."""
     expected_domain = 'www.example.com'
     extracted = utils.get_domain_from_url('http://www.example.com/?foo=bar')
     self.assertEqual(extracted, expected_domain)
Ejemplo n.º 4
0
    def run(self):
        """Entry point for the analyzer.

        Groups events by domain (taken from the ``domain`` field, or derived
        from the ``url`` field), then tags each event according to how often
        its domain occurs: counts at or above the 85th percentile are tagged
        ``common_domain`` and counts at or below the 20th percentile are
        tagged ``rare_domain``. A domain can receive both tags, for instance
        when every domain has the same count.

        Returns:
            String with summary of the analyzer result
        """
        query = (
            '{"query": { "bool": { "should": [ '
            '{ "exists" : { "field" : "url" }}, '
            '{ "exists" : { "field" : "domain" }} ] } } }')

        return_fields = ['domain', 'url']

        events = self.event_stream(
            '', query_dsl=query, return_fields=return_fields)

        # Maps each domain to the list of events it was seen in.
        domains = {}
        domain_counter = collections.Counter()
        tld_counter = collections.Counter()
        cdn_counter = collections.Counter()

        for event in events:
            domain = event.source.get('domain')

            if not domain:
                # Fall back to extracting the domain from the URL field.
                url = event.source.get('url')
                if not url:
                    continue
                domain = utils.get_domain_from_url(url)

            if not domain:
                continue

            domain_counter[domain] += 1
            domains.setdefault(domain, []).append(event)

            # Despite the name, this keeps the last two dot-separated labels
            # of the domain (e.g. "example.com" for "www.example.com").
            tld = '.'.join(domain.split('.')[-2:])
            tld_counter[tld] += 1

        # Exit early if there are no domains in the data set to analyze.
        if not domain_counter:
            return 'No domains to analyze.'

        domain_count_array = numpy.array(list(domain_counter.values()))
        domain_20th_percentile = int(numpy.percentile(domain_count_array, 20))
        domain_85th_percentile = int(numpy.percentile(domain_count_array, 85))

        # Sets give constant-time membership checks in the loop below; the
        # ordering produced by most_common() is not needed for that.
        common_domains = {
            name for name, seen in domain_counter.items()
            if seen >= domain_85th_percentile}
        rare_domains = {
            name for name, seen in domain_counter.items()
            if seen <= domain_20th_percentile}

        satellite_emoji = emojis.get_emoji('SATELLITE')
        for domain, count in domain_counter.items():
            emojis_to_add = [satellite_emoji]
            tags_to_add = []

            cdn_provider = utils.get_cdn_provider(domain)
            if cdn_provider:
                tags_to_add.append('known-cdn')
                cdn_counter[cdn_provider] += 1

            if domain in common_domains:
                tags_to_add.append('common_domain')

            if domain in rare_domains:
                tags_to_add.append('rare_domain')

            for event in domains.get(domain, []):
                event.add_tags(tags_to_add)
                event.add_emojis(emojis_to_add)

                new_attributes = {'domain': domain, 'domain_count': count}
                if cdn_provider:
                    new_attributes['cdn_provider'] = cdn_provider
                event.add_attributes(new_attributes)

                # Commit the event to the datastore.
                event.commit()

        return (
            '{0:d} domains discovered ({1:d} TLDs) and {2:d} known '
            'CDN networks found.').format(
                len(domains), len(tld_counter), len(cdn_counter))
Ejemplo n.º 5
0
    def run(self):
        """Entry point for the analyzer.

        Groups events by domain (taken from the ``domain`` field, or derived
        from the ``url`` field), then tags each event according to how often
        its domain occurs: counts at or above the 85th percentile are tagged
        ``common_domain`` and counts at or below the 20th percentile are
        tagged ``rare_domain``. If a percentile cannot be calculated, a
        fallback threshold is used so that no domain matches that category
        by accident.

        Returns:
            String with summary of the analyzer result
        """
        query = ('{"query": { "bool": { "should": [ '
                 '{ "exists" : { "field" : "url" }}, '
                 '{ "exists" : { "field" : "domain" }} ] } } }')

        return_fields = ['domain', 'url']

        events = self.event_stream('',
                                   query_dsl=query,
                                   return_fields=return_fields)

        # Maps each domain to the list of events it was seen in.
        domains = {}
        domain_counter = collections.Counter()
        tld_counter = collections.Counter()
        cdn_counter = collections.Counter()

        for event in events:
            domain = event.source.get('domain')

            if not domain:
                # Fall back to extracting the domain from the URL field.
                url = event.source.get('url')
                if not url:
                    continue
                domain = utils.get_domain_from_url(url)

            if not domain:
                continue

            domain_counter[domain] += 1
            domains.setdefault(domain, []).append(event)

            # Despite the name, this keeps the last two dot-separated labels
            # of the domain (e.g. "example.com" for "www.example.com").
            tld = '.'.join(domain.split('.')[-2:])
            tld_counter[tld] += 1

        # Exit early if there are no domains in the data set to analyze.
        if not domain_counter:
            return 'No domains to analyze.'

        domain_count_array = numpy.array(list(domain_counter.values()))
        try:
            domain_20th_percentile = int(
                numpy.percentile(domain_count_array, 20))
        except IndexError:
            logging.warning('Unable to calculate the 20th percentile.')
            # Counts are always >= 1, so a threshold of 0 marks nothing rare.
            domain_20th_percentile = 0

        try:
            domain_85th_percentile = int(
                numpy.percentile(domain_count_array, 85))
        except IndexError:
            logging.warning('Unable to calculate the 85th percentile.')
            # The counter is known to be non-empty here (see the early exit
            # above), so a threshold above the highest count marks nothing
            # as common.
            domain_85th_percentile = max(domain_counter.values()) + 10

        # Sets give constant-time membership checks in the loop below; the
        # ordering produced by most_common() is not needed for that.
        common_domains = {
            name for name, seen in domain_counter.items()
            if seen >= domain_85th_percentile
        }
        rare_domains = {
            name for name, seen in domain_counter.items()
            if seen <= domain_20th_percentile
        }

        satellite_emoji = emojis.get_emoji('SATELLITE')
        for domain, count in domain_counter.items():
            emojis_to_add = [satellite_emoji]
            tags_to_add = []

            cdn_provider = utils.get_cdn_provider(domain)
            if cdn_provider:
                tags_to_add.append('known-cdn')
                cdn_counter[cdn_provider] += 1

            if domain in common_domains:
                tags_to_add.append('common_domain')

            if domain in rare_domains:
                tags_to_add.append('rare_domain')

            for event in domains.get(domain, []):
                event.add_tags(tags_to_add)
                event.add_emojis(emojis_to_add)

                new_attributes = {'domain': domain, 'domain_count': count}
                if cdn_provider:
                    new_attributes['cdn_provider'] = cdn_provider
                event.add_attributes(new_attributes)

                # Commit the event to the datastore.
                event.commit()

        return ('{0:d} domains discovered ({1:d} TLDs) and {2:d} known '
                'CDN networks found.').format(len(domains), len(tld_counter),
                                              len(cdn_counter))
Ejemplo n.º 6
0
 def test_get_domain_from_url(self):
     """Test that get_domain_from_url returns the host name of a URL."""
     url = 'http://www.example.com/?foo=bar'
     domain = utils.get_domain_from_url(url)
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual(domain, 'www.example.com')
Ejemplo n.º 7
0
    def run(self):
        """Entry point for the analyzer.

        Groups events by domain (taken from the ``domain`` field, or derived
        from the ``url`` field). Events whose domain count is at or below the
        20th percentile are tagged ``rare-domain``; events whose domain count
        is at or above the 85th percentile get the attribute
        ``is_common_domain`` set to True. Domains recognised as belonging to
        a CDN provider are tagged ``known-cdn``.

        Returns:
            String with summary of the analyzer result
        """
        query = (
            '{"query": { "bool": { "should": [ '
            '{ "exists" : { "field" : "url" }}, '
            '{ "exists" : { "field" : "domain" }} ] } } }'
        )

        return_fields = ["domain", "url"]

        events = self.event_stream("", query_dsl=query, return_fields=return_fields)

        # Maps each domain to the list of events it was seen in.
        domains = {}
        domain_counter = collections.Counter()
        tld_counter = collections.Counter()
        cdn_counter = collections.Counter()

        for event in events:
            domain = event.source.get("domain")

            if not domain:
                # Fall back to extracting the domain from the URL field.
                url = event.source.get("url")
                if not url:
                    continue
                domain = utils.get_domain_from_url(url)

            if not domain:
                continue

            domain_counter[domain] += 1
            domains.setdefault(domain, []).append(event)

            # Despite the name, this keeps the last two dot-separated labels
            # of the domain (e.g. "example.com" for "www.example.com").
            tld = ".".join(domain.split(".")[-2:])
            tld_counter[tld] += 1

        # Exit early if there are no domains in the data set to analyze.
        if not domain_counter:
            return "No domains to analyze."

        domain_count_array = numpy.array(list(domain_counter.values()))
        try:
            domain_20th_percentile = int(numpy.percentile(domain_count_array, 20))
        except IndexError:
            logger.warning("Unable to calculate the 20th percentile.")
            # Counts are always >= 1, so a threshold of 0 marks nothing rare.
            domain_20th_percentile = 0

        try:
            domain_85th_percentile = int(numpy.percentile(domain_count_array, 85))
        except IndexError:
            logger.warning("Unable to calculate the 85th percentile.")
            # The counter is known to be non-empty here (see the early exit
            # above), so a threshold above the highest count marks nothing
            # as common.
            domain_85th_percentile = max(domain_counter.values()) + 10

        # Sets give constant-time membership checks in the loop below; the
        # ordering produced by most_common() is not needed for that.
        common_domains = {
            name
            for name, seen in domain_counter.items()
            if seen >= domain_85th_percentile
        }
        rare_domains = {
            name
            for name, seen in domain_counter.items()
            if seen <= domain_20th_percentile
        }

        for domain, count in domain_counter.items():
            tags_to_add = []

            cdn_provider = utils.get_cdn_provider(domain)
            if cdn_provider:
                tags_to_add.append("known-cdn")
                cdn_counter[cdn_provider] += 1

            if domain in rare_domains:
                tags_to_add.append("rare-domain")

            # The answer is the same for every event of this domain, so it is
            # computed once rather than per event.
            is_common = domain in common_domains

            for event in domains.get(domain, []):
                event.add_tags(tags_to_add)

                new_attributes = {"domain": domain, "domain_count": count}
                if is_common:
                    new_attributes["is_common_domain"] = True
                if cdn_provider:
                    new_attributes["cdn_provider"] = cdn_provider
                event.add_attributes(new_attributes)

                # Commit the event to the datastore.
                event.commit()

        return (
            "{0:d} domains discovered ({1:d} TLDs) and {2:d} known "
            "CDN networks found."
        ).format(len(domains), len(tld_counter), len(cdn_counter))
Ejemplo n.º 8
0
    def run(self):
        """Analyze the domains seen in the event stream.

        Collects the events that carry a ``domain`` or ``url`` field, groups
        them per domain, and enriches every grouped event with tags, an
        emoji, a human readable message and attributes before committing it.

        Returns:
            String with summary of the analyzer result
        """
        search = (
            '{"query": { "bool": { "should": [ '
            '{ "exists" : { "field" : "url" }}, '
            '{ "exists" : { "field" : "domain" }} ] } } }')
        fields = ['domain', 'url']
        matching_events = self.event_stream(
            '', query_dsl=search, return_fields=fields)

        grouped = collections.defaultdict(list)
        per_domain = collections.Counter()
        per_tld = collections.Counter()
        per_cdn = collections.Counter()

        for event in matching_events:
            name = event.source.get('domain')

            if not name:
                address = event.source.get('url')
                # Only derive a domain when there is a URL to derive it from.
                name = utils.get_domain_from_url(address) if address else None

            if not name:
                continue

            per_domain[name] += 1
            grouped[name].append(event)

            # Count by the last two dot-separated labels of the domain.
            labels = name.split('.')
            per_tld['.'.join(labels[-2:])] += 1

        emoji = emojis.get_emoji('SATELLITE')
        for name, seen in per_domain.items():
            event_emojis = [emoji]
            event_tags = []
            readable = '{0:s} seen {1:d} times'.format(name, seen)

            cdn = utils.get_cdn_provider(name)
            if cdn:
                event_tags.append('known-cdn')
                per_cdn[cdn] += 1

            for event in grouped.get(name, []):
                event.add_tags(event_tags)
                event.add_emojis(event_emojis)
                event.add_human_readable(readable, self.NAME, append=False)

                # Every event gets its own attribute dict.
                extra = {'domain': name, 'domain_count': seen}
                if cdn:
                    extra['cdn_provider'] = cdn
                event.add_attributes(extra)

                # Write the enriched event back to the datastore.
                event.commit()

        template = (
            '{0:d} domains discovered ({1:d} TLDs) and {2:d} known '
            'CDN networks found.')
        return template.format(len(grouped), len(per_tld), len(per_cdn))