Example #1
0
def test_get_analyst_qas():
    """Reporter.get_analyst_qas() returns the Analyst QAs mapping.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I extract the Analyst QAs data
    received = domain_intel.Reporter(data=graph_record).get_analyst_qas()

    # then I should receive a single dictionary of Analyst QAs values
    # NOTE(review): the 'REQUIRES_LOGIN' value looks masked -- confirm
    # it against the sample record before relying on it.
    expected = {
        'ANALYST_QAS_DATE': 1500422400,
        'DOMAIN_DOWN_OR_PARKED': 'true',
        'HAS_FORUM_OR_COMMENTS': 'false',
        'HAS_RSS_FEED': 'false',
        'LINKS_TO_OSP': 'false',
        'LINKS_TO_TORRENTS': 'false',
        'P2P_MAGNET_LINKS': 'false',
        'REQUIRES_LOGIN': '******',
        'SEARCH_FEATURE': 'false',
    }
    assert received == expected, 'Graph Analyst QAs data error'
Example #2
0
def test_get_country_ranks():
    """Reporter.get_country_ranks() returns the per-country rank records.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I parse the country ranks
    received = domain_intel.Reporter(data=graph_record).get_country_ranks()

    # then I should receive a list of rank structures, in this order
    belgium = {
        'COUNTRY_CODE': 'BE',
        'COUNTRY_NAME': 'Belgium',
        'COUNTRY_RANK': 1440,
    }
    germany = {
        'COUNTRY_CODE': 'DE',
        'COUNTRY_NAME': 'Germany',
        'COUNTRY_RANK': 45635,
    }
    netherlands = {
        'COUNTRY_CODE': 'NL',
        'COUNTRY_NAME': 'Netherlands',
        'COUNTRY_RANK': 2500,
    }
    expected = [belgium, germany, netherlands]
    assert received == expected, 'Country rank error'
Example #3
0
def test_get_geodns_dumping_none_values():
    """Reporter.get_geodns() output for the indiecade.com sample record.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR,
                               'graph_record_indiecade.com.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I parse the GeoDNS
    received = domain_intel.Reporter(data=graph_record).get_geodns()

    # then I should receive one GeoDNS record in which every field
    # other than the address is an empty string
    only_record = {
        'IPV4_ADDR': '216.157.102.147',
        'IPV4_CONTINENT': '',
        'IPV4_CONTINENT_CODE': '',
        'IPV4_COUNTRY': '',
        'IPV4_COUNTRY_CODE': '',
        'IPV4_ISP': '',
        'IPV4_LATITUDE': '',
        'IPV4_LONGITUDE': '',
        'IPV4_ORG': '',
    }
    expected = [only_record]
    assert received == expected, 'GeoDNS error'
Example #4
0
def test_reporter():
    """A domain_intel.Reporter can be created with no data.
    """
    # When I initialise a Domain Intel Reporter without any data
    reporter = domain_intel.Reporter(data=None)

    # then I should get a domain_intel.Reporter instance
    assert isinstance(reporter, domain_intel.Reporter), (
        'Object is not a domain_intel.Reporter instance')
Example #5
0
def test_get_traffichistory_multiple_traffic_results():
    """Reporter.get_traffichistory() record count for the majaa.net sample.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'majaa.net.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I extract the TrafficHistory data
    received = domain_intel.Reporter(data=graph_record).get_traffichistory()

    # then I should receive exactly 12 TrafficHistory records
    # NOTE(review): the message mentions 3 data sets while the check is
    # for 12 records -- presumably 3 sets collapse to 12; confirm.
    record_count = len(received)
    assert record_count == 12, 'TrafficHistory with 3 data sets error'
Example #6
0
def test_dump_wide_column_csv_simple_domain():
    """Reporter.dump_wide_column_csv() output for the simple domain sample.
    """
    # Given a Domain Intel graph record with no ancillary data
    sample_path = os.path.join(SAMPLE_FILE_DIR,
                               'graph_record_simple_domain.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I dump as CSV
    csv_lines = domain_intel.Reporter(data=graph_record).dump_wide_column_csv()
    received = '\n'.join(csv_lines)

    # then I should receive a single CSV line of mostly empty columns
    expected = ('cheapdressonsale.com,"cheapdressonsale.com/",,,,False,,,,,'
                ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,,,,,,,,,')
    assert received == expected, 'Wide column CSV dump of simple domain error'
Example #7
0
def test_reporter_parse():
    """Reporter exposes the parsed ``domain`` and ``countries`` attributes.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I parse the graph construct
    reporter = domain_intel.Reporter(data=graph_record)

    # then the domain attribute should return a value
    expected_domain = {
        'DOMAIN': 'ondertitel.com',
        'ADULT_CONTENT': False,
        'LINKS_IN_COUNT': 141,
        'ENCODING': 'iso-8859-1',
        'LOCALE': 'nl-NL',
        'MEDIAN_LOAD_TIME': 767,
        'DESCRIPTION': (
            '"Database voor het uploaden en downloaden van Nederlandse '
            'ondertitels voor Divx/Xvid."'),
        'ONLINE_SINCE': '23-Feb-2004',
        'RANK': 86580,
        'SPEED_PERCENTILE': 87,
        'TITLE': '"Ondertitel.com"',
    }
    assert reporter.domain == expected_domain, 'Reporter.domain error'

    # and the countries attribute should return a value (order is not
    # significant, so both sides are sorted before comparing)
    expected_countries = ['country/BE', 'country/DE', 'country/NL']
    received_countries = sorted(reporter.countries)
    assert received_countries == sorted(expected_countries), (
        'Reporter.countries error')
Example #8
0
def test_dump_wide_column_csv(mock_time):
    """Reporter.dump_wide_column_csv() output matches the stored CSV result.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I dump as CSV with the mocked gmtime pinned to a fixed epoch
    reporter = domain_intel.Reporter(data=graph_record)
    fixed_gmtime = time.gmtime(1498867200)
    mock_time.gmtime.return_value = fixed_gmtime
    csv_lines = reporter.dump_wide_column_csv()

    # then I should receive a CSV output equal to the stored result file
    # NOTE(review): this path is relative to the current working
    # directory, unlike the SAMPLE_FILE_DIR based fixtures above.
    results_path = os.path.join('domain_intel', 'test', 'files', 'results',
                                'gbq.csv')
    with io.open(results_path, encoding='utf-8') as handle:
        expected = handle.read().rstrip()
    assert '\n'.join(csv_lines) == expected, 'Wide column CSV dump error'
Example #9
0
def test_get_traffichistory():
    """Reporter.get_traffichistory() returns the expected record at index 10.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I extract the TrafficHistory data
    history = domain_intel.Reporter(data=graph_record).get_traffichistory()

    # then the record at index 10 should match the known sample values
    expected = {
        'TRAFFIC_PAGE_VIEWS_PM': 0.27,
        'TRAFFIC_PAGE_VIEWS_USER': 4,
        'TRAFFIC_RANK': 202346,
        'TRAFFIC_REACH': 3,
        'TRAFFIC_TS': 1497139200.0,
    }
    eleventh_record = history[10]
    assert eleventh_record == expected, 'TrafficHistory error'
Example #10
0
def test_get_geodns():
    """Reporter.get_geodns() returns the GeoDNS records of the sample.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I parse the GeoDNS
    received = domain_intel.Reporter(data=graph_record).get_geodns()

    # then I should receive a list of GeoDNS records; both expected
    # records share every field except the IPv4 address
    shared_fields = {
        'IPV4_CONTINENT': u'North America',
        'IPV4_CONTINENT_CODE': u'NA',
        'IPV4_COUNTRY': u'United States',
        'IPV4_COUNTRY_CODE': u'US',
        'IPV4_LATITUDE': 37.7697,
        'IPV4_LONGITUDE': -122.3933,
        'IPV4_ORG': u'"CloudFlare"',
        'IPV4_ISP': u'"CloudFlare"',
    }
    expected = [
        dict(shared_fields, IPV4_ADDR=u'104.27.140.239'),
        dict(shared_fields, IPV4_ADDR=u'104.27.141.239'),
    ]
    assert received == expected, 'GeoDNS error'
Example #11
0
def test_get_traffichistory_dodgy_domain_during_filter():
    """Reporter.get_traffichistory() on the trafficestimate.com sample.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR,
                               'graph_record_trafficestimate.com.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I extract the TrafficHistory data
    history = domain_intel.Reporter(data=graph_record).get_traffichistory()

    # then the record at index 1 should match the known sample values
    expected = {
        'TRAFFIC_PAGE_VIEWS_PM': 0.68,
        'TRAFFIC_PAGE_VIEWS_USER': 3.3,
        'TRAFFIC_RANK': 87993,
        'TRAFFIC_REACH': 8.3,
        'TRAFFIC_TS': 1498953600.0,
    }
    second_record = history[1]
    assert second_record == expected, 'TrafficHistory parse error'
Example #12
0
    def wide_column_dump_worker(self, queue, max_read_count, topic, group_id,
                                dry):
        """Wide-column CSV dump worker.

        Reads messages from *topic* (as consumer group *group_id*),
        decodes each message value as UTF-8 JSON and wraps it in a
        :class:`domain_intel.Reporter`.  Every line produced by the
        reporter's wide-column CSV dump is UTF-8 encoded and sent to the
        ``wide-column-csv`` topic unless *dry* is set.  Lines are counted
        whether or not they are actually sent.

        Reading stops once *max_read_count* messages have been read
        (checked after each message is processed) or, when
        *max_read_count* is ``None``, when the consumer yields no more
        messages.

        As this worker may run alongside other workers, the totals are
        reported by putting a ``(messages_read, lines_counted)`` tuple
        onto *queue* rather than by a return value.

        The parameter list is as per :meth:`wide_column_dump`.

        """
        log.debug('Wide-column CSV dump worker set to read %s messages',
                  max_read_count or 'all')

        read_count = 0
        put_count = 0
        unlimited = max_read_count is None

        with self.producer() as producer:
            with self.consumer(topic, group_id=group_id) as consumer:
                for message in consumer:
                    payload = message.value.decode('utf-8')
                    reporter = domain_intel.Reporter(data=json.loads(payload))
                    read_count += 1

                    for csv_line in reporter.dump_wide_column_csv():
                        if not dry:
                            encoded_line = csv_line.encode('utf-8')
                            producer.send('wide-column-csv', encoded_line)
                        # Counted in dry mode as well.
                        put_count += 1

                    if not unlimited and read_count >= max_read_count:
                        break

        queue.put((read_count, put_count))
Example #13
0
def test_get_sites_linking_in():
    """Reporter.get_sites_linking_in() returns the linking-in site records.
    """
    # Given a Domain Intel graph record loaded from the sample files
    sample_path = os.path.join(SAMPLE_FILE_DIR, 'graph_record.json')
    with io.open(sample_path, encoding='utf-8') as handle:
        raw_json = handle.read().rstrip()
    graph_record = json.loads(raw_json)

    # when I parse the sites linking in
    received = domain_intel.Reporter(data=graph_record).get_sites_linking_in()

    # then I should receive a list of URLs linking in, in this order.
    # Each pair below is (DOMAIN_LINKINGIN, URL_LINKINGIN).
    linking_in_pairs = [
        (u'kaskus.co.id',
         u'"archive.kaskus.co.id:80/thread/13385296/1"'),
        (u'stuffgate.com',
         u'"stuffgate.com:80/stuff/website/top-113000-sites"'),
        (u'aljyyosh.com',
         u'"aljyyosh.com:80/vb/showthread.php?t=12598"'),
        (u'dmoztools.net',
         u'"dmoztools.net:80/World/Nederlands/Computers/Multimedia/Beeld_en_Video"'),
        (u'bsplayer.com',
         u'"forum.bsplayer.com:80/general-talk-support/6000-read-first-before-posting.html"'),
        (u'skynet.be',
         u'"users.skynet.be:80/nedsites/film.html"'),
        (u'infonu.nl',
         u'"pc-en-internet.infonu.nl:80/tutorials/'
         '31155-ondertiteling-onder-film-zetten.html"'),
        (u'napiprojekt.pl',
         u'"forum.napiprojekt.pl:80/viewtopic.php?t=149"'),
        (u'subtitleseeker.com',
         u'"subtitleseeker.com:80/Download-movie-1000292/'
         'Its%20a%20Mad%20Mad%20Mad%20Mad%20World%201963-NTSC"'),
        (u'amara.org',
         u'"amara.org:80/en/videos/G5NFTlUp42ul/hai/1657138/4759909"'),
        (u'startpagina.nl',
         u'"startpagina.nl:80/v/overig/vraag/29532/lettertype-gebruiken-ondertiteling"'),
        (u'blogspot.ch',
         u'"acrossborders.blogspot.ch:80/2006/10/'
         'can-you-ever-say-nigger-without-making.html"'),
        (u'findeen.com',
         u'"be.findeen.com:80/8_mile_watch_online_subs.html"'),
        (u'telenet.be',
         u'"users.telenet.be:80/alenkin/archive.html"'),
        (u'scoop.it',
         u'"scoop.it:80/t/confbosimptoude/p/4075409398/'
         '2017/02/17/razer-nostromo-software-download-chip?"'),
        (u'blogspot.cl',
         u'"streptococcuspyogenes.blogspot.cl:80/2006/06/introduccin_15.html"'),
        (u'blogspot.pe',
         u'"streptococcuspyogenes.blogspot.pe:80/2006/06/introduccin_15.html"'),
        (u'cocolog-nifty.com',
         u'"eurobeter-gc8.cocolog-nifty.com:80/blog/2006/07/post_5b50.html"'),
        (u'blogspot.tw',
         u'"redmotion.blogspot.tw:80/2008/10/new-ice-compounds.html"'),
        (u'secureserver.net',
         u'"ip-173-201-142-193.ip.secureserver.net:80/alexa/Alexa_25.html"'),
    ]
    expected = [
        {'DOMAIN_LINKINGIN': domain, 'URL_LINKINGIN': url}
        for domain, url in linking_in_pairs
    ]
    assert received == expected, 'URL linking in error'