Exemple #1
0
    def __init__(self, thread_count=8):
        """
        Creates the collaborators this object keeps for its lifetime.

        thread_count is handed unchanged to ThreadPool (defaults to 8).

        """
        # Build everything first, in the same order as before, then attach
        # the finished objects to the instance in one step.
        dns_tracer = DNSTracer()
        pool = ThreadPool(thread_count)
        asn_tracer = AsnTracer()
        self.dns_tracer, self.pool, self.asn_tracer = (
            dns_tracer, pool, asn_tracer)
Exemple #2
0
    def __init__(self, thread_count=8):
        """
        Creates the collaborators this object keeps for its lifetime.

        thread_count is handed unchanged to ThreadPool (defaults to 8).

        """
        # Build everything first, in the same order as before, then attach
        # the finished objects to the instance in one step.
        dns_tracer = DNSTracer()
        pool = ThreadPool(thread_count)
        asn_tracer = AsnTracer()
        self.dns_tracer, self.pool, self.asn_tracer = (
            dns_tracer, pool, asn_tracer)
Exemple #3
0
class Browser(object):
    """
    Controls the browser (PhantomJS). Visits sites with it, records hosts
    that were contacted during the page load and does traceroutes to these
    hosts and the dns servers used to locate them.

    """

    def __init__(self, thread_count=8):
        """
        Sets up the DNS tracer, the ASN tracer and the thread pool.

        thread_count is handed to ThreadPool; _trace uses the pool's ``map``
        to trace the queried DNS servers.

        """
        self.dns_tracer = DNSTracer()
        self.pool = ThreadPool(thread_count)
        self.asn_tracer = AsnTracer()

    @staticmethod
    def _hosts_from_output(raw_output):
        """
        Turns the text printed by the browser script into a set of hosts.

        The text is treated as one URL per line, optionally wrapped in
        double quotes. Blank lines and URLs without a network location
        (e.g. ``about:blank``) are skipped.

        """
        hosts = set()
        for line in raw_output.split('\n'):
            # Drop surrounding whitespace (including a trailing '\r') before
            # removing the quotes, so '"http://a/"\r' is still unwrapped.
            url = line.strip().strip('"')
            if not url:
                continue
            # .hostname strips any "user:password@" prefix and ":port"
            # suffix, unwraps bracketed IPv6 literals and lowercases the
            # name. Splitting netloc on ':' got all of those cases wrong.
            hostname = urlparse.urlparse(url).hostname
            if hostname:
                hosts.add(hostname)
        return hosts

    def _hosts_from_page(self, page_url):
        """
        Uses PhantomJS to visit a page and returns the set of hosts that
        were connected to in order to fetch resources when loading the page.

        """
        # NOTE(review): page_url is spliced into the command string unquoted.
        # A URL containing whitespace or a '|' may be split up by envoy's
        # command parsing - confirm, and quote the argument if callers can
        # pass untrusted URLs.
        browser_proc = envoy.run("phantomjs browser.js " + page_url, timeout=30)
        return self._hosts_from_output(browser_proc.std_out)

    def visit_multiple(self, page_urls):
        """
        The plural version of visit. Returns a list with one visit result
        per entry of page_urls, in the same order.

        """
        # TODO: parallelize?
        # A list comprehension keeps the result an eager list even where
        # map() is lazy.
        return [self.visit(page_url) for page_url in page_urls]

    def visit(self, page_url):
        """
        Visits a webpage and determines the paths that are traversed when
        visiting it.

        Returns a PageResult built from a dict with the keys 'page' (the
        url) and 'resource_hosts' (one _trace result per contacted host).

        """
        resource_hosts = self._hosts_from_page(page_url)

        # Progress output; the parenthesised form prints the same text under
        # both the print statement and the print function.
        print(page_url)

        # This stores the results we care about
        page_result = {
            'page': page_url,
            'resource_hosts': [self._trace(host) for host in resource_hosts]
        }
        return PageResult(page_result)

    def _trace(self, host):
        """
        Traces the Asns to a host and to the nameservers used to find the host.
        Returns a dict with the host, the Asns traversed to reach it and, for
        every queried nameserver, the Asns traversed to reach that nameserver.

        """

        def asn_tracer_dns_helper(contacted_host):
            """
            For use on DNS servers.

            """
            return {
                'host': contacted_host,
                'traversed_asns': self.asn_tracer.trace(contacted_host)
            }

        dirty_queried_dns_servers = self.dns_tracer.trace(host)
        # Drop empty entries and private addresses before tracing.
        queried_dns_servers = [
            dns_server for dns_server in dirty_queried_dns_servers
            if dns_server and not is_addr_private(dns_server)
        ]
        return {
            'host': host,
            'traversed_asns': self.asn_tracer.trace(host),
            # The per-nameserver traces are fanned out over the thread pool.
            'queried_dns_servers': self.pool.map(
                asn_tracer_dns_helper,
                queried_dns_servers
            )
        }
Exemple #4
0
class Browser(object):
    """
    Controls the browser (PhantomJS). Visits sites with it, records hosts
    that were contacted during the page load and does traceroutes to these
    hosts and the dns servers used to locate them.

    """

    def __init__(self, thread_count=8):
        """
        Sets up the DNS tracer, the ASN tracer and the thread pool.

        thread_count is handed to ThreadPool; _trace uses the pool's ``map``
        to trace the queried DNS servers.

        """
        self.dns_tracer = DNSTracer()
        self.pool = ThreadPool(thread_count)
        self.asn_tracer = AsnTracer()

    @staticmethod
    def _hosts_from_output(raw_output):
        """
        Turns the text printed by the browser script into a set of hosts.

        The text is treated as one URL per line, optionally wrapped in
        double quotes. Blank lines and URLs without a network location
        (e.g. ``about:blank``) are skipped.

        """
        hosts = set()
        for line in raw_output.split('\n'):
            # Drop surrounding whitespace (including a trailing '\r') before
            # removing the quotes, so '"http://a/"\r' is still unwrapped.
            url = line.strip().strip('"')
            if not url:
                continue
            # .hostname strips any "user:password@" prefix and ":port"
            # suffix, unwraps bracketed IPv6 literals and lowercases the
            # name. Splitting netloc on ':' got all of those cases wrong.
            hostname = urlparse.urlparse(url).hostname
            if hostname:
                hosts.add(hostname)
        return hosts

    def _hosts_from_page(self, page_url):
        """
        Uses PhantomJS to visit a page and returns the set of hosts that
        were connected to in order to fetch resources when loading the page.

        """
        # NOTE(review): page_url is spliced into the command string unquoted.
        # A URL containing whitespace or a '|' may be split up by envoy's
        # command parsing - confirm, and quote the argument if callers can
        # pass untrusted URLs.
        browser_proc = envoy.run("phantomjs browser.js " + page_url,
                                 timeout=30)
        return self._hosts_from_output(browser_proc.std_out)

    def visit_multiple(self, page_urls):
        """
        The plural version of visit. Returns a list with one visit result
        per entry of page_urls, in the same order.

        """
        # TODO: parallelize?
        # A list comprehension keeps the result an eager list even where
        # map() is lazy.
        return [self.visit(page_url) for page_url in page_urls]

    def visit(self, page_url):
        """
        Visits a webpage and determines the paths that are traversed when
        visiting it.

        Returns a PageResult built from a dict with the keys 'page' (the
        url) and 'resource_hosts' (one _trace result per contacted host).

        """
        resource_hosts = self._hosts_from_page(page_url)

        # Progress output; the parenthesised form prints the same text under
        # both the print statement and the print function.
        print(page_url)

        # This stores the results we care about
        page_result = {
            'page': page_url,
            'resource_hosts': [self._trace(host) for host in resource_hosts]
        }
        return PageResult(page_result)

    def _trace(self, host):
        """
        Traces the Asns to a host and to the nameservers used to find the host.
        Returns a dict with the host, the Asns traversed to reach it and, for
        every queried nameserver, the Asns traversed to reach that nameserver.

        """
        def asn_tracer_dns_helper(contacted_host):
            """
            For use on DNS servers.

            """
            return {
                'host': contacted_host,
                'traversed_asns': self.asn_tracer.trace(contacted_host)
            }

        dirty_queried_dns_servers = self.dns_tracer.trace(host)
        # Drop empty entries and private addresses before tracing.
        queried_dns_servers = [
            dns_server for dns_server in dirty_queried_dns_servers
            if dns_server and not is_addr_private(dns_server)
        ]
        return {
            'host': host,
            'traversed_asns': self.asn_tracer.trace(host),
            # The per-nameserver traces are fanned out over the thread pool.
            'queried_dns_servers': self.pool.map(
                asn_tracer_dns_helper, queried_dns_servers)
        }