Example 1
    def test_static_dns_mapping(self):
        s = Settings()
        s.STATIC_DNS_MAPPINGS = {"localhost:123": ("-1.-1.-1.-1", 123)}
        dns = DnsCache(s)
        # the static mapping is returned verbatim; other lookups are
        # resolved normally and cached
        self.assertEqual(("-1.-1.-1.-1", 123), dns["localhost:123"])
        self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
        self.assertEqual(1, len(dns._cache))
Example 2
    def test_dns_cache(self):
        s = Settings()
        s.SIZE_DNS_CACHE = 1
        dns = DnsCache(s)
        # with a cache size of 1 the second lookup evicts the first
        self.assertEqual(('127.0.0.1', 80), dns["localhost:80"])
        self.assertEqual(('127.0.0.1', 81), dns["localhost:81"])
        self.assertEqual(1, len(dns._cache))
Example 3
    def test_crawluri_from_uri_with_credentials(self):

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        now_timestamp = time.mktime(now.timetuple())
        next_crawl_date = now + timedelta(days=1)
        next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(s, StreamHandler(sys.stdout),
                SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
                SimpleTimestampPrioritizer(s))

        uri = ("http://*****:*****@localhost", "123", now_timestamp, 1,
            next_crawl_date_timestamp)

        curi = frontier._crawluri_from_uri(uri)

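        # the stored tuple should round-trip into the request headers and
        # the site credentials checked below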
        self.assertEqual("http://*****:*****@localhost", curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(now),
            curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
Example 4
    def test_that_adding_uris_works(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        next_crawl_date = now + timedelta(days=1)
        curi = CrawlUri("http://localhost")
        curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
        curi.current_priority = 1

        frontier.add_uri(curi)

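        # both hosts should now have been assigned a queue of their own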
        idents = {"localhost": -1, "foreignhost": -1}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            self.assertTrue(row['identifier'] in idents.keys())
            idents["http://%s" % row['identifier']] = row['queue']

        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(idents[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
Example 5
    def test_with_multiple_active_queues(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 2
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        cur = frontier._front_end_queues._cursor

        curi2 = CrawlUri("http://www.google.de")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
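        # _maybe_add_queues should activate up to FRONTIER_ACTIVE_QUEUES
        # queues from the front-end queues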
        frontier._maybe_add_queues()

        self.assertEqual(2, len(frontier._current_queues))

        next_url = frontier.get_next()
Example 6
    def test_sinks(self):
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(s, StreamHandler(sys.stdout),
                SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
                SimpleTimestampPrioritizer(s))
        frontier.add_sink(AbstractCrawlUriSink())
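        # there are no assertions here: the test passes if every process_*
        # call below reaches the sink without raising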

        curi = CrawlUri("http://localhost")
        curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
        curi.current_priority = 2

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_successful_crawl(curi)

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_not_found(curi)

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_redirect(curi)

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_server_error(curi)
Example 7
    def test_crawluri_from_uri_with_credentials(self):

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        now_timestamp = time.mktime(now.timetuple())
        next_crawl_date = now + timedelta(days=1)
        next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))

        uri = ("http://*****:*****@localhost", "123", now_timestamp, 1,
               next_crawl_date_timestamp)

        curi = frontier._crawluri_from_uri(uri)

        self.assertEqual("http://*****:*****@localhost", curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(now),
                         curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
Example 8
    def test_that_adding_uris_works(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        next_crawl_date = now + timedelta(days=1)
        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 1

        frontier.add_uri(curi)

        idents = {"localhost": -1, "foreignhost": -1}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            self.assertTrue(row['identifier'] in idents.keys())
            idents["http://%s" % row['identifier']] = row['queue']

        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(idents[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
Example 9
    def test_sinks(self):
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_successful_crawl(curi)

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_not_found(curi)

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_redirect(curi)

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
        frontier.process_server_error(curi)
Example 10
    def test_create_frontier_works(self):

        handler = logging.StreamHandler(sys.stdout)
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = masterprocess.create_frontier(s, handler)

        self.assertTrue(frontier is not None)
Example 11
    def test_create_frontier_works(self):

        handler = logging.StreamHandler(sys.stdout)
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = masterprocess.create_frontier(s, handler)

        self.assertTrue(frontier is not None)
Example 12
    def test_that_updating_heap_works(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        q1 = []
        q2 = []

        now = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6]) - timedelta(
                days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = {
                "Etag": "123%s" % i,
                "Date": serialize_date_time(now)
            }

            frontier.add_uri(curi)

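            # URIs with an even index get their next-crawl date pulled
            # forward, so they should be served first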
            if i % 2 == 0:
                (url, etag, mod_date, next_date,
                 prio) = frontier._uri_from_curi(curi)
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri(
                    (url, etag, mod_date, next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        self.assertRaises(Empty, frontier._heap.get_nowait)

        for i in range(1, 10):
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            if candidate_uri.url in q1:
                self.assertTrue(candidate_uri.url in q1)
                q1.remove(candidate_uri.url)
            elif candidate_uri.url in q2:
                self.assertTrue(candidate_uri.url in q2)
                q2.remove(candidate_uri.url)

        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
Example 13
    def test_fetching_works(self):

        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'], self._mgmt,
                                fetcher, StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        )
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

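        # the worker should answer with the exact bytes of the served
        # robots.txt file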
        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            robots = open(
                os.path.join(os.path.dirname(__file__),
                             "static/robots.txt")).read()
            self.assertEqual(robots, msg.curi.content_body)
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._io_loop.start()
Example 14
def spyder_management(settings):
    """
    Start new master/worker/logsink processes.
    """

    from spyder import logsink
    import spyder.workerprocess as worker
    import spyder.masterprocess as master

    effective_settings = Settings(settings)

    args = [a.lower() for a in sys.argv]

    if "master" in args:
        args.remove("master")
        master.main(effective_settings)
    elif "worker" in args:
        worker.main(effective_settings)
    elif "logsink" in args:
        logsink.main(effective_settings)
    else:
        print >> sys.stderr, """Usage: spyder-ctrl [master|worker|logsink]

'master'\t\tstart a master process.
'worker'\t\tstart a worker process.
'logsink'\t\tstart a sink for logmessages.
"""
        sys.exit(1)
Example 15
    def setUp(self):

        # create the io_loop
        self._io_loop = IOLoop.instance()

        # and the context
        self._ctx = zmq.Context(1)

        self._settings = Settings()
        self._settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push'
        self._settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \
            self._settings.ZEROMQ_MASTER_PUSH
        self._settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub'
        self._settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \
            self._settings.ZEROMQ_MASTER_SUB

        self._settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master'
        self._settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker'

        # setup the mgmt sockets
        self._setup_mgmt_sockets()

        # setup the data sockets
        self._setup_data_servers()

        # setup the management interface
        self._mgmt = ZmqMgmt(self._mgmt_sockets['worker_sub'],
                             self._mgmt_sockets['worker_pub'],
                             io_loop=self._io_loop)
        self._mgmt.start()
        self._mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, self.on_mgmt_end)
Example 16
    def test_that_updating_heap_works(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        q1 = []
        q2 = []

        now = datetime(*datetime.fromtimestamp(
            time.time()).timetuple()[0:6]) - timedelta(days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = { "Etag" : "123%s" % i, "Date" : serialize_date_time(now) }

            frontier.add_uri(curi)

            if i % 2 == 0:
                (url, etag, mod_date, next_date, prio) = frontier._uri_from_curi(curi)
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri((url, etag, mod_date,
                            next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        self.assertRaises(Empty, frontier._heap.get_nowait)

        for i in range(1, 10):
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            if candidate_uri.url in q1:
                self.assertTrue(candidate_uri.url in q1)
                q1.remove(candidate_uri.url)
            elif candidate_uri.url in q2:
                self.assertTrue(candidate_uri.url in q2)
                q2.remove(candidate_uri.url)

        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
Example 17
    def test_loading_default_settings_works(self):

        from spyder import defaultsettings
        from spyder.core.settings import Settings

        settings = Settings()
        self.assertEqual(defaultsettings.ZEROMQ_MGMT_MASTER,
                         settings.ZEROMQ_MGMT_MASTER)
Example 18
    def test_that_content_type_restriction_works(self):
        xtor = DefaultHtmlLinkExtractor(Settings())

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html"
        self.assertTrue(xtor._restrict_content_type(curi))
        curi.rep_header["Content-Type"] = "pille/palle"
        self.assertFalse(xtor._restrict_content_type(curi))
Example 19
    def test_loading_custom_settings_works(self):

        from spyder import defaultsettings
        from spyder.core.settings import Settings

        import test_settings_settings
        settings = Settings(test_settings_settings)

        self.assertEqual(test_settings_settings.ZEROMQ_MGMT_WORKER,
                         settings.ZEROMQ_MGMT_WORKER)
Example 20
    def test_that_creating_mgmt_works(self):

        ctx = zmq.Context()
        io_loop = IOLoop.instance()

        def stop_looping(_msg):
            io_loop.stop()

        settings = Settings()
        settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push'
        settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \
            settings.ZEROMQ_MASTER_PUSH
        settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub'
        settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \
            settings.ZEROMQ_MASTER_SUB

        settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master'
        settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker'

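        # bind the master side of both management channels; the worker-side
        # management created below is expected to connect to them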
        pubsocket = ctx.socket(zmq.PUB)
        pubsocket.bind(settings.ZEROMQ_MGMT_MASTER)
        pub_stream = ZMQStream(pubsocket, io_loop)

        subsocket = ctx.socket(zmq.SUB)
        subsocket.setsockopt(zmq.SUBSCRIBE, "")
        subsocket.bind(settings.ZEROMQ_MGMT_WORKER)
        sub_stream = ZMQStream(subsocket, io_loop)

        mgmt = workerprocess.create_worker_management(settings, ctx, io_loop)
        mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, stop_looping)
        mgmt.start()

        def assert_quit_message(msg):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data)

        sub_stream.on_recv(assert_quit_message)

        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        pub_stream.send_multipart(death.serialize())

        io_loop.start()

        mgmt._out_stream.close()
        mgmt._in_stream.close()
        mgmt._publisher.close()
        mgmt._subscriber.close()
        pub_stream.close()
        pubsocket.close()
        sub_stream.close()
        subsocket.close()
        ctx.term()
Example 21
    def test_regex_scoper(self):

        curi = CrawlUri()
        curi.optional_vars = dict()
        curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([
            "http://www.google.de/index.html",
            "ftp://www.google.de/pillepalle.avi",
        ])

        settings = Settings()
        settings.REGEX_SCOPE_POSITIVE = ['^.*\.html']
        settings.REGEX_SCOPE_NEGATIVE = ['^.*\.avi']
        scoper = RegexScoper(settings)

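        # the positive rule keeps the .html link, the negative rule drops
        # the .avi link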
        curi = scoper(curi)

        print curi.optional_vars[CURI_EXTRACTED_URLS]
        self.assertTrue("http://www.google.de/index.html" in
                curi.optional_vars[CURI_EXTRACTED_URLS])
        self.assertFalse("ftp://www.google.de/pillepalle.avi" in
                curi.optional_vars[CURI_EXTRACTED_URLS])
Example 22
    def test_regex_scoper(self):

        curi = CrawlUri()
        curi.optional_vars = dict()
        curi.optional_vars[CURI_EXTRACTED_URLS] = "\n".join([
            "http://www.google.de/index.html",
            "ftp://www.google.de/pillepalle.avi",
        ])

        settings = Settings()
        settings.REGEX_SCOPE_POSITIVE = ['^.*\.html']
        settings.REGEX_SCOPE_NEGATIVE = ['^.*\.avi']
        scoper = RegexScoper(settings)

        curi = scoper(curi)

        print curi.optional_vars[CURI_EXTRACTED_URLS]
        self.assertTrue("http://www.google.de/index.html" in
                        curi.optional_vars[CURI_EXTRACTED_URLS])
        self.assertFalse("ftp://www.google.de/pillepalle.avi" in
                         curi.optional_vars[CURI_EXTRACTED_URLS])
Example 23
    def test_that_time_based_politeness_works(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(
            time.time()).timetuple()[0:6]) - timedelta(days=2)
        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
        curi.req_time = 0.5

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

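        # processing the crawl should push the next possible crawl time
        # into the future, based on the request time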
        a = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)
        self.assertTrue(frontier._next_possible_crawl > a)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
Example 24
    def test_with_multiple_active_queues(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 2
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        cur = frontier._front_end_queues._cursor

        curi2 = CrawlUri("http://www.google.de")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(2, len(frontier._current_queues))

        next_url = frontier.get_next()
Example 25
    def test_that_time_based_politeness_works(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6]) - timedelta(
                days=2)
        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.req_time = 0.5

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

        a = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)
        self.assertTrue(frontier._next_possible_crawl > a)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
Example 26
    def test_adding_uri_works(self):

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        next_crawl_date = now + timedelta(days=1)

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = { "Etag" : "123", "Date" : serialize_date_time(now) }
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(s, StreamHandler(sys.stdout),
                SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
                SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

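        # the queue head should now contain the URI together with its Etag
        # and modification date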
        for uri in frontier._front_end_queues.queue_head():
            (url, etag, mod_date, queue, next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri
Example 27
    def test_that_creating_mgmt_works(self):

        ctx = zmq.Context()
        io_loop = IOLoop.instance()

        def stop_looping(_msg):
            io_loop.stop()

        settings = Settings()
        settings.ZEROMQ_MASTER_PUSH = 'inproc://spyder-zmq-master-push'
        settings.ZEROMQ_WORKER_PROC_FETCHER_PULL = \
            settings.ZEROMQ_MASTER_PUSH
        settings.ZEROMQ_MASTER_SUB = 'inproc://spyder-zmq-master-sub'
        settings.ZEROMQ_WORKER_PROC_EXTRACTOR_PUB = \
            settings.ZEROMQ_MASTER_SUB

        settings.ZEROMQ_MGMT_MASTER = 'inproc://spyder-zmq-mgmt-master'
        settings.ZEROMQ_MGMT_WORKER = 'inproc://spyder-zmq-mgmt-worker'

        pubsocket = ctx.socket(zmq.PUB)
        pubsocket.bind(settings.ZEROMQ_MGMT_MASTER)
        pub_stream = ZMQStream(pubsocket, io_loop)

        subsocket = ctx.socket(zmq.SUB)
        subsocket.setsockopt(zmq.SUBSCRIBE, "")
        subsocket.bind(settings.ZEROMQ_MGMT_WORKER)
        sub_stream = ZMQStream(subsocket, io_loop)

        mgmt = workerprocess.create_worker_management(settings, ctx, io_loop)
        mgmt.add_callback(ZMQ_SPYDER_MGMT_WORKER, stop_looping)
        mgmt.start()

        def assert_quit_message(msg):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg.data)

        sub_stream.on_recv(assert_quit_message)

        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        pub_stream.send_multipart(death.serialize())

        io_loop.start()

        mgmt._out_stream.close()
        mgmt._in_stream.close()
        mgmt._publisher.close()
        mgmt._subscriber.close()
        pub_stream.close()
        pubsocket.close()
        sub_stream.close()
        subsocket.close()
        ctx.term()
Example 28
    def test_that_cleaning_qs_works(self):
        s = Settings()
        c = CleanupQueryString(s)

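        # fragments, empty query strings and trailing ampersands should be
        # stripped; regular parameters are kept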
        self.assertEqual(
            "http://tesT.com/t.html?p=a",
            c._cleanup_query_string("http://tesT.com/t.html?p=a#top"))

        self.assertEqual(
            "http://test.com/t.html",
            c._cleanup_query_string("http://test.com/t.html?#top"))

        self.assertEqual(
            "http://test.com/t.html?test=a",
            c._cleanup_query_string("http://test.com/t.html?test=a&"))
Example 29
    def test_adding_uri_works(self):

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        next_crawl_date = now + timedelta(days=1)

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        for uri in frontier._front_end_queues.queue_head():
            (url, etag, mod_date, queue, next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri
Example 30
    def test_that_with_uri_works(self):

        s = StripSessionIds(Settings())

        urls = [
            "http://preis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2",
            "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2",
        ]
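        # sid, jsessionid, phpsessid and aspsessionid should all be removed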

        curi = CrawlUri()
        curi.optional_vars = {CURI_EXTRACTED_URLS: "\n".join(urls)}

        curi = s(curi)
        clean_urls = curi.optional_vars[CURI_EXTRACTED_URLS].split('\n')

        for u in clean_urls:
            self.assertEqual("http://preis.de/traeger/index.php?", u)
Example 31
    def test_missing_encoding_works(self):
        src = "<a href='http://www.google.de' title='ups'> viel text</a>" + \
            "<a title='ups i did it again' href ='/relative.html'>und " + \
            "noch mehr!</a><a href='evenmorerelative.html'>"

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = src
        curi.optional_vars = dict()

        xtor = DefaultHtmlLinkExtractor(Settings())
        curi = xtor(curi)

        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        self.assertEqual("http://www.google.de", links[0])
        self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
        self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
                         links[2])
Example 32
    def test_link_extraction_works(self):

        src = "<a href='http://www.google.de' title='ups'> viel text</a>" + \
            "<a title='ups i did it again' href ='/relative.html'>und " + \
            "noch mehr!</a><a href='evenmorerelative.html'/>" + \
            "<a href='&#109;&#97;&#105;&#108;&#116;&#111;&#58;&#109;&#117;&#115;&#116;&#101;&#114;&#64;&#98;&#102;&#97;&#114;&#109;&#46;&#100;&#101;'/>"

        curi = CrawlUri()
        curi.rep_header = dict()
        curi.rep_header["Content-Type"] = "text/html; charset=utf-8"
        curi.url = "http://www.bmg.bund.de/test/"
        curi.content_body = src
        curi.optional_vars = dict()

        xtor = DefaultHtmlLinkExtractor(Settings())
        curi = xtor(curi)

        links = curi.optional_vars[CURI_EXTRACTED_URLS].split("\n")
        self.assertEqual("http://www.google.de", links[0])
        self.assertEqual("http://www.bmg.bund.de/relative.html", links[1])
        self.assertEqual("http://www.bmg.bund.de/test/evenmorerelative.html",
                         links[2])
Example 33
    def test_that_creating_fetcher_works(self):
        ctx = zmq.Context()
        io_loop = IOLoop.instance()

        def stop_looping(_msg):
            io_loop.stop()

        settings = Settings()

        master_push = ctx.socket(zmq.PUSH)
        master_push.bind(settings.ZEROMQ_MASTER_PUSH)

        fetcher = workerprocess.create_worker_fetcher(
            settings, {}, ctx, StreamHandler(sys.stdout), io_loop)

        self.assertTrue(isinstance(fetcher._processing, FetchProcessor))
        self.assertTrue(isinstance(fetcher, AsyncZmqWorker))

        fetcher._insocket.close()
        fetcher._outsocket.close()
        master_push.close()
        ctx.term()
Example 34
    def test_that_stripping_session_stuff_works(self):

        s = StripSessionIds(Settings())

        url = "http://pREis.de/traeger/index.php?sid=8429fb3ae210a2a0e28800b7f48d90f2"

        self.assertEqual("http://pREis.de/traeger/index.php?",
                         s._remove_session_ids(url))

        url = "http://preis.de/traeger/index.php?jsessionid=8429fb3ae210a2a0e28800b7f48d90f2"

        self.assertEqual("http://preis.de/traeger/index.php?",
                         s._remove_session_ids(url))

        url = "http://preis.de/traeger/index.php?phpsessid=8429fb3ae210a2a0e28800b7f48d90f2"

        self.assertEqual("http://preis.de/traeger/index.php?",
                         s._remove_session_ids(url))

        url = "http://preis.de/traeger/index.php?aspsessionid=8429fb3ae210a2a0e28800b7f48d90f2"

        self.assertEqual("http://preis.de/traeger/index.php?",
                         s._remove_session_ids(url))
Example 35
    def test_that_creating_processing_function_works(self):
        settings = Settings()
        processors = settings.SPYDER_EXTRACTOR_PIPELINE
        processors.extend(settings.SPYDER_SCOPER_PIPELINE)
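        # modules that do not expose a valid processor interface should make
        # create_processing_function raise a ValueError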
        processors.append('test_workerprocess')
        self.assertRaises(ValueError, workerprocess.create_processing_function,
                          settings, processors)

        processors.pop()
        processors.append('test_workerprocess_unspec')
        self.assertRaises(ValueError, workerprocess.create_processing_function,
                          settings, processors)

        processors.pop()
        processing = workerprocess.create_processing_function(
            settings, processors)

        curi = CrawlUri(optional_vars=dict())
        curi.effective_url = "http://127.0.0.1/robots.txt"
        curi2 = processing(curi)

        self.assertEqual(CURI_OPTIONAL_TRUE,
                         curi2.optional_vars[CURI_EXTRACTION_FINISHED])
Example 36
    def test_fetching_last_modified_works(self):

        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'], self._mgmt,
                                fetcher, StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        mtimestamp = datetime.fromtimestamp(
            os.stat(os.path.join(self._path, "robots.txt")).st_mtime)
        mtime = serialize_date_time(mtimestamp)
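        # sending the file's own modification time as Last-Modified should
        # yield a 304 response without a content body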
        curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
                        effective_url="http://127.0.0.1:%s/robots.txt" %
                        self.port,
                        req_header={"Last-Modified": mtime})

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            self.assertEqual(304, msg.curi.status_code)
            self.assertEqual("", msg.curi.content_body)
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()
Example 37
    def test_fetching_etag_works(self):

        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'], self._mgmt,
                                fetcher, StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header={
                "Etag": "\"3926227169c58185234888b60000c6eb1169577d\""
            })
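        # a matching Etag should make the server answer 304 Not Modified
        # with an empty body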

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            self.assertEqual(304, msg.curi.status_code)
            self.assertEqual("", msg.curi.content_body)
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()
Example 38
    def test_queues_work(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 1
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5
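        # each queue starts with a budget of 4; a successful crawl costs 1
        # and a server error is punished with 5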

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        cur = frontier._front_end_queues._cursor

        curi2 = CrawlUri("http://foreignhost")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(1, len(frontier._current_queues))
        for q1 in frontier._current_queues.keys():
            pass

        self.assertEqual(4, frontier._budget_politeness[q1])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q1])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q1 == 1:
            curi1.status_code = 500
            frontier.process_server_error(curi1)
        else:
            curi2.status_code = 500
            frontier.process_server_error(curi2)

        self.assertEqual(-1, frontier._budget_politeness[q1])

        frontier._cleanup_budget_politeness()

        self.assertEqual(1, len(frontier._current_queues))
        for q2 in frontier._current_queues.keys():
            pass

        self.assertEqual(4, frontier._budget_politeness[q2])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q2])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q2 == 1:
            curi1.status_code = 200
            frontier.process_successful_crawl(curi1)
        else:
            curi2.status_code = 200
            frontier.process_successful_crawl(curi2)

        self.assertEqual(3, frontier._budget_politeness[q2])

        frontier._cleanup_budget_politeness()
Example 39
    def test_queues_work(self):

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 1
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4

        frontier.add_uri(curi1)

        cur = frontier._front_end_queues._cursor

        curi2 = CrawlUri("http://foreignhost")
        curi2.current_priority = 1
        curi2.req_time = 1.4

        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(1, len(frontier._current_queues))
        for q1 in frontier._current_queues.keys():
            pass

        self.assertEqual(4, frontier._budget_politeness[q1])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q1])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q1 == 1:
            curi1.status_code = 500
            frontier.process_server_error(curi1)
        else:
            curi2.status_code = 500
            frontier.process_server_error(curi2)

        self.assertEqual(-1, frontier._budget_politeness[q1])

        frontier._cleanup_budget_politeness()

        self.assertEqual(1, len(frontier._current_queues))
        for q2 in frontier._current_queues.keys():
            pass

        self.assertEqual(4, frontier._budget_politeness[q2])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q2])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q2 == 1:
            curi1.status_code = 200
            frontier.process_successful_crawl(curi1)
        else:
            curi2.status_code = 200
            frontier.process_successful_crawl(curi2)

        self.assertEqual(3, frontier._budget_politeness[q2])

        frontier._cleanup_budget_politeness()