def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Dequeues new batch of requests for crawling.

    :param max_n_requests: maximum number of requests to return
    :param partition_id: partition id
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    results = []
    try:
        for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\
                limit(max_n_requests):
            method = 'GET' if not item.method else item.method
            r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies)
            r.meta['fingerprint'] = item.fingerprint
            r.meta['score'] = item.score
            results.append(r)
            # Dequeued items are removed; the commit below makes it atomic.
            self.session.delete(item)
        self.session.commit()
    except Exception as exc:  # BUG FIX: was Python 2 "except Exception, exc" syntax
        self.logger.exception(exc)
        self.session.rollback()
    # BUG FIX: the original never returned the batch, so callers always got None.
    return results
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Dequeues new batch of requests for crawling.

    :param max_n_requests: maximum number of requests to return
    :param partition_id: partition id
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    batch = []
    try:
        query = self.session.query(self.queue_model).filter_by(partition_id=partition_id)
        for row in self._order_by(query).limit(max_n_requests):
            request = Request(row.url,
                              method=row.method or b'GET',
                              meta=row.meta,
                              headers=row.headers,
                              cookies=row.cookies)
            request.meta[b'fingerprint'] = to_bytes(row.fingerprint)
            request.meta[b'score'] = row.score
            batch.append(request)
            # remove the dequeued row; committed once the whole batch is built
            self.session.delete(row)
        self.session.commit()
    except Exception as exc:
        self.logger.exception(exc)
        self.session.rollback()
    return batch
def test_states():
    """Round-trip 128 random request states through the HCF states backend."""
    logging.basicConfig(level=logging.DEBUG)
    states = HCFStates(config.API_KEY, config.PROJECT_ID,
                       config.FRONTIER_NAME, 256, True)
    states.frontier_start()
    possible_states = [HCFStates.NOT_CRAWLED, HCFStates.QUEUED,
                       HCFStates.CRAWLED, HCFStates.ERROR]
    objs = []
    for _ in range(128):
        req = Request('http://website.com/%d' % randint(0, maxsize))
        req.meta[b'fingerprint'] = generate_fprint()
        req.meta[b'state'] = choice(possible_states)
        objs.append(req)
    fprints = [req.meta[b'fingerprint'] for req in objs]
    states.update_cache(objs)
    states.flush()
    # cache is warm
    check_states(states, fprints, objs)
    # clear the cache, then verify states can be fetched from storage
    states.flush(force_clear=True)
    check_states(states, fprints, objs)
def test_should_parse_domain_info(self):
    """DomainMiddleware should attach basic domain meta to every seed."""
    seeds = [
        Request('http://example.com'),
        Request('https://www.google.com'),
    ]
    mware = DomainMiddleware(self.fake_manager)
    result = mware.add_seeds(seeds)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual, consistent with the tldextract variant of this test.
    self.assertEqual(len(result), len(seeds))
    for r in result:
        self.assertIn(b'domain', r.meta, 'Missing domain info for %r' % r)
    expected = [
        {
            b'name': b'example.com',
            b'netloc': b'example.com',
            b'scheme': b'http',
            b'sld': b'',
            b'subdomain': b'',
            b'tld': b''
        },
        {
            b'name': b'www.google.com',
            b'netloc': b'www.google.com',
            b'scheme': b'https',
            b'sld': b'',
            b'subdomain': b'',
            b'tld': b''
        },
    ]
    self.assertEqual(expected, [r.meta[b'domain'] for r in result])
def test_should_parse_tldextract_extra_domain_info(self):
    """With TLDEXTRACT_DOMAIN_INFO on, sld/subdomain/tld are populated."""
    self.fake_manager.settings = {'TLDEXTRACT_DOMAIN_INFO': True}
    seeds = [
        Request('http://example.com'),
        Request('https://www.google.com'),
    ]
    middleware = DomainMiddleware(self.fake_manager)
    processed = middleware.add_seeds(seeds)
    self.assertEqual(len(processed), len(seeds))
    for request in processed:
        self.assertIn(b'domain', request.meta, 'Missing domain info for %r' % request)
    expected = [
        {
            b'name': b'example.com',
            b'netloc': b'example.com',
            b'scheme': b'http',
            b'sld': b'example',
            b'subdomain': b'',
            b'tld': b'com'
        },
        {
            b'name': b'google.com',
            b'netloc': b'www.google.com',
            b'scheme': b'https',
            b'sld': b'google',
            b'subdomain': b'www',
            b'tld': b'com'
        },
    ]
    self.assertEqual(expected, [request.meta[b'domain'] for request in processed])
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Dequeues new batch of requests for crawling.

    :param max_n_requests: maximum number of requests to return
    :param partition_id: partition id
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    results = []
    try:
        dequeued_urls = 0
        cql_ditems = []
        d_query = self.session.prepare("DELETE FROM queue WHERE crawl = ? AND fingerprint = ? AND partition_id = ? "
                                       "AND score = ? AND created_at = ?")
        for item in self.queue_model.objects.filter(crawl=self.crawl_id, partition_id=partition_id).\
                order_by("partition_id", "score", self._order_by()).limit(max_n_requests):
            method = 'GET' if not item.method else item.method
            # Copy the meta object's public attributes into a plain dict:
            # objects fail to encode for the message bus, dicts don't.
            meta_fields = dict((name, getattr(item.meta, name))
                               for name in dir(item.meta) if not name.startswith('__'))
            wanted = ("fingerprint", "domain", "origin_is_frontier", "scrapy_callback",
                      "scrapy_errback", "scrapy_meta", "score", "jid")
            meta_dict = dict((key, meta_fields[key]) for key in wanted)
            r = Request(item.url, method=method, meta=meta_dict, headers=item.headers, cookies=item.cookies)
            r.meta['fingerprint'] = item.fingerprint
            r.meta['score'] = item.score
            results.append(r)
            cql_ditems.append((item.crawl, item.fingerprint, item.partition_id, item.score, item.created_at))
            dequeued_urls += 1
        if dequeued_urls > 0:
            # delete all dequeued rows in one concurrent CQL batch
            execute_concurrent_with_args(self.session, d_query, cql_ditems, concurrency=200)
        self.counter_cls.cass_count({"dequeued_urls": dequeued_urls})
    except Exception as exc:  # BUG FIX: was Python 2 "except Exception, exc" syntax
        self.logger.exception(exc)
    # BUG FIX: the original never returned the batch, so callers always got None.
    return results
def create_request(self, url, method='GET', headers=None, cookies=None, meta=None, body=''):
    """
    Creates request with specified fields, with state fetched from backend.

    :param url: str
    :param method: str
    :param headers: dict
    :param cookies: dict
    :param meta: dict
    :param body: str
    :return: :class:`Request <frontera.core.models.Request>`
    """
    request = Request(url, method=method, headers=headers, cookies=cookies,
                      meta=meta, body=body)
    # fingerprint first, since the states context keys requests by it
    self.url_mw._add_fingerprint(request)
    self._states_context.refresh_and_keep(request)
    return request
def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''):
    """
    Creates request with specified fields, with state fetched from backend.

    This method only creates request, but isn't getting it's state from storage.
    Use self.refresh_states on a batch of requests to get their states from storage.

    :param url: str
    :param method: str
    :param headers: dict
    :param cookies: dict
    :param meta: dict
    :param body: str
    :return: :class:`Request <frontera.core.models.Request>`
    """
    request = Request(url, method=method, headers=headers, cookies=cookies,
                      meta=meta, body=body)
    self.url_mw._add_fingerprint(request)
    return request
def single_node_chain(url1, url2):
    """Build a Response for url2 that records a single redirect hop from url1."""
    request = Request(url=url1)
    response = Response(url=url2, request=request)
    response.meta.update({
        b'fingerprint': sha1(url2),
        b'redirect_urls': [url1],
        b'redirect_fingerprints': [sha1(url1)],
    })
    return response
def schedule(self, batch):
    """Queue a Request for every batch entry whose schedule flag is set."""
    for fprint, score, request, should_schedule in batch:
        if should_schedule:
            meta = {b'fingerprint': fprint, b'score': score}
            self.requests.append(Request(request.url, meta=meta))
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Dequeues new batch of requests for crawling.

    Priorities, from highest to lowest:
     - max_requests_per_host
     - max_n_requests
     - min_hosts & min_requests

    :param max_n_requests:
    :param partition_id:
    :param kwargs: min_requests, min_hosts, max_requests_per_host
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    min_requests = kwargs.pop("min_requests", None)
    min_hosts = kwargs.pop("min_hosts", None)
    max_requests_per_host = kwargs.pop("max_requests_per_host", None)
    assert(max_n_requests > min_requests)

    queue = {}
    limit = max_n_requests
    tries = 0
    count = 0
    while tries < self.GET_RETRIES:
        tries += 1
        # widen the scan window on every retry to collect enough hosts/requests
        limit *= 5.5 if tries > 1 else 1.0
        self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d",
                          tries, limit, count, len(queue.keys()))
        queue.clear()
        count = 0
        for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\
                limit(limit):
            if item.host_crc32 not in queue:
                queue[item.host_crc32] = []
            if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host:
                continue
            queue[item.host_crc32].append(item)
            count += 1
            if count > max_n_requests:
                break
        if min_hosts is not None and len(queue.keys()) < min_hosts:
            continue
        if min_requests is not None and count < min_requests:
            continue
        break
    self.logger.debug("Finished: tries %d, hosts %d, requests %d",
                      tries, len(queue.keys()), count)

    results = []
    # BUG FIX: dict.itervalues() does not exist in Python 3; values() works in both.
    for items in queue.values():
        for item in items:
            method = 'GET' if not item.method else str(item.method)
            results.append(Request(item.url, method=method, meta=item.meta,
                                   headers=item.headers, cookies=item.cookies))
            self.session.delete(item)
    self.session.commit()
    return results
def generate_requests(self):
    """Populate self.hosts with random 5-letter hosts and build one request each."""
    def random_host():
        return "".join(choice(ascii_lowercase) for _ in range(5))

    # a set, so duplicate random hosts collapse (at most 21 entries)
    self.hosts = set()
    for _ in range(21):
        self.hosts.add(random_host())
    self.requests = [Request("http://%s/" % host) for host in self.hosts]
def test_basic():
    """The Basic canonical solver rewrites a redirected response to its first URL."""
    solver = Basic()
    request = Request(url="http://www.scrapinghub.com/")
    response = Response(url="http://scrapinghub.com/", request=request)
    response.meta['fingerprint'] = "6d8afb0c246caa28a2c1bdaaac19c70c24a2d22e"
    response.meta['redirect_urls'] = ['http://www.scrapinghub.com/']
    response.meta['redirect_fingerprints'] = ["6cd0a1e069d5a1666a6ec290a4b33f5f325c2e66"]
    solver.page_crawled(response, [])
    assert response.url == "http://www.scrapinghub.com/"
def test_states():
    """Exercise HCF states: warm-cache check, then re-fetch after a forced clear."""
    logging.basicConfig(level=logging.DEBUG)
    states = HCFStates(config.API_KEY, config.PROJECT_ID,
                       config.FRONTIER_NAME, 256, True)
    states.frontier_start()
    state_choices = [HCFStates.NOT_CRAWLED, HCFStates.QUEUED,
                     HCFStates.CRAWLED, HCFStates.ERROR]
    objs = []
    fprints = []
    for _ in range(128):
        request = Request('http://website.com/%d' % randint(0, maxsize))
        request.meta[b'fingerprint'] = generate_fprint()
        request.meta[b'state'] = choice(state_choices)
        objs.append(request)
        fprints.append(request.meta[b'fingerprint'])
    states.update_cache(objs)
    states.flush()
    # cache is warm
    check_states(states, fprints, objs)
    # clear the cache, then verify states come back from storage
    states.flush(force_clear=True)
    check_states(states, fprints, objs)
def test_scheduling_past_1part_post(self):
    """A scheduled POST request keeps its method and body through the queue."""
    queue = MemoryQueue(1)
    payload = {'id': 'xxx', 'name': 'yyy'}
    request = Request(url='https://www.knuthellan.com/', body=payload, method='POST')
    queue.schedule([("1", 1, request, True)])
    for dequeued in queue.get_next_requests(5, 0):
        self.assertTrue(dequeued.method == b'POST')
        self.assertTrue(dequeued.body == payload)
def get_next_requests(self, max_n_requests, partition_id, score, **kwargs):
    """Dequeue up to max_n_requests queued requests with at least the given score."""
    results = []
    try:
        queue = self.queue_model
        matching = (self.session.query(queue)
                    .filter(queue.partition_id == partition_id,
                            queue.score >= score)
                    .order_by(queue.created_at)
                    .limit(max_n_requests))
        for item in matching:
            request = Request(item.url,
                              method=item.method or b'GET',
                              meta=item.meta,
                              headers=item.headers,
                              cookies=item.cookies)
            fp = item.fingerprint
            self.logger.info(f"retrieved request {fp[:6]}...{fp[-6:]}")
            request.meta[b'fingerprint'] = to_bytes(item.fingerprint)
            request.meta[b'score'] = item.score
            results.append(request)
            self.session.delete(item)
        self.session.commit()
    except Exception as exc:
        self.logger.exception(exc)
        self.session.rollback()
    self.logger.info(f"Got {len(results)} next requests with score {score}")
    return results
def check_states(states, fprints, objs):
    """Fetch states for fprints, then verify freshly-built requests get the
    same per-fingerprint states as the originals."""
    states.fetch(fprints)
    fresh = [Request(o.url, meta={b'fingerprint': o.meta[b'fingerprint']})
             for o in objs]
    states.set_states(fresh)
    # zip stops at the shorter sequence, same as the original paired iteration
    for original, restored in zip(objs, fresh):
        assert original.meta[b'fingerprint'] == restored.meta[b'fingerprint']
        assert original.meta[b'state'] == restored.meta[b'state']
def consume_scoring(self, *args, **kwargs): consumed = 0 seen = set() batch = [] for m in self.scoring_log_consumer.get_messages(count=self.consumer_batch_size): try: msg = self._decoder.decode(m) except (KeyError, TypeError), e: logger.error("Decoding error: %s", e) continue else: if msg[0] == 'update_score': _, fprint, score, url, schedule = msg if fprint not in seen: batch.append((fprint, score, Request(url), schedule)) seen.add(fprint) if msg[0] == 'new_job_id': self.job_id = msg[1] finally:
def test_queue():
    """Schedule one request into the HCF queue and read it back intact."""
    logging.basicConfig(level=logging.DEBUG)
    queue = HCFQueue(config.API_KEY, config.PROJECT_ID, config.FRONTIER_NAME,
                     10000, 1, 1, "", True)
    queue.frontier_start()
    request = Request(url="http://scrapinghub.com",
                      meta={b"fingerprint": b"abcdef01234567890",
                            "native": "string test"})
    queue.schedule([("", 0.9, request, True)])
    # give the remote frontier a moment to persist the batch
    sleep(4)
    batch = queue.get_next_requests(256, 0)
    assert batch[0].url == request.url
    assert batch[0].meta[b'fingerprint'] == request.meta[b'fingerprint']
    assert batch[0].meta["native"] == request.meta["native"]
    queue.frontier_stop()
import pytest from frontera.core.components import States from frontera.core.models import Request from happybase import Connection from frontera.contrib.backends.hbase import HBaseState, HBaseQueue from frontera.contrib.backends.sqlalchemy import States as SQLAlchemyStates, Queue as SQLAlchemyQueue from frontera.contrib.backends.sqlalchemy.models import StateModel, QueueModel from frontera.contrib.backends.memory import MemoryStates, MemoryQueue from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker r1 = Request('https://www.example.com', meta={ b'fingerprint': b'10', b'domain': { b'name': b'www.example.com', b'fingerprint': b'81' } }) r2 = Request('http://example.com/some/page/', meta={ b'fingerprint': b'11', b'domain': { b'name': b'example.com', b'fingerprint': b'82' } }) r3 = Request('http://www.scrapy.org', meta={ b'fingerprint': b'12', b'domain': {
from frontera.core.models import Request, Response from frontera.worker.db import DBWorker, ScoringConsumer, IncomingConsumer, BatchGenerator from frontera.settings import Settings from frontera.core.components import States import unittest r1 = Request('http://www.example.com/', meta={ b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0 }) r2 = Request('http://www.scrapy.org/', meta={ b'fingerprint': b'2', b'state': States.DEFAULT, b'jid': 0 }) r3 = Request('https://www.dmoz.org', meta={ b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0 }) class TestDBWorker(unittest.TestCase): def dbw_setup(self, distributed=False): settings = Settings() settings.MAX_NEXT_REQUESTS = 64 settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus'
from __future__ import absolute_import from frontera.core.manager import FrontierManager from frontera.settings import Settings from frontera.core.models import Request, Response from six.moves import range r1 = Request( 'http://www.example.com', meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'}) r2 = Request( 'https://www.example.com/some/page', meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'}) r3 = Request( 'http://example1.com', meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'}) class TestFrontierManager(object): def setup_frontier_manager(self, settings=None): settings = settings or Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' settings.MIDDLEWARES = [ 'tests.mocks.components.FakeMiddleware', 'tests.mocks.components.FakeMiddlewareModifySeeds', 'tests.mocks.components.FakeMiddlewareModifyResponse', 'tests.mocks.components.FakeMiddlewareModifyLinks' ] settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' return FrontierManager.from_settings(settings) def test_start(self):
from frontera.core.manager import LocalFrontierManager from frontera.settings import Settings from frontera.core.models import Request, Response from frontera.core.components import States from six.moves import range from unittest import TestCase r1 = Request( 'http://www.example.com', meta={b'fingerprint': b'89e6a0649e06d83370cdf2cbfb05f363934a8d0c'}) r2 = Request( 'https://www.example.com/some/page', meta={b'fingerprint': b'61aec35fac3a032b3be3a5d07eb9e0024bd89de1'}) r3 = Request( 'http://example1.com', meta={b'fingerprint': b'758293d800fc9672ae2c68bd083359b74ab9b6c2'}) seeds_blob = b"""http://www.example.com https://www.example.com/some/page http://example1.com """ from io import BytesIO SEEDS_FILE = BytesIO(seeds_blob) class TestFrontierManager(TestCase): def setup_frontier_manager(self, settings=None): settings = settings or Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' settings.MIDDLEWARES = [
from __future__ import absolute_import from frontera.core import OverusedBuffer from frontera.core.models import Request from six.moves import range from itertools import cycle from random import choice, sample from string import ascii_lowercase r1 = Request('http://www.example.com') r2 = Request('http://www.example.com/some/') r3 = Request('htttp://www.example.com/some/page/') r4 = Request('http://example.com') r5 = Request('http://example.com/some/page') r6 = Request('http://example1.com') class TestOverusedBuffer(object): requests = [r1, r2, r3, r4, r5, r6] def get_once(self, max_n_requests, **kwargs): lst = [] for _ in range(max_n_requests): try: lst.append(next(self.req_it)) except StopIteration: break return lst def test_base(self): self.req_it = iter(self.requests)
from frontera.worker.strategy import StrategyWorker from frontera.settings import Settings from frontera.core.models import Request, Response from frontera.core.components import States from tests.mocks.components import CrawlingStrategy from unittest import TestCase from os import remove from os.path import exists r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'jid': 0}) r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'jid': 0}) r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'jid': 0}) r4 = Request('http://www.test.com/some/page', meta={ b'fingerprint': b'4', b'jid': 0 }) class FilteredLinksCrawlingStrategy(CrawlingStrategy): def filter_extracted_links(self, request, links): return [] class TestStrategyWorker(TestCase): def setUp(self): settings = Settings() settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' settings.STRATEGY = 'tests.mocks.components.CrawlingStrategy' settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100
def get_next_requests(self, max_n_requests, partition_id, **kwargs):
    """
    Tries to get new batch from priority queue. It makes self.GET_RETRIES tries
    and stops, trying to fit all parameters. Every new iteration evaluates a
    deeper batch. After batch is requested it is removed from the queue.

    :param max_n_requests: maximum number of requests
    :param partition_id: partition id to get batch from
    :param min_requests: minimum number of requests
    :param min_hosts: minimum number of hosts
    :param max_requests_per_host: maximum number of requests per host
    :return: list of :class:`Request <frontera.core.models.Request>` objects.
    """
    min_requests = kwargs.pop('min_requests')
    min_hosts = kwargs.pop('min_hosts')
    max_requests_per_host = kwargs.pop('max_requests_per_host')
    assert (max_n_requests > min_requests)
    table = self.connection.table(self.table_name)

    meta_map = {}
    queue = {}
    limit = min_requests
    tries = 0
    count = 0
    while tries < self.GET_RETRIES:
        tries += 1
        # widen the scan window on each retry
        limit *= 5.5 if tries > 1 else 1.0
        self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d" %
                          (tries, limit, count, len(queue.keys())))
        meta_map.clear()
        queue.clear()
        count = 0
        for rk, data in table.scan(row_prefix='%d_' % partition_id, limit=int(limit), batch_size=256):
            # BUG FIX: dict.iteritems() does not exist in Python 3; items() works in both.
            for cq, buf in data.items():
                stream = BytesIO(buf)
                unpacker = Unpacker(stream)
                for item in unpacker:
                    fingerprint, host_crc32, url, score = item
                    if host_crc32 not in queue:
                        queue[host_crc32] = []
                    if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host:
                        continue
                    queue[host_crc32].append(fingerprint)
                    count += 1
                    if fingerprint not in meta_map:
                        meta_map[fingerprint] = []
                    meta_map[fingerprint].append((rk, item))
            if count > max_n_requests:
                break
        if min_hosts is not None and len(queue.keys()) < min_hosts:
            continue
        if count < min_requests:
            continue
        break
    self.logger.debug("Finished: tries %d, hosts %d, requests %d" %
                      (tries, len(queue.keys()), count))

    # For every fingerprint collect its row keys and return all fingerprints from them
    fprint_map = {}
    for fprint, meta_list in meta_map.items():
        for rk, _ in meta_list:
            fprint_map.setdefault(rk, []).append(fprint)

    results = []
    trash_can = set()
    for _, fprints in queue.items():
        for fprint in fprints:
            for rk, _ in meta_map[fprint]:
                trash_can.add(rk)
                for rk_fprint in fprint_map[rk]:
                    _, item = meta_map[rk_fprint][0]
                    _, _, url, score = item
                    results.append(
                        Request(url, meta={
                            'fingerprint': hexlify(rk_fprint),
                            'score': score,
                        }))

    with table.batch(transaction=True) as b:
        for rk in trash_can:
            b.delete(rk)
    self.logger.debug("%d row keys removed" % (len(trash_can)))
    return results
from __future__ import absolute_import import unittest from frontera.contrib.backends.remote.messagebus import MessageBusBackend from frontera.settings import Settings from frontera.core.models import Request, Response data = {'id': 'xxx', 'name': 'yyy'} r1 = Request('http://www.example.com/', method='post', body=data, meta={b'domain': { b'fingerprint': b'1' }}) r2 = Request('http://www.scrapy.org/', meta={b'domain': { b'fingerprint': b'2' }}) r3 = Request('http://www.test.com/some/page', meta={b'domain': { b'fingerprint': b'3' }}) class TestMessageBusBackend(unittest.TestCase): def mbb_setup(self, settings=None): manager = type('manager', (object, ), {}) settings = settings or Settings() settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' settings.STORE_CONTENT = True
from __future__ import absolute_import from frontera.core.manager import FrontierManager from frontera.settings import Settings from frontera.core.models import Request, Response from six.moves import range r1 = Request('http://www.example.com') r2 = Request('https://www.example.com/some/page') r3 = Request('http://example1.com') class TestFrontierManager(object): def setup_frontier_manager(self, settings=None): settings = settings or Settings() settings.BACKEND = 'tests.mocks.components.FakeBackend' settings.MIDDLEWARES = [ 'tests.mocks.components.FakeMiddleware', 'tests.mocks.components.FakeMiddlewareModifySeeds', 'tests.mocks.components.FakeMiddlewareModifyResponse', 'tests.mocks.components.FakeMiddlewareModifyLinks' ] settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' return FrontierManager.from_settings(settings) def test_start(self): fm = self.setup_frontier_manager() assert fm._started is True assert fm.backend._started is True assert [mw._started for mw in fm.middlewares] == [True] * 4 assert fm.canonicalsolver._started is True
from frontera.core import OverusedBuffer from frontera.core.models import Request from six.moves import range from itertools import cycle from random import choice from string import ascii_lowercase r1 = Request( 'http://www.example.com', meta={b'fingerprint': b'8ece61d2d42e578e86d9f95ad063cf36eb8e774d'}) r2 = Request( 'http://www.example.com/some/', meta={b'fingerprint': b'9773afd9cb0f4ec3fd09d6d1fe2c742abf0621ec'}) r3 = Request( 'htttp://www.example.com/some/page/', meta={b'fingerprint': b'7278fb7612670523a7e3e37d7c38871c73bcb0ea'}) r4 = Request( 'http://example.com', meta={b'fingerprint': b'89dce6a446a69d6b9bdc01ac75251e4c322bcdff'}) r5 = Request( 'http://example.com/some/page', meta={b'fingerprint': b'9dbd730bdce21e322a12c757753f26bbc95c3779'}) r6 = Request( 'http://example1.com', meta={b'fingerprint': b'0ac55362d7391707e121dace4d203a0dc4393afc'}) class TestOverusedBuffer(object): requests = [r1, r2, r3, r4, r5, r6]
def test_codec(encoder, decoder):
    """Round-trip every message type through an encoder/decoder pair and
    verify the decoded payloads match what was encoded.

    NOTE: the decode calls below consume `msgs` in order via `it`, so the
    sequence of checks must mirror the sequence of encode_* calls exactly.
    """
    def check_request(req1, req2):
        # two requests are considered equal when url, meta, headers and method all match
        assert req1.url == req2.url and req1.meta == req2.meta and req1.headers == req2.headers \
            and req1.method == req2.method

    enc = encoder(Request, send_body=True)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru",
                  method=b'GET',
                  meta={b"test": b"shmest"},
                  headers={b'reqhdr': b'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    # one encoded message per message type supported by the codec
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(
            Response(url="http://www.yandex.ru",
                     body=b'SOME CONTENT',
                     headers={b'hdr': b'value'},
                     request=req)),
        enc.encode_links_extracted(req, [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score(req, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req)
    ]
    it = iter(msgs)

    # add_seeds: decodes to ('add_seeds', [Request, ...])
    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    # page_crawled: decodes to ('page_crawled', Response)
    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].body == b'SOME CONTENT' and o[1].meta == req.meta

    # links_extracted: decodes to ('links_extracted', Request, [Request, ...])
    o = dec.decode(next(it))
    print(o)
    assert o[0] == 'links_extracted'
    assert type(o[1]) == Request
    assert o[1].url == req.url and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    # request_error: decodes to ('request_error', Request, error_string)
    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    # update_score: decodes to ('update_score', Request, score, schedule_flag)
    o_type, o_req2, score, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers
    assert score == 0.51
    assert schedule is True

    # new_job_id: decodes to ('new_job_id', job_id)
    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    # offset: decodes to ('offset', partition_id, offset)
    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    # a bare encoded request decodes through the dedicated decode_request path
    o = dec.decode_request(next(it))
    check_request(o, req)
# -*- coding: utf-8 -*- from __future__ import absolute_import from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner from frontera.core.models import Request from six.moves import range request = Request( 'http://www.example.com', meta={b'fingerprint': b'1be68ff556fd0bbe5802d1a100850da29f7f15b1'}) def test_fingerprint_partitioner(): partitions = list(range(0, 5)) fp = FingerprintPartitioner(partitions) key = b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' assert fp.get_key(request) == key partition = fp.partition(key, partitions) assert partition == 1 partition = fp.partition(key, None) assert partition == 1 def test_crc32name_partitioner(): partitions = list(range(0, 5)) cp = Crc32NamePartitioner(partitions) key = b'www.example.com' assert cp.get_key(request) == key
from __future__ import absolute_import from happybase import Connection from Hbase_thrift import AlreadyExists # module loaded at runtime in happybase from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue from frontera.core.models import Request, Response from frontera.core.components import States from binascii import unhexlify from time import sleep, time from w3lib.util import to_native_str r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) r4 = r3.copy() class TestHBaseBackend(object): def delete_rows(self, table, row_keys): batch = table.batch() for key in row_keys: batch.delete(unhexlify(key)) batch.send() def test_metadata(self): connection = Connection(host='hbase-docker', port=9090) metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True) metadata.add_seeds([r1, r2, r3])
def test_codec(encoder, decoder, send_body, invalid_value):
    """Round-trip every message type through an encoder/decoder pair,
    covering both body-sending modes and rejection of invalid input.

    The decode calls consume `msgs` in order, so checks mirror the
    sequence of encode_* calls exactly.
    """
    def check_request(req1, req2):
        assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) == True and \
            _compare_dicts(req1.headers, req2.headers) == True and req1.method == req2.method

    enc = encoder(Request, send_body=send_body)
    dec = decoder(Request, Response)
    req = Request(url="http://www.yandex.ru",
                  method=b'GET',
                  meta={
                      b'test': b'shmest',
                      b'scrapy_meta': {
                          'rule': 0,
                          'key': 'value'
                      }
                  },
                  headers={b'reqhdr': b'value'})
    req2 = Request(url="http://www.yandex.ru/search")
    msgs = [
        enc.encode_add_seeds([req]),
        enc.encode_page_crawled(
            Response(url="http://www.yandex.ru",
                     body=b'SOME CONTENT',
                     headers={b'hdr': b'value'},
                     request=req)),
        enc.encode_links_extracted(req, [req2]),
        enc.encode_request_error(req, "Host not found"),
        enc.encode_update_score(req, 0.51, True),
        enc.encode_new_job_id(1),
        enc.encode_offset(0, 28796),
        enc.encode_request(req),
        invalid_value,
    ]
    it = iter(msgs)

    o = dec.decode(next(it))
    assert o[0] == 'add_seeds'
    assert type(o[1]) == list
    req_d = o[1][0]
    check_request(req_d, req)
    assert type(req_d) == Request

    o = dec.decode(next(it))
    assert o[0] == 'page_crawled'
    assert type(o[1]) == Response
    assert o[1].url == req.url and o[1].meta == req.meta
    # BUG FIX: these body checks were bare comparison expressions (no-ops),
    # so the body round-trip was never actually verified.
    if send_body:
        assert o[1].body == b'SOME CONTENT'
    else:
        assert o[1].body is None

    o = dec.decode(next(it))
    print(o)
    assert o[0] == 'links_extracted'
    assert type(o[1]) == Request
    assert o[1].url == req.url and o[1].meta == req.meta
    assert type(o[2]) == list
    req_d = o[2][0]
    assert type(req_d) == Request
    assert req_d.url == req2.url

    o_type, o_req, o_error = dec.decode(next(it))
    assert o_type == 'request_error'
    check_request(o_req, req)
    assert o_error == "Host not found"

    o_type, o_req2, score, schedule = dec.decode(next(it))
    assert o_type == 'update_score'
    assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers
    assert score == 0.51
    assert schedule is True

    o_type, job_id = dec.decode(next(it))
    assert o_type == 'new_job_id'
    assert job_id == 1

    o_type, partition_id, offset = dec.decode(next(it))
    assert o_type == 'offset'
    assert partition_id == 0
    assert offset == 28796

    o = dec.decode_request(next(it))
    check_request(o, req)

    # malformed input must raise, not silently decode
    with pytest.raises(TypeError):
        dec.decode(next(it))
from __future__ import absolute_import import unittest from frontera.contrib.backends.remote.messagebus import MessageBusBackend from frontera.settings import Settings from frontera.core.models import Request, Response r1 = Request('http://www.example.com/', meta={ b'domain': { b'fingerprint': b'1' }, b'fingerprint': b'abc' }) r2 = Request('http://www.scrapy.org/', meta={ b'domain': { b'fingerprint': b'2' }, b'fingerprint': b'012' }) r3 = Request('http://www.test.com/some/page', meta={ b'domain': { b'fingerprint': b'3' }, b'fingerprint': b'345' }) class TestMessageBusBackend(unittest.TestCase):