import datetime

from happybase import Connection


def put_data_into_hbase(rdd):
    """Store the contents of an RDD into an HBase table."""
    # collecting the results
    results = rdd.collect()
    # computing the current time: this will serve as the row id
    date = str(datetime.datetime.now())[:19]
    # connecting to the HBase Thrift server and opening the target table
    connection = Connection(host='localhost', port=9090, autoconnect=True)
    table = connection.table(name='base_tweets')
    # writing one cell per result: label 0 is the negative count, anything else positive
    for data in results:
        if data[0] == 0:
            table.put(row=date, data={'tweet_count:neg': str(data[1])})
        else:
            table.put(row=date, data={'tweet_count:pos': str(data[1])})
    connection.close()
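# A minimal usage sketch (assumed context, not from the original code): the
# function above is shaped like a foreachRDD callback, so it would typically
# be attached to a Spark Streaming DStream of (label, count) pairs. The
# socket source, batch interval, and the `counts` name below are illustrative.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName='TweetSentimentToHBase')
ssc = StreamingContext(sc, batchDuration=10)

lines = ssc.socketTextStream('localhost', 5555)
# hypothetical mapping: each incoming line carries a precomputed label (0 or 1)
counts = lines.map(lambda line: (int(line), 1)).reduceByKey(lambda a, b: a + b)
counts.foreachRDD(put_data_into_hbase)

ssc.start()
ssc.awaitTermination()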
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection, settings.get('HBASE_STATES_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'),
                               settings.get('HBASE_DROP_ALL_TABLES'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'),
                              drop=drop_all_tables,
                              use_snappy=settings.get('HBASE_USE_SNAPPY'))
        o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'),
                                    drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'),
                                    settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(
                max_next_requests, partition_id,
                min_requests=self._min_requests,
                min_hosts=self._min_hosts,
                max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d",
                              len(results), partition_id)
        return next_pages
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':'
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'),
                              drop=drop_all_tables)
        o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'),
                                    drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'),
                                    settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(
                max_next_requests, partition_id,
                min_requests=self._min_requests,
                min_hosts=self._min_hosts,
                max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d",
                              len(results), partition_id)
        return next_pages
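# The two DistributedBackend variants above read all of their configuration
# from the frontier settings object. Below is a sketch of the keys they
# consume; the values shown are illustrative assumptions, not documented
# defaults.
HBASE_THRIFT_HOST = 'localhost'     # a single host, or a list/tuple to choose from at random
HBASE_THRIFT_PORT = 9090
HBASE_NAMESPACE = 'crawler'         # used as the table prefix
HBASE_DROP_ALL_TABLES = False
HBASE_METADATA_TABLE = 'metadata'
HBASE_QUEUE_TABLE = 'queue'
HBASE_STATES_TABLE = 'states'
HBASE_STATE_CACHE_SIZE_LIMIT = 3000000
HBASE_USE_FRAMED_COMPACT = False    # framed transport + compact protocol for Thrift
HBASE_USE_SNAPPY = False
HBASE_BATCH_SIZE = 9216
STORE_CONTENT = False
SPIDER_FEED_PARTITIONS = 2
BC_MIN_REQUESTS = 64
BC_MIN_HOSTS = 24
BC_MAX_REQUESTS_PER_HOST = 128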
class HBaseBackend(Backend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        self.connection = Connection(host=host, port=int(port),
                                     table_prefix=namespace,
                                     table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection, self.queue_partitions,
                                self.manager.logger.backend, drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)
        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)
        if self._table_name not in tables:
            self.connection.create_table(self._table_name, {
                'm': {'max_versions': 5},  # 'compression': 'SNAPPY'
                's': {'max_versions': 1,
                      'block_cache_enabled': 1,
                      'bloom_filter_type': 'ROW',
                      'in_memory': True},
                'c': {'max_versions': 1}
            })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # send any buffered mutations before the Thrift connection goes away
        self.flush()
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(url=url,
                                       depth=0,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=domain['fingerprint'])
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = prepare_hbase_object(status_code=response.status_code,
                                   content=response.body)
        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(url=link_url,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=link_domain['fingerprint'])
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(url=request.url,
                                   created_at=utcnow_timestamp(),
                                   error=error,
                                   domain_fingerprint=domain['fingerprint'])
        rk = unhexlify(request.meta['fingerprint'])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests,
                                     min_hosts=24, max_requests_per_host=128)
            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta['fingerprint'] = fingerprint
                r.meta['score'] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError('batch should be dict with fingerprint as key, and float score as value')
        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error(
                        "Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {'name': hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
class HBaseBackend(Backend):
    component_name = "HBase Backend"

    def __init__(self, manager):
        self.manager = manager
        settings = manager.settings
        port = settings.get("HBASE_THRIFT_PORT")
        hosts = settings.get("HBASE_THRIFT_HOST")
        namespace = settings.get("HBASE_NAMESPACE")
        drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES")
        self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS")
        self._table_name = settings.get("HBASE_METADATA_TABLE")
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {"host": host, "port": int(port), "table_prefix": namespace, "table_prefix_separator": ":"}
        if settings.get("HBASE_USE_COMPACT_PROTOCOL"):
            kwargs.update({"protocol": "compact", "transport": "framed"})
        self.connection = Connection(**kwargs)
        self.queue = HBaseQueue(
            self.connection,
            self.queue_partitions,
            self.manager.logger.backend,
            settings.get("HBASE_QUEUE_TABLE"),
            drop=drop_all_tables,
        )
        self.state_checker = HBaseState(
            self.connection, self._table_name, self.manager.logger.backend,
            settings.get("HBASE_STATE_CACHE_SIZE_LIMIT")
        )
        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)
        if self._table_name not in tables:
            schema = {
                "m": {"max_versions": 1},
                "s": {"max_versions": 1, "block_cache_enabled": 1, "bloom_filter_type": "ROW", "in_memory": True},
                "c": {"max_versions": 1},
            }
            if settings.get("HBASE_USE_SNAPPY"):
                schema["m"]["compression"] = "SNAPPY"
                schema["c"]["compression"] = "SNAPPY"
            self.connection.create_table(self._table_name, schema)
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE"))
        self.store_content = settings.get("HBASE_STORE_CONTENT")

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        # send any buffered mutations before the Thrift connection goes away
        self.flush()
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(
                url=url, depth=0, created_at=utcnow_timestamp(), domain_fingerprint=domain["fingerprint"]
            )
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = (
            prepare_hbase_object(status_code=response.status_code, content=response.body)
            if self.store_content
            else prepare_hbase_object(status_code=response.status_code)
        )
        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(
                url=link_url, created_at=utcnow_timestamp(), domain_fingerprint=link_domain["fingerprint"]
            )
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(
            url=request.url, created_at=utcnow_timestamp(), error=error, domain_fingerprint=domain["fingerprint"]
        )
        rk = unhexlify(request.meta["fingerprint"])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop("partitions", []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests, min_hosts=24, max_requests_per_host=128)
            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta["fingerprint"] = fingerprint
                r.meta["score"] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError("batch should be dict with fingerprint as key, and float score as value")
        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {"name": hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
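# Hypothetical illustration (not part of the original source) of the batch
# shape update_score() above expects: keys are request fingerprints as hex
# strings (they pass through unhexlify), each mapped to a
# (score, url, schedule_flag) tuple. The fingerprints and URLs are made up.
backend = HBaseBackend.from_manager(manager)  # `manager` assumed to exist
backend.update_score({
    "1be68ff556fd0bbe5802d1a100850da29f7f15b1": (0.91, "http://example.com/", True),
    "97c24c01c6a2cfbdbc1c1c57b1f1b77b0b7312dd": (0.05, "http://example.com/about", False),
})
backend.flush()  # push the buffered puts to HBase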
import pprint

from happybase import Connection

# assuming a connection and table were opened earlier along these lines
# (the table name 'animals' is illustrative, not from the original snippet):
hbase_connection = Connection(host='localhost', port=9090)
table = hbase_connection.table('animals')

# flipper
data_for_flipper = {
    'id:name': 'flipper',
    'features:race': 'dolphin',
    'features:gender': 'male',
    'features:apnea': '10'
}

# lassie
data_for_lassie = {
    'id:chip_number': '314',
    'id:name': 'lassie',
    'features:race': 'colley',
    'features:gender': 'female'
}

# gary
data_for_gary = {'id:name': 'gary', 'features:race': 'snail'}

# putting data into the table
table.put(row='1', data=data_for_lassie)
table.put(row='2', data=data_for_flipper)
table.put(row='3', data=data_for_gary)

# printing out the content of the table
for data in table.scan():
    pprint.pprint(data)

# closing hbase connection
hbase_connection.close()
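# A short read-back sketch (assumed usage of the standard happybase API);
# it would run before hbase_connection.close() above. table.row() fetches a
# single row by key, optionally restricted to specific columns, which is
# cheaper than scanning the whole table.
print(table.row('1'))                        # every cell stored for lassie
print(table.row('2', columns=['id:name']))   # just flipper's name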
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')
        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({'protocol': 'compact', 'transport': 'framed'})
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None
        self._domain_metadata = None

    def _init_states(self, settings):
        self._states = HBaseState(connection=self.connection,
                                  table_name=settings.get('HBASE_STATES_TABLE'),
                                  cache_size_limit=settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'),
                                  write_log_size=settings.get('HBASE_STATE_WRITE_LOG_SIZE'),
                                  drop_all_tables=settings.get('HBASE_DROP_ALL_TABLES'))

    def _init_queue(self, settings):
        self._queue = HBaseQueue(self.connection, self.queue_partitions,
                                 settings.get('HBASE_QUEUE_TABLE'),
                                 drop=settings.get('HBASE_DROP_ALL_TABLES'),
                                 use_snappy=settings.get('HBASE_USE_SNAPPY'))

    def _init_metadata(self, settings):
        self._metadata = HBaseMetadata(self.connection,
                                       settings.get('HBASE_METADATA_TABLE'),
                                       settings.get('HBASE_DROP_ALL_TABLES'),
                                       settings.get('HBASE_USE_SNAPPY'),
                                       settings.get('HBASE_BATCH_SIZE'),
                                       settings.get('STORE_CONTENT'))

    def _init_domain_metadata(self, settings):
        self._domain_metadata = DomainCache(settings.get('HBASE_DOMAIN_METADATA_CACHE_SIZE'),
                                            self.connection,
                                            settings.get('HBASE_DOMAIN_METADATA_TABLE'),
                                            batch_size=settings.get('HBASE_DOMAIN_METADATA_BATCH_SIZE'))

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        o._init_states(manager.settings)
        o._init_domain_metadata(manager.settings)
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        o._init_queue(manager.settings)
        o._init_metadata(manager.settings)
        return o

    @classmethod
    def local(cls, manager):
        o = cls(manager)
        o._init_queue(manager.settings)
        o._init_states(manager.settings)
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    @property
    def domain_metadata(self):
        return self._domain_metadata

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states, self.domain_metadata]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states, self.domain_metadata]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        self.logger.debug("Querying queue table.")
        results = []
        for partition_id in set(kwargs.pop('partitions', range(self.queue_partitions))):
            requests = self.queue.get_next_requests(
                max_next_requests, partition_id,
                min_requests=self._min_requests,
                min_hosts=self._min_hosts,
                max_requests_per_host=self._max_requests_per_host)
            results.extend(requests)
            self.logger.debug("Got %d requests for partition id %d",
                              len(requests), partition_id)
        return results

    def get_stats(self):
        """Helper to get a stats dictionary for the backend.

        Provides HBase client stats and, when the states component is
        initialized, its stats as well.
        """
        stats = {}
        with time_elapsed('Call HBase backend get_stats()'):
            stats.update(self.connection.client.get_stats())
        if self._states:
            stats.update(self._states.get_stats())
        return stats
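# Hypothetical wiring (names assumed, not from the original source): a single
# process would build the backend through one of the classmethod constructors
# above and then poll specific partitions for work.
backend = HBaseBackend.local(manager)   # `manager` is a frontier manager instance (assumed)
backend.frontier_start()
requests = backend.get_next_requests(256, partitions=[0, 1])
print(backend.get_stats())
backend.frontier_stop()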