Example no. 1
    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None
Example no. 2
 def test_drop_all_tables_when_table_name_is_str(self):
     connection = Connection(host='hbase-docker', port=9090)
     for table in connection.tables():
         connection.delete_table(table, True)
     hbase_queue_table = 'queue'
     hbase_metadata_table = 'metadata'
     connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}})
     connection.create_table(hbase_metadata_table,
                             {'f': {
                                 'max_versions': 1
                             }})
     tables = connection.tables()
     assert set(tables) == set([b'metadata',
                                b'queue'])  # Failure of test itself
     try:
         HBaseQueue(connection=connection,
                    partitions=1,
                    table_name=hbase_queue_table,
                    drop=True)
         HBaseMetadata(connection=connection,
                       table_name=hbase_metadata_table,
                       drop_all_tables=True,
                       use_snappy=False,
                       batch_size=300000,
                       store_content=True)
     except AlreadyExists:
         assert False, "failed to drop hbase tables"
Example no. 3
 def __init__(self, name):
     from happybase import Connection
     from thrift.transport import TTransport
     try:
         self._conn = Connection('localhost')
         self._table = self._conn.table(name)
     except TTransport.TTransportException as e:
         raise UserWarning(e)
Example no. 4
 def test_metadata(self):
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \
         set([r1.url, r2.url, r3.url])
     self.delete_rows(table, [b'10', b'11', b'12'])
Example no. 5
 def test_metadata(self):
     connection = Connection(host='hbase-docker', port=9090)
     metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True)
     metadata.add_seeds([r1, r2, r3])
     resp = Response('https://www.example.com', request=r1)
     metadata.page_crawled(resp)
     metadata.links_extracted(resp.request, [r2, r3])
     metadata.request_error(r4, 'error')
     metadata.frontier_stop()
     table = connection.table('metadata')
     assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \
         set([r1.url, r2.url, r3.url])
     self.delete_rows(table, [b'10', b'11', b'12'])
Example no. 6
def setup_module():
    global connection, table
    connection = Connection(**connection_kwargs)

    assert_is_not_none(connection)

    cfs = {
        'cf1': {},
        'cf2': None,
        'cf3': {'max_versions': 1},
    }
    connection.create_table(TEST_TABLE_NAME, families=cfs)

    table = connection.table(TEST_TABLE_NAME)
    assert_is_not_none(table)
Example no. 7
def test_prefix():
    assert_equal(TABLE_PREFIX + '_', connection._table_name(''))
    assert_equal(TABLE_PREFIX + '_foo', connection._table_name('foo'))

    assert_equal(connection.table('foobar').name, TABLE_PREFIX + '_foobar')
    assert_equal(connection.table('foobar', use_prefix=False).name, 'foobar')

    c = Connection(autoconnect=False)
    assert_equal('foo', c._table_name('foo'))

    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix=123)

    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix_separator=2.1)
Example no. 8
 def setUp(self):
     logging.basicConfig(level=logging.DEBUG)
     self.conn = Connection(host="hbase-docker")
     if b'domain_metadata' not in self.conn.tables():
         self.conn.create_table(
             'domain_metadata',
             {'m': {
                 'max_versions': 1,
                 'block_cache_enabled': 1,
             }})
     t = self.conn.table('domain_metadata')
     t.delete('d1')
     t.delete('d2')
     t.delete('d3')
     t.delete('d4')
Example no. 9
def test_prefix():
    assert_equal(TABLE_PREFIX + '_', connection._table_name(''))
    assert_equal(TABLE_PREFIX + '_foo', connection._table_name('foo'))

    assert_equal(connection.table('foobar').name, TABLE_PREFIX + '_foobar')
    assert_equal(connection.table('foobar', use_prefix=False).name, 'foobar')

    c = Connection(autoconnect=False)
    assert_equal('foo', c._table_name('foo'))

    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix=123)

    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix_separator=2.1)
Example no. 10
def test_prefix():
    assert TABLE_PREFIX + b'_' == connection._table_name('')
    assert TABLE_PREFIX + b'_foo' == connection._table_name('foo')

    assert connection.table('foobar').name == TABLE_PREFIX + b'_foobar'
    assert connection.table('foobar', use_prefix=False).name == b'foobar'

    c = Connection(autoconnect=False)
    assert b'foo' == c._table_name('foo')

    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix=123)

    with assert_raises(TypeError):
        Connection(autoconnect=False, table_prefix_separator=2.1)
Example no. 11
    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':'
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None
Example no. 12
 def test_queue_with_delay(self):
     connection = Connection(host='hbase-docker', port=9090)
     queue = HBaseQueue(connection,
                        1,
                        b'queue',
                        use_snappy=False,
                        drop=True)
     r5 = r3.copy()
     crawl_at = int(time()) + 1000
     r5.meta[b'crawl_at'] = crawl_at
     batch = [(r5.meta[b'fingerprint'], 0.5, r5, True)]
     queue.schedule(batch)
     with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time:
         mocked_time.return_value = time()
         assert queue.get_next_requests(10,
                                        0,
                                        min_requests=3,
                                        min_hosts=1,
                                        max_requests_per_host=10) == []
         mocked_time.return_value = crawl_at + 1
         assert set([
             r.url
             for r in queue.get_next_requests(10,
                                              0,
                                              min_requests=3,
                                              min_hosts=1,
                                              max_requests_per_host=10)
         ]) == set([r5.url])
Example no. 13
 def test_state(self):
     connection = Connection(host='hbase-docker', port=9090)
     state = HBaseState(connection,
                        b'states',
                        cache_size_limit=300000,
                        write_log_size=5000,
                        drop_all_tables=True)
     state.set_states([r1, r2, r3])
     assert [r.meta[b'state']
             for r in [r1, r2, r3]] == [States.NOT_CRAWLED] * 3
     state.update_cache([r1, r2, r3])
     assert dict(state._state_cache) == {
         b'10': States.NOT_CRAWLED,
         b'11': States.NOT_CRAWLED,
         b'12': States.NOT_CRAWLED
     }
     assert state._state_batch._mutation_count == 3
     r1.meta[b'state'] = States.CRAWLED
     r2.meta[b'state'] = States.CRAWLED
     r3.meta[b'state'] = States.CRAWLED
     state.update_cache([r1, r2, r3])
     assert state._state_batch._mutation_count == 6
     state.flush()
     assert state._state_batch._mutation_count == 0
     state.fetch([b'10', b'11', b'12'])
     assert dict(state._state_cache) == {
         b'10': States.CRAWLED,
         b'11': States.CRAWLED,
         b'12': States.CRAWLED
     }
     r4.meta[b'state'] = States.ERROR
     state.set_states([r1, r2, r4])
     assert r4.meta[b'state'] == States.CRAWLED
     state.flush()
     assert state._state_batch._mutation_count == 0
Example no. 14
    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts

        self.connection = Connection(host=host, port=int(port), table_prefix=namespace, table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection, self.queue_partitions, self.manager.logger.backend,
                                drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)


        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            self.connection.create_table(self._table_name, {'m': {'max_versions': 5}, # 'compression': 'SNAPPY'
                                                            's': {'max_versions': 1, 'block_cache_enabled': 1,
                                                            'bloom_filter_type': 'ROW', 'in_memory': True, },
                                                            'c': {'max_versions': 1}
                                                            })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)
Example no. 15
 def test_state(self):
     connection = Connection(host='hbase-docker', port=9090)
     state = HBaseState(connection, b'metadata', 300000)
     state.set_states([r1, r2, r3])
     assert [r.meta[b'state']
             for r in [r1, r2, r3]] == [States.NOT_CRAWLED] * 3
     state.update_cache([r1, r2, r3])
     assert state._state_cache == {
         b'10': States.NOT_CRAWLED,
         b'11': States.NOT_CRAWLED,
         b'12': States.NOT_CRAWLED
     }
     r1.meta[b'state'] = States.CRAWLED
     r2.meta[b'state'] = States.CRAWLED
     r3.meta[b'state'] = States.CRAWLED
     state.update_cache([r1, r2, r3])
     state.flush(True)
     assert state._state_cache == {}
     state.fetch([b'10', b'11', b'12'])
     assert state._state_cache == {
         b'10': States.CRAWLED,
         b'11': States.CRAWLED,
         b'12': States.CRAWLED
     }
     r4.meta[b'state'] = States.ERROR
     state.set_states([r1, r2, r4])
     assert r4.meta[b'state'] == States.CRAWLED
     state.flush(True)
     assert state._state_cache == {}
Example no. 16
def setup_module():
    global connection, table
    connection = Connection(**connection_kwargs)

    assert connection is not None

    maybe_delete_table()
    cfs = {
        'cf1': {},
        'cf2': None,
        'cf3': {'max_versions': 1},
    }
    connection.create_table(TEST_TABLE_NAME, families=cfs)

    table = connection.table(TEST_TABLE_NAME)
    assert table is not None
Example no. 17
    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts

        self.connection = Connection(host=host,
                                     port=int(port),
                                     table_prefix=namespace,
                                     table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection,
                                self.queue_partitions,
                                self.manager.logger.backend,
                                drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)

        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            self.connection.create_table(
                self._table_name,
                {
                    'm': {
                        'max_versions': 5
                    },  # 'compression': 'SNAPPY'
                    's': {
                        'max_versions': 1,
                        'block_cache_enabled': 1,
                        'bloom_filter_type': 'ROW',
                        'in_memory': True,
                    },
                    'c': {
                        'max_versions': 1
                    }
                })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)
Example no. 18
 def __init__(self, name):
     from happybase import Connection
     from thrift.transport import TTransport
     try:
         self._conn = Connection('localhost')
         self._table = self._conn.table(name)
     except TTransport.TTransportException as e:
         raise UserWarning(e)
Example no. 19
def kpi2(conn: happybase.Connection):
    """Correlation of rent price and family income per neighborhood."""

    table = conn.table('housing')
    table2 = conn.table('opendatabcn')
    table3 = conn.table('idealista-to-open')

    for year in range(2014, 2017, 1):
        # key = district-neighborhood
        rfdByZone = dict()  # value = RFD
        pricesByZone = dict()  # value = [price]

        for _k, v in table.scan():
            # Union by hand
            district = v[b'cf1:district'].decode('utf-8')
            neighborhood = v[b'cf1:neighborhood'].decode('utf-8')
            k = getKey(district, neighborhood)
            row = table3.row(k, columns=['cf1:district', 'cf1:neighborhood'])
            k = getKeyOpen(row[b'cf1:district'].decode('utf-8'),
                           row[b'cf1:neighborhood'].decode('utf-8'),
                           year=year)
            row = table2.row(k, columns=['cf1:rfd'])
            rfd = float(row[b'cf1:rfd'].decode('utf-8'))

            # Update data
            k = getKey(district.replace('-', ' '),
                       neighborhood.replace('-', ' '))
            rfdByZone[k] = rfd
            price = float(v[b'cf1:price'].decode('utf-8'))
            if k in pricesByZone:
                pricesByZone[k].append(price)
            else:
                pricesByZone[k] = [price]

        print('')
        print(f'Year {year}:')
        for k, rfd in rfdByZone.items():
            (district, neighborhood) = k.split('-')
            price = mean(pricesByZone[k])
            # Not the actual correlation formula but to simplify things
            correlation = price / rfd
            print(
                f'\t{neighborhood} has a correlation price/rfd = {correlation}'
            )
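kpi2 relies on three helpers that the snippet does not define. Judging from the key formats and the k.split('-') above, minimal stand-ins could look like the sketch below; all of it is an assumption, not the original code.

from statistics import mean  # assumed source of the mean() used above

def getKey(district, neighborhood):
    # assumed: housing row keys join district and neighborhood with '-'
    return f'{district}-{neighborhood}'

def getKeyOpen(district, neighborhood, year):
    # assumed: opendatabcn row keys additionally carry the year
    return f'{district}-{neighborhood}-{year}'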
Example no. 20
 def test_queue(self):
     connection = Connection(host='hbase-docker', port=9090)
     queue = HBaseQueue(connection, 2, b'queue', True)
     batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True),
              ('12', 0.7, r3, True)]
     queue.schedule(batch)
     assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1,
                max_requests_per_host=10)]) == set([r3.url])
     assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1,
                max_requests_per_host=10)]) == set([r1.url, r2.url])
Example no. 21
    def __init__(self, host='127.0.0.1', port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs):
        super(HBaseCache, self).__init__(default_timeout)
        
        if not table_name:
            raise TypeError('table_name is a required argument')
        self.table_name = table_name

        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name)
        self.clear()
Example no. 22
def put_data_into_hbase(rdd):
    """
    functions to store data into hbase table
    """
    # collecting the results
    results = rdd.collect()
    # computing the exact time: this will serve as the row id
    date = str(datetime.datetime.now())[:19]
    # making a connection to the HBase Thrift server
    connection = Connection(host='localhost', port=9090, autoconnect=True)
    table = connection.table(name='base_tweets')
    for data in results:
        if data[0] == 0:
            table.put(row=date, data={'tweet_count:neg': str(data[1])})
        else:
            table.put(row=date, data={'tweet_count:pos': str(data[1])})

    connection.close()
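put_data_into_hbase is written as a Spark Streaming sink. A hypothetical hook-up is sketched below; `counts` (a DStream of (label, count) pairs, with 0 = negative and 1 = positive) is an assumption about the surrounding job, not part of the original snippet.

# Hypothetical wiring: each micro-batch RDD of `counts` is handed to the sink.
counts.foreachRDD(put_data_into_hbase)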
Example no. 23
 def test_drop_all_tables_when_table_name_is_str(self):
     connection = Connection(host='hbase-docker', port=9090)
     for table in connection.tables():
         connection.delete_table(table, True)
     hbase_queue_table = 'queue'
     hbase_metadata_table = 'metadata'
     connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}})
     connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}})
     tables = connection.tables()
     assert set(tables) == set([b'metadata', b'queue'])  # Failure of test itself
     try:
         HBaseQueue(connection=connection, partitions=1, table_name=hbase_queue_table, drop=True)
         HBaseMetadata(connection=connection, table_name=hbase_metadata_table, drop_all_tables=True,
                       use_snappy=False, batch_size=300000, store_content=True)
     except AlreadyExists:
         assert False, "failed to drop hbase tables"
Example no. 24
 def setUp(self):
     logging.basicConfig(level=logging.DEBUG)
     self.conn = Connection(host="hbase-docker")
     if b'domain_metadata' not in self.conn.tables():
         self.conn.create_table('domain_metadata', {
             'm': {'max_versions': 1, 'block_cache_enabled': 1,}
         })
     t = self.conn.table('domain_metadata')
     t.delete('d1')
     t.delete('d2')
     t.delete('d3')
     t.delete('d4')
Example no. 25
    def __init__(self, host="127.0.0.1", port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs):
        # Potential bug: table_prefix instead of prefix
        BaseCache.__init__(self, default_timeout)

        if not table_name:
            raise TypeError("table_name is a required argument")
        self.table_name = table_name

        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name)  # Note: initialisation overwrites the existing rows of the Hbase table

        self.clear()
Example no. 26
 def __init__(self, manager):
     self.manager = manager
     self.logger = manager.logger.backend
     settings = manager.settings
     port = settings.get('HBASE_THRIFT_PORT')
     hosts = settings.get('HBASE_THRIFT_HOST')
     namespace = settings.get('HBASE_NAMESPACE')
     self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
     host = choice(hosts) if type(hosts) in [list, tuple] else hosts
     kwargs = {
         'host': host,
         'port': int(port),
         'table_prefix': namespace,
         'table_prefix_separator': ':'
     }
     if settings.get('HBASE_USE_COMPACT_PROTOCOL'):
         kwargs.update({'protocol': 'compact', 'transport': 'framed'})
     self.connection = Connection(**kwargs)
     self._metadata = None
     self._queue = None
     self._states = None
Example no. 27
    def get_client_addons(self, client_id):
        """Retrieve the list of addons for the given client

        Only the last known version of the list of addons is retrieved"""
        with contextlib.closing(Connection(self._hostname)) as connection:
            table = connection.table(self.tablename)
            row_start = "{}:{}".format(client_id, "99999999")
            for key, data in table.scan(row_start=row_start,
                                        limit=1,
                                        columns=[self.column_family],
                                        reverse=True):
                return json.loads(data[self.column].decode("utf-8"))
        return None
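The reverse scan above works only if row keys follow an assumed "<client_id>:<YYYYMMDD>" layout: starting at the "99999999" upper bound and scanning backwards with limit=1 yields the most recent row for that client. A hypothetical writer counterpart under that assumption:

    def put_client_addons(self, client_id, date_str, addons):
        # Hypothetical counterpart, assuming keys of the form "<client_id>:<YYYYMMDD>"
        with contextlib.closing(Connection(self._hostname)) as connection:
            table = connection.table(self.tablename)
            table.put("{}:{}".format(client_id, date_str),
                      {self.column: json.dumps(addons)})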
Example no. 28
def kpi1(conn: happybase.Connection):
    """Average number of new listings per day."""
    table = conn.table('housing')

    def getDate(x):
        k, data = x
        return datetime.strptime(data[b'cf2:date'].decode('utf-8'), '%Y-%m-%d')

    dates = list(map(getDate, table.scan(columns=[b'cf2:date'])))
    nDays = (max(dates) - min(dates)).days
    nListings = len(dates)
    print('Average number of new listings per day {}'.format(nListings /
                                                             nDays))
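A hypothetical invocation, assuming a local Thrift server and a populated 'housing' table:

import happybase

conn = happybase.Connection('localhost', port=9090)
kpi1(conn)
conn.close()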
Example no. 29
 def test_queue_with_post_request(self):
     connection = Connection(host='hbase-docker', port=9090)
     queue = HBaseQueue(connection,
                        1,
                        b'queue',
                        drop=True,
                        use_snappy=False)
     batch = [('10', 0.5, r1, True)]
     queue.schedule(batch)
     requests = queue.get_next_requests(10,
                                        0,
                                        min_requests=3,
                                        min_hosts=1,
                                        max_requests_per_host=10)
     self.assertEqual(b'POST', requests[0].method)
     self.assertEqual(data, requests[0].body)
Example no. 30
class HBaseStorage(MachineBaseStorage):

    _VAL = b'values:value'
    _EXP = b'values:expires_at'
    _COLS = [_VAL, _EXP]

    def __init__(self, settings):
        super().__init__(settings)
        hbase_host = settings['HBASE_HOST']
        hbase_table = settings['HBASE_TABLE']
        self._connection = Connection(hbase_host)
        self._table = self._connection.table(hbase_table)

    def _get_value(self, key):
        row = self._table.row(key, self._COLS)
        val = row.get(self._VAL)
        if val:
            exp = row.get(self._EXP)
            if not exp:
                return val
            elif datetime.fromtimestamp(
                    bytes_to_float(exp)) > datetime.utcnow():
                return val
            else:
                self.delete(key)
                return None
        return None

    def has(self, key):
        val = self._get_value(key)
        return bool(val)

    def get(self, key):
        return self._get_value(key)

    def set(self, key, value, expires=None):
        data = {self._VAL: value}
        if expires:
            expires_at = datetime.utcnow() + timedelta(seconds=expires)
            data[self._EXP] = float_to_bytes(expires_at.timestamp())
        self._table.put(key, data)

    def delete(self, key):
        self._table.delete(key)

    def size(self):
        return 0
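float_to_bytes and bytes_to_float are imported from elsewhere in this project. A plausible stand-in packs the timestamp as a big-endian IEEE-754 double; the encoding is an assumption, not the project's confirmed format.

import struct

def float_to_bytes(value):
    # assumed encoding: 8-byte big-endian double
    return struct.pack('>d', value)

def bytes_to_float(value):
    return struct.unpack('>d', value)[0]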
Example no. 31
 def test_queue_with_delay(self):
     connection = Connection(host='hbase-docker', port=9090)
     queue = HBaseQueue(connection, 1, b'queue', True)
     r5 = r3.copy()
     r5.meta[b'crawl_at'] = int(time()) + 1
     batch = [(r5.meta[b'fingerprint'], 0.5, r5, True)]
     queue.schedule(batch)
     assert queue.get_next_requests(10,
                                    0,
                                    min_requests=3,
                                    min_hosts=1,
                                    max_requests_per_host=10) == []
     sleep(1.5)
     assert set([
         r.url for r in queue.get_next_requests(
             10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10)
     ]) == set([r5.url])
Example no. 32
    def get_client_profile(self, client_id):
        """Retrieve the latest row for the given client in HBase

        Only the last known version of the info is retrieved"""
        try:
            with contextlib.closing(Connection(
                    self.hbase_hostname)) as connection:
                table = connection.table(self.tablename)
                client_row = table.row(client_id, columns=[self.column_family])
                if client_row:
                    return json.loads(client_row[self.column].decode("utf-8"))
        except Exception:
            logger.exception("Connection to HBase failed",
                             extra={"client_id": client_id})

        logger.info("Client information not found",
                    extra={"client_id": client_id})
        return None
Example no. 33
    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get("HBASE_THRIFT_PORT")
        hosts = settings.get("HBASE_THRIFT_HOST")
        namespace = settings.get("HBASE_NAMESPACE")
        drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES")
        self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS")
        self._table_name = settings.get("HBASE_METADATA_TABLE")
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {"host": host, "port": int(port), "table_prefix": namespace, "table_prefix_separator": ":"}
        if settings.get("HBASE_USE_COMPACT_PROTOCOL"):
            kwargs.update({"protocol": "compact", "transport": "framed"})
        self.connection = Connection(**kwargs)
        self.queue = HBaseQueue(
            self.connection,
            self.queue_partitions,
            self.manager.logger.backend,
            settings.get("HBASE_QUEUE_TABLE"),
            drop=drop_all_tables,
        )
        self.state_checker = HBaseState(
            self.connection, self._table_name, self.manager.logger.backend, settings.get("HBASE_STATE_CACHE_SIZE_LIMIT")
        )
        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {
                "m": {"max_versions": 1},
                "s": {"max_versions": 1, "block_cache_enabled": 1, "bloom_filter_type": "ROW", "in_memory": True},
                "c": {"max_versions": 1},
            }
            if settings.get("HBASE_USE_SNAPPY"):
                schema["m"]["compression"] = "SNAPPY"
                schema["c"]["compression"] = "SNAPPY"
            self.connection.create_table(self._table_name, schema)
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE"))
        self.store_content = settings.get("HBASE_STORE_CONTENT")
Example no. 34
class TestDomainCache(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.conn = Connection(host="hbase-docker")
        if b'domain_metadata' not in self.conn.tables():
            self.conn.create_table(
                'domain_metadata',
                {'m': {
                    'max_versions': 1,
                    'block_cache_enabled': 1,
                }})
        t = self.conn.table('domain_metadata')
        t.delete('d1')
        t.delete('d2')
        t.delete('d3')
        t.delete('d4')

    def test_domain_cache_both_generations(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}

        # eviction should happen
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc['d1'] == {'domain': 1}
        assert dc['d2'] == {'domain': 2}
        assert dc['d3'] == {'domain': [3, 2, 1]}
        assert dc['d4'] == {'domain': 4}

    def test_domain_cache_get_with_default(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc.get('d1', {}) == {'domain': 1}
        assert dc.get('d3', {}) == {'domain': [3, 2, 1]}

    def test_domain_cache_setdefault(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc.setdefault('d1', {}) == {'domain': 1}
        assert dc.setdefault('d5', {'domain': 6}) == {'domain': 6}
        dc.flush()
        assert dc.setdefault('d3', {}) == {'domain': [3, 2, 1]}

    def test_domain_cache_setdefault_with_second_gen_flush(self):
        dc = DomainCache(2, self.conn, 'domain_metadata', batch_size=3)
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}

        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        dc.setdefault('d1', {})['domain'] += 1

        assert dc.setdefault('d1', {}) == {'domain': 2}

    def test_empty_key(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        with self.assertRaises(KeyError):
            dc[''] = {'test': 1}

    def test_deletion(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        with self.assertRaises(KeyError):
            del dc['d1']

        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        del dc['d1']  # second gen
        del dc['d3']  # first gen

        dc.flush()

        del dc['d4']  # hbase

    def test_contains(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert 'd1' in dc  # second gen
        assert 'd3' in dc  # first gen

        dc.flush()

        assert 'd4' in dc

    def test_pop(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc.pop('d1') == {'domain': 1}
        assert 'd1' not in dc

        assert dc.pop('d3') == {'domain': [3, 2, 1]}
        assert 'd3' not in dc

        dc.flush()

        assert dc.pop('d4') == {'domain': 4}
        assert 'd4' not in dc
Example no. 35
class HBaseCache(BaseCache):
    def __init__(self, host='127.0.0.1', port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs):
        super(HBaseCache, self).__init__(default_timeout)
        
        if not table_name:
            raise TypeError('table_name is a required argument')
        self.table_name = table_name

        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name)
        self.clear()

    def _put(self, key, value):
        return key, {'cf:value': value}

    def _extract(self, value):
        if value:
            return value.get('cf:value')
        else:
            return value

    def add(self, key, value, timeout=None):
        table = self._table
        try:
            if not table.row(key):  # TO-DO: what does table.row return for non-existing keys?
                table.put(*self._put(key, value))
            else:
                return False
        except:
            return False
        return True

    def clear(self):
        self._c.delete_table(self.table_name, disable=True)
        self._c.create_table(self.table_name, {'cf': dict()})
        return super(HBaseCache, self).clear()

    def dec(self, key, delta=1):
        return self.inc(key, -delta)
#        table = self._table
#        new_value = table.counter_inc(key, 'cf:value', -delta)
#        value = table.row(key)
#        new_value = (self._extract(value) or 0) - delta
#        table.put(*self._put(key, new_value))
        # TO-DO the above should in principle be guarded by some exception handling
#        return new_value

    def delete(self, key):
        try:
            self._table.delete(key)
        except:
            return False
        return True

    def delete_many(self, *keys):
        batch = self._table.batch()
        try:
            for k in keys:
                batch.delete(k)
            batch.send()
        except:
            return False
        return True

    def get(self, key):
        value = self._table.row(key)
        return self._extract(value)

    def get_dict(self, *keys):
        rows = dict(self._table.rows(keys))  # rows() only returns existing keys
        return {k: self._extract(rows.get(k)) for k in keys}

    def get_many(self, *keys):
        rows = dict(self._table.rows(keys))
        return [self._extract(rows.get(k)) for k in keys]

    def has(self, key):
        return super(HBaseCache, self).has(key)

    def inc(self, key, delta=1):
        table = self._table
        new_value = table.counter_inc(key, 'cf:value', delta)
        return new_value

    def set(self, key, value, timeout=None):
        table = self._table
        try:
            table.delete(key)  # TO-DO Does this return an exception if it doesn't exist? Otherwise we need to put a table.row before that
            table.put(*self._put(key, value))
        except:
            return False
        return True

    def set_many(self, mapping, timeout=None):
        batch = self._table.batch()
        for key, value in _items(mapping):
            batch.put(*self._put(key, value))
        try:
            batch.send()
        except:
            return False
        return True
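_items is not defined in the snippet; in older Werkzeug-style caches it abstracted over dicts and iterables of pairs. A minimal stand-in, assuming that behavior:

def _items(mapping):
    # assumed helper: accept either a dict or an iterable of (key, value) pairs
    return mapping.items() if hasattr(mapping, 'items') else mapping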
Example no. 36
def test_timeout_arg():
    Connection(
        timeout=5000,
        autoconnect=False)
Example no. 37
class HBaseBackend(Backend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT', 9090)
        hosts = settings.get('HBASE_THRIFT_HOST', 'localhost')
        namespace = settings.get('HBASE_NAMESPACE', 'crawler')
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES', False)
        self.queue_partitions = settings.get('HBASE_QUEUE_PARTITIONS', 4)
        self._table_name = settings.get('HBASE_METADATA_TABLE', 'metadata')
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts

        self.connection = Connection(host=host, port=int(port), table_prefix=namespace, table_prefix_separator=':')
        # protocol='compact', transport='framed'
        self.queue = HBaseQueue(self.connection, self.queue_partitions, self.manager.logger.backend,
                                drop=drop_all_tables)
        self.state_checker = HBaseState(self.connection, self._table_name)


        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            self.connection.create_table(self._table_name, {'m': {'max_versions': 5}, # 'compression': 'SNAPPY'
                                                            's': {'max_versions': 1, 'block_cache_enabled': 1,
                                                            'bloom_filter_type': 'ROW', 'in_memory': True, },
                                                            'c': {'max_versions': 1}
                                                            })
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=9216)

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.flush()  # send any pending batch before closing the connection
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(url=url,
                                       depth=0,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=domain['fingerprint'])
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = prepare_hbase_object(status_code=response.status_code, content=response.body)

        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)


        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(url=link_url,
                                       created_at=utcnow_timestamp(),
                                       domain_fingerprint=link_domain['fingerprint'])
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(url=request.url,
                                   created_at=utcnow_timestamp(),
                                   error=error,
                                   domain_fingerprint=domain['fingerprint'])
        rk = unhexlify(request.meta['fingerprint'])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests,
                                     min_hosts=24, max_requests_per_host=128)

            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta['fingerprint'] = fingerprint
                r.meta['score'] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError('batch should be dict with fingerprint as key, and float score as value')

        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {'name': hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
Example no. 38
class HBaseBackend(Backend):
    component_name = "HBase Backend"

    def __init__(self, manager):
        self.manager = manager

        settings = manager.settings
        port = settings.get("HBASE_THRIFT_PORT")
        hosts = settings.get("HBASE_THRIFT_HOST")
        namespace = settings.get("HBASE_NAMESPACE")
        drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES")
        self.queue_partitions = settings.get("HBASE_QUEUE_PARTITIONS")
        self._table_name = settings.get("HBASE_METADATA_TABLE")
        host = choice(hosts) if type(hosts) in [list, tuple] else hosts
        kwargs = {"host": host, "port": int(port), "table_prefix": namespace, "table_prefix_separator": ":"}
        if settings.get("HBASE_USE_COMPACT_PROTOCOL"):
            kwargs.update({"protocol": "compact", "transport": "framed"})
        self.connection = Connection(**kwargs)
        self.queue = HBaseQueue(
            self.connection,
            self.queue_partitions,
            self.manager.logger.backend,
            settings.get("HBASE_QUEUE_TABLE"),
            drop=drop_all_tables,
        )
        self.state_checker = HBaseState(
            self.connection, self._table_name, self.manager.logger.backend, settings.get("HBASE_STATE_CACHE_SIZE_LIMIT")
        )
        tables = set(self.connection.tables())
        if drop_all_tables and self._table_name in tables:
            self.connection.delete_table(self._table_name, disable=True)
            tables.remove(self._table_name)

        if self._table_name not in tables:
            schema = {
                "m": {"max_versions": 1},
                "s": {"max_versions": 1, "block_cache_enabled": 1, "bloom_filter_type": "ROW", "in_memory": True},
                "c": {"max_versions": 1},
            }
            if settings.get("HBASE_USE_SNAPPY"):
                schema["m"]["compression"] = "SNAPPY"
                schema["c"]["compression"] = "SNAPPY"
            self.connection.create_table(self._table_name, schema)
        table = self.connection.table(self._table_name)
        self.batch = table.batch(batch_size=settings.get("HBASE_BATCH_SIZE"))
        self.store_content = settings.get("HBASE_STORE_CONTENT")

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        pass

    def frontier_stop(self):
        self.flush()  # send any pending batch before closing the connection
        self.connection.close()

    def add_seeds(self, seeds):
        for seed in seeds:
            url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(seed)
            obj = prepare_hbase_object(
                url=url, depth=0, created_at=utcnow_timestamp(), domain_fingerprint=domain["fingerprint"]
            )
            self.batch.put(unhexlify(fingerprint), obj)

    def page_crawled(self, response, links):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(response)
        obj = (
            prepare_hbase_object(status_code=response.status_code, content=response.body)
            if self.store_content
            else prepare_hbase_object(status_code=response.status_code)
        )
        links_dict = dict()
        for link in links:
            link_url, link_fingerprint, link_domain = self.manager.canonicalsolver.get_canonical_url(link)
            links_dict[unhexlify(link_fingerprint)] = (link, link_url, link_domain)
        self.batch.put(unhexlify(fingerprint), obj)
        for link_fingerprint, (link, link_url, link_domain) in links_dict.items():
            obj = prepare_hbase_object(
                url=link_url, created_at=utcnow_timestamp(), domain_fingerprint=link_domain["fingerprint"]
            )
            self.batch.put(link_fingerprint, obj)

    def request_error(self, request, error):
        url, fingerprint, domain = self.manager.canonicalsolver.get_canonical_url(request)
        obj = prepare_hbase_object(
            url=request.url, created_at=utcnow_timestamp(), error=error, domain_fingerprint=domain["fingerprint"]
        )
        rk = unhexlify(request.meta["fingerprint"])
        self.batch.put(rk, obj)

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        log = self.manager.logger.backend
        log.debug("Querying queue table.")
        partitions = set(kwargs.pop("partitions", []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get(partition_id, max_next_requests, min_hosts=24, max_requests_per_host=128)
            log.debug("Got %d items for partition id %d" % (len(results), partition_id))
            for fingerprint, url, score in results:
                r = self.manager.request_model(url=url)
                r.meta["fingerprint"] = fingerprint
                r.meta["score"] = score
                next_pages.append(r)
        return next_pages

    def update_score(self, batch):
        if not isinstance(batch, dict):
            raise TypeError("batch should be dict with fingerprint as key, and float score as value")
        to_schedule = []
        for fprint, (score, url, schedule) in batch.items():
            obj = prepare_hbase_object(score=score)
            rk = unhexlify(fprint)
            self.batch.put(rk, obj)
            if schedule:
                _, hostname, _, _, _, _ = parse_domain_from_url_fast(url)
                if not hostname:
                    self.manager.logger.backend.error("Can't get hostname for URL %s, fingerprint %s" % (url, fprint))
                    continue
                to_schedule.append((score, fprint, {"name": hostname}, url))
        self.queue.schedule(to_schedule)

    def flush(self):
        self.batch.send()

    def update_states(self, objs, persist):
        self.state_checker.update(objs, persist)

    def flush_states(self, is_clear=True):
        self.state_checker.flush(is_clear)

    def fetch_states(self, fingerprints):
        self.state_checker.fetch(fingerprints)
Example no. 39
def describe_proximity(window_seconds=5):
    """
    Poll the DB over a trailing time window to get the average proximity and
    the count of readings within 10 meters.

    Args:
        window_seconds (int): Window range for averaging (in seconds)
    """
    dt = timedelta(seconds=window_seconds)
    now_ = datetime.now()
    start = (now_ - dt).strftime(dtfmt)
    stop = now_.strftime(dtfmt)
    conn = Connection(config['hbase'], port=int(config['thrift']))
    tab = conn.table(str.encode(config['prox_table']))
    #dct = {k: v for k, v in tab.scan(row_start=pk01+start, row_stop=pk01+stop)}
    avg_ = []
    min_ = 0
    for pk in pks:
        dct = {
            k: v
            for k, v in tab.scan(row_start=pk + start, row_stop=pk + stop)
        }
        if len(dct) > 0:
            df = pd.DataFrame.from_dict(dct, orient="index").reset_index()
            df[b'spatial:dr'] = df[b'spatial:dr'].astype(float)
            avg_.append(df[b'spatial:dr'].mean())
            min_ += df[df[b'spatial:dr'] < 10].shape[0]

    time.append(str(now_))
    miny.append(min_)
    try:
        avgy.append(sum(avg_) / len(avg_))
    except Exception:
        avgy.append(np.nan)
    avgline = Scatter(x=list(time),
                      y=list(avgy),
                      type='scatter',
                      mode='lines',
                      name='Mean')
    minline = Scatter(x=list(time),
                      y=list(miny),
                      type='scatter',
                      mode='lines',
                      name='< 10',
                      yaxis="y2")
    #trace = [{'x': time, 'y': avgy, 'type': "scatter", 'mode': "lines", 'name': 'Avg'},
    #         {'x': time, 'y': miny, 'type': "scatter", 'mode': "lines", 'name': 'Min'}]
    layout = {
        'height': 620,
        'yaxis': {
            'title': "Average Proximity (m)",
            'side': "left"
        },
        'yaxis2': {
            'title': 'Within 10 Meters (count)',
            'side': "right",
            'overlaying': "y"
        }
    }
    return Figure(data=[avgline, minline], layout=layout)
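describe_proximity leans on module-level state that the snippet omits (config, dtfmt, pks, and the rolling time/miny/avgy series). One plausible setup is sketched below; every concrete value in it is an assumption.

from collections import deque
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from happybase import Connection
from plotly.graph_objs import Figure, Scatter

config = {'hbase': 'localhost', 'thrift': '9090', 'prox_table': 'proximity'}
dtfmt = '%Y%m%d%H%M%S'      # assumed timestamp format embedded in row keys
pks = ['pk01', 'pk02']      # assumed per-pair row-key prefixes
# rolling series plotted by the two Scatter traces; `time` shadows the stdlib
# module on purpose, matching the snippet's time.append(...) usage
time, miny, avgy = deque(maxlen=120), deque(maxlen=120), deque(maxlen=120)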
Example no. 40
class HBaseCache(BaseCache):
    def __init__(self, host="127.0.0.1", port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs):
        # Potential bug: table_prefix instead of prefix
        BaseCache.__init__(self, default_timeout)

        if not table_name:
            raise TypeError("table_name is a required argument")
        self.table_name = table_name

        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name)  # Note: initialisation overwrites the existing rows of the Hbase table

        self.clear()

    def _put(self, key, value):
        return key, {"cf:value": value}

    def _extract(self, value):
        if value:
            return value.get("cf:value")
        else:
            return value

    def add(self, key, value, timeout=None):  # Note: timeout is not used in this method, but should be
        print("Adding stuff")
        table = self._table
        print(table)
        try:
            if not table.row(
                key
            ):  # TO-DO: what does table.row return for non-existing keys? # Returns an empty dict >> check for it and return None
                table.put(*self._put(key, value))
            else:
                return False
        except:
            return False
        return True

    def clear(self):
        print "Clearing stuff"
        try:
            self._c.delete_table(self.table_name, disable=True)
        except:
            pass
        self._c.create_table(self.table_name, {"cf": dict()})
        return super(HBaseCache, self).clear()

    def dec(self, key, delta=1):
        return self.inc(key, -delta)

    #        table = self._table
    #        new_value = table.counter_inc(key, 'cf:value', -delta)
    #        value = table.row(key)
    #        new_value = (self._extract(value) or 0) - delta
    #        table.put(*self._put(key, new_value))
    # TO-DO the above should in principle be guarded by some exception handling
    #        return new_value

    def delete(self, key):
        try:
            self._table.delete(key)
        except:
            return False
        return True

    def delete_many(self, *keys):
        batch = self._table.batch()
        try:
            for k in keys:
                batch.delete(k)
            batch.send()
        except:
            return False
        return True

    def get(self, key):
        value = self._table.row(key)
        return self._extract(value) or None

    def get_dict(self, *keys):
        rows = dict(self._table.rows(keys))  # rows() only returns existing keys
        return {k: self._extract(rows.get(k)) for k in keys}

    def get_many(self, *keys):
        rows = dict(self._table.rows(keys))
        return [self._extract(rows.get(k)) for k in keys]

    def has(self, key):
        return super(HBaseCache, self).has(key)

    def inc(self, key, delta=1):
        table = self._table
        new_value = table.counter_inc(key, "cf:value", delta)
        return new_value

    def set(self, key, value, timeout=None):
        table = self._table
        print "Setting stuff"
        print table
        try:
            table.delete(
                key
            )  # TO-DO Does this return an exception if it doesn't exist? Otherwise we need to put a table.row before that
            table.put(*self._put(key, value))
        except:
            return False
        return True

    def set_many(self, mapping, timeout=None):
        print "Set many"
        batch = self._table.batch()
        for key, value in _items(mapping):
            batch.put(*self._put(key, value))
        try:
            batch.send()
        except:
            return False
        return True
Example no. 41
class HBaseCache(BaseCache):
    def __init__(self, host='127.0.0.1', port=9090, prefix=None, table_name=None, default_timeout=300, **kwargs): 
        super(HBaseCache, self).__init__(default_timeout)
        
        if not table_name:
            raise TypeError('table_name is a required argument')
        self.table_name = table_name

        self._c = Connection(host=host, port=port, table_prefix=prefix, **kwargs)
        self._table = self._c.table(table_name) # Note: initialisation overwrites the existing rows of the Hbase table
        
        self.clear()

    def _put(self, key, value, timeout):
        timestamp = (datetime.now() + timedelta(0, timeout or self.default_timeout)).isoformat()
        return key, {'cf:value': value, 'cf:timestamp': timestamp}

    def _extract(self, value):
        if value:
            v = value.get('cf:value')
            ts = from_iso(value.get('cf:timestamp'))
            if ts > datetime.now():
                return v
            else:
                return None
        else:
            return None

    def add(self, key, value, timeout=None):
        table = self._table
        try:
            if not table.row(key):
                table.put(*self._put(key, value, timeout))
            else:
                return False
        except:
            return False
        return True

    def clear(self):
        try:
            self._c.delete_table(self.table_name, disable=True)
        except:
            pass
        self._c.create_table(self.table_name, {'cf': dict()})
        return super(HBaseCache, self).clear()

    def dec(self, key, delta=1):
        return self.inc(key, -delta)

    def delete(self, key):
        # delete in happybase just uses batch()
        return self.delete_many(key)

    def delete_many(self, *keys):
        with self._table.batch() as batch:  # TO-DO: exceptions here?
            for k in keys:
                batch.delete(k)
        return True

    def get(self, key):
        value = self._table.row(key)
        return self._extract(value) or None

    def get_dict(self, *keys):
        table = self._table
        # table.rows() omits keys that have no stored row, so fall back to
        # None for anything missing.
        results = dict(table.rows(list(keys)))
        return {k: self._extract(results.get(k)) for k in keys}

    def get_many(self, *keys):
        result = self.get_dict(*keys)
        return [result[k] for k in keys]

    def has(self, key):
        return super(HBaseCache, self).has(key)

    def inc(self, key, delta=1):
        # HBase counters store an 8-byte integer in the cell, so a key used
        # with inc()/dec() should not also be written with set(), which
        # stores plain strings in the same column.
        return self._table.counter_inc(key, 'cf:value', delta)

    def set(self, key, value, timeout=None):
        # set in happybase just uses batch internally; the delete clears any
        # stale columns, although put() alone would overwrite both cells.
        table = self._table
        try:
            table.delete(key)
            table.put(*self._put(key, value, timeout))
        except Exception:
            return False
        return True

    def set_many(self, mapping, timeout=None):
        batch = self._table.batch()
        for key, value in _items(mapping):
            batch.put(*self._put(key, value, timeout))
        try:
            batch.send()
        except Exception:
            return False
        return True
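
For orientation, a minimal usage sketch of the class above. It assumes a Thrift server reachable on localhost:9090 and the from_iso/_items helpers from the same module; the table name 'page_cache' is made up for illustration. Bear in mind that constructing the cache calls clear(), which drops and recreates the table:

cache = HBaseCache(table_name='page_cache')  # drops and recreates 'page_cache'

cache.set('greeting', 'hello', timeout=60)   # expires 60 seconds from now
assert cache.get('greeting') == 'hello'

cache.set_many({'a': '1', 'b': '2'})         # falls back to default_timeout
assert cache.get_many('a', 'b') == ['1', '2']

cache.delete('greeting')
assert cache.get('greeting') is None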
Example no. 42
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':',
            'timeout': 60000
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.logger.info("Connecting to %s:%d thrift server.", host, port)
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection, settings.get('HBASE_STATES_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'), settings.get('HBASE_DROP_ALL_TABLES'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables,
                              use_snappy=settings.get('HBASE_USE_SNAPPY'))
        o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(max_next_requests, partition_id,
                                                   min_requests=self._min_requests,
                                                   min_hosts=self._min_hosts,
                                                   max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d", len(results), partition_id)
        return next_pages
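
For reference, a sketch of the settings this backend reads. The keys are exactly those consumed above; the values are illustrative assumptions, not recommended defaults:

# Hypothetical settings for HBaseBackend; values are assumptions.
EXAMPLE_SETTINGS = {
    'HBASE_THRIFT_HOST': ['hbase-1', 'hbase-2'],  # a list/tuple triggers choice()
    'HBASE_THRIFT_PORT': 9090,
    'HBASE_NAMESPACE': 'crawler',        # becomes the table prefix, e.g. crawler:queue
    'HBASE_USE_FRAMED_COMPACT': True,    # framed transport + compact protocol
    'SPIDER_FEED_PARTITIONS': 2,
    'BC_MIN_REQUESTS': 64,
    'BC_MIN_HOSTS': 24,
    'BC_MAX_REQUESTS_PER_HOST': 128,
}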
Example no. 43
def test_connection_compat():
    with assert_raises(ValueError):
        Connection(compat='0.1.invalid.version')
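
For contrast, a valid compat string is accepted; happybase 1.x recognises '0.90', '0.92', '0.94' and '0.96', though this is worth verifying against the installed version:

from happybase import Connection

# autoconnect=False skips opening a socket, so only the compat value is checked.
conn = Connection(host='hbase-docker', port=9090, compat='0.94', autoconnect=False)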
Example no. 44
    def __init__(self):
        self.conn = Connection()
Example no. 45
class TestDomainCache(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)
        self.conn = Connection(host="hbase-docker")
        if b'domain_metadata' not in self.conn.tables():
            self.conn.create_table('domain_metadata', {
                'm': {'max_versions': 1, 'block_cache_enabled': 1}
            })
        t = self.conn.table('domain_metadata')
        t.delete('d1')
        t.delete('d2')
        t.delete('d3')
        t.delete('d4')

    def test_domain_cache_both_generations(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}

        # eviction should happen
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc['d1'] == {'domain': 1}
        assert dc['d2'] == {'domain': 2}
        assert dc['d3'] == {'domain': [3, 2, 1]}
        assert dc['d4'] == {'domain': 4}

    def test_domain_cache_get_with_default(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc.get('d1', {}) == {'domain': 1}
        assert dc.get('d3', {}) == {'domain': [3, 2, 1]}

    def test_domain_cache_setdefault(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc.setdefault('d1', {}) == {'domain': 1}
        assert dc.setdefault('d5', {'domain': 6}) == {'domain': 6}
        dc.flush()
        assert dc.setdefault('d3', {}) == {'domain': [3, 2, 1]}

    def test_domain_cache_setdefault_with_second_gen_flush(self):
        dc = DomainCache(2, self.conn, 'domain_metadata', batch_size=3)
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}

        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        dc.setdefault('d1', {})['domain'] += 1

        assert dc.setdefault('d1', {}) == {'domain': 2}

    def test_empty_key(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        with self.assertRaises(KeyError):
            dc[''] = {'test':1}

    def test_deletion(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        with self.assertRaises(KeyError):
            del dc['d1']

        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        del dc['d1'] # second gen
        del dc['d3'] # first gen

        dc.flush()

        del dc['d4'] # hbase

    def test_contains(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert 'd1' in dc # second gen
        assert 'd3' in dc # first gen

        dc.flush()

        assert 'd4' in dc

    def test_pop(self):
        dc = DomainCache(2, self.conn, 'domain_metadata')
        dc['d1'] = {'domain': 1}
        dc['d2'] = {'domain': 2}
        dc['d3'] = {'domain': [3, 2, 1]}
        dc['d4'] = {'domain': 4}

        assert dc.pop('d1') == {'domain': 1}
        assert 'd1' not in dc

        assert dc.pop('d3') == {'domain': [3, 2, 1]}
        assert 'd3' not in dc

        dc.flush()

        assert dc.pop('d4') == {'domain': 4}
        assert 'd4' not in dc
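
Condensing the tests above into a usage sketch (constructor arguments as in setUp; the two-generation and flush semantics are inferred from the assertions, not from DomainCache's source):

from happybase import Connection

conn = Connection(host='hbase-docker')
dc = DomainCache(2, conn, 'domain_metadata')  # capacity 2, as in the tests

dc['example.com'] = {'domain': 1}             # held in the in-memory generations first
assert dc.setdefault('example.com', {}) == {'domain': 1}

dc.flush()                                    # persist pending entries to HBase

assert 'example.com' in dc
assert dc.pop('example.com') == {'domain': 1}
assert 'example.com' not in dc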
Example no. 46

# Module-level singleton; assumed to be initialised to None in the
# original module before first use.
hbase_connection = None

def get_hbase_connection():
    global hbase_connection
    if hbase_connection is None:
        hbase_connection = Connection(host='hbase-docker', port=9090)
    return hbase_connection
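
A quick usage note: repeated calls share a single lazily created Thrift connection.

conn1 = get_hbase_connection()
conn2 = get_hbase_connection()
assert conn1 is conn2  # created once, then reused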
Example no. 47
class HBaseBackend(DistributedBackend):
    component_name = 'HBase Backend'

    def __init__(self, manager):
        self.manager = manager
        self.logger = logging.getLogger("hbase.backend")
        settings = manager.settings
        port = settings.get('HBASE_THRIFT_PORT')
        hosts = settings.get('HBASE_THRIFT_HOST')
        namespace = settings.get('HBASE_NAMESPACE')
        self._min_requests = settings.get('BC_MIN_REQUESTS')
        self._min_hosts = settings.get('BC_MIN_HOSTS')
        self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST')

        self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS')
        host = choice(hosts) if isinstance(hosts, (list, tuple)) else hosts
        kwargs = {
            'host': host,
            'port': int(port),
            'table_prefix': namespace,
            'table_prefix_separator': ':'
        }
        if settings.get('HBASE_USE_FRAMED_COMPACT'):
            kwargs.update({
                'protocol': 'compact',
                'transport': 'framed'
            })
        self.connection = Connection(**kwargs)
        self._metadata = None
        self._queue = None
        self._states = None

    @classmethod
    def strategy_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'),
                               settings.get('HBASE_STATE_CACHE_SIZE_LIMIT'))
        return o

    @classmethod
    def db_worker(cls, manager):
        o = cls(manager)
        settings = manager.settings
        drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES')
        o._queue = HBaseQueue(o.connection, o.queue_partitions,
                              settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables)
        o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables,
                                    settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'),
                                    settings.get('STORE_CONTENT'))
        return o

    @property
    def metadata(self):
        return self._metadata

    @property
    def queue(self):
        return self._queue

    @property
    def states(self):
        return self._states

    def frontier_start(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_start()

    def frontier_stop(self):
        for component in [self.metadata, self.queue, self.states]:
            if component:
                component.frontier_stop()
        self.connection.close()

    def add_seeds(self, seeds):
        self.metadata.add_seeds(seeds)

    def page_crawled(self, response):
        self.metadata.page_crawled(response)

    def links_extracted(self, request, links):
        self.metadata.links_extracted(request, links)

    def request_error(self, page, error):
        self.metadata.request_error(page, error)

    def finished(self):
        raise NotImplementedError

    def get_next_requests(self, max_next_requests, **kwargs):
        next_pages = []
        self.logger.debug("Querying queue table.")
        partitions = set(kwargs.pop('partitions', []))
        for partition_id in range(0, self.queue_partitions):
            if partition_id not in partitions:
                continue
            results = self.queue.get_next_requests(max_next_requests, partition_id,
                                                   min_requests=self._min_requests,
                                                   min_hosts=self._min_hosts,
                                                   max_requests_per_host=self._max_requests_per_host)
            next_pages.extend(results)
            self.logger.debug("Got %d requests for partition id %d", len(results), partition_id)
        return next_pages