Example 1
def test_pool_exhaustion():
    pool = ConnectionPool(size=1, **connection_kwargs)

    def run():
        with assert_raises(NoConnectionsAvailable):
            with pool.connection(timeout=.1) as connection:
                connection.tables()

    with pool.connection():
        # The only connection is now held by this thread, so another
        # thread cannot obtain one until it is returned to the pool.

        t = threading.Thread(target=run)
        t.start()
        t.join()
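
The test depends on module-level fixtures that the excerpt omits. A minimal sketch of what it assumes; the host and port are placeholders, and assert_raises is taken here from nose.tools as one possibility:

import threading

from happybase import ConnectionPool, NoConnectionsAvailable
from nose.tools import assert_raises

# Placeholder Thrift endpoint; adjust for your HBase setup.
connection_kwargs = dict(host='localhost', port=9090)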
Example 2
def _get_job_rule(pool: happybase.ConnectionPool,
                  job_name) -> crawler.CrawlJobCore:
    '''
        Fetch the crawl_job_core (crawl rule) for a job from HBase.
    '''
    with pool.connection() as conn:
        try:
            table = conn.table(job_name)
            row = table.row(rule_row_key, columns=[
                rule_col,
            ])
            rule = row[bytes(rule_col, encoding="utf-8")].decode("utf-8")
            common.print_info("get crawl rule: {}".format(rule))
            crawl_job_core = crawler.CrawlJobCore.loads(rule)
            # TODO: the key handling still needs work
            return crawl_job_core
        except Exception as e:
            common.print_exception(e)
            return None
        # No manual close: the pool reclaims the connection when the
        # with-block exits.
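
This snippet and the related crawl-job helpers below reference module-level names that the excerpts omit (common, crawler, rule_row_key, rule_col, results_family, the version limits, and results_col_pattern). A plausible sketch; every value here is an assumption reconstructed from how the snippets use the names:

import happybase

# `common` (logging helpers) and `crawler` (CrawlJobCore) are project
# modules whose definitions are not shown in these excerpts.
rule_row_key = "rule"        # assumed row key that stores the serialized rule
rule_col = "rule"            # assumed column family holding the rule
results_family = "results"   # assumed column family holding crawl results
rule_max_version = 1         # assumed version limits used by create_table
results_max_version = 10

def results_col_pattern(i):
    # Assumed qualifier layout: "results:0", "results:1", ...
    return "{}:{}".format(results_family, i)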
Example 3
    def predict_from_image_batch(self, mnist_batch, index):

        t0 = time.time()
        connection_pool = ConnectionPool(size=self.CONNECTION_POOL_SIZE,
                                         host=HBaseManager.HOST,
                                         port=HBaseManager.PORT)
        hbase_manager = HBaseManager(connection_pool)

        process_pool = Pool(self.POOL_SIZE)
        n = len(mnist_batch)

        indices = list(range(n))

        extract_process = process_pool.starmap_async(self.extract_keys,
                                                     zip(mnist_batch, indices))
        extracted_keys = extract_process.get()

        predict_hash_args = zip(extracted_keys, indices)

        predictions = [
            self.predict_hash_values(keys, hbase_manager, i)
            for keys, i in predict_hash_args
        ]

        process_pool.close()
        process_pool.join()

        t1 = time.time()
        print("Mnist Batch {} predicted in: {} Seconds, For Node: {}".format(
            str(index), str(t1 - t0), self.__str__()))

        return predictions
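
HBaseManager is this project's own wrapper class; only the happybase pool underneath is a library feature. A minimal sketch of the pooled access pattern it builds on (the endpoint is a placeholder):

from happybase import ConnectionPool

pool = ConnectionPool(size=4, host='localhost', port=9090)  # placeholder endpoint
with pool.connection() as conn:
    # The connection is borrowed for this block and returned to the pool on exit.
    print(conn.tables())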
Example 4
def _get_job_result(pool: happybase.ConnectionPool, crawl_job_name) -> list:
    '''
        Fetch the crawl results for a job.
    '''
    with pool.connection() as conn:
        try:
            table = conn.table(crawl_job_name)
            result_list = []
            for key, value in table.scan(include_timestamp=True):
                tmp = {}
                tmp['url'] = key.decode("utf-8")
                # Map each column to a (cell value, timestamp) pair.
                tmp['result'] = {
                    ele.decode("utf-8"):
                    (value[ele][0].decode("utf-8"), value[ele][1])
                    for ele in value
                }
                result_list.append(tmp)
            return result_list
        except Exception as e:
            common.print_exception(e)
            return None
Example 5
def _save_results(pool: happybase.ConnectionPool, crawl_job_core, url,
                  result_list) -> bool:
    '''
        Save crawl results to HBase.
        If result_list is empty, do nothing.
    '''
    if not result_list:
        return False
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            table = conn.table(core.name)
            row_key = url
            table.put(row_key, {
                results_col_pattern(i): ele
                for i, ele in enumerate(result_list)
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
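
A hedged usage sketch; the pool and the CrawlJobCore instance are assumed to exist, and the URL and result strings are made-up values:

# `job` is a crawler.CrawlJobCore whose .name names an existing table.
ok = _save_results(pool, job, "https://example.com/page",
                   ["first match", "second match"])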
Example 6
    def train_batch(self, mnist_batch, index):
        '''
        :type mnist_batch: list of tuple
        :type index: int
        :rtype: None
        '''

        t0 = time.time()

        connection_pool = ConnectionPool(size=self.CONNECTION_POOL_SIZE,
                                         host=HBaseManager.HOST,
                                         port=HBaseManager.PORT)
        hbase_manager = HBaseManager(connection_pool)

        process_pool = Pool(self.POOL_SIZE)
        n = len(mnist_batch)

        numbers, _ = MnistHelper.extract_numbers_images(mnist_batch)
        mnist_images = [mnist_obs[MnistModel.PREDICTOR_INDEX]
                        for mnist_obs in mnist_batch]
        indices = list(range(n))

        extract_process = process_pool.starmap_async(self.extract_keys,
                                                     zip(mnist_images, indices))
        extracted_keys = extract_process.get()

        store_hash_args = zip(extracted_keys, numbers, indices)
        for keys, number, i in store_hash_args:
            self.store_hash_values(keys, number, hbase_manager, i)

        process_pool.close()
        process_pool.join()

        t1 = time.time()
        print("Time taken to train batch {} : {} Seconds".format(
            str(index), str(t1 - t0)))
Example 7
def test_connection_pool():

    from thriftpy2.thrift import TException

    def run():
        name = threading.current_thread().name
        print("Thread %s starting" % name)

        def inner_function():
            # Nested connection requests must return the same connection
            with pool.connection() as another_connection:
                assert connection is another_connection

                # Fake an exception once in a while
                if random.random() < .25:
                    print("Introducing random failure")
                    connection.transport.close()
                    raise TException("Fake transport exception")

        for i in range(50):
            with pool.connection() as connection:
                connection.tables()

                try:
                    inner_function()
                except TException:
                    # This error should have been picked up by the
                    # connection pool, and the connection should have
                    # been replaced by a fresh one
                    pass

                connection.tables()

        print("Thread %s done" % name)

    N_THREADS = 10

    pool = ConnectionPool(size=3, **connection_kwargs)
    threads = [threading.Thread(target=run) for i in range(N_THREADS)]

    for t in threads:
        t.start()

    while threads:
        for t in threads:
            t.join(timeout=.1)

        # filter out finished threads
        threads = [t for t in threads if t.is_alive()]
        print("%d threads still alive" % len(threads))
Example 8
def _get_job_list(pool: happybase.ConnectionPool) -> list:
    '''
        Fetch the list of job names (table names) stored in HBase.
    '''
    with pool.connection() as conn:
        try:
            return conn.tables()
        except Exception as e:
            common.print_exception(e)
            return None
Example 9
def _set_job_rule(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    '''
        Update the stored crawl rule for an existing job.
    '''
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            table = conn.table(core.name)
            table.put(rule_row_key, {
                rule_col: core.dumps(),
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
Example 10
def _save_job(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    '''
        Store a crawl_job_core (crawl rule) in HBase: create the job's
        table, then write the serialized rule into it.
    '''
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn.create_table(name=core.name, families={
                rule_col: dict(max_versions=rule_max_version),
                results_family: dict(max_versions=results_max_version),
            })
            table = conn.table(core.name)
            table.put(rule_row_key, {
                rule_col: core.dumps()
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
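
Taken together, the crawl-job helpers form a small workflow. A hedged sketch of how they might be chained; the pool construction and the CrawlJobCore value are assumptions the excerpts do not show:

pool = happybase.ConnectionPool(size=3, host='localhost')  # placeholder host
# `job`: a crawler.CrawlJobCore instance (construction not shown).

if _save_job(pool, job):                      # create the table, store the rule
    rule = _get_job_rule(pool, job.name)      # read the rule back
    results = _get_job_result(pool, job.name) # scan stored crawl results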
Example 11
def test_connection_pool():
    def run():
        name = threading.current_thread().name
        print "Thread %s starting" % name

        def inner_function():
            # Nested connection requests must return the same connection
            with pool.connection() as another_connection:
                assert connection is another_connection

        for i in range(100):
            with pool.connection() as connection:
                connection.tables()

                # Fake an exception once in a while
                if random.random() < .001:
                    connection._tainted = True

                inner_function()

        print "Thread %s done" % name

    N_THREADS = 50

    pool = ConnectionPool(size=3, **connection_kwargs)
    threads = [threading.Thread(target=run) for i in range(N_THREADS)]

    for t in threads:
        t.start()

    while threads:
        for t in threads:
            t.join(timeout=.1)

        # filter out finished threads
        threads = [t for t in threads if t.is_alive()]
        print "%d threads still alive" % len(threads)
Example 12
def test_connection_pool_construction():
    with assert_raises(TypeError):
        ConnectionPool(size='abc')

    with assert_raises(ValueError):
        ConnectionPool(size=0)
Example 13
    def setup(self):
        HBaseManager(ConnectionPool(size=1, host=HBaseManager.HOST,
                                    port=HBaseManager.PORT)).create_table(
            table_name=self.TABLE_NAME, delete=True)
Example 14
from django.http import HttpResponse
from happybase import Connection, ConnectionPool
from collections import Counter
import random
import itertools
import pandas as pd
import numpy as np
from django.shortcuts import render_to_response
from desktop.lib.django_util import render
import datetime, json
from os.path import abspath, split, join
import strategy


# conn = Connection('192.168.2.41')
pool = ConnectionPool(size=5, host='192.168.2.41')
'''
user_t = conn.table('haodou')
keys_t = conn.table('tag_search_keys')
id2tokens_t = conn.table('id2tokens') # id-to-tag mapping table
favs_t = conn.table('favs_by_time') #key: time, column: token, value: [w1, w2, w3, ...]
user_tags_t = conn.table('haodou_user_tags') # user tag store
goods_t = conn.table('haodou_goods')
td_w_t = conn.table('td_w') # tag_date_weight
ud_w_t = conn.table('ud_w') # user_date_weight
'''

current_tag = ''
country_index = [u'中国', u'加拿大', u'美国', u'日本', u'澳大利亚']

token2tag = {