Exemple #1
0
def _get_job_rule(pool: happybase.ConnectionPool,
                  job_name) -> crawler.CrawlJobCore:
    '''
    Fetch the crawl_job_core (crawl rule) stored in hbase.

    Reads the ``rule_col`` cell of row ``rule_row_key`` from the table named
    ``job_name``, decodes it as UTF-8, reports it through
    ``common.print_info`` and hands it to ``crawler.CrawlJobCore.loads``.

    Returns whatever ``loads`` produces, or ``None`` when any step raises
    (the exception is passed to ``common.print_exception``).

    NOTE(review): the connection obtained from the pool is closed explicitly
    in the ``finally`` clause -- confirm this is intended for a pooled
    connection.
    '''
    with pool.connection() as conn:
        conn: happybase.Connection
        try:
            cells = conn.table(job_name).row(rule_row_key, columns=[rule_col])
            # Returned cells are keyed by bytes, so encode the column name.
            rule_text = cells[rule_col.encode("utf-8")].decode("utf-8")
            common.print_info("get crawl rule: {}".format(rule_text))
            # TODO: the key is somewhat problematic
            return crawler.CrawlJobCore.loads(rule_text)
        except Exception as err:
            common.print_exception(err)
            return None
        finally:
            conn.close()  # close the connection
Exemple #2
0
def _get_job_result(pool: happybase.ConnectionPool, crawl_job_name) -> list:
    '''
    Fetch the crawl results.

    Scans the table named ``crawl_job_name`` with ``include_timestamp=True``
    and builds one dict per row:

    * ``'url'``    -- the row key decoded as UTF-8
    * ``'result'`` -- maps each decoded column name to a 2-tuple made of the
      cell's first element decoded as UTF-8 and its second element unchanged
      (presumably the timestamp, given ``include_timestamp=True``).

    Returns the list of those dicts, or ``None`` when any step raises (the
    exception is passed to ``common.print_exception``).

    NOTE(review): the connection obtained from the pool is closed explicitly
    in the ``finally`` clause -- confirm this is intended for a pooled
    connection.
    '''
    with pool.connection() as conn:
        conn: happybase.Connection
        try:
            scanner = conn.table(crawl_job_name).scan(include_timestamp=True)
            return [
                {
                    'url': row_key.decode("utf-8"),
                    'result': {
                        column.decode("utf-8"):
                        (cell[0].decode("utf-8"), cell[1])
                        for column, cell in cells.items()
                    },
                }
                for row_key, cells in scanner
            ]
        except Exception as err:
            common.print_exception(err)
            return None
        finally:
            conn.close()  # close the connection
Exemple #3
0
def _save_results(pool: happybase.ConnectionPool, crawl_job_core, url,
                  result_list) -> bool:
    '''
    Save crawl results to hbase.

    Nothing is written when ``result_list`` is empty; ``False`` is returned
    straight away in that case.

    Otherwise the items are stored in the table named ``crawl_job_core.name``
    under row key ``url``, the i-th item going into the column given by
    ``results_col_pattern(i)``.

    Returns ``True`` after a successful put, ``False`` when any step raises
    (the exception is passed to ``common.print_exception``).

    NOTE(review): the connection obtained from the pool is closed explicitly
    in the ``finally`` clause -- confirm this is intended for a pooled
    connection.
    '''
    if not result_list:
        return False

    with pool.connection() as conn:
        conn: happybase.Connection
        try:
            table = conn.table(crawl_job_core.name)
            cells = {}
            for index, item in enumerate(result_list):
                cells[results_col_pattern(index)] = item
            table.put(url, cells)
            return True
        except Exception as err:
            common.print_exception(err)
            return False
        finally:
            conn.close()  # close the connection
Exemple #4
0
def test_pool_exhaustion():
    """Check that a pool of size 1 refuses a second concurrent connection.

    The main thread holds the pool's only connection while a worker thread
    requests one with a short timeout; the worker is expected to see
    ``NoConnectionsAvailable``.
    """
    pool = ConnectionPool(size=1, **connection_kwargs)
    failures = []

    def run():
        # An exception raised in a worker thread is not propagated to the
        # thread that join()s it, so a failed assertion here would go
        # unnoticed. Record it and let the main thread re-raise it.
        try:
            with assert_raises(NoConnectionsAvailable):
                with pool.connection(timeout=.1) as connection:
                    connection.tables()
        except Exception as exc:
            failures.append(exc)

    with pool.connection():
        # At this point the only connection is assigned to this thread,
        # so another thread cannot obtain a connection at this point.

        t = threading.Thread(target=run)
        t.start()
        t.join()

    # Surface whatever went wrong in the worker so the test actually fails.
    if failures:
        raise failures[0]
Exemple #5
0
def test_pool_exhaustion():
    """Check that a pool of size 1 refuses a second concurrent connection.

    The main thread holds the pool's only connection while a worker thread
    requests one with a short timeout; the worker is expected to see
    ``NoConnectionsAvailable``.
    """
    pool = ConnectionPool(size=1, **connection_kwargs)
    failures = []

    def run():
        # An exception raised in a worker thread is not propagated to the
        # thread that join()s it, so a failed assertion here would go
        # unnoticed. Record it and let the main thread re-raise it.
        try:
            with assert_raises(NoConnectionsAvailable):
                with pool.connection(timeout=.1) as connection:
                    connection.tables()
        except Exception as exc:
            failures.append(exc)

    with pool.connection():
        # At this point the only connection is assigned to this thread,
        # so another thread cannot obtain a connection at this point.

        t = threading.Thread(target=run)
        t.start()
        t.join()

    # Surface whatever went wrong in the worker so the test actually fails.
    if failures:
        raise failures[0]
Exemple #6
0
def _get_job_list(pool: happybase.ConnectionPool) -> list:
    '''
    Fetch the list of job names stored in hbase.

    The job names are whatever ``conn.tables()`` returns.

    Returns that value, or ``None`` when the call raises (the exception is
    passed to ``common.print_exception``).

    NOTE(review): the connection obtained from the pool is closed explicitly
    in the ``finally`` clause -- confirm this is intended for a pooled
    connection.
    '''
    with pool.connection() as conn:
        conn: happybase.Connection
        try:
            job_names = conn.tables()
        except Exception as err:
            common.print_exception(err)
            return None
        else:
            return job_names
        finally:
            conn.close()  # close the connection
Exemple #7
0
def _set_job_rule(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    '''
    Change the rule.

    Writes ``crawl_job_core.dumps()`` into column ``rule_col`` of row
    ``rule_row_key`` in the table named ``crawl_job_core.name``.

    Returns ``True`` after a successful put, ``False`` when any step raises
    (the exception is passed to ``common.print_exception``).

    NOTE(review): the connection obtained from the pool is closed explicitly
    in the ``finally`` clause -- confirm this is intended for a pooled
    connection.
    '''
    with pool.connection() as conn:
        conn: happybase.Connection
        try:
            table = conn.table(crawl_job_core.name)
            rule_cells = {rule_col: crawl_job_core.dumps()}
            table.put(rule_row_key, rule_cells)
            return True
        except Exception as err:
            common.print_exception(err)
            return False
        finally:
            conn.close()  # close the connection
Exemple #8
0
def _save_job(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    '''
    Store the crawl_job_core (crawl rule) in hbase.

    Creates a table named ``crawl_job_core.name`` with two families,
    ``rule_col`` (``max_versions=rule_max_version``) and ``results_family``
    (``max_versions=results_max_version``), then writes
    ``crawl_job_core.dumps()`` into column ``rule_col`` of row
    ``rule_row_key``.

    Returns ``True`` when both steps succeed, ``False`` when any step raises
    (the exception is passed to ``common.print_exception``).

    NOTE(review): the connection obtained from the pool is closed explicitly
    in the ``finally`` clause -- confirm this is intended for a pooled
    connection.
    '''
    with pool.connection() as conn:
        conn: happybase.Connection
        try:
            table_name = crawl_job_core.name
            families = {
                rule_col: {'max_versions': rule_max_version},
                results_family: {'max_versions': results_max_version},
            }
            conn.create_table(name=table_name, families=families)

            # Seed the freshly created table with the serialized rule.
            rule_cells = {rule_col: crawl_job_core.dumps()}
            conn.table(crawl_job_core.name).put(rule_row_key, rule_cells)
            return True
        except Exception as err:
            common.print_exception(err)
            return False
        finally:
            conn.close()  # close the connection