Example #1
 def get_hbase_connect_pool(self):
     """
     :return: a happybase connection pool
     """
     pool = happybase.ConnectionPool(200, host=self.host, port=self.port)
     return pool
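In use, a pool like this is drawn from with its connection() context manager, which checks a connection out and returns it when the block exits. A minimal sketch, assuming a local Thrift server and a placeholder table name:

import happybase

pool = happybase.ConnectionPool(200, host='localhost', port=9090)

with pool.connection() as conn:
    table = conn.table('some_table')   # placeholder table name
    print(table.row(b'some-row-key'))  # dict of column -> value for that row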
Example #2
def get_connection_pool(timeout=10):
    global conn_pool
    if conn_pool is None:
        conn_pool = happybase.ConnectionPool(10, timeout=timeout)
    return conn_pool
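The lazy initialization above is not thread-safe: two threads can both observe conn_pool as None and build two pools. A hedged variant that guards the check with a lock (the lock name is an assumption, not in the original):

import threading

import happybase

conn_pool = None
_pool_lock = threading.Lock()  # hypothetical guard, not in the original


def get_connection_pool(timeout=10):
    global conn_pool
    if conn_pool is None:
        with _pool_lock:
            if conn_pool is None:  # re-check under the lock
                conn_pool = happybase.ConnectionPool(10, timeout=timeout)
    return conn_pool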
Example #3
    def update_user_ctr_feature_to_hbase(self):
        """
        Write per-user CTR features into the ctr_feature_user HBase table.
        """
        self.spark.sql("use profile")

        user_profile_hbase = self.spark.sql(
            "select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase"
        )

        # Feature engineering
        # Drop features with too few observed values
        user_profile_hbase = user_profile_hbase.drop('env', 'birthday',
                                                     'gender')

        def get_user_id(row):
            return int(row.user_id.split(":")[1]), row.article_partial

        user_profile_hbase_temp = user_profile_hbase.rdd.map(get_user_id)

        from pyspark.sql.types import *

        _schema = StructType([
            StructField("user_id", LongType()),
            StructField("weights", MapType(StringType(), DoubleType()))
        ])

        user_profile_hbase_schema = self.spark.createDataFrame(
            user_profile_hbase_temp, schema=_schema)

        def feature_preprocess(row):

            from pyspark.ml.linalg import Vectors

            channel_weights = []
            for i in range(1, 26):
                try:
                    _res = sorted([
                        row.weights[key] for key in row.weights.keys()
                        if key.split(':')[0] == str(i)
                    ])[:10]
                    channel_weights.append(_res)
                except Exception:
                    channel_weights.append([])

            return row.user_id, channel_weights

        res = user_profile_hbase_schema.rdd.map(feature_preprocess).collect()

        # Batch-insert the results into HBase
        pool = happybase.ConnectionPool(size=10,
                                        host='hadoop-master',
                                        port=9090)
        with pool.connection() as conn:
            ctr_feature = conn.table('ctr_feature_user')
            with ctr_feature.batch(transaction=True) as b:
                for i in range(len(res)):
                    for j in range(25):
                        b.put(
                            "{}".format(res[i][0]).encode(), {
                                "channel:{}".format(j + 1).encode():
                                str(res[i][1][j]).encode()
                            })
            # no explicit conn.close(): the pool reclaims the connection on exit
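The batch(transaction=True) context used above buffers every put and sends them in one round trip when the block exits; if an exception escapes the block, nothing is sent. A minimal sketch of just that behavior (row keys and values are placeholders):

import happybase

pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
with pool.connection() as conn:
    table = conn.table('ctr_feature_user')
    with table.batch(transaction=True) as b:
        b.put(b'1', {b'channel:1': b'[0.1, 0.2]'})
        b.put(b'2', {b'channel:1': b'[0.3]'})
    # both puts are flushed together on leaving the block; an exception inside discards both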
Example #4
        def save_content_filter_history_to__recall(partition):
            """计算每个用户的每个操作文章的相似文章,过滤之后,写入content召回表当中(支持不同时间戳版本)
            """
            import happybase
            pool = happybase.ConnectionPool(size=10, host='hadoop-master')

            # Fetch similar articles
            with pool.connection() as conn:

                # key:   article_id,    column:  similar:article_id
                similar_table = conn.table('article_similar')
                # Loop over the partition
                for row in partition:
                    # Look up the row in the similar-articles result table
                    similar_article = similar_table.row(str(
                        row.article_id).encode(),
                                                        columns=[b'similar'])
                    # Sort by similarity and truncate; recall does not need a large candidate set (hundreds, not thousands)
                    _srt = sorted(similar_article.items(),
                                  key=lambda item: item[1],
                                  reverse=True)
                    if _srt:
                        # Recommend 10 articles per action
                        reco_article = [
                            int(i[0].split(b':')[1]) for i in _srt
                        ][:10]

                        # Fetch articles from this channel the user has already seen
                        history_table = conn.table('history_recall')
                        # Multiple timestamped versions
                        data = history_table.cells(
                            'reco:his:{}'.format(row.user_id).encode(),
                            'channel:{}'.format(row.channel_id).encode())

                        history = []
                        if len(data) >= 2:
                            for l in data[:-1]:
                                history.extend(eval(l))
                        else:
                            history = []

                        # Filter reco_article against history
                        reco_res = list(set(reco_article) - set(history))

                        # Store the recommendations in the content-based recall table and in the history table
                        if reco_res:
                            # content_table = conn.table('cb_content_recall')
                            content_table = conn.table('cb_recall')
                            content_table.put(
                                "recall:user:{}".format(row.user_id).encode(),
                                {
                                    'content:{}'.format(row.channel_id).encode(
                                    ):
                                    str(reco_res).encode()
                                })

                            # Record these articles as already recommended
                            history_table.put(
                                "reco:his:{}".format(row.user_id).encode(), {
                                    'channel:{}'.format(row.channel_id).encode(
                                    ):
                                    str(reco_res).encode()
                                })

                # no explicit conn.close(): the pool reclaims the connection on exit
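The multi-version trick above relies on HBase keeping several timestamped versions of one cell; Table.cells returns them newest-first. A minimal sketch, with placeholder row and column names:

import happybase

pool = happybase.ConnectionPool(size=10, host='hadoop-master')
with pool.connection() as conn:
    history_table = conn.table('history_recall')
    # up to the 3 most recent versions of this cell, newest first
    versions = history_table.cells(b'reco:his:1', b'channel:18', versions=3)
    for value in versions:
        print(value)  # each version holds a stringified list of article ids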
Example #5
        def get_similar_online_recall(rdd):
            """
            Parse the contents of the rdd, then fetch similar articles and compute the recall
            :param rdd:
            :return:
            """
            # rdd ---> the data itself
            # rdd.collect() turns [Row(1,2,3), Row(4,5,6)] into [[1,2,3], [4,5,6]]
            import happybase
            # Initialize the happybase connection pool
            pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
            for data in rdd.collect():

                # Filter the data dict by action type
                if data['param']['action'] in ["click", "collect", "share"]:

                    logger.info(
                        "{} INFO: get user_id:{} action:{}  log".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                                                        data['param']['userId'], data['param']['action']))

                    # Read articleId from param and look up similar articles
                    with pool.connection() as conn:

                        sim_table = conn.table("article_similar")

                        # Find the articles most similar (content-based) to the one in the clickstream log; take the TOP-k as the recall result
                        _dic = sim_table.row(str(data["param"]["articleId"]).encode(), columns=[b"similar"])
                        _srt = sorted(_dic.items(), key=lambda obj: obj[1], reverse=True)  # sort by similarity
                        if _srt:

                            topKSimIds = [int(i[0].split(b":")[1]) for i in _srt[:10]]

                            # Filter out articles already recommended to this user (history recall set)
                            history_table = conn.table("history_recall")

                            _history_data = history_table.cells(
                                b"reco:his:%s" % data["param"]["userId"].encode(),
                                b"channel:%d" % data["channelId"]
                            )
                            # print("_history_data: ", _history_data)

                            # history = []
                            # if len(data) >= 2:
                            #     for l in data[:-1]:
                            #         history.extend(eval(l))
                            # else:
                            #     history = []
                            history = []
                            if len(_history_data) > 1:
                                for l in _history_data:
                                    history.extend(eval(l))  # each version stores a stringified list of ids

                            # Filter the recall results against the history records
                            recall_list = list(set(topKSimIds) - set(history))

                            # If any results remain, add them to the cb_recall table and record them in the history table
                            logger.info(
                                "{} INFO: store online recall data:{}".format(
                                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'), str(recall_list)))

                            if recall_list:

                                recall_table = conn.table("cb_recall")

                                recall_table.put(
                                    b"recall:user:%s" % data["param"]["userId"].encode(),
                                    {b"online:%d" % data["channelId"]: str(recall_list).encode()}
                                )

                                history_table.put(
                                    b"reco:his:%s" % data["param"]["userId"].encode(),
                                    {b"channel:%d" % data["channelId"]: str(recall_list).encode()}
                                )
                        # the pool reclaims the connection when the with-block exits
Example #6

import happybase
import json
import sha1_tools

hbase_conn_timeout = None
pool = happybase.ConnectionPool(size=12,
                                host='10.1.94.57',
                                timeout=hbase_conn_timeout)
sha1_tools.pool = pool
global_var = json.load(open('../../conf/global_var_all.json'))
sha1_tools.global_var = global_var

sha1_mysql = sha1_tools.get_SHA1_from_MySQL(1)
print(sha1_mysql)
sha1_mysql = sha1_tools.get_SHA1_from_MySQL(151)
print(sha1_mysql)
sha1_mysql = sha1_tools.get_SHA1_from_MySQL(10)
print(sha1_mysql)
sha1_aaron = sha1_tools.compute_SHA1_for_image_id_from_tab_aaron(
    1, "aaron_memex_ht-images")
print(sha1_aaron)
sha1_aaron = sha1_tools.compute_SHA1_for_image_id_from_tab_aaron(
    10, "aaron_memex_ht-images")
print(sha1_aaron)
sha1s_mysql = sha1_tools.get_batch_SHA1_from_mysql(["1", "10", "151"])
print(sha1s_mysql)
sha1s_mysql = sha1_tools.get_batch_SHA1_from_mysql([1, 10, 151])
print(sha1s_mysql)
Example #7
  
  # SQLite settings
  vuri = ':memory:'
  dbc = apsw.Connection(vuri)
   
  #xbee connection
  ser = serial.Serial('/dev/ttyAMA0', 9600, timeout=5)
  xbee = ZigBee(ser,escaped=True)
  
  #basic sensor keys
  knownprekeys = ['40b5af00_rx000A01_','40b5af00_rx000A02_','40b5af00_rx000A03_','40b5af00_rx000A04_','40b5af00_rx000A05_','40b5af00_rx000A06_','40b5af00_rx000A07_','40b5af01_rx000A01_','40b5af01_rx000A02_','40b5af01_rx000A03_','40b5af01_rx000A04_','40b5af01_rx000A05_','40b5af01_rx000A07_','40b5af01_rx000B01_','40b5af01_rx000B02_','40b5af01_rx000B03_','40b5af01_rx000B04_']
  
  time.sleep(2)
  
  # Happybase connection pool to the HBase server. Uses ssh port forwarding to connect to the remote host.
  hpool = happybase.ConnectionPool(6,host='localhost')
  
 
  
  #classes used for multithreading.
  class myThreadInsert (threading.Thread):
      def __init__(self):
          threading.Thread.__init__(self)
      def run(self):
          try:
              xinsert()
          except Exception:
              logging.exception("xinsert")
          
  class myThreadRead (threading.Thread):
      def __init__(self):
Example #8
import datetime
import time

import numpy as np
import happybase
from elasticsearch_dsl import connections, Search
import faiss

pool = happybase.ConnectionPool(size=10, host='localhost', port=9091)
connections.create_connection(hosts=['localhost'], timeout=20)

faiss_model_path = "faiss.model"
index = faiss.read_index(faiss_model_path)
model_update_time = ""


def get_user_profile_recall(user_id, num_items):
    """
    User-preference recall: read the user's top 1/2/3 subcategory preferences from HBase, query ES, then rank by release date
    :param user_id:
    :param num_items:
    :return: item_list
    """
    with pool.connection() as conn:
        table = conn.table('TOPIC_LIKE')
        row = table.row(user_id,
                        columns=[b'INFO:PTY1', b'INFO:PTY2', b'INFO:PTY3'])
        # no explicit conn.close(): the pool reclaims the connection on exit

    search_size = {b"INFO:PTY1": 0.5, b"INFO:PTY2": 0.3, b"INFO:PTY3": 0.2}
    item_list = []
Example #9
#coding=utf-8
import sys
sys.path.append("../configs")
sys.path.append("configs")
import settings

import happybase
import json
import logging
pool = happybase.ConnectionPool(size=settings.hbase_pool_size,
                                host=settings.hbase_host,
                                table_prefix=settings.hbase_table_prefix,
                                protocol='compact')

# conn = happybase.Connection(host=settings.hbase_host,\
# 							table_prefix=settings.hbase_table_prefix,\
# 							protocol="compact")


def create_table(table_name):
    try:
        with pool.connection() as conn:
            conn.create_table(table_name, {
                'index': dict(max_versions=1),
                'data': dict(max_versions=1)
            })
    except Exception as e:
        logging.exception(e)
        return False
    return True
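With table_prefix set, happybase namespaces table names transparently: conn.table('foo') talks to <prefix>_foo on the server, and conn.tables() lists only tables under that prefix, with the prefix stripped. A small sketch against the pool defined above; the table name 'jobs' is a placeholder:

with pool.connection() as conn:
    table = conn.table('jobs')  # resolves to <hbase_table_prefix>_jobs in HBase
    print(conn.tables())        # only tables under the prefix, names returned without it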
Example #10
    scorelist = []
    count = 0
    for y in x[1][1]:
        scorelist.append(y)
        count += 1
    scorelist.sort(reverse=True)
    # make sure we don't overwhelm users with too many photos under the same tag
    if count > NUM_PHOTO_PER_TAG:
        scorelist = scorelist[:NUM_PHOTO_PER_TAG]
    return x[0], (x[1][0], scorelist)


# sample input
# (2, (u"ptag2", [(0.7692307692307693, "{"photo": {"timeposted": 1422939564, "description": "pdes", "title": "ptitle", "URL": "purl", "tags": "ptag1,ptag2,ptag3", "pid": "101", "location": {"latitude": "plat", "longitude": "plon"}}, "numViewed": 3, "numLiked": 10}"), (0.4230769230769231, "{"photo": {"timeposted": 1422939564, "description": "pdes", "title": "ptitle", "URL": "purl", "tags": "ptag1,ptag2,ptag3", "pid": "103", "location": {"latitude": "plat", "longitude": "plon"}}, "numViewed": 15, "numLiked": 11}")]))

POOL = happybase.ConnectionPool(size=30, host="c0tl.com")


def writeToHBase(x):
    # print "count ", x[0]
    # print "tag name", str(x[1][0])
    # print "first photo", x[1][1][0]
    print "writing to hbase.., cout,", x[0]
    plist = x[1][1]
    pdict = {}
    for i in range(len(plist)):
        pdict[i] = json.loads(plist[i][1])
    with POOL.connection() as connection:
        tagview = connection.table('top_tags')
        rowkey = ("%016i" % int(x[0])).encode() + hashlib.md5(str(x[1][0]).encode()).digest()
        tagview.put(rowkey, {
Example #11
#! /usr/bin/python

import happybase
pool = happybase.ConnectionPool(1, host='localhost', port=9090)

from collections import defaultdict, namedtuple
import happybase

TaskContent = namedtuple('TaskContent', [
    'submit_at', 'from_reverse', 'site_asset_id', 'deadline', 'id', 'retries',
    'account', 'uuid', 'created_at', 'format', 'priority', 'scope',
    'queued_at', 'dna_url'
])

matches = [{
    'video_score': 99,
    'meta_uuid': '970ae0ba-773b-11e1-a7b2-080027cf46d6',
    'video_sample_offset': 0,
    'match_type': 'video',
    'meta_name': 'Auto_Rule306_Movie',
    'video_ref_offset': 0,
    'audio_sample_offset': 0,
    'audio_score': 0,
    'audio_duration': 0,
    'track_id': 0.0,
    'instance_id': '9752d1cc-773b-11e1-a7b2-080027cf46d6',
    'audio_ref_offset': 0,
    'clip_duration': 307,
    'media_type': 'video',
    'video_duration': 307,
    'instance_name': 'cappella.flv.xfp.0'
Example #12
import json
import datetime
import sys
import pandas as pd
import os
import logging

import happybase

from api2.mysql import mysql

# connection = happybase.Connection(host='120.27.241.54', transport='framed', protocol='compact')
# connection.open()
# table = connection.table('usersize_recommend')

pool = happybase.ConnectionPool(size=10,
                                host='120.27.241.54',
                                transport='framed',
                                protocol='compact')


class recommendProduct(object):
    """docstring for ClassName"""
    def __init__(self):
        pass

    def computedUserRecommendProd(self, userid):
        print(userid)
        # userid = userid.decode('utf8')
        # userid = json.loads(userid)
        # userid = userid['userid']
        logging.info(userid)
Example #13
# -*- coding: utf-8 -*-

import happybase

pool = happybase.ConnectionPool(host='localhost', port=9090, size=10)
Example #14
 def __init__(self):
     self.conn = hconn.ConnectionPool(size=8, host='133.0.6.89')
     self.table = b'vip:tian_yan'
     self.html_col = b'data:html'
     self.summary_col = b'data:summary'
Example #15
def create_data_from_station_data(first, second):
    """this function creates the data analyzing the two stations in comparison"""
    global hdfs
    #global hdfs object
    global hbase
    #global hbase object

    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu',
                               port='50070',
                               user_name='uacharya')

    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1,
                                         host='cshadoop.boisestate.edu')

    date_for_comparision = first["Date"].strip()

    # creating directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/single_screen/' +
                                 date_for_comparision)
    except Exception:
        # directory to hold the dataset in a csv file for each node in the wall display, numbered 1 to 9
        content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
        try:
            hdfs.create_file('user/uacharya/single_screen/' +
                             date_for_comparision + '/data/output.csv',
                             content,
                             replication=1)
        except Exception:
            pass

    dataset = {
        'node_1': [],
        'node_2': [],
        'node_3': []
    }

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)


#    for key in dataset:
#        if(len(dataset[key])!=0):
#            content = "\n".join(dataset[key]);
#            content +="\n";
#            while(True):
#                try:
#                    hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096);
#                    break;
#                except Exception:
#                    time.sleep(0.2);
#                    continue;

    dataset.clear()
    #clearing the dictionary
    # append over here after all the global variable has been made
    return second
Example #16
def save_hbase(entries):
    pool = happybase.ConnectionPool(size=3, host=HBASE_HOST)
    for entry in entries:
        with pool.connection() as connection:
            table = connection.table(HBASE_TABLE)
            table.put(entry[0], entry[1])
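Checking a connection out of the pool once per entry works, but pays a checkout per iteration and a round trip per put. A sketch of the same routine with a single checkout and a batch (HBASE_HOST and HBASE_TABLE as in the original; assumes entries are (row_key, data) pairs):

def save_hbase(entries):
    pool = happybase.ConnectionPool(size=3, host=HBASE_HOST)
    with pool.connection() as connection:
        table = connection.table(HBASE_TABLE)
        with table.batch() as b:  # sent once, when the block exits
            for row_key, data in entries:
                b.put(row_key, data)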
Example #17
    # escorts_images_sha1_infos_from_ts_subsampled_newformat => ht_images_infos_merged_subsampled
    # discard ad:*. ext:sbcmdline (at least for real transform). just do not put them in mappings
    # mappings should be an array of arrays like:
    # ["ext:dlib*", "data:dlib*"]
    # ["ext:sbpycaffe*", "data:sbpycaffe*"]
    # ["info:s3_url", "data:s3_url"]
    # Could be a parameter in conf
    HAPPYBASE_HOST = '10.108.16.137'

    # TODO: should we also transform update table?

    # Try to create "tab_name_out"
    HBASE_TIMEOUT = None
    NB_THREADS = 1
    POOL = happybase.ConnectionPool(size=NB_THREADS,
                                    host=HAPPYBASE_HOST,
                                    timeout=HBASE_TIMEOUT)
    with POOL.connection() as CONN:
        get_create_table(TAB_NAME_OUT, CONN, TAB_OUT_FAMILIES)

    # Setup spark job
    SC = SparkContext(appName='transform_' + TAB_NAME_IN + '_to_' +
                      TAB_NAME_OUT)
    SC.setLogLevel("ERROR")
    CONF = SparkConf()
    HBASE_MAN_IN = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_IN)
    HBASE_MAN_OUT = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_OUT)
    transform_table()

    print("Transformation completed.")
Example #18
import logging

from django.conf import settings
from django.http import HttpResponse

import happybase

logger = logging.getLogger(__name__)

N_KEYS = 10000

#
# Initialization
#
# Importing this module has side-effects; way to go Django. :s
#

pool = happybase.ConnectionPool(size=3, host=settings.HBASE_HOST)


def populate_table():
    with pool.connection() as connection:
        connection.delete_table(settings.HBASE_TABLE, disable=True)
        connection.create_table(settings.HBASE_TABLE, families={'cf': {}})
        table = connection.table(settings.HBASE_TABLE)
        with table.batch() as b:
            for i in range(N_KEYS):
                row_data = {'cf:col1': 'value-%d' % i}
                b.put('row-key-%d' % i, row_data)


with pool.connection() as connection:
    if settings.HBASE_TABLE not in connection.tables():
Example #19
import functools

import common
import crawler
import happybase
from . import misc

conf = common.args.hbase_conf
host = conf["hbase_thrift_host"]
port = conf["hbase_thrift_port"]
table_prefix = conf["table_prefix"]
table_prefix_separator = conf["table_prefix_separator"]
hbase_pool = happybase.ConnectionPool(
    size=3, 
    host=host, 
    port=port,
    table_prefix=table_prefix,
    table_prefix_separator=table_prefix_separator
)

# Module-level public interface, implemented with partial functions
# get_job_rule = functools.partial(misc._get_job_rule,hbase_pool)
# set_job_rule = functools.partial(misc._set_job_rule,hbase_pool)
# save_job = functools.partial(misc._save_job,hbase_pool)
# remove_job = functools.partial(misc._remove_job,hbase_pool)
# save_results = functools.partial(misc._save_results,hbase_pool)


def get_job_rule(job_name) -> crawler.CrawlJobCore:
    '''
        Fetch the crawl_job_core (crawl rule) from hbase
Example #20
        def get_similar_online_recall(rdd):
            import happybase
            pool = happybase.ConnectionPool(size=10,
                                            host='hadoop-master',
                                            port=9090)
            # Parse the contents of the rdd, then fetch similar articles and compute the recall
            # rdd.collect() turns the rdd's [Row(1,2,3), Row(4,5,6)] into [[1,2,3], [4,5,6]]
            for data in rdd.collect():
                # Filter the data dict by action type
                if data['param']['action'] in ["click", "collect", "share"]:
                    logger.info(
                        "{} INFO: get user_id:{} action:{}  log".format(
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            data['param']['userId'], data['param']['action']))
                    # Read articleId from param and look up similar articles
                    with pool.connection() as conn:
                        sim_table = conn.table("article_similar")
                        # Find the articles most similar (content-based) to the one in the clickstream log; take the TOP-k as the recall result
                        _dic = sim_table.row(str(
                            data["param"]["articleId"]).encode(),
                                             columns=[b"similar"])
                        if _dic:
                            logger.info("_dic is " + str(_dic))
                            # {b'similar:1': b'0.2', b'similar:2': b'0.34', b'similar:3': b'0.267', b'similar:4': b'0.56', b'similar:5': b'0.7', b'similar:6': b'0.819', b'similar:8': b'0.28'}

                            _srt = sorted(_dic.items(),
                                          key=lambda obj: obj[1],
                                          reverse=True)  # sort by similarity
                            logger.info("_srt is " + str(_srt))
                            # [(b'similar:6', b'0.819'), (b'similar:5', b'0.7'), (b'similar:4', b'0.56'), (b'similar:2', b'0.34'), (b'similar:8', b'0.28'), (b'similar:3', b'0.267'), (b'similar:1', b'0.2')]

                            topKSimIds = [
                                int(i[0].split(b":")[1]) for i in _srt[:10]
                            ]
                            logger.info("topKSimIds is " + str(topKSimIds))
                            # [6, 5, 4, 2, 8, 3, 1]

                            # Filter against history_recall (articles already recommended to this user)
                            history_table = conn.table("history_recall")

                            _history_data = history_table.cells(
                                b"reco:his:%s" %
                                data["param"]["userId"].encode(),
                                b"channel:%d" % data["channelId"])
                            logger.info("_history_data is " +
                                        str(_history_data))

                            history = []
                            if len(_history_data) >= 1:
                                for l in _history_data:
                                    history.extend(eval(l))
                            logger.info("history is " + str(history))

                            # Filter the recall results against the history records
                            recall_list = list(set(topKSimIds) - set(history))
                            logger.info("recall_list is " + str(recall_list))

                            # If any results remain, add them to the cb_recall table and record them in the history table
                            logger.info(
                                "{} INFO: store online recall data:{}".format(
                                    datetime.now().strftime(
                                        '%Y-%m-%d %H:%M:%S'),
                                    str(recall_list)))

                            if recall_list:
                                recall_table = conn.table("cb_recall")

                                recall_table.put(
                                    b"recall:user:%s" %
                                    data["param"]["userId"].encode(), {
                                        b"online:%d" % data["channelId"]:
                                        str(recall_list).encode()
                                    })

                                history_table.put(
                                    b"reco:his:%s" %
                                    data["param"]["userId"].encode(), {
                                        b"channel:%d" % data["channelId"]:
                                        str(recall_list).encode()
                                    })

                        # the pool reclaims the connection when the with-block exits
                        logger.info("-" * 30)
Example #21
from kafka import KafkaConsumer
import time
import happybase
import json

hbase_ip = '127.0.0.1'
hbase_port = 9090
ip = hbase_ip
port = hbase_port
pool = happybase.ConnectionPool(size=3, host=ip)


# Insert rows into tableName
def hbase_load(tableName, lists):
    with pool.connection() as connection:
        # no connection.open() needed: the pool hands out open connections
        if tableName not in str(connection.tables()):
            create_table(connection, tableName)
        # print(tableName, str(connection.tables()))
        table = connection.table(tableName)
        b = table.batch(batch_size=1024)
        for li in lists:
            try:
                rowkey = li['info']
                data_dicts = {}
                for d, x in li.items():
                    key = "ss:" + d
                    value = str(x)
                    data_dicts[key] = value
                b.put(row=rowkey, data=data_dicts)
                b.send()
Example #22
import happybase

# gives error
# TSocket read 0 bytes
# [Errno 32] Broken pipe

if __name__ == "__main__":
    pool = happybase.ConnectionPool(size=1, host="10.1.94.57")
    with pool.connection() as conn:
        table_name = "escorts_images_sha1_infos_dev"
        hbase_table = conn.table(table_name)
        batch_list_queries = ["000421227D83DA48DB4A417FCEFCA68272398B8E"]
        rows = hbase_table.rows(batch_list_queries)
        print(rows)
Example #23
def insert_row(batch, row):
    batch.put(str(row), {"data:value": str(row + 10)})
    print "Insert row %i" % (row)


def delete_row(batch, row):
    batch.delete(str(row))
    print "Delete row %i" % (row)


# Start to run
# connection, table, batch = connect_to_hbase()
pool = happybase.ConnectionPool(size=3,
                                host=host,
                                table_prefix=namespace,
                                table_prefix_separator=':',
                                port=9090)
with pool.connection() as connection:
    # print "Connect to HBase. batch size: %i" % (batch_size)
    print(connection.tables())
    table = connection.table(name=table_name)
    batch = table.batch(batch_size=batch_size)

    for row in range(1, 10000):
        insert_row(batch, row)

    batch.send()

#    with batch:
#        insert_row(batch, row)
Example #24
    print u"%s 结束HBASE插入" % time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(time.time()))
    print u"插入耗时: %s s" % ((end_put_time - start_put_time).seconds)
    #b.send()
    # GMT handling
    second_storing = Second_Storing()
    second_storing.merge_GMT_time(file[0:21])

    happybase_end_time = datetime.datetime.now()
    #print u"存入耗时: %s"%((happybase_end_time - happybase_start_time).seconds)


pool = happybase.ConnectionPool(
    size=66,
    host='10.210.180.43',
    port=9090,
    timeout=None,
    autoconnect=True,
    compat='0.94',
)
from multiprocessing import Pool
import os, time, random


def put_data(table_name, list_put_table_data, counter_list_all_para):

    cut_number = (counter_list_all_para // 74) + 1
    #cut_number = 1
    print "进程数: %s" % cut_number
    list_cut = div_list(list_put_table_data, cut_number)
    print(list_cut[0][0][0], list_cut[0][-1][0])
    # print(list_cut[1][0][0], list_cut[1][-1][0])
Example #25
# -*- coding: UTF-8 -*-

import happybase
from setting.default import DefaultConfig
import redis

pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)

# Recall data
# With decode_responses=True, values read back are str; without it they come back as bytes.
redis_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=10,
                                 decode_responses=True)

# Redis database used for caching
# With decode_responses=True, values read back are str; without it they come back as bytes.
cache_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=8,
                                 decode_responses=True)
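A quick illustration of what decode_responses changes, assuming a reachable local Redis and a hypothetical key name:

import redis

r_bytes = redis.StrictRedis(host='localhost', port=6379, db=0)
r_str = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

r_bytes.set('demo:key', 'hello')
print(r_bytes.get('demo:key'))  # b'hello' (bytes)
print(r_str.get('demo:key'))    # 'hello' (str)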

# Used by the ranking logic in sort_service.py
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Spark configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_GRPC_CONFIG)
SORT_SPARK = SparkSession.builder.config(conf=conf).getOrCreate()
Example #26
 def get_connection_pool(self, size=128, **kw):
     self.pool = happybase.ConnectionPool(**kw, size=size)
     return self.pool
Example #27
 def get_hb_conn():
     hbase_pool = happybase.ConnectionPool(size=2, host=Read().hbase_host)
     return hbase_pool
Example #28
 print("this is the driver container")
 # getting the header of the whole dataset
 header = distributed_dataset.first()
 # filtering the header out of the data
 distributed_dataset = distributed_dataset.filter(lambda d: d != header)
 # mapping the data to prepare for processing
 data_in_required_format = distributed_dataset.map(
     create_required_datewise_data)
 data_in_required_format.cache()
 #collecting keys to do batch processing based on keys
 temp = set(data_in_required_format.keys().collect())
 print("total keys " + str(len(temp)))
 #sorting keys to create data in chronological order based on date
 sorted_keys = sorted(temp, key=int)
 #connecting to database for writing checker data
 database = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')
 #getting a connection from the pool
 #    with database.connection() as db:
 #        db.create_table('fChecker'.encode(),{'f'.encode():dict(max_versions=1,in_memory=True)});
 #creating batch processing with new rdd each iteration based on key values
 for key in sorted_keys[:2]:
     print(key)
     keyed_rdd = data_in_required_format.filter(lambda t: t[0] == key).map(
         lambda t: t[1]).coalesce(48, shuffle=True)
     keyed_rdd.cache()
     #collecting all the dataset for broadcasting
     broadcast_data = keyed_rdd.collect()
     print(str(len(broadcast_data)) + " driver program")
     #        l = keyed_rdd.glom().map(len).collect()  # get length of each partition
     #        print(min(l), max(l), sum(l)/len(l), len(l))  # check if skewed
     #        broadcasting the entire keyed dataset
Example #29

def user_table():
    with pool.connection() as connection:
        user = connection.table('user')
        scanner = user.scan()
        for key, data in scanner:
            print(key, data)

def movie_table():
    with pool.connection() as connection:
        connection.enable_table('movie')
        movie = connection.table('movie')
        scanner = movie.scan()
        for key, data in scanner:
            print(key, data)

pool = happybase.ConnectionPool(size=3, host='hadoop_env.com', table_prefix='pool_test')

try:
    # note: the Python 2 "thread" module is named "_thread" in Python 3
    thread.start_new_thread(user_table, ())
    thread.start_new_thread(movie_table, ())
except Exception:
    print("Error: unable to start thread")
Example #30
 def __init__(self, host, table_prefix, table_name):
     self.pool = hb.ConnectionPool(size=16,
                                   host=host,
                                   autoconnect=True,
                                   table_prefix=table_prefix)
     self.table_name = table_name