예제 #1
0
class Video(object):
    """Sync worker that indexes ``VIDEO_DATA_TS_TABLE`` rows into the ``video`` index.

    Rowkeys are fetched with ``redis_con.get_rowkey("video")``, documents with
    ``hbase_con.getSuanfaResultByRowkey`` and writes are sent in batches
    through ``helpers.bulk``.
    """

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_SF_ADDR)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_SF_ADDR)

    def run(self):
        """Loop forever, collecting index actions and flushing them in batches.

        A batch is flushed when more than ``COUNT_NUM`` actions were collected,
        when more than 30 seconds passed since the previous flush, or when the
        queue returns ``None`` (after which the loop sleeps for 10 seconds).
        """
        pending = []
        batched = 0
        last_flush = int(time.time())
        while True:
            rowkey = self.redis_con.get_rowkey("video")
            if rowkey is None:
                # Nothing queued: push out what is buffered, then back off.
                if pending:
                    self.commit(pending)
                    pending.clear()
                    batched = 0
                    last_flush = int(time.time())
                time.sleep(10)
                continue

            # Only the part in front of the "|||||" separator is the rowkey.
            if "|||||" in rowkey:
                rowkey, _, _ = rowkey.partition("|||||")

            source = self.hbase_con.getSuanfaResultByRowkey(
                "VIDEO_DATA_TS_TABLE", rowkey, "video")
            if not source:
                continue

            pending.append({
                "_index": "video",
                "_type": "sino",
                "_id": rowkey,
                "_source": source,
            })
            now = int(time.time())
            batched += 1
            if not (batched > COUNT_NUM or (now - last_flush) > 30):
                continue

            if pending:
                self.es_ping()
                self.commit(pending)
                last_flush = int(time.time())
                pending.clear()
            batched = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``; on any exception log it and retry once.

        A failure of the retry propagates to the caller.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as exc:
            logging.error("index:video,\terror:" + str(exc))
            helpers.bulk(self.es, action_list)
        logging.info("提交成功")
예제 #2
0
class HarmInsertInfo(object):
    """Worker that upserts single documents named by ``es:harm:insert:info`` entries.

    Originally created by yangjt.
    """

    def __init__(self):
        """Initialise the HBase, Redis and Elasticsearch connections."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_ADDR)

    def run(self):
        """Loop forever, upserting one document per queue entry.

        Each entry is evaluated into a ``(rowkey, table)`` pair. When the
        document id already exists in the index named by
        ``HARM_INFO_ZIDUAN[table]`` it is partially updated, otherwise the
        HBase result is indexed as a new document. Entries whose HBase result
        is empty are skipped.
        """
        hashed_id_tables = ("WECHAT_INFO_TABLE", "INFO_TABLE", "MONITOR_INFO_TABLE")
        while True:
            result = self.redis_con.get_yy_rowkey("es:harm:insert:info")
            logging.info(result)
            # NOTE(review): eval() executes whatever expression the queue entry
            # holds. Kept as-is for compatibility; if entries are always plain
            # tuple literals, ast.literal_eval would be the safer parser.
            rowkey, table = eval(result)
            # For these three tables the document id is trans_md5(rowkey).
            doc_id = trans_md5(rowkey) if table in hashed_id_tables else rowkey
            logging.info("表格%s的rowkey的值为:%s" % (table, rowkey))

            index_name = HARM_INFO_ZIDUAN[table]
            fields = self.hbase_con.getResultByRowkey(table, rowkey, index_name)
            if not fields:
                continue

            self.es_ping()
            if self.es.exists(index_name, "sino", doc_id):
                logging.info("rowkey值已存在")
                self.es.update(index_name, doc_type="sino", id=doc_id,
                               body={"doc": fields})
                logging.info("%s数据更新成功" % doc_id)
            else:
                logging.info("rowkey值:%s不存在" % doc_id)
                self.es.index(index_name, doc_type="sino", id=doc_id, body=fields)
예제 #3
0
class YyUrUser(object):
    """Worker that indexes ``UR_USER_TABLE`` rows into the ``ur_follow`` index."""

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisYyTools()
        self.es = Elasticsearch(ES_ADDR)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_ADDR)

    def run(self):
        """Loop forever, indexing one document per ``es:ur:insert:info`` entry.

        The HBase result is indexed as returned; it is not checked for being
        empty before the write.
        """
        while True:
            rowkey = self.redis_con.get_yy_rowkey("es:ur:insert:info")
            user_doc = self.hbase_con.getYyResultByRowkey("UR_USER_TABLE", rowkey)
            # Make sure the client still answers before every single write.
            self.es_ping()
            self.es.index("ur_follow", doc_type="sino", id=rowkey, body=user_doc)
예제 #4
0
 def __init__(self):
     """Instantiate ``HbaseInfoTask``, ``RedisTools`` and ``Elasticsearch(ES_ADDR)``.

     The three objects are stored on the instance as ``hbase_con``,
     ``redis_con`` and ``es``.
     """
     self.hbase_con = HbaseInfoTask()
     self.redis_con = RedisTools()
     self.es = Elasticsearch(ES_ADDR)
예제 #5
0
class GetNewsUser(object):
    """Sync worker that upserts ``NEWS_PERSON_TABLE`` rows into the ``xw_user`` index.

    Rowkeys are fetched with ``redis_con.get_rowkey("xw_user")`` and all
    writes are sent in batches through ``helpers.bulk``.
    """

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_ADDR)

    def run(self):
        """Loop forever, batching update/index actions for ``xw_user``.

        A queue entry may carry a comma-separated field list after a
        ``|||||`` separator; that list is passed to HBase only when the
        document already exists. Rowkeys longer than 500 characters are
        logged and skipped. A batch is flushed when more than ``COUNT_NUM``
        actions were collected, when more than 10 seconds passed since the
        previous flush, or when the queue returns ``None``.
        """
        pending = []
        batched = 0
        last_flush = int(time.time())
        updates = 0  # update actions queued since the last flush
        while True:
            rowkey = self.redis_con.get_rowkey("xw_user")
            if rowkey is None:
                # Nothing queued: push out what is buffered, then back off.
                if pending:
                    logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                    updates = 0
                    self.commit(pending)
                    pending.clear()
                    last_flush = int(time.time())
                    batched = 0
                time.sleep(10)
                continue

            param = None
            if "|||||" in rowkey:
                segments = rowkey.split("|||||")
                rowkey, param = segments[0], segments[1].split(",")

            if len(rowkey) > 500:
                logging.warning("id:%s长度超过500" % rowkey)
                continue

            if self.es.exists("xw_user", "sino", rowkey):
                fields = self.hbase_con.getResultByRowkey(
                    "NEWS_PERSON_TABLE", rowkey, "xw_user", param)
                if not fields:
                    continue
                action = {
                    "_index": "xw_user",
                    "_type": "sino",
                    "_id": rowkey,
                    "_op_type": "update",
                    "doc": fields,
                }
                updates += 1
            else:
                fields = self.hbase_con.getResultByRowkey(
                    "NEWS_PERSON_TABLE", rowkey, "xw_user")
                if not fields:
                    continue
                action = {
                    "_index": "xw_user",
                    "_type": "sino",
                    "_id": rowkey,
                    "_source": fields,
                }

            pending.append(action)
            now = int(time.time())
            batched += 1
            if not (batched > COUNT_NUM or (now - last_flush) > 10):
                continue

            if pending:
                self.es_ping()
                logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                updates = 0
                self.commit(pending)
                last_flush = int(time.time())
                pending.clear()
            batched = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``; on any exception log it and retry once.

        A failure of the retry propagates to the caller.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as exc:
            logging.error("index:xw_user,\terror:" + str(exc))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
예제 #6
0
 def __init__(self):
     """Instantiate the HBase, Redis and Elasticsearch helpers and zero ``insert_count``.

     The Elasticsearch client is created with ``timeout=30``.
     """
     self.hbase_con = HbaseInfoTask()
     self.redis_con = RedisTools()
     self.es = Elasticsearch(ES_ADDR,timeout=30)
     self.insert_count = 0
예제 #7
0
class GetInfo(object):
    """Sync worker that upserts ``INFO_TABLE`` rows into the ``xw_info`` index.

    Existing documents are updated in batches through ``helpers.bulk``; new
    documents are indexed one at a time and counted in ``insert_count``.
    """

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)
        self.insert_count = 0

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Loop forever, updating or inserting ``xw_info`` documents.

        The document id is ``trans_md5(rowkey)``. A queue entry may carry a
        comma-separated field list after a ``|||||`` separator; that list is
        passed to HBase only when the document already exists. Buffered
        updates are flushed when more than ``COUNT_NUM`` were collected, when
        more than 10 seconds passed since the previous flush, or when the
        queue returns ``None``.
        """
        pending = []
        batched = 0
        last_flush = int(time.time())
        updates = 0  # update actions queued since the last flush
        while True:
            rowkey = self.redis_con.get_rowkey("xw_info")
            if rowkey is None:
                # Nothing queued: push out what is buffered, then back off.
                if pending:
                    logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                    updates = 0
                    self.commit(pending)
                    pending.clear()
                    last_flush = int(time.time())
                    batched = 0
                time.sleep(10)
                continue

            param = None
            if "|||||" in rowkey:
                segments = rowkey.split("|||||")
                rowkey, param = segments[0], segments[1].split(",")

            doc_id = trans_md5(rowkey)
            if not self.es.exists("xw_info", "sino", doc_id):
                # New document: written immediately, never batched.
                # NOTE(review): this path skips the flush check below, so
                # buffered updates are only flushed when another update is
                # queued or the queue runs empty.
                fresh = self.hbase_con.getResultByRowkey(
                    "INFO_TABLE", rowkey, "xw_info")
                if not fresh:
                    continue
                self.es.index(index="xw_info", doc_type="sino", id=doc_id,
                              body=fresh)
                self.insert_count += 1
                continue

            changed = self.hbase_con.getResultByRowkey(
                "INFO_TABLE", rowkey, "xw_info", param)
            if not changed:
                continue
            updates += 1
            pending.append({
                "_index": "xw_info",
                "_type": "sino",
                "_id": doc_id,
                "_op_type": "update",
                "doc": changed,
            })
            now = int(time.time())
            batched += 1
            if batched > COUNT_NUM or (now - last_flush) > 10:
                self.es_ping()
                logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                updates = 0
                # The flush timer is restarted before the bulk request is sent.
                last_flush = int(time.time())
                self.commit(pending)
                pending.clear()
                batched = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``; on any exception log it and retry once.

        A failure of the retry propagates to the caller. Afterwards the
        running ``insert_count`` and the batch size are logged.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as exc:
            logging.error("index:xw_info index,\terror:" + str(exc))
            helpers.bulk(self.es, action_list)
        logging.warning("新增存入elasticsearch当中%d条数据" % self.insert_count)
        logging.warning("提交成功:%d条数据" % len(action_list))
예제 #8
0
class GetWechatInfo(object):
    """Sync worker that upserts ``WECHAT_INFO_TABLE`` rows into the ``wx_info`` index.

    Originally created by yangjt.
    """

    def __init__(self):
        """Initialise the HBase, redis-cluster and Elasticsearch connections."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Synchronise ``WECHAT_INFO_TABLE`` data into Elasticsearch, forever.

        A batch is flushed when more than ``COUNT_NUM`` actions were
        collected, when more than 30 seconds passed since the previous flush,
        or when the queue returns ``None``.
        """
        pending = []
        batched = 0
        last_flush = int(time.time())
        updates = 0  # update actions queued since the last flush
        while True:
            # Fetch the next value that has to be synchronised.
            rowkey = self.redis_con.get_rowkey("wx_info")
            # blpop cannot be used here, so an empty queue shows up as None.
            if rowkey is None:
                # No incoming data: store whatever has accumulated so far.
                if pending:
                    logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                    updates = 0
                    self.commit(pending)
                    pending.clear()
                    last_flush = int(time.time())
                    batched = 0
                time.sleep(10)
                continue

            # Separate the rowkey from the optional list of fields to sync.
            param = None
            if "|||||" in rowkey:
                segments = rowkey.split("|||||")
                rowkey, param = segments[0], segments[1].split(",")

            # The HBase rowkey is converted with trans_md5 to get the document id.
            doc_id = trans_md5(rowkey)
            # Existence check against Elasticsearch (per the original author:
            # a HEAD request taking about 30 ms).
            if self.es.exists("wx_info", "sino", doc_id):
                # Already present: upload as an update.
                fields = self.hbase_con.getResultByRowkey(
                    "WECHAT_INFO_TABLE", rowkey, "wx_info", param)
                if not fields:
                    continue
                action = {
                    "_index": "wx_info",
                    "_type": "sino",
                    "_id": doc_id,
                    "_op_type": "update",
                    "doc": fields,
                }
                updates += 1
            else:
                # Not present: upload as an insert; the field restriction is
                # not applied in this case.
                fields = self.hbase_con.getResultByRowkey(
                    "WECHAT_INFO_TABLE", rowkey, "wx_info")
                if not fields:
                    continue
                action = {
                    "_index": "wx_info",
                    "_type": "sino",
                    "_id": doc_id,
                    "_source": fields,
                }

            pending.append(action)
            now = int(time.time())
            batched += 1
            # Submit once the batch exceeds COUNT_NUM or the last submission
            # is more than 30 seconds old.
            if not (batched > COUNT_NUM or (now - last_flush) > 30):
                continue
            self.es_ping()
            logging.warning("重复存入elasticsearch当中%d条数据" % updates)
            updates = 0
            self.commit(pending)
            last_flush = int(time.time())
            pending.clear()
            batched = 0

    def commit(self, action_list):
        """Bulk-upload ``action_list``; on any exception log it and retry once.

        Each insert action has the shape::

            {
                "_index": "wx_info",
                "_type": "sino",
                "_id": "",
                "_source": {"key": "value"}
            }

        A failure of the retry propagates to the caller.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as exc:
            logging.error("index:wechat,\terror:" + str(exc))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
예제 #9
0
class Image(object):
    """Sync worker that upserts ``IMAGE_DATA_TABLE`` rows into the ``image`` index.

    Existing documents are updated in batches through ``helpers.bulk``; new
    documents are indexed one at a time and counted in ``insert_count``.
    """

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_SF_ADDR)
        self.insert_count = 0

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_SF_ADDR)

    def run(self):
        """Loop forever, updating or inserting ``image`` documents.

        A queue entry may carry a comma-separated field list after a
        ``|||||`` separator; that list is passed to HBase only when the
        document already exists. Buffered updates are flushed when more than
        ``COUNT_NUM`` were collected, when more than 30 seconds passed since
        the previous flush, or when the queue returns ``None``.
        """
        pending = []
        batched = 0
        last_flush = int(time.time())
        updates = 0  # update actions queued since the last flush
        while True:
            rowkey = self.redis_con.get_rowkey("image")
            if rowkey is None:
                # Nothing queued: push out what is buffered, then back off.
                if pending:
                    logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                    updates = 0
                    self.commit(pending)
                    pending.clear()
                    batched = 0
                    last_flush = int(time.time())
                time.sleep(10)
                continue

            param = None
            if "|||||" in rowkey:
                segments = rowkey.split("|||||")
                rowkey, param = segments[0], segments[1].split(",")

            if not self.es.exists("image", "sino", rowkey):
                # New document: written immediately, never batched.
                # NOTE(review): this path skips the flush check below, so
                # buffered updates are only flushed when another update is
                # queued or the queue runs empty.
                fresh = self.hbase_con.getSuanfaResultByRowkey(
                    "IMAGE_DATA_TABLE", rowkey, "image")
                if not fresh:
                    continue
                try:
                    self.es.index(index="image", doc_type="sino", id=rowkey,
                                  body=fresh)
                    self.insert_count += 1
                except Exception as exc:
                    # A failed single insert is logged and the row is dropped.
                    logging.error("单条插入错误:%s" % str(exc))
                continue

            changed = self.hbase_con.getSuanfaResultByRowkey(
                "IMAGE_DATA_TABLE", rowkey, "image", param)
            if not changed:
                continue
            updates += 1
            pending.append({
                "_index": "image",
                "_type": "sino",
                "_id": rowkey,
                "_op_type": "update",
                "doc": changed,
            })
            now = int(time.time())
            batched += 1
            if not (batched > COUNT_NUM or (now - last_flush) > 30):
                continue

            logging.warning("重复存入elasticsearch当中%d条数据" % updates)
            updates = 0
            if pending:
                self.es_ping()
                self.commit(pending)
                last_flush = int(time.time())
                pending.clear()
            batched = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``; on any exception log it and retry once.

        A failure of the retry propagates to the caller.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as exc:
            logging.error("index:image,\terror:" + str(exc))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
예제 #10
0
class GetForumInfo(object):
    """Sync worker that upserts ``MONITOR_INFO_TABLE`` rows into the ``forum_info`` index.

    Rowkeys are fetched with ``redis_con.get_rowkey("forum_info")`` and all
    writes are sent in batches through ``helpers.bulk``.
    """

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Loop forever, batching update/index actions for ``forum_info``.

        The document id is ``trans_md5(rowkey)``. A queue entry may carry a
        comma-separated field list after a ``|||||`` separator; that list is
        passed to HBase only when the document already exists. A batch is
        flushed when more than ``COUNT_NUM`` actions were collected, when
        more than 30 seconds passed since the previous flush, or when the
        queue returns ``None`` (after which the loop sleeps for 10 seconds).
        """
        action_list = []
        count = 0
        start = int(time.time())
        cunzai = 0  # update actions queued since the last flush
        while True:
            rowkey = self.redis_con.get_rowkey("forum_info")
            if rowkey is None:
                # Nothing queued: push out what is buffered, then back off.
                if action_list:
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                    start = int(time.time())
                    count = 0
                time.sleep(10)
                continue

            param = None
            if "|||||" in rowkey:
                segments = rowkey.split("|||||")
                rowkey, param = segments[0], segments[1].split(",")

            _id = trans_md5(rowkey)
            if self.es.exists("forum_info", "sino", _id):
                fields = self.hbase_con.getResultByRowkey(
                    "MONITOR_INFO_TABLE", rowkey, "forum_info", param)
                if not fields:
                    continue
                # Count the update only once it is really queued; counting
                # before the empty-result check inflated the logged total
                # with rows that were skipped.
                cunzai += 1
                action_list.append({
                    "_op_type": "update",
                    "_index": "forum_info",
                    "_type": "sino",
                    "_id": _id,
                    "doc": fields,
                })
            else:
                fields = self.hbase_con.getResultByRowkey(
                    "MONITOR_INFO_TABLE", rowkey, "forum_info")
                if not fields:
                    continue
                action_list.append({
                    "_index": "forum_info",
                    "_type": "sino",
                    "_id": _id,
                    "_source": fields,
                })

            end = int(time.time())
            count += 1
            if count > COUNT_NUM or (end - start) > 30:
                self.es_ping()
                logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                cunzai = 0
                if action_list:
                    self.commit(action_list)
                start = int(time.time())
                action_list.clear()
                count = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``; on any exception log it and retry once.

        A failure of the retry propagates to the caller.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            logging.error("index:forum_info,\terror:" + str(e))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
예제 #11
0
class GetSiteRecord(object):
    """Sync worker that upserts ``SITE_RECORD`` rows into the ``site_record`` index.

    Rowkeys are fetched with ``redis_con.get_rowkey("site_record")`` and all
    writes are sent in batches through ``helpers.bulk``.
    """

    def __init__(self):
        """Instantiate the HBase, Redis and Elasticsearch helpers used by ``run``."""
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Swap in a new ``Elasticsearch`` client when ``ping()`` reports failure."""
        if self.es.ping():
            return
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Loop forever, batching update/index actions for ``site_record``.

        A queue entry may carry a comma-separated field list after a
        ``|||||`` separator; that list is passed to HBase only when the
        document already exists. A batch is flushed when more than
        ``COUNT_NUM`` actions were collected, when more than 30 seconds
        passed since the previous flush, or when the queue returns ``None``.
        """
        pending = []
        batched = 0
        last_flush = int(time.time())
        updates = 0  # update actions queued since the last flush
        while True:
            rowkey = self.redis_con.get_rowkey("site_record")
            if rowkey is None:
                # Nothing queued: push out what is buffered, then back off.
                if pending:
                    logging.warning("重复存入elasticsearch当中%d条数据" % updates)
                    updates = 0
                    self.commit(pending)
                    pending.clear()
                    last_flush = int(time.time())
                    batched = 0
                time.sleep(10)
                continue

            param = None
            if "|||||" in rowkey:
                segments = rowkey.split("|||||")
                rowkey, param = segments[0], segments[1].split(",")

            if self.es.exists("site_record", "sino", rowkey):
                fields = self.hbase_con.getResultByRowkey(
                    "SITE_RECORD", rowkey, "site_record", param)
                if not fields:
                    continue
                action = {
                    "_index": "site_record",
                    "_type": "sino",
                    "_id": rowkey,
                    "_op_type": "update",
                    "doc": fields,
                }
                updates += 1
            else:
                fields = self.hbase_con.getResultByRowkey(
                    "SITE_RECORD", rowkey, "site_record")
                if not fields:
                    continue
                action = {
                    "_index": "site_record",
                    "_type": "sino",
                    "_id": rowkey,
                    "_source": fields,
                }

            pending.append(action)
            now = int(time.time())
            batched += 1
            if not (batched > COUNT_NUM or (now - last_flush) > 30):
                continue
            self.es_ping()
            logging.warning("重复存入elasticsearch当中%d条数据" % updates)
            updates = 0
            self.commit(pending)
            last_flush = int(time.time())
            pending.clear()
            batched = 0

    def commit(self, action_list):
        """Bulk-send ``action_list``; on any exception log it and retry once.

        A failure of the retry propagates to the caller.
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as exc:
            logging.error("index:site_record,\terror:" + str(exc))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
예제 #12
0
 def __init__(self):
     """Instantiate the HBase, Redis and Elasticsearch helpers kept on the instance.

     The Elasticsearch client is created with ``timeout=ELASTIC_TIMEOUT``.
     """
     self.hbase_con = HbaseInfoTask()
     self.redis_con = RedisTools()
     self.es = Elasticsearch(ES_ADDR, timeout=ELASTIC_TIMEOUT)