class Video(object):
    """Syncs video rows from HBase into the ``video`` ES index.

    Rowkeys arrive via a redis queue; documents are buffered and
    bulk-committed by size or on a 30-second timer.
    """

    def __init__(self):
        # Clients for the backing stores (HBase, redis, Elasticsearch).
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_SF_ADDR)

    def es_ping(self):
        """Rebuild the ES client whenever a ping fails."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_SF_ADDR)

    def run(self):
        """Poll redis forever, batching HBase rows into bulk ES actions."""
        batch = []
        batched = 0
        last_flush = int(time.time())
        while True:
            rowkey = self.redis_con.get_rowkey("video")
            if rowkey is None:
                # Queue drained: flush whatever is buffered, then back off.
                if batch:
                    self.commit(batch)
                    batch.clear()
                    batched = 0
                    last_flush = int(time.time())
                time.sleep(10)
                continue
            # Strip an optional "|||||"-separated suffix from the rowkey.
            if "|||||" in rowkey:
                rowkey = rowkey.split("|||||")[0]
            record = self.hbase_con.getSuanfaResultByRowkey(
                "VIDEO_DATA_TS_TABLE", rowkey, "video")
            if not record:
                continue
            batch.append({
                "_index": "video",
                "_type": "sino",
                "_id": rowkey,
                "_source": record,
            })
            batched += 1
            now = int(time.time())
            # Flush by size or after 30 seconds since the previous flush.
            if batched > COUNT_NUM or (now - last_flush) > 30:
                if batch:
                    self.es_ping()
                    self.commit(batch)
                    last_flush = int(time.time())
                    batch.clear()
                    batched = 0

    def commit(self, action_list):
        """Bulk-index *action_list*, retrying once after a logged failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            logging.error("index:video,\terror:" + str(e))
            helpers.bulk(self.es, action_list)
        logging.info("提交成功")
class HarmInsertInfo(object):
    """Synchronizes "harm info" rows from HBase into Elasticsearch.

    Consumes ``(rowkey, table_name)`` tuples from the redis queue
    ``es:harm:insert:info`` and upserts the corresponding HBase row into
    the ES index mapped by ``HARM_INFO_ZIDUAN``.
    """

    def __init__(self):
        # Clients for HBase, redis and the ES cluster.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR)

    def es_ping(self):
        """Recreate the ES client if the current connection is dead."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR)

    def run(self):
        """Consume the redis queue forever, upserting one row at a time.

        Each queue entry is the textual repr of a ``(rowkey, table_name)``
        tuple.
        """
        import ast  # local import: safe parsing of queue entries

        while True:
            result = self.redis_con.get_yy_rowkey("es:harm:insert:info")
            logging.info(result)
            # SECURITY FIX: the entry comes from redis.  literal_eval only
            # accepts Python literals, unlike eval() which would execute
            # arbitrary code injected into the queue.
            rowkey, table = ast.literal_eval(result)
            _id = rowkey
            # These tables use md5(rowkey) as the ES document id.
            if table in ("WECHAT_INFO_TABLE", "INFO_TABLE", "MONITOR_INFO_TABLE"):
                _id = trans_md5(rowkey)
            log_info = "表格%s的rowkey的值为:%s" % (table, rowkey)
            logging.info(log_info)
            record = self.hbase_con.getResultByRowkey(table, rowkey, HARM_INFO_ZIDUAN[table])
            if not record:
                continue
            self.es_ping()
            # Update when the document already exists, otherwise insert.
            exists = self.es.exists(HARM_INFO_ZIDUAN[table], "sino", _id)
            if exists:
                doc = {"doc": record}
                log_info = "rowkey值已存在"
                logging.info(log_info)
                self.es.update(HARM_INFO_ZIDUAN[table], doc_type="sino", id=_id, body=doc)
                log_info = "%s数据更新成功" % _id
                logging.info(log_info)
            else:
                log_info = "rowkey值:%s不存在" % _id
                logging.info(log_info)
                self.es.index(HARM_INFO_ZIDUAN[table], doc_type="sino", id=_id, body=record)
class YyUrUser(object):
    """Mirrors UR_USER_TABLE rows into the ``ur_follow`` ES index."""

    def __init__(self):
        # Clients for the backing stores.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisYyTools()
        self.es = Elasticsearch(ES_ADDR)

    def es_ping(self):
        """Rebuild the ES client whenever a ping fails."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR)

    def run(self):
        """Consume rowkeys from redis forever, indexing one document each."""
        while True:
            rowkey = self.redis_con.get_yy_rowkey("es:ur:insert:info")
            doc = self.hbase_con.getYyResultByRowkey("UR_USER_TABLE", rowkey)
            self.es_ping()
            self.es.index("ur_follow", doc_type="sino", id=rowkey, body=doc)
def __init__(self):
    """Create the HBase, redis and Elasticsearch clients this worker uses."""
    # NOTE(review): enclosing class is outside this view — presumably one
    # of the sibling HBase→ES sync workers; confirm against the full file.
    self.hbase_con = HbaseInfoTask()  # HBase accessor
    self.redis_con = RedisTools()     # redis queue client
    self.es = Elasticsearch(ES_ADDR)  # ES cluster client
class GetNewsUser(object):
    """Syncs NEWS_PERSON_TABLE rows into the ``xw_user`` ES index.

    Rowkeys are pulled from redis; existing documents become bulk
    updates, new ones bulk inserts, flushed by size or every 10 seconds.
    """

    def __init__(self):
        # Clients for the backing stores.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR)

    def es_ping(self):
        """Rebuild the ES client whenever a ping fails."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR)

    def run(self):
        """Poll redis forever, batching bulk actions for ``xw_user``."""
        batch = []
        batched = 0
        last_flush = int(time.time())
        duplicates = 0  # docs that already existed (batched as updates)
        while True:
            rowkey = self.redis_con.get_rowkey("xw_user")
            if rowkey is None:
                # Queue drained: flush pending actions, then back off.
                if batch:
                    logging.warning("重复存入elasticsearch当中%d条数据" % duplicates)
                    duplicates = 0
                    self.commit(batch)
                    batch.clear()
                    last_flush = int(time.time())
                    batched = 0
                time.sleep(10)
                continue
            fields = None
            # "rowkey|||||f1,f2" restricts the sync to the listed fields.
            if "|||||" in rowkey:
                fields = rowkey.split("|||||")[1].split(",")
                rowkey = rowkey.split("|||||")[0]
            # Oversized ids are rejected rather than indexed.
            if len(rowkey) > 500:
                logging.warning("id:%s长度超过500" % rowkey)
                continue
            found = self.es.exists("xw_user", "sino", rowkey)
            action = {"_index": "xw_user", "_type": "sino", "_id": ""}
            if found:
                record = self.hbase_con.getResultByRowkey("NEWS_PERSON_TABLE", rowkey, "xw_user", fields)
                if not record:
                    continue
                action["_op_type"] = "update"
                action['doc'] = record
                duplicates += 1
            else:
                record = self.hbase_con.getResultByRowkey("NEWS_PERSON_TABLE", rowkey, "xw_user")
                if not record:
                    continue
                action['_source'] = record
            action['_id'] = rowkey
            batch.append(action)
            batched += 1
            now = int(time.time())
            # Flush by size or after 10 seconds since the last flush.
            if batched > COUNT_NUM or (now - last_flush) > 10:
                if batch:
                    self.es_ping()
                    logging.warning("重复存入elasticsearch当中%d条数据" % duplicates)
                    duplicates = 0
                    self.commit(batch)
                    last_flush = int(time.time())
                    batch.clear()
                    batched = 0

    def commit(self, action_list):
        """Bulk-submit *action_list*; retry once after a logged failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            logging.error("index:xw_user,\terror:" + str(e))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
def __init__(self):
    """Create backing-store clients and zero the single-insert counter."""
    # NOTE(review): enclosing class is outside this view — presumably one
    # of the sibling HBase→ES sync workers; confirm against the full file.
    self.hbase_con = HbaseInfoTask()  # HBase accessor
    self.redis_con = RedisTools()     # redis queue client
    self.es = Elasticsearch(ES_ADDR,timeout=30)  # ES client, 30s timeout
    # Counter for documents indexed one-by-one (not via bulk).
    self.insert_count = 0
class GetInfo(object):
    """Syncs INFO_TABLE rows from HBase into the ``xw_info`` ES index.

    Existing documents are batched as bulk updates; brand-new documents
    are indexed one at a time and tallied in ``insert_count``.
    """

    def __init__(self):
        # Clients for the backing stores; insert_count tallies single inserts.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)
        self.insert_count = 0

    def es_ping(self):
        """Recreate the ES client if the current connection is dead."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Poll redis forever, upserting rows into ``xw_info``."""
        action_list = []
        count = 0
        start = int(time.time())
        cunzai = 0  # docs that already existed (batched as updates)
        while True:
            rowkey = self.redis_con.get_rowkey("xw_info")
            if rowkey is None:  # idiom fix: identity test for None
                # Queue drained: flush pending actions, then back off.
                if len(action_list) > 0:
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                    start = int(time.time())
                    count = 0
                time.sleep(10)
                continue
            param = None
            # "rowkey|||||f1,f2" restricts the sync to the listed fields.
            if "|||||" in rowkey:
                params = rowkey.split("|||||")[1]
                param = params.split(",")
                rowkey = rowkey.split("|||||")[0]
            # ES document ids are md5 hashes of the HBase rowkey.
            _id = trans_md5(rowkey)
            boo = self.es.exists("xw_info", "sino", _id)
            action = {"_index": "xw_info", "_type": "sino", "_id": ""}
            if boo:
                record = self.hbase_con.getResultByRowkey("INFO_TABLE", rowkey, "xw_info", param)
                if not record:
                    continue
                action["_op_type"] = "update"
                action['doc'] = record
                cunzai = cunzai + 1
            else:
                record = self.hbase_con.getResultByRowkey("INFO_TABLE", rowkey, "xw_info")
                if not record:
                    continue
                # New documents bypass the batch and are indexed directly.
                self.es.index(index="xw_info", doc_type="sino", id=_id, body=record)
                self.insert_count = self.insert_count + 1
                continue
            action['_id'] = _id
            action_list.append(action)
            end = int(time.time())
            count = count + 1
            # Flush by size or after 10 seconds since the last flush.
            if count > COUNT_NUM or (end - start) > 10:
                # FIX: only ping/commit when data is actually buffered,
                # consistent with the sibling sync workers; avoids an empty
                # bulk call and a misleading "提交成功:0条数据" log line.
                if len(action_list) > 0:
                    self.es_ping()
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                start = int(time.time())
                count = 0

    def commit(self, action_list):
        """Bulk-submit *action_list*; retry once after a logged failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            log_info = "index:xw_info index,\terror:" + str(e)
            logging.error(log_info)
            helpers.bulk(self.es, action_list)
        logging.warning("新增存入elasticsearch当中%d条数据" % self.insert_count)
        logging.warning("提交成功:%d条数据" % len(action_list))
class GetWechatInfo(object):
    """Syncs WECHAT_INFO_TABLE rows into the ``wx_info`` ES index.

    Rowkeys are pulled from a redis queue; existing documents become bulk
    updates, new ones bulk inserts, flushed by size or every 30 seconds.
    """

    def __init__(self):
        # HBase, redis-cluster and Elasticsearch connections.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Recreate the ES client if the current connection is dead."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Poll redis forever, batching WECHAT_INFO_TABLE rows for ES."""
        action_list = []
        count = 0
        start = int(time.time())
        cunzai = 0  # docs that already existed (batched as updates)
        while True:
            # Fetch the next rowkey to sync; no blpop is available, so an
            # empty result means the queue is currently drained.
            rowkey = self.redis_con.get_rowkey("wx_info")
            if rowkey is None:
                # Queue empty: flush whatever accumulated, then back off.
                if len(action_list) > 0:
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                    start = int(time.time())
                    count = 0
                time.sleep(10)
                continue
            param = None
            # "rowkey|||||f1,f2" carries an optional field whitelist.
            if "|||||" in rowkey:
                params = rowkey.split("|||||")[1]
                param = params.split(",")
                rowkey = rowkey.split("|||||")[0]
            # ES document ids are md5 hashes of the HBase rowkey.
            _id = trans_md5(rowkey)
            # Existence probe (a HEAD request, ~30ms) decides update/insert.
            boo = self.es.exists("wx_info", "sino", _id)
            action = {"_index": "wx_info", "_type": "sino", "_id": ""}
            if boo:
                # Document exists: upload as an update, honouring the
                # optional field whitelist.
                record = self.hbase_con.getResultByRowkey("WECHAT_INFO_TABLE", rowkey, "wx_info", param)
                if not record:
                    continue
                action["_op_type"] = "update"
                action['doc'] = record
                cunzai = cunzai + 1
            else:
                # Document missing: upload as an insert (the field
                # whitelist is irrelevant for a brand-new document).
                record = self.hbase_con.getResultByRowkey("WECHAT_INFO_TABLE", rowkey, "wx_info")
                if not record:
                    continue
                action['_source'] = record
            action['_id'] = _id
            action_list.append(action)
            end = int(time.time())
            count = count + 1
            # Flush once COUNT_NUM actions pile up or 30s have passed.
            if count > COUNT_NUM or (end - start) > 30:
                # FIX: guard against an empty batch before pinging and
                # committing, consistent with the sibling sync workers.
                if len(action_list) > 0:
                    self.es_ping()
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                start = int(time.time())
                count = 0

    def commit(self, action_list):
        """Bulk-submit the accumulated actions; retry once on failure.

        Each action looks like::

            {"_index": "wx_info", "_type": "sino", "_id": "...",
             "_source": {...}}
        """
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            log_info = "index:wechat,\terror:" + str(e)
            logging.error(log_info)
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
class Image(object):
    """Syncs IMAGE_DATA_TABLE rows into the ``image`` ES index.

    Updates are buffered for bulk submission; brand-new documents are
    indexed one at a time and tallied in ``insert_count``.
    """

    def __init__(self):
        # Clients for the backing stores; insert_count tallies single inserts.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_SF_ADDR)
        self.insert_count = 0

    def es_ping(self):
        """Rebuild the ES client whenever a ping fails."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_SF_ADDR)

    def run(self):
        """Poll redis forever, upserting image rows into ES."""
        batch = []
        batched = 0
        last_flush = int(time.time())
        duplicates = 0  # docs that already existed (batched as updates)
        while True:
            rowkey = self.redis_con.get_rowkey("image")
            if rowkey is None:
                # Queue drained: flush pending updates, then back off.
                if batch:
                    logging.warning("重复存入elasticsearch当中%d条数据" % duplicates)
                    duplicates = 0
                    self.commit(batch)
                    batch.clear()
                    batched = 0
                    last_flush = int(time.time())
                time.sleep(10)
                continue
            fields = None
            # "rowkey|||||f1,f2" carries an optional field whitelist.
            if "|||||" in rowkey:
                fields = rowkey.split("|||||")[1].split(",")
                rowkey = rowkey.split("|||||")[0]
            found = self.es.exists("image", "sino", rowkey)
            if found:
                record = self.hbase_con.getSuanfaResultByRowkey(
                    "IMAGE_DATA_TABLE", rowkey, "image", fields)
                if not record:
                    continue
                batch.append({
                    "_op_type": "update",
                    "_index": "image",
                    "_type": "sino",
                    "_id": rowkey,
                    "doc": record,
                })
                duplicates += 1
            else:
                record = self.hbase_con.getSuanfaResultByRowkey(
                    "IMAGE_DATA_TABLE", rowkey, "image")
                if not record:
                    continue
                # New documents skip the batch and go straight to ES.
                try:
                    self.es.index(index="image", doc_type="sino", id=rowkey, body=record)
                    self.insert_count = self.insert_count + 1
                except Exception as e:
                    logging.error("单条插入错误:%s" % str(e))
                continue
            batched += 1
            now = int(time.time())
            # Flush by size or after 30 seconds since the last flush.
            # (The duplicate tally is logged and reset even when the batch
            # is empty, exactly as this worker has always behaved.)
            if batched > COUNT_NUM or (now - last_flush) > 30:
                logging.warning("重复存入elasticsearch当中%d条数据" % duplicates)
                duplicates = 0
                if batch:
                    self.es_ping()
                    self.commit(batch)
                    last_flush = int(time.time())
                    batch.clear()
                    batched = 0

    def commit(self, action_list):
        """Bulk-submit *action_list*; retry once after a logged failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            logging.error("index:image,\terror:" + str(e))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
class GetForumInfo(object):
    """Syncs MONITOR_INFO_TABLE rows into the ``forum_info`` ES index."""

    def __init__(self):
        # Clients for the backing stores.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Rebuild the ES client whenever a ping fails."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Poll redis forever, batching forum rows for bulk upload."""
        batch = []
        batched = 0
        last_flush = int(time.time())
        duplicates = 0  # rowkeys whose documents already existed
        while True:
            rowkey = self.redis_con.get_rowkey("forum_info")
            if rowkey is None:
                # Queue drained: flush pending actions, then back off.
                if batch:
                    logging.warning("重复存入elasticsearch当中%d条数据" % duplicates)
                    duplicates = 0
                    self.commit(batch)
                    batch.clear()
                    last_flush = int(time.time())
                    batched = 0
                time.sleep(10)
                continue
            fields = None
            # "rowkey|||||f1,f2" carries an optional field whitelist.
            if "|||||" in rowkey:
                fields = rowkey.split("|||||")[1].split(",")
                rowkey = rowkey.split("|||||")[0]
            doc_id = trans_md5(rowkey)
            if self.es.exists("forum_info", "sino", doc_id):
                # Tallied before the HBase fetch, mirroring how the
                # duplicate counter has always behaved here.
                duplicates += 1
                record = self.hbase_con.getResultByRowkey("MONITOR_INFO_TABLE", rowkey, "forum_info", fields)
                if not record:
                    continue
                batch.append({
                    "_op_type": "update",
                    "_index": "forum_info",
                    "_type": "sino",
                    "_id": doc_id,
                    "doc": record,
                })
            else:
                record = self.hbase_con.getResultByRowkey("MONITOR_INFO_TABLE", rowkey, "forum_info")
                if not record:
                    continue
                batch.append({
                    "_index": "forum_info",
                    "_type": "sino",
                    "_id": doc_id,
                    "_source": record,
                })
            batched += 1
            now = int(time.time())
            # Flush by size or after 30 seconds since the last flush.
            # (Ping and duplicate log happen even for an empty batch,
            # exactly as this worker has always behaved.)
            if batched > COUNT_NUM or (now - last_flush) > 30:
                self.es_ping()
                logging.warning("重复存入elasticsearch当中%d条数据" % duplicates)
                duplicates = 0
                if batch:
                    self.commit(batch)
                    last_flush = int(time.time())
                    batch.clear()
                    batched = 0

    def commit(self, action_list):
        """Bulk-submit *action_list*; retry once after a logged failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            logging.error("index:forum_info,\terror:" + str(e))
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
class GetSiteRecord(object):
    """Syncs SITE_RECORD rows into the ``site_record`` ES index."""

    def __init__(self):
        # Clients for the backing stores.
        self.hbase_con = HbaseInfoTask()
        self.redis_con = RedisTools()
        self.es = Elasticsearch(ES_ADDR, timeout=30)

    def es_ping(self):
        """Recreate the ES client if the current connection is dead."""
        if not self.es.ping():
            self.es = Elasticsearch(ES_ADDR, timeout=30)

    def run(self):
        """Poll redis forever, batching site records for bulk upload."""
        action_list = []
        count = 0
        start = int(time.time())
        cunzai = 0  # docs that already existed (batched as updates)
        while True:
            rowkey = self.redis_con.get_rowkey("site_record")
            if rowkey is None:  # idiom fix: identity test for None
                if len(action_list) > 0:
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                    start = int(time.time())
                    count = 0
                time.sleep(10)
                continue
            param = None
            # "rowkey|||||f1,f2" carries an optional field whitelist.
            if "|||||" in rowkey:
                params = rowkey.split("|||||")[1]
                param = params.split(",")
                rowkey = rowkey.split("|||||")[0]
            boo = self.es.exists("site_record", "sino", rowkey)
            action = {"_index": "site_record", "_type": "sino", "_id": ""}
            if boo:
                record = self.hbase_con.getResultByRowkey("SITE_RECORD", rowkey, "site_record", param)
                if not record:
                    continue
                action["_op_type"] = "update"
                action['doc'] = record
                cunzai = cunzai + 1
            else:
                record = self.hbase_con.getResultByRowkey("SITE_RECORD", rowkey, "site_record")
                if not record:
                    continue
                action['_source'] = record
            action['_id'] = rowkey
            action_list.append(action)
            end = int(time.time())
            count = count + 1
            # Flush by size or after 30 seconds since the last flush.
            if count > COUNT_NUM or (end - start) > 30:
                # FIX: guard against an empty batch before pinging and
                # committing, consistent with the sibling sync workers.
                if len(action_list) > 0:
                    self.es_ping()
                    logging.warning("重复存入elasticsearch当中%d条数据" % cunzai)
                    cunzai = 0
                    self.commit(action_list)
                    action_list.clear()
                start = int(time.time())
                count = 0

    def commit(self, action_list):
        """Bulk-submit *action_list*; retry once after a logged failure."""
        try:
            helpers.bulk(self.es, action_list)
        except Exception as e:
            log_info = "index:site_record,\terror:" + str(e)
            logging.error(log_info)
            helpers.bulk(self.es, action_list)
        logging.warning("提交成功:%d条数据" % len(action_list))
def __init__(self):
    """Create the HBase, redis and Elasticsearch clients for this worker."""
    # NOTE(review): enclosing class is outside this view — presumably one
    # of the sibling HBase→ES sync workers; confirm against the full file.
    self.hbase_con = HbaseInfoTask()  # HBase accessor
    self.redis_con = RedisTools()     # redis queue client
    # ES client; ELASTIC_TIMEOUT is a module-level constant bounding requests.
    self.es = Elasticsearch(ES_ADDR, timeout=ELASTIC_TIMEOUT)