class PageExtractorRDS(RedisController):
    '''Track the crawl status of house IDs in the "redis_pe" Redis section.'''

    def __init__(self):
        # NOTE(review): the parent initializer is not invoked; all operations
        # are delegated to the wrapped controller kept in `self.rds`.
        self.rds = RedisController(section_name="redis_pe")

    def __insert_id__(self, house_id, err_type):
        '''Base method that stores a status flag for a house ID.

        Values of err_type:
            -1  no request has been issued yet
             0  request succeeded; the error mark is cleared
             1  the detail page was fetched but came back empty
             2  the page loaded incompletely, so some elements are missing
        '''
        self.rds.rset(house_id, err_type)

    def select(self):
        '''Return the `rscan` attribute of the wrapped controller.'''
        return self.rds.rscan

    def insert_init(self, house_id):
        '''Register a house ID from the initial list (status -1).'''
        self.__insert_id__(house_id, -1)

    def insert_success_id(self, house_id):
        '''Mark a house ID whose page was crawled successfully (status 0).'''
        self.__insert_id__(house_id, 0)

    def insert_empty_id(self, house_id):
        '''Mark a house ID whose detail page returned empty data (status 1).'''
        self.__insert_id__(house_id, 1)

    def insert_lose_element_id(self, house_id):
        '''Mark a house ID whose page lost elements while loading (status 2).'''
        self.__insert_id__(house_id, 2)
class HouseSelectorRDS(RedisController):
    '''Store house info keyed by house ID in the "redis_hs" Redis section.'''

    def __init__(self):
        # NOTE(review): the parent initializer is not invoked; all operations
        # are delegated to the wrapped controller kept in `self.rds`.
        self.rds = RedisController(section_name="redis_hs")

    def insert(self, house_id, house_info):
        '''Write `house_info` under the key `house_id`.'''
        wrapped = self.rds
        wrapped.rset(house_id, house_info)

    def select(self):
        '''Return the `rscan` attribute of the wrapped controller.'''
        wrapped = self.rds
        return wrapped.rscan
def ziroom_extra(project_name, rid, rtn_data):
    '''Post-process one crawled Ziroom record and return it.

    Three independent, best-effort steps are applied to `rtn_data`:

    1. House code: the number after the first "_" in `house_code` and the
       room count parsed from `house_type` are read. When there is more than
       one room, an entry ``{"house_id": str(rid - end + idx)}`` is written to
       Redis for every ``idx`` in ``1..room_num``.
    2. Price: `rtn_data["price"]` is replaced by the first value returned
       from `get_price_from_png`.
    3. Payment list: every value in each `paymentlist` entry except
       `"period"` is passed through `get_price_from_png`.

    A failure while parsing or converting in any step leaves the
    corresponding fields untouched; the skipped step is reported through the
    debug log instead of being silently dropped. Errors raised by the Redis
    writes of step 1 are not caught and propagate to the caller.

    Args:
        project_name: Passed to the logger, the Redis controller and
            `get_price_from_png`.
        rid: Numeric id of the current record, used to derive sibling ids.
        rtn_data: Mapping with the crawled fields; it is modified in place.

    Returns:
        The same `rtn_data` object.
    '''
    logger = LogBase(project_name, "ziroom_extra")
    logger.debug("Before Extra =>", data=rtn_data)

    # Extra func for house code.
    try:
        end = int(rtn_data['house_code'].split('_')[1])
        room_num = int(findall(r"([0-9])室[0-9]厅", rtn_data['house_type'])[0])
    except Exception as exc:
        # Only the exception type name goes into the message so the text
        # stays free of characters a formatter could misinterpret.
        logger.debug(
            "Skip house code extra (%s) =>" % type(exc).__name__,
            data=rtn_data)
    else:
        if room_num > 1:
            rds = RedisController(
                int(conf_kv_func("ziroom.sys_config", all=True)['redis_db']),
                project_name)
            first_id = rid - end
            for idx in range(1, room_num + 1):
                rds.__update_dict_to_redis__(
                    first_id + idx, {"house_id": str(first_id + idx)})

    # Extra func for price.
    # Initialised outside the try so the payment step below always has it,
    # even when the price step fails.
    price_dict = {}
    try:
        price, price_dict = get_price_from_png(
            rtn_data["price"], price_dict, project_name)
        rtn_data["price"] = price
    except Exception as exc:
        logger.debug(
            "Skip price extra (%s) =>" % type(exc).__name__,
            data=rtn_data)

    # Extra func for payment.
    try:
        payment_rtn_list = []
        for payment in rtn_data["paymentlist"]:
            payment_rtn = {}
            for key, value in payment.items():
                if key == "period":
                    payment_rtn["period"] = value
                else:
                    payment_rtn[key], price_dict = get_price_from_png(
                        value, price_dict, project_name)
            payment_rtn_list.append(payment_rtn)
        # Assigned only after every entry converted, so a failure part-way
        # through keeps the original list intact.
        rtn_data["paymentlist"] = payment_rtn_list
    except Exception as exc:
        logger.debug(
            "Skip payment extra (%s) =>" % type(exc).__name__,
            data=rtn_data)

    logger.debug("After Extra =>", data=rtn_data)
    return rtn_data
def __load__(self):
    '''Load the crawler configuration and prepare the Redis handle.

    Sets `self.crawler_conf`, `self.rds` and `self.rds_key`, then passes the
    loaded configuration to `self.debug` as keyword arguments.
    '''
    conf = CrawlerConfigReader.crawler_config(self.crawler_name)
    self.crawler_conf = conf

    sys_conf = conf['sys_conf']
    redis_db = int(sys_conf['redis_db'])
    self.rds = RedisController(redis_db, self.project_name)
    self.rds_key = sys_conf['redis_key']

    self.debug("Here is crawler config => ", **conf)
class LJRedisController():
    '''Keep the house IDs whose page crawl failed in Redis for re-execution.'''

    def __init__(self):
        self._db = LJDBController()
        self._redis = RedisController()

    def failed_page_insert(self):
        '''Store the house IDs that failed in the spider_page step into Redis.

        Each ID is written under its 1-based position, rendered as a string.
        '''
        failed_houses = self._db.get_house_page_failed
        for position, house_id in enumerate(failed_houses, start=1):
            self._redis.rset(str(position), house_id)

    def failed_page_get(self, num=20):
        '''Yield the house IDs stored in Redis in batches of at most `num`.

        Each batch is a list of ``(key_index, house_id)`` tuples. Keys are
        probed from 1 upwards in steps of `num` up to the current `dbsize`.
        '''
        for offset in range(0, self._redis.dbsize, num):
            batch = []
            for idx in range(offset + 1, offset + num + 1):
                house_id = self._redis.rget(str(idx))
                # The last batch may probe keys past the stored range, and
                # already-deleted keys may be missing. Skip a None result as
                # well as the "n" placeholder.
                # NOTE(review): "n" is presumably what rget yields for an
                # absent key -- confirm against RedisController.rget.
                if house_id is None or house_id.strip() == "n":
                    continue
                batch.append((idx, house_id))
            yield batch

    def success_page_del(self, house_list):
        '''Delete a finished batch of entries from Redis.

        `house_list` holds sequences whose first item is the Redis key, such
        as the tuples produced by `failed_page_get`.
        '''
        for entry in house_list:
            self.page_reexec_del(entry[0])

    def page_reexec_del(self, key):
        '''Delete the entry of a re-executed house from Redis.'''
        self._redis.rdel(key)

    @property
    def close(self):
        '''Final clean-up; accesses the `close` attribute of the DB controller.'''
        self._db.close
def __init__(self):
    '''Attach a Redis controller bound to the "redis_pe" section.'''
    controller = RedisController(section_name="redis_pe")
    self.rds = controller
def __init__(self):
    '''Create the database controller and the Redis controller for this object.'''
    # The database controller is built and stored first, as before.
    db_controller = LJDBController()
    self._db = db_controller
    redis_controller = RedisController()
    self._redis = redis_controller
def truncate_redis():
    '''Flush the Redis database of the "redis_pe" section (PageExtractor).'''
    page_extractor_rds = RedisController(section_name="redis_pe")
    # flushdb is invoked directly on the controller's underlying connection.
    connection = page_extractor_rds._redis_conn
    connection.flushdb()