def get_global_meta_from_redis():
    """Return the global meta list, reloading it from Redis at most every 10 minutes.

    The decoded JSON is cached in the module globals ``_meta_data`` /
    ``_meta_last_load_time``; a fresh fetch happens on first use or once
    the cache is older than ten minutes.
    """
    global _meta_data, _meta_last_load_time
    cache_empty = _meta_data is None or _meta_last_load_time is None
    cache_stale = (not cache_empty
                   and time.time() > _meta_last_load_time + 10 * 60)  # 10 minute TTL
    if cache_empty or cache_stale:
        raw = get_redis_cli().get(META_KEY)
        _meta_data = json.loads(raw.decode("utf8"))
        _meta_last_load_time = time.time()
    return _meta_data
def get_unfinished_images():
    """
    Generator over every image record whose download is not finished yet,
    i.e. whose "nos_key" field is still the empty string, across all
    groups listed under META_KEY in Redis.

    :return: generator yielding image-info dicts
    """
    redis_cli = get_redis_cli()
    groups = json.loads(redis_cli.get(META_KEY).decode("utf8"))
    for group_meta in groups:
        raw_images = redis_cli.get(KEY_TEMPLATE.format(idx=group_meta["id"]))
        for image_info in json.loads(raw_images.decode("utf8")):
            if image_info["nos_key"] == "":
                yield image_info
def revise_meta(group_id, img_id, key):
    """
    Fill in the nos_key of one image inside a group's image list in Redis.

    :param group_id: id of the image group whose list is updated
    :param img_id: id of the image to update within that group
    :param key: NOS storage key to record on the matching image
    """
    # Hold the lock via a context manager so it is released even if the
    # Redis round-trip or JSON (de)serialization raises; the original
    # acquire()/release() pair leaked the lock on any exception.
    with LOCK:
        redis_cli = get_redis_cli()
        redis_key = KEY_TEMPLATE.format(idx=group_id)
        img_info_list = json.loads(redis_cli.get(redis_key).decode("utf8"))
        for info in img_info_list:
            if info["id"] == img_id:
                info["nos_key"] = key
                break  # ids are unique; stop at the first match
        redis_cli.set(redis_key, json.dumps(img_info_list))
def single_spide_group(group_id):
    """
    Download the image list of one group and merge any new images into
    the group's record ("group:<id>") in Redis.

    :param group_id: id of the image group to crawl
    :return: True when the remote group info was fetched and merged,
             False when the fetch failed
    """
    time.sleep(random.random() / 10)  # small jitter to spread out requests
    meta_logger.info("{}: 图片组信息下载".format(group_id))
    inner_group_info_url = INNER_GROUP_INFO_URL.format(index=group_id)
    redis_cli = get_redis_cli()
    group_key = "group:{}".format(group_id)

    # Previously stored image list for this group (empty list on first run).
    origin_group_info_str = redis_cli.get(group_key)
    if origin_group_info_str is None:
        origin_group_info_str = b"[]"
    origin_group_info = json.loads(origin_group_info_str.decode("utf8"))
    meta_logger.info("{}: 原图片数量: {}".format(group_id, len(origin_group_info)))
    # ids already stored for this group, used to detect new remote images
    origin_img_index_set = {img_info["id"] for img_info in origin_group_info}

    try:
        remote_group_info = json.loads(
            proxy_request_get(inner_group_info_url).content.decode("utf8")
        )["info"]
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate instead of being logged and swallowed.
        meta_logger.error("{}: 退出: 下载图片组信息失败".format(group_id))
        meta_logger.error(traceback.format_exc())
        return False

    # Fetch succeeded: prepend every image we have not seen before.
    counter = 0
    for group_info in remote_group_info:
        if group_info["id"] not in origin_img_index_set:
            new_image_info = {
                "id": group_info["id"],
                "group_id": group_id,
                "origin_url": group_info["url"],
                "nos_key": "",
                "nos_url": "",
                "local_file_path": "",
                "file_name": str(group_info["url"]).split("/")[-1]
            }
            origin_group_info = [new_image_info] + origin_group_info
            counter += 1
    redis_cli.set(group_key, json.dumps(origin_group_info).encode("utf8"))
    meta_logger.info("{}: 退出: 新增图片 {} 张".format(group_id, counter))
    return True
def spide_group_meta():
    """
    Fetch the meta info of all image groups, merge any new groups into
    the "group:meta" record in Redis, and return the merged list.

    :return: the (possibly updated) list of group meta dicts; when the
             remote fetch fails the previously stored list is returned
             unchanged
    """
    meta_logger.info("下载图片组信息")
    redis_cli = get_redis_cli()
    # Previously stored group meta (empty list on first run).
    origin_meta_str = redis_cli.get("group:meta")
    if origin_meta_str is None:
        origin_meta_str = b"[]"
    origin_meta = json.loads(origin_meta_str.decode("utf8"))
    meta_logger.info("原组长: {}".format(len(origin_meta)))
    # ids of every group already stored, used to detect new remote groups
    origin_group_index_set = {meta["id"] for meta in origin_meta}
    try:
        remote_meta = json.loads(
            proxy_request_get(GROUP_INFO_URL).content.decode("utf8")
        )["indexes"]
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate. On failure we fall through and return the
        # stored meta untouched.
        meta_logger.error('退出: 下载"全部"图片组信息失败')
        meta_logger.error(traceback.format_exc())
    else:
        # Fetch succeeded: prepend every group we have not seen before.
        counter = 0
        for meta in remote_meta:
            if meta["index"] not in origin_group_index_set:
                new_meta_unit = {
                    "id": meta["index"],
                    "group_name": meta["des"],
                    "origin_url": meta["url"],
                    "nos_key": "",
                    "nos_url": "",
                    "local_file_path": ""
                }
                origin_meta = [new_meta_unit] + origin_meta
                counter += 1
        redis_cli.set("group:meta", json.dumps(origin_meta).encode("utf8"))
        meta_logger.info("退出: 新增图片组信息 {} 组".format(counter))
    return origin_meta
def get_group_meta_from_redis(group_id):
    """Load and return the decoded JSON stored under "group:<group_id>" in Redis."""
    raw_value = get_redis_cli().get("group:{}".format(group_id))
    return json.loads(raw_value.decode("utf8"))