def parse_detail(params, content):
    """Parse a detail page and return follow-up tasks and items to save.

    :param params: identifier of the page being parsed; only used in the
        error log, so any type is accepted
    :param content: page content, passed to ``pq``
    :return: tuple ``(task_list, save_list)``; both are currently always
        empty lists
    :raises Exception: whatever ``pq`` raises is logged and re-raised
    """
    try:
        # The parsed document is not consumed yet; the call is kept so that
        # any error pq raises for this content still propagates.
        doc = pq(content)
        task_list = []
        save_list = []
        return task_list, save_list
    except Exception:
        # Logger %-args accept any params type. The former string
        # concatenation raised TypeError for non-str params inside this
        # handler, which masked the original error.
        report_logger.error("\n%s\n%s", params, traceback.format_exc())
        raise
def html_parse(self, url, keys, deep, content):
    """Run ``self.working`` on fetched content and apply the depth limit.

    :param url: address of the page; only used in the error log
    :param keys: sequence whose first element is passed to ``self.working``
    :param deep: depth of the current page
    :param content: fetched content, passed to ``self.working``
    :return: ``(parse_result, url_list, save_list)``. ``(-1, [], [])`` when
        ``self.working`` raises; ``url_list`` is emptied when
        ``self.max_deep`` is non-negative and ``deep`` has reached it
    """
    try:
        parse_result, url_list, save_list = self.working(keys[0], content)
    except Exception as excep:
        failure_msg = "keys=%s, deep=%s, url=%s,msg=%s" % (keys, deep, url, excep)
        report_logger.error(failure_msg)
        parse_result, url_list, save_list = -1, [], []

    # A negative max_deep disables the limit.
    depth_limit = self.max_deep
    if depth_limit >= 0 and deep >= depth_limit:
        url_list = []
    return parse_result, url_list, save_list
def reset_task_record(self):
    """Set ``task_detail``, ``task_otm`` and ``task_ad`` back to 0 on all task documents.

    :return: True when the bulk update call completes, False if it raises
    """
    cleared_flags = {"task_detail": 0, "task_otm": 0, "task_ad": 0}
    try:
        # The empty filter applies the update to every document.
        self.task_table.update_many({}, {"$set": cleared_flags})
    except Exception:
        logger.error("重置task状态失败")
        return False
    return True
def find_page_with_pid(self, pid):
    """
    Look up the stored page documents for a pid. One pid may match several
    documents because documents get updated.
    :param pid: int/str, compared as a string against ``PropertyID``
    :return: list of dicts without ``_id``; an empty list if the query fails
    """
    try:
        matches = self.detail_table.find({"PropertyID": str(pid)}, {"_id": 0})
        return list(matches)
    except Exception:
        logger.error("获取页面数据失败 id : %s" % pid)
        return []
def find_page_all(self, skip=0, limit=0):
    """
    Fetch all page documents, sorted by ``PropertyID`` in ascending order.
    :param skip: number of leading records to skip; 0 skips none; must not be negative
    :param limit: maximum number of records returned after skipping; 0 means
        no limit; must not be negative
    :return: list of dicts without ``_id`` [{},{}]; an empty list if the query fails
    """
    try:
        pages = self.detail_table.find({}, {"_id": 0})
        pages = pages.sort([("PropertyID", pymongo.ASCENDING)])
        pages = pages.skip(skip).limit(limit)
        return list(pages)
    except Exception:
        logger.error("查询全部页面数据失败 skip:%s limit:%s" % (skip, limit))
        return []
def update_task_record(self, pid, task_name: str, task_sta: int):
    """
    Update the status of one task on the task record for a pid.
    :param pid: int/str pid value, compared as a string against ``PropertyID``
    :param task_name: name of the task field to set
    :param task_sta: new status value for that field
    :return: None; failures are logged and not raised
    """
    try:
        selector = {"PropertyID": str(pid)}
        change = {"$set": {task_name: task_sta}}
        self.task_table.update_one(selector, change)
    except Exception as e:
        logger.error("更新任务记录失败 id : %s\n%s" % (pid, e))
def working(self, url, key):
    """Fetch one resource and return its decoded payload.

    :param url: value substituted into the URL templates ``base_url``,
        ``otm_url`` and ``ad_url``
    :param key: ``"detail"`` (response text), ``"otm"`` or ``"ad"`` (JSON)
    :return: ``(1, payload)`` on HTTP 200, where payload is ``resp.text`` for
        ``"detail"`` and the decoded JSON otherwise; ``(-1, None)`` when
        ``key`` is not one of the three known values
    :raises requests.ConnectionError: on HTTP 503
    :raises requests.HTTPError: on any other non-200 status
    """
    params = None
    if key == "detail":
        tar_url = base_url % url
        headers = detail_Header
        read_timeout = 60
    elif key == "otm":
        tar_url = otm_url % url
        headers = json_Header
        headers['Referer'] = base_url % url
        otm_param['_'] = int(time.time() * 1000)
        params = otm_param
        read_timeout = 20
    elif key == 'ad':
        tar_url = ad_url % url
        headers = json_Header
        headers['Referer'] = base_url % url
        read_timeout = 20
    else:
        # Not expected to be reached.
        report_logger.error("%s keys error: %s is invalid", self.__class__.__name__, key)
        return -1, None

    # NOTE(review): the header/param dicts are module-level objects and are
    # updated in place, as before; confirm this is safe if several workers
    # share them.
    headers['User-Agent'] = make_random_useragent()
    resp = self.session.get(tar_url, headers=headers, params=params,
                            timeout=(6.05, read_timeout), allow_redirects=False)

    # The status is checked before the body is decoded. Decoding first meant
    # an error response with a non-JSON body raised a decode error and never
    # reached the handling below.
    # TODO confirm: are "account logged in elsewhere" and "network down"
    # both always reported as 503?
    if resp.status_code == 503:
        raise requests.ConnectionError("HTTP 503 for %s" % tar_url)
    if resp.status_code != 200:
        resp.raise_for_status()
        # raise_for_status() only raises for 4xx/5xx. Redirects are not
        # followed here, so a 3xx (or any other non-200) used to fall
        # through and return None instead of a tuple.
        raise requests.HTTPError(
            "unexpected status %s for %s" % (resp.status_code, tar_url),
            response=resp)

    return 1, (resp.text if key == "detail" else resp.json())
def insert_detail_page(self, pid, content: dict):
    """
    Insert or update the detail-page document for a pid.
    The update is an upsert, so a document is created when none matches the
    pid. ``LastModify`` is set to the current date on every write.
    NOTE(review): no creation-time field (e.g. createAt) is written by this
    update; add one via ``$setOnInsert`` if it is needed.
    :param pid: int/str pid value, compared as a string against ``PropertyID``
    :param content: fields to set, e.g. {'a':1}
    :return: True when the update call completes, False if it raises
    """
    try:
        selector = {"PropertyID": str(pid)}
        update_doc = {
            "$set": content,
            "$currentDate": {"LastModify": True},
        }
        self.detail_table.update_one(selector, update_doc, upsert=True)
    except Exception:
        logger.error("插入详细页面失败 id : %s" % pid)
        return False
    return True
def inset_result_many(self, items, key):
    """Upsert result items into the rent or sold collection, keyed by ``pid``.

    Each item is stamped in place with ``timestamp`` (``self.update_date``)
    and ``otm_flag`` = 1 before it is written.

    :param items: iterable of dicts, each holding a ``pid`` entry
    :param key: ``"Rent"`` or ``"Sold"``; any other value writes nothing
    :return: True when no write raised (including an unknown ``key``),
        False otherwise
    """
    if key == "Rent":
        table_attr = "rent_table"
    elif key == "Sold":
        table_attr = "sold_table"
    else:
        table_attr = None

    try:
        if table_attr is not None:
            for record in items:
                record["timestamp"] = self.update_date
                record['otm_flag'] = 1  # 0:not find 1:on market
                getattr(self, table_attr).update_one(
                    {"pid": record["pid"]}, {"$set": record}, upsert=True)
        return True
    except Exception as e:
        logger.error("insert detail page faile " + str(e))
        return False
def create_index(self):
    """
    Create an ascending ``pid`` index on the rent and sold collections,
    built in the background.

    dropDups is no longer supported from MongoDB 3.0 on; duplicate documents
    make a unique index build fail instead.
    NOTE(review): these indexes are created without ``unique=True``, so the
    DuplicateKeyError branch is not expected to fire unless uniqueness is
    added -- confirm which behavior is intended.
    :return: True when both indexes were created, False on any failure
    """
    try:
        pid_index = [("pid", pymongo.ASCENDING)]
        self.rent_table.create_index(pid_index, background=True)
        self.sold_table.create_index(pid_index, background=True)
    except pymongo.errors.DuplicateKeyError:
        print("创建索引失败,已存在重复数据")
        logger.error("创建索引失败")
        return False
    except Exception:
        # This branch used to return False without logging anything, which
        # hid the cause of the failure; record it with its traceback.
        logger.exception("创建索引失败")
        return False
    return True
def insert_task(self, pid_items: list):
    """
    Insert tasks into the task queue, one document per (pid, suburb) pair.
    Every task flag (``task_detail``, ``task_otm``, ``task_ad``) starts at 0.
    :param pid_items: [(pid, suburb), ...]
    :return: None; failures are logged and not raised
    """
    if not pid_items:
        # Nothing to queue. insert_many rejects an empty document list, which
        # used to surface as a spurious warning.
        return

    try:
        tasks = [{
            "PropertyID": str(pid),
            "Suburb": suburb,
            "task_detail": 0,
            "task_otm": 0,
            "task_ad": 0
        } for pid, suburb in pid_items]
    except (IndexError, TypeError, ValueError) as e:
        # A malformed pair fails tuple unpacking with ValueError or TypeError,
        # which the former IndexError-only handler never caught.
        logger.error("pid param error :" + str(e))
        return

    try:
        # Unordered insert: the call passes ordered=False, so one failing
        # document does not stop the remaining ones from being attempted.
        self.task_table.insert_many(tasks, ordered=False)
    except Exception as e:
        logger.warning(e)
def create_index(self):
    """
    Bind the database and collection handles, then create a unique ascending
    ``PropertyID`` index on the detail and task collections in the background.

    dropDups is no longer supported from MongoDB 3.0 on; existing duplicate
    documents make the index build fail instead.
    :return: True when both indexes were created, False when duplicate data
        prevented it
    """
    self.database = self.client[data_base]
    self.detail_table = self.database[page_table]
    self.task_table = self.database[task_table]

    try:
        for collection in (self.detail_table, self.task_table):
            collection.create_index(
                [("PropertyID", pymongo.ASCENDING)],
                unique=True,
                background=True,
            )
    except pymongo.errors.DuplicateKeyError:
        print("创建索引失败,已存在重复数据")
        logger.error("创建索引失败")
        return False
    return True