Beispiel #1
0
def parse_detail(params, content):
    """Parse a detail page and return follow-up tasks and records to save.

    The content is passed through ``pq`` so that a parse failure is logged
    and re-raised. Nothing is extracted from the parsed document yet, so both
    returned lists are always empty.

    :param params: identifier of the page being parsed; only used in the
        error log
    :param content: raw page content handed to ``pq``
    :return: tuple ``(task_list, save_list)`` holding two empty lists
    :raises Exception: whatever ``pq`` raised, after it has been logged
    """
    try:
        # The parsed document is not used; the call is kept so that parse
        # failures still surface to the caller.
        pq(content)
        return [], []
    except Exception:
        # Let the logger interpolate params: concatenating a non-str value
        # would raise TypeError here and hide the original exception.
        report_logger.error("\n%s\n%s", params, traceback.format_exc())
        # Bare raise keeps the original traceback untouched.
        raise
Beispiel #2
0
    def html_parse(self, url, keys, deep, content):
        """Run ``self.working`` on the content and return its three results.

        :param url: page URL; only used in the error log
        :param keys: sequence whose first item is passed to ``self.working``
        :param deep: current depth, compared against ``self.max_deep``
        :param content: page content passed to ``self.working``
        :return: ``(parse_result, url_list, save_list)``; the fallback
            ``(-1, [], [])`` is used when ``self.working`` raised, and
            ``url_list`` is emptied once the depth limit is reached
        """
        try:
            outcome = self.working(keys[0], content)
            parse_result, url_list, save_list = outcome
        except Exception as excep:
            parse_result, url_list, save_list = -1, [], []
            details = (keys, deep, url, excep)
            report_logger.error("keys=%s, deep=%s, url=%s,msg=%s" % details)

        # A negative max_deep never triggers the cut-off.
        limit = self.max_deep
        if 0 <= limit and limit <= deep:
            return parse_result, [], save_list
        return parse_result, url_list, save_list
Beispiel #3
0
 def reset_task_record(self):
     """Set the three task flags back to 0 on every task-table document.

     :return: True when the update call completed, False when it raised
     """
     cleared_flags = {"task_detail": 0, "task_otm": 0, "task_ad": 0}
     try:
         # An empty filter matches every document in the collection.
         self.task_table.update_many({}, {"$set": cleared_flags})
     except Exception:
         logger.error("重置task状态失败")
         return False
     return True
Beispiel #4
0
 def find_page_with_pid(self, pid):
     """Fetch every detail document whose ``PropertyID`` equals ``pid``.

     Per the original author's note, one pid may map to several documents
     because documents get updated.

     :param pid: int or str; converted with ``str`` before the lookup
     :return: list of dicts without the ``_id`` field, or ``[]`` when the
         lookup raised
     """
     try:
         selector = {"PropertyID": str(pid)}
         without_id = {"_id": 0}
         return list(self.detail_table.find(selector, without_id))
     except Exception:
         logger.error("获取页面数据失败 id : %s" % pid)
         return []
Beispiel #5
0
 def find_page_all(self, skip=0, limit=0):
     """Return all detail documents ordered by ``PropertyID`` ascending.

     :param skip: number of leading records to skip; 0 skips none and the
         value must not be negative
     :param limit: maximum number of records returned after skipping; 0
         means no limit and the value must not be negative
     :return: list of dicts without the ``_id`` field, or ``[]`` when the
         query raised
     """
     try:
         cursor = self.detail_table.find({}, {"_id": 0})
         cursor = cursor.sort([("PropertyID", pymongo.ASCENDING)])
         cursor = cursor.skip(skip).limit(limit)
         return list(cursor)
     except Exception:
         logger.error("查询全部页面数据失败 skip:%s limit:%s" % (skip, limit))
         return []
Beispiel #6
0
 def update_task_record(self, pid, task_name: str, task_sta: int):
     """Set one task-status field on the task document matching ``pid``.

     :param pid: int or str; converted with ``str`` before the lookup
     :param task_name: name of the field to set
     :param task_sta: value written to that field
     :return: None; a failure is logged and not re-raised
     """
     try:
         selector = {"PropertyID": str(pid)}
         change = {"$set": {task_name: task_sta}}
         self.task_table.update_one(selector, change)
     except Exception as e:
         logger.error("更新任务记录失败 id : %s\n%s" % (pid, e))
    def working(self, url, key):
        """Fetch one resource for ``url`` and return a status code with its body.

        ``key`` selects the request:

        * ``"detail"`` requests ``base_url % url`` and yields the response text
        * ``"otm"`` requests ``otm_url % url`` with ``otm_param`` and yields
          the decoded JSON body
        * ``"ad"`` requests ``ad_url % url`` and yields the decoded JSON body

        NOTE(review): the header and parameter dicts used here are not local
        to this method and are modified in place on every call.

        :param url: value interpolated into the URL templates
        :param key: one of ``"detail"``, ``"otm"`` or ``"ad"``
        :return: ``(1, body)`` for a 200 response; ``(-1, None)`` for an
            unknown key or for a non-200 status that ``raise_for_status``
            does not reject
        :raises requests.ConnectionError: when the response status is 503
        :raises requests.HTTPError: from ``raise_for_status`` on other error
            statuses
        """
        if key == "detail":
            tar_url = base_url % url
            detail_Header['User-Agent'] = make_random_useragent()
            headers, params, timeout, as_json = (
                detail_Header, None, (6.05, 60), False)
        elif key == "otm":
            tar_url = otm_url % url
            json_Header['Referer'] = base_url % url
            json_Header['User-Agent'] = make_random_useragent()
            # Millisecond timestamp sent as the "_" query parameter.
            otm_param['_'] = int(time.time() * 1000)
            headers, params, timeout, as_json = (
                json_Header, otm_param, (6.05, 20), True)
        elif key == 'ad':
            tar_url = ad_url % url
            json_Header['Referer'] = base_url % url
            json_Header['User-Agent'] = make_random_useragent()
            headers, params, timeout, as_json = (
                json_Header, None, (6.05, 20), True)
        else:
            report_logger.error("%s keys error: %s is invalid",
                                self.__class__.__name__, key)
            # The original author marked this path as one that should never run.
            return -1, None

        resp = self.session.get(tar_url,
                                headers=headers,
                                params=params,
                                timeout=timeout,
                                allow_redirects=False)

        if resp.status_code == 200:
            # Decode only after the status check, so an error page that is
            # not JSON cannot raise a decode error ahead of the handling below.
            return 1, (resp.json() if as_json else resp.text)
        # TODO (original author): confirm that a session taken over by another
        # login and a dropped network connection both show up as 503.
        if resp.status_code == 503:
            raise requests.ConnectionError
        resp.raise_for_status()
        # Redirects are not followed, so a status such as 3xx reaches this
        # point without raising; return the failure tuple instead of None.
        return -1, None
Beispiel #8
0
 def insert_detail_page(self, pid, content: dict):
     """Upsert the detail document for ``pid``.

     The fields of ``content`` are written with ``$set`` and ``LastModify``
     is refreshed through ``$currentDate``; ``upsert=True`` is passed so a
     missing document is created.

     NOTE(review): the original docstring promised a createAt field on
     creation, but nothing in this method writes one.

     :param pid: int or str; converted with ``str`` before the lookup
     :param content: mapping of fields to store, e.g. ``{'a': 1}``
     :return: True when the update call completed, False when it raised
     """
     try:
         selector = {"PropertyID": str(pid)}
         change = {"$set": content, "$currentDate": {"LastModify": True}}
         self.detail_table.update_one(selector, change, upsert=True)
     except Exception:
         logger.error("插入详细页面失败 id : %s" % pid)
         return False
     return True
 def inset_result_many(self, items, key):
     """Upsert each item into the rent or sold table, matched on ``pid``.

     Every item is modified in place before it is stored: ``timestamp`` is
     set to ``self.update_date`` and ``otm_flag`` to 1.

     :param items: iterable of dicts, each holding a ``pid`` entry
     :param key: ``"Rent"`` or ``"Sold"``; any other value stores nothing
     :return: True on success (including an unrecognised ``key``), False
         when an exception was raised
     """
     try:
         if key == "Rent":
             table_attr = "rent_table"
         elif key == "Sold":
             table_attr = "sold_table"
         else:
             # Unrecognised keys write nothing and still report success.
             return True

         for record in items:
             record["timestamp"] = self.update_date
             record['otm_flag'] = 1  # 0: not found, 1: on market
             target = getattr(self, table_attr)
             target.update_one({"pid": record["pid"]},
                               {"$set": record},
                               upsert=True)
         return True
     except Exception as e:
         logger.error("insert detail page faile " + str(e))
         return False
    def create_index(self):
        """Create an ascending ``pid`` index on the rent and sold tables.

        Both indexes are requested with ``background=True``.

        NOTE(review): the original comments describe a unique compound index,
        but no ``unique`` option is passed and each index covers only
        ``pid`` - confirm which was intended.

        :return: True when both calls completed, False when one raised
        """
        # Original author's note: dropDups is no longer supported from
        # MongoDB 3.0 on, so duplicate documents raise an error instead.
        try:
            self.rent_table.create_index([("pid", pymongo.ASCENDING)],
                                         background=True)
            self.sold_table.create_index([("pid", pymongo.ASCENDING)],
                                         background=True)
        except pymongo.errors.DuplicateKeyError:
            print("创建索引失败,已存在重复数据")
            logger.error("创建索引失败")
            return False
        except Exception as e:
            # Previously this failure was swallowed without any trace; log
            # the cause so a False result can be diagnosed.
            logger.error("创建索引失败: %s" % e)
            return False
        return True
Beispiel #11
0
    def insert_task(self, pid_items: list):
        """Queue one task document per ``(pid, suburb)`` pair.

        Each document stores the pid as a string, the suburb, and the three
        task flags initialised to 0. The documents are inserted with
        ``ordered=False``.

        :param pid_items: list of ``(pid, suburb)`` pairs
        :return: None; failures are logged and not re-raised
        """
        try:
            tasks = [{
                "PropertyID": str(pid),
                "Suburb": suburb,
                "task_detail": 0,
                "task_otm": 0,
                "task_ad": 0
            } for pid, suburb in pid_items]
        except Exception as e:
            # Building the documents only touches the input, so any failure
            # here is a parameter error. Malformed pairs raise TypeError or
            # ValueError on unpacking, which the former IndexError clause
            # never matched.
            logger.error("pid param error :" + str(e))
            return

        if not tasks:
            # Nothing to queue; insert_many rejects an empty document list.
            return

        try:
            self.task_table.insert_many(tasks, ordered=False)
        except Exception as e:
            logger.warning(e)
Beispiel #12
0
 def create_index(self):
     """Bind the collection handles and create unique ``PropertyID`` indexes.

     ``self.database``, ``self.detail_table`` and ``self.task_table`` are
     (re)assigned from ``self.client`` first. A unique ascending index on
     ``PropertyID`` is then requested with ``background=True`` on the detail
     table and the task table, in that order.

     :return: True when both calls completed, False when index creation hit
         a duplicate-key error; other exceptions propagate
     """
     # Original author's note: dropDups is no longer supported from
     # MongoDB 3.0 on, so duplicate documents raise an error instead.
     self.database = self.client[data_base]
     self.detail_table = self.database[page_table]
     self.task_table = self.database[task_table]
     try:
         for collection in (self.detail_table, self.task_table):
             collection.create_index([("PropertyID", pymongo.ASCENDING)],
                                     unique=True,
                                     background=True)
     except pymongo.errors.DuplicateKeyError:
         print("创建索引失败,已存在重复数据")
         logger.error("创建索引失败")
         return False
     return True