Ejemplo n.º 1
0
 def process_exception(self, request, exception, spider):
     """Retry a failed download and flag the spider for a proxy change.

     Exceptions that are not instances of ``self.EXCEPTIONS_TO_RETRY`` and
     ``TimeoutError`` instances are not handled here; ``None`` is returned
     for them.

     :param request: request whose download raised ``exception``
     :param exception: exception raised while downloading ``request``
     :param spider: running spider; its ``is_change_proxy`` flag is set
     :return: the result of ``self._retry`` or ``None`` when not handled
     """
     retryable = isinstance(exception, self.EXCEPTIONS_TO_RETRY)
     if not retryable or isinstance(exception, TimeoutError):
         return None
     # The log message reads "middleware switches proxy ip".
     spider.is_change_proxy = True
     for message in (u"中间件切换代理ip", exception):
         logger.error(message)
     return self._retry(request, exception, spider)
Ejemplo n.º 2
0
 def process_response(self, request, response, spider):
     """Handle responses whose status code is listed in ``retry_http_codes``.

     Responses with any other status are passed through unchanged.  For a
     retryable status the spider is flagged for a proxy change, then:

     * for the ``building`` spider, ``handle_error_building`` decides
       whether the current building is retried or replaced by the next one
       returned by ``handle_sql(spider.building_sql)``;
     * for every other spider the request is simply retried.

     :param request: request that produced ``response``
     :param response: downloaded response
     :param spider: running spider
     :return: a response or a request, never ``None``
     :raises CloseSpider: when ``handle_sql`` returns no further building
     """
     if response.status not in self.retry_http_codes:
         return response

     spider.is_change_proxy = True

     if spider.name != "building":
         # The original fell through here and returned None, which Scrapy
         # rejects for process_response; retry like the branch below does.
         return self._retry(
             request, response_status_message(response.status),
             spider) or response

     # "building" spider: the response carries data that cannot be processed
     building_id = spider.building.get("id")
     logger.error(u"中间件切换代理ip:%s,%s" % (response.status, building_id))
     if not self.handle_error_building(building_id):
         return self._retry(
             request, response_status_message(response.status),
             spider) or response

     # Move on to the next building and re-issue the request for it.
     spider.building = self.handle_sql(spider.building_sql)
     if not spider.building:
         raise CloseSpider(u"数据收集完成,爬虫关闭")
     return request.replace(
         body=json.dumps({"buildingid": spider.building.get("id")}))
Ejemplo n.º 3
0
 def commit(self, sql, param=None):
     """Execute a statement, commit it and return the cursor's last row id.

     Any error is logged and swallowed; in that case ``None`` is returned.
     The cursor and ``self.__conn__`` are always closed afterwards.

     :param sql: SQL statement, optionally with driver placeholders
     :param param: parameters for the placeholders; falsy means "none"
     :return: ``cursor.lastrowid`` on success, ``None`` on failure
     """
     cursor = self.__get_connect__()
     try:
         if param:
             cursor.execute(sql, param)
         else:
             cursor.execute(sql)
         self.__conn__.commit()
         return cursor.lastrowid
     except BaseException as e:
         # Log statement and parameters separately.  Interpolating them with
         # ``sql % param`` raises TypeError when param is None or a list,
         # and that new error would replace the one being reported.
         logger.error(e)
         logger.error("sql:%s" % sql)
         logger.error("sql param:%s" % (param,))
     finally:
         cursor.close()
         self.__conn__.close()
Ejemplo n.º 4
0
 def find(self, sql, param=None, sql_analysis=True):
     """Run a query and return all rows.

     Any error is logged and swallowed; in that case ``None`` is returned.
     The cursor and ``self.__conn__`` are always closed afterwards.

     :param sql: SQL query, optionally with driver placeholders
     :param param: parameters for the placeholders; falsy means "none"
     :param sql_analysis: when true, each row is converted to a dict keyed
         by the stripped names returned by ``__get_sql_query_param__(sql)``;
         when false the raw ``fetchall()`` result is returned
     :return: list of dicts, the raw rows, or ``None`` on failure
     """
     cursor = self.__get_connect__()
     try:
         if param:
             cursor.execute(sql, param)
         else:
             cursor.execute(sql)
         result = cursor.fetchall()
         if not sql_analysis:
             return result
         columns = self.__get_sql_query_param__(sql)
         # Index each row by position so a row shorter than the column list
         # still fails loudly instead of being silently truncated.
         return [
             {name.strip(): row[index] for index, name in enumerate(columns)}
             for row in result
         ]
     except BaseException as e:
         logger.error("sql:%s" % sql)
         # Wrap param in a one-tuple: a bare multi-element tuple would be
         # unpacked by ``%`` and raise TypeError inside this handler.
         logger.error("sql param:%s" % (param,))
         logger.error(e)
     finally:
         cursor.close()
         self.__conn__.close()
Ejemplo n.º 5
0
    def work(self):
        """Crawl every region page by page and store projects, buildings and houses.

        For each region returned by ``get_all_region()`` the listing URL built
        from ``self.base_url`` is requested page after page, starting at the
        region's stored ``now_page``.  The JSON found in the page's ``<pre>``
        element describes a real-estate project; its buildings and the houses
        of each building are looked up or inserted, and the per-building and
        per-project counters are refreshed.  Progress is saved through
        ``update_region``.

        Errors inside the per-house and per-project loops are logged and the
        loop continues; nothing is returned.
        """
        delete_logs()
        options = webdriver.ChromeOptions()
        options.add_argument("headless")
        web_driver_manager = WebDriverManager(3, "chrome", options)
        # Driver reused for the single-house detail pages (see get_image below)
        validate_driver = web_driver_manager.get_web_driver()
        for region in get_all_region():
            now_page = region.get("now_page")
            while True:
                real_estate_driver = web_driver_manager.get_web_driver()
                # Fetch the real-estate project listing
                url = self.base_url % (region.get("region").encode("utf8"),
                                       now_page)
                if not real_estate_driver.send_url(url, "pre"):
                    # Log message: "<region> real-estate collection finished"
                    logger.info(
                        region.get("region").encode("utf8") + "房产信息收集完成")
                    update_region(region.get("id"), now_page)
                    break
                # Advance the page counter once the request has completed
                logger.info(region.get("region") + ":" + str(now_page))
                now_page += 1
                real_estate = real_estate_driver.find_element_by_tag_name(
                    "pre").text
                # Close the web page
                web_driver_manager.destory_web_driver(
                    real_estate_driver.get_id())
                if not real_estate:
                    # Empty page: the stored page number is reset to 1
                    logger.info(
                        region.get("region").encode("utf8") + "房产信息收集完成")
                    update_region(region.get("id"), 1)
                    break
                # Parse the real-estate project: brackets are stripped and
                # single quotes turned into double quotes before JSON decoding
                json_rep = json.loads(
                    real_estate.encode("utf8").replace("[", "").replace(
                        "]", "").replace("'", "\""))
                list_json_rep = [json_rep]
                for item in list_json_rep:
                    try:
                        # Look up this project's sale status; skip it when
                        # everything is sold out
                        real_estate_name = item.get("ZPROJECT")
                        real_estate_result = get_real_estate_sale_status(
                            real_estate_name=real_estate_name)
                        if real_estate_result and real_estate_result.get("house_total_count") != 0 \
                                and real_estate_result.get("house_sell_out_count") != 0 \
                                and real_estate_result.get("house_total_count") == real_estate_result.get("house_sell_out_count"):
                            continue
                        # Insert or look up the real-estate project
                        # (note: this rebinds real_estate, which held the page text)
                        real_estate = get_real_estate(real_estate_name,
                                                      region.get("id"))
                        if real_estate:
                            real_estate_id = real_estate.get("id")
                        else:
                            real_estate = RealEstate()
                            real_estate.name = real_estate_name
                            real_estate.region = region.get("id")
                            real_estate.address = item.get("F_ADDR")
                            real_estate.developer = item.get("ENTERPRISENAME")
                            real_estate.sale_building = item.get("F_BLOCK")
                            real_estate.sale_count = item.get("NUM")
                            real_estate.source_id = WebSource.RealEstate
                            real_estate.house_total_count = 0
                            real_estate.house_sell_out_count = 0
                            real_estate_id = real_estate.__add__()
                        # Building data: parallel comma-separated fields
                        build_name = item.get("F_BLOCK").split(",")
                        build_id = item.get("BUILDID").split(",")
                        build_register = item.get("F_REGISTER_DATE").split(",")
                        build_residence_count = item.get("BUILDZZNUM").split(
                            ",")
                        build_none_residence_count = item.get(
                            "BUILDFZZNUM").split(",")
                        # All buildings in this project
                        # NOTE(review): the last split element is skipped --
                        # presumably the fields end with a trailing comma; confirm
                        for index in range(len(build_id) - 1):
                            sale_building = build_name[index].replace("'", "")
                            # Insert or look up the building
                            building = get_building_sale_status(
                                sale_building, real_estate_id)
                            if building:
                                building_id = building.get("id")
                            else:
                                building = Building()
                                building.sale_building = sale_building
                                building.web_build_id = int(build_id[index])
                                building.register_time = datetime.datetime.strptime(
                                    build_register[index], "%Y-%m-%d")
                                building.sale_residence_count = int(
                                    build_residence_count[index])
                                building.sale_none_residence_count = int(
                                    build_none_residence_count[index])
                                building.source_id = WebSource.RealEstate
                                building.real_estate_id = int(real_estate_id)
                                building.total_count = 0
                                building.sale_count = 0
                                # NOTE(review): when the project already existed,
                                # real_estate is the lookup result read with
                                # .get(...) above -- confirm it also has .name
                                building.real_estate_name = real_estate.name
                                building_id = building.__add__()
                            # Look up this building's sale status; skip it when
                            # everything is sold out
                            building_sale_result = get_building_sale_status(
                                sale_building, real_estate_id)
                            if building_sale_result and building_sale_result.get("total_count") != 0 and building_sale_result.get("sale_count") != 0 \
                                and building_sale_result.get("total_count") == building_sale_result.get("sale_count"):
                                continue
                            # All houses inside one building
                            driver_house = web_driver_manager.get_web_driver()
                            houses_url = "http://www.cq315house.com/315web/HtmlPage/ShowRoomsNew.aspx?block=%s&buildingid=%s" %\
                                         (sale_building.encode("utf8"), int(build_id[index]))
                            driver_house.send_url(houses_url)
                            house_soup = BeautifulSoup(
                                driver_house.page_source, "html.parser")
                            # Close the web page
                            web_driver_manager.destory_web_driver(
                                driver_house.get_id())
                            # Check whether the request succeeded
                            if not house_soup.find(
                                    "img", attrs={"id": "projectInfo_img"}):
                                continue
                            # Pre-sale permit: JSON carried in the "text="
                            # part of the image's src attribute
                            pre_sale_number = json.loads(
                                unquote(
                                    house_soup.find(
                                        "img", attrs={
                                            "id": "projectInfo_img"
                                        }).attrs.get("src").split("text=")
                                    [1])).get("presaleCert")
                            # Turn %uXXXX escapes into UTF-8 encoded text
                            pre_sale_number = pre_sale_number.replace(
                                "%u", "\\u").decode(
                                    "raw_unicode_escape").encode("utf-8")
                            update_building(pre_sale_number, building_id)
                            tbody = house_soup.find("table",
                                                    attrs={
                                                        "id": "_mybuilding"
                                                    }).find("tbody")
                            trs = tbody.find_all("tr")
                            # List of units
                            unit_td_list = house_soup.find_all(
                                "input", attrs={"name": "unitb"})
                            unit_list = list()
                            for unit_temp in unit_td_list:
                                unit_list.append(unit_temp.next)
                            # Whether any house was added
                            is_add_house = False
                            house_count_dict = div_list_return_dict(
                                range(len(trs[0].find_all("td")) - 2),
                                len(unit_list))
                            for tr in trs:
                                tds = tr.find_all("td",
                                                  attrs={"objt": "tdclass"})
                                for td_index, td in enumerate(tds):
                                    # Never set to True below (the assignment is
                                    # commented out), so the finally is a no-op
                                    is_exception = False
                                    try:
                                        # Is this cell a house? Hidden cells are skipped
                                        if "display:none" in td.attrs.get(
                                                "style").replace(" ", ""):
                                            continue
                                        # Each individual house
                                        # Unit number
                                        house_unit = get_unit(
                                            house_count_dict, unit_list,
                                            td_index).encode("utf8").decode(
                                                "utf8").replace(" ", "")
                                        # Door number
                                        door_number = td.find(
                                            "font").text.replace(" ", "")
                                        logger.info(
                                            "%s %s %s %s %s %s" %
                                            (datetime.datetime.now(),
                                             region.get("region").encode(
                                                 "utf8").decode("utf8"),
                                             real_estate_name, sale_building,
                                             house_unit, door_number))
                                        if not validate_house_door_number(
                                                door_number):
                                            continue
                                        # Sale status
                                        house_status_page = self.get_house_status_page(
                                            td)
                                        if house_status_page <= 0:
                                            # Could not get the sale status; skip this room
                                            continue
                                        # Check in the database whether the room is already known
                                        house_status = get_house_status(
                                            door_number, real_estate_id,
                                            building_id, house_unit)
                                        if house_status:
                                            # Already stored: skip it after
                                            # updating the status if it changed
                                            if int(house_status.get("status")
                                                   ) != house_status_page:
                                                update_house_status(
                                                    house_status.get("id"),
                                                    house_status_page)
                                            if not house_status.get(
                                                    "web_house_id"):
                                                update_web_house_id(
                                                    td.find("input").attrs.get(
                                                        "value"),
                                                    house_status.get("id"))
                                            continue
                                        is_add_house = True
                                        # Unsold house: build the detail URL from
                                        # the link's onclick attribute
                                        validate_url = "http://www.cq315house.com/315web/" + \
                                                       td.find("a").attrs.get("onclick").split("../")[1].split("');")[0]
                                        # Captcha
                                        # self.get_internet_validate_code(validate_driver, validate_url)
                                        self.get_image(validate_driver,
                                                       validate_url)
                                        one_house_soup = BeautifulSoup(
                                            validate_driver.page_source,
                                            "html.parser")
                                        # Raised message: "cannot get house data"
                                        if not one_house_soup.find("img"):
                                            raise BaseException(u"无法获取房子数据")
                                        one_house_data = unquote(
                                            one_house_soup.find(
                                                "img",
                                                attrs={
                                                    "id": "roomInfo_img"
                                                }).attrs.get("src").split(
                                                    "text=")[1].replace(
                                                        "%u", "\\u").decode(
                                                            "unicode-escape"))
                                        if not one_house_data:
                                            raise BaseException(u"无法获取房子数据")
                                        if one_house_data and "undefined-undefined" in one_house_data:
                                            raise BaseException(u"无法获取房子数据")
                                        json_data = json.loads(one_house_data)
                                        # House type "其他" ("other") is not stored
                                        if json_data.get("HX") == u"其他":
                                            continue
                                        house = House()
                                        house.door_number = door_number
                                        house.status = house_status_page
                                        house.inside_area = json_data.get(
                                            "TNMJ")
                                        house.built_area = json_data.get(
                                            "JZMJ")
                                        house.house_type = json_data.get("HX")
                                        house.inside_price = json_data.get(
                                            "NSDJ_TN")
                                        house.built_price = json_data.get(
                                            "NSDJ_JM")
                                        house.buliding_id = building_id
                                        house.real_estate_id = real_estate_id
                                        house.source_id = 1
                                        house.unit = house_unit
                                        house.web_house_id = td.find(
                                            "input").attrs.get("value")
                                        house.__add__()
                                        logger.info("套内单价:%s, 套内面积:%s" %
                                                    (house.inside_price,
                                                     house.inside_area))
                                    except BaseException as e1:
                                        # Inner-level failure ("内层"): replace the
                                        # detail-page driver and go on with the next cell
                                        # is_exception = True
                                        logger.error(u"内层")
                                        web_driver_manager.destory_web_driver(
                                            validate_driver.get_id())
                                        logger.error(e1)
                                        validate_driver = web_driver_manager.get_web_driver(
                                            True)
                                        continue
                                    finally:
                                        if is_exception:
                                            update_region(
                                                region.get("id"), now_page)
                            if is_add_house:
                                # Refresh the building's total and on-sale house counts
                                building_static_data = get_building_statictics_data(
                                    building_id, real_estate_id)
                                update_building_count(
                                    building_id,
                                    building_static_data.get("total_count"),
                                    building_static_data.get("sale_count"))

                        # Aggregate the project's statistics
                        static_data = get_real_estate_statictics_data(
                            real_estate_id)
                        update_real_estate_count(
                            real_estate_id,
                            static_data.get("sum(total_count)"),
                            static_data.get("sum(sale_count)"))
                    except BaseException as e2:
                        # Outer-level failure ("外层"): log and move on
                        logger.error(u"外层")
                        logger.error(e2)
                        continue
                    finally:
                        # Progress is saved whether or not the project succeeded
                        update_region(region.get("id"), now_page)
                update_region(region.get("id"), now_page)
Ejemplo n.º 6
0
 def get_expression_code(self, ):
     """
     Recognise the captcha expression and compute its answer.

     Four recognisers run in turn; a recogniser that raises counts as
     "no result".  When none of the first three produced anything, the
     fourth result is used if present.  Otherwise each of the first three
     is scored with ``confirm_return_express`` against the other two and
     the best one is picked, the third winning every tie against it.

     :return: tuple ``(expression, self.compute_code(expression))``;
         ``expression`` is ``0`` when nothing usable was recognised
     """
     def attempt(recognise):
         # Any failure of a single recogniser means "no result".
         try:
             return recognise()
         except:
             return None

     expression = None
     try:
         # Plain image recognition
         first = attempt(lambda: self.get_internet_validate_code())
         logger.info(u"图片识别:%s" % first)
         # Recognition with image correction
         second = attempt(lambda: self.image_corde_correct())
         logger.info(u"图片识别修正:%s" % second)
         # Recognition by comparing image parts
         third = attempt(lambda: self.compare_image_correct(
             operator_img_url=(self.base_image_path + "operator.png"),
             number1_img_url=(self.base_image_path + "num1.png"),
             number2_img_url=(self.base_image_path + "num2.png")))
         logger.info(u"图片比较识别:%s" % third)
         # Comparison against previously successful images
         fourth = attempt(lambda: self.compare_success_img())
         logger.info(u"成功图片比较:%s" % fourth)

         if first or second or third:
             score_first = self.confirm_return_express(
                 first, [second, third])
             score_second = self.confirm_return_express(
                 second, [first, third])
             score_third = self.confirm_return_express(
                 third, [first, second])
             # Best of the first two, then it must strictly beat the third.
             if score_first > score_second:
                 candidate, candidate_score = first, score_first
             else:
                 candidate, candidate_score = second, score_second
             if candidate_score > score_third:
                 expression = candidate
             else:
                 expression = third
         elif fourth:
             expression = fourth
         else:
             logger.info(u"图片识别失败")
     except BaseException as e:
         logger.info(e)
     logger.info(u"验证码:%s" % expression)
     if not expression:
         expression = 0
         logger.error(u"错误")
     # Compute the captcha answer
     return expression, self.compute_code(expression)
Ejemplo n.º 7
0
 def work(self):
     """Refresh stored houses one by one by solving each detail page's captcha.

     Loops until ``pool.find_one(self.base_select_sql)`` returns nothing.
     For each house the detail page is opened, the captcha image is cropped
     out of a screenshot and recognised, the answer is submitted and, when
     the resulting URL contains ``"bid"``, the house row is updated from
     the JSON embedded in the page.  When the building or project id of the
     current house differs from the previous one, the previous building's
     or project's counters are recomputed.

     Any error is logged, the web driver is replaced and the loop goes on.
     Nothing is returned.
     """
     delete_logs()
     options = webdriver.ChromeOptions()
     options.add_argument("headless")
     web_driver_manager = WebDriverManager(1, "chrome", options)
     house_driver = web_driver_manager.get_web_driver(True)
     # Building / project whose statistics are still pending
     buliding_id = 0
     real_estate_id = 0
     while True:
         try:
             house = pool.find_one(self.base_select_sql)
             if not house:
                 # Log message: "data collection finished"
                 logger.info(u"数据收集完成")
                 break
             # NOTE(review): if the select keeps returning the same row
             # without a web_house_id this loops forever -- confirm the
             # query excludes such rows.
             if not house.get("web_house_id"):
                 continue
             house_driver.send_url(
                 (self.base_house_url % house.get("web_house_id")))
             # Screenshot the whole page
             house_driver.save_screenshot(self.save_image_url)
             # Crop the screenshot down to the captcha image, in place
             img = house_driver.find_element_by_tag_name("img")
             location_img_url = self.save_image_url
             left = img.location.get("x")
             top = img.location.get("y")
             width = left + img.size.get("width")
             height = top + img.size.get("height")
             image = Image.open(location_img_url).crop(
                 (left, top, width, height))
             image.save(location_img_url)
             # Guard against the image not having been written yet
             time.sleep(3)
             # Recognise the captcha
             image_recognition = ImageRecognition(self.base_image_path,
                                                  self.save_image_url)
             expression, int_code = image_recognition.get_expression_code()
             # Submit the captcha answer
             code_input = house_driver.find_element_by_id("txtCode")
             code_input.send_keys(int_code)
             house_driver.find_element_by_id("Button1").click()
             one_house_url = house_driver.current_url
             if "bid" in one_house_url:
                 # Keep the image of a successfully solved captcha
                 image_recognition.save_success_image(
                     self.save_image_url, expression)
                 # Collect the data
                 one_house_soup = BeautifulSoup(house_driver.page_source,
                                                "html.parser")
                 # Raised message: "cannot get house data"
                 if not one_house_soup.find("img"):
                     raise BaseException(u"无法获取房子数据")
                 one_house_data = unquote(
                     one_house_soup.find(
                         "img", attrs={
                             "id": "roomInfo_img"
                         }).attrs.get("src").split("text=")[1].replace(
                             "%u", "\\u").decode("unicode-escape"))
                 if not one_house_data:
                     raise BaseException(u"无法获取房子数据")
                 if "undefined-undefined" in one_house_data:
                     raise BaseException(u"无法获取房子数据")
                 json_data = json.loads(one_house_data)
                 # Unknown or missing status text falls back to 7
                 house_status = chinese_status.get(
                     json_data.get("FWZT")) or 7
                 inside_area = json_data.get("TNMJ")
                 built_area = json_data.get("JZMJ")
                 house_type = json_data.get("HX")
                 inside_price = json_data.get("NSDJ_TN")
                 built_price = json_data.get("NSDJ_JM")
                 pool.commit(self.base_update_sql, [
                     house_status, inside_area, built_area, house_type,
                     inside_price, built_price,
                     datetime.datetime.now(),
                     house.get("id")
                 ])
                 logger.info(u"thread:%s, %s:套内单价:%s, 套内面积:%s" %
                             (self.thread_no, house.get("door_number"),
                              inside_price, inside_area))
                 # Statistics
                 # A different building: recompute the previous building's counters
                 if buliding_id and buliding_id != house.get("buliding_id"):
                     sql_count_house = """select * from
                                   (select count(1) as sale_number from house where buliding_id=%s and status=2) as a, 
                                   (select count(1) as total_number from house where buliding_id=%s) as b, 
                                   (select count(1) as sold_number from house where `status` in (3,4,5) and buliding_id=%s) as c"""
                     result_count_house = pool.find_one(
                         sql_count_house,
                         [buliding_id, buliding_id, buliding_id],
                         sql_analysis=False)
                     sql_update_buliding = """update building set sale_residence_count=%s, total_count=%s, sale_count=%s, updated=%s where id=%s"""
                     pool.commit(sql_update_buliding, [
                         result_count_house[0], result_count_house[1],
                         result_count_house[2],
                         datetime.datetime.now(), buliding_id
                     ])
                     buliding_id = house.get("buliding_id")
                 # A different project: recompute the previous project's counters
                 if real_estate_id and real_estate_id != house.get(
                         "real_estate_id"):
                     sql_count_buliding = """select sum(sale_residence_count), sum(total_count), sum(sale_count) from building where real_estate_id=%s"""
                     result_count_buliding = pool.find_one(
                         sql_count_buliding, [real_estate_id])
                     sql_update_real_estate = """update real_estate set sale_count=%s, house_total_count=%s, house_sell_out_count=%s, updated=%s where id=%s"""
                     pool.commit(sql_update_real_estate, [
                         result_count_buliding.get(
                             "sum(sale_residence_count)"),
                         result_count_buliding.get("sum(total_count)"),
                         result_count_buliding.get("sum(sale_count)"),
                         datetime.datetime.now(), real_estate_id
                     ])
                     real_estate_id = house.get("real_estate_id")
                 # First processed house: start tracking its building and project
                 if not buliding_id:
                     buliding_id = house.get("buliding_id")
                     real_estate_id = house.get("real_estate_id")
         except BaseException as e:
             logger.error(e)
             try:
                 web_driver_manager.destory_web_driver(
                     house_driver.get_id())
             except BaseException as e2:
                 # Report through the logger like every other error here
                 # (was a bare Python 2 print statement).
                 logger.error(e2)
                 # Last resort: kill every chromedriver process (Windows only)
                 command = u"taskkill /F /IM chromedriver.exe"
                 os.system(command)
             house_driver = web_driver_manager.get_web_driver(True)