def update_web_house_id(web_house_id, id): """ 修改房子网页id :param web_house_id: :param id: :return: """ sql = """update house set web_house_id=%s, updated=%s where id=%s""" pool.commit(sql, [web_house_id, datetime.datetime.now(), id])
def parse(self, response): try: if not response.text: raise BaseException(u"需要切换代理") if not is_json(response.text): raise BaseException(u"需要切换代理") self.is_change_proxy = False if "GetRoomJson" in response._url: list_json_response = json.loads( json.loads(response.text).get("d").replace("\\", "\\\\")) if not isinstance(list_json_response, list): raise BaseException(u"返回数据错误:%s" % list_json_response) for item_json_data in list_json_response: for item_room in item_json_data.get("rooms"): if not item_room.get("location"): continue status = self.get_house_status(BuildingSpider, item_room.get("status")) key = item_room.get("id") if int(self.dict_all_houses.get(key).get( "status")) != status: self.dict_all_houses.get(key)["status"] = status self.list_house_key.append(key) yield self.get_house_request() if len(self.list_house_key) == 0: yield self.create_request() elif "GetRoomInfo" in response._url: json_response = json.loads(response.text) room = json.loads(json_response.get("d")) key = room.get("id") new_description = json.dumps( dict( json.loads( self.dict_all_houses.get(key).get("description")), **room)) sql = """update house set status=%s, updated=%s, description=%s where id=%s""" pool.commit(sql, [ self.dict_all_houses.get(key).get("status"), datetime.datetime.now(), new_description, self.dict_all_houses.get(key).get("id") ]) logger.info(u"修改房屋状态:%s" % room.get("location")) self.list_house_key.remove(key) if len(self.list_house_key) == 0: yield self.create_request() except BaseException as e: if type(e) == CloseSpider: logger.warning(u"爬虫停止") raise CloseSpider() logger.warning(e) if e.message: logger.warning(e.message) self.is_change_proxy = True if "GetRoomJson" in response.request.url or "GetRoomInfo" in response.request.url: yield response.request else: yield self.create_request()
def update_house_status(house_id, status): """ 更改房间出售情况 :param house_id: :param status: :return: """ sql = """update house set status=%s, updated=%s where id=%s""" param = [status, datetime.datetime.now(), house_id] pool.commit(sql, param)
def update_building_count(building_id, total_count, sale_count): """ 更改大楼房子总数量,出售总数量 :param building_id: :param total_count: :param sale_count: :return: """ sql = """update building set total_count=%s, sale_count=%s, updated=%s where id=%s""" param = [total_count, sale_count, datetime.datetime.now(), building_id] pool.commit(sql, param)
def handle_building(self, origin_house_number): select_sql = """select count(1) from house where building_id=%s""" result = pool.find_one(select_sql, [self.building.get("id")]) update_status = 4 if int(result.get("count(1)")) == origin_house_number: update_status = 2 update_sql = """update building set status=%s, updated=%s where status=1 and id=%s""" pool.commit( update_sql, [update_status, datetime.datetime.now(), self.building.get("id")])
def update_building(pre_sale_number, id): """ 更改大楼预售许可证 :param pre_sale_number: :param id: :return: """ find_sql = """select * from building where id=%s""" result_find = pool.find_one(find_sql, [id]) if result_find.get("per_sale_number"): return sql = """update building set pre_sale_number=%s where id=%s""" param = [pre_sale_number, id] pool.commit(sql, param)
def update_real_estate_count(real_estate_id, house_total_count, house_sell_out_count): """ 更改楼盘房子总数量,出售总数量 :param real_estate_id: :param house_total_count: :param house_sell_out_count: :return: """ sql = """update real_estate set house_total_count=%s, house_sell_out_count=%s, updated=%s where id=%s""" param = [ house_total_count, house_sell_out_count, datetime.datetime.now(), real_estate_id ] pool.commit(sql, param)
def save(self): sql = """ insert into cart_package(jpe_name, chi_name, abbreviation, eng_name, punish_time, tags_rarity_id, created) VALUES (%s, %s, %s, %s, %s, %s, %s) """ param = [ self.jpe_name, self.chi_name, self.abbreviation, self.eng_name, self.punish_time, self.tags_rarity_id, self.created ] return pool.commit(sql, param)
def save(self): sql = """ insert into cart_tags(chi_name, jap_name, eng_name, tags_chi_name, tags_eng_name) VALUES (%s, %s, %s, %s, %s) """ param = [ self.chi_name, self.jap_name, self.eng_name, self.tags_chi_name, self.tags_eng_name ] return pool.commit(sql, param)
def get_house_attribute(internet_type, name): select_sql = """select * from house_attribute where chinese_name=%s and internet_type=%s""" result_house_attribute = pool.find_one(select_sql, param=[name, internet_type]) if result_house_attribute: return result_house_attribute.get("id") else: insert_sql = """insert into house_attribute(chinese_name, internet_type, created) values(%s, %s, %s)""" return pool.commit( insert_sql, param=[name, internet_type, datetime.datetime.now()])
def save(self): self.created = datetime.datetime.now() sql = """ insert into cart(chi_name, eng_name, jpe_name, level, icon_url, effect, attack, defend, link, created, effect_detail, tags_type_id) VALUE (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ list_param = (self.chi_name, self.eng_name, self.jpe_name, self.level, self.icon_url, self.effect, self.attack, self.defend, self.link, self.created, self.effect_detail, self.tags_type_id) return pool.commit(sql, list_param)
def set_spider_now_page(new_page): sql = """ update spider_conf set value = %s, updated=%s where name="spider_now_page" """ param = [new_page, datetime.datetime.now()] pool.commit(sql, param)
def save(self): sql = """insert into cart_detail(cart_id, cart_tags_id, created, cart_tags_eng_name) values (%s, %s, %s, %s)""" param = [self.cart_id, self.cart_tags_id, datetime.datetime.now(), self.cart_tags_eng_name] return pool.commit(sql, param)
def update_region(region_id, now_page): sql = """update region set now_page=%s, updated=%s where id=%s""" param = [now_page, datetime.datetime.now(), region_id] pool.commit(sql, param)
def set_error_building_status(self, building_id): sql = """update building set status=3, updated=%s where id=%s and status=1""" pool.commit(sql, param=[datetime.datetime.now(), building_id])
def save(sql, param): return pool.commit(sql, param)
def work(self): delete_logs() options = webdriver.ChromeOptions() options.add_argument("headless") web_driver_manager = WebDriverManager(1, "chrome", options) house_driver = web_driver_manager.get_web_driver(True) # 统计数据 buliding_id = 0 real_estate_id = 0 while True: try: house = pool.find_one(self.base_select_sql) if not house: logger.info(u"数据收集完成") break if not house.get("web_house_id"): continue house_driver.send_url( (self.base_house_url % house.get("web_house_id"))) # 截图整个网页 house_driver.save_screenshot(self.save_image_url) # 保存图片 img = house_driver.find_element_by_tag_name("img") location_img_url = self.save_image_url left = img.location.get("x") top = img.location.get("y") width = left + img.size.get("width") height = top + img.size.get("height") image = Image.open(location_img_url).crop( (left, top, width, height)) image.save(location_img_url) # 防止图片没有保存下来 time.sleep(3) # 识别图片 image_recognition = ImageRecognition(self.base_image_path, self.save_image_url) expression, int_code = image_recognition.get_expression_code() # 发送验证码请求 code_input = house_driver.find_element_by_id("txtCode") code_input.send_keys(int_code) house_driver.find_element_by_id("Button1").click() one_house_url = house_driver.current_url if "bid" in one_house_url: # 保存成功的图片 image_recognition.save_success_image( self.save_image_url, expression) # 收集数据 one_house_soup = BeautifulSoup(house_driver.page_source, "html.parser") if not one_house_soup.find("img"): raise BaseException(u"无法获取房子数据") one_house_data = unquote( one_house_soup.find( "img", attrs={ "id": "roomInfo_img" }).attrs.get("src").split("text=")[1].replace( "%u", "\\u").decode("unicode-escape")) if not one_house_data: raise BaseException(u"无法获取房子数据") if one_house_data and "undefined-undefined" in one_house_data: raise BaseException(u"无法获取房子数据") json_data = json.loads(one_house_data) # if json_data.get("HX") == u"其他": # continue house_status = chinese_status.get( json_data.get("FWZT")) if chinese_status.get( json_data.get("FWZT")) else 7 inside_area = json_data.get("TNMJ") built_area = json_data.get("JZMJ") house_type = json_data.get("HX") inside_price = json_data.get("NSDJ_TN") built_price = json_data.get("NSDJ_JM") pool.commit(self.base_update_sql, [ house_status, inside_area, built_area, house_type, inside_price, built_price, datetime.datetime.now(), house.get("id") ]) logger.info(u"thread:%s, %s:套内单价:%s, 套内面积:%s" % (self.thread_no, house.get("door_number"), inside_price, inside_area)) # 统计数据 # 不同大楼,此时统计该栋楼的数据 if buliding_id and buliding_id != house.get("buliding_id"): sql_count_house = """select * from (select count(1) as sale_number from house where buliding_id=%s and status=2) as a, (select count(1) as total_number from house where buliding_id=%s) as b, (select count(1) as sold_number from house where `status` in (3,4,5) and buliding_id=%s) as c""" result_count_house = pool.find_one( sql_count_house, [buliding_id, buliding_id, buliding_id], sql_analysis=False) sql_update_buliding = """update building set sale_residence_count=%s, total_count=%s, sale_count=%s, updated=%s where id=%s""" pool.commit(sql_update_buliding, [ result_count_house[0], result_count_house[1], result_count_house[2], datetime.datetime.now(), buliding_id ]) buliding_id = house.get("buliding_id") # 不同楼盘,此时统计楼盘数据 if real_estate_id and real_estate_id != house.get( "real_estate_id"): sql_count_buliding = """select sum(sale_residence_count), sum(total_count), sum(sale_count) from building where real_estate_id=%s""" result_count_buliding = pool.find_one( sql_count_buliding, [real_estate_id]) sql_update_real_estate = """update real_estate set sale_count=%s, house_total_count=%s, house_sell_out_count=%s, updated=%s where id=%s""" pool.commit(sql_update_real_estate, [ result_count_buliding.get( "sum(sale_residence_count)"), result_count_buliding.get("sum(total_count)"), result_count_buliding.get("sum(sale_count)"), datetime.datetime.now(), real_estate_id ]) real_estate_id = house.get("real_estate_id") if not buliding_id: buliding_id = house.get("buliding_id") real_estate_id = house.get("real_estate_id") except BaseException as e: logger.error(e) try: web_driver_manager.destory_web_driver( house_driver.get_id()) except BaseException as e2: print e2 command = u"taskkill /F /IM chromedriver.exe" os.system(command) house_driver = web_driver_manager.get_web_driver(True)