def crawler():
    """Crawl Anjuke city codes, then the listing count for each city.

    Results are persisted to JSON after every city so the run can be resumed:
    cities already present in the info file are skipped on the next run.
    NOTE(review): ``SpiderCityCode``, ``SpiderCityInfo``, ``Utils`` and ``time``
    are presumably imported/defined elsewhere in this project — confirm.
    """
    driver = Chrome(cache_path=r"E:\Temp")
    try:
        # Step 1: collect the city -> city-code mapping and persist it.
        spider_city_code = SpiderCityCode(driver)
        result1 = spider_city_code.run()
        Utils.io.write_json("anjuke_city_code.json", result1)

        # Step 2: collect the listing count for each city.
        city_code_list = Utils.io.load_json("anjuke_city_code.json")
        # BUG FIX: previously loaded from "anjuke_city_infor.json" (typo) while
        # progress was written to "anjuke_city_info.json", so earlier progress
        # was never found and every city was re-crawled on restart.
        city_info_list = Utils.io.load_json("anjuke_city_info.json", default={})
        spider_city_info = SpiderCityInfo(driver)
        for city_name, city_code in city_code_list.items():
            if city_name not in city_info_list:
                city_info_list[city_name] = spider_city_info.run(city_code=city_code)
                # Checkpoint after each city so partial progress survives a crash.
                Utils.io.write_json("anjuke_city_info.json", city_info_list)
                time.sleep(2)  # polite delay between cities to avoid being blocked
    finally:
        # Always release the browser, even if a spider call raises.
        driver.quit()
from typing import Dict, List

import crawlertool as tool
from Selenium4R import Chrome


class SpiderAnjukeCityCodeList(tool.abc.SingleSpider):
    """Anjuke city-code list spider.

    Scrapes https://www.anjuke.com/sy-city.html and extracts every city's
    display name together with its Anjuke subdomain code (e.g. the ``bj``
    in ``https://bj.anjuke.com/``).
    """

    def __init__(self, driver):
        # A ready-to-use Selenium WebDriver (ownership stays with the caller,
        # who is responsible for quitting it).
        self.driver = driver

    def running(self) -> List[Dict]:
        """Return a list of ``{"city_name": ..., "city_code": ...}`` dicts.

        BUG FIX: ``List``/``Dict`` were used in this annotation without being
        imported, which raised ``NameError`` when the class was defined.
        """
        self.driver.get("https://www.anjuke.com/sy-city.html")
        result = []
        for city_label in self.driver.find_elements_by_css_selector(
            "body > div.content > div > div.letter_city > ul > li > div > a"
        ):
            city_name = city_label.text
            # The city code is the subdomain of the link:
            # "https://bj.anjuke.com/" -> "bj".
            city_code = city_label.get_attribute("href").replace(
                "https://", "").replace(".anjuke.com/", "")
            result.append({"city_name": city_name, "city_code": city_code})
        return result


if __name__ == "__main__":
    driver = Chrome(cache_path=r"E:\Temp")
    print(SpiderAnjukeCityCodeList(driver).running())
    driver.quit()