def crawler():
    """Collect Anjuke city codes, then per-city listing info, persisting progress to JSON.

    Side effects: launches a Chrome browser and writes
    "anjuke_city_code.json" and "anjuke_city_info.json" in the working
    directory. Progress is saved after every city so an interrupted run
    can resume where it left off.
    """
    driver = Chrome(cache_path=r"E:\Temp")
    try:
        # Collect the list of city codes.
        spider_city_code = SpiderCityCode(driver)
        city_codes = spider_city_code.run()
        Utils.io.write_json("anjuke_city_code.json", city_codes)

        # Collect the listing info for each city.
        city_code_list = Utils.io.load_json("anjuke_city_code.json")
        # BUG FIX: progress was previously loaded from "anjuke_city_infor.json"
        # (typo) but written to "anjuke_city_info.json", so resuming never saw
        # earlier results. Load the same file that is written below.
        city_info_list = Utils.io.load_json("anjuke_city_info.json", default={})
        spider_city_info = SpiderCityInfo(driver)
        for city_name, city_code in city_code_list.items():
            if city_name not in city_info_list:  # skip cities already scraped
                city_info_list[city_name] = spider_city_info.run(city_code=city_code)
                # Persist after each city so an interruption loses at most one.
                Utils.io.write_json("anjuke_city_info.json", city_info_list)
                time.sleep(2)  # throttle requests to avoid being blocked
    finally:
        # BUG FIX: always release the browser, even if a spider raises.
        driver.quit()
# Example #2
# 0
from typing import Dict, List

import crawlertool as tool
from Selenium4R import Chrome


class SpiderAnjukeCityCodeList(tool.abc.SingleSpider):
    """Spider for the Anjuke city-code list.

    Scrapes https://www.anjuke.com/sy-city.html and extracts, for every
    city link on the page, the visible city name and the subdomain that
    serves as the city code (e.g. "https://beijing.anjuke.com/" -> "beijing").
    """

    def __init__(self, driver):
        # Selenium WebDriver (Selenium4R Chrome wrapper) used for scraping.
        self.driver = driver

    def running(self) -> List[Dict]:
        """Fetch the city-list page and return
        [{"city_name": ..., "city_code": ...}, ...] for every city link.

        NOTE: the ``List``/``Dict`` annotation requires
        ``from typing import Dict, List`` at module level, which the
        original file was missing.
        """
        self.driver.get("https://www.anjuke.com/sy-city.html")

        result = []
        for city_label in self.driver.find_elements_by_css_selector(
                "body > div.content > div > div.letter_city > ul > li > div > a"
        ):
            city_name = city_label.text
            # Derive the city code from the link's subdomain:
            # "https://<code>.anjuke.com/" -> "<code>".
            city_code = city_label.get_attribute("href").replace(
                "https://", "").replace(".anjuke.com/", "")
            result.append({"city_name": city_name, "city_code": city_code})

        return result


if __name__ == "__main__":
    # Ad-hoc manual run: scrape the city-code list and print it.
    driver = Chrome(cache_path=r"E:\Temp")
    try:
        print(SpiderAnjukeCityCodeList(driver).running())
    finally:
        # BUG FIX: quit the browser even when running() raises, so a failed
        # run does not leak a Chrome process.
        driver.quit()