Example 1
class Bus8684Spider(scrapy.Spider):
    """
		sys.exit code == 1 # missing BUS8684_CITY_LIST
	"""
    name = "bus8684"

    root_path = ""
    log_dir = ""
    debug = False
    bus8684_city_list = []
    save_every_response = False
    crawled_dir = ""
    saved_html_dir = ""
    gaode_json_dir = ""
    output_folder_name = ""
    base_uri = ""
    run_purpose = None

    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.bus8684_city_list = self.settings.get("BUS8684_CITY_LIST",
                                                   default=[])
        if 1 > len(self.bus8684_city_list):
            self.logger.error(
                f"missing BUS8684_CITY_LIST ({self.bus8684_city_list}) setting"
            )
            sys.exit(1)
        self.save_every_response = self.settings.get(
            name="SAVE_EVERY_RESPONSE", default=False)
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")
        self.gaode_json_dir = self.settings.get(name="SAVED_GAODE_JASON",
                                                default="")
        self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME",
                                                    default="")
        self.base_uri = self.settings.get(name="BASE_URI", default="")
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)

        self.maximal_requests_of_one_crontab_process = self.settings.get(
            name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=23)
        self.interval_between_requests = self.settings.get(
            name="INTERVAL_BETWEEN_REQUESTS", default=300)

    def check_dirs_and_files(self):
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)
        if not os.path.isdir(self.gaode_json_dir):
            os.makedirs(self.gaode_json_dir)

    def start_requests(self):
        """
			0 == index_level and "index" == page_type: https://guangzhou.8684.cn/
			1 == index_level: https://guangzhou.8684.cn/list1 # the list1 page displays links to Bus Route 1, 10, 175, and so on
			"detailed" == page_type: https://guangzhou.8684.cn/x_8234e473 # this is the detailed page for Bus Route 10
		"""
        self.init_self_attributes()
        self.check_dirs_and_files()

        if "PRODUCTION_RUN" == self.run_purpose:
            number_day_of_this_year = datetime.datetime.now().timetuple(
            ).tm_yday  # type == int
            separate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if separate_into_days > len(self.bus8684_city_list):
                separate_into_days = len(self.bus8684_city_list)
            batch_count = math.ceil(
                len(self.bus8684_city_list) / separate_into_days)
            today_batch = number_day_of_this_year % separate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
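            # Illustrative arithmetic (numbers assumed): with 7 cities and CRAWL_BATCHES == 3,
            # batch_count == ceil(7 / 3) == 3; on a day when today_batch == 0, start_index == -1
            # and end_index == 3, so the loop below picks the cities at indexes 0, 1 and 2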
            urls = []
            for index, city in enumerate(self.bus8684_city_list):
                if (start_index < index) and (index < end_index):
                    urls.append(f"https://{city}.{self.base_uri}")

            meta_dict = {
                "page_type": "index",
                "index_level": 0,
            }
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "READ_HTML" == self.run_purpose:
            url = "http://quotes.toscrape.com/page/1/"
            yield scrapy.Request(url=url,
                                 callback=self.debug_one_method,
                                 meta={},
                                 dont_filter=True)

    def extract_links_from_list_page(self,
                                     response=None,
                                     city="",
                                     index_level_int=0,
                                     route_str=""):
        urls = []
        if index_level_int not in [
                0,
                1,
        ]:
            return urls
        if 0 == index_level_int:
            digit_href_list = response.xpath(
                "//div[@class='bus_kt_r1']/a/@href").extract()
            letter_href_list = response.xpath(
                "//div[@class='bus_kt_r2']/a/@href").extract()
            all_link_list = digit_href_list + letter_href_list
            for one_link in all_link_list:
                urls.append(f"https://{city}.{self.base_uri}{one_link}")
                # https://guangzhou.8684.cn/list1
                # one_link == "/list1", "/listB"
            return urls

        # 1 == index_level_int
        route_href_list = response.xpath(
            "//div[@id='con_site_1']/a/@href").extract()
        route_text_list = response.xpath(
            "//div[@id='con_site_1']/a/text()").extract()
        if len(route_href_list) != len(route_text_list):
            error_msg = f"length of route_href_list ({len(route_href_list)}) != length of route_text_list ({len(route_text_list)})"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            route_text_list = []
        for index, one_link in enumerate(route_href_list):
            temp_dict = {
                "url":
                f"https://{city}.{self.base_uri}{one_link}",
                # https://guangzhou.8684.cn/x_face82cc
                # one_link == "/x_f2148667", "/x_a72d3ade"
                "route":
                route_text_list[index] if index < len(route_text_list) else 0,
            }
            urls.append(temp_dict)
        return urls

    def parse_list_page(self, response=None):
        write_result_int, city, page_type, index_level, route_str = self.save_html(
            response=response)
        if -1 == write_result_int or "index" != page_type or index_level not in [
                0,
                1,
        ]:
            return False

        urls = self.extract_links_from_list_page(response=response,
                                                 city=city,
                                                 index_level_int=index_level,
                                                 route_str=route_str)
        if 0 == index_level:
            meta_dict = {
                "page_type": "index",
                "index_level": 1,
            }
            for one_url in urls:
                yield scrapy.Request(url=one_url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        else:
            meta_dict = {
                "page_type": "detailed",
                "index_level": -1,
            }
            for one_url_dict in urls:
                meta_dict["route"] = one_url_dict[
                    "route"] if "route" in one_url_dict.keys(
                    ) else "unknown_route"
                one_url = one_url_dict["url"] if "url" in one_url_dict.keys(
                ) else ""
                if 0 < len(one_url):
                    yield scrapy.Request(url=one_url,
                                         callback=self.parse_detailed_page,
                                         meta=meta_dict,
                                         dont_filter=True)
                else:
                    error_msg = f"wrong one_url_dict ({one_url_dict})"
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                    )

    def get_city_from_url(self, url=""):
        city = ""
        result_obj = parse.urlparse(url)
        if -1 < result_obj.netloc.find(self.base_uri):
            temp2_list = result_obj.netloc.split(".")
            if 3 == len(temp2_list):
                city = temp2_list[0]
        return city

    def make_html_file_name(self, response=None, city="", index_level=-1):
        """
			https://guangzhou.8684.cn/
			https://guangzhou.8684.cn/list1, https://guangzhou.8684.cn/listH
			https://guangzhou.8684.cn/x_8234e473, https://guangzhou.8684.cn/x_1ed58fbc
			response already has url and meta attributes
		"""
        now = datetime.datetime.now()
        html_filename_str = now.strftime("%Y%m%d_%H%M%S")
        today = now.strftime("%Y%m%d")

        url = response.url
        meta_dict = response.meta

        result_obj = parse.urlparse(url)
        url_path_list = result_obj.path.split("/")
        while "" in url_path_list:
            url_path_list.remove("")

        detailed_page_bool = False
        route_str = ""
        if 0 == len(url_path_list) and 0 == index_level:
            html_filename_str = f"{city}___index{index_level}___route_all___{today}.html"
        elif 1 == len(url_path_list):
            if -1 < url_path_list[0].find("list"):
                route_str = url_path_list[0].lstrip("list")
                if 1 > len(route_str):
                    route_str = "unknown"
                html_filename_str = f"{city}___index{index_level}___route_{route_str}___{today}.html"
            elif -1 < url_path_list[0].find("x_"):
                detailed_page_bool = True
                route_str = str(meta_dict["route"]
                                ) if "route" in meta_dict.keys() else "unknown"
                # route_str may contain Chinese characters, and Entrobus32 does not accept file names that include Chinese
                route_id = url_path_list[0]
                html_filename_str = f"{city}___detailed___route_{route_id}___{today}.html"
        else:
            html_filename_str = f"{city}___unknown___route_unknown___{html_filename_str}.html"
        return (detailed_page_bool, route_str, html_filename_str)

    def save_html(self, response=None):
        """
			returns -1: wrong response object
			-2: fail to write response.body
			1001: this is a detailed page
			101: more than 69 pages
			0 to 70: page number; 0:detailed page or fail to extract total page from list page
		"""
        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
            else:
                error_msg = f"fail to save response.body response has no url attribute and may have no body and / or meta attribute(s)"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return (-1, "", "", -1, "")
        url = response.url
        meta_dict = response.meta
        page_type = "index"
        index_level = -1
        route_str = ""
        city = self.get_city_from_url(url=url)
        html_file_path = ""

        if "page_type" in meta_dict.keys():
            page_type = meta_dict["page_type"]

        if "index_level" in meta_dict.keys():
            index_level = meta_dict["index_level"]
            index_level = CommonClass.safely_convert_to_int(
                to_int_obj=index_level,
                spider_obj=self,
                convert_strategy="match_all_digits")
            if index_level is None:
                index_level = -1
        elif "index" == page_type:
            error_msg = f"index_level is NOT in meta_dict.keys(); and page has NOT been saved after requesting {url} "
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        if "index" == page_type and -1 < index_level:
            detailed_page_bool, route_str, html_filename_str = self.make_html_file_name(
                response=response, city=city, index_level=index_level)
            html_file_path = os.path.join(self.saved_html_dir,
                                          html_filename_str)
        elif "detailed" == page_type:
            detailed_page_bool, route_str, html_filename_str = self.make_html_file_name(
                response=response, city=city, index_level=index_level)
            html_file_path = os.path.join(self.saved_html_dir,
                                          html_filename_str)

        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return (0, city, page_type, index_level, route_str)
        return (1, city, page_type, index_level, route_str)

    def parse_one_bus_route_fields(self,
                                   response=None,
                                   city_str="",
                                   route_str=""):
        if response is None:
            return {}

        try:
            url = response.url
            url_obj = parse.urlparse(url)
            bus_route_id = url_obj.path.strip("/")
            bus_line_div = response.xpath("//div[@id='bus_line']")
            bus_line_information_div = bus_line_div.xpath(
                "./div[@class='bus_line_information ']/div[@class='bus_i_content']"
            )
            bus_route_title = bus_line_information_div.xpath(
                "./div[@class='bus_i_t1']/h1/text()").extract_first(default="")
            bus_route_title = CommonClass.clean_string(string=bus_route_title,
                                                       char_to_remove=[
                                                           ' ',
                                                           ' ',
                                                           '\xa0',
                                                           '&nbsp',
                                                       ])
            bus_route_district = bus_line_information_div.xpath(
                "./div[@class='bus_i_t1']/a[@class='bus_i_t2']/text()"
            ).extract_first(default="")
            bus_route_info_list = bus_line_information_div.xpath(
                "./p[@class='bus_i_t4']/text()").extract()
            bus_route_info_str = ""
            if 0 < len(bus_route_info_list):
                bus_route_info_str = "___".join(bus_route_info_list)
            bus_operation_interval_str = bus_line_div.xpath(
                "./div[@class='bus_label ']/p[@class='bus_label_t2']/text()"
            ).extract_first(default="")

            bus_direction_dict = {}
            all_way_div_list = bus_line_div.xpath(
                "./div[@class='bus_line_top ']")
            for index, one_way_div in enumerate(all_way_div_list):
                one_way_name_text_list = one_way_div.xpath(
                    "./div/strong/text()").extract()
                one_way_name = "___".join(one_way_name_text_list) if 0 < len(
                    one_way_name_text_list) else ""
                span_text_list = one_way_div.xpath("./span/text()").extract()
                one_way_stop_number = "___".join(
                    span_text_list) if 0 < len(span_text_list) else ""
                if 0 < len(one_way_stop_number):
                    one_way_stop_number = CommonClass.clean_string(
                        string=one_way_stop_number,
                        char_to_remove=[
                            ' ',
                            ' ',
                            '\xa0',
                        ])
                temp_dict = {
                    "one_way_name": one_way_name,
                    "one_way_stop_number": one_way_stop_number,
                }
                bus_direction_dict[index] = temp_dict

            bus_route_stop_round_trip_list = bus_line_div.xpath(
                "./div[@class='bus_line_site ']")
            for index, one_direction in enumerate(
                    bus_route_stop_round_trip_list):
                stop_sequence_list = one_direction.xpath(
                    "./div[@class='bus_site_layer']/div/i/text()").extract()
                stop_name_list = one_direction.xpath(
                    "./div[@class='bus_site_layer']/div/a/text()").extract()
                if len(stop_name_list) == len(stop_sequence_list):
                    temp_list = []
                    for stop_name_index, stop_name in enumerate(
                            stop_name_list):
                        temp_list.append(
                            f"{stop_sequence_list[stop_name_index]}___{stop_name}"
                        )
                    if index in bus_direction_dict.keys():
                        bus_direction_dict[index]["stops"] = temp_list
                    else:
                        bus_direction_dict[index] = {"stops": temp_list}

            text_dict = {
                "route_title": bus_route_title.strip(),
                "city": city_str,
                "route_name": route_str,
                "route_id": bus_route_id.strip(),
                "route_uri": url,
                "route_district": bus_route_district.strip(),
                "route_info": bus_route_info_str.strip(),
                "operation_interval": bus_operation_interval_str.strip(),
                "bus_directions": bus_direction_dict,
            }
            return text_dict
        except Exception as ex:
            error_msg = f"Error happened during parsing. Exception = {ex}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return {}

    def debug_one_method(self, response):
        file_name = "guangzhou___detailed___route_405路(2019年7月13日起调整)___20190622.html"
        file_name = "guangzhou___detailed___route_花6路___20190622.html"
        html_dir = os.path.join(self.root_path, self.name,
                                self.output_folder_name, "20190622html")
        file_path = os.path.join(html_dir, file_name)
        if os.path.isfile(file_path):
            doc = None
            try:
                with open(file_path, "rb") as html_file:
                    doc = html_file.read().decode("utf-8", "ignore")
            except Exception as ex:
                self.logger.error(
                    f"Error: cannot read html file {file_path}. Exception = {ex}"
                )
                return False

            if doc is None:
                self.logger.error(f"Error: cannot read html file {file_path}.")
                return False
            url = "https://guangzhou.8684.cn/x_f2148667"
            response_for_items = TextResponse(url=url,
                                              status=200,
                                              body=bytes(doc,
                                                         encoding="utf-8"))

            write_result_int, city, page_type, index_level, route_str = self.save_html(
                response=response_for_items)
            text_dict = self.parse_one_bus_route_fields(
                response=response_for_items,
                city_str=city,
                route_str=route_str)
            text_dict["city"] = "guangzhou"
            text_dict["route_name"] = "花6路"
            if 0 < len(text_dict):
                try:
                    loader = ItemLoader(item=Bus8684Item(),
                                        response=response_for_items)
                    loader = self.load_items_into_loader(loader=loader,
                                                         text=text_dict,
                                                         url=url)
                    yield loader.load_item()
                except Exception as ex:
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
                    )

    def parse_detailed_page(self, response):
        write_result_int, city, page_type, index_level, route_str = self.save_html(
            response=response)
        text_dict = self.parse_one_bus_route_fields(response=response,
                                                    city_str=city,
                                                    route_str=route_str)
        # self.logger.debug( f"write_result_int, city, page_type, index_level, route_str = {(write_result_int, city, page_type, index_level, route_str)}" )
        if 0 < len(text_dict):
            self.logger.info(
                f"After requesting {response.url}, good response is received.")
            try:
                loader = ItemLoader(item=Bus8684Item(), response=response)
                loader = self.load_items_into_loader(loader=loader,
                                                     text=text_dict,
                                                     url=response.url)
                yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
                )

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date",
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))

        loader.add_value("content", str(text))
        loader.add_value("page_type", "detailed")

        return loader
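
For reference, Bus8684Spider above is configured entirely through Scrapy settings read in init_self_attributes. A minimal, hypothetical settings fragment that would satisfy those lookups might look like the sketch below; every value is illustrative, only the key names come from the spider code, and in the real project CommonClass.get_custom_settings_dict(spider="bus8684") supplies the per-spider settings.

# Hypothetical settings sketch for Bus8684Spider -- illustrative values only
import os

PROJECT_PATH = os.path.abspath(".")
LOG_DIR = os.path.join(PROJECT_PATH, "logs")
PROJECT_DEBUG = False
BASE_URI = "8684.cn"                         # spider builds https://{city}.{BASE_URI}
BUS8684_CITY_LIST = ["guangzhou", "foshan"]  # an empty list makes the spider sys.exit(1)
RUN_PURPOSE = "PRODUCTION_RUN"               # or "READ_HTML" for the debug branch
CRAWL_BATCHES = 3                            # cities are spread over this many days
OUTPUT_FOLDER_NAME = "outputs"
CRAWLED_DIR = os.path.join(PROJECT_PATH, "bus8684", OUTPUT_FOLDER_NAME, "crawled")
SAVED_HTML = os.path.join(PROJECT_PATH, "bus8684", OUTPUT_FOLDER_NAME, "html")
SAVED_GAODE_JASON = os.path.join(PROJECT_PATH, "bus8684", OUTPUT_FOLDER_NAME, "gaode_json")
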
Example 2
class Land3fangSpider(scrapy.Spider):
    """
		在分布式scrapyd部署之前,为了起多个fangesf进程而采取的临时措施(fangesfp2是本套代码的一个拷贝)。
		sys.exit code == 1 # wrong or missing RUN_PURPOSE
		sys.exit code == 2 # wrong or missing CRAWLED_DIR, SAVED_DETAIL_HTML, or SAVED_GAODE_JASON
		sys.exit code == 3 # fail to get proxy's ip
		sys.exit code == 4 # wrong city_code
		On 20190730 Peter writes this spider upon requests
	"""
    name = "land3fang"

    root_path = ""
    log_dir = ""
    resume_break_point_detailed_file_name = "crawled_detailed_html.log"
    resume_break_point_list_file_name = "crawled_list_html.log"
    crawled_list_url_list = []
    crawled_detailed_url_list = []
    debug = False
    city_list = []
    city_name_dict = {}
    run_purpose = None
    save_every_response = False
    overwrite_today = ""
    crawled_dir = ""
    saved_html_dir = ""
    over34_filename = ""

    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    proxy_ip_dict = {}
    min_proxy_ip_life_time = 6
    max_proxy_ip_life_time = 180
    use_proxy = False
    proxy_agent = ""

    cookie_string = ""
    cookie_dict = {}

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.city_list = self.settings.get("CITY_LIST", default=[])
        if 1 > len(self.city_list):
            self.logger.error(f"missing CITY_LIST ({self.city_list}) setting")
            sys.exit(1)
        self.city_name_dict = self.settings.get("CITY_NAME_DICT", default={})
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        if self.run_purpose is None:
            self.logger.error(
                f"missing RUN_PURPOSE ({self.run_purpose}) setting")
            sys.exit(2)
        self.save_every_response = self.settings.get(
            name="SAVE_EVERY_RESPONSE", default=False)
        self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="")
        if self.overwrite_today is None or 1 > len(self.overwrite_today):
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # set all paths
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")

        if 1 > len(self.crawled_dir) or 1 > len(self.saved_html_dir):
            error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}) setting(s)"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            sys.exit(3)

        self.over34_filename = self.settings.get(name="OVER34_LOG_FILENAME",
                                                 default="")

        self.min_proxy_ip_life_time = self.settings.get(
            name="MIN_PROXY_LIFE_SPAN", default=6)
        self.max_proxy_ip_life_time = self.settings.get(
            name="MAX_PROXY_LIFE_SPAN", default=180)
        self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED",
                                           default=False)
        self.proxy_agent = self.settings.get(name="PROXY_AGENT", default="")

        self.cookie_string = self.settings.get(name="COOKIE_STRING",
                                               default="")
        self.cookie_jar = CookieJar()

    def make_dirs(self):
        # even cache is used, we save all html files; here we make these 3 dirs if they do not exist
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)

    def proxy_ip_pool(self):
        """
			迅联错误码10000		提取过快,请至少5秒提取一次
		"""
        if "DRAGONFLY" == self.proxy_agent:
            return CommonClass.get_proxies(proxy_dict={})
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time,
                self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict

    def read_crawled_urls(self):
        resume_break_point_detailed_file_path = os.path.join(
            self.log_dir, self.resume_break_point_detailed_file_name)
        try:
            with open(resume_break_point_detailed_file_path,
                      "r",
                      encoding="utf-8") as log_file:
                # strip the trailing newline that log_for_picking_up_the_crawl_break_point
                # appends, so that membership checks against response urls can match
                self.crawled_detailed_url_list = [
                    one.strip() for one in log_file.readlines()
                    if 0 < len(one.strip())
                ]
        except Exception as ex:
            error_msg = f"fail to read {resume_break_point_detailed_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        # for list pages, this list is not used to exclude previously seen urls

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            urls = [
                # Guangzhou
                "https://land.3fang.com/market/440100__1______1_1_1.html",  # residential land: 26 pages
                "https://land.3fang.com/market/440100__2______1_1_1.html",  # commercial / office land: 17 pages
                "https://land.3fang.com/market/440100__3_2__0_100000__1_1_1.html",  # industrial land, sold, under 100,000 m2: 32 pages
                "https://land.3fang.com/market/440100__3_2__100000_500000__1_1_1.html",  # industrial land, sold, 100,000-500,000 m2: 4 pages
                "https://land.3fang.com/market/440100__3_2__500000_100000000__1_1_1.html",  # industrial land, sold, over 500,000 m2: 1 page
                "https://land.3fang.com/market/440100__3_1_____1_1_1.html",  # industrial land, not sold: 1 page
                "https://land.3fang.com/market/440100__3_3_____1_1_1.html",  # industrial land, auction failed: 7 pages
                "https://land.3fang.com/market/440100__4______1_1_1.html",  # other land: 4 pages

                # Foshan
                "https://land.3fang.com/market/440600__1_1_____1_1_1.html",  # residential land, not sold: 8 pages
                "https://land.3fang.com/market/440600__1_2__0_5000__1_1_1.html",  # residential land, sold, under 5,000 m2: 33 pages
                "https://land.3fang.com/market/440600__1_2__5000_100000__1_1_1.html",  # residential land, sold, 5,000-100,000 m2: 29 pages
                "https://land.3fang.com/market/440600__1_2__100000_100000000__1_1_1.html",  # residential land, sold, over 100,000 m2: 6 pages
                "https://land.3fang.com/market/440600__1_3_____1_1_1.html",  # residential land, auction failed: 3 pages
                "https://land.3fang.com/market/440600__2______1_1_1.html",  # commercial land: 19 pages
                "https://land.3fang.com/market/440600__3_1_____1_1_1.html",  # industrial land, not sold: 6 pages
                "https://land.3fang.com/market/440600__3_2__0_40000__1_1_1.html",  # industrial land, sold, under 40,000 m2: 32 pages
                "https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html",  # industrial land, sold, over 40,000 m2: 12 pages
                "https://land.3fang.com/market/440600__3_3_____1_1_1.html",  # industrial land, auction failed: 1 page
                "https://land.3fang.com/market/440600__4______1_1_1.html",  # other land: 3 pages
            ]

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            cookie_dict = dict([
                pair.split("=", 1) for pair in self.cookie_string.split("; ")
            ])
            self.cookie_dict = cookie_dict
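            # Illustrative parse (cookie value assumed): a COOKIE_STRING such as
            # "city=guangzhou; vh_newhouse=3_1559558577_17454" becomes
            # {"city": "guangzhou", "vh_newhouse": "3_1559558577_17454"}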
            for url in urls:
                url_object = parse.urlparse(url)
                path_list = url_object.path.split("/")
                for one in path_list:
                    if -1 == one.find(".html"):
                        continue
                    city_name = ""
                    city_code_list = one.split("_")
                    city_code = int(
                        city_code_list[0]) if 0 < len(city_code_list) else 0
                    if 0 < city_code and str(
                            city_code) in self.city_name_dict.keys():
                        city_name = self.city_name_dict[str(city_code)]
                    if 1 > len(city_name):
                        error_msg = f"{city_code} is NOT in self.city_name_dict.keys() ({self.city_name_dict.keys()})"
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                        )
                        sys.exit(4)
                    break
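                # Illustrative mapping (CITY_NAME_DICT content assumed): the path fragment
                # "440100__1______1_1_1.html" yields city_code 440100, which resolves to
                # "guangzhou" if CITY_NAME_DICT == {"440100": "guangzhou", "440600": "foshan"}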
                meta_dict["city"] = city_name
                # cookie_dict = self.change_cookies( cookie_dict )
                yield scrapy.Request(url=url,
                                     cookies=cookie_dict,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
                # yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
        elif "READ_CSV_AND_REDO" == self.run_purpose:
            english_city_name = {
                "佛山": "foshan",
                "广州": "guangzhou",
            }
            filename = "tudi_201808.csv"
            csv_file_path = os.path.join(self.crawled_dir, filename)
            url_list = []
            city_list = []
            try:
                with open(csv_file_path, newline="",
                          encoding="utf-8") as csvfile:
                    file_reader = csv.reader(
                        csvfile)  # , delimiter=' ', quotechar='|'
                    for row in file_reader:
                        if -1 < row[8].find("https:"):
                            url_list.append(row[8])
                            city_list.append(row[13])
            except Exception as ex:
                error_msg = f"cannot read csv file, Exception = {ex}"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )

            meta_dict = {
                "page_type": "detailed",
                "total_pages": 1,
            }
            self.cookie_dict = dict([
                pair.split("=", 1) for pair in self.cookie_string.split("; ")
            ])
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                meta_dict["proxy"] = proxies_dict["http"]

            for index, url in enumerate(url_list):
                chinese_city_name = city_list[index]
                meta_dict["city"] = english_city_name[chinese_city_name]
                yield scrapy.Request(url=url,
                                     cookies=self.cookie_dict,
                                     callback=self.parse_detailed_page,
                                     meta=meta_dict,
                                     dont_filter=True)
                break
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)

    def change_cookies(self, cookie_dict={}):
        if "uservisitMarketitem" in cookie_dict.keys():
            item_str = cookie_dict["uservisitMarketitem"]
            item_str = parse.unquote(item_str)
            item_list = item_str.split(",")
            new_str = ""
            for index, one in enumerate(item_list):
                if index > len(item_list) - 4:
                    new_str += f",{one}"
            cookie_dict["uservisitMarketitem"] = parse.quote(new_str)
        return cookie_dict

    def get_total_pages(self, response=None):
        """
			/market/440600__4______1_1_3.html
		"""
        total_pages = 0
        if response is None:
            return total_pages
        all_link_list = response.xpath(
            "//div[@id='divAspNetPager']/a/@href").extract()

        total_page_list = []
        for one in all_link_list:
            page = 0
            temp_list = one.split("_")
            for one_fragment in temp_list:
                if -1 < one_fragment.find(".html"):
                    page = one_fragment.replace(".html", "")
                    total_page_list.append(int(page))
                    break

        if 1 > len(total_page_list):
            return 1
        return max(total_page_list)

    def get_this_url_page(self, url_obj_path=""):
        """
			https://land.3fang.com/market/440600__4______1_1_3.html
			https://land.3fang.com/market/440100__1______1_1_1.html
		"""
        url_list = url_obj_path.split("_")
        for one in url_list:
            if -1 < one.find(".html"):
                return int(one.replace(".html", ""))
        return 0

    def make_html_file_name(self, url="", city="", page_type=""):
        """
			https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html
			https://land.3fang.com/market/cee05e00-3263-4774-a898-9def16955cb4.html
		"""
        now = datetime.datetime.now()
        html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S"))
        today = now.strftime("%Y%m%d")

        url_obj = parse.urlparse(url)
        url_list = url_obj.path.split("/")
        for one in url_list:
            if -1 < one.find(".html"):
                html_filename = f"{city}__{page_type}__{one}"
                break
        return html_filename

    def get_page_and_district_area(self, url_list=[]):
        """
			list page #2 or more, or including channels like:
			https://sz.esf.fang.com/house-a013080/
			where a013080 stands for 深圳市龙华区
			or https://sz.esf.fang.com/house-a013080-b014334 or https://sz.esf.fang.com/house-a013080-b02094/i372/
			where b014334 stands for 深圳市龙华区大浪;house-a013080-b02094 stands for 观澜;
			house-a013080-b0350 stands for 龙华;house-a013080-b014333 stands for 民治
			https://sz.esf.fang.com/house-a087-b0342/g22/ where g22 stands for 二居室;
			g21(一居),g23(三居),g24(四居),g25(五居),g299(五居以上)
			# this option is a multiple choice but this crawl will ONLY use single choice
		"""
        page = "1"
        district_area = ""
        bedrooms = 0
        for index, key in enumerate(url_list):
            one_fragment = url_list[index]
            if -1 < one_fragment.find("i3") and -1 == one_fragment.find(
                    "house-"):
                page = one_fragment[2:]
            elif -1 < one_fragment.find("house-") and -1 == one_fragment.find(
                    "i3"):
                district_area = one_fragment.replace("house-", "")
                district_area = district_area.replace("-", "_")
                if index + 1 < len(url_list):
                    next_fragment = url_list[index + 1]
                    if -1 < next_fragment.find("g2"):
                        last_part_of_fragment = next_fragment.replace("g2", "")
                        if -1 < last_part_of_fragment.find("-i3"):
                            temp_list = last_part_of_fragment.split("-i3")
                            if 1 < len(temp_list):
                                bedrooms = int(temp_list[0])
                        else:
                            bedrooms = int(last_part_of_fragment)
        return (page, district_area, bedrooms)

    def save_html(self, response=None, save34=False):
        city = ""
        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return -1, city
        url = response.url
        meta_dict = response.meta
        page_type = "index"
        total_pages = 0
        city = meta_dict["city"] if "city" in meta_dict.keys() else ""
        if "page_type" in meta_dict.keys(): page_type = meta_dict["page_type"]

        if "index" == page_type:
            if "total_pages" in meta_dict.keys():
                total_pages = int(meta_dict["total_pages"])
            if 0 == total_pages:
                total_pages = self.get_total_pages(response=response)
            if 34 < total_pages and not save34: return 101, city
            html_filename = self.make_html_file_name(url=url,
                                                     city=city,
                                                     page_type=page_type)
            html_file_path = os.path.join(self.saved_html_dir, html_filename)

        elif "detailed" == page_type:
            html_filename = self.make_html_file_name(url=url,
                                                     city=city,
                                                     page_type=page_type)
            html_file_path = os.path.join(self.saved_html_dir, html_filename)
            total_pages = 1001

        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return -2, city
        else:
            if 1 > total_pages:
                error_msg = f"response.body saved after requesting {response.url}; but fail to extract total page number from response.body"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return total_pages, city  # could be 34 when save34 = True

    def extract_link_list(self, response=None):
        link_list = response.xpath(
            '//dl[@id="landlb_B04_22"]/dd/div[@class="list28_text fl"]/h3/a/@href'
        ).extract()
        if 1 > len(link_list):
            error_msg = f"Fail to extract links from {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
        return link_list

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value("content", str(text))  # , encoding="utf-8"
        loader.add_value("page_type", "detailed")

        # record housekeeping fields
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get('BOT_NAME'))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date",
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
        return loader

    def parse_detailed_response_field(self, response=None, city=""):
        text = {}
        if response is None:
            return text
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return text
        information_div = response.xpath("//div[@id='printData1']")

        title = information_div.xpath(
            "./div[@class='tit_box01']/text()").extract_first(default="")
        land_id = information_div.xpath(
            "./div[@class='menubox01 mt20']/span[@class='gray2']/text()"
        ).extract_first(default="")
        province_city = information_div.xpath(
            "string(./div[@class='menubox01 p0515']/div[@class='fl'])"
        ).extract()
        province_city = "___".join(province_city)

        if 0 < len(title): text["title"] = title
        if 0 < len(land_id): text["land_id"] = land_id
        if 0 < len(province_city): text["province_city"] = province_city

        key1 = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03']/text()"
        ).extract_first(default="")
        if "土地基本信息" == key1:
            basic_info = {}
            tr_list1 = information_div.xpath(
                "./div[@class='p1015']/div[@class='tit_box02 border03']/following-sibling::table[@class='tablebox02 mt10']/tbody/tr"
            )
            for index, one_tr in enumerate(tr_list1):
                string_list = one_tr.xpath("string(.)").extract()
                td_list = []
                for one_str in string_list:
                    cleaned_str = CommonClass.clean_string(string=one_str,
                                                           char_to_remove=[
                                                               '\xa0',
                                                               '\n',
                                                               '\t',
                                                               ' ',
                                                           ])
                    td_list.append(cleaned_str.strip('\r'))
                basic_info[index] = "___".join(td_list)
            text[key1] = basic_info

        key2 = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/text()"
        ).extract_first(default="")
        if "土地交易信息" == key2:
            trade_info = {}
            tr_list2 = information_div.xpath(
                "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/following-sibling::div[@class='banbox']/table[@class='tablebox02 mt10']/tbody/tr"
            )
            for index, one_tr in enumerate(tr_list2):
                string_list = one_tr.xpath("string(.)").extract()
                td_list = []
                for one_str in string_list:
                    cleaned_str = CommonClass.clean_string(string=one_str,
                                                           char_to_remove=[
                                                               '\xa0',
                                                               '\n',
                                                               '\t',
                                                               ' ',
                                                           ])
                    td_list.append(cleaned_str.strip('\r'))
                trade_info[index] = "___".join(td_list)
            text[key2] = trade_info

        # 20190730: cannot extract 土地评估结果 (land appraisal results) yet; todo ...
        # evaluation_div = response.xpath("//div[@id='divpg']")
        # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" )
        # if "土地评估结果" == key3:
        # 	evaluation_dict = {}
        # 	tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr")
        # 	for index, one_tr in enumerate( tr_list3 ):
        # 		this_td = one_tr.xpath("./td")
        # 		if this_td is None:
        # 			string_list = one_tr.xpath("string(./th)").extract()
        # 		else:
        # 			td_list = one_tr.xpath("./td")
        # 			string_list = []
        # 			for one_td in td_list:
        # 				unit = one_td.xpath("./text()").extract_first( default= "" )
        # 				amount = one_td.xpath("./span/text()").extract_first( default= "" )
        # 				string_list.append( f"{amount}___{unit}" )
        # 				# this_td_str_list = one_td.xpath("string(.)").extract()
        # 				# string_list.extend( this_td_str_list )
        # 		td_th_list = []
        # 		for one_str in string_list:
        # 			cleaned_str = CommonClass.clean_string( string = one_str, char_to_remove = [ '\xa0', '\n', '\t', ' ',] )
        # 			td_th_list.append( cleaned_str.strip('\r') )
        # 		evaluation_dict[index] = "___".join( td_th_list )
        # 	text[key3] = evaluation_dict

        # evaluation_div = response.xpath("//div[@id='divpg']")
        # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" )
        # if "土地评估结果" == key3:
        # 	evaluation_dict = {}
        # 	th_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr/th")
        # 	string_list = th_list3.xpath("string(.)").extract()
        # 	evaluation_dict["fields"] = "___".join( string_list )
        # 	tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr")
        # 	row2 = tr_list3[1].xpath("./td")
        # 	row2string = ""
        # 	str1 = row2[0].xpath("./text()").extract_first( default= "" )
        # 	str2 = row2[1].xpath("string(.)").extract()
        # 	str2 = "___".join( str2 )
        # 	str3amount = response.xpath("//span[@id='scbj_bpgj']")
        # 	str3unit = row2[2].xpath("./text()").extract_first( default= "" )
        # 	str4amount = response.xpath("//span[@id='scbj_bSumPrice']")
        # 	str4amount = str4amount.get()
        # 	str3amount = str3amount.get()
        # 	str4unit = row2[3].xpath("./text()").extract_first( default= "" )
        # 	str5 = row2[4].xpath("./a/@href").extract_first( default= "" )
        # 	evaluation_dict[str1] = f"{str2}___{str3amount} {str3unit}___{str4amount} {str4unit}___{str5}"
        # 	row3 = tr_list3[2].xpath("./td")
        # 	row3str = row3.xpath("string(.)").extract()
        # 	evaluation_dict["假设开发法"] = "___".join( row3str )
        # 	text[key3] = evaluation_dict

        if 0 < len(text): text["city"] = city
        return text

        # {'fields': '\xa0___推出楼面价___评估楼面价___评估总价___操作', '市场比较法': '暂无 元/㎡___ 元/㎡___ 万元___
        # /LandAssessment/b17ea17a-eefa-428b-8b53-461c2bdc67ea.html', '假设开发法': '假设开发法___暂无 元/㎡___元/㎡___万元___[进入评估报告]'}

    def log_for_picking_up_the_crawl_break_point(self,
                                                 page_type="detailed",
                                                 response=None):
        if "detailed" == page_type:
            resume_break_point_file_path = os.path.join(
                self.log_dir, self.resume_break_point_detailed_file_name)
        else:
            resume_break_point_file_path = os.path.join(
                self.log_dir, self.resume_break_point_list_file_name)
        try:
            with open(resume_break_point_file_path, "a") as f:
                f.write(f"{response.url}\n")
        except Exception as ex:
            error_msg = f"fail to write response.url into {resume_break_point_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

    def parse_detailed_page(self, response=None):
        url = response.url
        result_obj = parse.urlparse(url)
        has_url_error = self.url_contains_error(
            result_obj_path=result_obj.path)
        if has_url_error:
            return False

        page_status, city = self.save_html(response=response, save34=True)
        text = self.parse_detailed_response_field(response=response, city=city)
        if isinstance(text, dict) and 0 < len(text):
            try:
                loader = ItemLoader(item=Land3fangItem(), response=response)
                loader = self.load_items_into_loader(loader=loader,
                                                     text=text,
                                                     url=url)
                self.log_for_picking_up_the_crawl_break_point(
                    page_type="detailed", response=response)
                yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
                )

    def do_nothing_for_debug(self, response=None):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}"
        )
        # print( response.body )
        # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}]
        # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}'
        # 2019-06-20 16:28:55 [fangesf] INFO: Inside Method do_nothing_for_debug of Class FangesfSpider,
        # url = https://www.coursehelper.site/index/index/getHeaders?token=ad89558c89c3394167adbfd1484c8700
        # 2019-06-20 16:28:55 [stdout] INFO: b'{"REMOTE_ADDR":"139.196.200.61","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"139.196.200.61, 139.196.200.61"}'

    def url_contains_error(self, result_obj_path=""):
        if 1 > len(result_obj_path):
            return False
        path_fragment_list = result_obj_path.split("/")
        if 1 > len(path_fragment_list):
            return False

        pass
        # no anti-crawl URL patterns are known yet, so nothing is checked here

        return False

    def parse_list_page(self, response=None):
        """
			https://land.3fang.com/market/440600__4______1_1_1.html
		"""
        result_obj = parse.urlparse(response.url)
        has_url_error = self.url_contains_error(
            result_obj_path=result_obj.path)
        if has_url_error:
            return False

        page_status, city = self.save_html(response=response, save34=False)
        if 1 > page_status:
            pass
            # -2, -1, 0: error_msg has been logged; just pass
        elif 0 < page_status and 35 > page_status:
            # 1 to 34 also means "index" == page_type
            link_list = self.extract_link_list(response=response)
            if self.debug:
                self.logger.info(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}; link_list = {link_list}"
                )
            else:
                self.log_for_picking_up_the_crawl_break_point(
                    page_type="index", response=response)
                new_url = f"{result_obj.scheme}://{result_obj.netloc}"
                this_cookie = self.cookie_jar.extract_cookies(
                    response, response.request)
                print(this_cookie)

                # crawling vertically
                meta_dict = {
                    "page_type": "detailed",
                    "total_pages": 1,
                    "city": city,
                }
                if self.use_proxy:
                    proxies_dict = self.proxy_ip_pool()
                    meta_dict["proxy"] = proxies_dict["http"]

                for one_link in link_list:
                    if 0 != one_link.find('/'): one_link = f"/{one_link}"
                    this_i_url = f"{new_url}{one_link}"
                    if this_i_url in self.crawled_detailed_url_list:
                        self.logger.info(f"previously crawled {this_i_url}")
                    else:
                        self.logger.info(f"requesting {this_i_url}")
                        yield scrapy.Request(url=this_i_url,
                                             cookies=self.cookie_dict,
                                             callback=self.parse_detailed_page,
                                             meta=meta_dict,
                                             dont_filter=True)

                # crawling horizontally
                if 1 < page_status and 1 == self.get_this_url_page(
                        url_obj_path=result_obj.path):
                    meta_dict = response.meta
                    meta_dict["total_pages"] = page_status
                    if self.use_proxy:
                        proxies_dict = self.proxy_ip_pool()
                        meta_dict["proxy"] = proxies_dict["http"]
                    for i in range(page_status - 1):
                        new_path = result_obj.path
                        new_path = new_path.replace("1.html", f"{i + 2}.html")
                        this_i_url = f"{new_url}{new_path}"
                        self.logger.info(
                            f"requesting list page at {this_i_url}")
                        yield scrapy.Request(url=f"{this_i_url}",
                                             cookies=self.cookie_dict,
                                             callback=self.parse_list_page,
                                             meta=meta_dict,
                                             dont_filter=True)
        elif 101 == page_status:
            error_msg = "page_status == 101: handling is still a todo"
            self.logger.info(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
        elif 1001 == page_status:
            self.parse_detailed_page(response=response)
            # 1001 also means "detailed" == page_type
            # will never reach here because self.parse_detailed_page() is the callback method

    def read_and_parse(self, response=None):
        file_list = os.listdir(self.saved_html_dir)
        for one_file in file_list:
            if -1 == one_file.find("index"):
                temp_list = one_file.split("___")
                apt_id = 0
                city = ""
                if 1 < len(temp_list):
                    apt_id = temp_list[1]
                    city = temp_list[0]
                url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm"  # can also be 16_, 10_, and others
                # https://sz.esf.fang.com/chushou/3_218307566.htm
                html_file_path = os.path.join(self.saved_html_dir, one_file)
                if os.path.isfile(html_file_path):
                    doc = None
                    with open(html_file_path, 'rb') as f:
                        # doc = f.read().decode('gb2312', 'ignore')
                        doc = f.read().decode('utf-8', 'ignore')
                    if doc is None:
                        self.logger.error(
                            f"Error: cannot read html file {html_file_path}.")
                        continue
                    response = Selector(text=doc, type="html")
                    text = self.parse_detailed_response_field(
                        response=response, city=city, apt_id=apt_id)
                    try:
                        response_for_items = TextResponse(
                            url=url,
                            status=200,
                            body=bytes(doc, encoding="utf-8"))
                        loader = ItemLoader(item=FangesfItem(),
                                            response=response_for_items)
                        loader = self.load_items_into_loader(loader=loader,
                                                             text=text,
                                                             url=url)
                        yield loader.load_item()
                    except Exception as ex:
                        self.logger.info(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, Exception = {ex}"
                        )
                    if self.debug:
                        break

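    # A minimal sketch (hypothetical example values) of how read_and_parse() maps a saved
    # html file name back to its source url, following the f"{city}___{apt_id}___{today}.html"
    # naming convention used by save_html():
    #   "sz___218307566___20190620.html".split("___") -> ["sz", "218307566", "20190620.html"]
    #   city = "sz", apt_id = "218307566"
    #   url  = "https://sz.esf.fang.com/chushou/3_218307566.htm"  # prefix can also be 16_, 10_, etc.
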
    def write_log(self, content=None, logfilename=None, content_only=False):
        if content is not None and 0 < len(content):
            today = datetime.datetime.now().strftime("%Y%m%d")
            if logfilename is None:
                logfilename = f"{self.name}{today}.log"
            try:
                with open(os.path.join(self.log_dir, logfilename),
                          'a',
                          encoding='utf-8') as f:
                    if content_only:
                        info = f"{str(content)}\n"
                    else:
                        info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
                    f.write(info)
                return 1
            except Exception as ex:
                return 0
        return -1
Esempio n. 3
0
class TrafficSpider(scrapy.Spider):
    """
		sys.exit code == 1 # missing AMAP_KEYS
		sys.exit code == 2 # missing INPUT_XY_FILE_PATH
		sys.exit code == 3 # fail to generate self.rectangle_list
		sys.exit code == 4 # self.rectangle_list have wrong length
	"""
    name = "traffic"

    root_path = ""
    log_dir = ""
    xy_response_log_file_name = ""
    city_or_area_name = ""
    # debug = False
    # save_every_response = False
    crawled_dir = ""
    json_dir = ""
    output_folder_name = ""
    output_file_format = "json"
    base_uri = ""
    run_purpose = None
    overwrite_today = ""
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    # crontab starts a new process every 6 hours; therefore crontab starts 4 times per day
    # 5 minutes between 2 adjacent requests
    # the following attributes are initialized every 6 hours
    maximal_requests_of_one_crontab_process = 71
    interval_between_requests = 300
    amap_key_list = []
    amap_key_pointer = 0
    request_counter = 0  # from 0 to 71
    request_number_per_batch = 405  # 405 requests are sent in every 5-minute batch
    # request_number_per_batch = 10
    # items look like "113.267593,23.358604;113.337613,23.412658"; it equals list( self.edges_of_center_xy_dict.keys() )
    rectangle_list = []
    # keys look like "113.267593,23.358604;113.337613,23.412658" and values look like "113.737414,22.543564"
    edges_of_center_xy_dict = {}
    # keys look like "113.737414,22.543564" and values look like 23
    xy_seen_dict = {}
    xy_seen_updated_bool = False

    # the following attributes are re-initialized every 5 minutes
    last_batch_request_list = []
    last_batch_request_timestamp_float = 0.0  # if good response returned, then we use self.last_batch_request_timestamp_float
    urls = []

    def get_next_amap_key(self):
        self.amap_key_pointer -= 1
        if 0 > self.amap_key_pointer:
            self.amap_key_pointer = len(self.amap_key_list) - 1
        return self.amap_key_list[self.amap_key_pointer]
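    # A minimal sketch (hypothetical key names) of how get_next_amap_key() cycles through the
    # configured AMAP_KEYS: the pointer walks backwards and wraps around, so successive batches
    # rotate through the key pool.
    #   amap_key_list = ["key_a", "key_b", "key_c"], amap_key_pointer = 0
    #   get_next_amap_key() -> pointer becomes -1, wraps to 2 -> returns "key_c"
    #   get_next_amap_key() -> pointer becomes 1              -> returns "key_b"
    #   get_next_amap_key() -> pointer becomes 0              -> returns "key_a"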

    def get_one_batch_urls(self):
        """
		https://restapi.amap.com/v3/traffic/status/rectangle?level=6&extensions=all&output=json&rectangle=113.2675927679,23.3586043241;113.3376127679,23.4126583781&key=4ebb849f151dddb3e9aab7abe6e344e2
		"""
        self.urls = []
        self.last_batch_request_list = []
        query_dict = {
            "level": 6,
            "extensions": "all",
            "output": "json",
            "key": self.get_next_amap_key(),
        }
        for one_retangle in self.rectangle_list:
            query_dict["rectangle"] = one_retangle
            self.last_batch_request_list.append(one_retangle)
            self.urls.append(f"{self.base_uri}?{parse.urlencode(query_dict)}")

        self.last_batch_request_timestamp_float = time.time()
        return self.urls

    def generate_one_rectange(self, center_xy_str=""):
        if not isinstance(center_xy_str, str) or 1 > len(center_xy_str):
            return ""
        xy_list = center_xy_str.split(",")
        x = float(xy_list[0])
        y = float(xy_list[1])
        # edge = 3.0 km
        # lat_delta = 0.009009009 * edge = 0.027027027
        # the equator is 40075 km long; at 23°N one degree of longitude spans 40075 * sin(90° - 23°) / 360 = 36889.23 / 360 = 102.47008889 km
        # lng_delta = 0.009759 * edge = 0.0292768
        return "%.6f,%.6f;%.6f,%.6f" % (x - 0.0292768, y - 0.027027027,
                                        x + 0.0292768, y + 0.027027027)
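    # A minimal sketch (hypothetical center point) of the rectangle produced above: the fixed
    # lng/lat deltas are applied on both sides of the center, e.g.
    #   generate_one_rectange(center_xy_str="113.300000,23.400000")
    #   -> "113.270723,23.372973;113.329277,23.427027"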

    def init_self_rectangles(self):
        if isinstance(self.rectangle_list,
                      list) and 0 < len(self.rectangle_list):
            return self.rectangle_list

        try:
            with open(self.input_xy_file_path, "r",
                      encoding="utf-8") as xy_file:
                overall_list = xy_file.readlines()
                for index, one_xy in enumerate(overall_list):
                    xy_list = one_xy.split(",")
                    if isinstance(xy_list, list) and 2 == len(xy_list):
                        center_xy = "%.6f,%.6f" % (float(
                            xy_list[0]), float(xy_list[1]))
                        rect_str = self.generate_one_rectange(
                            center_xy_str=center_xy)
                        self.rectangle_list.append(rect_str)
                        self.edges_of_center_xy_dict[rect_str] = center_xy
        except Exception as ex:
            self.logger.error(
                f"cannot read xy_list file ({xy_file_path}). Exception = {ex}")
            sys.exit(3)
        else:
            return self.rectangle_list

    def init_self_xy_response_log(self):
        log_file_path = os.path.join(self.log_dir,
                                     self.xy_response_log_file_name)
        try:
            with open(log_file_path, "r", encoding="utf-8") as xy_log_file:
                overall_list = xy_log_file.readlines()
                for index, one_xy in enumerate(overall_list):
                    xy_list = one_xy.split(",")
                    if isinstance(xy_list, list) and 3 == len(xy_list):
                        center_xy = "%.6f,%.6f" % (float(
                            xy_list[0]), float(xy_list[1]))
                        self.xy_seen_dict[center_xy] = int(xy_list[2])
        except Exception as ex:
            self.logger.error(
                f"cannot read historical xy_log_file ({log_file_path}). Exception = {ex}"
            )
            # do not sys.exit(3) here
            self.xy_seen_updated_bool = True
            return False
        else:
            return True

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.xy_response_log_file_name = self.settings.get(
            name="XY_RESPONSE_LOG_FILE_NAME", default="")
        self.city_or_area_name = self.settings.get(name="CITY_OR_AREA_NAME",
                                                   default="")
        # self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False )
        # self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False )
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.json_dir = self.settings.get(name="SAVED_JSON", default="")
        self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME",
                                                    default="")
        self.base_uri = self.settings.get(name="BASE_URI", default="")
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        self.overwrite_today = self.settings.get(name="OVERWRITE_TODAY",
                                                 default="")

        self.maximal_requests_of_one_crontab_process = self.settings.get(
            name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=71)
        self.interval_between_requests = self.settings.get(
            name="INTERVAL_BETWEEN_REQUESTS", default=300)
        self.amap_key_list = self.settings.get(name="AMAP_KEYS", default=[])
        if 1 > len(self.amap_key_list):
            self.logger.error(f"self.amap_key_list is empty")
            sys.exit(1)
        self.input_xy_file_path = self.settings.get(name="INPUT_XY_FILE_PATH",
                                                    default="")
        if not isinstance(self.input_xy_file_path,
                          str) or 1 > len(self.input_xy_file_path):
            self.logger.error(f"missing INPUT_XY_FILE_PATH")
            sys.exit(2)

        self.init_self_rectangles()
        if self.request_number_per_batch != len(self.rectangle_list):
            self.logger.error(
                f"self.rectangle_list length shall be {self.request_number_per_batch}"
            )
            sys.exit(4)

        self.init_self_xy_response_log()

    def check_dirs_and_files(self):
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.json_dir):
            os.makedirs(self.json_dir)

    def start_requests(self):
        self.init_self_attributes()
        self.check_dirs_and_files()

        if "INITIALIZE_AMAP_XY" == self.run_purpose:
            """
				When generating the xy coordinate file, coordinates located on mountains or water surfaces need to be removed.
				Also, on 20190703 we found that Dongguan, Guangdong has no data, so the Dongguan entries shall be deleted as well.
			"""
            xy_file_name = "data4cities_bd09.txt"
            xy_file_path = os.path.join(self.root_path, self.name,
                                        xy_file_name)
            try:
                with open(xy_file_path, "r", encoding="utf-8") as f:
                    overall_lines = f.readlines()
                    overall_list = overall_lines[0].split(";")
                    for index, one_xy in enumerate(overall_list):
                        xy_list = one_xy.split(",")
                        if isinstance(xy_list, list) and 2 == len(xy_list):
                            xy = "%.6f,%.6f" % (float(
                                xy_list[0]), float(xy_list[1]))
                            one_url = f"https://restapi.amap.com/v3/assistant/coordinate/convert?locations={xy}&coordsys=baidu&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2"
                            meta_dict = {
                                "x": float(xy_list[0]),
                                "y": float(xy_list[1]),
                                "index": index,
                            }
                            yield scrapy.Request(
                                url=one_url,
                                callback=self.initialize_amap_xy,
                                meta=meta_dict,
                                dont_filter=True)
            except Exception as ex:
                urls = []
                self.logger.error(
                    f"cannot read xy_list file ({xy_file_path}). Exception = {ex}"
                )
        elif "READ_JSON_AND_WRITE_CSV" == self.run_purpose:
            one_url = "https://blog.csdn.net/qq_37193537/article/details/78987949"
            callback_func = self.read_json_and_parse
            yield scrapy.Request(url=one_url,
                                 callback=callback_func,
                                 dont_filter=True)
        else:
            self.get_one_batch_urls()
            meta_dict = {
                # we use self.last_batch_request_timestamp_float
                "redo_counter": 0,
            }
            for index, one_url in enumerate(self.urls):
                meta_dict["center_xy_index"] = index
                self.logger.info(f"{index}: requesting {one_url} ")
                yield scrapy.Request(url=one_url,
                                     callback=self.parse_json,
                                     meta=meta_dict,
                                     dont_filter=True)

    def initialize_amap_xy(self, response):
        if response is None or not hasattr(response, "body") or not hasattr(
                response, "url") or not hasattr(response, "meta"):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object"
            )
            return None
        meta_dict = response.meta
        bd09xy = "%.6f,%.6f" % (meta_dict["x"], meta_dict["y"])
        index = meta_dict["index"]
        json_dict = json.loads(response.body)
        if "status" not in json_dict.keys(
        ) or "locations" not in json_dict.keys() or 1 != int(
                json_dict["status"]):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response status"
            )
            return None
        if not isinstance(json_dict["locations"],
                          str) or 1 > len(json_dict["locations"]):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response locations"
            )
            return None
        amap_xy = json_dict["locations"]
        this_row = f"{index}:{bd09xy}==>{amap_xy}"
        new_xy_file_name = "data4cities_amap.txt"
        new_xy_log_file_name = "bd09to_amap.log"
        new_xy_log_file_name = os.path.join(self.root_path, self.name,
                                            new_xy_log_file_name)
        new_xy_file_name = os.path.join(self.root_path, self.name,
                                        new_xy_file_name)
        CommonScrapyPipelineClass.append_row(
            spider_obj=self,
            key_list=["xy"],
            item_list=[amap_xy],
            csv_file_path_str=new_xy_file_name)
        CommonScrapyPipelineClass.append_row(
            spider_obj=self,
            key_list=["xy"],
            item_list=[this_row],
            csv_file_path_str=new_xy_log_file_name)

    def read_json_and_parse(self, response):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, todo..."
        )

    def check_trafficinfo_dict(self,
                               trafficinfo_dict={},
                               status_int=-1,
                               infocode_int=-1,
                               center_xy_index_int=-1):
        """
			It is possible to get an empty result like the one below; in that case we need to compare it against the historical records
			{"status":"1","info":"OK","infocode":"10000","trafficinfo":{"description":[],"evaluation":{"expedite":[],"congested":[],"blocked":[],"unknown":[],"status":[],"description":[]},"roads":[]}}
		"""
        if not isinstance(trafficinfo_dict, dict) or 1 > len(trafficinfo_dict):
            return False

        if 1 != status_int or 10000 != infocode_int:
            return False

        edges = self.rectangle_list[
            center_xy_index_int] if center_xy_index_int in range(
                len(self.rectangle_list)) else ""
        center_xy = self.edges_of_center_xy_dict[
            edges] if edges in self.edges_of_center_xy_dict.keys() else ""
        # if "roads" not in trafficinfo_dict.keys() or not isinstance( trafficinfo_dict["roads"], list ) or 1 > len( trafficinfo_dict["roads"] ):
        # compare against the historical records
        # if center_xy in self.xy_seen_dict.keys() and 0 == int(self.xy_seen_dict[center_xy]):
        # 	return True # 0 means this xy has already been requested 3 times and returned empty every time
        # elif center_xy in self.xy_seen_dict.keys() and 0 > int(self.xy_seen_dict[center_xy]):
        # 	if -3 == int(self.xy_seen_dict[center_xy]):
        # 		self.xy_seen_dict[center_xy] = 0 # -1, -2, -3 mean the 1st, 2nd and 3rd empty response respectively
        # 	else:
        # 		self.xy_seen_dict[center_xy] -= 1
        # 	self.xy_seen_updated_bool = True
        # elif center_xy not in self.xy_seen_dict.keys():
        # 	self.xy_seen_dict[center_xy] = -1
        # 	self.xy_seen_updated_bool = True
        # return False
        # testing showed that the scheme above generates a large number of requests

        # use .get() so that a missing "roads" field does not raise a KeyError
        roads_count = len(trafficinfo_dict.get("roads", []))
        if center_xy not in self.xy_seen_dict.keys() or roads_count > int(
                self.xy_seen_dict[center_xy]):
            self.xy_seen_dict[center_xy] = roads_count
            self.xy_seen_updated_bool = True
        return True

    def parse_json(self, response):
        status, infocode, message, result_dict = self.save_json(
            response=response, page_type="json")
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        meta_dict = response.meta
        center_xy_index = int(
            meta_dict["center_xy_index"]
        ) if "center_xy_index" in meta_dict.keys() else -1

        if self.check_trafficinfo_dict(trafficinfo_dict=result_dict,
                                       status_int=status,
                                       infocode_int=infocode,
                                       center_xy_index_int=center_xy_index):
            loader = ItemLoader(item=TrafficItem(), response=response)
            loader = self.load_items_into_loader(loader=loader,
                                                 text=result_dict,
                                                 url=response.url,
                                                 now=now)
            yield loader.load_item()
        else:
            edges = self.rectangle_list[
                center_xy_index] if center_xy_index in range(
                    len(self.rectangle_list)) else ""
            center_xy = self.edges_of_center_xy_dict[
                edges] if edges in self.edges_of_center_xy_dict.keys() else ""
            center_xy_index = -1
            error_msg = f"redo request from {response.url} for {center_xy} because status == {status}, infocode == {infocode}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            meta_dict["redo_counter"] += 1
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_json,
                                 meta=meta_dict,
                                 dont_filter=True)

        if -1 < center_xy_index:
            received_all_reponses_per_batch_bool = self.check_this_center_xy(
                center_xy_index_int=center_xy_index)
            print(
                f"received_all_reponses_per_batch_bool == {received_all_reponses_per_batch_bool}; center_xy_index = {center_xy_index}"
            )

            # get data again after 5 minutes
            if self.request_counter < self.maximal_requests_of_one_crontab_process and received_all_reponses_per_batch_bool:
                while (self.check_time_interval()):
                    time.sleep(10)

                self.request_counter += 1
                now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                self.logger.info(
                    f" requesting amap at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )"
                )
                self.get_one_batch_urls()
                meta_dict = {
                    "redo_counter": 0,
                }
                for index, one_url in enumerate(self.urls):
                    meta_dict["center_xy_index"] = index
                    self.logger.info(f"{index}: requesting {one_url} ")
                    yield scrapy.Request(url=one_url,
                                         callback=self.parse_json,
                                         meta=meta_dict,
                                         dont_filter=True)

    def check_time_interval(self):
        if time.time() - self.last_batch_request_timestamp_float > float(
                self.interval_between_requests):
            return False
        return True
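    # note: this returns True while fewer than self.interval_between_requests (default 300)
    # seconds have passed since the last batch, so parse_json() keeps sleeping in 10-second
    # steps until the 5-minute interval is over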

    def check_this_center_xy(self, center_xy_index_int=-1):
        if center_xy_index_int not in range(len(self.rectangle_list)):
            return True

        # 4 minutes have passed, just return True
        if time.time() - self.last_batch_request_timestamp_float > 240.0:
            temp_list = []
            for one_edge in self.last_batch_request_list:
                if one_edge in self.edges_of_center_xy_dict.keys():
                    temp_list.append(self.edges_of_center_xy_dict[one_edge])
            self.logger.error(
                f"after 4 minutes, there are still {len(self.last_batch_request_list)} waiting for response: {temp_list} "
            )
            return True

        # remove the current rectangle (edges) from the pending list
        edges = self.rectangle_list[
            center_xy_index_int] if center_xy_index_int in range(
                len(self.rectangle_list)) else ""
        if edges in self.last_batch_request_list:
            self.last_batch_request_list.remove(edges)

        if 1 > len(self.last_batch_request_list):
            return True

        print(f"len == {len( self.last_batch_request_list )}")
        # there is at least one element left in self.last_batch_request_list
        return False

    def load_items_into_loader(self, loader=None, text={}, url="", now=""):
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date", now)

        loader.add_value("content", str(text))
        loader.add_value("page_type", "json")

        return loader

    def get_json_file_name(self, url_str="", status_int=-4):
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        rectangle = ""
        url_obj = parse.urlparse(url_str)
        if hasattr(url_obj, "query"):
            query_dict = parse.parse_qs(url_obj.query)
            if "rectangle" in query_dict.keys():
                rectangle = query_dict["rectangle"]
                if isinstance(rectangle, list) and 0 < len(rectangle):
                    rectangle = rectangle[0]
                rectangle = rectangle.strip("'")
                rectangle = rectangle.replace(";", "___")
                rectangle = rectangle.replace(",", "_")
        if 1 > len(rectangle):
            return ""
        return os.path.join(
            self.json_dir,
            f"{self.city_or_area_name}___{rectangle}___{status_int}___{now}.json"
        )
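    # A minimal sketch (hypothetical request url and timestamp) of the file name built above:
    # only ";" and "," inside the rectangle query parameter are replaced, the decimal points stay.
    #   url rectangle = "113.267593,23.358604;113.337613,23.412658", status_int = 1
    #   -> {json_dir}/{city_or_area_name}___113.267593_23.358604___113.337613_23.412658___1___20190703_120000.json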

    def save_json(self, response=None, page_type="json"):
        """
			during this trial run, we still save the json response.body, but we will NOT do so in the future.
		"""
        status = -4
        infocode = 0
        result_dict = {}
        if response is None or not hasattr(response, "body") or not hasattr(
                response, "url") or not hasattr(response, "meta"):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object"
            )
            return (-1, infocode, f"wrong response object", result_dict)
        file_path = ""
        if "json" == page_type:
            json_dict = json.loads(response.body)
            status = json_dict["status"] if "status" in json_dict.keys(
            ) else "404"
            result_dict = json_dict[
                "trafficinfo"] if "trafficinfo" in json_dict.keys() else {}
            infocode = json_dict["infocode"] if "infocode" in json_dict.keys(
            ) else ""
            status = int(status)
            infocode = int(infocode) if isinstance(
                infocode, str) and 0 < len(infocode) else 0
            file_path = self.get_json_file_name(url_str=response.url,
                                                status_int=status)
        else:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter page_type == {page_type} from {response.url}"
            )
            return (-2, infocode, f"page_type can ONLY be json", result_dict)

        return_msg = "written"
        if 0 < len(file_path):
            try:
                with open(file_path, "wb") as f:
                    f.write(response.body)
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}"
                )
                return (status, infocode, f"failed to write json file",
                        result_dict)  # not -3
        return (status, infocode, return_msg, result_dict)
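
# A standalone sketch (not part of the spider above; the key value is a placeholder) of how
# get_one_batch_urls() assembles one AMAP traffic/status/rectangle request with parse.urlencode:
from urllib import parse

base_uri = "https://restapi.amap.com/v3/traffic/status/rectangle"
query_dict = {
    "level": 6,
    "extensions": "all",
    "output": "json",
    "key": "YOUR_AMAP_KEY",  # placeholder, not a real credential
    "rectangle": "113.267593,23.358604;113.337613,23.412658",
}
print(f"{base_uri}?{parse.urlencode(query_dict)}")
# parse.urlencode percent-encodes the "," and ";" inside the rectangle value,
# which is exactly what the spider itself sends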
Esempio n. 4
0
class FangesfSpider(scrapy.Spider):
    """
		Before the distributed scrapyd deployment, this is a temporary measure for starting multiple fangesf processes (fangesfp2 is a copy of this codebase).
		sys.exit code == 1 # missing DISTRICT_LIST or CITY_LIST
		sys.exit code == 2 # wrong or missing RUN_PURPOSE
		sys.exit code == 3 # wrong or missing CRAWLED_DIR, SAVED_HTML, or SAVED_GAODE_JASON, or fail to get a proxy ip
		On 20190605 Peter wrote this spider upon request
	"""
    name = "fangesf"

    root_path = ""
    log_dir = ""
    resume_break_point_detailed_file_name = "crawled_detailed_html.log"
    resume_break_point_list_file_name = "crawled_list_html.log"
    crawled_list_url_list = []
    crawled_detailed_url_list = []
    debug = False
    city_list = []
    district_list = []
    city_name_for_districts = ""
    run_purpose = None
    save_every_response = False
    overwrite_today = ""
    crawled_dir = ""
    saved_html_dir = ""
    gaode_json_dir = ""
    csv_file_path = None
    bedrooms_links = [
        "g21",
        "g22",
        "g23",
        "g24",
        "g25",
        "g299",
    ]
    over100_filename = ""

    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    proxy_ip_dict = {}
    min_proxy_ip_life_time = 6
    max_proxy_ip_life_time = 180
    use_proxy = False
    proxy_agent = ""

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.city_name_for_districts = self.settings.get(
            "CITY_NAME_FOR_DISTRICTS", default="city")
        self.district_list = self.settings.get("DISTRICT_LIST", default=[])
        if 1 > len(
                self.district_list) and "city" != self.city_name_for_districts:
            self.logger.error(
                f"missing DISTRICT_LIST ({self.district_list}) setting")
            sys.exit(1)
        self.city_list = self.settings.get("CITY_LIST", default=[])
        if 1 > len(self.city_list) and "city" == self.city_name_for_districts:
            self.logger.error(f"missing CITY_LIST ({self.city_list}) setting")
            sys.exit(1)
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        if self.run_purpose is None:
            self.logger.error(
                f"missing RUN_PURPOSE ({self.run_purpose}) setting")
            sys.exit(2)
        self.save_every_response = self.settings.get(
            name="SAVE_EVERY_RESPONSE", default=False)
        self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="")
        if not hasattr(self, "overwrite_today") or 1 > len(
                self.overwrite_today) or self.overwrite_today is None:
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # set all paths
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")
        self.gaode_json_dir = self.settings.get(name="SAVED_GAODE_JASON",
                                                default="")
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"fang_esf{self.overwrite_today}.csv")

        if 1 > len(self.crawled_dir) or 1 > len(
                self.saved_html_dir) or 1 > len(self.gaode_json_dir):
            error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}), or SAVED_GAODE_JASON ({self.gaode_json_dir}) setting(s)"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            sys.exit(3)

        self.over100_filename = self.settings.get(name="OVER100_LOG_FILENAME",
                                                  default="")

        self.min_proxy_ip_life_time = self.settings.get(
            name="MIN_PROXY_LIFE_SPAN", default=6)
        self.max_proxy_ip_life_time = self.settings.get(
            name="MAX_PROXY_LIFE_SPAN", default=180)
        self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED",
                                           default=False)
        self.proxy_agent = self.settings.get(name="PROXY_AGENT", default="")

    def make_dirs(self):
        # even when the cache is used, we still save all html files; here we make these 3 dirs if they do not exist
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)
        if not os.path.isdir(self.gaode_json_dir):
            os.makedirs(self.gaode_json_dir)

    def proxy_ip_pool(self):
        """
			XunLian proxy error code 10000:		extracting too fast; please wait at least 5 seconds between extractions
		"""
        if "DRAGONFLY" == self.proxy_agent:
            return CommonClass.get_proxies(proxy_dict={})
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time,
                self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict
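    # A minimal sketch (hypothetical timestamps) of the proxy caching above: a fetched proxy is
    # stamped with expire = now + randint(MIN_PROXY_LIFE_SPAN, MAX_PROXY_LIFE_SPAN) seconds and
    # is reused until fewer than 3 seconds of that lifetime remain, e.g.
    #   fetched at now = 1000.0 with randint(6, 180) == 60 -> expire = 1060.0
    #   calls made while now + 3 <= 1060.0 reuse the same ip; later calls fetch a new one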

    def read_crawled_urls(self):
        resume_break_point_detailed_file_path = os.path.join(
            self.log_dir, self.resume_break_point_detailed_file_name)
        try:
            with open(resume_break_point_detailed_file_path,
                      "r",
                      encoding="utf-8") as log_file:
                # strip trailing newlines so that membership checks against response urls work
                self.crawled_detailed_url_list = [
                    one_url.strip() for one_url in log_file.readlines()
                    if 0 < len(one_url.strip())
                ]
        except Exception as ex:
            error_msg = f"fail to read {resume_break_point_detailed_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        # for list pages, do not use this [] to exclude seen urls
        # resume_break_point_list_file_path = os.path.join( self.log_dir, self.resume_break_point_list_file_name )
        # try:
        # 	with open( resume_break_point_list_file_path, "r", encoding="utf-8" ) as log_file:
        # 		self.crawled_list_url_list = log_file.readlines()
        # 		while "" in self.crawled_list_url_list:
        # 			self.crawled_list_url_list.remove("")
        # except Exception as ex:
        # 	error_msg = f"fail to read {resume_break_point_list_file_path}"
        # 	self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" )

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            if "city" == self.city_name_for_districts:
                city_list = self.city_list
            else:
                city_list = self.district_list
            number_day_of_this_year = datetime.datetime.now().timetuple(
            ).tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(city_list):
                seperate_into_days = len(city_list)
            batch_count = math.ceil(len(city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
            urls = []
            for index, city in enumerate(city_list):
                if (start_index < index) and (index < end_index):
                    url = f"https://{city}.esf.fang.com/" if "city" == self.city_name_for_districts else f"https://{self.city_name_for_districts}.esf.fang.com/house-{city}/"
                    urls.append(url)

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if "city" != self.city_name_for_districts:
                meta_dict["index_level"] = 1

            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0],
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)

    def get_total_pages(self, response=None):
        """
			if ONE page already includes all records, there is still one element reading "共1页" ("1 page in total")
			if ONE page contains 0 records, there is no "共x页" ("x pages in total") element
		"""
        total_pages = 0
        if response is None:
            return total_pages
        all_ps = response.xpath("//div[@id='list_D10_15']/p")
        total_pages_p = ""
        for one_p in all_ps:
            total_pages_p = one_p.xpath("./text()").extract_first(default="")
            if 0 < len(total_pages_p) and -1 < total_pages_p.find("共"):
                break
        if -1 < total_pages_p.find("共"):
            search_obj = re.search(r"(\d)+", total_pages_p, re.M | re.I)
            if search_obj is not None:
                start = search_obj.span()[0]
                end = search_obj.span()[1]
                if 0 < len(total_pages_p[start:end]):
                    total_pages = int(total_pages_p[start:end])
        else:
            error_msg = f"cannot find total page at uri {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
        return total_pages
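    # A minimal sketch (hypothetical element text) of the total-page extraction above:
    #   total_pages_p = "共100页" -> re.search(r"(\d)+", ...) spans "100" -> total_pages = 100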

    def get_city_from_url(self, url=""):
        city = ""
        result_obj = parse.urlparse(url)
        if -1 < result_obj.netloc.find("fang.com"):
            temp2_list = result_obj.netloc.split(".")
            if 4 == len(temp2_list):
                city = temp2_list[0]
        return city
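    # A minimal sketch of get_city_from_url() on a detail url used by this spider:
    #   "https://sz.esf.fang.com/chushou/3_218307566.htm"
    #   -> netloc "sz.esf.fang.com" splits into ["sz", "esf", "fang", "com"] (4 parts) -> city = "sz"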

    def make_html_file_name(self, url="", city=""):
        now = datetime.datetime.now()
        html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S"))
        today = now.strftime("%Y%m%d")

        result_obj = parse.urlparse(url)
        url_list = result_obj.path.split("/")
        while "" in url_list:
            url_list.remove("")

        detail_page = False
        last_part = url_list[len(url_list) - 1] if 0 < len(url_list) else ""
        if -1 < last_part.find(".htm"):
            detail_page = True
            # /chushou/3_218307566.htm ==> https://sz.esf.fang.com/chushou/3_218307566.htm
            temp = last_part.split("_")
            apt_id = f"{last_part}"
            if 1 < len(temp):
                apt_id = f"{temp[1]}"
            html_filename = f"{city}_{apt_id}_{today}.html"
        elif -1 < result_obj.netloc.find("fang.com") and 1 > len(url_list):
            # list page #1: https://sz.esf.fang.com/
            html_filename = f"{city}_index1_{today}.html"
        else:
            page, district_area, bedrooms = self.get_page_and_district_area(
                url_list=url_list)

            if 0 < len(district_area):
                html_filename = f"{city}_{district_area}_index{page}_{today}.html"
            else:
                html_filename = f"{city}_index{page}_{today}.html"
        return (detail_page, html_filename)

    def get_page_and_district_area(self, url_list=[]):
        """
			list page #2 or more, or including channels like:
			https://sz.esf.fang.com/house-a013080/
			where a013080 stands for Longhua District, Shenzhen
			or https://sz.esf.fang.com/house-a013080-b014334 or https://sz.esf.fang.com/house-a013080-b02094/i372/
			where b014334 stands for Dalang, Longhua District; house-a013080-b02094 stands for Guanlan;
			house-a013080-b0350 stands for Longhua; house-a013080-b014333 stands for Minzhi
			https://sz.esf.fang.com/house-a087-b0342/g22/ where g22 stands for 2-bedroom;
			g21 (1-bedroom), g23 (3-bedroom), g24 (4-bedroom), g25 (5-bedroom), g299 (more than 5 bedrooms)
			# this filter is a multiple-choice option on the site, but this crawl will ONLY use a single choice
		"""
        page = "1"
        district_area = ""
        bedrooms = 0
        for index, key in enumerate(url_list):
            one_fragment = url_list[index]
            if -1 < one_fragment.find("i3") and -1 == one_fragment.find(
                    "house-"):
                page = one_fragment[2:]
            elif -1 < one_fragment.find("house-") and -1 == one_fragment.find(
                    "i3"):
                district_area = one_fragment.replace("house-", "")
                district_area = district_area.replace("-", "_")
                if index + 1 < len(url_list):
                    next_fragment = url_list[index + 1]
                    if -1 < next_fragment.find("g2"):
                        last_part_of_fragment = next_fragment.replace("g2", "")
                        if -1 < last_part_of_fragment.find("-i3"):
                            temp_list = last_part_of_fragment.split("-i3")
                            if 1 < len(temp_list):
                                bedrooms = int(temp_list[0])
                        else:
                            bedrooms = int(last_part_of_fragment)
        return (page, district_area, bedrooms)
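    # A minimal sketch (url path fragments taken from the docstring above) of the mapping:
    #   ["house-a013080-b02094", "i372"] -> ("72", "a013080_b02094", 0)
    #   ["house-a087-b0342", "g22"]      -> ("1", "a087_b0342", 2)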

    def save_html(self, response=None, save100=False):
        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return -1
        url = response.url
        meta_dict = response.meta
        page_type = "index"
        total_pages = 0
        city = self.get_city_from_url(url=url)

        if "page_type" in meta_dict.keys():
            page_type = meta_dict["page_type"]

        if "index" == page_type:
            if "total_pages" in meta_dict.keys():
                total_pages = int(meta_dict["total_pages"])

            if 0 == total_pages:
                total_pages = self.get_total_pages(response=response)

            if 99 < total_pages and not save100:
                return 101
            # https://sz.esf.fang.com/house-a013080/
            detail_page, html_filename = self.make_html_file_name(url=url,
                                                                  city=city)
            html_file_path = os.path.join(self.saved_html_dir, html_filename)
            save_html_file = True

        elif "detailed" == page_type:
            apt_id = self.get_apt_id(url=url)
            today = datetime.datetime.now().strftime("%Y%m%d")
            html_filename = f"{city}___{apt_id}___{today}.html"
            html_file_path = os.path.join(self.saved_html_dir, html_filename)
            save_html_file = True
            total_pages = 1001
            # https://sz.esf.fang.com/chushou/3_218307566.htm

        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return -2
        else:
            if 1 > total_pages:
                error_msg = f"response.body saved after requesting {response.url}; but fail to extract total page number from response.body"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return total_pages  # could be 100 when save100 = True

    def extract_link_list(self, response=None):
        link_list = response.xpath(
            '//div[@class="shop_list shop_list_4"]/dl[@class="clearfix"]/dd/h4[@class="clearfix"]/a/@href'
        ).extract()
        if 1 > len(link_list):
            error_msg = f"Fail to extract links from {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
        return link_list

    def divide_request_into_next_level(self, response=None):
        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            error_msg = f"meta = {hasattr( response, 'meta' )}; body = {hasattr( response, 'body' )}; url = {hasattr( response, 'url' )}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return -1
        url = response.url
        result_obj = parse.urlparse(url)
        url_list = result_obj.path.split("/")
        while "" in url_list:
            url_list.remove("")
        meta_dict = response.meta
        index_level = 0
        if "index_level" in meta_dict.keys():
            index_level = int(meta_dict["index_level"])
        page, district_area, bedrooms = self.get_page_and_district_area(
            url_list=url_list)

        if 0 < bedrooms:
            # as of 20190605, we ONLY care level upto bedrooms
            page_status = self.save_html(response=response, save100=True)
            self.write_log(content=f"{response.url}",
                           logfilename=self.over100_filename,
                           content_only=True)

        # district_area has higher priority than index_level
        if 0 < len(district_area):
            temp_list = district_area.split("_")
            if index_level != len(temp_list):
                error_msg = f"index_level {index_level} != {len( temp_list )} ({district_area})"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
                index_level = len(temp_list)
        else:
            if 0 != index_level:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {index_level} is not 0"
                )
                index_level = 0

        pointer, link_list = self.extract_this_level_screen_options(
            response=response,
            index_level=index_level,
            district_area=district_area,
            bedrooms=0)
        return (pointer, link_list, index_level)

    def make_new_url(self, url="", index_level=0, fragment=""):
        result_obj = parse.urlparse(url)
        url_path_list = result_obj.path.split("/")
        while "" in url_path_list:
            url_path_list.remove("")

        has_bedroom = False
        for one_path in url_path_list:
            if 0 == one_path.find("i3"):
                return ""
                # https://sz.esf.fang.com/house-a013080-b02094/i372/
            if 0 == one_path.find("g2"):
                has_bedroom = True
        if 2 == index_level:
            if has_bedroom and 0 == fragment.find("g2"):
                return ""
                # ONLY one option is selected
            return_url = f"{result_obj.scheme}://{result_obj.netloc}/{result_obj.path}{fragment}"
            if 0 == result_obj.path.find('/'):
                return_url = f"{result_obj.scheme}://{result_obj.netloc}{result_obj.path}{fragment}"
            return return_url
            # returns the first url: https://sz.esf.fang.com/house-a090-b0352/g23/
            # but for page #2 and above, url shall be: https://sz.esf.fang.com/house-a090-b0352/g23-i37/
        return_url = f"{result_obj.scheme}://{result_obj.netloc}/{fragment}"
        if 0 == fragment.find('/'):
            return_url = f"{result_obj.scheme}://{result_obj.netloc}{fragment}"
        return return_url

    def extract_this_level_screen_options(self,
                                          response=None,
                                          index_level=0,
                                          district_area="",
                                          bedrooms=0):
        """
			currently ONLY 1 > pointer will be returned
		"""
        link_list = []
        if 1 > index_level:
            link_list = response.xpath(
                '//div[@class="screen_al"]/ul/li[@class="clearfix screen_list"]/ul[@class="clearfix choose_screen floatl"]/li/a/@href'
            ).extract()
            # remove subway-line links
            temp_list = []
            for one_link in link_list:
                if -1 == one_link.find("house1-"):
                    temp_list.append(one_link)
            link_list = temp_list
        elif 1 == index_level:
            link_list = response.xpath(
                '//div[@class="screen_al"]/ul/li[@class="area_sq"]/ul[@class="clearfix"]/li/a/@href'
            ).extract()
        elif 2 == index_level:
            if 0 < bedrooms:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {bedrooms} is not 0; as of 20190605, we ONLY have 3 levels to divide requests"
                )
                return (-1, [])  # this for future ONLY
            return (0, self.bedrooms_links)
        else:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {index_level} over 2; as of 20190605, we ONLY have 3 levels to divide requests"
            )
            return (-1, [])
        temp_list = []
        for one_link in link_list:
            temp_string = one_link.replace("/", "")
            temp_string = temp_string.replace("house-", "")
            temp_list.append(temp_string.replace("-", "_"))
        pointer = 0  # currently ONLY 1 > pointer will be returned
        return (pointer, link_list)

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value("content", str(text))  # , encoding="utf-8"
        loader.add_value("page_type", "detailed")

        # record housekeeping fields
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get('BOT_NAME'))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date",
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
        return loader

    def parse_detailed_response_field(self, response=None, city="", apt_id=""):
        text = {}
        if response is None:
            return text
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return text
        title = response.xpath("//div[@id='lpname']/h1/text()").extract_first(
            default="")
        if 1 > len(title):
            title = response.xpath(
                "//div[@class='tab-cont clearfix']/div[@class='title rel']/h1[@class='title floatl']/text()"
            ).extract_first(default="")

        title_right_box = response.xpath("//div[@class='tab-cont-right']")
        price_div = title_right_box.xpath(
            "./div[@class='tr-line clearfix zf_new_title']/div[@class='trl-item_top']/div[@class='rel floatl']/preceding-sibling::div"
        )
        price_list = price_div.xpath("string(.)").extract()
        price = "___".join(price_list)

        # extract features
        feature_div = title_right_box.xpath(
            "./div[@class='tr-line clearfix']/div[contains(@class,'trl-item1')]"
        )
        feature_dict = {}
        for one_item in feature_div:
            key = one_item.xpath(
                "./div[@class='font14']/text()").extract_first(default="")
            value = one_item.xpath("./div[@class='tt']/text()").extract_first(
                default="")
            if 0 < len(key):
                feature_dict[key] = CommonClass.clean_string(string=value,
                                                             char_to_remove=[
                                                                 '\r',
                                                                 '\n',
                                                                 '\t',
                                                                 ' ',
                                                             ])

        # extract location information
        location_div = title_right_box.xpath(
            "./div[@class='tr-line']/div[@class='trl-item2 clearfix']")
        location_dict = {}
        for one_location in location_div:
            key = one_location.xpath(
                "./div[@class='lab']/text()").extract_first(default="")
            value_list = one_location.xpath(
                "string(./div[@class='rcont'])").extract()
            temp_list = []
            for one_value in value_list:
                temp = CommonClass.clean_string(string=one_value,
                                                char_to_remove=[
                                                    '\xa0',
                                                    '\n',
                                                    '\t',
                                                    ' ',
                                                ])
                temp_list.append(temp.strip('\r'))
                # keep \r
            if 0 < len(key):
                key = CommonClass.clean_string(string=key,
                                               char_to_remove=[
                                                   '\u2003',
                                                   '\xa0',
                                                   '\n',
                                                   '\t',
                                                   ' ',
                                               ])
                location_dict[key] = "___".join(temp_list)

        information_box = response.xpath(
            "//div[@class='content-item fydes-item']")
        information_title_list = information_box.xpath(
            "string(./div[@class='title'])").extract()
        information_title = "___".join(
            information_title_list) if 0 < len(information_title_list) else ""
        information1div = information_box.xpath(
            "./div[@class='cont clearfix']/div[@class='text-item clearfix']")
        information_dict = {}
        for one_item in information1div:
            key = one_item.xpath("./span[@class='lab']/text()").extract_first(
                default="")
            value_list = one_item.xpath(
                "string(./span[@class='rcont'])").extract()
            temp_list = []
            for one_value in value_list:
                temp = CommonClass.clean_string(string=one_value,
                                                char_to_remove=[
                                                    '\xa0',
                                                    '\n',
                                                    '\t',
                                                    ' ',
                                                ])
                temp_list.append(temp.strip('\r'))
            if 0 < len(key):
                information_dict[key] = "___".join(temp_list)

        community_box1 = response.xpath("//div[@id='xq_message']")
        community_title = community_box1.xpath("./text()").extract_first(
            default="")
        community_title = CommonClass.clean_string(string=community_title,
                                                   char_to_remove=[
                                                       '\xa0',
                                                       '\n',
                                                       '\t',
                                                       ' ',
                                                   ])
        community_dict = {
            "title": community_title.strip('\r'),
        }
        community_box2 = community_box1.xpath("./following-sibling::div")
        community_box2line1 = community_box2.xpath(
            "./div[@class='topt clearfix']")
        line1_list = community_box2line1.xpath(
            "./div[@class='text-item clearfix']")
        for one_item in line1_list:
            key = one_item.xpath("./span[@class='lab']/text()").extract_first(
                default="")
            value_list = one_item.xpath(
                "string(./span[@class='rcont'])").extract()
            if 0 < len(key):
                community_dict[key] = "___".join(value_list)

        community_box2line2 = community_box2line1.xpath(
            "./following-sibling::div")
        line2_list = community_box2line2.xpath(
            "./div[@class='text-item clearfix']")
        for one_item in line2_list:
            key = one_item.xpath("./span[@class='lab']/text()").extract_first(
                default="")
            value = one_item.xpath(
                "./span[@class='rcont ']/text()").extract_first(default="")
            if 0 < len(key):
                key = CommonClass.clean_string(string=key,
                                               char_to_remove=[
                                                   '\xa0',
                                                   '\n',
                                                   '\t',
                                                   ' ',
                                               ])
                community_dict[key] = CommonClass.clean_string(string=value,
                                                               char_to_remove=[
                                                                   '\xa0',
                                                                   '\n',
                                                                   '\t',
                                                                   ' ',
                                                                   '\r',
                                                               ])

        community_box2line3 = community_box2line2.xpath(
            "./following-sibling::div")
        community_box2line3key = community_box2line3.xpath(
            "./div[@class='text-item']/span[@class='lab']/text()"
        ).extract_first(default="")
        community_box2line3value = community_box2line3.xpath(
            "string(./div[@class='text-item']/span[@class='rcont'])").extract(
            )
        temp_list = []
        for one_value in community_box2line3value:
            temp = CommonClass.clean_string(string=one_value,
                                            char_to_remove=[
                                                '\xa0',
                                                '\n',
                                                '\t',
                                                ' ',
                                            ])
            temp = temp.strip('\r')
            if 0 < len(temp):
                temp_list.append(temp)
        if 0 < len(community_box2line3key):
            community_dict[community_box2line3key] = "".join(temp_list)

        text = {
            "title": title.strip(),
            "price": price.strip(),
            "feature": feature_dict,
            "location": location_dict,
            "information": information_dict,
            "community": community_dict,
            "city": city,
            "apt_id": apt_id,
        }
        return text

    def get_apt_id(self, url=""):
        apt_id = 0
        result_obj = parse.urlparse(url)
        url_list = result_obj.path.split("/")
        while "" in url_list:
            url_list.remove("")
        last_part = url_list[len(url_list) - 1]
        if -1 < last_part.find(".htm"):
            temp = last_part.split("_")
            if 1 < len(temp):
                temp = f"{temp[1]}"
                search_obj = re.search(r"(\d)+", temp, re.M | re.I)
                if search_obj is not None:
                    start = search_obj.span()[0]
                    end = search_obj.span()[1]
                    if 0 < len(temp[start:end]):
                        apt_id = int(temp[start:end])
        if 1 > apt_id:
            return f"random{random.randint(10000,99999)}"
        return str(apt_id)
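
    # Worked example (illustrative sketch, not in the original code): for a detailed-page
    # URL such as https://sz.esf.fang.com/chushou/3_218307566.htm the last path segment is
    # "3_218307566.htm"; get_apt_id() splits it on "_" and keeps the digits of the second
    # part, returning "218307566". When no id can be recovered it falls back to a
    # "randomXXXXX" placeholder string.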

    def log_for_picking_up_the_crawl_break_point(self,
                                                 page_type="detailed",
                                                 response=None):
        if "detailed" == page_type:
            resume_break_point_file_path = os.path.join(
                self.log_dir, self.resume_break_point_detailed_file_name)
        else:
            resume_break_point_file_path = os.path.join(
                self.log_dir, self.resume_break_point_list_file_name)
        try:
            with open(resume_break_point_file_path, "a") as f:
                f.write(f"{response.url}\n")
        except Exception as ex:
            error_msg = f"fail to write response.url into {resume_break_point_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

    def parse_detailed_page(self, response=None):
        url = response.url
        result_obj = parse.urlparse(url)
        has_url_error = self.url_contains_error(
            result_obj_path=result_obj.path)
        if has_url_error:
            return False

        page_status = self.save_html(response=response, save100=True)
        city = self.get_city_from_url(url=url)
        apt_id = self.get_apt_id(url=url)
        text = self.parse_detailed_response_field(response=response,
                                                  city=city,
                                                  apt_id=apt_id)
        try:
            loader = ItemLoader(item=FangesfItem(), response=response)
            loader = self.load_items_into_loader(loader=loader,
                                                 text=text,
                                                 url=url)
            self.log_for_picking_up_the_crawl_break_point(page_type="detailed",
                                                          response=response)
            yield loader.load_item()
        except Exception as ex:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
            )

    def do_nothing_for_debug(self, response=None):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}"
        )
        # print( response.body )
        # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}]
        # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}'
        # 2019-06-20 16:28:55 [fangesf] INFO: Inside Method do_nothing_for_debug of Class FangesfSpider,
        # url = https://www.coursehelper.site/index/index/getHeaders?token=ad89558c89c3394167adbfd1484c8700
        # 2019-06-20 16:28:55 [stdout] INFO: b'{"REMOTE_ADDR":"139.196.200.61","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"139.196.200.61, 139.196.200.61"}'

    def url_contains_i3_page(self, result_obj_path=""):
        if 1 > len(result_obj_path):
            return False
        path_fragment_list = result_obj_path.split("/")
        if 1 > len(path_fragment_list):
            return False
        for one in path_fragment_list:
            if 0 == one.find("i3"):
                return True
            elif 0 == one.find("g2") and 0 < one.find("-i3"):
                # the bedroom url looks like g299-i39
                return True
        return False

    def url_contains_error(self, result_obj_path=""):
        if 1 > len(result_obj_path):
            return False
        path_fragment_list = result_obj_path.split("/")
        if 1 > len(path_fragment_list):
            return False

        # https://sz.esf.fang.com/staticsearchlist/Error/Error404?aspxerrorpath=/house-a013057/i330/i330
        for one in path_fragment_list:
            if -1 < one.find("Error") or -1 < one.find(
                    "Error404") or -1 < one.find("staticsearchlist"):
                self.logger.info(
                    f"Error! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}"
                )
                return True

        # http://search.fang.com/captcha-verify/?t=1559927114.963&h=aHR0cHM6Ly9zei5lc2YuZmFuZy5jb20vaG91c2UtYTA5MC1iMDM1NC9nMjU%3D&c=cmE6MTE0LjI1Mi4yMTIuMjEwO3hyaTo7eGZmOg%3D%3D
        for one in path_fragment_list:
            if -1 < one.find("captcha") or -1 < one.find("verify"):
                self.logger.info(
                    f"Need captcha-verify! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}"
                )
                return True

        return False

    def parse_list_page(self, response=None):
        """
			0 == index_level: https://shaoguan.esf.fang.com/house/i32/ or https://shaoguan.esf.fang.com
			1 == index_level: https://gz.esf.fang.com/house-a072/i34/ or https://gz.esf.fang.com/house-a072/
			2 == index_level: https://gz.esf.fang.com/house-a072-b0627/i35/ or https://gz.esf.fang.com/house-a072-b0627/
			3 == index_level: https://sz.esf.fang.com/house-a090-b0352/g23-i37/ or https://sz.esf.fang.com/house-a090-b0352/g23/
		"""
        result_obj = parse.urlparse(response.url)
        has_url_error = self.url_contains_error(
            result_obj_path=result_obj.path)
        if has_url_error:
            return False

        page_status = self.save_html(response=response, save100=False)
        if 1 > page_status:
            pass
            # -2, -1, 0: error_msg has been logged; just pass
        elif 0 < page_status and 101 > page_status and not has_url_error:
            # 1 to 100 also means "index" == page_type
            link_list = self.extract_link_list(response=response)
            if self.debug:
                self.logger.info(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}; link_list = {link_list}"
                )
            else:
                self.log_for_picking_up_the_crawl_break_point(
                    page_type="index", response=response)
                new_url = f"{result_obj.scheme}://{result_obj.netloc}"

                # crawling vertically
                meta_dict = {
                    "page_type": "detailed",
                    "total_pages": 1,
                }
                if self.use_proxy:
                    proxies_dict = self.proxy_ip_pool()
                    meta_dict["proxy"] = proxies_dict["http"]
                for one_link in link_list:
                    if 0 != one_link.find('/'):
                        one_link = f"/{one_link}"
                    this_i_url = f"{new_url}{one_link}"
                    if this_i_url in self.crawled_detailed_url_list:
                        self.logger.info(f"previously crawled {this_i_url}")
                    else:
                        self.logger.info(f"requesting {this_i_url}")
                        yield scrapy.Request(url=this_i_url,
                                             callback=self.parse_detailed_page,
                                             meta=meta_dict,
                                             dont_filter=False)

                # crawling horizontally
                if 1 < page_status and not self.url_contains_i3_page(
                        result_obj_path=result_obj.path):
                    meta_dict = response.meta
                    meta_dict["total_pages"] = page_status
                    new_url = f"{new_url}{result_obj.path}"
                    if len(new_url) - 1 != new_url.rfind('/'):
                        new_url = f"{new_url}/"
                    is_bedroom_url = False  # https://sz.esf.fang.com/house-a090-b0352/i36/
                    if "index_level" in meta_dict.keys() and 3 == int(
                            meta_dict["index_level"]):
                        is_bedroom_url = True
                        new_url = new_url.rstrip('/')
                        # https://sz.esf.fang.com/house-a090-b0352/g23-i37/
                    elif "index_level" in meta_dict.keys() and 0 == int(
                            meta_dict["index_level"]):
                        if 1 > len(result_obj.path):
                            new_url = f"{new_url}house/"
                        elif -1 == result_obj.path.find("house"):
                            new_url = f"{new_url}house/"
                        # this city ONLY has 2 to 99 list pages and there is no need to divide requests into next level
                        # therefore 0 == index_level
                        # https://shaoguan.esf.fang.com/house/i32/

                    if self.use_proxy:
                        proxies_dict = self.proxy_ip_pool()
                        meta_dict["proxy"] = proxies_dict["http"]
                    for i in range(page_status - 1):
                        this_i_url = f"{new_url}-i3{i + 2}" if is_bedroom_url else f"{new_url}i3{i + 2}"
                        self.logger.info(
                            f"requesting list page at {this_i_url}")
                        yield scrapy.Request(url=f"{this_i_url}",
                                             callback=self.parse_list_page,
                                             meta=meta_dict,
                                             dont_filter=True)
        elif 101 == page_status and not has_url_error:
            # 101 also means "index" == page_type
            self.log_for_picking_up_the_crawl_break_point(page_type="index",
                                                          response=response)
            pointer, link_list, index_level = self.divide_request_into_next_level(
                response=response)
            # https://sz.esf.fang.com/house-a090-b0352/g23-i37/
            if -1 < pointer:
                # using level3 bedrooms
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": index_level + 1,
                }
                if self.use_proxy:
                    proxies_dict = self.proxy_ip_pool()
                    meta_dict["proxy"] = proxies_dict["http"]
                for i in range(len(link_list) - pointer):
                    new_url = self.make_new_url(url=response.url,
                                                index_level=index_level,
                                                fragment=link_list[i +
                                                                   pointer])
                    if 0 < len(new_url):
                        self.logger.info(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, requesting {new_url}; meta_dict = {meta_dict}"
                        )
                        yield scrapy.Request(url=new_url,
                                             callback=self.parse_list_page,
                                             meta=meta_dict,
                                             dont_filter=True)
        elif 1001 == page_status and not has_url_error:
            self.parse_detailed_page(response=response)
            # 1001 also means "detailed" == page_type
            # will never reach here because self.parse_detailed_page() is the callback method

    def read_and_parse(self, response=None):
        file_list = os.listdir(self.saved_html_dir)
        for one_file in file_list:
            if -1 == one_file.find("index"):
                temp_list = one_file.split("___")
                apt_id = 0
                city = ""
                if 1 < len(temp_list):
                    apt_id = temp_list[1]
                    city = temp_list[0]
                url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm"  # can also be 16_, 10_, and others
                # https://sz.esf.fang.com/chushou/3_218307566.htm
                html_file_path = os.path.join(self.saved_html_dir, one_file)
                if os.path.isfile(html_file_path):
                    doc = None
                    with open(html_file_path, 'rb') as f:
                        # doc = f.read().decode('gb2312', 'ignore')
                        doc = f.read().decode('utf-8', 'ignore')
                    if doc is None:
                        self.logger.error(
                            f"Error: cannot read html file {html_file_path}.")
                        continue
                    response = Selector(text=doc, type="html")
                    text = self.parse_detailed_response_field(
                        response=response, city=city, apt_id=apt_id)
                    try:
                        response_for_items = TextResponse(
                            url=url,
                            status=200,
                            body=bytes(doc, encoding="utf-8"))
                        loader = ItemLoader(item=FangesfItem(),
                                            response=response_for_items)
                        loader = self.load_items_into_loader(loader=loader,
                                                             text=text,
                                                             url=url)
                        yield loader.load_item()
                    except Exception as ex:
                        self.logger.info(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, Exception = {ex}"
                        )
                    if self.debug:
                        break

    def write_log(self, content=None, logfilename=None, content_only=False):
        if content is not None and 0 < len(content):
            today = datetime.datetime.now().strftime("%Y%m%d")
            if logfilename is None:
                logfilename = f"{self.name}{today}.log"
            try:
                with open(os.path.join(self.log_dir, logfilename),
                          'a',
                          encoding='utf-8') as f:
                    if content_only:
                        info = f"{str(content)}\n"
                    else:
                        info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
                    f.write(info)
                return 1
            except Exception as ex:
                return 0
        return -1
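

# ---------------------------------------------------------------------------
# A minimal standalone sketch (not part of the original spider) of the offline
# re-parsing technique used in read_and_parse() above: a previously saved HTML
# file is decoded, wrapped in a Selector for XPath extraction, and wrapped in a
# TextResponse so an ItemLoader could still be fed from it. The file path and
# URL below are hypothetical placeholders.
# ---------------------------------------------------------------------------
from scrapy.selector import Selector
from scrapy.http import TextResponse


def reparse_saved_html(html_file_path="sz___218307566___20190620.html",
                       url="https://sz.esf.fang.com/chushou/3_218307566.htm"):
    with open(html_file_path, "rb") as f:
        doc = f.read().decode("utf-8", "ignore")
    selector = Selector(text=doc, type="html")  # for XPath / CSS extraction
    response = TextResponse(url=url, status=200,
                            body=bytes(doc, encoding="utf-8"))  # for ItemLoader
    title = selector.xpath("//div[@id='lpname']/h1/text()").extract_first(
        default="")
    return title, response
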
Esempio n. 5
0
class Shop58Spider(scrapy.Spider):
    """
		sys.exit code == 1 # missing CITY_LIST
		sys.exit code == 2 # wrong or missing RUN_PURPOSE
		sys.exit code == 3 # missing CRAWLED_DIR, SAVED_HTML, or SAVED_GAODE_JASON setting(s), or failure to get a proxy ip
		Peter wrote this spider upon request on 20190605
	"""
    name = "shop58"

    root_path = ""
    log_dir = ""
    over70_filename = ""
    resume_break_point_detailed_file_name = "crawled_detailed_html.log"
    resume_break_point_list_file_name = "crawled_list_html.log"
    crawled_list_url_list = []
    crawled_detailed_url_list = []
    debug = False
    city_list = []
    run_purpose = None
    save_every_response = False
    overwrite_today = ""
    crawled_dir = ""
    saved_html_dir = ""
    gaode_json_dir = ""
    csv_file_path = None

    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    proxy_ip_dict = {}
    min_proxy_ip_life_time = 6
    max_proxy_ip_life_time = 180
    use_proxy = False

    shop_area_uri_list = [
        "0_20",
        "20_50",
        "50_100",
        "100_200",
        "200_500",
        "500_%2A",
    ]

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.over70_filename = self.settings.get(name="OVER70_LOG_FILENAME",
                                                 default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.city_list = self.settings.get("CITY_LIST", default=[])
        if 1 > len(self.city_list):
            self.logger.error(f"missing CITY_LIST ({self.city_list}) setting")
            sys.exit(1)
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        if self.run_purpose is None:
            self.logger.error(
                f"missing RUN_PURPOSE ({self.run_purpose}) setting")
            sys.exit(2)
        self.save_every_response = self.settings.get(
            name="SAVE_EVERY_RESPONSE", default=False)
        self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="")
        if self.overwrite_today is None or 1 > len(self.overwrite_today):
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # set all paths
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")
        self.gaode_json_dir = self.settings.get(name="SAVED_GAODE_JASON",
                                                default="")
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"shop58_{self.overwrite_today}.csv")

        if 1 > len(self.crawled_dir) or 1 > len(
                self.saved_html_dir) or 1 > len(self.gaode_json_dir):
            error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}), or SAVED_GAODE_JASON ({self.gaode_json_dir}) setting(s)"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            sys.exit(3)

        self.min_proxy_ip_life_time = self.settings.get(
            name="MIN_PROXY_LIFE_SPAN", default=6)
        self.max_proxy_ip_life_time = self.settings.get(
            name="MAX_PROXY_LIFE_SPAN", default=180)
        self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED",
                                           default=False)

    def make_dirs(self):
        # even cache is used, we save all html files; here we make these 3 dirs if they do not exist
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)
        if not os.path.isdir(self.gaode_json_dir):
            os.makedirs(self.gaode_json_dir)

    def proxy_ip_pool(self):
        """
			10000	means the proxy ip was extracted too fast; extract at most once every 5 seconds
		"""
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time,
                self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict
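
    # Illustration (shape inferred from the code above, values are placeholders): the
    # pooled dict returned by proxy_ip_pool() looks roughly like
    #     {"http": "http://49.87.226.131:10749", "expire": 1560301497.0}
    # and is reused until "expire" is within ~3 seconds of the current time, at which
    # point a fresh proxy ip is requested from ProxyAgent.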

    def read_crawled_urls(self):
        """
			for resume crawling at a break point
		"""
        resume_break_point_detailed_file_path = os.path.join(
            self.log_dir, self.resume_break_point_detailed_file_name)
        try:
            with open(resume_break_point_detailed_file_path,
                      "r",
                      encoding="utf-8") as log_file:
                self.crawled_detailed_url_list = log_file.readlines()
                while "" in self.crawled_detailed_url_list:
                    self.crawled_detailed_url_list.remove("")
        except Exception as ex:
            error_msg = f"fail to read {resume_break_point_detailed_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            city_list = self.settings.get("CITY_LIST", default=[])
            number_day_of_this_year = datetime.datetime.now().timetuple(
            ).tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(city_list):
                seperate_into_days = len(city_list)
            batch_count = math.ceil(len(city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
            urls = []
            for index, city in enumerate(city_list):
                if (start_index < index) and (index < end_index):
                    urls.append(f"https://{city}.58.com/shangpu/")

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        elif "SAVE_ONE_HTML" == self.run_purpose:
            url = "https://gz.58.com/shangpu/"
            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            yield scrapy.Request(url=url,
                                 callback=self.do_nothing_for_debug,
                                 meta=meta_dict,
                                 dont_filter=True)
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)

    def remove_url_page_part(self, url="", add_query=True):
        """
			this version ignore url_obj.fragment and url_obj.params
		"""
        new_url = url
        url_obj = parse.urlparse(url)
        if hasattr(url_obj, "path"):
            url_list = url_obj.path.split("/")
            path_changed = False
            for one in url_list:
                if 0 == one.find("pn"):
                    url_list.remove(one)
                    path_changed = True
            if path_changed:
                new_path = "/".join(url_list)
                new_url = f"{url_obj.scheme}://{url_obj.netloc}"
                new_url = new_url.rstrip("/")
                if 0 < len(new_path):
                    new_path = new_path.lstrip("/")
                    new_url = f"{new_url}/{new_path}"

                if hasattr(url_obj,
                           "query") and 0 < len(url_obj.query) and add_query:
                    new_url = f"{new_url}?{url_obj.query}"
        return new_url
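
    # Worked example (illustrative sketch): "https://gz.58.com/shangpucz/pn3/" becomes
    # "https://gz.58.com/shangpucz/", and "https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50"
    # becomes "https://gz.58.com/tianhe/shangpucz/?area=20_50" when add_query is True.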

    def add_url_page_part(self, old_url="", page=2):
        """
			this version ignore url_obj.fragment and url_obj.params
		"""
        url_obj = parse.urlparse(old_url)
        new_url = f"{url_obj.scheme}://{url_obj.netloc}"
        if not hasattr(url_obj, "path") or 1 > len(url_obj.path):
            new_path = f"pn{page}"
        else:
            temp_list = []
            url_list = url_obj.path.split("/")
            for one in url_list:
                if 0 == one.find("pn"):
                    continue
                if 0 < len(one):
                    temp_list.append(one)
            temp_list.append(f"pn{page}")
            new_path = "/".join(temp_list)
        new_url = f"{url_obj.scheme}://{url_obj.netloc}"
        new_url = new_url.rstrip("/")
        new_path = new_path.lstrip("/")
        new_url = f"{new_url}/{new_path}"
        new_url = new_url.rstrip("/")

        if hasattr(url_obj, "query") and 0 < len(url_obj.query):
            new_url = f"{new_url}/?{ url_obj.query }"
        return new_url
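
    # Worked example (illustrative sketch): add_url_page_part("https://gz.58.com/shangpucz/", page=3)
    # returns "https://gz.58.com/shangpucz/pn3", and
    # add_url_page_part("https://gz.58.com/tianhe/shangpucz/?area=20_50", page=3) returns
    # "https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50"; any existing "pnX" segment is
    # replaced rather than duplicated.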

    def get_page_from_url(self, url=""):
        page_num = 0
        url_obj = parse.urlparse(url)
        if hasattr(url_obj, "path"):
            url_list = url_obj.path.split("/")
            for one in url_list:
                if 0 == one.find("pn"):
                    page_num = CommonClass.find_digits_from_str(
                        string=one, return_all=False)
        return int(page_num)

    def get_total_pages(self, response=None):
        total_pages = 0
        if response is None:
            return total_pages

        page_list = response.xpath(
            "//div[@class='content-side-left']/div[@class='pager']/a/@href"
        ).extract()
        for one in page_list:
            this_url_page_num = self.get_page_from_url(url=one)
            if total_pages < this_url_page_num:
                total_pages = this_url_page_num

        if 1 > total_pages:
            error_msg = f"fail to extract last page number ({page_list}) from {response.url} or this url has ONLY one page"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
        return total_pages

    def get_city_from_url(self, url=""):
        city = ""
        result_obj = parse.urlparse(url)
        if -1 < result_obj.netloc.find("58.com"):
            temp2_list = result_obj.netloc.split(".")
            if 3 == len(temp2_list):
                city = temp2_list[0]
        return city

    def get_page_area_district_from_url(self, url_object=None):
        """
			https://fs.58.com/shangpucz/
			https://gz.58.com/shangpu/
			https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
			https://fs.58.com/foshan/shangpucz/pn2/			# "foshan" here means the Foshan outskirts area, at the same administrative level as Chancheng, Gaoming, Sanshui, etc.
			https://gz.58.com/shangpucz/pn3/
			https://fs.58.com/shangpu/38143746902823x.shtml
		"""
        page = "1"
        district = ""
        shop_area = ""
        detailed_page = False
        if url_object is not None and hasattr(
                url_object,
                "netloc") and -1 < url_object.netloc.find("58.com"):
            # parse query
            has_shop_area = True
            if not hasattr(url_object, "query") or 1 > len(url_object.query):
                has_shop_area = False
            if has_shop_area:
                query_dict = parse.parse_qs(url_object.query)
                if "area" in query_dict.keys() and isinstance(
                        query_dict["area"],
                        list) and 0 < len(query_dict["area"]):
                    shop_area = query_dict["area"][0]

            # parse path
            if hasattr(url_object, "path"):
                url_list = url_object.path.split("/")
                temp_list = []
                for one in url_list:
                    # check for a detailed page (.shtml) first so that the shop id
                    # segment is not mistaken for a district name
                    if -1 < one.find(".shtml"):
                        detailed_page = True
                    elif -1 < one.find("pn"):
                        page = CommonClass.find_digits_from_str(
                            string=one, return_all=False)
                    elif 0 < len(one) and -1 == one.find("shangpu"):
                        temp_list.append(one)
                if not detailed_page and 1 == len(temp_list):
                    district = temp_list[0]
        if detailed_page:
            page = "0"
        return (page, district, shop_area)

    def make_html_file_name(self, url="", city=""):
        """
			https://fs.58.com/shangpucz/
			https://gz.58.com/shangpu/
			https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
			https://fs.58.com/foshan/shangpucz/pn2/
			https://gz.58.com/shangpucz/pn3/
			https://fs.58.com/shangpu/38143746902823x.shtml
		"""
        now = datetime.datetime.now()
        html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S"))
        today = now.strftime("%Y%m%d")

        result_obj = parse.urlparse(url)
        url_list = result_obj.path.split("/")
        while "" in url_list:
            url_list.remove("")

        detail_page = False
        last_part = url_list[len(url_list) - 1] if 0 < len(url_list) else ""
        if -1 < last_part.find(".shtml"):
            detail_page = True
            # https://fs.58.com/shangpu/38143746902823x.shtml
            shop_id = last_part.replace(".shtml", "")
            html_filename = f"{city}___{shop_id}___{today}.html"
        elif -1 < result_obj.netloc.find("58.com") and 1 == len(
                url_list) and url_list[0] in [
                    "shangpucz",
                    "shangpu",
                ]:
            # list page #1: https://fs.58.com/shangpucz/
            html_filename = f"{city}___all___all___index1___{today}.html"
        else:
            page, district, shop_area = self.get_page_area_district_from_url(
                url_object=result_obj)
            if -1 < shop_area.find("500_"):
                shop_area = "over500"

            if 0 < len(district) and 0 < len(shop_area):
                html_filename = f"{city}___{district}___{shop_area}___index{page}___{today}.html"
            elif 0 < len(district):
                html_filename = f"{city}___{district}___all___index{page}___{today}.html"
            elif 0 < len(shop_area):
                html_filename = f"{city}___all___{shop_area}___index{page}___{today}.html"
            else:
                html_filename = f"{city}___all___all___index{page}___{today}.html"
        return (detail_page, html_filename)
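
    # Worked example (illustrative sketch, the date is a placeholder): crawling
    # https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50 for city "gz" on 20190620
    # yields the list-page file name "gz___tianhe___20_50___index3___20190620.html",
    # while a detailed page such as https://fs.58.com/shangpu/38143746902823x.shtml
    # yields "fs___38143746902823x___20190620.html".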

    def get_shop_id(self, url=""):
        shop_id = f"random{random.randint( 10000, 99999 )}"
        url_obj = parse.urlparse(url)
        if hasattr(url_obj, "path"):
            url_list = url_obj.path.split("/")
            for one in url_list:
                if -1 < one.find(".shtml"):
                    shop_id = one.replace(".shtml", "")
        return shop_id

    def save_html(self, response=None, save70=False):
        """
			returns -1: wrong response object
			-2: fail to write response.body
			1001: this is a detailed page
			101: a list page with more than 69 pages (returned only when save70 is False)
			1 to 70: total page count extracted from a list page
			0: fail to extract the total page count from a list page
		"""
        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return -1
        url = response.url
        meta_dict = response.meta
        page_type = "index"
        total_pages = 0
        city = self.get_city_from_url(url=url)

        if "page_type" in meta_dict.keys():
            page_type = meta_dict["page_type"]

        if "index" == page_type:
            if "total_pages" in meta_dict.keys():
                total_pages = int(meta_dict["total_pages"])

            if 0 == total_pages:
                total_pages = self.get_total_pages(response=response)

            if 69 < total_pages and not save70:
                return 101
            detail_page, html_filename = self.make_html_file_name(url=url,
                                                                  city=city)
            html_file_path = os.path.join(self.saved_html_dir, html_filename)

        elif "detailed" == page_type:
            total_pages = 1001
            today = datetime.datetime.now().strftime("%Y%m%d")
            shop_id = self.get_shop_id(url=url)
            html_filename = f"{city}___{shop_id}___{today}.html"
            html_file_path = os.path.join(self.saved_html_dir, html_filename)

        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return -2
        else:
            if 1 > total_pages:
                error_msg = f"response.body saved after requesting {response.url}; but fail to extract total page number from response.body"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return total_pages  # could be 70 when save70 == True

    def divide_request_into_next_level(self, response=None):
        """
		# returns:
			(-1, [], -1): wrong response object
			(-2, [], 2): already using shop area as level 2, currently we only have levels up to 2
			(-3, [], -1): this response.url is already page #2 or more
			(-4, [], -1): this page is a detailed page
			(-11, [], index_level): fail to extract links from response.body
			(-12, [], index_level): same as (-2, [], 2)
			(-13, [], index_level): wrong parameter (index_level)
			(pointer, district_list, index_level): 0 == pointer; district_list is a []; index_level is int and in [0, 1, 2]
			0 == index_level: this url is for whole city like guangzhou, foshan, or shenzhen
			1 == index_level: this url is for one district like tianhe, baiyun, panyu, or others in guangzhou
			2 == index_level: this url is for one shop_area size listed in self.shop_area_uri_list
			https://fs.58.com/shangpucz/
			https://gz.58.com/shangpu/
			https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
			https://fs.58.com/foshan/shangpucz/pn2/
			https://gz.58.com/shangpucz/pn3/
		"""
        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            error_msg = f"meta = {hasattr( response, 'meta' )}; body = {hasattr( response, 'body' )}; url = {hasattr( response, 'url' )}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return (-1, [], -1)
        url = response.url
        url_obj = parse.urlparse(url)
        page, district, shop_area = self.get_page_area_district_from_url(
            url_object=url_obj)

        if 0 < len(shop_area):
            page_status = self.save_html(response=response, save70=True)
            self.write_log(content=f"{response.url}",
                           logfilename=self.over70_filename,
                           content_only=True)
            return (-2, [], 2)
        if 1 < int(page):
            page_status = self.save_html(response=response, save70=True)
            return (-3, [], -1)
        elif 0 == int(page):
            return (-4, [], -1)

        index_level = self.get_index_level(response=response,
                                           district=district)

        pointer, link_list = self.extract_this_level_options(
            response=response, index_level=index_level, district=district)
        if pointer in [
                -11,
                -12,
        ]:
            page_status = self.save_html(response=response, save70=True)
            self.write_log(content=f"{response.url}",
                           logfilename=self.over70_filename,
                           content_only=True)
        return (pointer, link_list, index_level)

    def get_index_level(self, response=None, district=""):
        meta_dict = response.meta
        index_level = 0
        if "index_level" in meta_dict.keys():
            index_level = int(meta_dict["index_level"])

        url_obj = parse.urlparse(response.url)
        query_str = url_obj.query if hasattr(url_obj, "query") else ""
        query_dict = parse.parse_qs(query_str) if 0 < len(query_str) else {}

        # district and area have higher priority than index_level
        if 0 < len(query_dict) and "area" in query_dict.keys():
            if 2 != index_level:
                index_level = 2
        elif 0 < len(district) and 1 != index_level:
            error_msg = f"index_level {index_level} != ({district})"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            index_level = 1
        elif 0 == len(district) and 0 != index_level:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {index_level} is not 0"
            )
            index_level = 0

        return index_level

    def make_new_url(self, parent_level_url="", index_level=0, fragment=""):
        """
			make one child url according to parent url
			https://fs.58.com/shangpucz/
			https://gz.58.com/shangpu/
			https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
			https://fs.58.com/foshan/shangpucz/pn2/			# "foshan" here means the Foshan outskirts area, at the same administrative level as Chancheng, Gaoming, Sanshui, etc.
			https://gz.58.com/shangpucz/pn3/
		"""
        parent_url_obj = parse.urlparse(parent_level_url)
        child_url = f"{parent_url_obj.scheme}://{parent_url_obj.netloc}"
        child_url = child_url.rstrip("/")
        if 1 == index_level:  # it is parent's index_level
            return f"{child_url}/{parent_url_obj.path.lstrip('/')}?area={fragment}"
        elif 0 == index_level:
            return f"{child_url}/{fragment.strip('/')}/shangpucz/"
        else:
            return ""

    def extract_district_from_url_paths(self, district_list=[]):
        """
			/shangpucz/
			/tianhe/shangpucz/
			/haizhu/shangpucz/
		"""
        return_list = []
        for one_link in district_list:
            url_list = one_link.split("/")
            good_url_list = []
            for good_url in url_list:
                if 0 < len(good_url) and -1 == good_url.find(
                        "shangpucz") and -1 == good_url.find("shangpu"):
                    # and -1 == good_url.find("pn") and -1 == one.find(".shtml"):
                    good_url_list.append(good_url)
            if 1 == len(good_url_list):
                return_list.append(good_url_list[0])
        return return_list

    def extract_this_level_options(self,
                                   response=None,
                                   index_level=0,
                                   district=""):
        """
		# returns:
			( 0, [a list has one element or more] )
			( -11, [] ): fail to extract links from response.body
			( -12, [] ): already 2 == index_level
			( -13, [] ): wrong parameter ( index_level )
		"""
        district_list = []
        if 0 == index_level:
            district_dl = response.xpath(
                '//div[@class="filter-wrap"]/dl[@class="secitem"]')
            for one_district in district_dl:
                dl_dtitle = one_district.xpath("./dt/text()").extract_first(
                    default="")
                if 0 == dl_dtitle.find("区域:"):
                    # 区域 == district; only keep the links of this <dl> block
                    district_list = one_district.xpath(
                        "./dd/a/@href").extract()
                    break
            if 0 < len(district_list):
                district_list = self.extract_district_from_url_paths(
                    district_list=district_list)
            if 0 < len(district_list):
                return (0, district_list)
            error_msg = f"fail to extract links from response.body after requesting {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return (-11, [])
        elif 1 == index_level:
            return (0, self.shop_area_uri_list)
        elif 2 == index_level:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, 2 == index_level; we will NOT divide further"
            )
            return (-12, [])
        else:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong index_level ({index_level})"
            )
            return (-13, [])

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value("content", str(text))  # , encoding="utf-8"
        loader.add_value("page_type", "detailed")

        # record housekeeping fields
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get('BOT_NAME'))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date",
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
        return loader

    def extract_shop_id_from_href(self, shop_id="", use_logr=False):
        """
		# href:
			https://jxjump.58.com/service?target=FCADV8oV3os7xtAj_6pMK7rUlr7DdRMx8H_54olt8EXOWkK_Zpk1zEffDjhGKDukKaSGKEtf3gzeNV-\
			jc68R330iX4JeiOAQ9mxZIx_7k2_EqtKoFph2NZi5EUpVl9S607kui9wZ5vFL9FjgOWSrSlIBohzi3WsLQSp_Rr-QuiAazy31jEubeh76kg5T_\
			uVyVN1UCEVsUjMvAnmEU0sZOrGXZEsuraI5DWpE1qXSASL8rH4cOWSrSlIBoh7ifevBw4N33&pubid=75911118&apptype=0&psid=161954686204511925276830641&\
			entinfo=38284513452802_0&cookie=%7C%7C%7C&amp;fzbref=0&amp;key&params=busitime^desc

		# logr:
			z_2_33120284640267_38420286183181_1_2_sortid:613502485@postdate:1560301497000
			gz_2_55687810204183_36683482092955_sortid:599933703@postdate:1560268805000@ses:busitime^desc@pubid:76309565
		"""
        shop_id_str = ""
        if use_logr:
            seen_sortid = False
            if isinstance(shop_id, str):
                temp_list = shop_id.split("_")
                if 0 < len(temp_list):
                    temp_list.reverse()
                    for one in temp_list:
                        if 0 == one.find("sortid"):
                            seen_sortid = True
                        if seen_sortid and 14 == len(one):
                            return one
        else:
            url_obj = None
            if isinstance(shop_id, str):
                url_obj = parse.urlparse(shop_id)
            if hasattr(url_obj, "query"):
                query_dict = parse.parse_qs(url_obj.query)
                if isinstance(query_dict,
                              dict) and "entinfo" in query_dict.keys():
                    shop_id_str = query_dict["entinfo"]
                    if isinstance(shop_id_str, list) and 0 < len(shop_id_str):
                        shop_id_str = shop_id_str[0]
                    if isinstance(shop_id_str,
                                  str) and -1 < shop_id_str.find("_"):
                        temp_list = shop_id_str.split("_")
                        shop_id_str = temp_list[0]
        return shop_id_str
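
    # Worked example (illustrative, using the sample values from the docstring above):
    # with use_logr=False the "entinfo" query parameter "38284513452802_0" yields
    # shop_id "38284513452802"; with use_logr=True the logr string
    # "gz_2_55687810204183_36683482092955_sortid:..." is split on "_" and walked in
    # reverse until a 14-character segment after "sortid" is found, returning
    # "36683482092955".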

    def parse_list_response_field(self, response=None, city=""):
        text_list = []
        if response is None:
            return text_list
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return text_list

        shops = response.xpath(
            '//div[@class="content-wrap"]/div[@class="content-side-left"]/ul[@class="house-list-wrap"]/li[@logr]'
        )
        for one_shop in shops:
            try:
                shop_id = one_shop.xpath(
                    "./div[@class='list-info']/h2[@class='title']/a/@href"
                ).extract_first(default='')
                shop_id = self.extract_shop_id_from_href(shop_id=shop_id,
                                                         use_logr=False)
                if 1 > len(shop_id):
                    shop_id = one_shop.xpath("./@logr").extract_first(
                        default='')
                    shop_id = self.extract_shop_id_from_href(shop_id=shop_id,
                                                             use_logr=True)
                title = one_shop.css(
                    'div.list-info h2.title a span.title_des::text'
                ).extract_first(default='')
                baseinfo_list = one_shop.css('div.list-info p.baseinfo')
                description = ""
                baseinfo_items = []
                address = ""
                for index, onelist in enumerate(baseinfo_list):
                    temp = onelist.css("span::text").extract()
                    if 0 < len(temp):
                        baseinfo_items += temp
                        if index + 1 == len(baseinfo_list):
                            address = temp[len(temp) - 1]
                if 0 < len(baseinfo_items):
                    description = "___descr___".join(baseinfo_items)

                tags = ""
                tag_list = one_shop.xpath(
                    "./div[@class='list-info']/p[@class='tag-wrap']/span/text()"
                ).extract()
                if 0 < len(tag_list):
                    tags = "___tags___".join(tag_list)
                price_box = one_shop.css('div.price')
                price_sum = price_box.css('p.sum b::text').extract_first(
                    default='')
                price_sum_unit = price_box.css(
                    'p.sum span::text').extract_first(default='')
                unitprice = price_box.css('p.unit span::text').extract_first(
                    default='')
                unitprice_unit_list = price_box.css('p.unit::text').extract()
                unitprice_unit = unitprice_unit_list[
                    len(unitprice_unit_list) -
                    1].strip() if 0 < len(unitprice_unit_list) else ""
                text = {
                    "shop_id": shop_id,
                    "city": city,
                    "title": title.strip(),
                    "description": description.strip(),
                    "address": address,
                    "tags": tags,
                    "price_sum": price_sum.strip(),
                    "price_sum_unit": price_sum_unit.strip(),
                    "unitprice": unitprice.strip(),
                    "unitprice_unit": unitprice_unit.strip(),
                }
                text_list.append(text)
            except Exception as ex:
                error_msg = f"Error happened during parsing. Exception = {ex}; one_shop = {one_shop}"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
                continue
        return text_list

    def parse_detailed_page(self, response=None):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, todo..."
        )
        pass

    def do_nothing_for_debug(self, response=None):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}"
        )
        print(response.body)
        # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}]
        # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}'

    def url_contains_error(self, url_obj=""):
        """
			we do not yet know of any anti-crawl measures used by 58.com
		"""
        if hasattr(url_obj, "path"):
            pass

        return False

    def make_next_pages_url_from_page_one(self,
                                          url="",
                                          index_level_int=0,
                                          page_number_int=0):
        urls = []
        if 2 > page_number_int or 0 > index_level_int or 2 < index_level_int:
            return urls

        page = self.get_page_from_url(url=url)
        if 1 < page:
            return urls  # we ONLY do this at Page 1
        elif 1 == page:
            # url contains /pnxxx/ part; then remove it
            new_url = self.remove_url_page_part(url=url)
        else:
            new_url = url.rstrip("/")

        # 0 == index_level_int: https://gz.58.com/shangpucz/pn3/
        # 1 == index_level_int: https://fs.58.com/foshan/shangpucz/pn2/
        # 2 == index_level_int: https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
        # build pages 2..n from new_url (the page-1 base with any /pnxxx/ part removed)
        for i in range(page_number_int - 1):
            urls.append(self.add_url_page_part(old_url=new_url, page=(i + 2)))
        return urls

    def parse_list_page(self, response=None):
        page_status = self.save_html(response=response, save70=False)

        url_obj = parse.urlparse(response.url)
        url_has_error = self.url_contains_error(url_obj=url_obj)
        load_this_page_items = False

        if 1 > page_status:
            pass
            # -2, -1, 0: error_msg has been logged; just pass
        elif 0 < page_status and 101 > page_status and not url_has_error:
            # 1 to 70 also means "index" == page_type
            load_this_page_items = True
            if 1 < page_status:  # ONLY responses whose total page count is more than 1 go further
                page, district, shop_area = self.get_page_area_district_from_url(
                    url_object=url_obj)
                if 1 == int(page):
                    # ONLY do this on Page #1
                    index_level = self.get_index_level(response=response,
                                                       district=district)
                    urls = self.make_next_pages_url_from_page_one(
                        url=response.url,
                        index_level_int=index_level,
                        page_number_int=page_status)
                    meta_dict = {
                        "page_type": "index",
                        "total_pages": page_status,
                        "index_level": index_level,
                    }
                    for one_url in urls:
                        yield scrapy.Request(url=one_url,
                                             callback=self.parse_list_page,
                                             meta=meta_dict,
                                             dont_filter=True)
        elif 101 == page_status and not url_has_error:
            # 101 also means "index" == page_type
            pointer, link_list, index_level = self.divide_request_into_next_level(
                response=response)
            if pointer in [
                    -2,
                    -3,
                    -11,
                    -12,
            ]:
                load_this_page_items = True
            elif -1 < pointer:
                # going to request all children level list page
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": index_level + 1,
                }
                if self.use_proxy:
                    proxies_dict = self.proxy_ip_pool()
                    meta_dict["proxy"] = proxies_dict["http"]
                for i in range(len(link_list) - pointer):
                    new_url = self.make_new_url(parent_level_url=response.url,
                                                index_level=index_level,
                                                fragment=link_list[i +
                                                                   pointer])
                    if 0 < len(new_url):
                        self.logger.info(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, requesting {new_url}; meta_dict = {meta_dict}"
                        )
                        yield scrapy.Request(url=new_url,
                                             callback=self.parse_list_page,
                                             meta=meta_dict,
                                             dont_filter=True)
        elif 1001 == page_status and not url_has_error:
            self.parse_detailed_page(response=response)
            # 1001 also means "detailed" == page_type
            # normally this branch is not reached because detailed pages use self.parse_detailed_page() as their Request callback

        if load_this_page_items:
            url = response.url
            city = self.get_city_from_url(url=url)
            text_list = self.parse_list_response_field(response=response,
                                                       city=city)
            try:
                for text in text_list:
                    loader = ItemLoader(item=Shop58Item(), response=response)
                    loader = self.load_items_into_loader(loader=loader,
                                                         text=text,
                                                         url=url)
                    yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
                )

    def read_and_parse(self, response=None):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}. under developing..."
        )
        pass
        # file_list = os.listdir( self.saved_html_dir )
        # for one_file in file_list:
        # 	if -1 == one_file.find("index"):
        # 		temp_list = one_file.split("___")
        # 		apt_id = 0
        # 		city = ""
        # 		if 1 < len( temp_list ):
        # 			apt_id = temp_list[1]
        # 			city = temp_list[0]
        # 		url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm"
        # 		html_file_path = os.path.join( self.saved_html_dir, one_file )
        # 		if os.path.isfile(html_file_path):
        # 			doc = None
        # 			with open( html_file_path,'rb') as f:
        # 				# doc = f.read().decode('gb2312', 'ignore')
        # 				doc = f.read().decode('utf-8', 'ignore')
        # 			if doc is None:
        # 				self.logger.error( f"Error: cannot read html file {html_file_path}.")
        # 				continue
        # 			response = Selector( text=doc, type="html" )
        # 			text_list = self.parse_list_response_field( response = response, city = city, apt_id = apt_id )
        # 			try:
        # 				for text in text_list:
        # 					loader = ItemLoader( item = Shop58Item(), response = response )
        # 					loader = self.load_items_into_loader( loader = loader, text = text, url = url )
        # 					yield loader.load_item()
        # 			except Exception as ex:
        # 				self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}" )
        # 			if self.debug:
        # 				break

    def write_log(self, content=None, logfilename=None, content_only=False):
        if content is not None and 0 < len(content):
            today = datetime.datetime.now().strftime("%Y%m%d")
            if logfilename is None:
                logfilename = f"{self.name}{today}.log"
            try:
                with open(os.path.join(self.log_dir, logfilename),
                          'a',
                          encoding='utf-8') as f:
                    if content_only:
                        info = f"{str(content)}\n"
                    else:
                        info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
                    f.write(info)
                return 1
            except Exception as ex:
                return 0
        return -1
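
The pagination logic above relies on two URL helpers, remove_url_page_part and add_url_page_part, that are not part of this excerpt. The following is a minimal standalone sketch of that 58.com-style /pnN/ handling; the implementations are assumptions for illustration, not the spider's actual code.

import re
from urllib import parse


def remove_url_page_part(url=""):
    # drop a trailing /pnN/ segment from the path, keeping any query string
    url_obj = parse.urlparse(url)
    path = re.sub(r"/pn\d+/?$", "", url_obj.path).rstrip("/")
    return parse.urlunparse(url_obj._replace(path=path))


def add_url_page_part(old_url="", page=2):
    # append /pnN/ to the path, keeping any query string such as ?area=20_50
    url_obj = parse.urlparse(old_url)
    return parse.urlunparse(
        url_obj._replace(path=f"{url_obj.path.rstrip('/')}/pn{page}/"))


if __name__ == "__main__":
    base = remove_url_page_part(
        "https://gz.58.com/tianhe/shangpucz/pn1/?area=20_50")
    # build pages 2..4 from page 1, mirroring make_next_pages_url_from_page_one()
    print([add_url_page_part(old_url=base, page=i + 2) for i in range(3)])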
Esempio n. 6
0
class DirectionamapSpider(scrapy.Spider):
    """
		sys.exit code == 2 # missing CITY_LIST or missing input file(s)
		sys.exit code == 3 # classification file format error
		sys.exit code == 4 # already requested all xy points in city_list today
	"""
    name = "directionbaidu"

    root_path = ""
    log_dir = ""
    # debug = False
    # save_every_response = False
    crawled_dir = ""
    json_dir = ""
    output_folder_name = ""
    # output_file_format = "json"
    # base_uri = ""
    run_purpose = None
    overwrite_today = ""
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    # crontab starts a new process every 2 hours, so it starts 12 times per day
    maximal_requests_of_one_crontab_process = 23
    interval_between_requests = 300
    request_counter = 0
    last_4_requests = {}

    urls = []

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        # self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False )
        # self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False )
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.json_dir = self.settings.get(name="SAVED_JSON", default="")
        self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME",
                                                    default="")
        self.base_uri = self.settings.get(name="BASE_URI", default="")
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        self.overwrite_today = self.settings.get(name="OVERWRITE_TODAY",
                                                 default="")

        self.maximal_requests_of_one_crontab_process = self.settings.get(
            name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=23)
        self.interval_between_requests = self.settings.get(
            name="INTERVAL_BETWEEN_REQUESTS", default=300)

        xy_points = {
            "country_garden": "22.9299453776,113.2749357238",
            "baiyun_airport_departure":
            "23.3932641265,113.3085855889",  # T1航站楼国内出发
            "baiyun_airport_arrival":
            "23.3937931265,113.3068755889",  # T1航站楼国内到达
            "baoan_airport_departure":
            "22.6303448273,113.8207143453",  # T3航站楼国内出发
            "baoan_airport_arrival":
            "22.6296848273,113.8192343453",  # T3航站楼国内到达
        }
        query_dict = {
            "origin": xy_points["country_garden"],
            "destination": xy_points["baiyun_airport_departure"],
            "coord_type": "bd09ll",
            "ret_coordtype": "bd09ll",
            "tactics": 7,
            "alternatives": 0,
            "output": "json",
            "ak": "iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL",
        }

        # 0 == Country Garden HQ to Baiyun Airport; 1 == Baiyun Airport to Country Garden HQ; 2 == HQ to Bao'an Airport; 3 == Bao'an Airport to HQ
        query_list = []
        query_list.append(query_dict)

        temp_dict = copy.deepcopy(query_dict)
        temp_dict["origin"] = xy_points["baiyun_airport_arrival"]
        temp_dict["destination"] = xy_points["country_garden"]
        query_list.append(temp_dict)

        temp_dict = copy.deepcopy(query_dict)
        temp_dict["origin"] = xy_points["country_garden"]
        temp_dict["destination"] = xy_points["baoan_airport_departure"]
        query_list.append(temp_dict)

        temp_dict = copy.deepcopy(query_dict)
        temp_dict["origin"] = xy_points["baoan_airport_arrival"]
        temp_dict["destination"] = xy_points["country_garden"]
        query_list.append(temp_dict)

        for one_query_dict in query_list:
            self.urls.append(
                f"{self.base_uri}?{parse.urlencode(one_query_dict)}")

        if 4 != len(self.urls):
            self.logger.error(f"self.urls length shall be 4 ({self.urls})")

    def check_dirs_and_files(self):
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.json_dir):
            os.makedirs(self.json_dir)

    def start_requests(self):
        self.init_self_attributes()
        self.check_dirs_and_files()

        if "READ_JSON_AND_WRITE_CSV" == self.run_purpose:
            one_url = "https://blog.csdn.net/qq_37193537/article/details/78987949"
            callback_func = self.read_json_and_parse
            yield scrapy.Request(url=one_url,
                                 callback=callback_func,
                                 dont_filter=True)
        else:
            timestamp_float = time.time()
            self.last_4_requests = {
                "request_time": timestamp_float,
                "requested_index": [
                    0,
                    1,
                    2,
                    3,
                ]
            }
            callback_func = self.parse_json
            for index, one_url in enumerate(self.urls):
                meta_dict = {
                    "preset_route":
                    index,  # 0 == Country Garden HQ to Baiyun Airport; 1 == Baiyun Airport to HQ; 2 == HQ to Bao'an Airport; 3 == Bao'an Airport to HQ
                    "redo": 0,
                }
                self.logger.info(f"{index}: requesting {one_url} ")
                yield scrapy.Request(url=one_url,
                                     callback=callback_func,
                                     meta=meta_dict,
                                     dont_filter=True)

    def get_url_according_to_preset_route(self, preset_route=101):
        # A serious bug was not fixed until 20190619_2220, so every request before that time was actually Bao'an Airport to HQ (i.e. preset_route == 3)
        baoan2headquarter = "http://api.map.baidu.com/direction/v2/driving?origin=22.6296848273%2C113.8192343453&destination=22.9299453776%2C113.2749357238&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL"
        if 0 < len(self.overwrite_today
                   ) and "READ_JSON_AND_WRITE_CSV" == self.run_purpose:
            time_array = time.strptime(self.overwrite_today, "%Y%m%d")
            timestamp_overwrite_today = float(time.mktime(time_array))
            time_array = time.strptime("20190619_222000", "%Y%m%d_%H%M%S")
            timestamp_bug_fixed = float(time.mktime(time_array))
            if timestamp_overwrite_today < timestamp_bug_fixed:
                return baoan2headquarter
        if 3 == preset_route:
            return baoan2headquarter
        elif 2 == preset_route:
            return "http://api.map.baidu.com/direction/v2/driving?origin=22.9299453776%2C113.2749357238&destination=22.6303448273%2C113.8207143453&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL"
        elif 1 == preset_route:
            return "http://api.map.baidu.com/direction/v2/driving?origin=23.3937931265%2C113.3068755889&destination=22.9299453776%2C113.2749357238&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL"
        elif 0 == preset_route:
            return "http://api.map.baidu.com/direction/v2/driving?origin=22.9299453776%2C113.2749357238&destination=23.3932641265%2C113.3085855889&coord_type=bd09ll&ret_coordtype=bd09ll&tactics=7&alternatives=0&output=json&ak=iL3ZmAje32Q6WrXgaWcBSZP0RZG1hekL"
        return ""

    def read_json_and_parse(self, response):
        file_list = os.listdir(self.json_dir)
        # route0___0___20190615_234522.json
        for one_file in file_list:
            temp_list = one_file.split("___")
            preset_route = 0
            now = ""
            if 2 < len(temp_list):
                preset_route = temp_list[0]
                preset_route = preset_route.lstrip("route")
                preset_route = CommonClass.find_digits_from_str(
                    string=preset_route, return_all=False)
                preset_route = int(preset_route)
                now = temp_list[2]
                now = now.rstrip(".json")

                url = self.get_url_according_to_preset_route(
                    preset_route=preset_route)
                json_file_path = os.path.join(self.json_dir, one_file)
                if os.path.isfile(json_file_path):
                    try:
                        doc = None
                        with open(json_file_path, "rb") as f:
                            doc = f.read().decode("utf-8", "ignore")
                        if doc is None:
                            self.logger.error(
                                f"Error: cannot read html file {json_file_path}."
                            )
                            continue
                        text_dict = self.extract_text_dict_from_response_body(
                            body=doc, preset_route=preset_route, now=now)
                        if 0 < len(text_dict):
                            json_selector = Selector(text=doc, type=None)
                            loader = ItemLoader(item=DirectionbaiduItem(),
                                                selector=json_selector)
                            loader = self.load_items_into_loader(
                                loader=loader,
                                text=text_dict,
                                url=url,
                                now=now)
                            yield loader.load_item()
                    except Exception as ex:
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                        )

    def extract_text_dict_from_response_body(self,
                                             body="",
                                             preset_route=101,
                                             now=""):
        text_dict = {}
        json_dict = json.loads(body)
        result_dict = json_dict["result"] if "result" in json_dict.keys(
        ) else {}
        total = int(
            result_dict["total"]) if "total" in result_dict.keys() else 0
        routes_list = result_dict["routes"] if "routes" in result_dict.keys(
        ) else []
        selected_route_dict = {}
        if 1 < len(routes_list):
            for one_route_dict in routes_list:
                tag = one_route_dict["tag"] if "tag" in one_route_dict.keys(
                ) else ""
                if -1 < tag.find("推荐路线"):  # "推荐路线" means "recommended route"
                    selected_route_dict = one_route_dict
                    break
        elif 1 == len(routes_list):
            selected_route_dict = routes_list[0]

        # if no route is tagged as recommended ("推荐路线"), just select the first route_dict
        if 1 < len(routes_list) and 1 > len(selected_route_dict):
            selected_route_dict = routes_list[0]

        if 0 < len(selected_route_dict):
            tag = selected_route_dict[
                "tag"] if "tag" in selected_route_dict.keys() else ""
            distance = selected_route_dict[
                "distance"] if "distance" in selected_route_dict.keys() else 0
            duration = selected_route_dict[
                "duration"] if "duration" in selected_route_dict.keys() else 0
            selected_path_steps = selected_route_dict[
                "steps"] if "steps" in selected_route_dict.keys() else []

            text_dict = {
                "preset_route": preset_route,
                "strategy": tag,
                "duration": duration,
                "distance": distance,
                "count": total,
                "paths": len(routes_list),
                "now": now,
                "selected_path_steps": selected_path_steps,
            }
        return text_dict

    def parse_json(self, response):
        status, message = self.save_json(response=response, page_type="json")
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        preset_route = -1
        if 0 == status:
            try:
                meta_dict = response.meta
                preset_route = int(meta_dict["preset_route"])
                text_dict = self.extract_text_dict_from_response_body(
                    body=response.body, preset_route=preset_route, now=now)
                if 0 < len(text_dict):
                    loader = ItemLoader(item=DirectionbaiduItem(),
                                        response=response)
                    loader = self.load_items_into_loader(loader=loader,
                                                         text=text_dict,
                                                         url=response.url,
                                                         now=now)
                    yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                )
        if -1 == preset_route:
            if hasattr(response, "meta"):
                meta_dict = response.meta
                if "preset_route" in meta_dict.keys():
                    preset_route = int(meta_dict["preset_route"])
        if -1 < preset_route:
            received_all_4_requests_bool = self.check_this_preset_route(
                preset_route=preset_route)

            if not received_all_4_requests_bool and "redo" in response.meta.keys(
            ):
                delayed_index_list = self.get_delayed_response_more_than_1_minute(
                )
                if 0 < len(delayed_index_list):
                    # redo_requests() is a generator; yield its Requests so Scrapy actually schedules them
                    yield from self.redo_requests(redo=response.meta["redo"])

            # get data again after 5 minutes
            if self.request_counter < self.maximal_requests_of_one_crontab_process and received_all_4_requests_bool:
                while (self.check_time_interval()):
                    time.sleep(10)

                self.request_counter += 1
                now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                self.logger.info(
                    f" requesting amap at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )"
                )
                self.last_4_requests = {
                    "request_time": time.time(),
                    "requested_index": [
                        0,
                        1,
                        2,
                        3,
                    ]
                }
                callback_func = self.parse_json
                for index, one_url in enumerate(self.urls):
                    meta_dict = {
                        "preset_route": index,
                        "redo": 0,
                    }
                    self.logger.info(f"{index}: requesting {one_url} ")
                    yield scrapy.Request(url=one_url,
                                         callback=callback_func,
                                         meta=meta_dict,
                                         dont_filter=True)

    def check_time_interval(self):
        if "request_time" not in self.last_4_requests.keys() or not isinstance(
                self.last_4_requests["request_time"], float):
            return False
        if time.time() - self.last_4_requests["request_time"] > float(
                self.interval_between_requests):
            return False
        return True

    def redo_requests(self, redo=-1):
        urls = []
        index_list = []
        if 1 > len(self.last_4_requests["requested_index"]) or 0 > redo:
            return False
        for one_index in self.last_4_requests["requested_index"]:
            urls.append(self.urls[one_index])
            index_list.append(one_index)
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        for index, one_url in enumerate(urls):
            meta_dict = {
                "preset_route": index_list[index],
                "redo": redo + 1,
            }
            self.logger.info(
                f"[{now}] redo {index_list[index]}: requesting {one_url} ")
            yield scrapy.Request(url=one_url,
                                 callback=self.parse_json,
                                 meta=meta_dict,
                                 dont_filter=True)

    def get_delayed_response_more_than_1_minute(self):
        if "requested_index" not in self.last_4_requests.keys(
        ) or not isinstance(self.last_4_requests["requested_index"], list):
            return []
        if "request_time" not in self.last_4_requests.keys() or not isinstance(
                self.last_4_requests["request_time"], float):
            return []
        if time.time() - self.last_4_requests["request_time"] > 60.0:
            return self.last_4_requests["requested_index"]
        return []

    def check_this_preset_route(self, preset_route=-1):
        if preset_route not in [
                0,
                1,
                2,
                3,
        ]:
            return True
        if "request_time" not in self.last_4_requests.keys() or not isinstance(
                self.last_4_requests["request_time"], float):
            return True
        if "requested_index" not in self.last_4_requests.keys(
        ) or not isinstance(self.last_4_requests["requested_index"], list):
            return True

        # 4 minutes have passed, just return True
        if time.time() - self.last_4_requests["request_time"] > 240.0:
            return True

        # remove current preset_route
        if preset_route in self.last_4_requests["requested_index"]:
            self.last_4_requests["requested_index"].remove(preset_route)
        if 1 > len(self.last_4_requests["requested_index"]):
            return True

        # at least one element remains in self.last_4_requests["requested_index"]
        return False

    def load_items_into_loader(self, loader=None, text={}, url="", now=""):
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date", now)

        loader.add_value("content", str(text))
        loader.add_value("page_type", "json")

        return loader

    def save_json(self, response=None, page_type="json"):
        status = -4
        if response is None or not hasattr(response, "body") or not hasattr(
                response, "url") or not hasattr(response, "meta"):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object"
            )
            return (-1, f"wrong response object")
        meta_dict = response.meta
        preset_route = meta_dict[
            "preset_route"] if "preset_route" in meta_dict.keys() else ""
        file_path = ""
        if "json" == page_type:
            json_dict = json.loads(response.body)
            status = json_dict["status"] if "status" in json_dict.keys(
            ) else "404"
            result_dict = json_dict["result"] if "result" in json_dict.keys(
            ) else {}
            routes_list = result_dict[
                "routes"] if "routes" in result_dict.keys() else []
            if isinstance(routes_list, list) and 0 < len(routes_list):
                now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                file_path = os.path.join(
                    self.json_dir,
                    f"route{preset_route}___{status}___{now}.json")
                status = int(status)
        else:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter page_type == {page_type} from {response.url}"
            )
            return (-2, f"page_type can ONLY be json")

        return_msg = "0 count"
        if 0 < len(file_path):
            try:
                with open(file_path, 'wb') as f:
                    f.write(response.body)
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}"
                )
                return (status, f"failed to write json file")  # not -3
        return (status, return_msg)
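
The core of extract_text_dict_from_response_body above is a simple selection rule: prefer the route whose tag contains "推荐路线" (recommended route), otherwise fall back to the first route. A condensed standalone sketch of that rule follows; the sample input is made-up illustrative data mirroring only the fields the spider reads, not real Baidu API output.

def select_route(routes_list):
    # prefer the route whose tag contains "推荐路线" (recommended route); otherwise take the first one
    for one_route in routes_list:
        if -1 < one_route.get("tag", "").find("推荐路线"):
            return one_route
    return routes_list[0] if routes_list else {}


if __name__ == "__main__":
    # hypothetical minimal structure for demonstration only
    sample_routes = [
        {"tag": "方案二", "distance": 52000, "duration": 3600},
        {"tag": "推荐路线", "distance": 48000, "duration": 3300},
    ]
    print(select_route(sample_routes))  # prints the "推荐路线" entry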
Esempio n. 7
0
class DirectionamapcoverageSpider(scrapy.Spider):
    """
		sys.exit code == 2 # missing CITY_LIST or missing input file(s)
		sys.exit code == 3 # classification file format error
		sys.exit code == 4 # already requested all xy points in city_list today
	"""
    name = "directionamapcoverage"

    root_path = ""
    log_dir = ""
    # debug = False
    # save_every_response = False
    crawled_dir = ""
    json_dir = ""
    output_folder_name = ""
    # output_file_format = "json"
    # base_uri = ""
    # run_purpose = None
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    # crontab starts a new process every 2 hours, so it starts 12 times per day
    maximal_requests_of_one_crontab_process = 23
    interval_between_requests = 300
    request_counter = 0
    last_4_requests = {}

    urls = [
        "https://restapi.amap.com/v3/direction/driving?origin=113.268029,22.923338&destination=113.3025,23.38575&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2",
        "https://restapi.amap.com/v3/direction/driving?origin=113.30508,23.38597&destination=113.268029,22.923338&extensions=all&output=json&key=470fdf698e3aab758d4cb026244f5194",
        "https://restapi.amap.com/v3/direction/driving?origin=113.268029,22.923338&destination=113.81424,22.62471&extensions=all&output=json&key=740f50c6fabd5801d0fad1cba62446d9",
        "https://restapi.amap.com/v3/direction/driving?origin=113.81276,22.62405&destination=113.268029,22.923338&extensions=all&output=json&key=4328d392605802de34406045b9701bb8",
    ]

    # 0 == Country Garden HQ to Baiyun Airport; 1 == Baiyun Airport to Country Garden HQ; 2 == HQ to Bao'an Airport; 3 == Bao'an Airport to HQ

    # https://restapi.amap.com/v3/direction/driving?origin=113.267982,22.92451&destination=113.307605,23.389929&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2
    # https://restapi.amap.com/v3/direction/driving?origin=113.307605,23.389929&destination=113.267982,22.92451&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2
    # Country Garden HQ
    # 113.268029,22.923338

    # Baiyun Airport
    # 113.3025,23.38575:	Terminal 1 domestic departures
    # 113.30508,23.38597:	Terminal 1 domestic arrivals

    # Bao'an Airport
    # 113.81424,22.62471:	Terminal 3 domestic departures
    # 113.81276,22.62405:	Terminal 3 domestic arrivals

    # https://restapi.amap.com/v3/direction/driving?origin=113.267982,22.92451&destination=113.814829,22.633092&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2
    # https://restapi.amap.com/v3/direction/driving?origin=113.814829,22.633092&destination=113.267982,22.92451&extensions=all&output=json&key=4ebb849f151dddb3e9aab7abe6e344e2

    # https://ditu.amap.com/dir?from%5Badcode%5D=440306&from%5Bname%5D=%E6%B7%B1%E5%9C%B3%E5%AE%9D%E5%AE%89%E5%9B%BD%E9%99%85%E6%9C%BA%E5%9C%BA&from%5Bid%5D=B02F37T239&from%5Bpoitype%5D=150104&from%5Blnglat%5D=113.81482900000003%2C22.633092&from%5Bmodxy%5D=113.815186%2C22.624847&to%5Bname%5D=%E7%A2%A7%E6%A1%82%E5%9B%AD%E6%80%BB%E9%83%A8&to%5Blnglat%5D=113.267982%2C22.92451&to%5Bid%5D=B0FFFVAF72&to%5Bpoitype%5D=120201&to%5Badcode%5D=440600&to%5Bmodxy%5D=113.269254%2C22.923768&type=car&policy=1
    # https://www.amap.com/dir?from%5Bname%5D=%E7%A2%A7%E6%A1%82%E5%9B%AD%E6%80%BB%E9%83%A8&from%5Blnglat%5D=113.267982%2C22.92451&from%5Bid%5D=B0FFFVAF72-from&from%5Bpoitype%5D=120201&from%5Badcode%5D=440600&from%5Bmodxy%5D=113.269254%2C22.923768&to%5Bid%5D=B0FFG40CGO&to%5Bname%5D=%E5%B9%BF%E5%B7%9E%E7%99%BD%E4%BA%91%E5%9B%BD%E9%99%85%E6%9C%BA%E5%9C%BAT1%E8%88%AA%E7%AB%99%E6%A5%BC(F3%E5%9B%BD%E5%86%85%E5%87%BA%E5%8F%915%E5%8F%B7%E9%97%A8)&to%5Blnglat%5D=113.302846%2C23.385712&to%5Bmodxy%5D=113.302846%2C23.385712&to%5Bpoitype%5D=150105&to%5Badcode%5D=440114&type=car&policy=1
    # https://www.amap.com/dir?from%5Bid%5D=B00140NZIQ&from%5Bname%5D=%E5%B9%BF%E5%B7%9E%E7%99%BD%E4%BA%91%E5%9B%BD%E9%99%85%E6%9C%BA%E5%9C%BA&from%5Blnglat%5D=113.307605%2C23.389929&from%5Bmodxy%5D=113.303722%2C23.385187&from%5Bpoitype%5D=150104&from%5Badcode%5D=440111&to%5Bname%5D=%E7%A2%A7%E6%A1%82%E5%9B%AD%E6%80%BB%E9%83%A8&to%5Blnglat%5D=113.267982%2C22.92451&to%5Bid%5D=B0FFFVAF72&to%5Bpoitype%5D=120201&to%5Badcode%5D=440600&to%5Bmodxy%5D=113.269254%2C22.923768&type=car&policy=1

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        # self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False )
        # self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False )
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.json_dir = self.settings.get(name="SAVED_JSON", default="")
        self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME",
                                                    default="")
        # self.output_file_format = self.settings.get( name = "OUTPUT_FILE_FORMAT", default="json" )
        # self.base_uri = self.settings.get( name = "BASE_URI", default="" )
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        self.overwrite_today = self.settings.get(name="OVERWRITE_TODAY",
                                                 default="")

        self.maximal_requests_of_one_crontab_process = self.settings.get(
            name="MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS", default=23)
        self.interval_between_requests = self.settings.get(
            name="INTERVAL_BETWEEN_REQUESTS", default=300)

    def check_dirs_and_files(self):
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.json_dir):
            os.makedirs(self.json_dir)

    def start_requests(self):
        self.init_self_attributes()
        self.check_dirs_and_files()

        if "READ_JSON_AND_WRITE_CSV" == self.run_purpose:
            one_url = "https://blog.csdn.net/qq_37193537/article/details/78987949"
            callback_func = self.read_json_and_parse
            yield scrapy.Request(url=one_url,
                                 callback=callback_func,
                                 dont_filter=True)
        else:
            timestamp_float = time.time()
            self.last_4_requests = {
                "request_time": timestamp_float,
                "requested_index": [
                    0,
                    1,
                    2,
                    3,
                ]
            }
            callback_func = self.parse_json
            for index, one_url in enumerate(self.urls):
                meta_dict = {
                    "preset_route":
                    index,  # 0 == Country Garden HQ to Baiyun Airport; 1 == Baiyun Airport to HQ; 2 == HQ to Bao'an Airport; 3 == Bao'an Airport to HQ
                    "redo": 0,
                }
                self.logger.info(f"{index}: requesting {one_url} ")
                yield scrapy.Request(url=one_url,
                                     callback=callback_func,
                                     meta=meta_dict,
                                     dont_filter=True)

    def get_url_according_to_preset_route(self, preset_route=101):
        if preset_route in [
                0,
                1,
                2,
                3,
        ]:
            return self.urls[preset_route]
        return ""

    def read_json_and_parse(self, response):
        file_list = os.listdir(self.json_dir)
        # route0___1___20190615_234522.json
        for one_file in file_list:
            temp_list = one_file.split("___")
            preset_route = 0
            now = ""
            if 2 < len(temp_list):
                preset_route = temp_list[0]
                preset_route = preset_route.lstrip("route")
                preset_route = CommonClass.find_digits_from_str(
                    string=preset_route, return_all=False)
                preset_route = int(preset_route)
                now = temp_list[2]
                now = now.rstrip(".json")

                url = self.get_url_according_to_preset_route(
                    preset_route=preset_route)
                json_file_path = os.path.join(self.json_dir, one_file)
                if os.path.isfile(json_file_path):
                    try:
                        doc = None
                        with open(json_file_path, "rb") as f:
                            doc = f.read().decode("utf-8", "ignore")
                        if doc is None:
                            self.logger.error(
                                f"Error: cannot read html file {json_file_path}."
                            )
                            continue
                        text_dict = self.extract_text_dict_from_response_body(
                            body=doc, preset_route=preset_route, now=now)
                        if 0 < len(text_dict):
                            json_selector = Selector(text=doc, type=None)
                            loader = ItemLoader(item=DirectionamapItem(),
                                                selector=json_selector)
                            loader = self.load_items_into_loader(
                                loader=loader,
                                text=text_dict,
                                url=url,
                                now=now)
                            yield loader.load_item()
                    except Exception as ex:
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                        )

    def extract_text_dict_from_response_body(self,
                                             body="",
                                             preset_route=101,
                                             now=""):
        text_dict = {}
        json_dict = json.loads(body)
        count = int(json_dict["count"])  # already 0 < count
        route_dict = json_dict["route"] if "route" in json_dict.keys() else {}
        paths = route_dict["paths"] if "paths" in route_dict.keys() else []
        duration = 0
        strategy = "速度最快"
        selected_path_steps = []
        found_fastest = False
        if 1 < len(paths):
            for one_path in paths:
                temp_strategy = one_path[
                    "strategy"] if "strategy" in one_path.keys() else ""
                if -1 < temp_strategy.find("速度最快"):
                    duration = int(one_path["duration"]
                                   ) if "duration" in one_path.keys() else 0
                    strategy = temp_strategy
                    selected_path_steps = one_path[
                        "steps"] if "steps" in one_path.keys() else []
                    found_fastest = True
                    break
        if 1 == len(paths) or (not found_fastest and 1 < len(paths)):
            duration = int(
                paths[0]["duration"]) if "duration" in paths[0].keys() else 0
            strategy = paths[0]["strategy"] if "strategy" in paths[0].keys(
            ) else ""
            selected_path_steps = paths[0]["steps"] if "steps" in paths[
                0].keys() else []

        text_dict = {
            "preset_route": preset_route,
            "strategy": strategy,
            "duration": duration,
            "count": count,
            "paths": len(paths),
            "now": now,
            "selected_path_steps": selected_path_steps,
        }

        return text_dict

    def parse_json(self, response):
        status, message = self.save_json(response=response, page_type="json")
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        preset_route = -1
        if 1 == status:
            try:
                meta_dict = response.meta
                preset_route = int(meta_dict["preset_route"])
                text_dict = self.extract_text_dict_from_response_body(
                    body=response.body, preset_route=preset_route, now=now)
                loader = ItemLoader(item=DirectionamapItem(),
                                    response=response)
                loader = self.load_items_into_loader(loader=loader,
                                                     text=text_dict,
                                                     url=response.url,
                                                     now=now)
                yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                )
        if -1 == preset_route:
            if hasattr(response, "meta"):
                meta_dict = response.meta
                if "preset_route" in meta_dict.keys():
                    preset_route = int(meta_dict["preset_route"])
        if -1 < preset_route:
            received_all_4_requests_bool = self.check_this_preset_route(
                preset_route=preset_route)

            if not received_all_4_requests_bool and "redo" in response.meta.keys(
            ):
                delayed_index_list = self.get_delayed_response_more_than_1_minute(
                )
                if 0 < len(delayed_index_list):
                    # redo_requests() is a generator; yield its Requests so Scrapy actually schedules them
                    yield from self.redo_requests(redo=response.meta["redo"])

            # get data again after 5 minutes
            if self.request_counter < self.maximal_requests_of_one_crontab_process and received_all_4_requests_bool:
                while (self.check_time_interval()):
                    time.sleep(10)

                self.request_counter += 1
                now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                self.logger.info(
                    f" requesting amap at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )"
                )
                self.last_4_requests = {
                    "request_time": time.time(),
                    "requested_index": [
                        0,
                        1,
                        2,
                        3,
                    ]
                }
                callback_func = self.parse_json
                for index, one_url in enumerate(self.urls):
                    meta_dict = {
                        "preset_route": index,
                        "redo": 0,
                    }
                    self.logger.info(f"{index}: requesting {one_url} ")
                    yield scrapy.Request(url=one_url,
                                         callback=callback_func,
                                         meta=meta_dict,
                                         dont_filter=True)

    def check_time_interval(self):
        if "request_time" not in self.last_4_requests.keys() or not isinstance(
                self.last_4_requests["request_time"], float):
            return False
        if time.time() - self.last_4_requests["request_time"] > float(
                self.interval_between_requests):
            return False
        return True

    def redo_requests(self, redo=-1):
        urls = []
        index_list = []
        if 1 > len(self.last_4_requests["requested_index"]) or 0 > redo:
            return False
        for one_index in self.last_4_requests["requested_index"]:
            urls.append(self.urls[one_index])
            index_list.append(one_index)
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        for index, one_url in enumerate(urls):
            meta_dict = {
                "preset_route": index_list[index],
                "redo": redo + 1,
            }
            self.logger.info(
                f"[{now}] redo {index_list[index]}: requesting {one_url} ")
            yield scrapy.Request(url=one_url,
                                 callback=self.parse_json,
                                 meta=meta_dict,
                                 dont_filter=True)

    def get_delayed_response_more_than_1_minute(self):
        if "requested_index" not in self.last_4_requests.keys(
        ) or not isinstance(self.last_4_requests["requested_index"], list):
            return []
        if "request_time" not in self.last_4_requests.keys() or not isinstance(
                self.last_4_requests["request_time"], float):
            return []
        if time.time() - self.last_4_requests["request_time"] > 60.0:
            return self.last_4_requests["requested_index"]
        return []

    def check_this_preset_route(self, preset_route=-1):
        if preset_route not in [
                0,
                1,
                2,
                3,
        ]:
            return True
        if "request_time" not in self.last_4_requests.keys() or not isinstance(
                self.last_4_requests["request_time"], float):
            return True
        if "requested_index" not in self.last_4_requests.keys(
        ) or not isinstance(self.last_4_requests["requested_index"], list):
            return True

        # 4 minutes have passed, just return True
        if time.time() - self.last_4_requests["request_time"] > 240.0:
            return True

        # remove current preset_route
        if preset_route in self.last_4_requests["requested_index"]:
            self.last_4_requests["requested_index"].remove(preset_route)
        if 1 > len(self.last_4_requests["requested_index"]):
            return True

        # at least one element remains in self.last_4_requests["requested_index"]
        return False

    def load_items_into_loader(self, loader=None, text={}, url="", now=""):
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get("BOT_NAME", default=""))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date", now)

        loader.add_value("content", str(text))
        loader.add_value("page_type", "json")

        return loader

    def save_json(self, response=None, page_type="json"):
        status = -4
        if response is None or not hasattr(response, "body") or not hasattr(
                response, "url") or not hasattr(response, "meta"):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object"
            )
            return (-1, f"wrong response object")
        meta_dict = response.meta
        preset_route = meta_dict[
            "preset_route"] if "preset_route" in meta_dict.keys() else ""
        file_path = ""
        if "json" == page_type:
            json_dict = json.loads(response.body)
            status = json_dict["status"] if "status" in json_dict.keys(
            ) else "404"
            count = int(
                json_dict["count"]) if "count" in json_dict.keys() else 0
            if 0 < count:
                now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                file_path = os.path.join(
                    self.json_dir,
                    f"route{preset_route}___{status}___{now}.json")
                status = int(status)
        else:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter page_type == {page_type} from {response.url}"
            )
            return (-2, f"page_type can ONLY be json")

        return_msg = "0 count"
        if 0 < len(file_path):
            try:
                with open(file_path, 'wb') as f:
                    f.write(response.body)
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}"
                )
                return (status, f"failed to write json file")
        return (status, return_msg)
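
Both direction spiders share the same last_4_requests bookkeeping: mark the 4 preset routes as outstanding, tick each one off as its response arrives, and treat the batch as finished once the list is empty or a timeout has passed. Below is a condensed, standalone sketch of that pattern under those assumptions; the class and method names are illustrative, not the spiders' API.

import time


class RequestBatchTracker:
    """Track which of the 4 preset routes still lack a response."""

    def __init__(self, timeout_seconds=240.0):
        self.timeout_seconds = timeout_seconds
        self.request_time = 0.0
        self.outstanding = []

    def start_batch(self):
        # mirrors self.last_4_requests = {"request_time": ..., "requested_index": [0, 1, 2, 3]}
        self.request_time = time.time()
        self.outstanding = [0, 1, 2, 3]

    def mark_received(self, preset_route):
        # returns True when the whole batch is considered finished
        if preset_route not in [0, 1, 2, 3]:
            return True
        if time.time() - self.request_time > self.timeout_seconds:
            return True  # stop waiting after the timeout
        if preset_route in self.outstanding:
            self.outstanding.remove(preset_route)
        return 0 == len(self.outstanding)


if __name__ == "__main__":
    tracker = RequestBatchTracker()
    tracker.start_batch()
    print([tracker.mark_received(i) for i in [0, 1, 2, 3]])  # [False, False, False, True]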
Esempio n. 8
0
class QqhouseSpider(scrapy.Spider):
	"""
		sys.exit code == 1 # missing CITIES_FOR_CRAWLING
		sys.exit code == 2 # wrong or missing CITY_PAGE_DICT
		sys.exit code == 3 # wrong value(s) of CITY_PAGE_DICT
		On 20190527 Peter rewrote this spider to fix bugs
	"""
	name = "qqhouse"
	
	root_path = ""
	run_purpose = None
	missed_id_txt_filename = ""
	maximal_request_times = []
	debug = None
	city_page_dict = {}
	maximal_list_pages = 0
	city_list = []
	save_every_response = False
	crawled_dir = ""
	detail_html_dir = ""
	list_html_dir = ""
	output_folder_name = ""
	log_dir = ""
	custom_settings = CommonClass.get_custom_settings_dict( spider=name )

	date_list = []
		
	def init_self_attributes(self):
		self.root_path = self.settings.get( "PROJECT_PATH" )
		self.run_purpose = self.settings.get( name = "RUN_PURPOSE", default=None )
		self.missed_id_txt_filename = self.settings.get( name = "MISSED_ID_TXT", default="" )
		self.maximal_request_times = self.settings.get( name = "MAXIMAL_REQUEST_TIMES", default=[] )
		self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False )
		self.city_page_dict = self.settings.get( name = "CITY_PAGE_DICT", default={} )
		self.maximal_list_pages = self.settings.get( name = "MAXIMAL_LIST_PAGES", default=0 )
		self.city_list = self.settings.get( name = "CITIES_FOR_CRAWLING", default=[] )
		self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default=False )
		self.crawled_dir = self.settings.get( name="CRAWLED_DIR", default = "" )
		self.detail_html_dir = self.settings.get( name="SAVED_DETAIL_HTML", default="" )
		self.list_html_dir = self.settings.get( name="SAVED_LIST_HTML", default="" )
		self.output_folder_name = self.settings.get( name="OUTPUT_FOLDER_NAME", default="" )
		self.log_dir = self.settings.get( name="LOG_DIR", default="" )

	def make_dirs(self):
		# even if the cache is used, we still save all html files; make these 3 dirs if they do not exist
		if not os.path.isdir( self.crawled_dir ):
			os.makedirs( self.crawled_dir )
		if not os.path.isdir( self.detail_html_dir ):
			os.makedirs( self.detail_html_dir )
		if not os.path.isdir( self.list_html_dir ):
			os.makedirs( self.list_html_dir )

	def start_requests(self):
		self.init_self_attributes()
		self.make_dirs()

		urls = []
		callback_func = self.parse_list
		meta_dict = {"page_type": "list"}
		if 1 > len( self.city_list ):
			self.logger.error( f"self.city_list can NOT be empty." )
			sys.exit(1)
			
		for one_city in self.city_list:
			if one_city not in self.city_page_dict.keys():
				self.logger.error( f"{one_city} is NOT in {self.city_page_dict}" )
				sys.exit(2)
			if 1 > int( self.city_page_dict[ one_city ] ):
				self.logger.error( f"Wrong value of {self.city_page_dict[one_city]} (key == {one_city})" )
				sys.exit(3)
			if 0 != self.maximal_list_pages and self.maximal_list_pages < int( self.city_page_dict[ one_city ] ):
				self.city_page_dict[ one_city ] = self.maximal_list_pages

		for one_city in self.city_list:
			for i in range( int( self.city_page_dict[ one_city ] ) ): # list_page urls
				urls.append( f"https://db.house.qq.com/index.php?mod=search&act=newsearch&city={one_city}&showtype=1&page_no={i+1}" )

		if self.debug:
			self.logger.debug( urls )
			urls = [
				"http://quotes.toscrape.com/page/1/",
				"http://quotes.toscrape.com/page/2/",
			]
			callback_func = self.do_nothing_for_debug
		elif "REDO_MISSED_HOUSE_IDS" == self.run_purpose:
			# REDO_MISSED_HOUSE_IDS is a special debug run; set HTTPCACHE_ENABLED = False before running it
			urls = []
			try:
				file_path = os.path.join( self.root_path, self.name, self.output_folder_name, self.missed_id_txt_filename )
				city = "gz"
				with open( file_path, "r" ) as f:
					for one_id in f.readlines():
						one_id = one_id.replace("\n", "")
						urls.append( one_id ) # f"https://db.house.qq.com/{city}_{one_id}/"
			except Exception as ex:
				self.logger.error( f"failed to read missed_id_txt_file from {file_path}. Exception = {ex}" )
				sys.exit(4)
			callback_func = self.parse_detailed
		elif "REDO_MISSED_PAGE_IDS" == self.run_purpose:
			# REDO_MISSED_PAGE_IDS is a special debug run; set HTTPCACHE_ENABLED = False before running it
			urls = []
			try:
				file_path = os.path.join( self.root_path, self.name, self.output_folder_name, self.missed_id_txt_filename )
				city = "gz"
				with open( file_path, "r" ) as f:
					for one_id in f.readlines():
						one_id = one_id.replace("\n", "")
						urls.append( f"https://db.house.qq.com/index.php?mod=search&act=newsearch&city={city}&showtype=1&page_no={one_id}" )
			except Exception as ex:
				self.logger.error( f"failed to read missed_id_txt_file from {file_path}. Exception = {ex}" )
				sys.exit(4)
			callback_func = self.parse_list
		elif "READ_CSV_TO_KAFKA" == self.run_purpose:
			temp_list = self.settings.get( name="DATES_TO_BE_READ", default=[] )
			for one in temp_list:
				if isinstance( one, Iterable) and 0 < len( one ):
					temp_list.append( one )
			if 0 < len( temp_list ):
				self.date_list = temp_list
			callback_func = self.read_csv_to_kafka
			urls = [ "http://quotes.toscrape.com/page/1/", ]

		if self.run_purpose in ["REDO_MISSED_HOUSE_IDS", "REDO_MISSED_PAGE_IDS", "READ_CSV_TO_KAFKA", ]:
			for url in urls:
				yield scrapy.Request( url=url, callback=callback_func, meta = meta_dict, dont_filter = True )
		else:
			for url in urls:
				yield scrapy.Request( url=url, callback=callback_func, meta = meta_dict )

	def read_csv_to_kafka(self, response):
		# do not go to the pipeline; just read the csv file and produce messages to Kafka
		if 1 > len( self.date_list ):
			return False
		for one_date in self.date_list:
			folder_name = f"{one_date}crawled"
			crawled_dir = os.path.join( self.root_path, self.name, self.output_folder_name, f"{today}crawled" )
			csv_file_path = os.path.join( crawled_dir, f"qqhouse{one_date}.csv" )
			if os.path.isdir( crawled_dir ) and os.path.isfile( csv_file_path ):
				with open( csv_file_path, newline="" ) as csvfile:
					file_reader = csv.reader(csvfile) # , delimiter=' ', quotechar='|'
					for row in file_reader:
						temp_dict = eval( row[0] )  # assumes each csv row stores one dict's string representation in its first column
						print( temp_dict )
						print( type( temp_dict ) )

	def do_nothing_for_debug(self, response):
		self.logger.info( f"inside Method do_nothing_for_debug of Class QqhouseSpider. url = {response.url}" )

	def load_items_into_loader(self, loader = None, text = {}, url = ""):
		loader.add_value( 'content', str(text) ) # , encoding="utf-8"
		loader.add_value( 'page_type', "detailed" )

		# record housekeeping fields
		loader.add_value('url', url)
		loader.add_value('project', self.settings.get('BOT_NAME') )
		loader.add_value('spider', self.name )
		loader.add_value('server', socket.gethostname() )
		loader.add_value('date', datetime.datetime.now().strftime("%Y%m%d_%H%M%S") )
		return loader

	def get_list_html_file_path( self, city = "", page_no = 0 ):
		if 1 > len( city ) or 1 > page_no:
			return ""
		return os.path.join( self.list_html_dir, f"{city}_list_{page_no}.html" )

	def find_more_house_ids(self, doc = ""):
		house_id_list = []
		counter = 0
		index = 0
		while True:
			index = doc.find("data-hid", index)
			if -1 == index:
				break
			sub_doc = doc[index+10:index+25]
			house_id_list.append( CommonClass.find_digits_from_str( sub_doc ) )
			index += 10
			counter += 1
		return house_id_list

	def extract_all_detailed_html_links(self, string = ""):
		house_id_list = []
		if 1 > len( string ):
			return house_id_list
		doc = string.decode('utf-8')
		end_string = '";var search_result_list_num ='
		end_pos = len( doc )
		if -1 < doc.find( end_string ):
			end_pos = doc.find( end_string )
		doc = doc[ len('var search_result = "			'):end_pos ]
		doc = '<!DOCTYPE html><html><head lang="zh-cn"><title>腾讯房产列表</title></head><body>' + f"{doc}</body></html>"
		response = Selector( text=doc, type="html" )
		house_id_list = response.xpath("//div/@data-hid").extract()
		if 10 > len( house_id_list ):
			house_id_list = self.find_more_house_ids( doc = doc )
		else:
			temp_list = []
			for one_id in house_id_list:
				temp_list.append( CommonClass.find_digits_from_str( one_id ) )
			house_id_list = temp_list
		
		return house_id_list

	def parse_list(self, response = None):
		url = response.url
		city = ""
		page_no = 1
		page_type = "list"
		if hasattr( response, "meta" ) and "page_type" in response.meta.keys():
			page_type = response.meta["page_type"]
		
		if "list" == page_type:
			if 10 > len( str( response.body ) ): # cannot use 1 > ...
				meta_dict = self.request_counter_and_action(response = response)
				if 0 < meta_dict["request_counter"]:
					yield scrapy.Request( url=url, callback=self.parse_list, meta = meta_dict, dont_filter = True )
			else:
				house_id_list = []
				query_part_list = url.split("?")
				if 2 == len( query_part_list ):
					result_dict = parse.parse_qs( query_part_list[1] )
					if "city" in result_dict.keys() and 0 < len( result_dict["city"] ):
						city = result_dict["city"][0]
					if "page_no" in result_dict.keys() and 0 < len(result_dict["page_no"]) and 1 < int( result_dict["page_no"][0] ):
						page_no = int( result_dict["page_no"][0] )
					if self.save_every_response:
						list_html_file_path = self.get_list_html_file_path( city, page_no )
						if 0 < len( list_html_file_path ):
							self.save_html( response = response, page_type = "list", city = city, page_no= str(page_no), house_id = "" )
					house_id_list = self.extract_all_detailed_html_links( response.body )
					# counter = 0
					for one_id in house_id_list:
						next_url = f"https://db.house.qq.com/{city}_{one_id}/"
						self.logger.info( f"crawling next url at {next_url}" )
						yield response.follow( next_url, self.parse_detailed )
		else:
			self.logger.error( f"page_type ({page_type}) is NOT \"list\" in parse_list Method. url = {url}" )

	def save_html(self, response=None, page_type = "detailed", city = "", page_no="", house_id = "" ):
		if response is None or not hasattr(response, "body") or not hasattr( response, "url" ):
			return False
		doc = response.body
		if "detailed" == page_type:
			temp_str = str( house_id ).zfill(8)
			file_path = os.path.join( self.detail_html_dir, f"{city}_{temp_str}.html" )
		elif "list" == page_type:
			temp_str = str( page_no ).zfill(4)
			file_path = os.path.join( self.list_html_dir, f"{city}_list{temp_str}.txt" )
		else:
			return False
		try:
			with open( file_path, 'wb' ) as f:
				f.write( doc )
		except Exception as ex:
			self.logger.warning( f"failed to write response.body from {response.url}" )
			return False
		return True

	def request_counter_and_action(self, response = None):
		request_counter = 0
		request_pointer = 0
		if hasattr( response, "meta" ) and "request_pointer" in response.meta.keys():
			request_pointer = int( response.meta["request_pointer"] )
		if hasattr( response, "meta" ) and "request_counter" in response.meta.keys():
			request_counter = int(response.meta["request_counter"])
		if request_pointer < len( self.maximal_request_times ):
			self.logger.info( f"request_counter == {request_counter}; request_pointer == {request_pointer} for the last request from {response.url}" )
			if request_counter < self.maximal_request_times[request_pointer]:
				return {
					"request_counter": request_counter + 1,
					"request_pointer": request_pointer,
				}
			else:
				return {
					"request_counter": 1,
					"request_pointer": request_pointer + 1,
				}
		else:
			today = datetime.datetime.now().strftime("%Y%m%d")
			self.logger.error( f"{self.maximal_request_times} requests have been sent but ONLY empty response.body received from {response.url}" )
			self.write_log( content = response.url, logfilename = f"missed_uris{today}.txt", content_only = True)
			return {
				"request_counter": -1,
				"request_pointer": request_pointer,
			}
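	# Retry-ladder sketch for request_counter_and_action above (hypothetical
	# setting): with self.maximal_request_times == [2, 3] an empty response is
	# retried with (request_counter, request_pointer) advancing
	# (1,0) -> (2,0) -> (1,1) -> (2,1) -> (3,1) -> (1,2); once request_pointer
	# reaches len(self.maximal_request_times) the url is written to
	# missed_uris<today>.txt and request_counter == -1 tells the caller to stop.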

	def extract_detailed_elements( self, response = None, city = "", house_id = "" ):
		text = {}

		# parse fields previously required
		big_box = response.css("div.item.fl")
		real_estate_name = response.css("div.name.fl div.cf h2::text").extract_first(default="")
		real_estate_slogan = big_box.css("div.hd.cf h1.Pagetitle::text").extract_first(default="")
		price_label = big_box.css("div.hd.cf h2.fl.yh.cf em.itemHeader::text").extract_first(default="")
		price_span_list = big_box.css("div.hd.cf h2.fl.yh.cf span.price::text").extract()
		price_span_money = big_box.css("div.hd.cf h2.fl.yh.cf span.price strong::text").extract_first(default="")
		if 2 == len( price_span_list ):
			price_str = f"{price_span_list[0]}___price___{price_span_money}___price___{price_span_list[1]}"
		else:
			price_str = "___price___".join(price_span_list)
			price_str = f"{price_str}___price___{price_span_money}"
		detail_lis = big_box.css( "ul.itemContent.itemContent3.pr li" )
		items = []
		for one_li in detail_lis:
			em_element = one_li.css( "em.itemHeader" )
			if em_element is not None and 0 < len(em_element):
				item_value_list = one_li.css( "::text" ).extract()
				item_value = ""
				if 1 < len( item_value_list ):
					for index, value in enumerate(item_value_list):
						item_value_list[index] = value.strip()
					item_value += str("".join(item_value_list))
				elif 1 == len( item_value_list ):
					item_value = item_value_list[0].strip()
				if "" != item_value:
					item_value = CommonClass.replace_string( string = item_value, char_to_remove = ['\r', '\n', '\t', ' ',], new_char = "___break___" )
					items.append( item_value )
			else:
				continue
		item_string = ""
		if 0 < len(items):
			item_string = "___descr___".join(items)
		if "" != item_string or "" != real_estate_name or "" != price_label or "" != price_str:
			text["real_estate_name"] = real_estate_name
			text["real_estate_slogan"] = real_estate_slogan
			text["price_label"] = price_label
			text["price_str"] = price_str
			text["item_string"] = item_string
			text["city"] = city
			text["house_id"] = house_id

		# parse fields required on 20190528
		basic_info_box = response.css("div#xxIntr ul.hdl.ft")
		all_lis = basic_info_box.xpath("./li")
		item_list = []
		for one_li in all_lis:
			key = one_li.xpath("./span/text()").extract_first(default="")
			value = one_li.xpath("./p/text()").extract_first(default="")
			if 0 < len( key ) and 0 < len( value ):
				item_list.append( f"{key}___key2value___{value}" )
		if 0 < len( item_list ):
			text["basic_info"] = "___basic___".join( item_list )

		return text

	def parse_detailed(self, response = None):
		# response=response.replace(encoding="gb2312") # do NOT use this line
		url = response.url
		doc = response.body
		doc = doc.decode("gb2312", "ignore")
		if 1 > len( str( doc ) ):
			meta_dict = self.request_counter_and_action(response = response)
			if 0 < meta_dict["request_counter"]:
				yield scrapy.Request( url=url, callback=self.parse_detailed, meta = meta_dict, dont_filter = True )
		else:
			city = ""
			house_id = ""
			url_list = url.split( "qq.com" )
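			# e.g. "https://db.house.qq.com/bj_123456/".split("qq.com") gives
			# ["https://db.house.", "/bj_123456/"], so below city == "bj" and
			# house_id == "123456" (sample values, following the next_url
			# pattern built in parse_list)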
			if 2 == len( url_list ):
				temp_list = url_list[1].replace("/", "")
				temp_list = temp_list.split("_")
				if 2 == len( temp_list ):
					city = temp_list[0]
					house_id = temp_list[1]
			if 0 < len( city) and 0 < len( house_id ):
				self.save_html( response = response, page_type = "detailed", city = city, house_id = house_id )
			
			text = {}
			try:
				response2 = Selector(text=doc, type="html")
				text = self.extract_detailed_elements( response = response2, city = city, house_id = house_id  )
			except Exception as ex:
				self.logger.error( f"Error! Exception = {ex}; text = {text}" )
			else:
				if 0 < len( text ):
					try:
						loader = ItemLoader( item = QqhouseItem(), response = response )
						loader = self.load_items_into_loader( loader = loader, text = text, url = url )
						yield loader.load_item()
					except Exception as ex:
						self.logger.error( f"Error happened during loading ItemLoader in Method parse_detailed of Class QqhouseSpider. Exception = {ex}" )
	
	def write_log(self, content = None, logfilename = None, content_only = False):
		if content is not None and 0 < len( content ):
			today = datetime.datetime.now().strftime("%Y%m%d")
			if logfilename is None:
				logfilename = f"{self.name}{today}.log"
			try:
				with open( os.path.join( self.log_dir, logfilename ), 'a', encoding='utf-8') as f:
					if content_only:
						info = f"{str(content)}\n"
					else:
						info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
					f.write(info)
				return 1
			except Exception as ex:
				return 0
		return -1
Esempio n. 9
0
class PoibaiduSpider(scrapy.Spider):
    """
		sys.exit code == 1 # missing input folder
		sys.exit code == 2 # missing CITY_LIST or missing input file(s)
		sys.exit code == 3 # classification file format error
		sys.exit code == 4 # already requested all xy points in city_list today
		sys.exit code == 5 # missing Baidu ak
		sys.exit code == 6 # Method make_request_uris of Class PoibaiduSpider, query_type can ONLY be 3
		sys.exit code == 7 # Run out of all Baidu ak today!
	"""
    name = "poibaidu"

    root_path = ""
    log_dir = ""
    baidu_ak_list = []
    debug = False
    save_every_response = False
    crawled_dir = ""
    json_dir = ""
    input_folder_name = ""
    output_folder_name = ""
    classification_filename = ""
    maximal_request_times = []
    output_file_format = "json"
    base_uri = ""
    query_type = 3
    query_type3edge = 0
    lng_delta = 0
    lat_delta = 0
    baidu_status_code = {}
    run_purpose = None
    city_list = []
    input_dir = ""
    bout = 0
    category_level = 1
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    classification_dict = {}
    classification_dict_english_mapper = {}
    second_part_of_xy_filename = "2km_with_zero.txt"
    ak_pointer = 0
    center_dict = {}
    request_scope = 2  # level of detail of the results: 1 (or empty) returns basic info; 2 returns detailed POI info
    page_size = 20  # the Baidu API returns at most 20 records per request: http://lbsyun.baidu.com/index.php?title=webapi/guide/webservice-placeapi

    # scrapy housekeeping keys:
    housekeeping_key_list = [
        "download_slot",
        "download_latency",
        "depth",
        "query",
    ]

    bad_ak_status = [
        4,
        5,
        210,
        211,
        302,
    ]

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.baidu_ak_list = self.settings.get(name="BAIDU_AK", default=[])
        if 1 > len(self.baidu_ak_list):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing Baidu ak"
            )
            sys.exit(5)

        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.save_every_response = self.settings.get(
            name="SAVE_EVERY_RESPONSE", default=False)
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.json_dir = self.settings.get(name="SAVED_JSON", default="")
        self.input_folder_name = self.settings.get(name="INPUT_FOLDER_NAME",
                                                   default="")
        self.output_folder_name = self.settings.get(name="OUTPUT_FOLDER_NAME",
                                                    default="")
        self.classification_filename = self.settings.get(
            name="QUERY_CLASSIFICATION_FILENAME", default="")
        self.maximal_request_times = self.settings.get(
            name="MAXIMAL_REQUEST_TIMES", default=[])
        self.output_file_format = self.settings.get(name="OUTPUT_FILE_FORMAT",
                                                    default="json")
        self.base_uri = self.settings.get(name="BASE_URI", default="")
        self.query_type = self.settings.get(name="QUERY_TYPE", default=3)
        self.query_type3edge = self.settings.get(name="QUERY_TYPE3EDGE",
                                                 default=1.1)
        # https://zhidao.baidu.com/question/138957118823573885.html
        # at 30 degrees north latitude the factor should be 0.010402707553*edge
        # at 45 degrees north latitude the factor should be 0.0127406627241*edge
        # at 60 degrees north latitude the factor should be 0.01801801801802*edge
        self.lng_delta = 0.01167 * self.query_type3edge
        self.lat_delta = 0.009009009 * self.query_type3edge  # one degree of latitude is about 111 km
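        # Illustrative arithmetic (assuming the default QUERY_TYPE3EDGE of 1.1 km):
        # lng_delta is roughly 0.0128 and lat_delta roughly 0.0099 degrees, i.e.
        # the width/height of one query rectangle built in make_request_uris below.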
        self.baidu_status_code = self.settings.get(name="BAIDU_STATUS_CODE",
                                                   default={})
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        self.city_list = self.settings.get(name="CITY_LIST", default=[])
        self.input_dir = os.path.join(self.root_path, self.name,
                                      self.input_folder_name)
        self.bout = self.settings.get(name="RUN_PURPOSE_BOUT", default=1)

        self.category_level = self.settings.get(name="NEED_LEVELS", default=1)

        self.classification_dict_english_mapper = self.settings.get(
            name="DATABASE_ENGLISH_CATEGORY_TABLE", default={})

    def check_dirs_and_files(self):
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.json_dir):
            os.makedirs(self.json_dir)

        # check all files and dirs
        if not os.path.isdir(self.input_dir):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, input folder ({self.input_dir}) and input files are needed."
            )
            sys.exit(1)

        temp_list = []
        missed_input_file = []
        for one_city in self.city_list:
            input_file_path = os.path.join(
                self.input_dir, f"{one_city}{self.second_part_of_xy_filename}")
            if os.path.isfile(input_file_path):
                temp_list.append(one_city)
            else:
                missed_input_file.append(one_city)
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing {input_file_path}"
                )
        if 0 < len(missed_input_file):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing input files of {missed_input_file}"
            )
            sys.exit(2)
            # self.city_list = temp_list
        if 1 > len(self.city_list):
            # errorMsg = f"Missing input files of {missed_input_file}" if 0 < len(missed_input_file) else f"please indicate which cities you want to request POIs"
            errorMsg = f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, please indicate which cities you want to request POIs"
            self.logger.error(errorMsg)
            sys.exit(2)

    def read_xy_file(self, city=""):
        """
		return a list containing this city's xy points
		"""
        center = []
        temp_list = []
        if 1 > len(city):
            return center
        today = datetime.datetime.now().strftime("%Y%m%d")
        try:
            input_filename = f"{city}{self.second_part_of_xy_filename}"
            with open(os.path.join(self.input_dir, input_filename),
                      'r',
                      encoding='utf-8') as f:
                for item in f.readlines()[1:]:
                    center.append(tuple(item.strip().split(
                        ",")[-5:]))  # lng, lat, ok0, max_value, max_timestamp
        except Exception as ex:
            center = []
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, cannot read xy_list file ({input_filename}) or requested xy points file ({input_filename}). Exception = {ex}"
            )
        return center

    def parse_uri_query_to_dict(self,
                                url="",
                                only_these_keys=[],
                                map_query_english=True):
        result_dict = {}
        query_part_list = url.split("?")
        if 2 == len(query_part_list):
            result_dict = parse.parse_qs(query_part_list[1])
        for index, key in enumerate(result_dict):
            if 1 == len(result_dict[key]):
                result_dict[key] = result_dict[key][0]
            else:
                self.logger.warning(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, length of {len(result_dict[key])} is more than 1"
                )
        if 0 < len(only_these_keys):
            temp_dict = {}
            for index, key in enumerate(result_dict):
                if key in only_these_keys:
                    temp_dict[key] = result_dict[key]
            result_dict = temp_dict

        if "bounds" in result_dict.keys():
            result_dict["bounds"] = result_dict["bounds"].replace(",", "_")
        if map_query_english and "query" in result_dict.keys():
            if result_dict[
                    "query"] in self.classification_dict_english_mapper.keys():
                result_dict["query"] = self.classification_dict_english_mapper[
                    result_dict["query"]]
            else:
                result_dict[
                    "query"] = f"unknown_english_name{random.randint(10000,99999)}"
        return result_dict
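    # Sketch of parse_uri_query_to_dict above (sample values): for a url whose
    # query string carries query=<Chinese category>&bounds=23.0,113.3,23.1,113.4&page_num=0
    # and only_these_keys == ["page_num", "query", "bounds"], it returns
    # {"page_num": "0", "query": <English name from DATABASE_ENGLISH_CATEGORY_TABLE,
    # or an "unknown_english_name" + 5 random digits fallback>, "bounds": "23.0_113.3_23.1_113.4"}.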

    def return_next_ak(self):
        self.ak_pointer += 1
        if self.ak_pointer >= len(self.baidu_ak_list):  # do not use ==
            self.ak_pointer = 0
        return self.baidu_ak_list[self.ak_pointer]

    def make_request_uris(self, query_type=3, exclude_requested_today=True):
        """
		As of 20190529, ONLY 3 == query_type is coded
		1 == query_type: "http://api.map.baidu.com/place/v2/search?query=ATM机&tag=银行&region=北京&output=json&ak=您的ak" # requesting pois in one city
		2 == query_type: "http://api.map.baidu.com/place/v2/search?query=银行&location=39.915,116.404&radius=2000&output=xml&ak=您的密钥" # requesting pois in one circle area
		3 == query_type: "http://api.map.baidu.com/place/v2/search?query=银行&bounds=39.915,116.404,39.975,116.414&output=json&ak={您的密钥}" # requesting pois in one rectangle area
		4 == query_type: "http://api.map.baidu.com/place/v2/detail?uid=435d7aea036e54355abbbcc8&output=json&scope=2&ak=您的密钥" # requesting pois at one location with detailed address
		"""
        base_uri = self.base_uri
        if 4 == query_type:
            base_uri = self.base_uri.replace("/search", "/detail")
        if 3 != query_type:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, argumnet query_type can ONLY be 3; 1/2/4 are under developing"
            )
            sys.exit(6)

        today = datetime.datetime.now().strftime("%Y%m%d")
        urls = {}
        all_categories = []
        if 1 == self.category_level:
            all_categories = self.classification_dict.keys()
        elif 2 == self.category_level:
            for index, level1key in enumerate(self.classification_dict):
                all_categories += self.classification_dict[level1key]
        elif 3 == self.category_level:
            all_categories = list(self.classification_dict.keys())
            for index, level1key in enumerate(self.classification_dict):
                all_categories += self.classification_dict[level1key]
        for index, city in enumerate(self.center_dict):
            # read this city's log file to exclude today's requested points
            requested = []
            finished_xy_filename = f"{city}_finished_xy_query_points_{today}.log"
            finished_file_path = os.path.join(self.log_dir,
                                              finished_xy_filename)
            if exclude_requested_today and os.path.isfile(finished_file_path):
                with open(finished_file_path, "r",
                          encoding="utf-8") as log_file:
                    for item in log_file.readlines():
                        value = item.strip().split(",")
                        if 3 == len(value):
                            requested.append(
                                f"{value[0]}___{value[1]}___{value[2]}")

            excluded = []
            for category in all_categories:
                city_category_dict = {}
                for item in self.center_dict[city]:
                    lng, lat, ok0, max_value, max_timestamp = item
                    requested_key = f"{lng}___{lat}___{category}"
                    if exclude_requested_today and 0 < len(requested):
                        if requested_key in requested:
                            excluded.append(requested_key)
                            continue
                    lng, lat = float(lng), float(lat)
                    lng_min = float("%.3f" % (lng - 0.5 * self.lng_delta))
                    lng_max = float("%.3f" % (lng + 0.5 * self.lng_delta))
                    lat_min = float("%.3f" % (lat - 0.5 * self.lat_delta))
                    lat_max = float("%.3f" % (lat + 0.5 * self.lat_delta))
                    bounds = f"{lat_min},{lng_min},{lat_max},{lng_max}"
                    city_category_dict[
                        requested_key] = f"{base_uri}?query={category}&page_size={self.page_size}&page_num=0&scope={self.request_scope}&bounds={bounds}&output={self.output_file_format}&ak={self.return_next_ak()}"
                if 0 < len(city_category_dict):
                    urls[f"{city}___{category}"] = city_category_dict
            if 0 < len(excluded):
                self.logger.info(
                    f"{len(excluded)} requests have been excluded in City {city}: ({excluded})"
                )
        return urls

    def make_point_request_from_500_by_500(self, url_fragment_list=[]):
        url = url_fragment_list[-1]
        bounds = ""
        xy_list = []
        new_center = {}
        if 0 < len(url):
            result_dict = parse.parse_qs(url)
            if 0 < len(result_dict) and "bounds" in result_dict.keys():
                bounds = result_dict["bounds"][0]
        if 0 < len(bounds):
            xy_list = bounds.split(",")
        if 4 == len(xy_list):
            # bounds=23.091,113.306,23.097,113.313
            y_min = float(xy_list[0])
            x_min = float(xy_list[1])
            y_max = float(xy_list[2])
            x_max = float(xy_list[3])
            delta_x = int(1000 * (x_max - x_min))
            delta_y = int(1000 * (y_max - y_min))
            if 0 < delta_x and 0 < delta_y:
                for i in range(delta_x):
                    for j in range(delta_y):
                        key_x = "%.6f" % (x_min + i / 1000)
                        key_y = "%.5f" % (y_min + j / 1000)
                        x = "%.3f" % (x_min + i / 1000)
                        y = "%.3f" % (y_min + j / 1000)
                        x_plus_1 = "%.3f" % (x_min + (i + 1) / 1000)
                        y_plus_1 = "%.3f" % (y_min + (j + 1) / 1000)
                        new_center[
                            f"{key_x}___{key_y}"] = f"{y},{x},{y_plus_1},{x_plus_1}"
        return new_center
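        # Sketch of the subdivision performed above (sample bounds from the
        # comment): 23.091,113.306,23.097,113.313 is cut into int(1000*span)
        # steps per axis, i.e. roughly 7 x 6 cells of 0.001 x 0.001 degrees,
        # each keyed by its south-west corner ("x___y") and mapped to the
        # "y,x,y+0.001,x+0.001" bounds string consumed by do_makeup_requests.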

    def make_16_request_from_2km_by_2km(self, url_fragment_list=[]):
        new_center = {}
        lng, lat = float(url_fragment_list[0]), float(url_fragment_list[1])
        if 0 < lng and 0 < lat:
            center_xy = [
                lng,
                lat,
            ]
            new_center = self.get_center_xys_from_single_xy(
                center_xy, half_edge_seperator=2, query_type3edge=1.1)
        return new_center

    def get_center_xys_from_single_xy(self,
                                      center_xy=[],
                                      half_edge_seperator=2,
                                      query_type3edge=1.1):
        new_center = {}
        if not isinstance(half_edge_seperator, int):
            return new_center
        lng, lat = float(center_xy[0]), float(center_xy[1])
        old_half_edge = float("%.3f" % (query_type3edge / 1.1))
        xy_point_list = []
        new_span = old_half_edge / half_edge_seperator
        lng_delta = 0.01167 * new_span
        lat_delta = 0.009009009 * new_span
        x_minimal = "%.6f" % (lng - (half_edge_seperator - 0.5) * lng_delta)
        y_minimal = "%.5f" % (lat - (half_edge_seperator - 0.5) * lat_delta)
        for i in range(half_edge_seperator * 2):
            for j in range(half_edge_seperator * 2):
                x = "%.6f" % (lng + i * lng_delta)
                y = "%.5f" % (lat + j * lat_delta)
                key = f"{x}___{y}"
                xy_point_list.append(key)
        for one in xy_point_list:
            temp_list = one.split("___")
            if 2 == len(temp_list):
                bound_string = self.get_small_bounds(
                    center_xy=temp_list, query_type3edge=new_span *
                    1.2)  # for smaller rectangle, we use 1.2
                if 0 < len(bound_string):
                    new_center[one] = bound_string
        return new_center
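    # Sketch of get_center_xys_from_single_xy above: with the defaults
    # half_edge_seperator == 2 and query_type3edge == 1.1 it returns a
    # 4 x 4 grid of sub-centres around center_xy, each mapped to the smaller
    # bounds string produced by get_small_bounds; this is what
    # make_16_request_from_2km_by_2km relies on for its 16 makeup requests.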

    def get_small_bounds(self, center_xy=[], query_type3edge=0.55):
        lng_delta = 0.01167 * query_type3edge
        lat_delta = 0.009009009 * query_type3edge
        lng, lat = float(center_xy[0]), float(center_xy[1])
        lng_min = float("%.3f" % (lng - 0.5 * lng_delta))
        lng_max = float("%.3f" % (lng + 0.5 * lng_delta))
        lat_min = float("%.3f" % (lat - 0.5 * lat_delta))
        lat_max = float("%.3f" % (lat + 0.5 * lat_delta))
        if 0 < lng_min and 0 < lng_max and 0 < lat_min and 0 < lat_max:
            return f"{lat_min},{lng_min},{lat_max},{lng_max}"
        else:
            return ""

    def do_makeup_requests(self, query_type=3, bout=1, single_line=""):
        all_city_dict = {}
        if bout not in [
                1,
                2,
        ] or 3 != query_type:
            return all_city_dict

        base_uri = self.base_uri
        if 4 == query_type:
            base_uri = self.base_uri.replace("/search", "/detail")
        if 0 == len(single_line):
            today = datetime.datetime.now().strftime("%Y%m%d")
            empty_file_dir = os.path.join(self.root_path, self.name,
                                          self.output_folder_name,
                                          f"{today}waiting4next")
            file_list = os.listdir(empty_file_dir)
            all_lines = []
            for one_file in file_list:
                try:
                    this_file_path = os.path.join(empty_file_dir, one_file)
                    with open(this_file_path, 'r', encoding='utf-8') as f:
                        for item in f.readlines():
                            all_lines.append(item)
                except Exception as ex:
                    all_lines = []
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, cannot read xy_list file ({this_file_path}). Exception = {ex}"
                    )
        else:
            all_lines = [single_line]
        new_center = {}

        for item in all_lines:
            url_fragment_list = item.strip().split("___")
            if 1 == bout:
                new_center = self.make_16_request_from_2km_by_2km(
                    url_fragment_list)
            elif 2 == bout:
                new_center = self.make_point_request_from_500_by_500(
                    url_fragment_list)
            category = url_fragment_list[2]
            city = url_fragment_list[3]
            city_category_key = f"{city}___{category}"
            for index, key in enumerate(new_center):
                temp_list = key.split("___")
                if 2 == len(temp_list):
                    requested_key = f"{temp_list[0]}___{temp_list[1]}___{category}"
                    bounds = new_center[key]
                    temp_dict = {}
                    if city_category_key in all_city_dict.keys():
                        temp_dict = all_city_dict[city_category_key]
                    temp_dict[
                        requested_key] = f"{base_uri}?query={category}&page_size={self.page_size}&page_num=0&scope={self.request_scope}&bounds={bounds}&output={self.output_file_format}&ak={self.return_next_ak()}"
                    all_city_dict[city_category_key] = temp_dict
        return all_city_dict

    def start_requests(self):
        self.init_self_attributes()
        self.check_dirs_and_files()
        self.read_classification_file()
        self.center_dict = {}
        for city in self.city_list:
            center_list = self.read_xy_file(city=city)
            if 0 < len(center_list):
                self.center_dict[city] = center_list
        if 1 > len(self.center_dict):
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, you already requested all xy points in {self.city_list} today."
            )
            sys.exit(4)

        if "REDO_OVER400_POIS" == self.run_purpose:
            url_dict = self.do_makeup_requests(query_type=3,
                                               bout=self.bout,
                                               single_line="")
        else:
            url_dict = self.make_request_uris(query_type=3,
                                              exclude_requested_today=True)
        callback_func = self.parse_json
        if self.debug:
            callback_func = self.do_nothing_for_debug

        meta_dict = {}
        for index, key in enumerate(url_dict):
            temp_list = key.split("___")
            if 2 == len(temp_list):
                meta_dict = {
                    "city": temp_list[0],
                    "category": temp_list[1],
                    "page_num": 0,
                }
                for inner_index, center_xy in enumerate(url_dict[key]):
                    one_url = url_dict[key][center_xy]
                    temp_list = center_xy.split("___")
                    if 3 == len(temp_list):
                        meta_dict["center_x"] = temp_list[0]
                        meta_dict["center_y"] = temp_list[1]
                        if "REDO_OVER400_POIS" == self.run_purpose:
                            self.logger.info(
                                f"requesting {one_url}; meta = {meta_dict}")
                        else:
                            self.logger.info(f"requesting {one_url}")
                        yield scrapy.Request(url=one_url,
                                             callback=callback_func,
                                             meta=meta_dict,
                                             dont_filter=True)
                    else:
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {center_xy} error in url_dict[key] ({len(url_dict[key])})"
                        )
            else:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {key} error in url_dict ({len(url_dict)})"
                )
                continue

    def do_nothing_for_debug(self, response):
        pass

    def parse_json(self, response):
        status, message = self.save_json(response=response, page_type="json")
        callback_func = self.parse_json
        url = response.url
        today = datetime.datetime.now().strftime("%Y%m%d")
        if status in [
                -1,
                -2,
                -3,
        ]:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, wrong parameter(s) have passed to self.save_json!"
            )
        elif 404 == status:
            meta_dict = self.request_counter_and_action(response=response)
            if 0 < meta_dict["request_counter"]:
                yield scrapy.Request(url=response.url,
                                     callback=callback_func,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif status in [
                2,
                3,
        ]:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to request {response.url} using wrong parameters or verification error(s)"
            )
        elif status in self.bad_ak_status:
            self.logger.info(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to request {response.url}; run out of Baidu Ak quota or wrong settings; status code {status}"
            )
            temp_list = message.split("___")
            if 4 == len(temp_list):
                bad_ak = temp_list[3]
                if 0 < len(bad_ak) and bad_ak in self.baidu_ak_list:
                    self.baidu_ak_list.remove(bad_ak)
                if 1 > len(self.baidu_ak_list):
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, run out of all Baidu ak today!"
                    )
                    sys.exit(7)
                else:
                    yield scrapy.Request(url=response.url,
                                         callback=callback_func,
                                         meta=response.meta,
                                         dont_filter=True)
            else:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, self.save_json did NOT pass the bad ak back!"
                )
        elif 0 == status:
            temp_list = message.split("___")
            if 3 == len(temp_list):
                page_num = int(temp_list[1])
                total = int(temp_list[2])
                will_divide_request = False
                if 400 <= total:
                    meta_dict = {}
                    if hasattr(response, "meta"):
                        meta_dict = response.meta
                    needed_keys = [
                        "city",
                        "center_x",
                        "center_y",
                    ]
                    city = meta_dict["city"] if "city" in meta_dict.keys(
                    ) else ""
                    center_x = meta_dict[
                        "center_x"] if "center_x" in meta_dict.keys() else ""
                    center_y = meta_dict[
                        "center_y"] if "center_y" in meta_dict.keys() else ""
                    query = meta_dict[
                        "category"] if "category" in meta_dict.keys() else ""
                    content = f"{center_x}___{center_y}___{query}___{city}___{page_num}___{total}___{url}"

                    # begin to request 16 sub-rectangles when 1 == bout, or 0.001-degree cells when 2 == bout
                    bout = int(
                        meta_dict["bout"]) if "bout" in meta_dict.keys() else 1
                    if 3 > bout:
                        url_dict = self.do_makeup_requests(query_type=3,
                                                           bout=bout,
                                                           single_line=content)
                        for index, key in enumerate(url_dict):
                            temp_list = key.split("___")
                            if 2 == len(temp_list):
                                meta_dict = {
                                    "city": temp_list[0],
                                    "category": temp_list[1],
                                    "page_num": 0,
                                    "bout": bout + 1,
                                }
                                for inner_index, center_xy in enumerate(
                                        url_dict[key]):
                                    one_url = url_dict[key][center_xy]
                                    temp_list = center_xy.split("___")
                                    if 3 == len(temp_list):
                                        meta_dict["center_x"] = temp_list[0]
                                        meta_dict["center_y"] = temp_list[1]
                                        self.logger.info(
                                            f"requesting {one_url}; meta = {meta_dict}"
                                        )
                                        yield scrapy.Request(
                                            url=one_url,
                                            callback=self.parse_json,
                                            meta=meta_dict,
                                            dont_filter=True)
                                    else:
                                        self.logger.error(
                                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {center_xy} error in url_dict[key] ({len(url_dict[key])})"
                                        )
                            else:
                                self.logger.error(
                                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {key} error in url_dict ({len(url_dict)})"
                                )
                                continue
                        will_divide_request = True
                    else:
                        xy_over400_filename = f"{city}_over400_xy_{today}.log"
                        self.write_log(content=content,
                                       logfilename=xy_over400_filename,
                                       content_only=True)
                        will_divide_request = False
                        # directly save this json file even if it contains more than 400 POIs
                if not will_divide_request:
                    json_dict = json.loads(response.body)
                    result_list = json_dict[
                        "results"] if "results" in json_dict.keys() else []
                    this_page_pois_list = []
                    for one_poi in result_list:
                        this_poi_dict = self.process_one_baidu_poi_json_dict(
                            json_dict=one_poi)
                        this_page_pois_list.append(this_poi_dict)

                    housekeeping_dict = {}
                    meta_dict = {}
                    if hasattr(response, "meta"):
                        meta_dict = response.meta
                    for one_key in self.housekeeping_key_list:
                        housekeeping_dict[one_key] = meta_dict[
                            one_key] if one_key in meta_dict.keys() else ""
                    housekeeping_dict["query"] = meta_dict[
                        "category"] if "category" in meta_dict.keys() else ""

                    # yield to pipeline
                    try:
                        for one_poi in this_page_pois_list:
                            loader = ItemLoader(item=PoibaiduItem(),
                                                response=response)
                            loader = self.load_items_into_loader(
                                loader=loader,
                                one_poi_dict=one_poi,
                                housekeeping_dict=housekeeping_dict,
                                url=url)
                            yield loader.load_item()
                    except Exception as ex:
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                        )

                    last_page = math.ceil(total / self.page_size) - 1
                    if page_num < last_page and hasattr(response, "meta"):
                        meta_dict["page_num"] = page_num + 1
                        url = url.replace(f"page_num={page_num}",
                                          f"page_num={page_num+1}")
                        yield scrapy.Request(url=url,
                                             callback=callback_func,
                                             meta=meta_dict,
                                             dont_filter=True)
                    elif page_num < last_page:
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, missing response.meta while requesting {url} (last_page == {last_page})"
                        )
                    elif page_num >= last_page:
                        # this branch could simply be else:, but we keep the explicit condition
                        # write separate log before yielding to pipeline
                        center_x = meta_dict[
                            "center_x"] if "center_x" in meta_dict.keys(
                            ) else ""
                        center_y = meta_dict[
                            "center_y"] if "center_y" in meta_dict.keys(
                            ) else ""
                        query = housekeeping_dict["query"]
                        if 0 < len(meta_dict["city"]) and 0 < len(
                                center_x) and 0 < len(center_y):
                            city = meta_dict["city"]
                            finished_xy_filename = f"{city}_finished_xy_query_points_{today}.log"
                            self.write_log(
                                content=f"{center_x},{center_y},{query}",
                                logfilename=finished_xy_filename,
                                content_only=True)
                            # to keep I/O small, we do NOT include page_num and total_page here; the Baidu API server seems more robust than QQ's
            else:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, self.save_json did NOT pass correct message ({message})!"
                )

    def load_items_into_loader(self,
                               loader=None,
                               one_poi_dict={},
                               housekeeping_dict={},
                               url=""):
        # record housekeeping fields
        for index, key in enumerate(housekeeping_dict):
            loader.add_value(key, housekeeping_dict[key])
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get("BOT_NAME"))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date",
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))

        # record all fields for database table(s)
        loader.add_value("content", str(one_poi_dict))
        loader.add_value("page_type", "json")

        return loader

    def print_attributes(self):
        for one in dir(self):
            if not callable(getattr(self, one)) and -1 == one.find("__"):
                self.logger.info(f"{one} ==> {getattr(self, one)}")

    def read_classification_file(self):
        classification_dict = {}
        classification_file_path = os.path.join(self.input_dir,
                                                self.classification_filename)
        try:
            with open(classification_file_path, 'r', encoding='utf-8') as f:
                for item in f.readlines():
                    temp_list = item.strip().split(":")
                    if 2 == len(temp_list):
                        value_list = temp_list[1].split(
                            "、") if -1 < temp_list[1].find("、") else [
                                temp_list[1]
                            ]
                        classification_dict[temp_list[0]] = value_list
                    else:
                        raise FileFormatException
        except FileFormatException as ex:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {item} cannot be splitted into 2 by a colon. Wrong format in File {classification_file_path}. Exception = {ex}"
            )
            sys.exit(3)
        except Exception as ex:
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, cannot read classification file ({classification_file_path}). Exception = {ex}"
            )
            sys.exit(3)
        self.classification_dict = classification_dict

    def extract_response_meta_to_dict(self, response=None):
        city = ""
        category = ""
        page_num = -1
        if hasattr(response, "meta") and "city" in response.meta.keys():
            city = response.meta["city"]
        if hasattr(response, "meta") and "category" in response.meta.keys():
            category = response.meta["category"]
        if hasattr(response, "meta") and "page_num" in response.meta.keys():
            page_num = int(response.meta["page_num"])
        return (city, category, page_num)

    def save_json(self, response=None, page_type="json"):
        if response is None or not hasattr(response, "body") or not hasattr(
                response, "url"):
            return (-1, f"wrong response format")
        city = ""
        category = ""
        page_num = -1
        status = 404
        total = "0"
        bad_baidu_ak = False
        uri_query_dict = {}
        if "json" == page_type:
            json_dict = json.loads(response.body)
            status = json_dict["status"] if "status" in json_dict.keys(
            ) else "404"
            total = json_dict["total"] if "total" in json_dict.keys() else "0"
            only_these_keys = [
                "page_num",
                "query",
                "bounds",
            ]
            if int(status) in self.bad_ak_status:
                only_these_keys.append("ak")
                bad_baidu_ak = True
            uri_query_dict = self.parse_uri_query_to_dict(
                url=response.url,
                only_these_keys=only_these_keys,
                map_query_english=True)
            city, category, page_num = self.extract_response_meta_to_dict(
                response=response)
            page_num = uri_query_dict["page_num"]
            category = uri_query_dict["query"]
            bounds = uri_query_dict["bounds"]
            file_path = os.path.join(
                self.json_dir,
                f"{city}___{category}___{bounds}___{page_num}___{total}___{status}.json"
            )
            status = int(status)
        else:
            return (-2, f"page_type can ONLY be json")

        try:
            with open(file_path, 'wb') as f:
                f.write(response.body)
        except Exception as ex:
            self.logger.warning(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, failed to write response.body from {response.url}"
            )
            return (-3, f"failed to write json file")
        else:
            return_msg = f"# {page_num} of {total} pages is requested___{page_num}___{total}"
            if bad_baidu_ak:
                bad_ak = uri_query_dict["ak"] if "ak" in uri_query_dict.keys(
                ) else ""
                return_msg = f"# {page_num} of {total} pages is requested___{page_num}___{total}___{bad_ak}"
            return (status, return_msg)

    def request_counter_and_action(self, response=None):
        return_dict = {}
        if hasattr(response, "meta"):
            return_dict = response.meta
        request_counter = 0
        request_pointer = 0
        if hasattr(response,
                   "meta") and "request_pointer" in response.meta.keys():
            request_pointer = int(response.meta["request_pointer"])
            del return_dict["request_pointer"]
        if hasattr(response,
                   "meta") and "request_counter" in response.meta.keys():
            request_counter = int(response.meta["request_counter"])
            del return_dict["request_counter"]

        if request_pointer < len(self.maximal_request_times):
            self.logger.info(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, request_counter == {request_counter}; request_pointer == {request_pointer} for the last request from {response.url}"
            )
            if request_counter < self.maximal_request_times[request_pointer]:
                return_dict["request_counter"] = request_counter + 1
                return_dict["request_pointer"] = request_pointer
                return return_dict
            else:
                return_dict["request_counter"] = 1
                return_dict["request_pointer"] = request_pointer + 1
                return return_dict
        else:
            today = datetime.datetime.now().strftime("%Y%m%d")
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {self.maximal_request_times} requests have been sent but ONLY empty response.body received from {response.url}"
            )
            self.write_log(content=response.url,
                           logfilename=f"missed_uris{today}.txt",
                           content_only=True)
            return_dict["request_counter"] = -1
            return_dict["request_pointer"] = request_pointer
            return return_dict

    def process_one_baidu_poi_json_dict(self, json_dict={}):
        return_dict = {}
        key_list = [
            "name",
            "address",
            "province",
            "city",
            "area",
            "telephone",
            "uid",
            "street_id",
            "detail",
        ]

        for one_key in key_list:
            if one_key in json_dict.keys():
                return_dict[one_key] = json_dict[one_key]
                has_item = True
            else:
                return_dict[one_key] = ""

        # process "detail_info", "tag" and "type"
        if "detail_info" in json_dict.keys():
            return_dict["detail_info"] = str(json_dict["detail_info"])
            if "type" in json_dict["detail_info"].keys():
                return_dict["type"] = json_dict["detail_info"]["type"]
            if "tag" in json_dict["detail_info"].keys():
                return_dict["tag"] = json_dict["detail_info"]["tag"]
        else:
            return_dict["detail_info"] = str({})
        if "type" not in return_dict.keys():
            return_dict["type"] = ""
        if "tag" not in return_dict.keys():
            return_dict["tag"] = ""

        # process "lat" and "lng"
        if "location" in json_dict.keys():
            location = json_dict["location"]
            if "lat" in location.keys() and "lng" in location.keys():
                return_dict["lat"] = location["lat"]
                return_dict["lng"] = location["lng"]
        if "lat" not in return_dict.keys():
            return_dict["lat"] = ""
        if "lng" not in return_dict.keys():
            return_dict["lng"] = ""

        return return_dict
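    # Sketch of process_one_baidu_poi_json_dict above (made-up values): a Baidu
    # place result like {"name": "...", "location": {"lat": 23.1, "lng": 113.3},
    # "detail_info": {"tag": "...", "type": "..."}} is flattened into a dict
    # where "lat"/"lng" are promoted from "location", "tag"/"type" are lifted
    # out of "detail_info", and any missing key defaults to "".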

    def write_log(self, content=None, logfilename=None, content_only=False):
        if content is not None and 0 < len(content):
            today = datetime.datetime.now().strftime("%Y%m%d")
            if logfilename is None:
                logfilename = f"{self.name}{today}.log"
            try:
                with open(os.path.join(self.log_dir, logfilename),
                          'a',
                          encoding='utf-8') as f:
                    if content_only:
                        info = f"{str(content)}\n"
                    else:
                        info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
                    f.write(info)
                return 1
            except Exception as ex:
                return 0
        return -1
Esempio n. 10
0
class FangSpider(scrapy.Spider):
    """
		sys.exit code == 1 # wrong or missing RUN_PURPOSE
		sys.exit code == 2 # wrong or missing CRAWLED_DIR, SAVED_DETAIL_HTML, or SAVED_GAODE_JASON
		On 20190517 Peter rewrote this spider to fix bugs
	"""
    name = "fang"

    csv_filename = None
    root_path = ""
    run_purpose = None
    overwrite_today = ""
    crawled_dir = ""
    detail_html_dir = ""
    gaode_json_dir = ""
    csv_file_path = None
    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        if self.run_purpose is None:
            self.logger.error(
                f"missing RUN_PURPOSE ({self.run_purpose}) setting")
            sys.exit(1)
        self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="")
        self.debug = self.settings.get(
            name="PROJECT_DEBUG",
            default=False)  # whether this run is for debugging
        if not hasattr(self, "overwrite_today") or 1 > len(
                self.overwrite_today) or self.overwrite_today is None:
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # set all paths
        self.crawled_dir = self.settings.get(name='CRAWLED_DIR', default="")
        self.detail_html_dir = self.settings.get(name='SAVED_DETAIL_HTML',
                                                 default="")
        self.gaode_json_dir = self.settings.get(name='SAVED_GAODE_JASON',
                                                default="")
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"fang_zu{self.overwrite_today}.csv")

        if 1 > len(self.crawled_dir) or 1 > len(
                self.detail_html_dir) or 1 > len(self.gaode_json_dir):
            self.logger.info(
                f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_DETAIL_HTML ({self.detail_html_dir}), or SAVED_GAODE_JASON ({self.gaode_json_dir}) setting(s)"
            )
            sys.exit(2)

    def make_dirs(self):
        # even when the cache is used, we save all html files; make these 3 dirs if they do not exist
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.detail_html_dir):
            os.makedirs(self.detail_html_dir)
        if not os.path.isdir(self.gaode_json_dir):
            os.makedirs(self.gaode_json_dir)

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()

        if "READ_HTML" == self.run_purpose:
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            city_list = self.settings.get("CITY_LIST", default=[])
            number_day_of_this_year = datetime.datetime.now().timetuple(
            ).tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(city_list):
                seperate_into_days = len(city_list)
            batch_count = math.ceil(len(city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
            urls = []
            for index, city in enumerate(city_list):
                if (start_index < index) and (index < end_index):
                    urls.append(f"https://{city}.zu.fang.com/")
            for url in urls:
                yield scrapy.Request(url=url, callback=self.parse)
        else:
            urls = [
                'http://quotes.toscrape.com/page/1/',
                'http://quotes.toscrape.com/page/2/',
                'http://quotes.toscrape.com/page/3/',
                'http://quotes.toscrape.com/page/4/',
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)
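    # Batching sketch for the PRODUCTION_RUN branch above (hypothetical
    # numbers): with 9 cities in CITY_LIST and CRAWL_BATCHES == 3,
    # batch_count == 3 and today_batch == day_of_year % 3, so on a given day
    # only the cities with index in [today_batch*3, today_batch*3 + 2] are
    # requested; the full list is covered every 3 days.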

    def do_nothing_for_debug(self, response):
        self.logger.info(
            f"inside Method do_nothing_for_debug of Class FangSpider. url = {response.url}"
        )

    def read_and_parse(self, response):
        file_list = os.listdir(self.detail_html_dir)
        for one_file in file_list:
            if -1 < one_file.find("index"):
                self.logger.info(f"ignoring {one_file}")
            else:
                temp_list = one_file.split("_")
                apt_id = 0
                city_name = ""
                if 1 < len(temp_list):
                    apt_id = temp_list[1]
                    city_name = temp_list[0]
                url = f"https://{city_name}.zu.fang.com/house/"
                html_file = os.path.join(self.detail_html_dir, one_file)
                if os.path.isfile(html_file):
                    doc = None
                    with open(html_file, 'rb') as f:
                        doc = f.read().decode('gb2312', 'ignore')
                    if doc is None:
                        self.logger.error(
                            f"Error: cannot read html file {html_file}.")
                        continue
                    response = Selector(text=doc, type="html")
                    text = self.parse_response_field(response=response,
                                                     city_name=city_name,
                                                     apt_id=apt_id)
                    try:
                        response_for_items = TextResponse(
                            url=url,
                            status=200,
                            body=bytes(doc, encoding="utf-8"))
                        loader = ItemLoader(item=FangItem(),
                                            response=response_for_items)
                        loader = self.load_items_into_loader(loader=loader,
                                                             text=text,
                                                             url=url)
                        yield loader.load_item()
                    except Exception as ex:
                        print(
                            f"Error happened during parsing in Method read_and_parse of Class FangSpider. Exception = {ex}"
                        )

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value('content', str(text))  # , encoding="utf-8"
        loader.add_value('page_type', "detailed")

        # record housekeeping fields
        loader.add_value('url', url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date',
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
        return loader

    def parse_response_field(self, response=None, city_name="", apt_id=""):
        text = {}
        if response is None:
            return text
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return text
        address_list = response.xpath(
            '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]')
        address = address_list[0].xpath(
            '//div[@class="rcont"]/a/text()').extract_first(
                default="") if 0 < len(address_list) else ""
        location_list = response.xpath(
            '//div[@class="trl-item2 clearfix"]/div[@class="rcont address_zf"]/a/text()'
        ).extract()
        if location_list is None or 1 > len(location_list):
            location_list = response.xpath(
                '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[@class="link-under"]/text()'
            ).extract()
            address_list = response.xpath(
                '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[not(@class)]/text()'
            ).extract()
            address = ""
            if 0 < len(address_list):
                address = ";".join(address_list)
        location_list.reverse()
        location = ""
        for one_location in location_list:
            location += one_location
        if 0 < len(address):
            address = CommonClass.clean_string(string=address,
                                               char_to_remove=[
                                                   '\r',
                                                   '\n',
                                                   '\t',
                                                   '"',
                                               ])
        if 0 < len(location):
            location = CommonClass.clean_string(string=location,
                                                char_to_remove=[
                                                    '\r',
                                                    '\n',
                                                    '\t',
                                                    '"',
                                                ])
        rent_div = response.xpath(
            '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1 rel"]'
        )
        if rent_div is None or 1 > len(rent_div):
            rent_div = response.xpath(
                '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1"]'
            )
        temp = rent_div.css('::text').extract()
        rent_list = []
        for one_rent in temp:
            temp2 = one_rent.replace("\n", " ")
            temp2 = temp2.strip()
            if 0 < len(temp2):
                rent_list.append(temp2)
        while "" in rent_list:
            rent_list.remove("")
        rent = ""
        if 1 < len(rent_list):
            rent = rent_list[0] + rent_list[1]
        rent_type_div = response.xpath(
            '//div[@class="trl-item1 w146"]/div[@class="tt"]')
        rent_type = rent_type_div[0].css('div::text').extract_first(
            default="") if 0 < len(rent_type_div) else ""
        facing = rent_type_div[1].css('div::text').extract_first(
            default="") if 1 < len(rent_type_div) else ""
        apt_type_div = response.xpath(
            '//div[@class="trl-item1 w182"]/div[@class="tt"]')
        apt_type = apt_type_div[0].css('div::text').extract_first(
            default="") if 0 < len(apt_type_div) else ""
        floor = apt_type_div[1].css('div::text').extract_first(
            default="") if 1 < len(apt_type_div) else ""
        area_div = response.xpath(
            '//div[@class="trl-item1 w132"]/div[@class="tt"]')
        area = area_div[0].css('div::text').extract_first(
            default="") if 0 < len(area_div) else ""
        decorate = area_div[1].css('div::text').extract_first(
            default="") if 1 < len(area_div) else ""
        update_date_spans = response.xpath('//p[@class="gray9 fybh-zf"]/span')
        update_date = ""
        if 1 < len(update_date_spans):
            update_date = update_date_spans[1].css("::text").extract_first(
                default="")
        text = {
            "rent_id": f"{city_name}_{apt_id.strip()}_{self.overwrite_today}",
            "location": location.strip(),
            "address": address.strip(),
            "rent": rent.strip(),
            "rent_type": rent_type.strip(),
            "facing": facing.strip(),
            "apt_type": apt_type.strip(),
            "floor": floor.strip(),
            "area": area.strip(),
            "decorate": decorate.strip(),
            "update_date": update_date.strip(),
        }
        return text

    def parse_one_detail_page(self, response=None, apt_id=0, city_name=""):
        self.logger.info(
            f"inside Method parse_one_detail_page (todo...) of Class FangSpider. url = {response.url}; apt_id = {apt_id}; city_name = {city_name}"
        )

    def url_contains_error(self, result_obj_path=""):
        if not isinstance(result_obj_path, str) or 1 > len(result_obj_path):
            return False
        path_fragment_list = result_obj_path.split("/")
        if 1 > len(path_fragment_list):
            return False

        # https://sz.esf.fang.com/staticsearchlist/Error/Error404?aspxerrorpath=/house-a013057/i330/i330
        for one in path_fragment_list:
            if -1 < one.find("Error") or -1 < one.find(
                    "Error404") or -1 < one.find("staticsearchlist"):
                self.logger.info(
                    f"Error! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}"
                )
                return True

        # http://search.fang.com/captcha-verify/redirect?h=https://wuxi.zu.fang.com/chuzu/3_166962621_1.htm
        for one in path_fragment_list:
            if -1 < one.find("captcha") or -1 < one.find("verify"):
                self.logger.info(
                    f"Need captcha-verify! Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {result_obj_path}"
                )
                return True

        return False

    def parse(self, response):
        url = response.url
        # detailed page:		https://gz.zu.fang.com/chuzu/3_238110671_1.htm?channel=3,8
        # list page (first):	https://gz.zu.fang.com/
        # list page (next):		https://gz.zu.fang.com/house/i32/
        result_obj = parse.urlparse(url)
        has_url_error = self.url_contains_error(
            result_obj_path=result_obj.path)
        if has_url_error:
            return False
        detail_page = False
        now = datetime.datetime.now()

        url_list = url.split("/")
        while "" in url_list:
            url_list.remove("")
        html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S"))
        today = f'{now.strftime("%Y%m%d")}'
        apt_id = ""
        city_name = ""
        if 0 < len(url_list):
            last_part = url_list[len(url_list) - 1]
            temp_list = url_list[1].split(
                "."
            )  # empty "" elements have been removed; url_list[1] looks like "gz.zu.fang.com" # not robust code
            city_name = temp_list[0]
            if -1 < last_part.find(".htm"):
                detail_page = True
                temp = last_part.split("_")
                if 1 < len(temp):
                    apt_id = f"{temp[1]}"
                    html_filename = f"{city_name}_{apt_id}_{today}.html"
            elif -1 < last_part.find("fang.com"):
                html_filename = f"{city_name}_index1_{today}.html"
            else:
                page = last_part[2:]
                html_filename = f"{city_name}_index{page}_{today}.html"
        html_file_path = os.path.join(self.detail_html_dir, html_filename)
        with open(html_file_path, 'wb') as f:
            f.write(response.body)
        if detail_page:
            text = self.parse_response_field(response=response,
                                             city_name=city_name,
                                             apt_id=apt_id)
            try:
                loader = ItemLoader(item=FangItem(), response=response)
                loader = self.load_items_into_loader(loader=loader,
                                                     text=text,
                                                     url=url)
                yield loader.load_item()
            except Exception as ex:
                print(
                    f"Error happened during parsing in Method read_and_parse of Class FangSpider. Exception = {ex}"
                )
        else:
            url_list = url.split("fang.com")
            base_url = ""
            if 0 < len(url_list):
                base_url = f"{url_list[0]}fang.com"
            total_pages = response.xpath(
                '//div[@class="fanye"]/a/@href').extract()
            if 0 < len(total_pages):
                last_page = total_pages[len(total_pages) - 1]  # /house/i33/
                last_page = last_page[9:]
                last_page = last_page.strip('/')
                if last_page is not None and 0 < len(last_page):
                    for i in range(int(last_page) - 1):
                        next_url = base_url + f'/house/i3{i + 2}/'
                        self.logger.info(
                            f"\ngoing to the next list page at {next_url}")
                        yield response.follow(next_url, self.parse)
            apartments = response.xpath(
                '//dl[@class="list hiddenMap rel"]/dt[@class="img rel floatl"]'
            )
            for one_apt in apartments:
                next_url = base_url + one_apt.css(
                    "a::attr(href)").extract_first(default='')
                self.logger.info(
                    f"\ngoing to the next detail page at {next_url}")
                yield response.follow(next_url, self.parse)
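A minimal standalone sketch of the day-of-year batching used in start_requests above (not part of FangSpider; the function name, defaults, and sample cities are hypothetical): the city list is split into CRAWL_BATCHES groups of roughly equal size, and each calendar day only the group selected by day-of-year modulo the batch count is crawled.

# illustrative sketch only -- mirrors the batching arithmetic in start_requests
import datetime
import math


def todays_cities(city_list, batches=3, day_of_year=None):
    """Return the slice of city_list assigned to today's crawl."""
    if day_of_year is None:
        day_of_year = datetime.datetime.now().timetuple().tm_yday
    batches = min(batches, len(city_list)) or 1       # never more batches than cities
    batch_size = math.ceil(len(city_list) / batches)  # cities crawled per day
    today_batch = day_of_year % batches               # which batch runs today
    return city_list[today_batch * batch_size:(today_batch + 1) * batch_size]


# e.g. todays_cities(["gz", "sz", "wuxi", "bj"], batches=2) alternates between
# ["gz", "sz"] and ["wuxi", "bj"] on consecutive days.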
Esempio n. 11
0
class DianpingSpider(scrapy.Spider):
    """
		sys.exit code == 1 # wrong or missing self.run_purpose
		sys.exit code == 2 # UnicodeDecodeError
		sys.exit code == 3 # Wrong file name
		On 20190517 Peter deleted about 500 lines of code that were no longer useful
	"""
    name = 'dianping'

    temp_str = ""  # temporary string used by Method replace_encoded()
    class_mapping_dict = {}  # temporary dict having structure like this:
    # {
    # 	'd21b952fda06ad9439a0c92a13aa2c56': {
    # 		'class_mapping': {'pufzt4':'屋', 'btk1j2': '成', and the like, },
    # 		'all_keys': ['puf', 'cmx', and the like, ],
    # 		'key_length': 3,
    # 	},
    # 	'711a35ed11322e6dc897d7918ffdaeb4': {
    # 		'class_mapping': {a key: char pair dict},
    # 		'all_keys': ['av', 'io', and the like, ],
    # 		'key_length': 2,
    # 	},
    # }

    # this attribute indicates the purpose of the current run
    run_purpose = None
    root_path = None
    crawled_folder_name = None
    detail_html_folder_name = None
    list_html_folder_name = None
    svg_text_css_folder_name = None
    debug = False
    move_fiddler_file = True

    # while using proxy:
    proxy_meta = ""
    max_list_page = 50

    database_city_district_table = {}
    database_level2name_table = {}
    database_merchant_star_level_table = {}
    database_anticrawl20190505_table = {}
    database_common_channel_list_table = []

    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    def init_self_attributes(self):
        self.run_purpose = self.settings.get(name='RUN_PURPOSE', default=None)

        # set all paths
        self.root_path = self.settings.get('PROJECT_PATH')
        self.crawled_folder_name = self.settings.get(name='CRAWLED_DIR',
                                                     default='crawled')
        self.detail_html_folder_name = self.settings.get(
            name='SAVED_DETAIL_HTML', default='detail_html')
        self.list_html_folder_name = self.settings.get(name='SAVED_LIST_HTML',
                                                       default='list_html')
        self.svg_text_css_folder_name = self.settings.get(name='SVG_TEXT_CSS',
                                                          default='svgtextcss')
        if self.run_purpose in [
                "PARSE_FIDDLER",
                "PARSE_DETAILED_HOTEL",
        ]:
            self.detail_html_folder_name = f"{ self.detail_html_folder_name }_fiddler"
            self.list_html_folder_name = f"{ self.list_html_folder_name }_fiddler"
            self.svg_text_css_folder_name = f"{ self.svg_text_css_folder_name }_fiddler"

        # whether this run is for debugging
        self.debug = self.settings.get(name='PROJECT_DEBUG', default=False)
        self.move_fiddler_file = self.settings.get(
            name='MOVE_FIDDLER_HTML_FILE', default=True)

        # get proxy header
        temp = CommonClass.get_proxies(proxy_dict={})
        self.proxy_meta = temp['http']

        self.database_city_district_table = self.settings.get(
            name='DATABASE_CITY_DISTRICT_TABLE', default={})
        self.database_level2name_table = self.settings.get(
            name='DATABASE_LEVEL2NAME_TABLE', default={})
        self.database_merchant_star_level_table = self.settings.get(
            name='DATABASE_MERCHANT_STAR_LEVEL_TABLE', default={})
        self.database_anticrawl20190505_table = self.settings.get(
            name='DATABASE_ANTICRAWL20190505_TABLE', default={})
        self.database_common_channel_list_table = self.settings.get(
            name='DATABASE_COMMON_CHANNEL_LIST_TABLE', default=[])

    def make_dirs(self):
        # even when the cache is used, we save all html files
        crawled_dir = os.path.join(self.root_path, self.name,
                                   self.crawled_folder_name)
        if not os.path.isdir(crawled_dir):
            os.mkdir(crawled_dir)
        detail_html_dir = os.path.join(self.root_path, self.name,
                                       self.detail_html_folder_name)
        if not os.path.isdir(detail_html_dir):
            os.mkdir(detail_html_dir)
        list_html_dir = os.path.join(self.root_path, self.name,
                                     self.list_html_folder_name)
        if not os.path.isdir(list_html_dir):
            os.mkdir(list_html_dir)
        svg_css_dir = os.path.join(self.root_path, self.name,
                                   self.svg_text_css_folder_name)
        if not os.path.isdir(svg_css_dir):
            os.mkdir(svg_css_dir)
        if self.run_purpose in [
                "PARSE_FIDDLER",
                "PARSE_DETAILED_HOTEL",
        ]:
            temp_foldername = self.detail_html_folder_name.replace(
                "_fiddler", "")
            temp_dir = os.path.join(self.root_path, self.name, temp_foldername)
            if not os.path.isdir(temp_dir):
                os.mkdir(temp_dir)
            temp_foldername = self.list_html_folder_name.replace(
                "_fiddler", "")
            temp_dir = os.path.join(self.root_path, self.name, temp_foldername)
            if not os.path.isdir(temp_dir):
                os.mkdir(temp_dir)
            temp_foldername = self.svg_text_css_folder_name.replace(
                "_fiddler", "")
            temp_dir = os.path.join(self.root_path, self.name, temp_foldername)
            if not os.path.isdir(temp_dir):
                os.mkdir(temp_dir)

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()

        urls = [
            'http://quotes.toscrape.com/page/1/',
        ]

        if self.debug:
            callback = self.parse_list_debug
        elif "PARSE_FIDDLER" == self.run_purpose:
            callback = self.parse_fiddler_list
            fiddler_list_dir = os.path.join(self.root_path, self.name,
                                            self.list_html_folder_name)
            file_list = os.listdir(fiddler_list_dir)
            for i in range(len(file_list)):
                urls.append('http://quotes.toscrape.com/page/1/')
        elif "PARSE_DETAILED_HOTEL" == self.run_purpose:
            callback = self.parse_detailed_hotel_html
        else:
            self.logger.critical(
                f"self.run_purpose ({self.run_purpose}) can ONLY be PARSE_FIDDLER or PARSE_DETAILED_HOTEL"
            )
            sys.exit(1)  # wrong or missing self.run_purpose

        for url in urls:
            if self.run_purpose in [
                    "PARSE_FIDDLER",
                    "PARSE_DETAILED_HOTEL",
            ]:
                yield scrapy.Request(url=url,
                                     callback=callback,
                                     dont_filter=True)
            else:
                yield scrapy.Request(url=url, callback=callback)

    def parse_list_debug(self, response):
        pass
        # add whatever debug code here

    def parse_detailed_hotel_html(self, response):
        now = datetime.datetime.now()
        today = now.strftime('%Y%m%d')
        fiddler_detailed_dir = os.path.join(self.root_path, self.name,
                                            self.detail_html_folder_name)
        file_list = os.listdir(fiddler_detailed_dir)
        hotel_dict = {}
        file_path = ""
        for one_file in file_list:
            doc = None
            file_path = os.path.join(fiddler_detailed_dir, one_file)
            try:
                with open(file_path, 'r', encoding="utf-8",
                          errors="ignore") as f:
                    doc = f.read(
                    )  # .decode(encoding='utf-8', errors='ignore')
            except Exception as ex:
                self.logger.critical(
                    f"cannot read file {file_path}; Exception = {ex}")
                sys.exit(2)  # UnicodeDecodeError

            response = Selector(text=doc, type="html")
            shop_id = one_file.replace(".html", "").replace("shop", "")
            hotel_address = response.xpath(
                "//span[@class='hotel-address']/text()").extract_first(
                    default="")
            if hotel_address is not None and 0 < len(hotel_address):
                hotel_dict[shop_id] = hotel_address.strip("\"")
            else:
                self.logger.critical(
                    f"cannot xpath hotel address from saved html file (shop_id = {shop_id})"
                )
                sys.exit(2)
        try:
            all_keys = [
                "shop_id",
                "hotel_address_newly_added",
            ]
            file_path = os.path.join(self.root_path, self.name,
                                     self.crawled_folder_name,
                                     f"addresses{today}.csv")
            with open(file_path, 'a', encoding='utf-8', newline="") as f:
                writer = csv.writer(f)
                writer.writerow(all_keys)
                for index, shop_id in enumerate(hotel_dict):
                    writer.writerow([shop_id, hotel_dict[shop_id]])
        except Exception as ex:
            self.logger.error(
                f"cannot write csv file in Method parse_detailed_hotel_html of Class DianpingSpider. Exception = {ex}; file_path = {file_path}"
            )
        else:
            # move this html file
            if self.move_fiddler_file:
                for one_file in file_list:
                    file_path = os.path.join(fiddler_detailed_dir, one_file)
                    dst_path = file_path.replace("_fiddler", "")
                    shutil.move(file_path, dst_path)

    def generate_filename_from_url(self, url="", file_type=""):
        response_html = ""
        filename = ""
        filename_base = ""
        folder = self.list_html_folder_name
        now = datetime.datetime.now()
        today = now.strftime('%Y%m%d')

        url_fragments = url.split("/")
        while '' in url_fragments:
            url_fragments.remove('')
        # Examples:
        # http://www.dianping.com/chenzhou/ch10/g113
        # http://www.dianping.com/shop/72457872
        # http://www.dianping.com/shop/8910906/review_all/p624
        if "list2" == file_type:
            if 3 < len(url_fragments):
                filename_base = f"{url_fragments[-3]}_{url_fragments[-2]}_{url_fragments[-1]}"
                response_html = f"{filename_base}_{today}.html"
                filename = response_html
        elif "detailed" == file_type:
            folder = self.detail_html_folder_name
            if 3 < len(url_fragments) and "review_all" == url_fragments[-2]:
                shop_id = CommonClass.find_digits_from_str(url_fragments[-3])
                filename_base = f"shop_{shop_id}_{url_fragments[-1]}"
                response_html = f"{filename_base}_{today}.html"
                filename = response_html
            elif 2 < len(url_fragments):
                shop_id = CommonClass.find_digits_from_str(url_fragments[-1])
                filename_base = f"shop_{shop_id}_p1"
                response_html = f"{filename_base}_{today}.html"
                filename = response_html
        elif "css" == file_type:
            # http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/a59454e0c1813952099c1e006c298195.css
            folder = self.svg_text_css_folder_name
            if 1 < len(url_fragments) and url_fragments[-1].endswith(".css"):
                filename_base = url_fragments[-1].replace(".css", "")
                response_html = url_fragments[-1]
                filename = response_html

        if response_html is None or 1 > len(response_html):
            rand_int = random.randint(100000, 999999)
            response_html = f"unknown{rand_int}_{today}.html"
            self.logger.error(
                f"File {response_html} is used to store html page crawled from {url}"
            )

        return response_html, folder, filename, filename_base

    def save_crawled_page(self, response=None, file_type="list2"):
        # even when the cache is used, we save all html files
        response_html = ""
        url = ""

        if self.debug:
            if "list2" == file_type:
                url = "http://www.dianping.com/chenzhou/ch10/g113"
            else:
                url = "http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/a59454e0c1813952099c1e006c298195.css"
        elif hasattr(response, "url"):
            url = response.url

        response_html, folder, filename, filename_base = self.generate_filename_from_url(
            url=url, file_type=file_type)

        response_html = os.path.join(self.root_path, self.name, folder,
                                     response_html)
        try:
            with open(response_html, 'wb') as f:
                if hasattr(response, "body"):
                    f.write(response.body)
                elif self.debug:
                    f.write(response)
                return response_html
        except Exception as ex:
            self.logger.error(
                f"Fail to write file {response_html} for storing html page crawled from {url}; Exception = {ex}"
            )
            return ""

    def get_parse_dict_on_list_page(self, one_li=None, channel=""):
        """the html pages in different channels have different xpath
			return the right dict according to the input channel
			self.database_common_channel_list_table includes all channels but 'hotel' and 'ch70'
		"""
        this_page_xpath = {}
        this_page_dict = {}
        need_clean = []
        use_extract = []
        need_split_and_clean = []
        if channel in self.database_common_channel_list_table:
            use_extract = ['group_deal_list']
            this_page_xpath = {
                'title':
                "./div[@class='txt']/div[@class='tit']/a/h4/text()",
                'shop_id':
                "./div[@class='txt']/div[@class='tit']/a/@data-shopid",
                'star':
                "./div[@class='txt']/div[@class='comment']/span[contains(@class, 'sml-rank-stars')]/@title",
                'group_deal':
                "./div/a[@data-click-name='shop_info_groupdeal_click']/@title",
                'group_deal_list':
                "./div[@class='svr-info']/div/a[@data-click-name='shop_info_groupdeal_click']/@title",
                # group_deal_list found in [ 'ch10', 'ch15', 'ch30', 'ch45', 'ch50', 'ch65', 'ch75', 'ch80', 'ch85', 'ch95', ]:
                'address':
                "./div/a[@data-click-name='shop_map_click']/@data-address",
                'out_of_business':
                "./div[@class='txt']/div[@class='tit']/span[@class='istopTrade']/text()",
            }
            if 'ch10' == channel:
                need_split_and_clean = ['recommended_dishes']
                this_page_xpath[
                    'takeway'] = "./div/a[@data-click-name='shop_info_takeway_click']/@title"
                this_page_xpath[
                    'recommended_dishes'] = "string(./div[@class='txt']/div[@class='recommend'])"
            elif channel in ['ch30', 'ch25']:
                this_page_xpath[
                    'group_deal'] = "./div[@class='txt']/div[@class='tit']/div/a[@class='igroup']/@title"
        elif channel in [
                'ch70',
        ]:
            this_page_xpath = {
                'title':
                "./div[@class='info baby-info']/p[@class='title']/a[@class='shopname']/text()",
                'branch':
                "./div[@class='info baby-info']/p[@class='title']/span[@class='icon-sale']/a[@class='shopbranch']/em/text()",
                'shop_id':
                "./@data-shopid",
                'star':
                "./div[@class='info baby-info']/p[@class='remark']/span[contains(@class, 'item-rank-rst')]/@title",
                'review_numbers':
                "./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='comment-count']/a/text()",
                'mean_prices':
                "string(./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='average'])",
                'group_deal':
                "./div[@class='info baby-info']/div[@class='tuan-info']/a[@class='tuan']/@title",
            }
            need_clean = [
                'mean_prices',
            ]
        elif channel in [
                'ch90',
        ]:
            # ch90 (home decoration) is a new channel added after 201905; it currently has no string obfuscation at all, so Chinese text and digits can be read directly
            this_page_xpath = {
                'title':
                "./div[@class='info baby-info']/p[@class='title']/a[@class='shopname']/text()",
                'branch':
                "./div[@class='info baby-info']/p[@class='title']/span[@class='icon-sale']/a[@class='shopbranch']/em/text()",
                'shop_id':
                "./@data-shopid",
                'star':
                "./div[@class='info baby-info']/p[@class='remark']/span[contains(@class, 'item-rank-rst')]/@title",
                'review_numbers':
                "./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='comment-count']/a/text()",
                'mean_prices':
                "string(./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='average'])",
                'group_deal':
                "./div[@class='info baby-info']/div[@class='tuan-info']/a[@class='tuan']/@title",
            }
            need_clean = [
                'mean_prices',
            ]
        elif channel in ['hotel']:
            use_extract = ['hotel_tags']
            need_clean = [
                'place',
                'price',
            ]
            this_page_xpath = {
                'shop_id':
                "./@data-poi",
                'title':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/h2[@class='hotel-name']/a/text()",
                'place':
                "string(./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='place'])",
                'hotel_tags':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='hotel-tags']/span/text()",
                'price':
                "string(./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='price']/p)",
                'star':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='remark']/div[@class='item-rank-ctn']/div[@class='item-rank-ctn']/span/@class",
                'review_numbers':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='remark']/div[@class='item-rank-ctn']/div[@class='item-rank-ctn']/a/text()",
            }

        if one_li is not None:
            for index, key in enumerate(this_page_xpath):
                if key in use_extract:
                    temp_list = one_li.xpath(this_page_xpath[key]).extract()
                    this_page_dict[
                        key] = CommonClass.get_cleaned_string_by_splitting_list(
                            string_or_list=temp_list,
                            char_to_remove=[
                                '\r',
                                '\n',
                                '\t',
                                ' ',
                            ])
                elif key in need_clean:
                    temp_str = one_li.xpath(
                        this_page_xpath[key]).extract_first(default="")
                    this_page_dict[key] = CommonClass.clean_string(
                        string=temp_str,
                        char_to_remove=[
                            '\r',
                            '\n',
                            '\t',
                            ' ',
                        ])
                elif key in need_split_and_clean:
                    temp_string = one_li.xpath(
                        this_page_xpath[key]).extract_first(default="")
                    this_page_dict[
                        key] = CommonClass.get_cleaned_string_by_splitting_list(
                            string_or_list=temp_string,
                            char_to_remove=[
                                '\r',
                                '\n',
                                '\t',
                                ' ',
                            ])
                else:
                    this_page_dict[key] = one_li.xpath(
                        this_page_xpath[key]).extract_first(default="")

                # special fields
                if channel in ['hotel']:
                    if 'star' in this_page_dict.keys():
                        temp = this_page_dict['star'].replace(
                            "sml-rank-stars sml-str", "")
                        if re.match(r'^(\d)+$', temp):
                            temp = int(temp)
                            if temp in self.database_merchant_star_level_table.keys(
                            ):
                                this_page_dict[
                                    'star'] = self.database_merchant_star_level_table[
                                        temp]
                            else:
                                this_page_dict['star'] = this_page_dict[
                                    'star'].replace("sml-rank-stars sml-str",
                                                    "")
                        else:
                            this_page_dict['star'] = temp
                    if 'review_numbers' in this_page_dict.keys():
                        this_page_dict['review_numbers'] = this_page_dict[
                            'review_numbers'].replace("(", "")
                        this_page_dict['review_numbers'] = this_page_dict[
                            'review_numbers'].replace(")", "")
        shop_id = this_page_dict[
            'shop_id'] if 'shop_id' in this_page_dict.keys() else '0'

        # extract special nodes
        # none for now

        return this_page_dict, shop_id

    def parse_shop_list(self,
                        all_lis=[],
                        css_svg_ready_for_decoding=True,
                        constant_items={}):
        shop_list = []
        decoded_shop_dict = {}
        city = constant_items['city']
        channel = constant_items['channel']
        this_level2 = constant_items['level2']
        this_level2_name = constant_items['level2name']
        for one_li in all_lis:
            this_page_dict, shop_id = self.get_parse_dict_on_list_page(
                one_li=one_li, channel=channel)
            this_page_dict.update(constant_items)
            if css_svg_ready_for_decoding:
                decoded_shop_dict[shop_id] = self.decode_fields(
                    one_li=one_li, channel=channel)
            shop_list.append(this_page_dict)
        return shop_list, decoded_shop_dict

    def get_district_name(self, city="", district=""):
        district_name = ""
        if 0 < len(city) and city in self.database_city_district_table.keys():
            if district in self.database_city_district_table[city].keys():
                district_name = self.database_city_district_table[city][
                    district]
        return district_name

    def parse_fiddler_list(self, response):
        """for parsing html pages saved by fiddler
		"""
        fiddler_list_dir = os.path.join(self.root_path, self.name,
                                        self.list_html_folder_name)
        file_list = os.listdir(fiddler_list_dir)
        if 1 > len(file_list):
            return None
        file_path = os.path.join(fiddler_list_dir, file_list[0])
        doc = None
        try:
            with open(file_path, 'r', encoding="utf-8", errors='ignore') as f:
                doc = f.read()  # .decode(encoding='utf-8', errors='ignore')
        except Exception as ex:
            self.logger.critical(
                f"cannot read file {file_path}; Exception = {ex}")
            sys.exit(2)  # UnicodeDecodeError

        response = Selector(text=doc, type="html")

        # get all level2 categories
        filename_fragment_list = file_list[0].split("_")
        if 3 > len(filename_fragment_list):
            self.logger.critical(f"file {file_list[0]} has wrong name")
            sys.exit(3)  # Wrong file name
        city = filename_fragment_list[0]
        channel = filename_fragment_list[1]
        this_level2, district = self.clean_this_level2_string(
            this_level2=filename_fragment_list[2])
        district_name = ""
        if 0 < len(district):
            district_name = self.get_district_name(city=city,
                                                   district=district)
        categories, this_level2_name = self.get_categories_and_level2_name(
            response=response,
            city=city,
            channel=channel,
            this_level2=this_level2)
        if 0 < len(district_name):
            this_level2_name = this_level2_name + "_" + district_name
        if 1 > len(categories):  # this page has no categories
            city = filename_fragment_list[0]
            channel = filename_fragment_list[1]
        url = f"http://www.dianping.com/{city}/{channel}/{filename_fragment_list[2]}"

        # get the css request address
        css_url = None
        all_link_hrefs = response.xpath(
            "//link[@rel='stylesheet']/@href").extract()
        for one_href in all_link_hrefs:
            if -1 < one_href.find("//s3plus.meituan.net"):
                css_url = one_href
        css_filename = ""
        if css_url is None:
            # when the html file has no //s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/xxxx.css link,
            # we fall back to the latest css file already downloaded
            this_svg_css_dir = os.path.join(self.root_path, self.name,
                                            self.svg_text_css_folder_name)
            css_filename = CommonClass.get_latest_file_name(
                this_dir=this_svg_css_dir, suffix=".css", logger=self.logger)
            css_url = f"//s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/{css_filename}"

        if 1 > len(css_filename):
            url_fragments = css_url.split("/")
            if 1 < len(url_fragments) and url_fragments[-1].endswith(".css"):
                css_filename = url_fragments[-1]
        csv_file = f"{css_filename}"
        csv_file = csv_file.replace(".css", ".csv")
        self.check_extract_svg_loaded(css_filename=css_filename,
                                      referer=css_url,
                                      csv_file=csv_file,
                                      folder="list_html_fiddler")
        if "PARSE_FIDDLER" == self.run_purpose:
            self.read_all_css_files_for_mapping(referer=css_url,
                                                csv_file=csv_file,
                                                folder="list_html_fiddler")
        css_svg_ready_for_decoding = True

        response_for_items = TextResponse(url=url,
                                          status=200,
                                          body=bytes(doc, encoding="utf-8"))
        loader = ItemLoader(item=DianpingListItem(),
                            response=response_for_items)

        loader.add_value('category_id', f"{city}___{channel}")
        loader.add_value('category_list', categories)
        loader.add_value('targeted_page', 'list2')

        # get all shop_ids
        if channel in self.database_common_channel_list_table:
            all_lis = response.xpath("//div[@id='shop-all-list']/ul/li")
        elif channel in ['ch70']:  # ch70 == the parent-child (亲子) channel
            all_lis = response.xpath("//ul[@class='shop-list']/li")
        elif channel in ['hotel']:
            all_lis = response.xpath(
                "//ul[@class='hotelshop-list']/li[@class='hotel-block']")
        constant_items = {
            'city': city,
            'channel': channel,
            'level2': this_level2,
            'level2name': this_level2_name,
        }
        shop_list, decoded_shop_dict = self.parse_shop_list(
            all_lis=all_lis,
            css_svg_ready_for_decoding=css_svg_ready_for_decoding,
            constant_items=constant_items)

        loader.add_value('shop_list', shop_list)
        loader.add_value('css_url', css_url)
        loader.add_value('decoded_shop_dict', decoded_shop_dict)

        # record housekeeping fields
        loader.add_value('url', url)
        loader.add_value('project', self.settings.get('BOT_NAME'))
        loader.add_value('spider', self.name)
        loader.add_value('server', socket.gethostname())
        loader.add_value('date',
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))

        # move this html file
        if self.move_fiddler_file:
            dst_path = file_path.replace("_fiddler", "")
            shutil.move(file_path, dst_path)
        yield loader.load_item()

    def clean_this_level2_string(self, this_level2=""):
        searchObj = re.search(r'p(\d)+$', this_level2)
        if searchObj is not None:
            pos_tuple = searchObj.span()
            found_str = this_level2[pos_tuple[0]:pos_tuple[1]]  # like p15
            this_level2 = this_level2.replace(f"{found_str}", "")

        district = ""  # 20190509 district code is added
        separators = [
            "r",
            "c",
        ]
        for separator in separators:
            if -1 < this_level2.find(separator):
                temp_list = this_level2.split(separator)
                if 1 < len(temp_list):
                    district = f"{separator}{temp_list[1]}"
                if 0 < len(temp_list):
                    this_level2 = temp_list[0]
                break
        return this_level2, district

    def get_categories_and_level2_name(self,
                                       response=None,
                                       city="",
                                       channel="",
                                       this_level2=""):
        categories = []
        this_level2_name = ""
        if channel in self.database_common_channel_list_table:
            level2categories_a = response.xpath("//div[@id='classfy']/a")
            for one in level2categories_a:
                link = one.xpath("./@href").extract_first(default="")
                name = one.xpath("./span/text()").extract_first(default="")
                temp_dict = self.get_one_category(link=link, name=name)
                if temp_dict is not None and 0 < len(temp_dict):
                    categories.append(temp_dict)
                    if this_level2 == temp_dict['level2']:
                        this_level2_name = temp_dict['name']
        elif channel in [
                'ch70',
        ]:
            level2category_lis = response.xpath(
                "//div[@id='nav']/div/ul/li[@class='first-item']")
            for one_li in level2category_lis:
                level2category_a = one_li.xpath(
                    "./div[@class='primary-container']/span[@class='span-container']/a[@class='index-title' or @class='index-item']"
                )
                for one_a in level2category_a:
                    link = one_a.xpath("./@href").extract_first(default="")
                    name = one_a.xpath("string(.)").extract_first(default="")
                    temp_dict = self.get_one_category(link=link, name=name)
                    if temp_dict is not None and 0 < len(temp_dict):
                        categories.append(temp_dict)
                        if this_level2 == temp_dict['level2']:
                            this_level2_name = temp_dict['name']
        if 1 > len(this_level2_name
                   ) and this_level2 in self.database_level2name_table.keys():
            this_level2_name = self.database_level2name_table[this_level2]
        return categories, this_level2_name

    def get_one_category(self, link="", name=""):
        temp_dict = {}
        if name is not None and 0 < len(name):
            temp_list = link.split("/")
            if 3 < len(temp_list):
                city = temp_list[-3]
                channel = temp_list[-2]
                level2 = temp_list[-1]
                temp_dict = {
                    'name': name,
                    'city': city,
                    'channel': channel,
                    'level2': level2,
                    'link': link,
                }
        return temp_dict

    def decode_fields(self, one_li=None, channel=""):
        return_dict = {}
        if one_li is None:
            return return_dict
        if channel in self.database_common_channel_list_table:
            review_number_a = one_li.xpath(
                "./div[@class='txt']/div[@class='comment']/a[@class='review-num']"
            )
            review_numbers = self.get_decoded_str(element=review_number_a)
            mean_price_a = one_li.xpath(
                "./div[@class='txt']/div[@class='comment']/a[@class='mean-price']"
            )
            mean_prices = self.get_decoded_str(element=mean_price_a)

            # overall/taste/quality/environment scores in comment-list: [ 'ch10', 'ch15', 'ch20', 'ch50', 'ch75', 'ch85', 'ch95', ]
            comment_score_span = one_li.xpath(
                "./div[@class='txt']/span[@class='comment-list']/span")
            comment_score_str = ""
            for one_span in comment_score_span:
                if 1 > len(comment_score_str):
                    comment_score_str = self.get_decoded_str(element=one_span)
                else:
                    comment_score_str += f"; {self.get_decoded_str( element = one_span )}"

            return_dict = {
                'review_numbers': review_numbers,
                'mean_prices': mean_prices,
                'comment_score': comment_score_str,
            }
        elif channel in ['hotel0', 'ch70']:
            pass
            # there are no encoded fields in these 2 channels
        return return_dict

    def get_decoded_str(self, element=None):
        if element is None:
            return ""
        self.temp_str = ""  # need to initialize this!
        self.replace_encoded(element=element)
        return self.temp_str
        # <a class="review-num"><b>1<span class="niv48y"></span><span class="niv48y"></span><span class="niv8q4"></span></b>条点评</a>

    def replace_one_node_text(self,
                              node=None,
                              this_node_class_name20190505=""):
        if node is None:
            return ""
        this_node_class_name = node.xpath("./@class").extract_first(default="")

        # the following 7 lines are for updated anticrawl methods on 20190505
        this_node_get_text = node.get()
        if this_node_get_text is not None and 0 < len(this_node_get_text):
            this_node_get_text5 = this_node_get_text.encode(
                'unicode_escape').decode('utf-8')
            if 6 == len(this_node_get_text5) and '\\' == this_node_get_text5[
                    0] and 'u' == this_node_get_text5[
                        1] and -1 < this_node_class_name20190505.find(
                            "shopNum"):
                key = this_node_get_text5[2:]
                if key in self.database_anticrawl20190505_table.keys():
                    # self.logger.warning( f"{this_node_get_text5} ==> {key}; found in {self.database_anticrawl20190505_table[ key ]}" )
                    return self.database_anticrawl20190505_table[key]
                # has no class as shopNum: ¥ ==> \uffe5

        not_in_class_mapping_dict = False

        for index, key in enumerate(self.class_mapping_dict):
            this_dict = self.class_mapping_dict[key]
            key_length = this_dict['key_length']
            all_keys = this_dict['all_keys']
            if key_length < len(
                    this_node_class_name
            ) and this_node_class_name[:key_length] in all_keys:
                value = this_dict['class_mapping'][
                    this_node_class_name] if this_node_class_name in this_dict[
                        'class_mapping'].keys() else ""
                if 0 < len(value):
                    return value
                else:
                    not_in_class_mapping_dict = True
                    self.logger.error(
                        f"cannot find {this_node_class_name} in saved mapping class {key}."
                    )
        if not_in_class_mapping_dict:
            return ""
        else:
            temp = CommonClass.clean_string(string=node.get(),
                                            char_to_remove=[
                                                '\r',
                                                '\n',
                                                '\t',
                                                ' ',
                                            ])
            return temp

    def replace_encoded(self, element=None):
        if element is None:
            return ""
        children = element.xpath("./child::node()")
        this_node_class_name20190505 = element.xpath("./@class").extract_first(
            default="")
        if 0 < len(children):
            for one_child in children:
                grandchild = one_child.xpath("./child::node()")
                if 0 < len(grandchild):
                    self.replace_encoded(element=one_child)
                else:
                    this_node_text = self.replace_one_node_text(
                        node=one_child,
                        this_node_class_name20190505=
                        this_node_class_name20190505)
                    if this_node_text is not None and 0 < len(this_node_text):
                        self.temp_str += this_node_text
        else:  # only node having no child needed to be decoded.
            this_node_text = self.replace_one_node_text(
                node=element,
                this_node_class_name20190505=this_node_class_name20190505)
            if this_node_text is not None and 0 < len(this_node_text):
                self.temp_str += this_node_text

    def check_extract_svg_loaded(self,
                                 css_filename="",
                                 css_string="",
                                 referer="",
                                 csv_file="",
                                 folder=""):
        send_requests = True
        if "PARSE_FIDDLER" == self.run_purpose:
            send_requests = False
        app = ExtractSVG(root_path=self.root_path,
                         css_file=css_filename,
                         css_string=css_string,
                         send_requests=send_requests,
                         referer=referer,
                         save_requested_svg=True,
                         csv_file=csv_file,
                         settings=self.settings,
                         folder=folder,
                         logger=self.logger)
        app.run()
        all_keys = app.svg_file_dict
        return_dict = {
            'class_mapping': app.class_mapping,
            'all_keys': all_keys.keys(),
            'key_length': int(app.key_length),
        }
        this_key = csv_file.replace(".csv", "")
        self.class_mapping_dict[this_key] = return_dict

    def read_all_css_files_for_mapping(self,
                                       referer="",
                                       csv_file="",
                                       folder=""):
        svg_css_dir = os.path.join(self.root_path, self.name,
                                   self.svg_text_css_folder_name)
        file_list = os.listdir(svg_css_dir)
        css_filename = csv_file.replace(".csv", ".css")
        for filename in file_list:
            if filename not in [css_filename] and filename.endswith(".css"):
                this_referer = referer.replace(css_filename, filename)
                csv_filename = filename.replace(".css", ".csv")
                self.check_extract_svg_loaded(css_filename=filename,
                                              css_string="",
                                              referer=this_referer,
                                              csv_file=csv_filename,
                                              folder=folder)
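A minimal standalone sketch of the decoding idea behind class_mapping_dict and replace_one_node_text above (not taken from DianpingSpider; the mapping values and helper below are hypothetical): obfuscated characters are rendered as empty <span> tags whose CSS class points at a glyph in the svgtextcss sprite, so decoding a field means looking each span's class up in a mapping recovered from that CSS/SVG pair and concatenating the results with the plain text nodes.

# illustrative sketch only -- hypothetical mapping and helper, not the spider's own code
from scrapy.selector import Selector

class_mapping = {"niv48y": "8", "niv8q4": "4"}  # assumed to be recovered from one css/svg pair


def decode_fragment(html):
    sel = Selector(text=html, type="html")
    decoded = ""
    for node in sel.xpath("//b/child::node()"):  # plain text nodes and <span> children
        cls = node.xpath("./@class").extract_first(default="")
        if cls in class_mapping:
            decoded += class_mapping[cls]        # obfuscated span -> decoded character
        elif isinstance(node.root, str):
            decoded += node.root                 # plain text node, keep as-is
    return decoded


print(decode_fragment('<b>1<span class="niv48y"></span><span class="niv8q4"></span></b>'))
# prints "184" under the assumed mapping above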
Esempio n. 12
0
class Gzzfcj1Spider(scrapy.Spider):
	"""
	On 20190806 Manager Wang pointed out that air quality data was still missing; this spider crawls air quality data from the official website of the China National Environmental Monitoring Centre
	http://www.cnemc.cn/sssj/
		revision 20190806
	"""
	name = "cnemc1"

	root_path = ""
	log_dir = ""
	debug = False
	run_purpose = None
	save_every_response = False
	overwrite_today = ""
	crawled_dir = ""
	saved_json_dir = ""
	base_url = ""

	custom_settings = CommonClass.get_custom_settings_dict(spider=name)

	proxy_ip_dict = {}
	min_proxy_ip_life_time = 6
	max_proxy_ip_life_time = 180
	use_proxy = False
	proxy_agent = ""

	maximal_requests_of_one_crontab_process = 23
	interval_between_requests = 1800
	request_counter = 0
	last_request_time = 0.0
		
	def init_self_attributes(self):
		self.root_path = self.settings.get( "PROJECT_PATH" )
		self.log_dir = self.settings.get( name="LOG_DIR", default="" )
		self.debug = self.settings.get( name = "PROJECT_DEBUG", default=False )
		self.run_purpose = self.settings.get( name = "RUN_PURPOSE", default=None )
		if self.run_purpose is None:
			self.logger.error( f"missing RUN_PURPOSE ({self.run_purpose}) setting" )
			sys.exit(1)
		self.save_every_response = self.settings.get( name = "SAVE_EVERY_RESPONSE", default = False )
		self.overwrite_today = self.settings.get( "OVERWRITE_TODAY", default = "" )
		if not hasattr(self, "overwrite_today") or 1 > len( self.overwrite_today ) or self.overwrite_today is None:
			self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

		# set all paths
		self.crawled_dir = self.settings.get( name = "CRAWLED_DIR", default = "" )
		self.saved_json_dir = self.settings.get( name = "SAVED_JSON", default="" )
		self.base_url = self.settings.get( name = "BASE_URL", default="" )

		if 1 > len( self.crawled_dir ) or 1 > len( self.saved_json_dir ):
			error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_JSON ({self.saved_json_dir}) setting(s)"
			self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" )
			sys.exit(3)

		self.min_proxy_ip_life_time = self.settings.get( name = "MIN_PROXY_LIFE_SPAN", default = 6 )
		self.max_proxy_ip_life_time = self.settings.get( name = "MAX_PROXY_LIFE_SPAN", default = 180 )
		self.use_proxy = self.settings.get( name="HTTPPROXY_ENABLED", default = False )
		self.proxy_agent = self.settings.get( name="PROXY_AGENT", default = "" )

	def make_dirs(self):
		# even when the cache is used, we save all html files; here we create these 2 dirs if they do not exist
		if not os.path.isdir( self.crawled_dir ):
			os.makedirs( self.crawled_dir )
		if not os.path.isdir( self.saved_json_dir ):
			os.makedirs( self.saved_json_dir )

	def proxy_ip_pool(self):
		"""
			Xunlian proxy error code 10000: requests too frequent, wait at least 5 seconds between extractions
		"""
		if "DRAGONFLY" == self.proxy_agent:
			return CommonClass.get_proxies( proxy_dict = {} )
		now = time.time()
		need_new_proxy = False
		if self.proxy_ip_dict is None or 1 > len( self.proxy_ip_dict ):
			need_new_proxy = True
		elif "expire" not in self.proxy_ip_dict.keys():
			need_new_proxy = True
		elif now + 3 > self.proxy_ip_dict["expire"]:
			need_new_proxy = True
		if need_new_proxy:
			proxies_dict = ProxyAgent.get_xunlian_proxy_dict(headers = {}, params_for_proxy_ip={}, setup_xunlian_dict = {}, need_setup_xunlian = False, logger=self.logger )
			if 1 > len( proxies_dict ):
				return self.proxy_ip_dict # still return the old ip dict or {}
			proxies_dict["expire"] = now + random.randint( self.min_proxy_ip_life_time, self.max_proxy_ip_life_time ) # set ip life time
			self.proxy_ip_dict = proxies_dict
		return self.proxy_ip_dict

	def start_requests(self):
		self.init_self_attributes()
		self.make_dirs()

		if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug
			url = 'http://quotes.toscrape.com/page/1/'
			yield scrapy.Request( url = url, callback = self.read_and_parse )
		elif "PRODUCTION_RUN" == self.run_purpose:
			urls = [
				# "http://www.cnemc.cn/sssj/", # 中国环境监测总局,实时数据页面
				self.base_url,
			]
			meta_dict = {}
			if self.use_proxy:
				proxies_dict = self.proxy_ip_pool()
				if 1 > len( proxies_dict):
					sys.exit(3)
				meta_dict["proxy"] = proxies_dict["http"]
			
			formdata_dict = {} # no form fields need to be POSTed to the target site
			for url in urls:
				# yield scrapy.RequestForm( url = url, callback = self.parse_json, meta = meta_dict, dont_filter = True )
				# yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
				self.last_request_time = time.time()
				yield scrapy.FormRequest( url = url, formdata = formdata_dict, callback = self.parse_json, meta = meta_dict, dont_filter = True )
		elif "CHECK_PROXY_IP" == self.run_purpose:
			now = int(time.time())
			token = f"Guangzhou{str(now)}"
			m = hashlib.md5()  
			m.update( token.encode(encoding = 'utf-8') )
			urls = [
				f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
			]
			
			if "DRAGONFLY" == self.proxy_agent:
				proxies_dict = CommonClass.get_proxies( proxy_dict = {} )
			else:
				proxies_dict = ProxyAgent.get_xunlian_proxy_dict(headers = {}, params_for_proxy_ip={}, setup_xunlian_dict = {}, need_setup_xunlian = False, logger=self.logger )
			if 0 < len( proxies_dict):
				meta_dict = {
					"proxy": proxies_dict["http"]
				}
				for url in urls:
					yield scrapy.Request( url=url, callback=self.do_nothing_for_debug, meta = meta_dict )
			else:
				self.logger.error( f"Error! No proxy ip returns. {proxies_dict}" )
		else:
			urls = [
				"http://quotes.toscrape.com/page/1/",
				"http://quotes.toscrape.com/page/2/",
			]
			for url in urls:
				yield scrapy.Request( url=url, callback=self.do_nothing_for_debug )

	def make_html_file_name( self, url = "", city = "", page_type = "" ):
		"""
			pass
		"""
		now = datetime.datetime.now()
		html_filename = "{}.html".format( now.strftime("%Y%m%d_%H%M%S") )
		today = now.strftime("%Y%m%d")

		url_obj = parse.urlparse(url)
		url_list = url_obj.path.split("/")
		for one in url_list:
			if -1 < one.find(".html"):
				html_filename = f"{city}__{page_type}__{one}"
				break
		return html_filename

	def load_items_into_loader(self, loader = None, text = {}, url = ""):
		loader.add_value( "content", str(text) ) # , encoding="utf-8"
		loader.add_value( "page_type", "detailed" )

		# record housekeeping fields
		loader.add_value( "url", url )
		loader.add_value( "project", self.settings.get('BOT_NAME') )
		loader.add_value( "spider", self.name )
		loader.add_value( "server", socket.gethostname() )
		loader.add_value( "date", datetime.datetime.now().strftime("%Y%m%d_%H%M%S") )
		return loader

	def do_nothing_for_debug(self, response = None):
		self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}" )
		# print( response.body )
		# Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}] 
		# b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}'
		# 2019-06-20 16:28:55 [fangesf] INFO: Inside Method do_nothing_for_debug of Class FangesfSpider, 
		# url = https://www.coursehelper.site/index/index/getHeaders?token=ad89558c89c3394167adbfd1484c8700
		# 2019-06-20 16:28:55 [stdout] INFO: b'{"REMOTE_ADDR":"139.196.200.61","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"139.196.200.61, 139.196.200.61"}'

	def url_contains_error(self, url_obj_path = ""):
		if 1 > len( url_obj_path ):
			return False
		path_fragment_list = url_obj_path.split("/")
		if 1 > len( path_fragment_list ):
			return False

		# no anti-crawl redirect patterns are known for this site yet,
		# so nothing is flagged here for now

		return False

	def save_json(self, response=None ):
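		# Persist the raw JSON body to the SAVED_JSON directory, then return the parsed
		# dict with the bulky keys in abandoned_key_list dropped and a response_time stamp added.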
		json_dict = {}
		if response is None or not hasattr(response, "body") or not hasattr( response, "url" ) or not hasattr( response, "meta"):
			self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object" )
			return json_dict

		now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
		try:
			json_dict = json.loads( response.body )
			file_path = os.path.join( self.saved_json_dir, f"cnemc_sssj_{now}.json" )
			with open( file_path, "wb" ) as f:
				f.write( response.body )
		except Exception as ex:
			error_msg = f"failed to write response.body from {response.url}; Exception = {ex}"
			self.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" )
		
		temp_dict = {}
		abandoned_key_list = ["localCityAir", "airDataTotal", "waterRealTimeList", "waterWeekList", "waterDataTotal", ]
		for key in json_dict:
			if key in abandoned_key_list:
				continue
			temp_dict[key] = json_dict[key]
		temp_dict["response_time"] = now
		return temp_dict

	def parse_json(self, response = None):
		"""
		Testing showed that requesting http://www.cnemc.cn/getIndexData.do does not require any cookie,
		so the endpoint can be requested directly from the start_requests method.
		print( dir(response) )
		[
		'_DEFAULT_ENCODING', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', 
		'__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', 
		'__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', 
		'_auto_detect_fun', '_body', '_body_declared_encoding', '_body_inferred_encoding', '_cached_benc', '_cached_selector', '_cached_ubody', 
		'_declared_encoding', '_encoding', '_get_body', '_get_url', '_headers_encoding', '_set_body', '_set_url', '_url', 
		'body', 'body_as_unicode', 'copy', 'css', 'encoding', 'flags', 'follow', 'headers', 'meta', 'replace', 'request', 'selector', 'status', 'text', 'url', 'urljoin', 'xpath'
		]
		print( dir(response.request) )
		[
		'__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', 
		'__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', 
		'__slots__', '__str__', '__subclasshook__', '__weakref__', 
		'_body', '_encoding', '_get_body', '_get_url', '_meta', '_set_body', '_set_url', '_url', 
		'body', 'callback', 'cookies', 'copy', 'dont_filter', 'encoding', 'errback', 'flags', 'headers', 'meta', 'method', 'priority', 'replace', 'url'
		]
		"""
		url_obj = parse.urlparse( response.url )
		has_url_error = self.url_contains_error( url_obj_path = url_obj.path )
		if has_url_error:
			return False

		json_dict = self.save_json( response = response )
		if isinstance( json_dict, dict ) and 0 < len( json_dict ):
			loader = ItemLoader( item = Cnemc1Item(), response = response )
			loader = self.load_items_into_loader( loader = loader, text = json_dict, url = response.url )
			yield loader.load_item()

		# request the endpoint again after INTERVAL_BETWEEN_REQUESTS seconds,
		# at most MAXIMAL_REQUESTS_OF_ONE_CRONTAB_PROCESS times per crontab process
		if self.request_counter < self.maximal_requests_of_one_crontab_process:
			while( self.check_time_interval() ):
				time.sleep(10)
			
			self.request_counter += 1
			self.last_request_time = time.time()
			now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
			self.logger.info( f" requesting cnemc sssj at {now} ( {self.request_counter} of { self.maximal_requests_of_one_crontab_process } )")
			meta_dict = {}
			if self.use_proxy:
				proxies_dict = self.proxy_ip_pool()
				if 1 > len( proxies_dict):
					sys.exit(3)
				meta_dict["proxy"] = proxies_dict["http"]
			
			formdata_dict = {} # no form fields need to be POSTed to the target site
			yield scrapy.FormRequest( url = self.base_url, formdata = formdata_dict, callback = self.parse_json, meta = meta_dict, dont_filter = True )

	def check_time_interval( self ):
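		# returns True while the caller still has to wait, i.e. the configured interval
		# since the last request has not elapsed yet; False once it is safe to request again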
		if not isinstance( self.last_request_time, float ):
			return False
		if time.time() - self.last_request_time > float(self.interval_between_requests):
			return False
		return True
			
	def read_and_parse(self, response = None):
		file_list = os.listdir( self.saved_json_dir )
		for one_file in file_list:
			if -1 == one_file.find("index"):
				temp_list = one_file.split("___")
				apt_id = 0
				city = ""
				if 1 < len( temp_list ):
					apt_id = temp_list[1]
					city = temp_list[0]
				url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm" # can also be 16_, 10_, and others
				# https://sz.esf.fang.com/chushou/3_218307566.htm
				html_file_path = os.path.join( self.saved_json_dir, one_file )
				if os.path.isfile(html_file_path):
					doc = None
					with open( html_file_path,'rb') as f:
						# doc = f.read().decode('gb2312', 'ignore')
						doc = f.read().decode('utf-8', 'ignore')
					if doc is None:
						self.logger.error( f"Error: cannot read html file {html_file_path}.")
						continue
					response = Selector( text=doc, type="html" )
					text = self.parse_detailed_response_field( response = response, city = city, apt_id = apt_id )
					try:
						response_for_items = TextResponse( url = url, status = 200, body = bytes(doc, encoding="utf-8") )
						loader = ItemLoader( item = Cnemc1Item(), response = response_for_items )
						loader = self.load_items_into_loader( loader = loader, text = text, url = url )
						yield loader.load_item()
					except Exception as ex:
						self.logger.info( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, Exception = {ex}" )
					if self.debug:
						break

	def write_log(self, content = None, logfilename = None, content_only = False):
		if content is not None and 0 < len( content ):
			today = datetime.datetime.now().strftime("%Y%m%d")
			if logfilename is None:
				logfilename = f"{self.name}{today}.log"
			try:
				with open( os.path.join( self.log_dir, logfilename ), 'a', encoding='utf-8') as f:
					if content_only:
						info = f"{str(content)}\n"
					else:
						info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
					f.write(info)
				return 1
			except Exception as ex:
				return 0
		return -1
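
The re-request loop at the end of parse_json, combined with check_time_interval, is effectively a rate-limited poller: sleep in 10-second steps until the configured interval since last_request_time has elapsed, then issue the next request, and stop after a fixed number of requests per crontab process. A minimal standalone sketch of that pattern, outside Scrapy and with hypothetical names and small demo values, could look like this:

import time


class IntervalPoller:
	"""Issue a request at most once every `interval` seconds, up to
	`max_requests` times per process (illustrative sketch only)."""

	def __init__(self, interval=300.0, max_requests=23):
		self.interval = float(interval)
		self.max_requests = int(max_requests)
		self.request_counter = 0
		self.last_request_time = None  # float timestamp of the previous request

	def still_waiting(self):
		# mirrors check_time_interval(): True while the interval has not yet elapsed
		if not isinstance(self.last_request_time, float):
			return False
		return (time.time() - self.last_request_time) <= self.interval

	def run(self, fetch):
		# `fetch` is any zero-argument callable standing in for the Scrapy request
		while self.request_counter < self.max_requests:
			while self.still_waiting():
				time.sleep(10)
			self.last_request_time = time.time()
			self.request_counter += 1
			fetch()


if __name__ == "__main__":
	poller = IntervalPoller(interval=5, max_requests=3)  # small values for a quick demo
	poller.run(lambda: print("request issued at", time.strftime("%H:%M:%S")))
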
Esempio n. 13
0
class Gzzfcj1Spider(scrapy.Spider):
    """
	Crawl the Guangzhou "阳光家缘" (Yangguang Jiayuan) list pages for Xianglong (湘龙). Since only Guangzhou publishes this data, this site is kept as a backup only; detailed pages will be crawled later if needed.
		revision 20190806
	"""
    name = "gzzfcj1"

    root_path = ""
    log_dir = ""
    resume_break_point_detailed_file_name = "crawled_detailed_html.log"
    resume_break_point_list_file_name = "crawled_list_html.log"
    crawled_list_url_list = []
    crawled_detailed_url_list = []
    debug = False
    city_list = []
    city_name_dict = {}
    run_purpose = None
    save_every_response = False
    overwrite_today = ""
    crawled_dir = ""
    saved_html_dir = ""
    base_url = ""

    custom_settings = CommonClass.get_custom_settings_dict(spider=name)

    proxy_ip_dict = {}
    min_proxy_ip_life_time = 6
    max_proxy_ip_life_time = 180
    use_proxy = False
    proxy_agent = ""

    def init_self_attributes(self):
        self.root_path = self.settings.get("PROJECT_PATH")
        self.log_dir = self.settings.get(name="LOG_DIR", default="")
        self.debug = self.settings.get(name="PROJECT_DEBUG", default=False)
        self.run_purpose = self.settings.get(name="RUN_PURPOSE", default=None)
        if self.run_purpose is None:
            self.logger.error(
                f"missing RUN_PURPOSE ({self.run_purpose}) setting")
            sys.exit(1)
        self.save_every_response = self.settings.get(
            name="SAVE_EVERY_RESPONSE", default=False)
        self.overwrite_today = self.settings.get("OVERWRITE_TODAY", default="")
        # check for None before calling len() to avoid a TypeError
        if self.overwrite_today is None or 1 > len(self.overwrite_today):
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # set all paths
        self.crawled_dir = self.settings.get(name="CRAWLED_DIR", default="")
        self.saved_html_dir = self.settings.get(name="SAVED_HTML", default="")
        self.base_url = self.settings.get(name="BASE_URL", default="")

        if 1 > len(self.crawled_dir) or 1 > len(self.saved_html_dir):
            error_msg = f"missing CRAWLED_DIR ({self.crawled_dir}), SAVED_HTML ({self.saved_html_dir}) setting(s)"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            sys.exit(3)

        self.min_proxy_ip_life_time = self.settings.get(
            name="MIN_PROXY_LIFE_SPAN", default=6)
        self.max_proxy_ip_life_time = self.settings.get(
            name="MAX_PROXY_LIFE_SPAN", default=180)
        self.use_proxy = self.settings.get(name="HTTPPROXY_ENABLED",
                                           default=False)
        self.proxy_agent = self.settings.get(name="PROXY_AGENT", default="")

    def make_dirs(self):
        # even cache is used, we save all html files; here we make these 3 dirs if they do not exist
        if not os.path.isdir(self.crawled_dir):
            os.makedirs(self.crawled_dir)
        if not os.path.isdir(self.saved_html_dir):
            os.makedirs(self.saved_html_dir)

    def proxy_ip_pool(self):
        """
			XunLian (迅联) error code 10000: requests issued too fast; fetch a new proxy IP at most once every 5 seconds.
		"""
        if "DRAGONFLY" == self.proxy_agent:
            return CommonClass.get_proxies(proxy_dict={})
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time,
                self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict

    def read_crawled_urls(self):
        resume_break_point_detailed_file_path = os.path.join(
            self.log_dir, self.resume_break_point_detailed_file_name)
        try:
            with open(resume_break_point_detailed_file_path,
                      "r",
                      encoding="utf-8") as log_file:
                self.crawled_detailed_url_list = [
                    line.strip() for line in log_file.readlines()
                    if 0 < len(line.strip())
                ]
        except Exception as ex:
            error_msg = f"fail to read {resume_break_point_detailed_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        # for list pages, do not use this [] to exclude seen urls

    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            urls = [
                # only Guangzhou publishes 阳光家缘 (Yangguang Jiayuan) data
                "http://zfcj.gz.gov.cn/data/Laho/ProjectSearch.aspx",
            ]

            meta_dict = {
                "page_type": "index",
                "page": 1,
                "total_pages": 468,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            for url in urls:
                # yield scrapy.Request( url = url, cookies=cookie_dict, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0],
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)

    def change_cookies(self, cookie_dict={}):
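        # Keep only the last three entries of the "uservisitMarketitem" cookie
        # (URL-decode, trim, re-encode), presumably so the cookie does not grow
        # without bound across requests.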
        if "uservisitMarketitem" in cookie_dict.keys():
            item_str = cookie_dict["uservisitMarketitem"]
            item_str = parse.unquote(item_str)
            item_list = item_str.split(",")
            new_str = ""
            for index, one in enumerate(item_list):
                if index > len(item_list) - 4:
                    new_str += f",{one}"
            cookie_dict["uservisitMarketitem"] = parse.quote(new_str)
        return cookie_dict

    def get_total_pages(self, response=None):
        """
			Parse the pager span text such as "共7013/共468页" and return the total page count (468 in that example).
		"""
        total_pages = 0
        if response is None:
            return total_pages
        all_span_text_list = response.xpath(
            "//span[@id='pe100_page_项目信息查询列表']/div[@class='pager']/span/text()"
        ).extract()
        # pager text looks like "共7013/共468页" (7013 records in 468 pages)
        for one_text in all_span_text_list:
            if -1 == one_text.find("共"):
                continue
            page = 0
            fragment_list = one_text.split("/")
            for one_fragment in fragment_list:
                if -1 < one_fragment.find("页"):
                    page = one_fragment.replace("页", "")
                    page = page.replace("共", "")
                    return int(page)
        return 0

    def make_html_file_name(self, url="", city="", page_type=""):
        """
			Default to a timestamp-based html file name; prefer "{city}__{page_type}__{segment}" when the url path contains an .html segment.
		"""
        now = datetime.datetime.now()
        html_filename = "{}.html".format(now.strftime("%Y%m%d_%H%M%S"))
        today = now.strftime("%Y%m%d")

        url_obj = parse.urlparse(url)
        url_list = url_obj.path.split("/")
        for one in url_list:
            if -1 < one.find(".html"):
                html_filename = f"{city}__{page_type}__{one}"
                break
        return html_filename

    def save_html(self, response=None):
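        # Writes response.body to SAVED_HTML and returns (total_pages, page).
        # total_pages < 1 signals a save/parse failure; 100001 is the sentinel
        # value used for detailed pages, which have no pager to parse.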
        page_type = "index"
        total_pages = 0
        page = 0

        if response is None or not hasattr(response, "meta") or not hasattr(
                response, "body") or not hasattr(response, "url"):
            if hasattr(response, "url"):
                error_msg = f"fail to save response.body after requesting {response.url}; response has no body or meta attribute(s)"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return -1, page
        url = response.url
        meta_dict = response.meta

        if "page_type" in meta_dict.keys(): page_type = meta_dict["page_type"]
        if "page" in meta_dict.keys(): page = meta_dict["page"]

        if "index" == page_type:
            if "total_pages" in meta_dict.keys():
                total_pages = int(meta_dict["total_pages"])
            if 0 == total_pages:
                total_pages = self.get_total_pages(response=response)
            html_filename = f"realestate___list{page}.html"
            html_file_path = os.path.join(self.saved_html_dir, html_filename)

        elif "detailed" == page_type:
            html_filename = self.make_html_file_name(url=url,
                                                     page_type=page_type)
            html_file_path = os.path.join(self.saved_html_dir, html_filename)
            total_pages = 100001

        try:
            with open(html_file_path, "wb") as f:
                f.write(response.body)
        except Exception as ex:
            error_msg = f"fail to write response.body into {html_file_path} after requesting {url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return -2, page
        else:
            if 1 > total_pages:
                error_msg = f"response.body saved after requesting {response.url}; but fail to extract total page number from response.body"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
            return total_pages, page

    def extract_link_list(self, response=None):
        record_list = []
        tr_list = response.xpath('//table[@class="resultTableC"]/tbody/tr')
        for one_tr in tr_list:
            try:
                detailed_page_link = one_tr.xpath(
                    './td/a/@href').extract_first(default="")  # relative to the current <tr>
                detailed_page_link = CommonClass.clean_string(
                    string=detailed_page_link,
                    char_to_remove=[
                        '\r',
                        '\n',
                        '\t',
                        ' ',
                    ])
                td_list = one_tr.xpath('./td')
                value_list = []
                for one_td in td_list:
                    value_list.append(
                        one_td.xpath("./a/text()").extract_first(default=""))

                # 检查这7个字段是否都是空字符串
                if 7 == len(value_list):
                    not_empty = False
                    for one_value in value_list:
                        if isinstance(one_value, str) and 0 < len(one_value):
                            not_empty = True
                            break
                if 7 == len(value_list) and not_empty:
                    this_record = {
                        "序号": value_list[0],
                        "项目名称": value_list[1],
                        "开发商": value_list[2],
                        "预售证": value_list[3],
                        "项目地址": value_list[4],
                        "住宅已售套数": value_list[5],
                        "住宅未售套数": value_list[6],
                        "详情链接": detailed_page_link,
                    }
                    record_list.append(this_record)
                elif 7 != len(value_list):
                    error_msg = f"value_list ({value_list}) has length other than 7"
                    self.logger.error(
                        f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                    )

            except Exception as ex:
                error_msg = f"xpath error! Exception = {ex}"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )
        if 1 > len(record_list):
            error_msg = f"Fail to extract links from {response.url}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
        return record_list

    def load_items_into_loader(self, loader=None, text={}, url=""):
        loader.add_value("content", str(text))  # , encoding="utf-8"
        loader.add_value("page_type", "detailed")

        # record housekeeping fields
        loader.add_value("url", url)
        loader.add_value("project", self.settings.get('BOT_NAME'))
        loader.add_value("spider", self.name)
        loader.add_value("server", socket.gethostname())
        loader.add_value("date",
                         datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
        return loader

    def parse_detailed_response_field(self, response=None, city=""):
        text = {}
        if response is None:
            return text
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return text
        information_div = response.xpath("//div[@id='printData1']")

        title = information_div.xpath(
            "./div[@class='tit_box01']/text()").extract_first(default="")
        land_id = information_div.xpath(
            "./div[@class='menubox01 mt20']/span[@class='gray2']/text()"
        ).extract_first(default="")
        province_city = information_div.xpath(
            "string(./div[@class='menubox01 p0515']/div[@class='fl'])"
        ).extract()
        province_city = "___".join(province_city)

        if 0 < len(title): text["title"] = title
        if 0 < len(land_id): text["land_id"] = land_id
        if 0 < len(province_city): text["province_city"] = province_city

        key1 = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03']/text()"
        ).extract_first(default="")
        if "土地基本信息" == key1:
            basic_info = {}
            tr_list1 = information_div.xpath(
                "./div[@class='p1015']/div[@class='tit_box02 border03']/following-sibling::table[@class='tablebox02 mt10']/tbody/tr"
            )
            for index, one_tr in enumerate(tr_list1):
                string_list = one_tr.xpath("string(.)").extract()
                td_list = []
                for one_str in string_list:
                    cleaned_str = CommonClass.clean_string(string=one_str,
                                                           char_to_remove=[
                                                               '\xa0',
                                                               '\n',
                                                               '\t',
                                                               ' ',
                                                           ])
                    td_list.append(cleaned_str.strip('\r'))
                basic_info[index] = "___".join(td_list)
            text[key1] = basic_info

        key2 = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/text()"
        ).extract_first(default="")
        if "土地交易信息" == key2:
            trade_info = {}
            tr_list2 = information_div.xpath(
                "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/following-sibling::div[@class='banbox']/table[@class='tablebox02 mt10']/tbody/tr"
            )
            for index, one_tr in enumerate(tr_list2):
                string_list = one_tr.xpath("string(.)").extract()
                td_list = []
                for one_str in string_list:
                    cleaned_str = CommonClass.clean_string(string=one_str,
                                                           char_to_remove=[
                                                               '\xa0',
                                                               '\n',
                                                               '\t',
                                                               ' ',
                                                           ])
                    td_list.append(cleaned_str.strip('\r'))
                trade_info[index] = "___".join(td_list)
            text[key2] = trade_info

        # 20190730: cannot extract 土地评估结果 (land appraisal results) yet, TODO ...
        # evaluation_div = response.xpath("//div[@id='divpg']")
        # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" )
        # if "土地评估结果" == key3:
        # 	evaluation_dict = {}
        # 	tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr")
        # 	for index, one_tr in enumerate( tr_list3 ):
        # 		this_td = one_tr.xpath("./td")
        # 		if this_td is None:
        # 			string_list = one_tr.xpath("string(./th)").extract()
        # 		else:
        # 			td_list = one_tr.xpath("./td")
        # 			string_list = []
        # 			for one_td in td_list:
        # 				unit = one_td.xpath("./text()").extract_first( default= "" )
        # 				amount = one_td.xpath("./span/text()").extract_first( default= "" )
        # 				string_list.append( f"{amount}___{unit}" )
        # 				# this_td_str_list = one_td.xpath("string(.)").extract()
        # 				# string_list.extend( this_td_str_list )
        # 		td_th_list = []
        # 		for one_str in string_list:
        # 			cleaned_str = CommonClass.clean_string( string = one_str, char_to_remove = [ '\xa0', '\n', '\t', ' ',] )
        # 			td_th_list.append( cleaned_str.strip('\r') )
        # 		evaluation_dict[index] = "___".join( td_th_list )
        # 	text[key3] = evaluation_dict

        # evaluation_div = response.xpath("//div[@id='divpg']")
        # key3 = evaluation_div.xpath("./div[@class='tit_box02 border03 mt20']/text()").extract_first( default= "" )
        # if "土地评估结果" == key3:
        # 	evaluation_dict = {}
        # 	th_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr/th")
        # 	string_list = th_list3.xpath("string(.)").extract()
        # 	evaluation_dict["fields"] = "___".join( string_list )
        # 	tr_list3 = evaluation_div.xpath("./div[@class='table-03']/table[@class='mt5']/tbody/tr")
        # 	row2 = tr_list3[1].xpath("./td")
        # 	row2string = ""
        # 	str1 = row2[0].xpath("./text()").extract_first( default= "" )
        # 	str2 = row2[1].xpath("string(.)").extract()
        # 	str2 = "___".join( str2 )
        # 	str3amount = response.xpath("//span[@id='scbj_bpgj']")
        # 	str3unit = row2[2].xpath("./text()").extract_first( default= "" )
        # 	str4amount = response.xpath("//span[@id='scbj_bSumPrice']")
        # 	str4amount = str4amount.get()
        # 	str3amount = str3amount.get()
        # 	str4unit = row2[3].xpath("./text()").extract_first( default= "" )
        # 	str5 = row2[4].xpath("./a/@href").extract_first( default= "" )
        # 	evaluation_dict[str1] = f"{str2}___{str3amount} {str3unit}___{str4amount} {str4unit}___{str5}"
        # 	row3 = tr_list3[2].xpath("./td")
        # 	row3str = row3.xpath("string(.)").extract()
        # 	evaluation_dict["假设开发法"] = "___".join( row3str )
        # 	text[key3] = evaluation_dict

        if 0 < len(text): text["city"] = city
        return text

        # {'fields': '\xa0___推出楼面价___评估楼面价___评估总价___操作', '市场比较法': '暂无 元/㎡___ 元/㎡___ 万元___
        # /LandAssessment/b17ea17a-eefa-428b-8b53-461c2bdc67ea.html', '假设开发法': '假设开发法___暂无 元/㎡___元/㎡___万元___[进入评估报告]'}

    def log_for_picking_up_the_crawl_break_point(self,
                                                 page_type="detailed",
                                                 response=None):
        if "detailed" == page_type:
            resume_break_point_file_path = os.path.join(
                self.log_dir, self.resume_break_point_detailed_file_name)
        else:
            resume_break_point_file_path = os.path.join(
                self.log_dir, self.resume_break_point_list_file_name)
        try:
            with open(resume_break_point_file_path, "a") as f:
                f.write(f"{response.url}\n")
        except Exception as ex:
            error_msg = f"fail to write response.url into {resume_break_point_file_path}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

    def parse_detailed_page(self, response=None):
        url = response.url
        url_obj = parse.urlparse(url)
        has_url_error = self.url_contains_error(url_obj_path=url_obj.path)
        if has_url_error:
            return False

        total_pages, page = self.save_html(response=response)
        # `city` is not carried in response.meta for this spider, so fall back to an empty string
        text = self.parse_detailed_response_field(
            response=response, city=response.meta.get("city", ""))
        if isinstance(text, dict) and 0 < len(text):
            try:
                loader = ItemLoader(item=Gzzfcj1Item(), response=response)
                loader = self.load_items_into_loader(loader=loader,
                                                     text=text,
                                                     url=url)
                self.log_for_picking_up_the_crawl_break_point(
                    page_type="detailed", response=response)
                yield loader.load_item()
            except Exception as ex:
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
                )

    def do_nothing_for_debug(self, response=None):
        self.logger.info(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}"
        )
        # print( response.body )
        # Inside Method request_proxy_ip of Class ProxyAgent, proxy server returns [{'IP': '49.87.226.131:10749'}]
        # b'{"REMOTE_ADDR":"49.87.226.131","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"49.87.226.131, 49.87.226.131"}'
        # 2019-06-20 16:28:55 [fangesf] INFO: Inside Method do_nothing_for_debug of Class FangesfSpider,
        # url = https://www.coursehelper.site/index/index/getHeaders?token=ad89558c89c3394167adbfd1484c8700
        # 2019-06-20 16:28:55 [stdout] INFO: b'{"REMOTE_ADDR":"139.196.200.61","HTTP_CLIENT_IP":"","HTTP_X_FORWARDED_FOR":"139.196.200.61, 139.196.200.61"}'

    def url_contains_error(self, url_obj_path=""):
        if 1 > len(url_obj_path):
            return False
        path_fragment_list = url_obj_path.split("/")
        if 1 > len(path_fragment_list):
            return False

        # no anti-crawl redirect patterns are known for this site yet,
        # so nothing is flagged here for now

        return False

    def parse_list_page(self, response=None):
        url_obj = parse.urlparse(response.url)
        has_url_error = self.url_contains_error(url_obj_path=url_obj.path)
        if has_url_error:
            return False

        total_pages, page = self.save_html(response=response)
        print(
            f"total_pages = {total_pages}, page = {page}; url = {response.url}"
        )

        if 1 > total_pages:
            pass
            # -2, -1, 0: error_msg has been logged; just pass
        elif 100001 == total_pages:
            # parse_detailed_page() is a generator; re-yield its items so they reach the pipeline
            yield from self.parse_detailed_page(response=response)
        else:
            link_list = self.extract_link_list(response=response)
            if self.debug:
                self.logger.info(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, url = {response.url}; link_list = {link_list}"
                )
            else:
                new_url = f"{url_obj.scheme}://{url_obj.netloc}"
                self.log_for_picking_up_the_crawl_break_point(
                    page_type="index", response=response)
                # 20190806: skip the detailed pages for now and crawl the list pages first
                for text_dict in link_list:
                    if isinstance(text_dict, dict) and 0 < len(text_dict):
                        try:
                            loader = ItemLoader(item=Gzzfcj1Item(),
                                                response=response)
                            loader = self.load_items_into_loader(
                                loader=loader,
                                text=text_dict,
                                url=response.url)
                            yield loader.load_item()
                        except Exception as ex:
                            self.logger.error(
                                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, fail to load item. Exception = {ex}"
                            )

                # crawling vertically
                # meta_dict = {
                # 	"page_type": "detailed",
                # 	"total_pages": 1,
                # }
                # if self.use_proxy:
                # 	proxies_dict = self.proxy_ip_pool()
                # 	meta_dict["proxy"] = proxies_dict["http"]

                # for one_link in link_list:
                # 	if 0 != one_link.find('/'): one_link = f"/{one_link}"
                # 	this_i_url = f"{new_url}{one_link}"
                # 	if this_i_url in self.crawled_detailed_url_list:
                # 		self.logger.info( f"previously crawled {this_i_url}" )
                # 	else:
                # 		self.logger.info( f"requesting {this_i_url}" )
                # 		yield scrapy.Request( url = this_i_url, cookies=self.cookie_dict, callback = self.parse_detailed_page, meta = meta_dict, dont_filter = True )

                # crawling horizontally
                # http://zfcj.gz.gov.cn/data/Laho/ProjectSearch.aspx?page=4
                if 1 < total_pages and 1 == page:
                    meta_dict = response.meta
                    if self.use_proxy:
                        proxies_dict = self.proxy_ip_pool()
                        meta_dict["proxy"] = proxies_dict["http"]
                    for i in range(total_pages - 1):
                        meta_dict["page"] = i + 2
                        this_i_url = f"{self.base_url}?page={i + 2}"
                        self.logger.info(
                            f"requesting list page at {this_i_url}")
                        yield scrapy.Request(
                            url=f"{this_i_url}",
                            callback=self.parse_list_page,
                            meta=meta_dict,
                            dont_filter=True)  # cookies=self.cookie_dict,

    def read_and_parse(self, response=None):
        file_list = os.listdir(self.saved_html_dir)
        for one_file in file_list:
            if -1 == one_file.find("index"):
                temp_list = one_file.split("___")
                apt_id = 0
                city = ""
                if 1 < len(temp_list):
                    apt_id = temp_list[1]
                    city = temp_list[0]
                url = f"https://{city}.esf.fang.com/chushou/3_{apt_id}.htm"  # can also be 16_, 10_, and others
                # https://sz.esf.fang.com/chushou/3_218307566.htm
                html_file_path = os.path.join(self.saved_html_dir, one_file)
                if os.path.isfile(html_file_path):
                    doc = None
                    with open(html_file_path, 'rb') as f:
                        # doc = f.read().decode('gb2312', 'ignore')
                        doc = f.read().decode('utf-8', 'ignore')
                    if doc is None:
                        self.logger.error(
                            f"Error: cannot read html file {html_file_path}.")
                        continue
                    response = Selector(text=doc, type="html")
                    # this spider's parse_detailed_response_field() takes no apt_id parameter
                    text = self.parse_detailed_response_field(
                        response=response, city=city)
                    try:
                        response_for_items = TextResponse(
                            url=url,
                            status=200,
                            body=bytes(doc, encoding="utf-8"))
                        loader = ItemLoader(item=Gzzfcj1Item(),
                                            response=response_for_items)
                        loader = self.load_items_into_loader(loader=loader,
                                                             text=text,
                                                             url=url)
                        yield loader.load_item()
                    except Exception as ex:
                        self.logger.info(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, Exception = {ex}"
                        )
                    if self.debug:
                        break

    def write_log(self, content=None, logfilename=None, content_only=False):
        if content is not None and 0 < len(content):
            today = datetime.datetime.now().strftime("%Y%m%d")
            if logfilename is None:
                logfilename = f"{self.name}{today}.log"
            try:
                with open(os.path.join(self.log_dir, logfilename),
                          'a',
                          encoding='utf-8') as f:
                    if content_only:
                        info = f"{str(content)}\n"
                    else:
                        info = f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}] {content}\n"
                    f.write(info)
                return 1
            except Exception as ex:
                return 0
        return -1
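
The proxy_ip_pool method that appears in both spiders above is a small time-to-live cache: the last proxy dict is reused until its randomly chosen "expire" timestamp is about to pass (with a 3-second safety margin), then a fresh one is fetched and stamped with a new lifetime. A minimal standalone sketch of that caching pattern, with a hypothetical fetch_proxy() standing in for ProxyAgent.get_xunlian_proxy_dict, could look like this:

import random
import time


def make_proxy_pool(fetch_proxy, min_ttl=6, max_ttl=180):
    """Return a callable that caches the dict produced by fetch_proxy()
    until a randomly chosen expiry passes (illustrative sketch only)."""
    cache = {}

    def get_proxy():
        nonlocal cache
        now = time.time()
        # refresh when the cache is empty, has no expiry, or is about to expire (3 s margin)
        if not cache or "expire" not in cache or now + 3 > cache["expire"]:
            fresh = fetch_proxy() or {}
            if fresh:
                fresh["expire"] = now + random.randint(min_ttl, max_ttl)
                cache = fresh
        return cache

    return get_proxy


if __name__ == "__main__":
    # hypothetical fetcher standing in for the real proxy agent
    fake_fetch = lambda: {"http": "http://127.0.0.1:8888"}
    pool = make_proxy_pool(fake_fetch, min_ttl=1, max_ttl=2)
    print(pool())  # fetches a proxy dict and caches it
    print(pool())  # served from the cache until the expiry passes
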