Example 1
    def process_item(self, item, spider):
        """
			revision: 20190730
		"""
        self.init_self_attributes(spider)

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
        excluded_list = [
            "page_type",
        ]
        key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys(
            item=item, excluded_key_list=excluded_list)
        index = -1
        content_dict = {}
        if "content" in key_list1 and "detailed" == page_type:
            index = key_list1.index("content")
            if -1 < index and index < len(item_list1):
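                # "content" holds a stringified dict; flatten its keys and
                # values into the row alongside the remaining item fields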
                content_dict = eval(item_list1[index])
                item_list1.remove(item_list1[index])
                key_list1.remove("content")

                keys = []
                items = []
                for key, value in content_dict.items():
                    keys.append(key)
                    items.append(value)
                key_list = keys + key_list1
                item_list = items + item_list1

                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname() in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
        elif "detailed" == page_type:
            error_msg = f"no content in key_list1 ({key_list1})"
            spider.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        return item
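Note that Example 1 rebuilds the serialized "content" field with eval(), which will execute arbitrary code if the field is ever malformed or hostile. A minimal sketch of a safer drop-in, assuming the field holds a Python-literal dict (the helper name parse_content_field is hypothetical, not part of the original pipeline):

    import ast

    def parse_content_field(raw_content):
        """Rebuild the dict stored in the serialized 'content' field.

        Uses ast.literal_eval instead of eval, and falls back to an empty
        dict when the string is not a valid Python literal.
        """
        try:
            parsed = ast.literal_eval(raw_content)
        except (ValueError, SyntaxError):
            return {}
        return parsed if isinstance(parsed, dict) else {}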
Example 2
    def process_item(self, item, spider):
        """
			there are so many lat, and lng for one bus route (one item), therefore we do not request amap here.
		"""
        self.init_self_attributes(spider=spider)

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
                break

        excluded_list = [
            "page_type",
        ]
        key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys(
            item=item, excluded_key_list=excluded_list)

        if "detailed" == page_type:
            result_bool, key_list, item_list = CommonScrapyPipelineClass.extract_items_and_keys_from_content(
                raw_key_list=key_list1,
                raw_item_list=item_list1,
                content_field_name_str="content")
            if result_bool:
                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname() in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
            else:
                spider.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, no content in key_list1 ({key_list1})"
                )

        return item
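Example 2 pushes the flattening into CommonScrapyPipelineClass.extract_items_and_keys_from_content, whose source is not shown here. A rough standalone sketch of what such a helper could look like, inferred from the inline logic in Example 1; the implementation below is an assumption, not the class's actual code:

    import ast

    def extract_items_and_keys_from_content(raw_key_list,
                                             raw_item_list,
                                             content_field_name_str="content"):
        """Pop the serialized content field and prepend its keys and values.

        Returns (success, key_list, item_list); on failure the raw lists
        are handed back unchanged.
        """
        if content_field_name_str not in raw_key_list:
            return False, raw_key_list, raw_item_list
        index = raw_key_list.index(content_field_name_str)
        if index >= len(raw_item_list):
            return False, raw_key_list, raw_item_list
        content_dict = ast.literal_eval(raw_item_list[index])
        remaining_keys = [k for i, k in enumerate(raw_key_list) if i != index]
        remaining_items = [v for i, v in enumerate(raw_item_list) if i != index]
        key_list = list(content_dict.keys()) + remaining_keys
        item_list = list(content_dict.values()) + remaining_items
        return True, key_list, item_list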
Example 3
    def process_item(self, item, spider):
        self.init_self_attributes(spider=spider)

        random_key = random.randint(0, len(self.key_list) - 1)
        account = self.key_list[random_key]

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
        excluded_list = [
            "page_type",
        ]
        all_keys1, item_list1 = self.get_items_and_keys(
            item=item, excluded_list=excluded_list)
        index = -1
        content_dict = {}
        if "content" in all_keys1 and "detailed" == page_type:
            index = all_keys1.index("content")
            if -1 < index and index < len(item_list1):
                content_dict = eval(item_list1[index])
                item_list1.remove(item_list1[index])
                all_keys1.remove("content")
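                # seed coordinate fields with NaN so the row still has them
                # when the Gaode lookup below fails or is skipped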
                content_dict["longitude"] = np.nan
                content_dict["latitude"] = np.nan
                content_dict["adcode"] = np.nan

                # request Gaode here
                if isinstance(item["url"], list):
                    temp_str = str(item["url"][0]).replace("https://", "")
                elif isinstance(item["url"], str):
                    temp_str = item["url"].replace("https://", "")
                else:
                    temp_str = ""
                temp_list = temp_str.split(".")
                city_name = temp_list[0] if 0 < len(temp_list) and 0 < len(
                    temp_list[0]) else ""
                if 0 < len(city_name):
                    city_name = self.check_city_name(city_name)
                    three_requests_for_tryout = [
                        "location",
                        "address",
                    ]
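                    # geocode with the "location" field first, then fall back
                    # to "address"; stop at the first response with a hit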
                    for one_tryout in three_requests_for_tryout:
                        if one_tryout in content_dict.keys():
                            result_dict = {}
                            params = {
                                "key": account,
                                "address": str(self.clean_addr(content_dict[one_tryout])),
                                "city": city_name,
                            }
                            response = requests.get(self.base_gaode_url,
                                                    headers=self.headers,
                                                    params=params)
                            if 200 == response.status_code:
                                if self.save_every_response is not None and self.save_every_response:
                                    self.save_reponsed_json_file(
                                        rent_id=content_dict["rent_id"],
                                        response=response.text)
                                result_dict = self.parse_gaode_json(
                                    response.text)
                                if 0 < result_dict["count"]:
                                    content_dict["longitude"] = result_dict["longitude"]
                                    content_dict["latitude"] = result_dict["latitude"]
                                    content_dict["adcode"] = result_dict["adcode"]
                                    break
                keys = []
                items = []
                for key, value in content_dict.items():
                    keys.append(key)
                    items.append(value)
                key_list = keys + all_keys1
                item_list = items + item_list1

                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname() in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
        elif "detailed" == page_type:
            spider.logger.error(
                f"no content in all_keys1 ({all_keys1}) in Method process_item of Class FangPipeline. Exception = {ex}"
            )

        return item
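Examples 3 and 4 both read count, longitude, latitude, and adcode out of the parsed Gaode response, but parse_gaode_json itself is not shown. A minimal sketch of such a parser, assuming the usual Gaode/Amap geocoding reply shape (a "count" field plus a "geocodes" list whose first entry carries a "lng,lat" location string and an "adcode"); the exact field names are an assumption:

    import json

    def parse_gaode_json(json_text):
        """Pull count, longitude, latitude, and adcode from a geocode reply.

        Returns count == 0 whenever the reply cannot be parsed or holds no
        geocodes, so callers can skip the coordinate assignment.
        """
        result = {"count": 0, "longitude": None, "latitude": None, "adcode": None}
        try:
            data = json.loads(json_text)
        except (ValueError, TypeError):
            return result
        geocodes = data.get("geocodes") or []
        try:
            result["count"] = int(data.get("count", 0))
        except (ValueError, TypeError):
            return result
        if result["count"] > 0 and geocodes:
            location = str(geocodes[0].get("location", ""))
            parts = location.split(",")
            if len(parts) == 2:
                result["longitude"], result["latitude"] = parts
            result["adcode"] = geocodes[0].get("adcode")
        return result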
Example 4
    def process_item(self, item, spider):
        """
			todo: some parts of this method can be moved to commonfunctions.py
		"""
        self.init_self_attributes(spider)

        random_key = random.randint(0, len(self.key_list) - 1)
        account = self.key_list[random_key]

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
        excluded_list = [
            "page_type",
        ]
        key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys(
            item=item, excluded_key_list=excluded_list)
        index = -1
        content_dict = {}
        if "content" in key_list1 and "detailed" == page_type:
            index = key_list1.index("content")
            if -1 < index and index < len(item_list1):
                content_dict = eval(item_list1[index])
                item_list1.remove(item_list1[index])
                key_list1.remove("content")
                content_dict["longitude"] = np.nan
                content_dict["latitude"] = np.nan
                content_dict["adcode"] = np.nan

                # request Gaode here
                if isinstance(item["url"], list):
                    url_str = str(item["url"][0])
                elif isinstance(item["url"], str):
                    url_str = item["url"]
                else:
                    url_str = ""
                city_name_fang = self.get_city_or_district_name_from_url(
                    url=url_str)
                if 0 < len(city_name_fang):
                    city_name_amap = self.switch_city_name(
                        city_name=city_name_fang, spider=spider)
                    community_name = self.extract_community_name(
                        content_dict=content_dict)
                    spider.logger.info(
                        f"requesting Gaode using community name {community_name}"
                    )
                    if 0 < len(community_name):
                        result_dict = {}
                        params = {
                            "key": account,
                            "address": str(CommonScrapyPipelineClass.clean_addr(text=community_name)),
                            "city": city_name_amap,
                        }
                        try:
                            # 20190621: 14 records were lost while crawling Foshan because DNS
                            # resolution failed (socket.gaierror: [Errno -3] Temporary failure
                            # in name resolution). Code added here to log the failure if
                            # records are lost again.
                            response = requests.get(self.base_gaode_url,
                                                    headers=self.headers,
                                                    params=params)
                            if 200 == response.status_code:
                                if self.save_every_response is not None and self.save_every_response:
                                    self.save_reponsed_json_file(
                                        apt_id=content_dict["apt_id"],
                                        response=response.text,
                                        spider=spider)
                                result_dict = CommonScrapyPipelineClass.parse_gaode_json(
                                    json_text=response.text)
                                if 0 < result_dict["count"]:
                                    content_dict["longitude"] = result_dict["longitude"]
                                    content_dict["latitude"] = result_dict["latitude"]
                                    content_dict["adcode"] = result_dict["adcode"]
                        except Exception as ex:
                            spider.logger.error(
                                f"requests or other errors. Exception = {ex}")

                keys = []
                items = []
                for key, value in content_dict.items():
                    keys.append(key)
                    items.append(value)
                key_list = keys + key_list1
                item_list = items + item_list1

                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname() in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
        elif "detailed" == page_type:
            error_msg = f"no content in key_list1 ({key_list1})"
            spider.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        return item
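The translated comment in Example 4 notes that transient DNS failures (socket.gaierror) once cost 14 records while crawling Foshan. A small retry wrapper around the geocoding request would cover that case; this is a sketch under the assumption that retrying the Gaode endpoint is acceptable, not part of the original pipelines:

    import time
    import requests

    def get_with_retries(url, headers=None, params=None, retries=3, backoff_seconds=2):
        """Retry transient connection/DNS errors before giving up.

        Returns the Response on success, or None once the retries are
        exhausted; the caller can then log the lost record.
        """
        for attempt in range(1, retries + 1):
            try:
                return requests.get(url, headers=headers, params=params, timeout=10)
            except requests.exceptions.ConnectionError:
                if attempt == retries:
                    return None
                time.sleep(backoff_seconds * attempt)

Dropping this in place of the bare requests.get call inside the try block of Example 4 would keep the existing status-code check intact, provided the None return is handled before reading response.status_code.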