Example #1
    def process_item(self, item, spider):
        """
			revision: 20190730
		"""
        self.init_self_attributes(spider)

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
        excluded_list = [
            "page_type",
        ]
        key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys(
            item=item, excluded_key_list=excluded_list)
        index = -1
        content_dict = {}
        if "content" in key_list1 and "detailed" == page_type:
            index = key_list1.index("content")
            if -1 < index and index < len(item_list1):
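                # NOTE: eval() assumes the serialized "content" field is a trusted
                # Python-literal dict; ast.literal_eval would be a safer choice here.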
                content_dict = eval(item_list1[index])
                del item_list1[index]
                key_list1.remove("content")

                key_list = list(content_dict.keys()) + key_list1
                item_list = list(content_dict.values()) + item_list1

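                # Append one flattened row to the crawled CSV, then push the same row
                # to Kafka only when this host is one of the cluster spider servers.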
                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname(
                ) in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
        elif "detailed" == page_type:
            error_msg = f"no content in key_list1 ({key_list1})"
            spider.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        return item
Example #2
 def init_self_attributes(self, spider):
     today = datetime.datetime.now().strftime("%Y%m%d")
     if 1 > len(self.crawled_dir):
         self.crawled_dir = spider.settings.get(name="CRAWLED_DIR",
                                                default="")
     if self.csv_file_path is None or 1 > len(self.csv_file_path):
         self.csv_file_path = os.path.join(self.crawled_dir,
                                           f"{spider.name}{today}.csv")
     if self.to_kafka is None:
         self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA",
                                             default=False)
     if 1 > len(self.kafka_topic):
         self.kafka_topic = spider.name
     if self.cluster_servers_for_spiders is None or 1 > len(
             self.cluster_servers_for_spiders):
         self.cluster_servers_for_spiders = spider.settings.get(
             name="CLUSTER_SERVERS_FOR_SPIDERS", default=[])
     if self.cluster_servers_for_kafka is None or 1 > len(
             self.cluster_servers_for_kafka):
         self.cluster_servers_for_kafka = spider.settings.get(
             name="CLUSTER_SERVERS_FOR_KAFKA", default=[])
     if socket.gethostname() in self.cluster_servers_for_spiders:
         self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
             kafka_producer=self.kafka_producer,
             kafka_servers=self.cluster_servers_for_kafka,
             spider_obj=spider)
Example #3
	def init_self_attributes(self, spider):
		if self.root_path is None or 1 > len( self.root_path ):
			self.root_path = spider.settings.get( "PROJECT_PATH", default = None )
		if self.overwrite_today is None or 1 > len( self.overwrite_today ):
			self.overwrite_today = spider.settings.get( "OVERWRITE_TODAY", default = "" )
		if 1 > len( self.overwrite_today ):
			self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

		# set all filenames, file paths, dir
		if 1 > len( self.crawled_dir ):
			self.crawled_dir = spider.settings.get( name='CRAWLED_DIR', default = "" )
		if 1 > len( self.detail_html_dir ):
			self.detail_html_dir = spider.settings.get( name="SAVED_DETAIL_HTML", default="" )
		if self.csv_file_path is None or 1 > len( self.csv_file_path ):
			self.csv_file_path = os.path.join( self.crawled_dir, f"{spider.name}_{self.overwrite_today}.csv" )

		if self.to_kafka is None:
			self.to_kafka = spider.settings.get( name="PIPELINE_TO_KAFKA", default = False )
		if 1 > len( self.kafka_topic ):
			self.kafka_topic = spider.name if hasattr( spider, "name" ) else ""
		if self.cluster_servers_for_spiders is None or 1 > len( self.cluster_servers_for_spiders ):
			self.cluster_servers_for_spiders = spider.settings.get( name="CLUSTER_SERVERS_FOR_SPIDERS", default = [] )
		if self.cluster_servers_for_kafka is None or 1 > len( self.cluster_servers_for_kafka ):
			self.cluster_servers_for_kafka = spider.settings.get( name="CLUSTER_SERVERS_FOR_KAFKA", default = [] )
		if socket.gethostname() in self.cluster_servers_for_spiders:
			self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka( kafka_producer = self.kafka_producer, kafka_servers = self.cluster_servers_for_kafka, spider_obj = spider )
Example #4
    def init_self_attributes(self, spider):
        if self.root_path is None or 1 > len(self.root_path):
            self.root_path = spider.settings.get("PROJECT_PATH", default=None)
        if self.overwrite_today is None or 1 > len(self.overwrite_today):
            self.overwrite_today = spider.settings.get("OVERWRITE_TODAY",
                                                       default="")
        if 1 > len(self.overwrite_today):
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # Gaode information
        if 1 > len(self.key_list):
            self.key_list = spider.settings.get("AMAP_KEYS", default=[])
        if 1 > len(self.city_name_dict):
            self.city_name_dict = spider.settings.get("CITY_NAME_DICT",
                                                      default={})
        if 1 > len(self.city_list):
            self.city_list = spider.settings.get("CITY_LIST", default=[])
        if 1 > len(self.district_list):
            self.district_list = spider.settings.get("DISTRICT_LIST",
                                                     default=[])
        if 1 > len(self.city_name_for_districts):
            self.city_name_for_districts = spider.settings.get(
                "CITY_NAME_FOR_DISTRICTS", default="")
        if self.save_every_response is None:
            self.save_every_response = spider.settings.get(
                "SAVE_EVERY_RESPONSE", default=False)
        if 1 > len(self.headers):
            self.headers = spider.settings.get("DEFAULT_REQUEST_HEADERS",
                                               default={})

        # set all filenames, file paths, dir
        if 1 > len(self.crawled_dir):
            self.crawled_dir = spider.settings.get(name='CRAWLED_DIR',
                                                   default="")
        if 1 > len(self.detail_html_dir):
            self.detail_html_dir = spider.settings.get(
                name="SAVED_DETAIL_HTML", default="")
        if 1 > len(self.gaode_json_dir):
            self.gaode_json_dir = spider.settings.get(name="SAVED_GAODE_JASON",
                                                      default="")
        if self.csv_file_path is None or 1 > len(self.csv_file_path):
            self.csv_file_path = os.path.join(
                self.crawled_dir, f"fang_esf{self.overwrite_today}.csv")

        if self.to_kafka is None:
            self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA",
                                                default=False)
        if self.cluster_servers_for_spiders is None or 1 > len(
                self.cluster_servers_for_spiders):
            self.cluster_servers_for_spiders = spider.settings.get(
                name="CLUSTER_SERVERS_FOR_SPIDERS", default=[])
        if self.cluster_servers_for_kafka is None or 1 > len(
                self.cluster_servers_for_kafka):
            self.cluster_servers_for_kafka = spider.settings.get(
                name="CLUSTER_SERVERS_FOR_KAFKA", default=[])
        if socket.gethostname() in self.cluster_servers_for_spiders:
            self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
                kafka_producer=self.kafka_producer,
                kafka_servers=self.cluster_servers_for_kafka,
                spider_obj=spider)
Example #5
 def close_spider(self, spider=None):
     CommonScrapyPipelineClass.log_close_spider(spider_obj=spider)
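     # If the in-memory xy cache changed, back up the old log file to .bak
     # and rewrite it from spider.xy_seen_dict.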
     if spider.xy_seen_updated_bool:
         log_file_path = os.path.join(spider.log_dir,
                                      spider.xy_response_log_file_name)
         bak_file_name = spider.xy_response_log_file_name.replace(
             ".log", ".bak")
         bak_file_path = os.path.join(spider.log_dir, bak_file_name)
         if os.path.isfile(log_file_path):
             shutil.copyfile(log_file_path, bak_file_path)
         try:
             with open(log_file_path, "w", encoding="utf-8") as xy_log_file:
                 for center_xy_str, item in spider.xy_seen_dict.items():
                     xy_log_file.write(f"{center_xy_str},{item}\n")
         except Exception as ex:
             spider.logger.error(
                 f"cannot write historical xy_log_file ({log_file_path}). Exception = {ex}"
             )
Example #6
 def initialize_amap_xy(self, response):
     if response is None or not hasattr(response, "body") or not hasattr(
             response, "url") or not hasattr(response, "meta"):
         self.logger.error(
             f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object"
         )
         return None
     meta_dict = response.meta
     bd09xy = "%.6f,%.6f" % (meta_dict["x"], meta_dict["y"])
     index = meta_dict["index"]
     json_dict = json.loads(response.body)
     if "status" not in json_dict.keys(
     ) or "locations" not in json_dict.keys() or 1 != int(
             json_dict["status"]):
         self.logger.error(
             f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response status"
         )
         return None
     if not isinstance(json_dict["locations"],
                       str) or 1 > len(json_dict["locations"]):
         self.logger.error(
             f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response locations"
         )
         return None
     amap_xy = json_dict["locations"]
     this_row = f"{index}:{bd09xy}==>{amap_xy}"
     new_xy_file_name = "data4cities_amap.txt"
     new_xy_log_file_name = "bd09to_amap.log"
     new_xy_log_file_name = os.path.join(self.root_path, self.name,
                                         new_xy_log_file_name)
     new_xy_file_name = os.path.join(self.root_path, self.name,
                                     new_xy_file_name)
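     # Persist both the converted coordinate and the bd09-to-amap mapping line.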
     CommonScrapyPipelineClass.append_row(
         spider_obj=self,
         key_list=["xy"],
         item_list=[amap_xy],
         csv_file_path_str=new_xy_file_name)
     CommonScrapyPipelineClass.append_row(
         spider_obj=self,
         key_list=["xy"],
         item_list=[this_row],
         csv_file_path_str=new_xy_log_file_name)
Example #7
    def process_item(self, item, spider):
        """
			there are so many lat, and lng for one bus route (one item), therefore we do not request amap here.
		"""
        self.init_self_attributes(spider=spider)

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
                break

        excluded_list = [
            "page_type",
        ]
        key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys(
            item=item, excluded_key_list=excluded_list)

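        # For detail pages, the helper expands the serialized "content" field into
        # the key/item lists before the row is written.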
        if "detailed" == page_type:
            result_bool, key_list, item_list = CommonScrapyPipelineClass.extract_items_and_keys_from_content(
                raw_key_list=key_list1,
                raw_item_list=item_list1,
                content_field_name_str="content")
            if result_bool:
                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname(
                ) in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
            else:
                spider.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, no content in key_list1 ({key_list1})"
                )

        return item
Example #8
 def close_spider(self, spider=None):
     CommonScrapyPipelineClass.log_close_spider(spider_obj=spider)
Example #9
    def process_item(self, item, spider):
        self.init_self_attributes(spider=spider)

        random_key = random.randint(0, len(self.key_list) - 1)
        account = self.key_list[random_key]

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
        excluded_list = [
            "page_type",
        ]
        all_keys1, item_list1 = self.get_items_and_keys(
            item=item, excluded_list=excluded_list)
        index = -1
        content_dict = {}
        if "content" in all_keys1 and "detailed" == page_type:
            index = all_keys1.index("content")
            if -1 < index and index < len(item_list1):
                content_dict = eval(item_list1[index])
                del item_list1[index]
                all_keys1.remove("content")
                content_dict["longitude"] = np.nan
                content_dict["latitude"] = np.nan
                content_dict["adcode"] = np.nan

                # request Gaode here
                if isinstance(item["url"], list):
                    url_str = str(item["url"][0]).replace("https://", "")
                elif isinstance(item["url"], str):
                    url_str = item["url"].replace("https://", "")
                else:
                    url_str = ""
                temp_list = url_str.split(".")
                city_name = temp_list[0] if 0 < len(temp_list) and 0 < len(
                    temp_list[0]) else ""
                if 0 < len(city_name):
                    city_name = self.check_city_name(city_name)
                    three_requests_for_tryout = [
                        "location",
                        "address",
                    ]
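                    # Try each candidate field in turn and stop at the first Gaode
                    # response that returns at least one geocoding match.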
                    for one_tryout in three_requests_for_tryout:
                        if one_tryout in content_dict.keys():
                            result_dict = {}
                            params = {
                                "key":
                                account,
                                "address":
                                str(self.clean_addr(content_dict[one_tryout])),
                                "city":
                                city_name,
                            }
                            response = requests.get(self.base_gaode_url,
                                                    headers=self.headers,
                                                    params=params)
                            if 200 == response.status_code:
                                if self.save_every_response is not None and self.save_every_response:
                                    self.save_reponsed_json_file(
                                        rent_id=content_dict["rent_id"],
                                        response=response.text)
                                result_dict = self.parse_gaode_json(
                                    response.text)
                                if 0 < (result_dict["count"]):
                                    content_dict["longitude"] = result_dict[
                                        "longitude"]
                                    content_dict["latitude"] = result_dict[
                                        "latitude"]
                                    content_dict["adcode"] = result_dict[
                                        "adcode"]
                                    break
                key_list = list(content_dict.keys()) + all_keys1
                item_list = list(content_dict.values()) + item_list1

                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname(
                ) in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
        elif "detailed" == page_type:
            spider.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, no content in all_keys1 ({all_keys1})"
            )

        return item
Example #10
    def process_item(self, item, spider):
        """
			todo: some parts of this method can be moved to commonfunctions.py
		"""
        self.init_self_attributes(spider)

        random_key = random.randint(0, len(self.key_list) - 1)
        account = self.key_list[random_key]

        page_type = ""
        for index, one in enumerate(item):
            if "page_type" == one and 1 == len(item["page_type"]):
                page_type = str(item["page_type"][0])
        excluded_list = [
            "page_type",
        ]
        key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys(
            item=item, excluded_key_list=excluded_list)
        index = -1
        content_dict = {}
        if "content" in key_list1 and "detailed" == page_type:
            index = key_list1.index("content")
            if -1 < index and index < len(item_list1):
                content_dict = eval(item_list1[index])
                del item_list1[index]
                key_list1.remove("content")
                content_dict["longitude"] = np.nan
                content_dict["latitude"] = np.nan
                content_dict["adcode"] = np.nan

                # request Gaode here
                if isinstance(item["url"], list):
                    url_str = str(item["url"][0])
                elif isinstance(item["url"], str):
                    url_str = item["url"]
                else:
                    url_str = ""
                city_name_fang = self.get_city_or_district_name_from_url(
                    url=url_str)
                if 0 < len(city_name_fang):
                    city_name_amap = self.switch_city_name(
                        city_name=city_name_fang, spider=spider)
                    community_name = self.extract_community_name(
                        content_dict=content_dict)
                    spider.logger.info(
                        f"requesting Gaode using community name {community_name}"
                    )
                    if 0 < len(community_name):
                        result_dict = {}
                        params = {
                            "key":
                            account,
                            "address":
                            str(
                                CommonScrapyPipelineClass.clean_addr(
                                    text=community_name)),
                            "city":
                            city_name_amap,
                        }
                        try:
                            # 2019-06-21: 14 records were lost while crawling Foshan because of a DNS
                            # resolution failure (socket.gaierror: [Errno -3] Temporary failure in name
                            # resolution). This code was added to log the error if records are lost again.
                            response = requests.get(self.base_gaode_url,
                                                    headers=self.headers,
                                                    params=params)
                            if 200 == response.status_code:
                                if self.save_every_response is not None and self.save_every_response:
                                    self.save_reponsed_json_file(
                                        apt_id=content_dict["apt_id"],
                                        response=response.text,
                                        spider=spider)
                                result_dict = CommonScrapyPipelineClass.parse_gaode_json(
                                    json_text=response.text)
                                if 0 < (result_dict["count"]):
                                    content_dict["longitude"] = result_dict[
                                        "longitude"]
                                    content_dict["latitude"] = result_dict[
                                        "latitude"]
                                    content_dict["adcode"] = result_dict[
                                        "adcode"]
                        except Exception as ex:
                            spider.logger.error(
                                f"requests or other errors. Exception = {ex}")

                key_list = list(content_dict.keys()) + key_list1
                item_list = list(content_dict.values()) + item_list1

                CommonScrapyPipelineClass.append_row(
                    spider_obj=spider,
                    key_list=key_list,
                    item_list=item_list,
                    csv_file_path_str=self.csv_file_path)
                if self.to_kafka and socket.gethostname(
                ) in self.cluster_servers_for_spiders:
                    CommonScrapyPipelineClass.pipeline_to_kafka(
                        spider_obj=spider,
                        key_list=key_list,
                        item_list=item_list,
                        kafka_topic_str=self.kafka_topic,
                        kafka_producer_obj=self.kafka_producer)
        elif "detailed" == page_type:
            error_msg = f"no content in key_list1 ({key_list1})"
            spider.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )

        return item