def process_item(self, item, spider): """ revision: 20190730 """ self.init_self_attributes(spider) page_type = "" for index, one in enumerate(item): if "page_type" == one and 1 == len(item["page_type"]): page_type = str(item["page_type"][0]) excluded_list = [ "page_type", ] key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys( item=item, excluded_key_list=excluded_list) index = -1 content_dict = {} if "content" in key_list1 and "detailed" == page_type: index = key_list1.index("content") if -1 < index and index < len(item_list1): content_dict = eval(item_list1[index]) item_list1.remove(item_list1[index]) key_list1.remove("content") keys = [] items = [] for key, value in content_dict.items(): keys.append(key) items.append(value) key_list = keys + key_list1 item_list = items + item_list1 CommonScrapyPipelineClass.append_row( spider_obj=spider, key_list=key_list, item_list=item_list, csv_file_path_str=self.csv_file_path) if self.to_kafka and socket.gethostname( ) in self.cluster_servers_for_spiders: CommonScrapyPipelineClass.pipeline_to_kafka( spider_obj=spider, key_list=key_list, item_list=item_list, kafka_topic_str=self.kafka_topic, kafka_producer_obj=self.kafka_producer) elif "detailed" == page_type: error_msg = f"no content in key_list1 ({key_list1})" spider.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return item
def init_self_attributes(self, spider):
    today = datetime.datetime.now().strftime("%Y%m%d")
    if 1 > len(self.crawled_dir):
        self.crawled_dir = spider.settings.get(name="CRAWLED_DIR", default="")
    if self.csv_file_path is None or 1 > len(self.csv_file_path):
        self.csv_file_path = os.path.join(self.crawled_dir, f"{spider.name}{today}.csv")
    if self.to_kafka is None:
        self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA", default=False)
    if 1 > len(self.kafka_topic):
        self.kafka_topic = spider.name
    if self.cluster_servers_for_spiders is None or 1 > len(self.cluster_servers_for_spiders):
        self.cluster_servers_for_spiders = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_SPIDERS", default=[])
    if self.cluster_servers_for_kafka is None or 1 > len(self.cluster_servers_for_kafka):
        self.cluster_servers_for_kafka = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_KAFKA", default=[])
    if socket.gethostname() in self.cluster_servers_for_spiders:
        self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
            kafka_producer=self.kafka_producer,
            kafka_servers=self.cluster_servers_for_kafka,
            spider_obj=spider)
def init_self_attributes(self, spider):
    if self.root_path is None or 1 > len(self.root_path):
        self.root_path = spider.settings.get("PROJECT_PATH", default=None)
    if self.overwrite_today is None or 1 > len(self.overwrite_today):
        self.overwrite_today = spider.settings.get("OVERWRITE_TODAY", default="")
    if 1 > len(self.overwrite_today):
        self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

    # set all filenames, file paths, dir
    if 1 > len(self.crawled_dir):
        self.crawled_dir = spider.settings.get(name="CRAWLED_DIR", default="")
    if 1 > len(self.detail_html_dir):
        self.detail_html_dir = spider.settings.get(name="SAVED_DETAIL_HTML", default="")
    if self.csv_file_path is None or 1 > len(self.csv_file_path):
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"{spider.name}_{self.overwrite_today}.csv")

    if self.to_kafka is None:
        self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA", default=False)
    if 1 > len(self.kafka_topic):
        self.kafka_topic = spider.name if hasattr(spider, "name") else ""
    if self.cluster_servers_for_spiders is None or 1 > len(self.cluster_servers_for_spiders):
        self.cluster_servers_for_spiders = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_SPIDERS", default=[])
    if self.cluster_servers_for_kafka is None or 1 > len(self.cluster_servers_for_kafka):
        self.cluster_servers_for_kafka = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_KAFKA", default=[])
    if socket.gethostname() in self.cluster_servers_for_spiders:
        self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
            kafka_producer=self.kafka_producer,
            kafka_servers=self.cluster_servers_for_kafka,
            spider_obj=spider)
def init_self_attributes(self, spider):
    if self.root_path is None or 1 > len(self.root_path):
        self.root_path = spider.settings.get("PROJECT_PATH", default=None)
    if self.overwrite_today is None or 1 > len(self.overwrite_today):
        self.overwrite_today = spider.settings.get("OVERWRITE_TODAY", default="")
    if 1 > len(self.overwrite_today):
        self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

    # Gaode information
    if 1 > len(self.key_list):
        self.key_list = spider.settings.get("AMAP_KEYS", default=[])
    if 1 > len(self.city_name_dict):
        self.city_name_dict = spider.settings.get("CITY_NAME_DICT", default={})
    if 1 > len(self.city_list):
        self.city_list = spider.settings.get("CITY_LIST", default=[])
    if 1 > len(self.district_list):
        self.district_list = spider.settings.get("DISTRICT_LIST", default=[])
    if 1 > len(self.city_name_for_districts):
        self.city_name_for_districts = spider.settings.get(
            "CITY_NAME_FOR_DISTRICTS", default="")
    if self.save_every_response is None:
        self.save_every_response = spider.settings.get(
            "SAVE_EVERY_RESPONSE", default=False)
    if 1 > len(self.headers):
        self.headers = spider.settings.get("DEFAULT_REQUEST_HEADERS", default={})

    # set all filenames, file paths, dir
    if 1 > len(self.crawled_dir):
        self.crawled_dir = spider.settings.get(name="CRAWLED_DIR", default="")
    if 1 > len(self.detail_html_dir):
        self.detail_html_dir = spider.settings.get(name="SAVED_DETAIL_HTML", default="")
    if 1 > len(self.gaode_json_dir):
        self.gaode_json_dir = spider.settings.get(name="SAVED_GAODE_JASON", default="")
    if self.csv_file_path is None or 1 > len(self.csv_file_path):
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"fang_esf{self.overwrite_today}.csv")

    if self.to_kafka is None:
        self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA", default=False)
    if self.cluster_servers_for_spiders is None or 1 > len(self.cluster_servers_for_spiders):
        self.cluster_servers_for_spiders = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_SPIDERS", default=[])
    if self.cluster_servers_for_kafka is None or 1 > len(self.cluster_servers_for_kafka):
        self.cluster_servers_for_kafka = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_KAFKA", default=[])
    if socket.gethostname() in self.cluster_servers_for_spiders:
        self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
            kafka_producer=self.kafka_producer,
            kafka_servers=self.cluster_servers_for_kafka,
            spider_obj=spider)
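# A minimal sketch of what CommonScrapyPipelineClass.initialize_kafka might do,
# assuming the kafka-python package is used for the producer. The real helper is
# not shown in this file, so the reuse check and error handling here are
# illustrative only, not the project's actual implementation.
from kafka import KafkaProducer
from kafka.errors import KafkaError


def initialize_kafka_sketch(kafka_producer, kafka_servers, spider_obj=None):
    # reuse an existing producer if one was already created for this process
    if kafka_producer is not None:
        return kafka_producer
    try:
        return KafkaProducer(bootstrap_servers=kafka_servers)
    except KafkaError as ex:
        if spider_obj is not None:
            spider_obj.logger.error(f"failed to initialize Kafka producer. Exception = {ex}")
        return None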
def close_spider(self, spider=None):
    CommonScrapyPipelineClass.log_close_spider(spider_obj=spider)
    if spider.xy_seen_updated_bool:
        log_file_path = os.path.join(spider.log_dir, spider.xy_response_log_file_name)
        bak_file_name = spider.xy_response_log_file_name.replace(".log", ".bak")
        bak_file_path = os.path.join(spider.log_dir, bak_file_name)
        if os.path.isfile(log_file_path):
            shutil.copyfile(log_file_path, bak_file_path)
        try:
            with open(log_file_path, "w", encoding="utf-8") as xy_log_file:
                for index, center_xy_str in enumerate(spider.xy_seen_dict):
                    item = spider.xy_seen_dict[center_xy_str]
                    xy_log_file.write(f"{center_xy_str},{item}\n")
        except Exception as ex:
            spider.logger.error(
                f"cannot write historical xy_log_file ({log_file_path}). Exception = {ex}")
def initialize_amap_xy(self, response):
    if response is None or not hasattr(response, "body") or not hasattr(
            response, "url") or not hasattr(response, "meta"):
        self.logger.error(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response object")
        return None
    meta_dict = response.meta
    bd09xy = "%.6f,%.6f" % (meta_dict["x"], meta_dict["y"])
    index = meta_dict["index"]

    json_dict = json.loads(response.body)
    if "status" not in json_dict.keys() or "locations" not in json_dict.keys() or 1 != int(json_dict["status"]):
        self.logger.error(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response status")
        return None
    if not isinstance(json_dict["locations"], str) or 1 > len(json_dict["locations"]):
        self.logger.error(
            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, bad response locations")
        return None

    amap_xy = json_dict["locations"]
    this_row = f"{index}:{bd09xy}==>{amap_xy}"
    new_xy_file_name = "data4cities_amap.txt"
    new_xy_log_file_name = "bd09to_amap.log"
    new_xy_log_file_name = os.path.join(self.root_path, self.name, new_xy_log_file_name)
    new_xy_file_name = os.path.join(self.root_path, self.name, new_xy_file_name)
    CommonScrapyPipelineClass.append_row(
        spider_obj=self,
        key_list=["xy"],
        item_list=[amap_xy],
        csv_file_path_str=new_xy_file_name)
    CommonScrapyPipelineClass.append_row(
        spider_obj=self,
        key_list=["xy"],
        item_list=[this_row],
        csv_file_path_str=new_xy_log_file_name)
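# For reference, initialize_amap_xy above expects a JSON body shaped roughly like
# the sample below: a "status" flag plus a "locations" string of "lng,lat" pairs.
# This standalone snippet only replays the checks the callback performs; the
# sample payload is illustrative, not a captured AMap response.
import json

sample_body = b'{"status": "1", "info": "ok", "locations": "113.264435,23.129163"}'
json_dict = json.loads(sample_body)
assert "status" in json_dict and 1 == int(json_dict["status"])
assert isinstance(json_dict["locations"], str) and 0 < len(json_dict["locations"])
print(json_dict["locations"])  # -> "113.264435,23.129163"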
def process_item(self, item, spider): """ there are so many lat, and lng for one bus route (one item), therefore we do not request amap here. """ self.init_self_attributes(spider=spider) page_type = "" for index, one in enumerate(item): if "page_type" == one and 1 == len(item["page_type"]): page_type = str(item["page_type"][0]) break excluded_list = [ "page_type", ] key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys( item=item, excluded_key_list=excluded_list) if "detailed" == page_type: result_bool, key_list, item_list = CommonScrapyPipelineClass.extract_items_and_keys_from_content( raw_key_list=key_list1, raw_item_list=item_list1, content_field_name_str="content") if result_bool: CommonScrapyPipelineClass.append_row( spider_obj=spider, key_list=key_list, item_list=item_list, csv_file_path_str=self.csv_file_path) if self.to_kafka and socket.gethostname( ) in self.cluster_servers_for_spiders: CommonScrapyPipelineClass.pipeline_to_kafka( spider_obj=spider, key_list=key_list, item_list=item_list, kafka_topic_str=self.kafka_topic, kafka_producer_obj=self.kafka_producer) else: spider.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, no content in key_list1 ({key_list1})" ) return item
def close_spider(self, spider=None):
    CommonScrapyPipelineClass.log_close_spider(spider_obj=spider)
def process_item(self, item, spider):
    self.init_self_attributes(spider=spider)
    random_key = random.randint(0, len(self.key_list) - 1)
    account = self.key_list[random_key]

    page_type = ""
    for index, one in enumerate(item):
        if "page_type" == one and 1 == len(item["page_type"]):
            page_type = str(item["page_type"][0])

    excluded_list = ["page_type", ]
    all_keys1, item_list1 = self.get_items_and_keys(item=item, excluded_list=excluded_list)

    index = -1
    content_dict = {}
    if "content" in all_keys1 and "detailed" == page_type:
        index = all_keys1.index("content")
        if -1 < index and index < len(item_list1):
            content_dict = eval(item_list1[index])
            # pop by position rather than removing by value
            item_list1.pop(index)
            all_keys1.remove("content")
        content_dict["longitude"] = np.nan
        content_dict["latitude"] = np.nan
        content_dict["adcode"] = np.nan

        # request Gaode here
        temp_list = ""
        if isinstance(item["url"], list):
            temp_list = str(item["url"][0]).replace("https://", "")
        elif isinstance(item["url"], str):
            temp_list = item["url"].replace("https://", "")
        temp_list = temp_list.split(".")
        city_name = temp_list[0] if 0 < len(temp_list) and 0 < len(temp_list[0]) else ""
        if 0 < len(city_name):
            city_name = self.check_city_name(city_name)
        three_requests_for_tryout = ["location", "address", ]
        for one_tryout in three_requests_for_tryout:
            if one_tryout in content_dict.keys():
                result_dict = {}
                params = {
                    "key": account,
                    "address": str(self.clean_addr(content_dict[one_tryout])),
                    "city": city_name,
                }
                response = requests.get(self.base_gaode_url, headers=self.headers, params=params)
                if 200 == response.status_code:
                    if self.save_every_response is not None and self.save_every_response:
                        self.save_reponsed_json_file(
                            rent_id=content_dict["rent_id"], response=response.text)
                    result_dict = self.parse_gaode_json(response.text)
                    if 0 < result_dict["count"]:
                        content_dict["longitude"] = result_dict["longitude"]
                        content_dict["latitude"] = result_dict["latitude"]
                        content_dict["adcode"] = result_dict["adcode"]
                        break

        keys = []
        items = []
        for key, value in content_dict.items():
            keys.append(key)
            items.append(value)
        key_list = keys + all_keys1
        item_list = items + item_list1

        CommonScrapyPipelineClass.append_row(
            spider_obj=spider,
            key_list=key_list,
            item_list=item_list,
            csv_file_path_str=self.csv_file_path)
        if self.to_kafka and socket.gethostname() in self.cluster_servers_for_spiders:
            CommonScrapyPipelineClass.pipeline_to_kafka(
                spider_obj=spider,
                key_list=key_list,
                item_list=item_list,
                kafka_topic_str=self.kafka_topic,
                kafka_producer_obj=self.kafka_producer)
    elif "detailed" == page_type:
        spider.logger.error(
            f"no content in all_keys1 ({all_keys1}) in Method process_item of Class FangPipeline")
    return item
def process_item(self, item, spider): """ todo: some parts of this method can be moved to commonfunctions.py """ self.init_self_attributes(spider) random_key = random.randint(0, len(self.key_list) - 1) account = self.key_list[random_key] page_type = "" for index, one in enumerate(item): if "page_type" == one and 1 == len(item["page_type"]): page_type = str(item["page_type"][0]) excluded_list = [ "page_type", ] key_list1, item_list1 = CommonScrapyPipelineClass.get_items_and_keys( item=item, excluded_key_list=excluded_list) index = -1 content_dict = {} if "content" in key_list1 and "detailed" == page_type: index = key_list1.index("content") if -1 < index and index < len(item_list1): content_dict = eval(item_list1[index]) item_list1.remove(item_list1[index]) key_list1.remove("content") content_dict["longitude"] = np.nan content_dict["latitude"] = np.nan content_dict["adcode"] = np.nan # request Gaode here if isinstance(item["url"], list): url_str = str(item["url"][0]) elif isinstance(item["url"], str): url_str = item["url"] city_name_fang = self.get_city_or_district_name_from_url( url=url_str) if 0 < len(city_name_fang): city_name_amap = self.switch_city_name( city_name=city_name_fang, spider=spider) community_name = self.extract_community_name( content_dict=content_dict) spider.logger.info( f"requesting Gaode using community name {community_name}" ) if 0 < len(community_name): result_dict = {} params = { "key": account, "address": str( CommonScrapyPipelineClass.clean_addr( text=community_name)), "city": city_name_amap, } try: # 20190621发现爬取佛山的时候因为DNS解析失败而丢失了14条记录。这里增加代码,记录再次丢失。 # socket.gaierror: [Errno -3] Temporary failure in name resolution response = requests.get(self.base_gaode_url, headers=self.headers, params=params) if 200 == response.status_code: if self.save_every_response is not None and self.save_every_response: self.save_reponsed_json_file( apt_id=content_dict["apt_id"], response=response.text, spider=spider) result_dict = CommonScrapyPipelineClass.parse_gaode_json( json_text=response.text) if 0 < (result_dict["count"]): content_dict["longitude"] = result_dict[ "longitude"] content_dict["latitude"] = result_dict[ "latitude"] content_dict["adcode"] = result_dict[ "adcode"] except Exception as ex: spider.logger.error( f"requests or other errors. Exception = {ex}") keys = [] items = [] for key, value in content_dict.items(): keys.append(key) items.append(value) key_list = keys + key_list1 item_list = items + item_list1 CommonScrapyPipelineClass.append_row( spider_obj=spider, key_list=key_list, item_list=item_list, csv_file_path_str=self.csv_file_path) if self.to_kafka and socket.gethostname( ) in self.cluster_servers_for_spiders: CommonScrapyPipelineClass.pipeline_to_kafka( spider_obj=spider, key_list=key_list, item_list=item_list, kafka_topic_str=self.kafka_topic, kafka_producer_obj=self.kafka_producer) elif "detailed" == page_type: error_msg = f"no content in key_list1 ({key_list1})" spider.logger.error( f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}" ) return item