def init_self_attributes(self, spider):
    """Lazily populate this pipeline's attributes from the spider's settings.

    Every attribute is only assigned when it is still missing (``None`` or
    empty), so values injected before this call take precedence.  Uses
    truthiness checks instead of ``len(...)`` so a ``None`` attribute no
    longer raises ``TypeError``.

    Args:
        spider: the running Scrapy spider whose ``settings`` are consulted.
    """
    if not self.root_path:
        self.root_path = spider.settings.get("PROJECT_PATH", default=None)

    # Allow an explicit OVERWRITE_TODAY stamp; fall back to today's date.
    if not self.overwrite_today:
        self.overwrite_today = spider.settings.get("OVERWRITE_TODAY", default="")
    if not self.overwrite_today:
        self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

    # Set all filenames, file paths, and directories.
    if not self.crawled_dir:
        self.crawled_dir = spider.settings.get(name="CRAWLED_DIR", default="")
    if not self.detail_html_dir:
        self.detail_html_dir = spider.settings.get(name="SAVED_DETAIL_HTML", default="")
    if not self.csv_file_path:
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"{spider.name}_{self.overwrite_today}.csv"
        )

    # Kafka wiring.  ``to_kafka`` keeps an explicit ``is None`` check because
    # ``False`` is a meaningful configured value.
    if self.to_kafka is None:
        self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA", default=False)
    if not self.kafka_topic:
        self.kafka_topic = getattr(spider, "name", "")
    if not self.cluster_servers_for_spiders:
        self.cluster_servers_for_spiders = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_SPIDERS", default=[]
        )
    if not self.cluster_servers_for_kafka:
        self.cluster_servers_for_kafka = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_KAFKA", default=[]
        )

    # Only hosts designated as spider servers initialize a Kafka producer.
    if socket.gethostname() in self.cluster_servers_for_spiders:
        self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
            kafka_producer=self.kafka_producer,
            kafka_servers=self.cluster_servers_for_kafka,
            spider_obj=spider,
        )
def init_self_attributes(self, spider):
    """Lazily populate this pipeline's attributes from the spider's settings.

    Each attribute is only assigned when it is still missing (``None`` or
    empty).  Truthiness checks replace the original ``1 > len(...)`` guards
    so a ``None`` attribute no longer raises ``TypeError``.

    Args:
        spider: the running Scrapy spider whose ``settings`` are consulted.
    """
    today = datetime.datetime.now().strftime("%Y%m%d")

    # Filenames, file paths, and directories.
    if not self.crawled_dir:
        self.crawled_dir = spider.settings.get(name="CRAWLED_DIR", default="")
    if not self.csv_file_path:
        self.csv_file_path = os.path.join(self.crawled_dir, f"{spider.name}{today}.csv")

    # Kafka wiring.  ``to_kafka`` keeps an explicit ``is None`` check because
    # ``False`` is a meaningful configured value.
    if self.to_kafka is None:
        self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA", default=False)
    if not self.kafka_topic:
        self.kafka_topic = spider.name
    if not self.cluster_servers_for_spiders:
        self.cluster_servers_for_spiders = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_SPIDERS", default=[]
        )
    if not self.cluster_servers_for_kafka:
        self.cluster_servers_for_kafka = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_KAFKA", default=[]
        )

    # Only hosts designated as spider servers initialize a Kafka producer.
    if socket.gethostname() in self.cluster_servers_for_spiders:
        self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
            kafka_producer=self.kafka_producer,
            kafka_servers=self.cluster_servers_for_kafka,
            spider_obj=spider,
        )
def init_self_attributes(self, spider):
    """Lazily populate this fang_esf pipeline's attributes from the spider's settings.

    Each attribute is only assigned when it is still missing (``None`` or
    empty), so values injected before this call take precedence.  Truthiness
    checks replace the original ``1 > len(...)`` guards so a ``None``
    attribute no longer raises ``TypeError``.

    Args:
        spider: the running Scrapy spider whose ``settings`` are consulted.
    """
    if not self.root_path:
        self.root_path = spider.settings.get("PROJECT_PATH", default=None)

    # Allow an explicit OVERWRITE_TODAY stamp; fall back to today's date.
    if not self.overwrite_today:
        self.overwrite_today = spider.settings.get("OVERWRITE_TODAY", default="")
    if not self.overwrite_today:
        self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

    # Gaode (AMap) geocoding configuration.
    if not self.key_list:
        self.key_list = spider.settings.get("AMAP_KEYS", default=[])
    if not self.city_name_dict:
        self.city_name_dict = spider.settings.get("CITY_NAME_DICT", default={})
    if not self.city_list:
        self.city_list = spider.settings.get("CITY_LIST", default=[])
    if not self.district_list:
        self.district_list = spider.settings.get("DISTRICT_LIST", default=[])
    if not self.city_name_for_districts:
        self.city_name_for_districts = spider.settings.get(
            "CITY_NAME_FOR_DISTRICTS", default=""
        )
    # ``save_every_response`` keeps an explicit ``is None`` check because
    # ``False`` is a meaningful configured value.
    if self.save_every_response is None:
        self.save_every_response = spider.settings.get(
            "SAVE_EVERY_RESPONSE", default=False
        )
    if not self.headers:
        self.headers = spider.settings.get("DEFAULT_REQUEST_HEADERS", default={})

    # Filenames, file paths, and directories.
    if not self.crawled_dir:
        self.crawled_dir = spider.settings.get(name="CRAWLED_DIR", default="")
    if not self.detail_html_dir:
        self.detail_html_dir = spider.settings.get(name="SAVED_DETAIL_HTML", default="")
    # NOTE: the "SAVED_GAODE_JASON" spelling matches the project settings key.
    if not self.gaode_json_dir:
        self.gaode_json_dir = spider.settings.get(name="SAVED_GAODE_JASON", default="")
    if not self.csv_file_path:
        self.csv_file_path = os.path.join(
            self.crawled_dir, f"fang_esf{self.overwrite_today}.csv"
        )

    # Kafka wiring.
    if self.to_kafka is None:
        self.to_kafka = spider.settings.get(name="PIPELINE_TO_KAFKA", default=False)
    if not self.cluster_servers_for_spiders:
        self.cluster_servers_for_spiders = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_SPIDERS", default=[]
        )
    if not self.cluster_servers_for_kafka:
        self.cluster_servers_for_kafka = spider.settings.get(
            name="CLUSTER_SERVERS_FOR_KAFKA", default=[]
        )

    # Only hosts designated as spider servers initialize a Kafka producer.
    if socket.gethostname() in self.cluster_servers_for_spiders:
        self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
            kafka_producer=self.kafka_producer,
            kafka_servers=self.cluster_servers_for_kafka,
            spider_obj=spider,
        )