Example 1
    def init_self_attributes(self, spider):
        # Lazily pull pipeline attributes from the spider's settings, keeping
        # any value that was already set on the instance.
        if self.root_path is None or len(self.root_path) < 1:
            self.root_path = spider.settings.get("PROJECT_PATH", default=None)
        if self.overwrite_today is None or len(self.overwrite_today) < 1:
            self.overwrite_today = spider.settings.get("OVERWRITE_TODAY", default="")
        if len(self.overwrite_today) < 1:
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # Set all filenames, file paths, and directories.
        if len(self.crawled_dir) < 1:
            self.crawled_dir = spider.settings.get("CRAWLED_DIR", default="")
        if len(self.detail_html_dir) < 1:
            self.detail_html_dir = spider.settings.get("SAVED_DETAIL_HTML", default="")
        if self.csv_file_path is None or len(self.csv_file_path) < 1:
            self.csv_file_path = os.path.join(
                self.crawled_dir, f"{spider.name}_{self.overwrite_today}.csv")

        # Kafka-related settings.
        if self.to_kafka is None:
            self.to_kafka = spider.settings.get("PIPELINE_TO_KAFKA", default=False)
        if len(self.kafka_topic) < 1:
            self.kafka_topic = spider.name if hasattr(spider, "name") else ""
        if (self.cluster_servers_for_spiders is None
                or len(self.cluster_servers_for_spiders) < 1):
            self.cluster_servers_for_spiders = spider.settings.get(
                "CLUSTER_SERVERS_FOR_SPIDERS", default=[])
        if (self.cluster_servers_for_kafka is None
                or len(self.cluster_servers_for_kafka) < 1):
            self.cluster_servers_for_kafka = spider.settings.get(
                "CLUSTER_SERVERS_FOR_KAFKA", default=[])
        # Only hosts designated as spider servers create a Kafka producer.
        if socket.gethostname() in self.cluster_servers_for_spiders:
            self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
                kafka_producer=self.kafka_producer,
                kafka_servers=self.cluster_servers_for_kafka,
                spider_obj=spider)
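For context, an initializer like this is normally invoked from Scrapy's open_spider pipeline hook, which runs once when the spider starts and settings are fully resolved. A minimal sketch of that wiring follows; the attribute defaults in __init__ are assumptions for illustration, since the original class bodies are not shown in these examples:

    import datetime
    import os
    import socket


    class ExamplePipeline:
        def __init__(self):
            # Assumed defaults; the real class definition is not shown above.
            self.root_path = None
            self.overwrite_today = ""
            self.crawled_dir = ""
            self.detail_html_dir = ""
            self.csv_file_path = None
            self.to_kafka = None
            self.kafka_topic = ""
            self.cluster_servers_for_spiders = None
            self.cluster_servers_for_kafka = None
            self.kafka_producer = None

        def open_spider(self, spider):
            # Scrapy calls this once when the spider opens, before any items
            # reach the pipeline.
            self.init_self_attributes(spider)  # the method shown in Example 1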
Example 2
    def init_self_attributes(self, spider):
        # Same lazy-initialization pattern as Example 1, but the date suffix
        # is computed locally instead of being cached on the instance.
        today = datetime.datetime.now().strftime("%Y%m%d")
        if len(self.crawled_dir) < 1:
            self.crawled_dir = spider.settings.get("CRAWLED_DIR", default="")
        if self.csv_file_path is None or len(self.csv_file_path) < 1:
            self.csv_file_path = os.path.join(self.crawled_dir,
                                              f"{spider.name}{today}.csv")
        if self.to_kafka is None:
            self.to_kafka = spider.settings.get("PIPELINE_TO_KAFKA", default=False)
        if len(self.kafka_topic) < 1:
            self.kafka_topic = spider.name
        if (self.cluster_servers_for_spiders is None
                or len(self.cluster_servers_for_spiders) < 1):
            self.cluster_servers_for_spiders = spider.settings.get(
                "CLUSTER_SERVERS_FOR_SPIDERS", default=[])
        if (self.cluster_servers_for_kafka is None
                or len(self.cluster_servers_for_kafka) < 1):
            self.cluster_servers_for_kafka = spider.settings.get(
                "CLUSTER_SERVERS_FOR_KAFKA", default=[])
        # Only hosts designated as spider servers create a Kafka producer.
        if socket.gethostname() in self.cluster_servers_for_spiders:
            self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
                kafka_producer=self.kafka_producer,
                kafka_servers=self.cluster_servers_for_kafka,
                spider_obj=spider)
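Each of these examples gates producer creation on a hostname check and delegates to CommonScrapyPipelineClass.initialize_kafka, whose implementation is not shown. A minimal sketch of what such a helper might look like using the kafka-python package; the signature mirrors the call sites above, and everything else is an assumption:

    import json

    from kafka import KafkaProducer


    class CommonScrapyPipelineClass:
        @staticmethod
        def initialize_kafka(kafka_producer=None, kafka_servers=None, spider_obj=None):
            # Reuse an existing producer rather than opening a second connection.
            if kafka_producer is not None:
                return kafka_producer
            try:
                return KafkaProducer(
                    bootstrap_servers=kafka_servers or [],
                    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
                )
            except Exception as ex:
                if spider_obj is not None:
                    spider_obj.logger.error(f"Kafka producer initialization failed: {ex}")
                return None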
Example 3
    def init_self_attributes(self, spider):
        # Lazily pull pipeline attributes from the spider's settings, keeping
        # any value that was already set on the instance.
        if self.root_path is None or len(self.root_path) < 1:
            self.root_path = spider.settings.get("PROJECT_PATH", default=None)
        if self.overwrite_today is None or len(self.overwrite_today) < 1:
            self.overwrite_today = spider.settings.get("OVERWRITE_TODAY",
                                                       default="")
        if len(self.overwrite_today) < 1:
            self.overwrite_today = datetime.datetime.now().strftime("%Y%m%d")

        # Gaode (Amap) geocoding information.
        if len(self.key_list) < 1:
            self.key_list = spider.settings.get("AMAP_KEYS", default=[])
        if len(self.city_name_dict) < 1:
            self.city_name_dict = spider.settings.get("CITY_NAME_DICT",
                                                      default={})
        if len(self.city_list) < 1:
            self.city_list = spider.settings.get("CITY_LIST", default=[])
        if len(self.district_list) < 1:
            self.district_list = spider.settings.get("DISTRICT_LIST",
                                                     default=[])
        if len(self.city_name_for_districts) < 1:
            self.city_name_for_districts = spider.settings.get(
                "CITY_NAME_FOR_DISTRICTS", default="")
        if self.save_every_response is None:
            self.save_every_response = spider.settings.get(
                "SAVE_EVERY_RESPONSE", default=False)
        if len(self.headers) < 1:
            self.headers = spider.settings.get("DEFAULT_REQUEST_HEADERS",
                                               default={})

        # Set all filenames, file paths, and directories.
        if len(self.crawled_dir) < 1:
            self.crawled_dir = spider.settings.get("CRAWLED_DIR", default="")
        if len(self.detail_html_dir) < 1:
            self.detail_html_dir = spider.settings.get("SAVED_DETAIL_HTML",
                                                       default="")
        # Note: the setting key is spelled "JASON" in the project settings;
        # it is kept as-is so the lookup still matches.
        if len(self.gaode_json_dir) < 1:
            self.gaode_json_dir = spider.settings.get("SAVED_GAODE_JASON",
                                                      default="")
        if self.csv_file_path is None or len(self.csv_file_path) < 1:
            self.csv_file_path = os.path.join(
                self.crawled_dir, f"fang_esf{self.overwrite_today}.csv")

        # Kafka-related settings.
        if self.to_kafka is None:
            self.to_kafka = spider.settings.get("PIPELINE_TO_KAFKA",
                                                default=False)
        if (self.cluster_servers_for_spiders is None
                or len(self.cluster_servers_for_spiders) < 1):
            self.cluster_servers_for_spiders = spider.settings.get(
                "CLUSTER_SERVERS_FOR_SPIDERS", default=[])
        if (self.cluster_servers_for_kafka is None
                or len(self.cluster_servers_for_kafka) < 1):
            self.cluster_servers_for_kafka = spider.settings.get(
                "CLUSTER_SERVERS_FOR_KAFKA", default=[])
        # Only hosts designated as spider servers create a Kafka producer.
        if socket.gethostname() in self.cluster_servers_for_spiders:
            self.kafka_producer = CommonScrapyPipelineClass.initialize_kafka(
                kafka_producer=self.kafka_producer,
                kafka_servers=self.cluster_servers_for_kafka,
                spider_obj=spider)
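For reference, every key these initializers read would live in the Scrapy project's settings.py. A sketch of those settings with placeholder values (paths, hostnames, and flags are illustrative, not taken from the source):

    # settings.py -- placeholder values for illustration only
    PROJECT_PATH = "/srv/projects/fang"
    CRAWLED_DIR = "/srv/projects/fang/crawled"
    SAVED_DETAIL_HTML = "/srv/projects/fang/detail_html"
    SAVED_GAODE_JASON = "/srv/projects/fang/gaode_json"  # key spelling as read in Example 3
    OVERWRITE_TODAY = ""  # empty string means "use today's date"
    PIPELINE_TO_KAFKA = True
    CLUSTER_SERVERS_FOR_SPIDERS = ["spider-host-1"]
    CLUSTER_SERVERS_FOR_KAFKA = ["kafka-host-1:9092"]
    # Gaode/Amap lookup data used by Example 3
    AMAP_KEYS = []
    CITY_NAME_DICT = {}
    CITY_LIST = []
    DISTRICT_LIST = []
    CITY_NAME_FOR_DISTRICTS = ""
    SAVE_EVERY_RESPONSE = False
    DEFAULT_REQUEST_HEADERS = {}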