Example 1
    def proxy_ip_pool(self):
        """
        Xunlian (proxy provider) error code 10000: fetching too frequently; request a new proxy IP at most once every 5 seconds.
        """
        if "DRAGONFLY" == self.proxy_agent:
            return CommonClass.get_proxies(proxy_dict={})
        now = time.time()
        need_new_proxy = False
        if self.proxy_ip_dict is None or 1 > len(self.proxy_ip_dict):
            need_new_proxy = True
        elif "expire" not in self.proxy_ip_dict.keys():
            need_new_proxy = True
        elif now + 3 > self.proxy_ip_dict["expire"]:
            need_new_proxy = True
        if need_new_proxy:
            proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                headers={},
                params_for_proxy_ip={},
                setup_xunlian_dict={},
                need_setup_xunlian=False,
                logger=self.logger)
            if 1 > len(proxies_dict):
                return self.proxy_ip_dict  # still return the old ip dict or {}
            proxies_dict["expire"] = now + random.randint(
                self.min_proxy_ip_life_time,
                self.max_proxy_ip_life_time)  # set ip life time
            self.proxy_ip_dict = proxies_dict
        return self.proxy_ip_dict
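The method above never builds the proxy dict itself; it only caches whatever CommonClass.get_proxies or ProxyAgent.get_xunlian_proxy_dict returns and stamps an "expire" time on it. Judging from how the dict is consumed elsewhere (its "http"/"https" keys), a cached entry presumably looks like the sketch below; the address, port, and lifetime bounds are made-up placeholders, only the key names come from the code.

    import random
    import time

    # Hypothetical shape of self.proxy_ip_dict after a refresh (values are placeholders).
    proxy_ip_dict = {
        "http": "http://123.45.67.89:4216",
        "https": "https://123.45.67.89:4216",
    }
    # proxy_ip_pool() adds an expiry timestamp so the next call knows when to fetch a new IP.
    proxy_ip_dict["expire"] = time.time() + random.randint(30, 60)  # assumed lifetime bounds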
Example 2
 def process_request(self, request, spider):
     if self.proxy_meta is None or not isinstance(
             self.proxy_meta, dict) or 1 > len(self.proxy_meta):
         self.proxy_meta = CommonClass.get_proxies(proxy_dict={})
     if request.url.startswith("http://"):
         request.meta['proxy'] = self.proxy_meta['http']
     elif request.url.startswith("https://"):
         request.meta['proxy'] = self.proxy_meta['https']
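For this process_request hook to take effect, the downloader middleware has to be enabled in the Scrapy project settings. A minimal sketch, assuming the class is exposed as myproject.middlewares.ProxyDownloaderMiddleware (module path and priority are placeholders, not taken from the source):

    # settings.py (sketch)
    DOWNLOADER_MIDDLEWARES = {
        "myproject.middlewares.ProxyDownloaderMiddleware": 543,  # hypothetical path and priority
    }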
Example 3
	def start_requests(self):
		self.init_self_attributes()
		self.make_dirs()

		if "READ_HTML" == self.run_purpose: # READ_HTML is one kind of debug
			url = 'http://quotes.toscrape.com/page/1/'
			yield scrapy.Request( url = url, callback = self.read_and_parse )
		elif "PRODUCTION_RUN" == self.run_purpose:
			urls = [
				# "http://www.cnemc.cn/sssj/", # China National Environmental Monitoring Centre, real-time data page
				self.base_url,
			]
			meta_dict = {}
			if self.use_proxy:
				proxies_dict = self.proxy_ip_pool()
				if 1 > len( proxies_dict):
					sys.exit(3)
				meta_dict["proxy"] = proxies_dict["http"]
			
			formdata_dict = {} # no form fields need to be POSTed to the target site
			for url in urls:
				# yield scrapy.RequestForm( url = url, callback = self.parse_json, meta = meta_dict, dont_filter = True )
				# yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
				self.last_request_time = time.time()
				yield scrapy.FormRequest( url = url, formdata = formdata_dict, callback = self.parse_json, meta = meta_dict, dont_filter = True )
		elif "CHECK_PROXY_IP" == self.run_purpose:
			now = int(time.time())
			token = f"Guangzhou{str(now)}"
			m = hashlib.md5()  
			m.update( token.encode(encoding = 'utf-8') )
			urls = [
				f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
			]
			
			if "DRAGONFLY" == self.proxy_agent:
				proxies_dict = CommonClass.get_proxies( proxy_dict = {} )
			else:
				proxies_dict = ProxyAgent.get_xunlian_proxy_dict(headers = {}, params_for_proxy_ip={}, setup_xunlian_dict = {}, need_setup_xunlian = False, logger=self.logger )
			if 0 < len( proxies_dict):
				meta_dict = {
					"proxy": proxies_dict["http"]
				}
				for url in urls:
					yield scrapy.Request( url=url, callback=self.do_nothing_for_debug, meta = meta_dict )
			else:
				self.logger.error( f"Error! No proxy ip returns. {proxies_dict}" )
		else:
			urls = [
				"http://quotes.toscrape.com/page/1/",
				"http://quotes.toscrape.com/page/2/",
			]
			for url in urls:
				yield scrapy.Request( url=url, callback=self.do_nothing_for_debug )
Example 4
    def init_self_attributes(self):
        self.run_purpose = self.settings.get(name='RUN_PURPOSE', default=None)

        # set all paths
        self.root_path = self.settings.get('PROJECT_PATH')
        self.crawled_folder_name = self.settings.get(name='CRAWLED_DIR',
                                                     default='crawled')
        self.detail_html_folder_name = self.settings.get(
            name='SAVED_DETAIL_HTML', default='detail_html')
        self.list_html_folder_name = self.settings.get(name='SAVED_LIST_HTML',
                                                       default='list_html')
        self.svg_text_css_folder_name = self.settings.get(name='SVG_TEXT_CSS',
                                                          default='svgtextcss')
        if self.run_purpose in [
                "PARSE_FIDDLER",
                "PARSE_DETAILED_HOTEL",
        ]:
            self.detail_html_folder_name = f"{ self.detail_html_folder_name }_fiddler"
            self.list_html_folder_name = f"{ self.list_html_folder_name }_fiddler"
            self.svg_text_css_folder_name = f"{ self.svg_text_css_folder_name }_fiddler"

        # whether this run is for debugging
        self.debug = self.settings.get(name='PROJECT_DEBUG', default=False)
        self.move_fiddler_file = self.settings.get(
            name='MOVE_FIDDLER_HTML_FILE', default=True)

        # get proxy header
        temp = CommonClass.get_proxies(proxy_dict={})
        self.proxy_meta = temp['http']

        self.database_city_district_table = self.settings.get(
            name='DATABASE_CITY_DISTRICT_TABLE', default={})
        self.database_level2name_table = self.settings.get(
            name='DATABASE_LEVEL2NAME_TABLE', default={})
        self.database_merchant_star_level_table = self.settings.get(
            name='DATABASE_MERCHANT_STAR_LEVEL_TABLE', default={})
        self.database_anticrawl20190505_table = self.settings.get(
            name='DATABASE_ANTICRAWL20190505_TABLE', default={})
        self.database_common_channel_list_table = self.settings.get(
            name='DATABASE_COMMON_CHANNEL_LIST_TABLE', default=[])
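Every attribute above is read from the Scrapy project settings, so those keys need to exist in settings.py. A minimal sketch with the key names taken from the code and made-up placeholder values:

    # settings.py (sketch; all values are placeholders)
    RUN_PURPOSE = "PRODUCTION_RUN"      # other values seen in these spiders: "READ_HTML", "CHECK_PROXY_IP", "PARSE_FIDDLER"
    PROJECT_PATH = "/path/to/project"
    CRAWLED_DIR = "crawled"
    SAVED_DETAIL_HTML = "detail_html"
    SAVED_LIST_HTML = "list_html"
    SVG_TEXT_CSS = "svgtextcss"
    PROJECT_DEBUG = False
    MOVE_FIDDLER_HTML_FILE = True
    DATABASE_CITY_DISTRICT_TABLE = {}
    DATABASE_LEVEL2NAME_TABLE = {}
    DATABASE_MERCHANT_STAR_LEVEL_TABLE = {}
    DATABASE_ANTICRAWL20190505_TABLE = {}
    DATABASE_COMMON_CHANNEL_LIST_TABLE = []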
Example 5
	def __init__(self, root_path="", css_file="", css_string = "", send_requests=False, referer=None, save_requested_svg=True, csv_file=None, settings = None, folder="", logger=None):
		# read fiddler
		self.settings = settings
		temp = self.settings.get( name = "RUN_PURPOSE", default = None ) if self.settings is not None else None
		self.read_fiddler = False
		if "PARSE_FIDDLER" == temp:
			self.read_fiddler = True

		self.root_path = os.getcwd() if root_path is None or 1 > len( root_path) else root_path
		self.folder = "list_html" if folder is None or 1 > len( folder ) else folder
		self.spider_name = self.settings.get( "SPIDER_NAME" ) if self.settings is not None else ""
		self.svg_css_folder_name = self.settings.get( "SVG_TEXT_CSS" ) if self.settings is not None else ""
		if self.read_fiddler:
			self.svg_css_folder_name = f"{ self.svg_css_folder_name }_fiddler"

		self.css_file = "" if css_file is None or 1 > len( css_file) else css_file
		if self.css_file is not None and 0 < len( self.css_file ):
			self.css_file_path = os.path.join( self.root_path, self.spider_name, self.svg_css_folder_name, self.css_file )
		self.css_string = "" if css_string is None or 1 > len( css_string ) else css_string
		self.send_requests = False if send_requests is None else send_requests
		self.referer = None if referer is None or 1 > len( referer ) else referer
		self.save_requested_svg = True if save_requested_svg is None else save_requested_svg
		self.csv_file = "" if csv_file is None or 1 > len( csv_file ) else csv_file
		self.logger = None if logger is None else logger
		if self.logger is None:
			print( "please pass the logger!" )
			sys.exit(2)
		self.use_proxy = True if self.settings is not None and self.settings.get( "HTTPPROXY_ENABLED" ) else False
		proxy_dict = CommonClass.get_proxies( proxy_dict = {} )
		self.proxies = proxy_dict['http']

		self.svg_files = {}
		self.svg_urls = {}
		self.svg_file_dict = {}
		self.svg_file_contents = {}
		self.payload = {}
		self.class_mapping = {}
		self.class_mapping_updated = False
		self.key_length = 0
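The snippet does not show which class this __init__ belongs to; assuming a stand-in name such as SvgCssParser, instantiating it from inside a spider would look roughly like this (the class name and argument values are hypothetical; the keyword arguments mirror the signature above):

    # sketch only; SvgCssParser is a placeholder for the real class name
    parser = SvgCssParser(
        root_path="/path/to/project",   # defaults to os.getcwd() when empty
        css_file="page_style.css",      # expected under <root>/<spider_name>/<svg_css_folder>/
        css_string="",
        send_requests=False,
        referer=None,
        save_requested_svg=True,
        csv_file=None,
        settings=self.settings,         # a Scrapy Settings object
        folder="list_html",
        logger=self.logger,             # required: the constructor exits without a logger
    )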
Example 6
    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            urls = [
                # Guangzhou
                "https://land.3fang.com/market/440100__1______1_1_1.html",  # residential land: 26 pages
                "https://land.3fang.com/market/440100__2______1_1_1.html",  # commercial/office land: 17 pages
                "https://land.3fang.com/market/440100__3_2__0_100000__1_1_1.html",  # industrial land, sold, under 100,000 sq.m: 32 pages
                "https://land.3fang.com/market/440100__3_2__100000_500000__1_1_1.html",  # industrial land, sold, 100,000-500,000 sq.m: 4 pages
                "https://land.3fang.com/market/440100__3_2__500000_100000000__1_1_1.html",  # industrial land, sold, over 500,000 sq.m: 1 page
                "https://land.3fang.com/market/440100__3_1_____1_1_1.html",  # industrial land, not yet sold: 1 page
                "https://land.3fang.com/market/440100__3_3_____1_1_1.html",  # industrial land, passed in at auction: 7 pages
                "https://land.3fang.com/market/440100__4______1_1_1.html",  # other land: 4 pages

                # Foshan
                "https://land.3fang.com/market/440600__1_1_____1_1_1.html",  # residential land, not yet sold: 8 pages
                "https://land.3fang.com/market/440600__1_2__0_5000__1_1_1.html",  # residential land, sold, under 5,000 sq.m: 33 pages
                "https://land.3fang.com/market/440600__1_2__5000_100000__1_1_1.html",  # residential land, sold, 5,000-100,000 sq.m: 29 pages
                "https://land.3fang.com/market/440600__1_2__100000_100000000__1_1_1.html",  # residential land, sold, over 100,000 sq.m: 6 pages
                "https://land.3fang.com/market/440600__1_3_____1_1_1.html",  # residential land, passed in at auction: 3 pages
                "https://land.3fang.com/market/440600__2______1_1_1.html",  # commercial land: 19 pages
                "https://land.3fang.com/market/440600__3_1_____1_1_1.html",  # industrial land, not yet sold: 6 pages
                "https://land.3fang.com/market/440600__3_2__0_40000__1_1_1.html",  # industrial land, sold, under 40,000 sq.m: 32 pages
                "https://land.3fang.com/market/440600__3_2__40000_100000000__1_1_1.html",  # industrial land, sold, over 40,000 sq.m: 12 pages
                "https://land.3fang.com/market/440600__3_3_____1_1_1.html",  # industrial land, passed in at auction: 1 page
                "https://land.3fang.com/market/440600__4______1_1_1.html",  # other land: 3 pages
            ]

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            cookie_dict = dict([
                pair.split("=", 1) for pair in self.cookie_string.split("; ")
            ])
            self.cookie_dict = cookie_dict
            for url in urls:
                url_object = parse.urlparse(url)
                path_list = url_object.path.split("/")
                for one in path_list:
                    if -1 == one.find(".html"):
                        continue
                    city_name = ""
                    city_code_list = one.split("_")
                    city_code = int(
                        city_code_list[0]) if 0 < len(city_code_list) else 0
                    if 0 < city_code and str(
                            city_code) in self.city_name_dict.keys():
                        city_name = self.city_name_dict[str(city_code)]
                    if 1 > len(city_name):
                        error_msg = f"{city_code} is NOT in self.city_name_dict.keys() ({self.city_name_dict.keys()})"
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                        )
                        sys.exit(4)
                    break
                meta_dict["city"] = city_name
                # cookie_dict = self.change_cookies( cookie_dict )
                yield scrapy.Request(url=url,
                                     cookies=cookie_dict,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
                # yield scrapy.Request( url = url, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
        elif "READ_CSV_AND_REDO" == self.run_purpose:
            english_city_name = {
                "佛山": "foshan",
                "广州": "guangzhou",
            }
            filename = "tudi_201808.csv"
            csv_file_path = os.path.join(self.crawled_dir, filename)
            url_list = []
            city_list = []
            try:
                with open(csv_file_path, newline="",
                          encoding="utf-8") as csvfile:
                    file_reader = csv.reader(
                        csvfile)  # , delimiter=' ', quotechar='|'
                    for row in file_reader:
                        if -1 < row[8].find("https:"):
                            url_list.append(row[8])
                            city_list.append(row[13])
            except Exception as ex:
                error_msg = f"cannot read csv file, Exception = {ex}"
                self.logger.error(
                    f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
                )

            meta_dict = {
                "page_type": "detailed",
                "total_pages": 1,
            }
            self.cookie_dict = dict([
                pair.split("=", 1) for pair in self.cookie_string.split("; ")
            ])
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                meta_dict["proxy"] = proxies_dict["http"]

            for index, url in enumerate(url_list):
                chinese_city_name = city_list[index]
                meta_dict["city"] = english_city_name[chinese_city_name]
                yield scrapy.Request(url=url,
                                     cookies=self.cookie_dict,
                                     callback=self.parse_detailed_page,
                                     meta=meta_dict,
                                     dont_filter=True)
                break
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)
Example 7
    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            if "city" == self.city_name_for_districts:
                city_list = self.city_list
            else:
                city_list = self.district_list
            number_day_of_this_year = datetime.datetime.now().timetuple().tm_yday  # type == int
            seperate_into_days = self.settings.get("CRAWL_BATCHES", default=3)
            if seperate_into_days > len(city_list):
                seperate_into_days = len(city_list)
            batch_count = math.ceil(len(city_list) / seperate_into_days)
            today_batch = number_day_of_this_year % seperate_into_days
            start_index = today_batch * batch_count - 1
            end_index = (today_batch + 1) * batch_count
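            # Worked example of the batching above (illustrative numbers, not from the source):
            # with len(city_list) == 10 and seperate_into_days == 3, batch_count == ceil(10/3) == 4;
            # on a day where today_batch == 1, start_index == 3 and end_index == 8, so the filter
            # below keeps cities with index 4..7 (each batch is revisited roughly every 3 days).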
            urls = []
            for index, city in enumerate(city_list):
                if (start_index < index) and (index < end_index):
                    url = f"https://{city}.esf.fang.com/" if "city" == self.city_name_for_districts else f"https://{self.city_name_for_districts}.esf.fang.com/house-{city}/"
                    urls.append(url)

            meta_dict = {
                "page_type": "index",
                "total_pages": 0,
                "index_level": 0,
            }
            if "city" != self.city_name_for_districts:
                meta_dict["index_level"] = 1

            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0],
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)
Example 8
    def start_requests(self):
        self.init_self_attributes()
        self.make_dirs()
        self.read_crawled_urls()

        if "READ_HTML" == self.run_purpose:  # READ_HTML is one kind of debug
            url = 'http://quotes.toscrape.com/page/1/'
            yield scrapy.Request(url=url, callback=self.read_and_parse)
        elif "PRODUCTION_RUN" == self.run_purpose:
            urls = [
                # only Guangzhou has the "Yangguang Jiayuan" housing transaction portal
                "http://zfcj.gz.gov.cn/data/Laho/ProjectSearch.aspx",
            ]

            meta_dict = {
                "page_type": "index",
                "page": 1,
                "total_pages": 468,
            }
            if self.use_proxy:
                proxies_dict = self.proxy_ip_pool()
                if 1 > len(proxies_dict):
                    sys.exit(3)
                meta_dict["proxy"] = proxies_dict["http"]

            for url in urls:
                # yield scrapy.Request( url = url, cookies=cookie_dict, callback = self.parse_list_page, meta = meta_dict, dont_filter = True )
                yield scrapy.Request(url=url,
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "GET_CHANNELS" == self.run_purpose:  # GET_CHANNELS is one kind of debug
            urls = []
            city_list = self.settings.get("CITY_LIST", default=[])
            for index, city in enumerate(city_list):
                urls.append(f"https://{city}.esf.fang.com/")
            if 0 < len(urls):
                meta_dict = {
                    "page_type": "index",
                    "total_pages": 0,
                    "index_level": 0,
                }
                yield scrapy.Request(url=urls[0],
                                     callback=self.parse_list_page,
                                     meta=meta_dict,
                                     dont_filter=True)
        elif "CHECK_PROXY_IP" == self.run_purpose:
            now = int(time.time())
            token = f"Guangzhou{str(now)}"
            m = hashlib.md5()
            m.update(token.encode(encoding='utf-8'))
            urls = [
                f"https://www.coursehelper.site/index/index/getHeaders?token={m.hexdigest()}",
            ]

            if "DRAGONFLY" == self.proxy_agent:
                proxies_dict = CommonClass.get_proxies(proxy_dict={})
            else:
                proxies_dict = ProxyAgent.get_xunlian_proxy_dict(
                    headers={},
                    params_for_proxy_ip={},
                    setup_xunlian_dict={},
                    need_setup_xunlian=False,
                    logger=self.logger)
            if 0 < len(proxies_dict):
                meta_dict = {"proxy": proxies_dict["http"]}
                for url in urls:
                    yield scrapy.Request(url=url,
                                         callback=self.do_nothing_for_debug,
                                         meta=meta_dict)
            else:
                self.logger.error(
                    f"Error! No proxy ip returns. {proxies_dict}")
        else:
            urls = [
                "http://quotes.toscrape.com/page/1/",
                "http://quotes.toscrape.com/page/2/",
            ]
            for url in urls:
                yield scrapy.Request(url=url,
                                     callback=self.do_nothing_for_debug)