Esempio n. 1
0
    def generate_filename_from_url(self, url="", file_type=""):
        """Build storage names for a crawled page.

        Returns a 4-tuple (response_html, folder, filename, filename_base)
        derived from the url's trailing path fragments and today's date;
        falls back to a random "unknownNNNNNN_<date>.html" name when the
        url does not match the expected patterns.
        """
        date_stamp = datetime.datetime.now().strftime('%Y%m%d')
        folder = self.list_html_folder_name
        response_html, filename, filename_base = "", "", ""

        # Drop empty fragments produced by consecutive or trailing slashes.
        fragments = [part for part in url.split("/") if part != '']
        # Examples:
        # http://www.dianping.com/chenzhou/ch10/g113
        # http://www.dianping.com/shop/72457872
        # http://www.dianping.com/shop/8910906/review_all/p624
        if file_type == "list2":
            if len(fragments) > 3:
                filename_base = f"{fragments[-3]}_{fragments[-2]}_{fragments[-1]}"
                response_html = f"{filename_base}_{date_stamp}.html"
                filename = response_html
        elif file_type == "detailed":
            folder = self.detail_html_folder_name
            if len(fragments) > 3 and fragments[-2] == "review_all":
                # .../shop/<id>/review_all/p<N>
                shop_id = CommonClass.find_digits_from_str(fragments[-3])
                filename_base = f"shop_{shop_id}_{fragments[-1]}"
                response_html = f"{filename_base}_{date_stamp}.html"
                filename = response_html
            elif len(fragments) > 2:
                # .../shop/<id> — first review page
                shop_id = CommonClass.find_digits_from_str(fragments[-1])
                filename_base = f"shop_{shop_id}_p1"
                response_html = f"{filename_base}_{date_stamp}.html"
                filename = response_html
        elif file_type == "css":
            # http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/a59454e0c1813952099c1e006c298195.css
            folder = self.svg_text_css_folder_name
            if len(fragments) > 1 and fragments[-1].endswith(".css"):
                filename_base = fragments[-1].replace(".css", "")
                response_html = fragments[-1]
                filename = response_html

        if response_html is None or len(response_html) < 1:
            rand_int = random.randint(100000, 999999)
            response_html = f"unknown{rand_int}_{date_stamp}.html"
            self.logger.error(
                f"File {response_html} is used to store html page crawled from {url}"
            )

        return response_html, folder, filename, filename_base
Esempio n. 2
0
 def get_page_from_url(self, url=""):
     """Return the page number encoded in a ``pn<digits>`` path segment of ``url``.

     Returns 0 when no such segment exists; when several segments start
     with "pn", the last one wins.
     """
     result = 0
     parsed = parse.urlparse(url)
     if hasattr(parsed, "path"):
         for segment in parsed.path.split("/"):
             if segment.startswith("pn"):
                 result = CommonClass.find_digits_from_str(
                     string=segment, return_all=False)
     return int(result)
Esempio n. 3
0
	def find_more_house_ids(self, doc = ""):
		"""Scan ``doc`` for every "data-hid" marker and collect the digits found after each one.

		For each occurrence, the 15 characters starting 10 positions past the
		marker (i.e. just after 'data-hid="') are passed through
		CommonClass.find_digits_from_str and appended to the result.

		Returns:
			list: one extracted id per "data-hid" occurrence; empty when none.
		"""
		house_id_list = []
		index = 0
		while True:
			index = doc.find("data-hid", index)
			if -1 == index:
				break
			# +10 skips the marker itself plus '="'; the id fits in the next 15 chars.
			sub_doc = doc[index + 10:index + 25]
			house_id_list.append(CommonClass.find_digits_from_str(sub_doc))
			index += 10
		return house_id_list
Esempio n. 4
0
    def get_page_area_district_from_url(self, url_object=None):
        """Extract (page, district, shop_area) from a parsed 58.com url.

        Recognized url shapes:
            https://fs.58.com/shangpucz/
            https://gz.58.com/shangpu/
            https://gz.58.com/tianhe/shangpucz/pn3/?area=20_50
            https://fs.58.com/foshan/shangpucz/pn2/   # foshan == Foshan outskirts, same level as Chancheng/Gaoming/Sanshui
            https://gz.58.com/shangpucz/pn3/
            https://fs.58.com/shangpu/38143746902823x.shtml

        Args:
            url_object: a urllib.parse.ParseResult (or None).

        Returns:
            tuple: (page, district, shop_area) as strings. ``page`` is "0"
            for a detail (.shtml) page, the digits of a ``pn<N>`` segment
            when present, else "1".
        """
        page = "1"
        district = ""
        shop_area = ""
        detailed_page = False
        if url_object is not None and hasattr(
                url_object,
                "netloc") and -1 < url_object.netloc.find("58.com"):
            # parse query: ?area=20_50 -> shop_area "20_50"
            if hasattr(url_object, "query") and 0 < len(url_object.query):
                query_dict = parse.parse_qs(url_object.query)
                area_values = query_dict.get("area")
                if isinstance(area_values, list) and 0 < len(area_values):
                    shop_area = area_values[0]

            # parse path
            if hasattr(url_object, "path"):
                temp_list = []
                for one in url_object.path.split("/"):
                    if 1 > len(one):
                        continue
                    # BUGFIX: test for a detail page (.shtml) BEFORE the generic
                    # district branch. Previously a segment such as
                    # "38143746902823x.shtml" contained neither "shangpu" nor
                    # "pn", so it was appended to temp_list and detailed_page
                    # was never set — contradicting the documented example url.
                    if -1 < one.find(".shtml"):
                        detailed_page = True
                    elif -1 < one.find("pn"):
                        page = CommonClass.find_digits_from_str(
                            string=one, return_all=False)
                    elif -1 == one.find("shangpu"):
                        # "shangpucz" contains "shangpu", so one check covers both.
                        temp_list.append(one)
                if not detailed_page and 1 == len(temp_list):
                    district = temp_list[0]
        if detailed_page:
            page = "0"
        return (page, district, shop_area)
Esempio n. 5
0
    def read_json_and_parse(self, response):
        """Walk previously saved json files in ``self.json_dir`` and yield one loaded item per parsable file.

        Expected file-name convention: route0___0___20190615_234522.json
        (route number ___ sequence ___ timestamp).

        Args:
            response: unused here; kept to match the scrapy callback signature.

        Yields:
            items built by self.load_items_into_loader(...).load_item().
        """
        file_list = os.listdir(self.json_dir)
        # route0___0___20190615_234522.json
        for one_file in file_list:
            temp_list = one_file.split("___")
            preset_route = 0
            now = ""
            if 2 < len(temp_list):
                preset_route = temp_list[0]
                # BUGFIX: str.lstrip("route") / str.rstrip(".json") strip any
                # run of those *characters*, not the literal prefix/suffix
                # (e.g. "20190615_234522n".rstrip(".json") would eat the
                # trailing "n"). Remove the exact prefix/suffix instead.
                if preset_route.startswith("route"):
                    preset_route = preset_route[len("route"):]
                preset_route = CommonClass.find_digits_from_str(
                    string=preset_route, return_all=False)
                preset_route = int(preset_route)
                now = temp_list[2]
                if now.endswith(".json"):
                    now = now[:-len(".json")]

                url = self.get_url_according_to_preset_route(
                    preset_route=preset_route)
                json_file_path = os.path.join(self.json_dir, one_file)
                if os.path.isfile(json_file_path):
                    try:
                        doc = None
                        with open(json_file_path, "rb") as f:
                            doc = f.read().decode("utf-8", "ignore")
                        if doc is None:
                            self.logger.error(
                                f"Error: cannot read html file {json_file_path}."
                            )
                            continue
                        text_dict = self.extract_text_dict_from_response_body(
                            body=doc, preset_route=preset_route, now=now)
                        if 0 < len(text_dict):
                            json_selector = Selector(text=doc, type=None)
                            loader = ItemLoader(item=DirectionbaiduItem(),
                                                selector=json_selector)
                            loader = self.load_items_into_loader(
                                loader=loader,
                                text=text_dict,
                                url=url,
                                now=now)
                            yield loader.load_item()
                    except Exception as ex:
                        self.logger.error(
                            f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, error happened during loading ItemLoader. Exception = {ex}"
                        )
Esempio n. 6
0
	def extract_all_detailed_html_links(self, string = ""):
		"""Extract house ids from a Tencent house-list response body.

		The body is a js assignment ``var search_result = "..."`` holding html;
		this strips the js wrapper, parses the html and collects the div
		``data-hid`` attributes. When fewer than 10 ids are found via xpath,
		falls back to self.find_more_house_ids for a raw string scan.

		Args:
			string: response body, bytes (utf-8) or str.

		Returns:
			list: extracted house ids; empty for empty input.
		"""
		house_id_list = []
		if 1 > len( string ):
			return house_id_list
		# BUGFIX: the default value (and some callers) supply str, but str has
		# no .decode() in Python 3 — only decode when given bytes.
		doc = string.decode('utf-8') if isinstance(string, bytes) else string
		end_string = '";var search_result_list_num ='
		end_pos = len( doc )
		if -1 < doc.find( end_string ):
			end_pos = doc.find( end_string )
		# Strip the leading js assignment and trailing wrapper to keep bare html.
		doc = doc[ len('var search_result = "			'):end_pos ]
		doc = '<!DOCTYPE html><html><head lang="zh-cn"><title>腾讯房产列表</title></head><body>' + f"{doc}</body></html>"
		response = Selector( text=doc, type="html" )
		house_id_list = response.xpath("//div/@data-hid").extract()
		if 10 > len( house_id_list ):
			# Too few hits — the markup is probably mangled; scan the raw text.
			house_id_list = self.find_more_house_ids( doc = doc )
		else:
			temp_list = []
			for one_id in house_id_list:
				temp_list.append( CommonClass.find_digits_from_str( one_id ) )
			house_id_list = temp_list

		return house_id_list