Esempio n. 1
0
    def extract_link_list(self, response=None):
        """Extract one record per populated row of the ``resultTableC`` table.

        Every ``<tr>`` under ``//table[@class="resultTableC"]/tbody`` is
        expected to hold exactly 7 ``<td>`` cells; the anchor text of each cell
        supplies one field. Rows whose 7 values are all empty strings are
        skipped silently, rows with any other cell count are reported through
        ``self.logger`` and skipped.

        Args:
            response: object exposing ``xpath()`` and ``url`` (presumably a
                Scrapy response -- confirm against the caller). ``None``
                yields an empty list.

        Returns:
            list of dicts, one per accepted row, holding the 7 table fields
            plus the cleaned detail-page href under ``"详情链接"``.
        """
        record_list = []
        if response is None:
            # Nothing to parse: return an empty result instead of raising
            # AttributeError on ``response.xpath``.
            return record_list

        # Built once, in this frame, so co_name is this method's name.
        log_prefix = (f"Inside Method {sys._getframe().f_code.co_name} "
                      f"of Class {self.__class__.__name__}, ")
        field_names = (
            "序号",
            "项目名称",
            "开发商",
            "预售证",
            "项目地址",
            "住宅已售套数",
            "住宅未售套数",
        )

        for one_tr in response.xpath(
                '//table[@class="resultTableC"]/tbody/tr'):
            try:
                # one_tr already is the <tr>, so the anchor sits at ./td/a
                # (the cells below are read with ./td as well). The previous
                # ./tr/td/a path only matches a <tr> nested inside the row; it
                # is kept as an alternative for backward compatibility.
                detailed_page_link = one_tr.xpath(
                    './td/a/@href | ./tr/td/a/@href').extract_first(
                        default="")
                detailed_page_link = CommonClass.clean_string(
                    string=detailed_page_link,
                    char_to_remove=[
                        '\r',
                        '\n',
                        '\t',
                        ' ',
                    ])
                value_list = [
                    one_td.xpath("./a/text()").extract_first(default="")
                    for one_td in one_tr.xpath('./td')
                ]

                if len(field_names) != len(value_list):
                    error_msg = f"value_list ({value_list}) has length other than 7"
                    self.logger.error(f"{log_prefix}{error_msg}")
                    continue

                # Skip rows in which all 7 fields are empty strings.
                if not any(
                        isinstance(one_value, str) and one_value
                        for one_value in value_list):
                    continue

                this_record = dict(zip(field_names, value_list))
                this_record["详情链接"] = detailed_page_link
                record_list.append(this_record)
            except Exception as ex:
                # Best effort per row: log the failure and go on with the
                # next row.
                error_msg = f"xpath error! Exception = {ex}"
                self.logger.error(f"{log_prefix}{error_msg}")

        if not record_list:
            error_msg = f"Fail to extract links from {response.url}"
            self.logger.error(f"{log_prefix}{error_msg}")
        return record_list
Esempio n. 2
0
    def replace_one_node_text(self,
                              node=None,
                              this_node_class_name20190505=""):
        """Return the replacement text for one selector node.

        Lookup order:
          1. If the node's serialized form is a single character whose
             ``unicode_escape`` form is ``\\uXXXX`` and the class name passed in
             ``this_node_class_name20190505`` contains ``"shopNum"``, the 4 hex
             digits are looked up in ``self.database_anticrawl20190505_table``.
          2. Otherwise the node's own ``class`` attribute is looked up in every
             group of ``self.class_mapping_dict`` whose prefix matches.
          3. If no group claimed the class, the node's serialized form is
             returned after ``CommonClass.clean_string`` removed CR, LF, tab
             and space.

        Args:
            node: selector-like object exposing ``xpath()`` and ``get()``.
            this_node_class_name20190505: class name used only for the
                ``"shopNum"`` check of step 1.

        Returns:
            str: the mapped text, or ``""`` when ``node`` is ``None`` or a
            matching group has no entry for the node's class.
        """
        if node is None:
            return ""
        css_class = node.xpath("./@class").extract_first(default="")

        # Anti-crawl scheme introduced on 20190505: a lone private-use
        # character stands in for the real text.
        serialized = node.get()
        if serialized:
            escaped = serialized.encode('unicode_escape').decode('utf-8')
            if (6 == len(escaped) and escaped.startswith('\\u')
                    and this_node_class_name20190505.find("shopNum") != -1):
                code_point = escaped[2:]
                if code_point in self.database_anticrawl20190505_table:
                    return self.database_anticrawl20190505_table[code_point]
                # Characters outside a shopNum class (e.g. ¥ ==> \uffe5) and
                # unknown code points fall through to the class lookup.

        lookup_failed = False
        for group_name, group in self.class_mapping_dict.items():
            prefix_length = group['key_length']
            known_prefixes = group['all_keys']
            prefix_matches = (prefix_length < len(css_class)
                              and css_class[:prefix_length] in known_prefixes)
            if not prefix_matches:
                continue
            mapped = group['class_mapping'].get(css_class, "")
            if 0 < len(mapped):
                return mapped
            # Remember the miss but keep scanning the remaining groups.
            lookup_failed = True
            self.logger.error(
                f"cannot find {css_class} in saved mapping class {group_name}."
            )

        if lookup_failed:
            return ""
        return CommonClass.clean_string(string=node.get(),
                                        char_to_remove=[
                                            '\r',
                                            '\n',
                                            '\t',
                                            ' ',
                                        ])
Esempio n. 3
0
    def parse_detailed_response_field(self, response=None, city=""):
        """Parse the land detail block ``//div[@id='printData1']`` into a dict.

        Args:
            response: object exposing ``xpath()``. When ``self.run_purpose`` is
                ``"READ_HTML"`` it must be a ``Selector`` instance.
            city: value stored under ``"city"`` when anything was extracted.

        Returns:
            dict: possibly containing ``"title"``, ``"land_id"``,
            ``"province_city"``, ``"土地基本信息"`` and ``"土地交易信息"`` (the last
            two map row index -> ``"___"``-joined row text), plus ``"city"``.
            Empty when ``response`` is ``None``, fails the ``READ_HTML`` type
            check, or nothing could be extracted.
        """
        text = {}
        if response is None:
            return text
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return text

        def rows_as_text(row_selectors):
            """Map row index -> cleaned, "___"-joined string value of the row."""
            flattened = {}
            for row_index, row in enumerate(row_selectors):
                cells = []
                for chunk in row.xpath("string(.)").extract():
                    cleaned = CommonClass.clean_string(string=chunk,
                                                       char_to_remove=[
                                                           '\xa0',
                                                           '\n',
                                                           '\t',
                                                           ' ',
                                                       ])
                    cells.append(cleaned.strip('\r'))
                flattened[row_index] = "___".join(cells)
            return flattened

        information_div = response.xpath("//div[@id='printData1']")

        title = information_div.xpath(
            "./div[@class='tit_box01']/text()").extract_first(default="")
        land_id = information_div.xpath(
            "./div[@class='menubox01 mt20']/span[@class='gray2']/text()"
        ).extract_first(default="")
        province_city = "___".join(
            information_div.xpath(
                "string(./div[@class='menubox01 p0515']/div[@class='fl'])"
            ).extract())

        # Only non-empty header values make it into the result.
        for field_name, field_value in (
            ("title", title),
            ("land_id", land_id),
            ("province_city", province_city),
        ):
            if field_value:
                text[field_name] = field_value

        # Section "土地基本信息": stored only when the heading text matches.
        basic_heading = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03']/text()"
        ).extract_first(default="")
        if "土地基本信息" == basic_heading:
            text[basic_heading] = rows_as_text(
                information_div.xpath(
                    "./div[@class='p1015']/div[@class='tit_box02 border03']/following-sibling::table[@class='tablebox02 mt10']/tbody/tr"
                ))

        # Section "土地交易信息": same layout, wrapped in a 'banbox' div.
        trade_heading = information_div.xpath(
            "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/text()"
        ).extract_first(default="")
        if "土地交易信息" == trade_heading:
            text[trade_heading] = rows_as_text(
                information_div.xpath(
                    "./div[@class='p1015']/div[@class='tit_box02 border03 mt20']/following-sibling::div[@class='banbox']/table[@class='tablebox02 mt10']/tbody/tr"
                ))

        # TODO (open since 20190730): the "土地评估结果" section under
        # //div[@id='divpg'] cannot be read yet and is not part of the result.

        if text:
            text["city"] = city
        return text
Esempio n. 4
0
    def parse_one_bus_route_fields(self,
                                   response=None,
                                   city_str="",
                                   route_str=""):
        """Parse one bus-route page (``//div[@id='bus_line']``) into a dict.

        Args:
            response: object exposing ``url`` and ``xpath()``.
            city_str: copied into the result under ``"city"``.
            route_str: copied into the result under ``"route_name"``.

        Returns:
            dict: with the keys ``route_title``, ``city``, ``route_name``,
            ``route_id`` (the URL path without surrounding slashes),
            ``route_uri``, ``route_district``, ``route_info``,
            ``operation_interval`` and ``bus_directions`` (direction index ->
            ``one_way_name`` / ``one_way_stop_number`` / ``stops``). An empty
            dict when ``response`` is ``None`` or any exception occurs; the
            exception is logged through ``self.logger``.
        """
        if response is None:
            return {}

        try:
            url = response.url
            bus_route_id = parse.urlparse(url).path.strip("/")

            line_root = response.xpath("//div[@id='bus_line']")
            info_box = line_root.xpath(
                "./div[@class='bus_line_information ']/div[@class='bus_i_content']"
            )

            raw_title = info_box.xpath(
                "./div[@class='bus_i_t1']/h1/text()").extract_first(default="")
            bus_route_title = CommonClass.clean_string(string=raw_title,
                                                       char_to_remove=[
                                                           ' ',
                                                           ' ',
                                                           '\xa0',
                                                           '&nbsp',
                                                       ])
            bus_route_district = info_box.xpath(
                "./div[@class='bus_i_t1']/a[@class='bus_i_t2']/text()"
            ).extract_first(default="")
            # Joining an empty list already yields "".
            bus_route_info_str = "___".join(
                info_box.xpath("./p[@class='bus_i_t4']/text()").extract())
            bus_operation_interval_str = line_root.xpath(
                "./div[@class='bus_label ']/p[@class='bus_label_t2']/text()"
            ).extract_first(default="")

            # Pass 1: one header block per direction (name + stop count).
            bus_direction_dict = {}
            for position, header in enumerate(
                    line_root.xpath("./div[@class='bus_line_top ']")):
                direction_name = "___".join(
                    header.xpath("./div/strong/text()").extract())
                stop_count = "___".join(
                    header.xpath("./span/text()").extract())
                if stop_count:
                    stop_count = CommonClass.clean_string(
                        string=stop_count,
                        char_to_remove=[
                            ' ',
                            ' ',
                            '\xa0',
                        ])
                bus_direction_dict[position] = {
                    "one_way_name": direction_name,
                    "one_way_stop_number": stop_count,
                }

            # Pass 2: stop list per direction, attached by position. A
            # direction whose sequence/name counts disagree gets no "stops".
            for position, site_block in enumerate(
                    line_root.xpath("./div[@class='bus_line_site ']")):
                sequence_numbers = site_block.xpath(
                    "./div[@class='bus_site_layer']/div/i/text()").extract()
                stop_names = site_block.xpath(
                    "./div[@class='bus_site_layer']/div/a/text()").extract()
                if len(stop_names) != len(sequence_numbers):
                    continue
                stops = [
                    f"{sequence}___{stop_name}"
                    for sequence, stop_name in zip(sequence_numbers,
                                                   stop_names)
                ]
                bus_direction_dict.setdefault(position, {})["stops"] = stops

            return {
                "route_title": bus_route_title.strip(),
                "city": city_str,
                "route_name": route_str,
                "route_id": bus_route_id.strip(),
                "route_uri": url,
                "route_district": bus_route_district.strip(),
                "route_info": bus_route_info_str.strip(),
                "operation_interval": bus_operation_interval_str.strip(),
                "bus_directions": bus_direction_dict,
            }
        except Exception as ex:
            error_msg = f"Error happened during parsing. Exception = {ex}"
            self.logger.error(
                f"Inside Method {sys._getframe().f_code.co_name} of Class {self.__class__.__name__}, {error_msg}"
            )
            return {}
Esempio n. 5
0
    def parse_detailed_response_field(self, response=None, city="", apt_id=""):
        """Parse one apartment detail page into a nested dict.

        Args:
            response: object exposing ``xpath()``. When ``self.run_purpose`` is
                ``"READ_HTML"`` it must be a ``Selector`` instance.
            city: copied into the result under ``"city"``.
            apt_id: copied into the result under ``"apt_id"``.

        Returns:
            dict: with the keys ``title``, ``price``, ``feature``,
            ``location``, ``information``, ``community``, ``city`` and
            ``apt_id``; an empty dict when ``response`` is ``None`` or fails
            the ``READ_HTML`` type check.
        """
        if response is None:
            return {}
        if "READ_HTML" == self.run_purpose and not isinstance(
                response, Selector):
            return {}

        def scrub(raw):
            # Remove nbsp / LF / tab / space through CommonClass, then trim
            # '\r' from both ends only; an interior '\r' is kept.
            return CommonClass.clean_string(string=raw,
                                            char_to_remove=[
                                                '\xa0',
                                                '\n',
                                                '\t',
                                                ' ',
                                            ]).strip('\r')

        # Title: primary location first, alternative heading as fallback.
        title = response.xpath("//div[@id='lpname']/h1/text()").extract_first(
            default="")
        if not title:
            title = response.xpath(
                "//div[@class='tab-cont clearfix']/div[@class='title rel']/h1[@class='title floatl']/text()"
            ).extract_first(default="")

        title_right_box = response.xpath("//div[@class='tab-cont-right']")
        price = "___".join(
            title_right_box.xpath(
                "./div[@class='tr-line clearfix zf_new_title']/div[@class='trl-item_top']/div[@class='rel floatl']/preceding-sibling::div"
            ).xpath("string(.)").extract())

        # Features: label -> value with CR/LF/tab/space removed.
        feature_dict = {}
        for feature in title_right_box.xpath(
                "./div[@class='tr-line clearfix']/div[contains(@class,'trl-item1')]"
        ):
            label = feature.xpath(
                "./div[@class='font14']/text()").extract_first(default="")
            raw_value = feature.xpath(
                "./div[@class='tt']/text()").extract_first(default="")
            if not label:
                continue
            feature_dict[label] = CommonClass.clean_string(string=raw_value,
                                                           char_to_remove=[
                                                               '\r',
                                                               '\n',
                                                               '\t',
                                                               ' ',
                                                           ])

        # Location rows: cleaned label -> "___"-joined cleaned text.
        location_dict = {}
        for row in title_right_box.xpath(
                "./div[@class='tr-line']/div[@class='trl-item2 clearfix']"):
            label = row.xpath("./div[@class='lab']/text()").extract_first(
                default="")
            pieces = [
                scrub(chunk) for chunk in row.xpath(
                    "string(./div[@class='rcont'])").extract()
            ]
            if not label:
                continue
            label = CommonClass.clean_string(string=label,
                                             char_to_remove=[
                                                 '\u2003',
                                                 '\xa0',
                                                 '\n',
                                                 '\t',
                                                 ' ',
                                             ])
            location_dict[label] = "___".join(pieces)

        # Listing description box.
        information_box = response.xpath(
            "//div[@class='content-item fydes-item']")
        # Evaluated as before, but not included in the returned dict.
        information_title = "___".join(
            information_box.xpath("string(./div[@class='title'])").extract())
        information_dict = {}
        for row in information_box.xpath(
                "./div[@class='cont clearfix']/div[@class='text-item clearfix']"
        ):
            label = row.xpath("./span[@class='lab']/text()").extract_first(
                default="")
            pieces = [
                scrub(chunk) for chunk in row.xpath(
                    "string(./span[@class='rcont'])").extract()
            ]
            if label:
                information_dict[label] = "___".join(pieces)

        # Community box: heading text plus three groups of sibling rows.
        community_heading = response.xpath("//div[@id='xq_message']")
        community_dict = {
            "title":
            scrub(community_heading.xpath("./text()").extract_first(
                default="")),
        }
        community_body = community_heading.xpath("./following-sibling::div")

        # Group 1: values are joined without any cleaning.
        first_group = community_body.xpath("./div[@class='topt clearfix']")
        for row in first_group.xpath("./div[@class='text-item clearfix']"):
            label = row.xpath("./span[@class='lab']/text()").extract_first(
                default="")
            values = row.xpath("string(./span[@class='rcont'])").extract()
            if label:
                community_dict[label] = "___".join(values)

        # Group 2: label and value are both cleaned; the value also loses
        # every '\r'.
        second_group = first_group.xpath("./following-sibling::div")
        for row in second_group.xpath("./div[@class='text-item clearfix']"):
            label = row.xpath("./span[@class='lab']/text()").extract_first(
                default="")
            raw_value = row.xpath(
                "./span[@class='rcont ']/text()").extract_first(default="")
            if not label:
                continue
            label = CommonClass.clean_string(string=label,
                                             char_to_remove=[
                                                 '\xa0',
                                                 '\n',
                                                 '\t',
                                                 ' ',
                                             ])
            community_dict[label] = CommonClass.clean_string(
                string=raw_value,
                char_to_remove=[
                    '\xa0',
                    '\n',
                    '\t',
                    ' ',
                    '\r',
                ])

        # Group 3: a single label whose non-empty cleaned pieces are
        # concatenated without a separator.
        third_group = second_group.xpath("./following-sibling::div")
        third_label = third_group.xpath(
            "./div[@class='text-item']/span[@class='lab']/text()"
        ).extract_first(default="")
        third_pieces = []
        for chunk in third_group.xpath(
                "string(./div[@class='text-item']/span[@class='rcont'])"
        ).extract():
            piece = scrub(chunk)
            if piece:
                third_pieces.append(piece)
        if third_label:
            community_dict[third_label] = "".join(third_pieces)

        return {
            "title": title.strip(),
            "price": price.strip(),
            "feature": feature_dict,
            "location": location_dict,
            "information": information_dict,
            "community": community_dict,
            "city": city,
            "apt_id": apt_id,
        }
Esempio n. 6
0
 def parse_response_field(self, response=None, city_name="", apt_id=""):
     """Parse one rental listing page into a flat dict of stripped strings.

     Args:
         response: object exposing ``xpath()``. When ``self.run_purpose`` is
             ``"READ_HTML"`` it must be a ``Selector`` instance.
         city_name: first component of the generated ``rent_id``.
         apt_id: second component of ``rent_id`` (stripped).

     Returns:
         dict: with the keys ``rent_id`` (city, apartment id and
         ``self.overwrite_today`` joined by ``_``), ``location``,
         ``address``, ``rent``, ``rent_type``, ``facing``, ``apt_type``,
         ``floor``, ``area``, ``decorate`` and ``update_date``; an empty
         dict when ``response`` is ``None`` or fails the ``READ_HTML`` type
         check.
     """
     if response is None:
         return {}
     if "READ_HTML" == self.run_purpose and not isinstance(
             response, Selector):
         return {}

     def scrub(raw):
         # Remove CR, LF, tab and double quotes through CommonClass.
         return CommonClass.clean_string(string=raw,
                                         char_to_remove=[
                                             '\r',
                                             '\n',
                                             '\t',
                                             '"',
                                         ])

     def first_two_texts(tt_xpath):
         # Text of the first and second matching "tt" div; "" for each one
         # that is missing.
         cells = response.xpath(tt_xpath)
         texts = [
             cell.css('div::text').extract_first(default="")
             for cell in cells[:2]
         ]
         texts.extend([""] * (2 - len(texts)))
         return texts[0], texts[1]

     # Address and location: primary layout first.
     rcont_nodes = response.xpath(
         '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]')
     address = ""
     if rcont_nodes:
         # NOTE(review): the leading '//' makes this query document-wide
         # rather than relative to rcont_nodes[0] -- confirm that is intended.
         address = rcont_nodes[0].xpath(
             '//div[@class="rcont"]/a/text()').extract_first(default="")
     location_list = response.xpath(
         '//div[@class="trl-item2 clearfix"]/div[@class="rcont address_zf"]/a/text()'
     ).extract()
     if not location_list:
         # Alternative layout: underlined links carry the location, links
         # without a class carry the address parts.
         location_list = response.xpath(
             '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[@class="link-under"]/text()'
         ).extract()
         address = ";".join(
             response.xpath(
                 '//div[@class="trl-item2 clearfix"]/div[@class="rcont"]/a[not(@class)]/text()'
             ).extract())
     # Location parts are concatenated in reverse document order.
     location = "".join(reversed(location_list))
     if address:
         address = scrub(address)
     if location:
         location = scrub(location)

     # Rent: the first two non-blank text fragments of the price block.
     rent_div = response.xpath(
         '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1 rel"]'
     )
     if not rent_div:
         rent_div = response.xpath(
             '//div[@class="tr-line clearfix zf_new_title"]/div[@class="trl-item sty1"]'
         )
     trimmed_fragments = (fragment.replace("\n", " ").strip()
                          for fragment in rent_div.css('::text').extract())
     rent_parts = [fragment for fragment in trimmed_fragments if fragment]
     rent = rent_parts[0] + rent_parts[1] if 1 < len(rent_parts) else ""

     # Each width class holds two stacked attribute cells.
     rent_type, facing = first_two_texts(
         '//div[@class="trl-item1 w146"]/div[@class="tt"]')
     apt_type, floor = first_two_texts(
         '//div[@class="trl-item1 w182"]/div[@class="tt"]')
     area, decorate = first_two_texts(
         '//div[@class="trl-item1 w132"]/div[@class="tt"]')

     # The update date is read from the second span of the id paragraph.
     update_date_spans = response.xpath('//p[@class="gray9 fybh-zf"]/span')
     update_date = ""
     if 1 < len(update_date_spans):
         update_date = update_date_spans[1].css("::text").extract_first(
             default="")

     return {
         "rent_id": f"{city_name}_{apt_id.strip()}_{self.overwrite_today}",
         "location": location.strip(),
         "address": address.strip(),
         "rent": rent.strip(),
         "rent_type": rent_type.strip(),
         "facing": facing.strip(),
         "apt_type": apt_type.strip(),
         "floor": floor.strip(),
         "area": area.strip(),
         "decorate": decorate.strip(),
         "update_date": update_date.strip(),
     }
Esempio n. 7
0
	def format_css(self, css=""):
		"""Split raw CSS text into background offsets and svg image references.

		The text is scanned character by character: everything between the
		previous rule end and ``{`` is taken as the selector, everything
		between ``{`` and ``}`` as the declarations of that rule.

		* a rule containing ``background:`` stores its two pixel offsets as
		  floats in ``xy_dict`` under the cleaned selector;
		* a rule containing ``background-image:`` (selector of the form
		  ``element[class^="prefix"]``) stores the element, the file name and
		  the url in ``svg_file_names`` under the class prefix;
		* an ``@font-face`` rule is skipped with a warning.

		Parsing stops at the first malformed rule (logged through
		``self.logger``); whatever was collected up to that point is returned.

		Side effect: ``self.key_length`` is set to the length of the first
		class prefix seen while it is still smaller than 1.

		Args:
			css: the CSS source text.

		Returns:
			dict: ``{'xy_dict': ..., 'svg_file_names': ...}``, or an empty dict
			when ``css`` is ``None`` or empty.
		"""
		return_dict = {}
		if css is None or 1 > len(css):
			return return_dict
		csslen = len(css)
		i = 0
		xy_dict = {}
		svg_file_names = {}
		start = 0
		skip_font_face = False
		# Pre-bind the names the '}' branch reads. Without this a '}' that
		# precedes any '{' (key), or a duplicate-prefix error before any
		# 'background:' rule (value_list), raised UnboundLocalError instead of
		# reaching the logged error path.
		key = None
		value_list = []
		while i < csslen:
			if css[i] == '{':
				key = css[start:i]
				if "." in key:
					key = CommonClass.clean_string(string=key, char_to_remove=['\r', '\n', '\t', '.', ' '])
				elif "class^=" in key:
					key = CommonClass.clean_string(string=key, char_to_remove=['\r', '\n', '\t', '.', ' ', ']', '"'])
					key_list = key.split("[class^=")
					if 2 == len(key_list):
						# "element___prefix"; split again when the rule closes.
						key = f"{key_list[0]}___{key_list[1]}"
					else:
						self.logger.error(f"Error! key_list = {key_list}; key = {key}")
						break
				elif "@font-face" in key:
					skip_font_face = True
				else:
					self.logger.error(f"Error! key = {key}")
					break
				i += 1
				start = i
			elif css[i] == '}':
				value = css[start:i]
				value = CommonClass.clean_string(string=value, char_to_remove=['\r', '\n', '\t'])

				if key is None or 1 > len(key):
					self.logger.error(f"Error! key is None. value = {value}")
					break
				if "background:" in value:
					value_list = value.split(" ")
					value_list = CommonClass.remove_0_len_element(list4remove=value_list)
					if 2 == len(value_list):
						x_str = CommonClass.clean_string(string=value_list[0], char_to_remove=['background:', 'px'])
						y_str = CommonClass.clean_string(string=value_list[1], char_to_remove=['px;'])
						x = float(x_str)
						y = float(y_str)
						# x, y could equal to 0.0
						xy_dict[key] = {'x': x, 'y': y}
						key = None
					else:
						self.logger.error(f"Wrong value_list len. value = {value}; value_list = {value_list}")
						break
				elif "background-image:" in value:
					searchObj = re.search(r'url\((.*?)\)', value, re.M)
					if searchObj is None:
						self.logger.error(f"url not found. value = {value}")
						break
					temp = searchObj.group()
					temp_list = temp.split("/")
					temp = CommonClass.clean_string(string=temp, char_to_remove=["url(", ")"])
					key_list = key.split("___")
					if 2 != len(key_list):
						self.logger.error(f"Error! len of key_list is NOT 2. key_list = {key_list}; value = {value} ")
						break
					if key_list[1] in xy_dict:
						# A class prefix must not collide with a full class name.
						self.logger.error(f"Error! xy_dict has key {key_list[1]}; value = {xy_dict[ key_list[1] ]}; value_list = {value_list}")
						self.logger.info(xy_dict)
						break
					temp_dict = {
						'element': key_list[0],
						'filename': (temp_list[-1]).replace(')', ''),
						'url': temp,
					}
					svg_file_names[key_list[1]] = temp_dict
					if 1 > self.key_length:
						self.key_length = len(key_list[1])
					key = None
				elif skip_font_face:
					self.logger.warning("@font-face skipped")
					skip_font_face = False
				else:
					self.logger.warning(f"background-image: not found. value = {value}; i = {i}")
					# do NOT break here: an unrecognised rule is only logged
				i += 1
				start = i
			else:
				i += 1
		return_dict['xy_dict'] = xy_dict
		return_dict['svg_file_names'] = svg_file_names
		return return_dict
Esempio n. 8
0
    def get_parse_dict_on_list_page(self, one_li=None, channel=""):
        """Extract the list-page fields of one shop node for a channel.

        Pages of different channels use different markup, so the XPath
        table is chosen by ``channel``: any channel listed in
        ``self.database_common_channel_list_table`` (with extra fields for
        'ch10' and an overridden 'group_deal' for 'ch30'/'ch25'), then
        'ch70'/'ch90', then 'hotel'. An unknown channel selects no fields.

        :param one_li: selector for one list item; it must provide
            ``xpath(...)`` returning an object with ``extract()`` and
            ``extract_first(default=...)``. When ``None`` nothing is
            extracted.
        :param channel: channel identifier such as 'ch10' or 'hotel'.
        :return: tuple ``(this_page_dict, shop_id)`` where ``shop_id`` is
            ``this_page_dict['shop_id']`` or ``'0'`` when that field was
            not extracted.
        """
        this_page_xpath = {}
        this_page_dict = {}
        need_clean = []
        use_extract = []
        need_split_and_clean = []
        if channel in self.database_common_channel_list_table:
            use_extract = ['group_deal_list']
            this_page_xpath = {
                'title':
                "./div[@class='txt']/div[@class='tit']/a/h4/text()",
                'shop_id':
                "./div[@class='txt']/div[@class='tit']/a/@data-shopid",
                'star':
                "./div[@class='txt']/div[@class='comment']/span[contains(@class, 'sml-rank-stars')]/@title",
                'group_deal':
                "./div/a[@data-click-name='shop_info_groupdeal_click']/@title",
                # group_deal_list found in [ 'ch10', 'ch15', 'ch30', 'ch45', 'ch50', 'ch65', 'ch75', 'ch80', 'ch85', 'ch95', ]:
                'group_deal_list':
                "./div[@class='svr-info']/div/a[@data-click-name='shop_info_groupdeal_click']/@title",
                'address':
                "./div/a[@data-click-name='shop_map_click']/@data-address",
                'out_of_business':
                "./div[@class='txt']/div[@class='tit']/span[@class='istopTrade']/text()",
            }
            if 'ch10' == channel:
                need_split_and_clean = ['recommended_dishes']
                this_page_xpath[
                    'takeway'] = "./div/a[@data-click-name='shop_info_takeway_click']/@title"
                this_page_xpath[
                    'recommended_dishes'] = "string(./div[@class='txt']/div[@class='recommend'])"
            elif channel in ['ch30', 'ch25']:
                this_page_xpath[
                    'group_deal'] = "./div[@class='txt']/div[@class='tit']/div/a[@class='igroup']/@title"
        elif channel in ['ch70', 'ch90']:
            # 'ch70' and 'ch90' share the same markup. The ch90 (home
            # decoration) channel was added after 2019-05; it currently has
            # no string obfuscation, so Chinese text and digits can be read
            # directly.
            this_page_xpath = {
                'title':
                "./div[@class='info baby-info']/p[@class='title']/a[@class='shopname']/text()",
                'branch':
                "./div[@class='info baby-info']/p[@class='title']/span[@class='icon-sale']/a[@class='shopbranch']/em/text()",
                'shop_id':
                "./@data-shopid",
                'star':
                "./div[@class='info baby-info']/p[@class='remark']/span[contains(@class, 'item-rank-rst')]/@title",
                'review_numbers':
                "./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='comment-count']/a/text()",
                'mean_prices':
                "string(./div[@class='info baby-info']/p[@class='baby-info-scraps']/span[@class='average'])",
                'group_deal':
                "./div[@class='info baby-info']/div[@class='tuan-info']/a[@class='tuan']/@title",
            }
            need_clean = [
                'mean_prices',
            ]
        elif channel in ['hotel']:
            use_extract = ['hotel_tags']
            need_clean = [
                'place',
                'price',
            ]
            this_page_xpath = {
                'shop_id':
                "./@data-poi",
                'title':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/h2[@class='hotel-name']/a/text()",
                'place':
                "string(./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='place'])",
                'hotel_tags':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-info-main']/p[@class='hotel-tags']/span/text()",
                'price':
                "string(./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='price']/p)",
                'star':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='remark']/div[@class='item-rank-ctn']/div[@class='item-rank-ctn']/span/@class",
                'review_numbers':
                "./div[@class='hotel-info-ctn']/div[@class='hotel-remark']/div[@class='remark']/div[@class='item-rank-ctn']/div[@class='item-rank-ctn']/a/text()",
            }

        if one_li is not None:
            for key, xpath in this_page_xpath.items():
                selected = one_li.xpath(xpath)
                if key in use_extract:
                    # Multi-valued field: hand every match to the cleaner.
                    this_page_dict[
                        key] = CommonClass.get_cleaned_string_by_splitting_list(
                            string_or_list=selected.extract(),
                            char_to_remove=['\r', '\n', '\t', ' '])
                elif key in need_clean:
                    this_page_dict[key] = CommonClass.clean_string(
                        string=selected.extract_first(default=""),
                        char_to_remove=['\r', '\n', '\t', ' '])
                elif key in need_split_and_clean:
                    this_page_dict[
                        key] = CommonClass.get_cleaned_string_by_splitting_list(
                            string_or_list=selected.extract_first(default=""),
                            char_to_remove=['\r', '\n', '\t', ' '])
                else:
                    this_page_dict[key] = selected.extract_first(default="")

            # Hotel-only post-processing. It runs once, after every field has
            # been extracted: running it per loop iteration would normalise
            # an already-mapped 'star' value a second time.
            if channel in ['hotel']:
                if 'star' in this_page_dict:
                    stripped = this_page_dict['star'].replace(
                        "sml-rank-stars sml-str", "")
                    star = stripped
                    if re.match(r'^(\d)+$', stripped):
                        level = int(stripped)
                        level_table = self.database_merchant_star_level_table
                        if level in level_table:
                            star = level_table[level]
                    this_page_dict['star'] = star
                if 'review_numbers' in this_page_dict:
                    # Drop the parentheses surrounding the review count.
                    this_page_dict['review_numbers'] = this_page_dict[
                        'review_numbers'].replace("(", "").replace(")", "")

        shop_id = this_page_dict.get('shop_id', '0')

        # extract special nodes
        # no by now

        return this_page_dict, shop_id