def __parse_root_node(self, root_node):
    """Copy the optional attributes of the configuration root node onto ``self``.

    Three attributes are read through ``LxmlHelper.get_attribute_of_element``:

    * ``is_active`` -> ``self.is_active``, stored as a bool that is true only
      when the attribute text equals ``str(True)``;
    * ``class``     -> ``self.class_str``;
    * ``id``        -> ``self.id_str``.

    An attribute for which the helper returns ``None`` leaves the
    corresponding field untouched.

    :param root_node: the root element of the configuration document.
    """
    active_text = LxmlHelper.get_attribute_of_element(root_node, "is_active")
    if active_text is not None:
        self.is_active = active_text == str(True)

    # The remaining attributes are plain strings copied over verbatim.
    for xml_name, field_name in (("class", "class_str"), ("id", "id_str")):
        attr_value = LxmlHelper.get_attribute_of_element(root_node, xml_name)
        if attr_value is not None:
            setattr(self, field_name, attr_value)
    def __parse_configure_file(self, configure_file):
        """Load a spider configuration XML file and build the top-level crawl steps.

        The attributes of the ``/spider`` root element are applied through
        ``__parse_root_node``; its direct children are then dispatched by tag:

        * ``url``     -- collected, and converted into crawl steps only after
          every mapping and output definition has been read, so a ``url`` may
          reference definitions that appear later in the file;
        * ``mapping`` -- a ``DataMapper`` initialised from the first ``table``
          element found *under that mapping*, registered under the mapping's
          ``id`` attribute, or under ``"default"`` when it has none;
        * ``outputs`` -- an output configuration registered the same way.

        Any other element tag is reported on stdout and ignored. Comments and
        processing instructions are skipped silently.

        :param configure_file: source handed straight to ``etree.parse``.
        :raises Exception: if more than one ``mapping``, or more than one
            ``outputs`` element, has no ``id`` attribute.
        :raises IndexError: if the document has no ``/spider`` root, or a
            ``mapping`` element contains no ``table`` descendant.
        """
        doc = etree.parse(configure_file)
        root = doc.xpath('/spider')[0]
        self.__parse_root_node(root)

        data_mapper_dir = {}
        default_data_mapper = False
        output_config_dir = {}
        default_output_config = False
        url_nodes = []
        for child in root:
            # lxml exposes comments / processing instructions as children whose
            # ``tag`` is not a string; concatenating it below would raise
            # TypeError, so such nodes are ignored up front.
            if not isinstance(child.tag, str):
                continue

            if child.tag == 'url':
                url_nodes.append(child)
            elif child.tag == 'mapping':
                data_mapper = DataMapper()
                # ".//table" keeps the search inside this mapping element.
                # A bare "//table" is absolute and would always return the
                # first table of the whole document, whichever mapping is
                # being processed.
                table_node = child.xpath(".//table")[0]
                data_mapper.init_setting_by_xml(table_node)
                setting_id = LxmlHelper.get_attribute_of_element(child, "id")
                if setting_id is not None:
                    data_mapper_dir[setting_id] = data_mapper
                elif not default_data_mapper:
                    default_data_mapper = True
                    data_mapper_dir["default"] = data_mapper
                else:
                    raise Exception("存在多个默认数据映射器")
            elif child.tag == 'outputs':
                output_config = LxmlHelper.convert_children_to_dir(child)
                outputs_id = LxmlHelper.get_attribute_of_element(child, 'id')
                if outputs_id is not None:
                    output_config_dir[outputs_id] = output_config
                elif not default_output_config:
                    default_output_config = True
                    output_config_dir["default"] = output_config
                else:
                    raise Exception("存在多个默认输出配置")
            else:
                print("unknown tag " + child.tag + " in config file")

        for node in url_nodes:
            self.top_url_step.append(
                self.__parse_url_node(node, data_mapper_dir,
                                      output_config_dir))
 def __parse_url_node(self, url_node, data_mapper_dir, output_dir):
     """Build a ``CrawlStep`` from a ``url`` configuration node, recursively.

     * The URL comes from the element text, or from the ``value`` attribute
       when the element has no text at all. When a URL is available it is
       stripped, stored on the step and registered in ``self.url_to_step``.
     * A ``mapping`` attribute selects the data mapper; an ``output``
       attribute selects the output configuration, falling back to the
       ``"default"`` entry when the attribute is absent or unknown.
     * Each child contributes a search condition built from its ``xpath`` and
       ``regex`` attributes: a nested ``url`` child becomes a child-page step,
       a ``next`` child configures the next-page lookup.

     :param url_node: the ``url`` element to convert.
     :param data_mapper_dir: mapping id -> data mapper.
     :param output_dir: output id -> output configuration.
     :return: the populated ``CrawlStep``.
     :raises Exception: if the ``mapping`` attribute names an unknown mapper.
     :raises KeyError: if the default output is needed but not defined.
     """
     step = CrawlStep()

     # Element text wins; the attribute is only consulted when text is None.
     address = url_node.text
     if address is None:
         address = LxmlHelper.get_attribute_of_element(url_node, 'value')
     mapper_id = LxmlHelper.get_attribute_of_element(url_node, 'mapping')
     output_id = LxmlHelper.get_attribute_of_element(url_node, 'output')

     if address is not None:
         address = address.strip()
         step.url = address
         self.url_to_step[address] = step

     if mapper_id is not None:
         if mapper_id not in data_mapper_dir:
             raise Exception("不存在id为" + mapper_id + "的数据映射器")
         chosen_mapper = data_mapper_dir[mapper_id]
         # Both lookups happen before anything is written to the step.
         use_default = output_id is None or output_id not in output_dir
         chosen_output = (output_dir["default"]
                          if use_default else output_dir[output_id])
         step.data_mapper = chosen_mapper
         step.output_config = chosen_output

     for child in list(url_node):
         search_condition = {}
         for attr_name in ('xpath', 'regex'):
             attr_value = LxmlHelper.get_attribute_of_element(child, attr_name)
             if attr_value is not None:
                 search_condition[attr_name] = attr_value

         tag = child.tag
         if tag == 'url':
             child_step = self.__parse_url_node(child, data_mapper_dir,
                                                output_dir)
             step.search_child_page_by_condition(search_condition, child_step)
         elif tag == 'next':
             step.set_next_page(search_condition)
     return step
 def handle_html_node(current_node):
     """Convert an HTML node into a cleaned string or a nested list of them.

     A node is treated as a leaf when it has no children, when its own text
     is present and not purely whitespace, or when it is a ``td``. A leaf
     yields its text (via ``LxmlHelper.get_text_of_node``) passed through
     ``StringHelper.filter_illegal_char``.

     For any other node, a single child is unwrapped (its result is returned
     directly), while several children produce a list with one entry per
     child.

     :param current_node: the element to convert.
     :return: the cleaned text of a leaf, or a (possibly nested) list.
     """
     children = list(current_node)
     own_text = current_node.text
     has_own_text = own_text is not None and not own_text.isspace()

     # Leaf node
     if not children or has_own_text or current_node.tag == 'td':
         raw_text = LxmlHelper.get_text_of_node(current_node)
         return StringHelper.filter_illegal_char(raw_text)

     # Non-leaf node: a lone child does not get its own list level.
     if len(children) == 1:
         return StructuringParser.handle_html_node(children[0])
     return [StructuringParser.handle_html_node(child) for child in children]
Esempio n. 5
0
    def parse(self, response):
        """Handle one downloaded page: follow child links, then extract data.

        The crawl step for the requested URL is fetched from
        ``self.config_pool``. If none is returned, an error line is printed
        and nothing is yielded. Otherwise:

        1. When the step has child pages, every search condition carrying an
           ``xpath`` key is evaluated against the parsed page. For each
           matched node the ``href`` is resolved against the response URL and
           percent-quoted; if the condition also has a ``regex`` key the node
           must match it. Accepted links are reported to the config pool and
           yielded as ``scrapy.Request`` objects.
        2. When the step is a data page, the response is structured, handed
           to the step's data mapper, and one ``ConfigurablespidersItem`` is
           yielded per record produced by the mapper.

        :param response: the response object for the downloaded page.
        """
        current_url = response.request.url
        print("requesting " + current_url)

        steps = self.config_pool.get_steps(current_url)
        if steps is None:
            print("获取" + current_url + "的解析步骤时发生错误")
            return

        # Find the links to child pages
        if steps.has_child_page():
            xpath_key = 'xpath'
            regex_key = 'regex'
            doc = etree.HTML(response.body.decode(response.encoding))
            for condition in steps.search_conditions:
                if xpath_key not in condition:
                    continue

                a_tags = doc.xpath(condition[xpath_key])
                if len(a_tags) == 0:
                    print("xpath:" + condition[xpath_key] + "在" +
                          response.url + "中无匹配节点")

                for a_tag in a_tags:
                    href = LxmlHelper.get_attribute_of_element(a_tag, 'href')
                    absolute_url = response.urljoin(href)
                    # Pre-process URLs containing Chinese characters
                    target_url = urllib.parse.quote(absolute_url,
                                                    safe=";/?:@&=+$,",
                                                    encoding="utf-8")
                    # A regex, when configured, acts as an extra filter.
                    if regex_key in condition and \
                            not ExampleSpider.is_node_match_regex(
                                a_tag, condition[regex_key]):
                        continue
                    self.config_pool.accpet_xpath_crawl_result(
                        current_url, condition, target_url)
                    yield scrapy.Request(target_url)

        # Parse the current page
        if steps.is_data_page():
            mapper = steps.data_mapper
            structured = StructuringParser.parse(response)
            mapper_name = mapper.set_structuring_data(structured)
            print(current_url + "使用的数据映射器类型为" + mapper_name)
            for record in mapper:
                item = ConfigurablespidersItem()
                item['url'] = current_url
                item['data_item'] = record
                yield item
 def fix_row_and_col_span(node):
     """Expand ``rowspan`` attributes in the subtree rooted at ``node``.

     When ``node`` carries a ``rowspan`` attribute of value *n*, a deep copy
     of it (with ``rowspan`` reset to ``"0"``) is inserted into each of up to
     *n* - 1 following siblings of its parent, at the child index ``node``
     occupies in its own parent. Expansion stops early when the parent runs
     out of following siblings. The function then recurses into the children
     of ``node``.

     ``colspan`` is not expanded; an earlier implementation of that step is
     disabled.

     :param node: element to process; it must have a parent.
     """
     parent = node.getparent()
     position = parent.index(node)

     span_text = LxmlHelper.get_attribute_of_element(node, "rowspan")
     if span_text is not None:
         remaining = int(span_text) - 1
         target = parent
         while remaining > 0:
             target = target.getnext()
             if target is None:
                 break
             clone = deepcopy(node)
             # Reset the span so the copy is not expanded again when the
             # recursion later reaches it.
             clone.set("rowspan", "0")
             target.insert(position, clone)
             remaining -= 1

     for child in list(node):
         StructuringParser.fix_row_and_col_span(child)
        def mapping(self, data):
            """Yield one ``OrderedDict`` per data row found below each attribute row.

            Phase 1 locates candidate attribute (header) rows: every key of
            ``self.col_mapping_dic`` is looked up in ``data`` with
            ``ArrayHelper.locate_value``; the hits are grouped by their
            coordinate minus its last component, and a group becomes a
            candidate once it holds at least
            ``min(len(self.get_target_attributes()), 3)`` hits. The candidates
            are printed.

            Phase 2 walks the rows following each candidate inside the same
            table, stopping at the first row whose length differs from the
            attribute row or which is itself a candidate. Each cell whose
            column maps to target attributes is copied into the result under
            every one of those attributes. The nodes in ``self.select_node``
            then add extra values taken from a header cell (``@h<N>``) or a
            title row (``@t<N>``), optionally narrowed by a regex.

            :param data: nested list structure addressed by coordinates
                (presumably tables -> rows -> cells; confirm against
                ``ArrayHelper``).
            :return: generator of ``OrderedDict`` records.
            """
            # Work out where all attribute rows are located
            attributes_row_index_counter = {}
            for key in self.col_mapping_dic:
                coordinates = ArrayHelper.locate_value(data, key)
                for coordinate in coordinates:
                    # Drop the last component so hits are counted per row;
                    # a tuple is used because it must serve as a dict key.
                    x = tuple(coordinate[0:len(coordinate) - 1])
                    if x in attributes_row_index_counter.keys():
                        counter = attributes_row_index_counter[x]
                        attributes_row_index_counter[x] = counter + 1
                    else:
                        attributes_row_index_counter[x] = 1
            attributes_row_index = []
            number_of_target_attributes = len(self.get_target_attributes())
            for key in attributes_row_index_counter:
                # A row qualifies once enough known keys were found in it;
                # the threshold is capped at 3.
                if attributes_row_index_counter[key] >= min(
                        number_of_target_attributes, 3):
                    attributes_row_index.append(key)
            print("所有可能的属性行如下:")
            for index in attributes_row_index:
                print(ArrayHelper.get_value_by_coordinate(data, index))

            # Mapping
            # Index of the last row of the previous data area.
            # NOTE(review): only updated when the row loop below breaks; if a
            # data area runs to the end of its table the old value is kept --
            # confirm this is intended.
            last_data_row_index = -1
            for coordinate in attributes_row_index:
                # Last component: row index of the attribute row in its table;
                # the remaining components address the table itself.
                attribute_row_index = coordinate[len(coordinate) - 1]
                start_row_index = attribute_row_index + 1
                attribute_row = ArrayHelper.get_value_by_coordinate(
                    data, coordinate)
                table_coordinate = coordinate[0:len(coordinate) - 1]
                table = ArrayHelper.get_value_by_coordinate(
                    data, table_coordinate)
                if table is None:
                    continue

                # Work out which column index corresponds to which attribute(s)
                index_to_attribute = {}
                # Header cells that mapped to a target attribute, in column
                # order; '@h<N>' selectors index into this list.
                select_attributes = []
                for i in range(0, len(attribute_row)):
                    target_attribute = DataMapper.get_target_attribute(
                        self.col_mapping_dic, attribute_row[i])
                    if target_attribute is not None:
                        index_to_attribute[i] = target_attribute
                        select_attributes.append(attribute_row[i])

                for i in range(start_row_index, len(table)):
                    row = table[i]
                    current_coordinate = list(coordinate[0:len(coordinate) -
                                                         1])
                    current_coordinate.append(i)
                    # The data area ends at a row of different width or at
                    # the next attribute row.
                    if len(row) != len(attribute_row) or tuple(
                            current_coordinate) in attributes_row_index:
                        last_data_row_index = i - 1
                        break
                    result = OrderedDict()
                    for j in range(0, len(row)):
                        tmp = row[j]
                        # A cell that is itself a list is flattened into one
                        # string.
                        if isinstance(tmp, list):
                            tmp = ''.join(tmp)
                        if j in index_to_attribute.keys():
                            # One column may feed several target attributes.
                            for attribute in index_to_attribute[j]:
                                result[attribute] = tmp

                    # Handle the select nodes
                    for node in self.select_node:
                        select_str = LxmlHelper.get_attribute_of_element(
                            node, "select")
                        regex_str = LxmlHelper.get_attribute_of_element(
                            node, "regex")
                        # The node text names the output attribute.
                        attribute_str = node.text

                        # A leading '@' means the selection targets a header
                        # attribute or a title
                        value = None
                        if select_str.startswith('@'):
                            header_result = re.search('(?<=@h)[0-9]*',
                                                      select_str)
                            # Match a header
                            if header_result is not None:
                                header_index = int(header_result.group())
                                if regex_str is not None and len(
                                        regex_str) != 0:
                                    search_result = re.search(
                                        regex_str,
                                        select_attributes[header_index])
                                    # No regex match: skip this select node
                                    # without writing its attribute.
                                    if search_result is None:
                                        continue
                                    value = search_result.group()
                                else:
                                    value = select_attributes[header_index]
                            title_result = re.search('(?<=@t)[0-9]*',
                                                     select_str)
                            # Match a title
                            if title_result is not None:
                                title_index = int(title_result.group())
                                # The title row sits title_index rows above
                                # the attribute row.
                                target_title_row_index = attribute_row_index - title_index
                                # It must exist and lie after the previous
                                # data area.
                                if target_title_row_index >= 0 and target_title_row_index > last_data_row_index:
                                    title = None
                                    # Use the first non-empty cell of that row.
                                    for x in table[target_title_row_index]:
                                        if x is not None and len(x) != 0:
                                            title = x
                                            break
                                    if title is not None:
                                        if regex_str is not None and len(
                                                regex_str) != 0:
                                            search_result = re.search(
                                                regex_str, title)
                                            # No regex match: skip this select
                                            # node without writing its
                                            # attribute.
                                            if search_result is None:
                                                continue
                                            value = search_result.group()
                                        else:
                                            value = title
                        else:
                            # TODO other selection methods
                            pass

                        # value is still None here when nothing was selected.
                        result[attribute_str] = value
                    yield result
Esempio n. 8
0
 def is_node_match_regex(node, regex_str):
     """Tell whether the text of ``node`` contains a match for ``regex_str``.

     The text is obtained through ``LxmlHelper.get_text_of_node`` and searched
     with ``re.search``, so the pattern may match anywhere in it.

     :param node: element whose text is inspected.
     :param regex_str: regular expression pattern.
     :return: ``True`` if a match is found, ``False`` otherwise.
     """
     node_text = LxmlHelper.get_text_of_node(node)
     return re.search(regex_str, node_text) is not None