def __parse_root_node(self, root_node):
    """Copy the optional attributes of the <spider> root element onto self."""
    active_attr = LxmlHelper.get_attribute_of_element(root_node, "is_active")
    if active_attr is not None:
        # the attribute is textual; only the literal "True" activates the spider
        self.is_active = active_attr == str(True)
    for attr_name, field_name in (("class", "class_str"), ("id", "id_str")):
        attr_value = LxmlHelper.get_attribute_of_element(root_node, attr_name)
        if attr_value is not None:
            setattr(self, field_name, attr_value)
def __parse_configure_file(self, configure_file):
    """Parse the spider XML configuration file.

    Reads the <spider> root, then collects:
      * <url> nodes     -> crawl entry points (parsed last, once the
                           mapper/output registries are complete)
      * <mapping> nodes -> DataMapper registry keyed by the node's "id"
                           attribute, or "default" when the id is absent
      * <outputs> nodes -> output-config registry, same keying rule

    Raises:
        Exception: when more than one <mapping> or <outputs> node lacks an
            "id" attribute (only one default of each kind is allowed).
    """
    doc = etree.parse(configure_file)
    root = doc.xpath('/spider')[0]
    self.__parse_root_node(root)
    children = list(root)
    data_mapper_dir = {}
    default_data_mapper = False
    output_config_dir = {}
    default_output_config = False
    url_node = []
    for child in children:
        if child.tag == 'url':
            url_node.append(child)
        elif child.tag == 'mapping':
            data_mapper = DataMapper()
            # FIX: the original used child.xpath("//table"); '//' is an
            # absolute path that searches from the document root, so every
            # <mapping> node was initialized from the FIRST <table> in the
            # whole file. ".//table" searches within this <mapping> only.
            table_node = child.xpath(".//table")[0]
            data_mapper.init_setting_by_xml(table_node)
            setting_id = LxmlHelper.get_attribute_of_element(child, "id")
            if setting_id is not None:
                data_mapper_dir[setting_id] = data_mapper
            elif not default_data_mapper:
                default_data_mapper = True
                data_mapper_dir["default"] = data_mapper
            else:
                raise Exception("存在多个默认数据映射器")
        elif child.tag == 'outputs':
            output_config = LxmlHelper.convert_children_to_dir(child)
            outputs_id = LxmlHelper.get_attribute_of_element(child, 'id')
            if outputs_id is not None:
                output_config_dir[outputs_id] = output_config
            elif not default_output_config:
                default_output_config = True
                output_config_dir["default"] = output_config
            else:
                raise Exception("存在多个默认输出配置")
        else:
            print("unknown tag " + child.tag + " in config file")
    # parse <url> nodes last so every mapper/output id can be resolved
    for node in url_node:
        self.top_url_step.append(
            self.__parse_url_node(node, data_mapper_dir, output_config_dir))
def __parse_url_node(self, url_node, data_mapper_dir, output_dir):
    """Build a CrawlStep from a <url> config node, recursing into nested
    <url> children.

    The URL comes from the node text, falling back to the 'value'
    attribute. Each parsed step is registered in self.url_to_step.

    Raises:
        Exception: when the 'mapping' attribute names an id that is not
            present in data_mapper_dir.
    """
    step = CrawlStep()
    url = url_node.text
    if url is None:
        # no text content: the URL may be given as an attribute instead
        url = LxmlHelper.get_attribute_of_element(url_node, 'value')
    mapper_id = LxmlHelper.get_attribute_of_element(url_node, 'mapping')
    output_id = LxmlHelper.get_attribute_of_element(url_node, 'output')
    if url is not None:
        url = url.strip()
        step.url = url
        # side effect: make the step discoverable by URL for later lookups
        self.url_to_step[url] = step
    if mapper_id is not None:
        if mapper_id not in data_mapper_dir.keys():
            raise Exception("不存在id为" + mapper_id + "的数据映射器")
        data_mapper = data_mapper_dir[mapper_id]
        # an unknown/missing output id silently falls back to the default
        if output_id is None or output_id not in output_dir.keys():
            output_config = output_dir["default"]
        else:
            output_config = output_dir[output_id]
        # NOTE(review): mapper and output config are only attached when a
        # 'mapping' attribute is present — confirm steps without one are
        # meant to have neither.
        step.data_mapper = data_mapper
        step.output_config = output_config
    children = list(url_node)
    for child in children:
        # each child carries an optional xpath and/or regex search condition
        search_condition = {}
        xpath_str = LxmlHelper.get_attribute_of_element(child, 'xpath')
        if xpath_str is not None:
            search_condition['xpath'] = xpath_str
        regex_str = LxmlHelper.get_attribute_of_element(child, 'regex')
        if regex_str is not None:
            search_condition['regex'] = regex_str
        if child.tag == 'url':
            # nested <url>: recurse and attach the child step to this one
            step.search_child_page_by_condition(
                search_condition,
                self.__parse_url_node(child, data_mapper_dir, output_dir))
        elif child.tag == 'next':
            # <next>: pagination condition for this same step
            step.set_next_page(search_condition)
    return step
def handle_html_node(current_node):
    """Recursively convert an lxml HTML node into nested lists of cleaned text.

    A node counts as a leaf when it has no children, carries meaningful own
    text, or is a <td> cell; leaves become filtered strings, branches become
    lists of their children's results.
    """
    own_text = current_node.text
    child_nodes = list(current_node)
    has_real_text = own_text is not None and not own_text.isspace()
    if not child_nodes or has_real_text or current_node.tag == 'td':
        # leaf node: return its cleaned text content
        return StringHelper.filter_illegal_char(
            LxmlHelper.get_text_of_node(current_node))
    # branch with a single child collapses to that child's result
    if len(child_nodes) == 1:
        return StructuringParser.handle_html_node(child_nodes[0])
    return [StructuringParser.handle_html_node(child) for child in child_nodes]
def parse(self, response):
    """Scrapy callback: follow configured child-page links, then map the
    current page's data into items.

    Yields:
        scrapy.Request for each matching child link, and
        ConfigurablespidersItem for each mapped data record.
    """
    current_url = response.request.url
    print("requesting " + response.request.url)
    steps = self.config_pool.get_steps(current_url)
    if steps is None:
        print("获取" + current_url + "的解析步骤时发生错误")
        return
    # follow links to child pages
    if steps.has_child_page():
        html_doc = etree.HTML(response.body.decode(response.encoding))
        for condition in steps.search_conditions:
            if 'xpath' not in condition.keys():
                continue
            matched_tags = html_doc.xpath(condition['xpath'])
            if len(matched_tags) == 0:
                print("xpath:" + condition['xpath'] + "在" + response.url + "中无匹配节点")
            for tag in matched_tags:
                href = LxmlHelper.get_attribute_of_element(tag, 'href')
                href = response.urljoin(href)
                # percent-encode non-ASCII (e.g. Chinese) characters in the URL
                href = urllib.parse.quote(href, safe=";/?:@&=+$,", encoding="utf-8")
                # a link passes when no regex is configured, or its node
                # text matches the configured regex
                regex_ok = ('regex' not in condition.keys()
                            or ExampleSpider.is_node_match_regex(tag, condition['regex']))
                if regex_ok:
                    # NOTE(review): "accpet" is a typo in the pool's public
                    # method name; it must stay until the pool is renamed.
                    self.config_pool.accpet_xpath_crawl_result(
                        current_url, condition, href)
                    yield scrapy.Request(href)
    # map the current page's data into items
    if steps.is_data_page():
        structuring_data = StructuringParser.parse(response)
        data_mapper = steps.data_mapper
        mapper_name = data_mapper.set_structuring_data(structuring_data)
        print(current_url + "使用的数据映射器类型为" + mapper_name)
        for record in data_mapper:
            item = ConfigurablespidersItem()
            item['url'] = current_url
            item['data_item'] = record
            yield item
def fix_row_and_col_span(node):
    """Expand a cell's rowspan by inserting copies of it into the
    following rows at the same column index, then recurse into children.

    Inserted copies get rowspan="0" so the recursion does not expand
    them again.
    """
    parent = node.getparent()
    col_index = parent.index(node)
    rowspan_value = LxmlHelper.get_attribute_of_element(node, "rowspan")
    if rowspan_value is not None:
        following_row = parent
        remaining = int(rowspan_value) - 1
        while remaining > 0:
            following_row = following_row.getnext()
            if following_row is None:
                # ran out of sibling rows; stop early
                break
            duplicate = deepcopy(node)
            duplicate.set("rowspan", "0")
            following_row.insert(col_index, duplicate)
            remaining -= 1
    # TODO: colspan expansion is not implemented yet
    for child in list(node):
        StructuringParser.fix_row_and_col_span(child)
def mapping(self, data):
    """Map structured table data onto target attributes, yielding one
    OrderedDict per data row.

    First locates candidate attribute (header) rows by counting how many
    configured column names appear in each row, then walks the rows below
    each header, translating cells to target attributes via
    self.col_mapping_dic and applying the configured <select> nodes.

    Args:
        data: nested lists of strings — presumably the output of the
            structuring parser (TODO confirm shape against callers).

    Yields:
        OrderedDict: one mapped record per data row.
    """
    # locate all candidate attribute (header) rows: count, per row
    # coordinate, how many configured column names occur in it
    attributes_row_index_counter = {}
    for key in self.col_mapping_dic:
        coordinates = ArrayHelper.locate_value(data, key)
        for coordinate in coordinates:
            # drop the last axis so hits in the same row share one key
            x = tuple(coordinate[0:len(coordinate) - 1])
            if x in attributes_row_index_counter.keys():
                counter = attributes_row_index_counter[x]
                attributes_row_index_counter[x] = counter + 1
            else:
                attributes_row_index_counter[x] = 1
    attributes_row_index = []
    number_of_target_attributes = len(self.get_target_attributes())
    # a row qualifies as a header when it contains at least
    # min(number_of_target_attributes, 3) known column names
    for key in attributes_row_index_counter:
        if attributes_row_index_counter[key] >= min(
                number_of_target_attributes, 3):
            attributes_row_index.append(key)
    print("所有可能的属性行如下:")
    for index in attributes_row_index:
        print(ArrayHelper.get_value_by_coordinate(data, index))
    # mapping phase
    last_data_row_index = -1  # index of the last row of the previous data region
    for coordinate in attributes_row_index:
        attribute_row_index = coordinate[len(coordinate) - 1]
        start_row_index = attribute_row_index + 1
        attribute_row = ArrayHelper.get_value_by_coordinate(
            data, coordinate)
        # the enclosing table is the coordinate without its last axis
        table_coordinate = coordinate[0:len(coordinate) - 1]
        table = ArrayHelper.get_value_by_coordinate(
            data, table_coordinate)
        if table is None:
            continue
        # build column index -> target attribute(s) for this header row
        index_to_attribute = {}
        select_attributes = []
        for i in range(0, len(attribute_row)):
            target_attribute = DataMapper.get_target_attribute(
                self.col_mapping_dic, attribute_row[i])
            if target_attribute is not None:
                index_to_attribute[i] = target_attribute
                select_attributes.append(attribute_row[i])
        for i in range(start_row_index, len(table)):
            row = table[i]
            current_coordinate = list(coordinate[0:len(coordinate) - 1])
            current_coordinate.append(i)
            # stop at a row of a different width, or at the next header row
            if len(row) != len(attribute_row) or tuple(
                    current_coordinate) in attributes_row_index:
                last_data_row_index = i - 1
                break
            result = OrderedDict()
            for j in range(0, len(row)):
                tmp = row[j]
                if isinstance(tmp, list):
                    # nested cell content: flatten to a single string
                    tmp = ''.join(tmp)
                if j in index_to_attribute.keys():
                    for attribute in index_to_attribute[j]:
                        result[attribute] = tmp
            # apply configured <select> nodes to enrich the record
            for node in self.select_node:
                select_str = LxmlHelper.get_attribute_of_element(
                    node, "select")
                regex_str = LxmlHelper.get_attribute_of_element(
                    node, "regex")
                attribute_str = node.text
                # a leading '@' selects a header attribute or a title row
                value = None
                if select_str.startswith('@'):
                    header_result = re.search('(?<=@h)[0-9]*',
                                              select_str)  # header selector @h<n>
                    if header_result is not None:
                        header_index = int(header_result.group())
                        if regex_str is not None and len(
                                regex_str) != 0:
                            search_result = re.search(
                                regex_str,
                                select_attributes[header_index])
                            if search_result is None:
                                # regex configured but no match: skip node
                                continue
                            value = search_result.group()
                        else:
                            value = select_attributes[header_index]
                    title_result = re.search('(?<=@t)[0-9]*',
                                             select_str)  # title selector @t<n>
                    if title_result is not None:
                        title_index = int(title_result.group())
                        # title rows sit <n> rows above the header row
                        target_title_row_index = attribute_row_index - title_index
                        # only accept titles above this table's own data,
                        # i.e. not inside the previous data region
                        if target_title_row_index >= 0 and target_title_row_index > last_data_row_index:
                            title = None
                            for x in table[target_title_row_index]:
                                if x is not None and len(x) != 0:
                                    title = x
                                    break
                            if title is not None:
                                if regex_str is not None and len(
                                        regex_str) != 0:
                                    search_result = re.search(
                                        regex_str, title)
                                    if search_result is None:
                                        continue
                                    value = search_result.group()
                                else:
                                    value = title
                else:
                    # TODO other selection modes
                    pass
                result[attribute_str] = value
            yield result
def is_node_match_regex(node, regex_str):
    """Return True when the node's text content matches the given regex."""
    node_text = LxmlHelper.get_text_of_node(node)
    return re.search(regex_str, node_text) is not None