def __find_attribute_row_coordinate(self, structuring_data):
    """Collect the coordinates of rows that look like attribute rows.

    Each key of ``self.attributes_map`` is looked up in ``structuring_data``
    with ``ArrayHelper.locate_value``.  Every hit is reduced to its
    coordinate with the last component dropped, and hits are tallied per
    reduced coordinate.  A reduced coordinate qualifies once its tally
    reaches ``min(len(self.get_target_attributes()), 3)``.

    The outcome is printed to stdout: either a "nothing found" message or a
    heading followed by the value found at each qualifying coordinate.

    Args:
        structuring_data: Nested data searched by ``ArrayHelper``.

    Returns:
        list: Qualifying coordinates, each converted to a ``list``, in the
        iteration order of the tally dict.
    """
    hits_per_row = {}
    for wanted in self.attributes_map:
        for cell_coordinate in ArrayHelper.locate_value(structuring_data, wanted):
            row_key = tuple(cell_coordinate[:-1])
            hits_per_row[row_key] = hits_per_row.get(row_key, 0) + 1

    # At most three matching cells are required, fewer when there are fewer
    # target attributes overall.
    threshold = min(len(self.get_target_attributes()), 3)
    candidates = [
        list(row_key)
        for row_key, hits in hits_per_row.items()
        if hits >= threshold
    ]

    if not candidates:
        print("找不到符合的属性行")
        return candidates

    print("所有潜在属性行如下:")
    for row_coordinate in candidates:
        print(ArrayHelper.get_value_by_coordinate(structuring_data, row_coordinate))
    return candidates
def get_target_attributes(self):
    """Return every target attribute named in ``self.attributes_map``.

    A map value that is a ``list`` contributes each of its items; any other
    value contributes itself.  The flattened list is passed through
    ``ArrayHelper.remove_duplicate_item`` before being returned.
    """
    flattened = []
    for mapped in self.attributes_map.values():
        flattened += mapped if isinstance(mapped, list) else [mapped]
    return ArrayHelper.remove_duplicate_item(flattened)
def next_record(self):
    """Build the record for the current data row.

    The row is fetched from ``self.data`` at the coordinate reported by
    ``__get_current_data_row_coordinate``.  Each cell whose column index
    appears in the index-to-attribute mapping is stored under the mapped
    target attribute; list cells are first flattened with
    ``ArrayHelper.join_array``.  Target attributes that received no value
    are filled with ``''``.

    A mapping entry may be a ``list`` of target attributes (the same shape
    ``get_target_attributes`` accepts for ``attributes_map`` values).  In
    that case the cell is stored under every listed attribute; previously
    the list itself was used as the dict key, which raises ``TypeError``
    because lists are unhashable.

    Returns:
        dict | None: ``None`` when there is no current data row coordinate,
        otherwise a dict keyed by target attribute.
    """
    data_row_coordinate = self.__get_current_data_row_coordinate()
    if data_row_coordinate is None:
        return None

    data_row = ArrayHelper.get_value_by_coordinate(self.data, data_row_coordinate)
    index_to_attribute = self.__get_index_to_attribute()

    record = {}
    for column, cell in enumerate(data_row):
        if isinstance(cell, list):
            cell = ArrayHelper.join_array(cell)
        if column not in index_to_attribute:
            continue
        targets = index_to_attribute[column]
        if isinstance(targets, list):
            # One source column feeding several target attributes.
            for attribute in targets:
                record[attribute] = cell
        else:
            record[targets] = cell

    # Guarantee every target attribute is present, even when its column
    # was absent from this table.
    for attribute in self.get_target_attributes():
        record.setdefault(attribute, '')
    return record
def __is_data_row_valid(self, data_row_index):
    """Tell whether row ``data_row_index`` of the current table is a data row.

    The row is rejected when the index is past the end of the current
    table, when its length differs from the current attribute row's length,
    or when its coordinate is listed in ``self.attributes_row_coordinates``.

    Args:
        data_row_index: Index of the candidate row within the current table.

    Returns:
        bool: ``True`` only if none of the rejection conditions apply.
    """
    parent_coordinate = self.__get_current_table_coordinate()
    parent_table = ArrayHelper.get_value_by_coordinate(self.data, parent_coordinate)
    if data_row_index >= len(parent_table):
        return False

    candidate_coordinate = list(parent_coordinate) + [data_row_index]
    candidate_row = ArrayHelper.get_value_by_coordinate(self.data, candidate_coordinate)
    header_row = ArrayHelper.get_value_by_coordinate(
        self.data, self.__get_current_attribute_row_coordinate())

    same_width = len(header_row) == len(candidate_row)
    return same_width and candidate_coordinate not in self.attributes_row_coordinates
def get_description_of_current_table(self):
    """Return the joined text of the rows that precede the current attribute row.

    Rows of the current table are taken from a start index up to (but not
    including) the last component of the current attribute row coordinate.
    The start index is 0, unless ``self.last_data_row_coordinate`` is set
    and lies in the current table, in which case it is the row after that
    last data row.  Each row is flattened with ``ArrayHelper.join_array``
    and the pieces are concatenated in order.

    Returns:
        str: The concatenated text; ``''`` when the range is empty.
    """
    table = ArrayHelper.get_value_by_coordinate(
        self.data, self.__get_current_table_coordinate())
    header_coordinate = self.attributes_row_coordinates[self.current_attribute_row_index]
    stop = header_coordinate[-1]

    first = 0
    previous_row = self.last_data_row_coordinate
    if previous_row is not None:
        # Only skip past the previous data row when it belongs to this table.
        if previous_row[:-1] == self.__get_current_table_coordinate():
            first = previous_row[-1] + 1

    return ''.join(
        ArrayHelper.join_array(table[row_index])
        for row_index in range(first, stop)
    )
def __get_index_to_attribute(self):
    """Return the column-index -> target-attribute mapping for the current attribute row.

    The mapping is cached on ``self.index_to_attribute`` together with the
    attribute row index it was built for
    (``self.index_to_attribute_for_row_index``) and is rebuilt only when
    ``self.current_attribute_row_index`` differs from that index.

    Rebuilding also resets and repopulates ``self.original_attributes``.

    Returns:
        dict: Column index mapped to the matching ``self.attributes_map``
        value (stored as-is).
    """
    # Reuse the cached mapping when it was built for the current attribute row.
    if self.index_to_attribute is not None \
            and self.current_attribute_row_index == self.index_to_attribute_for_row_index:
        return self.index_to_attribute
    attribute_row = ArrayHelper.get_value_by_coordinate(
        self.data, self.__get_current_attribute_row_coordinate())
    index_to_attribute = {}
    self.original_attributes = []
    for i in range(0, len(attribute_row)):
        original_attribute = attribute_row[i]
        target_attribute = None
        # A header cell given as a list of fragments is concatenated into one string.
        if isinstance(original_attribute, list):
            original_attribute = ''.join(original_attribute)
        # No break: when several keys match, the last matching key wins.
        for key in self.attributes_map.keys():
            if StringHelper.match_string(original_attribute, key):
                target_attribute = self.attributes_map[key]
        if target_attribute is not None:
            index_to_attribute[i] = target_attribute
            # NOTE(review): the original header text is recorded only for
            # matched columns — confirm this is the intended scope.
            self.original_attributes.append(original_attribute)
    # Remember which attribute row this mapping belongs to.
    self.index_to_attribute = index_to_attribute
    self.index_to_attribute_for_row_index = self.current_attribute_row_index
    return self.index_to_attribute
def mapping(self, data):
    """Yield one ``OrderedDict`` record per data row found under each attribute row.

    Steps, as implemented below:

    1. Locate every key of ``self.col_mapping_dic`` in ``data`` and tally
       hits per coordinate-with-last-component-dropped; coordinates whose
       tally reaches ``min(number of target attributes, 3)`` are treated as
       attribute rows and printed.
    2. For each attribute row, map column indexes to target attributes via
       ``DataMapper.get_target_attribute``.
    3. Walk the rows after the attribute row until a row has a different
       length or is itself an attribute row; for each row build a record
       from the mapped columns and from ``self.select_node``.

    Args:
        data: Nested data searched and indexed through ``ArrayHelper``.

    Yields:
        OrderedDict: Target attribute (and select-node text) -> value.
    """
    # Compute the positions of all attribute rows.
    attributes_row_index_counter = {}
    for key in self.col_mapping_dic:
        coordinates = ArrayHelper.locate_value(data, key)
        for coordinate in coordinates:
            # Drop the last component so hits in the same row share a key.
            x = tuple(coordinate[0:len(coordinate) - 1])
            if x in attributes_row_index_counter.keys():
                counter = attributes_row_index_counter[x]
                attributes_row_index_counter[x] = counter + 1
            else:
                attributes_row_index_counter[x] = 1
    attributes_row_index = []
    number_of_target_attributes = len(self.get_target_attributes())
    for key in attributes_row_index_counter:
        if attributes_row_index_counter[key] >= min(
                number_of_target_attributes, 3):
            attributes_row_index.append(key)
    print("所有可能的属性行如下:")
    for index in attributes_row_index:
        print(ArrayHelper.get_value_by_coordinate(data, index))
    # Mapping.
    # Last row index of the previous data region.  It is only updated when
    # the row loop below exits through ``break``.
    last_data_row_index = -1
    for coordinate in attributes_row_index:
        attribute_row_index = coordinate[len(coordinate) - 1]
        start_row_index = attribute_row_index + 1
        attribute_row = ArrayHelper.get_value_by_coordinate(
            data, coordinate)
        table_coordinate = coordinate[0:len(coordinate) - 1]
        table = ArrayHelper.get_value_by_coordinate(
            data, table_coordinate)
        if table is None:
            continue
        # Compute the correspondence between column index and attribute.
        index_to_attribute = {}
        select_attributes = []
        for i in range(0, len(attribute_row)):
            target_attribute = DataMapper.get_target_attribute(
                self.col_mapping_dic, attribute_row[i])
            if target_attribute is not None:
                index_to_attribute[i] = target_attribute
                # NOTE(review): only matched headers are collected, so the
                # "@h<n>" selector below indexes matched headers rather
                # than raw column positions — confirm this is intended.
                select_attributes.append(attribute_row[i])
        for i in range(start_row_index, len(table)):
            row = table[i]
            current_coordinate = list(coordinate[0:len(coordinate) - 1])
            current_coordinate.append(i)
            # The data region ends at a row of different width or at the
            # next attribute row.
            if len(row) != len(attribute_row) or tuple(
                    current_coordinate) in attributes_row_index:
                last_data_row_index = i - 1
                break
            result = OrderedDict()
            for j in range(0, len(row)):
                tmp = row[j]
                if isinstance(tmp, list):
                    tmp = ''.join(tmp)
                if j in index_to_attribute.keys():
                    # One source column may feed several target attributes.
                    for attribute in index_to_attribute[j]:
                        result[attribute] = tmp
            # Handle the select nodes.
            for node in self.select_node:
                select_str = LxmlHelper.get_attribute_of_element(
                    node, "select")
                regex_str = LxmlHelper.get_attribute_of_element(
                    node, "regex")
                attribute_str = node.text
                # A leading '@' means a header attribute or a title is selected.
                value = None
                if select_str.startswith('@'):
                    # Match header: the digits following "@h".
                    # NOTE(review): "[0-9]*" can match an empty string
                    # (e.g. "@h" with no digits), in which case int('')
                    # raises ValueError.
                    header_result = re.search('(?<=@h)[0-9]*', select_str)
                    if header_result is not None:
                        header_index = int(header_result.group())
                        if regex_str is not None and len(
                                regex_str) != 0:
                            search_result = re.search(
                                regex_str, select_attributes[header_index])
                            # No regex match: skip this node entirely, so
                            # its key is not added to the record.
                            if search_result is None:
                                continue
                            value = search_result.group()
                        else:
                            value = select_attributes[header_index]
                    # Match title: the digits following "@t" give how many
                    # rows above the attribute row the title sits.
                    title_result = re.search('(?<=@t)[0-9]*', select_str)
                    if title_result is not None:
                        title_index = int(title_result.group())
                        target_title_row_index = attribute_row_index - title_index
                        # The title row must exist and lie after the
                        # previously recorded data region.
                        if target_title_row_index >= 0 and target_title_row_index > last_data_row_index:
                            title = None
                            # Use the first non-empty cell of the title row.
                            for x in table[target_title_row_index]:
                                if x is not None and len(x) != 0:
                                    title = x
                                    break
                            if title is not None:
                                if regex_str is not None and len(
                                        regex_str) != 0:
                                    search_result = re.search(
                                        regex_str, title)
                                    if search_result is None:
                                        continue
                                    value = search_result.group()
                                else:
                                    value = title
                else:
                    # TODO other selection methods
                    pass
                result[attribute_str] = value
            yield result
def get_headers(self):
    """Return the output headers: target attributes followed by select-node texts.

    The target attributes come from ``self.get_target_attributes()``; the
    ``text`` of every node in ``self.select_node`` is appended after them.
    The combined list is passed through
    ``ArrayHelper.remove_duplicate_item`` before being returned.
    """
    headers = list(self.get_target_attributes())
    headers.extend(node.text for node in self.select_node)
    return ArrayHelper.remove_duplicate_item(headers)
def get_target_attributes(self):
    """Return every target attribute named in ``self.col_mapping_dic``.

    Each map value is iterated and its items are collected in order; the
    flattened list is passed through ``ArrayHelper.remove_duplicate_item``
    before being returned.
    """
    flattened = [
        attribute
        for targets in self.col_mapping_dic.values()
        for attribute in targets
    ]
    return ArrayHelper.remove_duplicate_item(flattened)