def __find_attribute_row_coordinate(self, structuring_data):
        """Find the coordinates of rows that look like attribute (header) rows.

        Every key of ``self.attributes_map`` is located in ``structuring_data``
        with ``ArrayHelper.locate_value``.  Each hit is credited to its parent
        coordinate (the hit coordinate without its last component).  A parent
        qualifies once it has at least ``min(number of target attributes, 3)``
        hits.

        When nothing qualifies, a diagnostic message is printed, followed by
        the content of every candidate that had at least one hit.

        :param structuring_data: nested data handed to ``ArrayHelper``.
        :return: list of qualifying parent coordinates (each a ``list``), in
            first-seen order; an empty list when none qualifies.
        """
        hits_per_row = {}
        for key in self.attributes_map:
            for coordinate in ArrayHelper.locate_value(structuring_data, key):
                row = tuple(coordinate[:-1])
                hits_per_row[row] = hits_per_row.get(row, 0) + 1

        # The threshold does not depend on the row, so compute it once.
        required_hits = min(len(self.get_target_attributes()), 3)
        attributes_row_coordinate = [
            list(row) for row, hits in hits_per_row.items()
            if hits >= required_hits
        ]

        if not attributes_row_coordinate:
            print("找不到符合的属性行")
            # The original guarded this listing with a check on the (empty)
            # result list, so it could never run.  List the candidates that
            # fell below the threshold instead, which is what the message
            # announces.
            if hits_per_row:
                print("所有潜在属性行如下:")
                for row in hits_per_row:
                    print(
                        ArrayHelper.get_value_by_coordinate(
                            structuring_data, list(row)))
        return attributes_row_coordinate
 def get_target_attributes(self):
     """Return the distinct target attributes named by ``self.attributes_map``.

     A mapped value may be a single attribute or a list of attributes; list
     values are flattened.  Duplicates are removed by
     ``ArrayHelper.remove_duplicate_item``.
     """
     collected = []
     for mapped in self.attributes_map.values():
         collected += mapped if isinstance(mapped, list) else [mapped]
     return ArrayHelper.remove_duplicate_item(collected)
 def next_record(self):
     """Build the record for the current data row.

     Each mapped column of the row is stored under its target attribute.
     Cells that are lists are flattened with ``ArrayHelper.join_array``.
     Target attributes that no column provided are filled with ``''``.

     :return: ``dict`` of target attribute -> cell value, or ``None`` when
         there is no current data row.
     """
     data_row_coordinate = self.__get_current_data_row_coordinate()
     if data_row_coordinate is None:
         return None
     data_row = ArrayHelper.get_value_by_coordinate(self.data,
                                                    data_row_coordinate)
     index_to_attribute = self.__get_index_to_attribute()
     result = {}
     for column, data_cell in enumerate(data_row):
         if column not in index_to_attribute:
             # Unmapped columns contribute nothing, so skip them early.
             continue
         if isinstance(data_cell, list):
             data_cell = ArrayHelper.join_array(data_cell)
         targets = index_to_attribute[column]
         # attributes_map values may be a single attribute or a list of
         # attributes (see get_target_attributes).  A list cannot be used as
         # a dict key, so store the cell under every listed attribute.
         if not isinstance(targets, list):
             targets = [targets]
         for attribute in targets:
             result[attribute] = data_cell
     for attribute in self.get_target_attributes():
         result.setdefault(attribute, '')
     return result
 def __is_data_row_valid(self, data_row_index):
     """Tell whether row ``data_row_index`` of the current table is a data row.

     The row is rejected when the index is past the end of the table, when
     its length differs from the current attribute row's length, or when its
     coordinate is listed in ``self.attributes_row_coordinates``.
     """
     table_coordinate = self.__get_current_table_coordinate()
     table = ArrayHelper.get_value_by_coordinate(self.data, table_coordinate)
     if data_row_index >= len(table):
         return False

     candidate_coordinate = [*table_coordinate, data_row_index]
     candidate_row = ArrayHelper.get_value_by_coordinate(
         self.data, candidate_coordinate)
     header_coordinate = self.__get_current_attribute_row_coordinate()
     header_row = ArrayHelper.get_value_by_coordinate(
         self.data, header_coordinate)

     same_width = len(header_row) == len(candidate_row)
     return (same_width
             and candidate_coordinate not in self.attributes_row_coordinates)
 def get_description_of_current_table(self):
     """Return the joined text of the rows preceding the current attribute row.

     Rows are taken from the current table, from index 0 up to (not
     including) the current attribute row.  If the last data row belongs to
     the same table, the range starts right after that row instead.  Each row
     is rendered with ``ArrayHelper.join_array`` and the pieces are
     concatenated.
     """
     table_coordinate = self.__get_current_table_coordinate()
     table = ArrayHelper.get_value_by_coordinate(self.data, table_coordinate)

     attribute_coordinate = self.attributes_row_coordinates[
         self.current_attribute_row_index]
     stop = attribute_coordinate[-1]

     start = 0
     previous = self.last_data_row_coordinate
     if previous is not None:
         # Only skip past the previous data row when it is in this table.
         if previous[:-1] == self.__get_current_table_coordinate():
             start = previous[-1] + 1

     return ''.join(
         ArrayHelper.join_array(table[i]) for i in range(start, stop))
 def __get_index_to_attribute(self):
     """Return the column-index -> target-attribute map of the current attribute row.

     The map is cached on ``self.index_to_attribute`` and reused while
     ``self.current_attribute_row_index`` equals the index it was built for.
     Rebuilding also resets ``self.original_attributes`` to the header texts
     that matched a key of ``self.attributes_map``.
     """
     cache_is_fresh = (
         self.index_to_attribute is not None
         and self.current_attribute_row_index
         == self.index_to_attribute_for_row_index
     )
     if cache_is_fresh:
         return self.index_to_attribute

     header_cells = ArrayHelper.get_value_by_coordinate(
         self.data, self.__get_current_attribute_row_coordinate())
     column_to_target = {}
     self.original_attributes = []
     for position, header in enumerate(header_cells):
         if isinstance(header, list):
             header = ''.join(header)
         matched = None
         # No early exit: when several keys match, the last one wins.
         for pattern, mapped in self.attributes_map.items():
             if StringHelper.match_string(header, pattern):
                 matched = mapped
         if matched is None:
             continue
         column_to_target[position] = matched
         self.original_attributes.append(header)

     self.index_to_attribute = column_to_target
     self.index_to_attribute_for_row_index = self.current_attribute_row_index
     return self.index_to_attribute
        def mapping(self, data):
            """Yield one ``OrderedDict`` record per data row found in ``data``.

            Steps:

            1. Every key of ``self.col_mapping_dic`` is located in ``data``;
               each hit is credited to its parent coordinate.  Parents with at
               least ``min(number of target attributes, 3)`` hits are treated
               as attribute (header) rows and printed to stdout.
            2. For each attribute row, the following rows of the same table
               are read as data rows until a row has a different length than
               the attribute row or is itself an attribute row.
            3. Each data row becomes a record: mapped columns are stored under
               every target attribute of that column, then each node of
               ``self.select_node`` adds ``record[node.text] = value``.

            This is a generator; nothing runs until it is iterated.

            :param data: nested data handed to ``ArrayHelper``.
            """
            # Locate every attribute (header) row.
            attributes_row_index_counter = {}
            for key in self.col_mapping_dic:
                coordinates = ArrayHelper.locate_value(data, key)
                for coordinate in coordinates:
                    # Parent coordinate of the hit (last component dropped).
                    x = tuple(coordinate[0:len(coordinate) - 1])
                    if x in attributes_row_index_counter.keys():
                        counter = attributes_row_index_counter[x]
                        attributes_row_index_counter[x] = counter + 1
                    else:
                        attributes_row_index_counter[x] = 1
            attributes_row_index = []
            number_of_target_attributes = len(self.get_target_attributes())
            for key in attributes_row_index_counter:
                if attributes_row_index_counter[key] >= min(
                        number_of_target_attributes, 3):
                    attributes_row_index.append(key)
            print("所有可能的属性行如下:")
            for index in attributes_row_index:
                print(ArrayHelper.get_value_by_coordinate(data, index))

            # Map the data rows under each attribute row.
            # Index of the last row of the previous data region.
            # NOTE(review): only updated when the row loop below breaks, and
            # never reset between tables -- confirm this is intended.
            last_data_row_index = -1
            for coordinate in attributes_row_index:
                attribute_row_index = coordinate[len(coordinate) - 1]
                start_row_index = attribute_row_index + 1
                attribute_row = ArrayHelper.get_value_by_coordinate(
                    data, coordinate)
                table_coordinate = coordinate[0:len(coordinate) - 1]
                table = ArrayHelper.get_value_by_coordinate(
                    data, table_coordinate)
                if table is None:
                    continue

                # Work out which column index maps to which target
                # attribute(s).  select_attributes keeps the matched header
                # cells only, so it is indexed by match order, not by column.
                index_to_attribute = {}
                select_attributes = []
                for i in range(0, len(attribute_row)):
                    target_attribute = DataMapper.get_target_attribute(
                        self.col_mapping_dic, attribute_row[i])
                    if target_attribute is not None:
                        index_to_attribute[i] = target_attribute
                        select_attributes.append(attribute_row[i])

                for i in range(start_row_index, len(table)):
                    row = table[i]
                    current_coordinate = list(coordinate[0:len(coordinate) -
                                                         1])
                    current_coordinate.append(i)
                    # The data region ends at a row of a different width or
                    # at the next attribute row.
                    if len(row) != len(attribute_row) or tuple(
                            current_coordinate) in attributes_row_index:
                        last_data_row_index = i - 1
                        break
                    result = OrderedDict()
                    for j in range(0, len(row)):
                        tmp = row[j]
                        if isinstance(tmp, list):
                            tmp = ''.join(tmp)
                        if j in index_to_attribute.keys():
                            # The mapped value is iterated, so one column can
                            # feed several target attributes.
                            for attribute in index_to_attribute[j]:
                                result[attribute] = tmp

                    # Process the select nodes.
                    for node in self.select_node:
                        select_str = LxmlHelper.get_attribute_of_element(
                            node, "select")
                        regex_str = LxmlHelper.get_attribute_of_element(
                            node, "regex")
                        attribute_str = node.text

                        # A leading '@' means the selector targets a header
                        # attribute or a title.
                        value = None
                        if select_str.startswith('@'):
                            # NOTE(review): '[0-9]*' also matches an empty
                            # string, so a selector of just '@h' (or '@t'
                            # below) makes int('') raise ValueError.
                            header_result = re.search('(?<=@h)[0-9]*',
                                                      select_str)
                            # Header match.
                            if header_result is not None:
                                header_index = int(header_result.group())
                                if regex_str is not None and len(
                                        regex_str) != 0:
                                    search_result = re.search(
                                        regex_str,
                                        select_attributes[header_index])
                                    if search_result is None:
                                        # Skips this node entirely: its key
                                        # is not added to the record.
                                        continue
                                    value = search_result.group()
                                else:
                                    value = select_attributes[header_index]
                            title_result = re.search('(?<=@t)[0-9]*',
                                                     select_str)
                            # Title match.
                            if title_result is not None:
                                title_index = int(title_result.group())
                                # The title is looked up title_index rows
                                # above the attribute row.
                                target_title_row_index = attribute_row_index - title_index
                                if target_title_row_index >= 0 and target_title_row_index > last_data_row_index:
                                    title = None
                                    # Use the first non-empty cell of the row.
                                    for x in table[target_title_row_index]:
                                        if x is not None and len(x) != 0:
                                            title = x
                                            break
                                    if title is not None:
                                        if regex_str is not None and len(
                                                regex_str) != 0:
                                            search_result = re.search(
                                                regex_str, title)
                                            if search_result is None:
                                                # Same as above: the node's
                                                # key is left out.
                                                continue
                                            value = search_result.group()
                                        else:
                                            value = title
                        else:
                            # TODO other selection modes
                            pass

                        result[attribute_str] = value
                    yield result
 def get_headers(self):
     """Return the distinct output headers.

     The headers are the target attributes followed by the text of every
     node in ``self.select_node``, de-duplicated by
     ``ArrayHelper.remove_duplicate_item``.
     """
     headers = list(self.get_target_attributes())
     headers += [node.text for node in self.select_node]
     return ArrayHelper.remove_duplicate_item(headers)
 def get_target_attributes(self):
     """Return the distinct target attributes named by ``self.col_mapping_dic``.

     Every mapped value is iterated and its items are collected, then
     duplicates are removed by ``ArrayHelper.remove_duplicate_item``.
     """
     flattened = [
         attribute
         for targets in self.col_mapping_dic.values()
         for attribute in targets
     ]
     return ArrayHelper.remove_duplicate_item(flattened)