Example #1
0
 def _find_complete_block_bounds(self, table, used_cells,
                                 possible_block_start, start_pos, end_pos):
     '''
     Finds both bounds of a block that begins at possible_block_start.

     The starting row is scanned to the right first: the end column is
     the first column holding a used or empty cell, or end_pos[1] when
     no such cell is met. The rows below are then scanned downwards:
     the end row is the first row holding a used or empty cell within
     the columns found above, or end_pos[0] when no such row is met.

     Args:
         table: Rows of cells, indexed as table[row][column].
         used_cells: Per cell flags indexed like table; a truthy entry
             stops the scan.
         possible_block_start: (row, column) the block starts from.
         start_pos: Not read by this method.
         end_pos: (row, column) of the last position to consider.

     Returns:
         A tuple (block_start, block_end) of two element lists;
         block_start is a copy of possible_block_start.
     '''
     block_start = list(possible_block_start)
     block_end = list(possible_block_start)
     title_row = table[block_start[0]]
     title_used = used_cells[block_start[0]]
     # Find which column the titles end on
     for column_index in range(block_start[1], end_pos[1] + 1):
         # Ensure we catch the edge case of the data reaching the edge of
         # the table -- block_end should then equal end_pos
         block_end[1] = max(block_end[1], column_index)
         if (column_index == end_pos[1] or title_used[column_index]
                 or is_empty_cell(title_row[column_index])):
             break
     for row_index in range(block_start[0] + 1, end_pos[0] + 1):
         block_end[0] = row_index
         # Stop if we reach the end of the table space
         if row_index == end_pos[0]:
             break
         table_row = table[row_index]
         # Look at the used flags of the row being examined; the flags of
         # the title row are already known to be clear for these columns
         used_row = used_cells[row_index]
         if any(used_row[column_index]
                or is_empty_cell(table_row[column_index])
                for column_index in range(block_start[1], block_end[1])):
             break
     return block_start, block_end
Example #2
0
 def _find_block_end(self, table, used_cells, block_start, block_end,
                     start_pos, end_pos):
     '''
     Finds the end column of a block from a start location and a
     suggested end location.

     The first row of the block is scanned to the right from
     block_start[1], raising block_end[1] to every column visited. The
     scan stops at end_pos[1], at a used cell, or at an empty cell
     whose column is empty for every row in
     range(block_start[0], block_end[0]).

     Args:
         table: Rows of cells, indexed as table[row][column].
         used_cells: Per cell flags indexed like table.
         block_start: (row, column) of the block start.
         block_end: Mutable [row, column] suggestion; its column entry
             is updated in place.
         start_pos: Not read by this method.
         end_pos: (row, column) of the last position to consider.

     Returns:
         The tuple (block_start, block_end), using the same objects
         that were passed in.
     '''
     table_row = table[block_start[0]]
     used_row = used_cells[block_start[0]]
     # Find which column the titles end on
     for column_index in range(block_start[1], end_pos[1] + 1):
         # Ensure we catch the edge case of the data reaching the edge of
         # the table -- block_end should then equal end_pos
         block_end[1] = max(block_end[1], column_index)
         if column_index == end_pos[1] or used_row[column_index]:
             break
         if not is_empty_cell(table_row[column_index]):
             continue
         # A blank title only ends the block when the rest of its column
         # is blank as well
         table_column = TableTranspose(table)[column_index]
         if all(is_empty_cell(table_column[row_index])
                for row_index in range(block_start[0], block_end[0])):
             break
     return block_start, block_end
Example #3
0
 def _find_block_end(self, table, used_cells, block_start, block_end,
                     start_pos, end_pos):
     '''
     Finds the end column of a block from a start location and a
     suggested end location.

     Walks right along the row block_start[0], beginning at column
     block_start[1], and raises block_end[1] to each column it visits.
     The walk ends at column end_pos[1], at the first used cell, or at
     the first empty cell whose column holds only empty cells for the
     rows in range(block_start[0], block_end[0]).

     Args:
         table: Rows of cells, indexed as table[row][column].
         used_cells: Per cell flags indexed like table.
         block_start: (row, column) of the block start.
         block_end: Mutable [row, column] suggestion; its column entry
             is updated in place.
         start_pos: Not read by this method.
         end_pos: (row, column) of the last position to consider.

     Returns:
         The tuple (block_start, block_end), using the same objects
         that were passed in.
     '''
     first_row = block_start[0]
     table_row = table[first_row]
     used_row = used_cells[first_row]
     # Find which column the titles end on
     for column_index in range(block_start[1], end_pos[1] + 1):
         # Ensure we catch the edge case of the data reaching the edge of
         # the table -- block_end should then equal end_pos
         block_end[1] = max(block_end[1], column_index)
         if column_index == end_pos[1]:
             break
         if used_row[column_index]:
             break
         if is_empty_cell(table_row[column_index]):
             table_column = TableTranspose(table)[column_index]
             column_is_blank = all(
                 is_empty_cell(table_column[row_index])
                 for row_index in range(first_row, block_end[0]))
             # If we have a column of blanks, stop
             if column_is_blank:
                 break
     return block_start, block_end
Example #4
0
 def _find_block_start(self, table, used_cells, possible_block_start,
                       start_pos, end_pos):
     '''
     Finds the start of a block from a suggested start location.
     This location can be at a lower column but not a lower row.

     Columns are examined one at a time, beginning with the suggested
     column. Each column is scanned downwards from the suggested row.
     A blank cell found after content moves the scan on to the next
     column, while every content cell is followed to the left (down to
     start_pos[1]) to lower the starting column of the block.

     Note this also finds the lowest row of block_end.

     Args:
         table: Rows of cells, indexed as table[row][column].
         used_cells: Per cell flags indexed like table.
         possible_block_start: (row, column) suggestion for the start.
         start_pos: (row, column); only the column is read, as the
             left limit of the search.
         end_pos: (row, column); only the row is read, as the last row
             of the search.

     Returns:
         A tuple (block_start, block_end) of two element lists.
     '''
     # Number of rows scanned in a column during which the column is
     # still guessed to be a repeating title
     blank_repeat_threshold = 3
     current_col = possible_block_start[1]
     block_start = list(possible_block_start)
     block_end = list(possible_block_start)
     repeat = True
     checked_all = False
     # Repeat until we've met satisfactory conditions for
     # catching all edge cases or we've checked all valid
     # block locations
     while not checked_all and repeat:
         block_end[0] = max(block_end[0], possible_block_start[0])
         block_end[1] = max(block_end[1], current_col)
         table_column = TableTranspose(table)[current_col]
         used_column = TableTranspose(used_cells)[current_col]
         # We need to find a non empty cell before we can stop
         blank_start = is_empty_cell(table_column[possible_block_start[0]])
         # Unless we have assume_complete_blocks set to True
         if blank_start and self.assume_complete_blocks:
             # Found a blank? We're done
             repeat = False
             break
         blank_exited = not blank_start
         #TODO refactor code below into new function for easier reading
         # Analyze the beginning columns
         for row_index in range(possible_block_start[0], end_pos[0] + 1):
             # Ensure we catch the edge case of the data reaching the edge of
             # the table -- block_end should then equal end_pos
             if blank_exited:
                 block_end[0] = max(block_end[0], row_index)
             if row_index == end_pos[0] or used_column[row_index]:
                 # We've gone through the whole range
                 checked_all = True
                 break
             elif not blank_exited:
                 blank_exited = not is_empty_cell(table_column[row_index])
             elif is_empty_cell(table_column[row_index]):
                 # Blank after content: carry on with the next column. The
                 # repeat flag keeps the value set by the previous row.
                 current_col += 1
                 break
             else:
                 # Go find the left most column that's still valid
                 table_row = table[row_index]
                 used_row = used_cells[row_index]
                 for column_index in range(current_col, start_pos[1] - 1, -1):
                     if (is_empty_cell(table_row[column_index])
                             or used_row[column_index]):
                         break
                     block_start[1] = min(block_start[1], column_index)
             # Check if we've seen few enough cells to guess that we have a repeating title
             rows_seen = 1 + row_index - possible_block_start[0]
             repeat = blank_start or rows_seen <= blank_repeat_threshold

     return block_start, block_end
Example #5
0
 def _find_complete_block_bounds(self, table, used_cells, possible_block_start,
                                 start_pos, end_pos):
     '''
     Finds both bounds of a block that begins at possible_block_start.

     The end column is found by walking right along the starting row
     until a used cell, an empty cell or the column end_pos[1] is hit.
     The end row is found by walking down from the row below the start
     until a row has a used or empty cell inside those columns, or the
     row end_pos[0] is hit.

     Args:
         table: Rows of cells, indexed as table[row][column].
         used_cells: Per cell flags indexed like table; a truthy entry
             stops the scan.
         possible_block_start: (row, column) the block starts from.
         start_pos: Not read by this method.
         end_pos: (row, column) of the last position to consider.

     Returns:
         A tuple (block_start, block_end) of two element lists;
         block_start is a copy of possible_block_start.
     '''
     block_start = list(possible_block_start)
     block_end = list(possible_block_start)
     start_row, start_col = block_start
     title_cells = table[start_row]
     title_used = used_cells[start_row]
     # Find which column the titles end on
     for column_index in range(start_col, end_pos[1]+1):
         # Ensure we catch the edge case of the data reaching the edge of
         # the table -- block_end should then equal end_pos
         block_end[1] = max(block_end[1], column_index)
         if (column_index == end_pos[1] or title_used[column_index] or
                 is_empty_cell(title_cells[column_index])):
             break
     for row_index in range(start_row+1, end_pos[0]+1):
         block_end[0] = row_index
         # Stop if we reach the end of the table space
         if row_index == end_pos[0]:
             break
         table_row = table[row_index]
         # Check the used flags of the current row rather than those of
         # the title row, which are already clear for these columns
         used_row = used_cells[row_index]
         row_is_broken = any(
             used_row[column_index] or
             is_empty_cell(table_row[column_index])
             for column_index in range(start_col, block_end[1]))
         if row_is_broken:
             break
     return block_start, block_end
Example #6
0
 def _find_valid_block(self, table, worksheet, flags, units,
                       used_cells, start_pos, end_pos):
     '''
     Looks for the next cell from which a valid block can be built and
     returns the block object constructed for it.

     Rows are visited top to bottom and cells left to right, limited to
     the window spanned by start_pos and end_pos (both inclusive). Used
     and empty cells are passed over. Falls off the end without an
     explicit return value when no candidate produces a block.
     '''
     # Clamp the row window to the rows the table actually has
     first_row = max(start_pos[0], 0)
     last_row = min(end_pos[0], len(table) - 1)
     for row_index in range(first_row, last_row + 1):
         row_cells = table[row_index]
         row_used = used_cells[row_index]
         for column_index, cell in enumerate(row_cells):
             outside_window = (column_index < start_pos[1] or
                               column_index > end_pos[1])
             if outside_window or row_used[column_index]:
                 continue
             if is_empty_cell(cell):
                 continue
             block_start, block_end = self._find_block_bounds(
                 table, used_cells, (row_index, column_index),
                 start_pos, end_pos)
             # Only bounds that grow in both directions are candidates
             if not (block_end[0] > block_start[0] and
                     block_end[1] > block_start[1]):
                 continue
             try:
                 return TableBlock(
                     table, used_cells, block_start, block_end, worksheet,
                     flags, units, self.assume_complete_blocks)
             except InvalidBlockError:
                 pass
             # Prevent infinite loops if something goes wrong
             used_cells[row_index][column_index] = True
Example #7
0
 def _find_valid_block(self, table, worksheet, flags, units, used_cells,
                       start_pos, end_pos):
     '''
     Walks the search window for a cell that can anchor a valid block
     and returns the block object built from it.

     Only rows between start_pos[0] and end_pos[0] and columns between
     start_pos[1] and end_pos[1] (all inclusive) are examined, and
     cells which are used or empty are ignored. When no anchor yields
     a block the method ends without an explicit return value.
     '''
     row_lo = max(0, start_pos[0])
     row_hi = min(len(table) - 1, end_pos[0])
     for row_index in range(row_lo, row_hi + 1):
         cells = table[row_index]
         used_flags = used_cells[row_index]
         for column_index, cell in enumerate(cells):
             if column_index < start_pos[1]:
                 continue
             if column_index > end_pos[1]:
                 continue
             if used_flags[column_index]:
                 continue
             # Blocks can only be anchored on non empty cells
             if is_empty_cell(cell):
                 continue
             anchor = (row_index, column_index)
             block_start, block_end = self._find_block_bounds(
                 table, used_cells, anchor, start_pos, end_pos)
             grows_down = block_end[0] > block_start[0]
             if grows_down and block_end[1] > block_start[1]:
                 try:
                     return TableBlock(table, used_cells, block_start,
                                       block_end, worksheet, flags, units,
                                       self.assume_complete_blocks)
                 except InvalidBlockError:
                     pass
                 # Prevent infinite loops if something goes wrong
                 used_cells[row_index][column_index] = True
Example #8
0
    def _check_interpret_cell(self, cell, prior_cell, row_index, column_index):
        '''
        Helper function which normalizes a single cell and flags every change it makes.

        Blank cells are replaced by prior_cell, non blank cells which are not text are converted
        with str(), and text cells are passed through untouched.

        Returns:
            A tuple of the form '(cell, changed)' where 'changed' is True when the cell was
            replaced by prior_cell or by its string form.
        '''
        position = (row_index, column_index)

        # If we find a blank cell, propagate the prior title
        if is_empty_cell(cell):
            self.flag_change(self.flags, 'interpreted', position,
                             self.worksheet, self.FLAGS['copied-title'])
            return prior_cell, True

        # Text needs no interpretation
        if is_text_cell(cell):
            return cell, False

        self.flag_change(self.flags, 'interpreted', position,
                         self.worksheet, self.FLAGS['converted-to-string'])
        return str(cell), True
Example #9
0
    def _check_interpret_cell(self, cell, prior_cell, row_index, column_index):
        '''
        Helper function which inspects the type of a cell and, where needed, swaps it for a
        string, flagging the change.

        A blank cell takes the value of prior_cell; a non blank cell which is not text is turned
        into a string with str(); a text cell is returned as is.

        Returns:
            A tuple of the form '(cell, changed)' where 'changed' is False only for text cells.
        '''
        blank = is_empty_cell(cell)
        if not blank and is_text_cell(cell):
            return cell, False

        # Blank cells inherit the prior title, everything else is stringified
        flag_name = 'copied-title' if blank else 'converted-to-string'
        self.flag_change(self.flags, 'interpreted',
                         (row_index, column_index), self.worksheet,
                         self.FLAGS[flag_name])
        if blank:
            return prior_cell, True
        return str(cell), True
Example #10
0
    def _check_years(self, cell, prior_year):
        '''
        Helper method holding the rules that decide whether a cell is a year indicator.

        A blank cell counts only when prior_year is truthy. Any other cell has to be a numeric
        cell strictly between 1900 and 2100.
        '''
        # Empty cells could represent the prior cell's title,
        # but an empty cell before we find a year is not a title
        if is_empty_cell(cell):
            return bool(prior_year)

        # Anything outside this open interval shouldn't auto
        # categorize to strings
        earliest, latest = 1900, 2100
        return is_num_cell(cell) and earliest < cell < latest
Example #11
0
    def _check_years(self, cell, prior_year):
        '''
        Helper method which tells whether a cell should be treated as a year indicator. Blank
        cells defer to prior_year; other cells must be numeric and lie strictly between 1900 and
        2100.
        '''
        if not is_empty_cell(cell):
            # Numbers outside these bounds shouldn't auto
            # categorize to strings
            lower_bound = 1900
            upper_bound = 2100
            return is_num_cell(cell) and lower_bound < cell < upper_bound

        # Empty cells could represent the prior cell's title,
        # but an empty cell before we find a year is not a title
        return bool(prior_year)
Example #12
0
    def _repair_column(self):
        '''
        Column counterpart of _repair_row.

        Visits the columns from self.start[1] up to (not including) self.end[1] and looks at the
        cell each column has in row self.start[0]. Columns with a blank first cell are handed to
        _stringify_column until a column is met which is neither blank nor year titled. Columns
        whose first cell is a string matching allregex.year_regex are always handed to
        _check_stringify_year_column.
        '''
        in_title_columns = True
        for column_index in range(self.start[1], self.end[1]):
            # Transposed on every pass so changes made by the calls below are seen
            first_cell = TableTranspose(self.table)[column_index][self.start[0]]

            # Only iterate through columns starting with a blank cell
            if in_title_columns and is_empty_cell(first_cell):
                self._stringify_column(column_index)
                continue

            # Check for year titles in column or row
            has_year_title = (isinstance(first_cell, basestring) and
                              re.search(allregex.year_regex, first_cell))
            if has_year_title:
                self._check_stringify_year_column(column_index)
            else:
                in_title_columns = False
Example #13
0
    def _repair_column(self):
        '''
        Same as _repair_row but for columns: blank leading columns are stringified and columns
        with a year title are checked, based on the cell each column holds in row self.start[0].
        '''
        first_row = self.start[0]
        # Cleared by the first column which is neither blank nor year titled
        titles_possible = True
        for column_index in range(self.start[1], self.end[1]):
            column_cells = TableTranspose(self.table)[column_index]
            top_cell = column_cells[first_row]

            if not (titles_possible and is_empty_cell(top_cell)):
                # Check for year titles in column or row
                if (isinstance(top_cell, basestring) and
                        re.search(allregex.year_regex, top_cell)):
                    self._check_stringify_year_column(column_index)
                else:
                    titles_possible = False
            else:
                # Only iterate through columns starting with a blank cell
                self._stringify_column(column_index)
Example #14
0
    def _repair_row(self):
        '''
        Searches for missing titles that can be inferred from the surrounding data and automatically
        repairs those titles.

        Visits the rows from self.start[0] up to (not including) self.end[0] and looks at the cell
        each row has in column self.start[1]. Rows with a blank leading cell are handed to
        _stringify_row until a row is met which is neither blank nor year titled. Rows whose
        leading cell is a string matching allregex.year_regex are always handed to
        _check_stringify_year_row.
        '''
        in_title_rows = True
        for row_index in range(self.start[0], self.end[0]):
            leading_cell = self.table[row_index][self.start[1]]

            # Look for empty cells leading titles
            if in_title_rows and is_empty_cell(leading_cell):
                self._stringify_row(row_index)
                continue

            # Check for year titles in column or row
            has_year_title = (isinstance(leading_cell, basestring) and
                              re.search(allregex.year_regex, leading_cell))
            if has_year_title:
                self._check_stringify_year_row(row_index)
            else:
                in_title_rows = False
Example #15
0
    def _repair_row(self):
        '''
        Searches for missing titles that can be inferred from the surrounding data and automatically
        repairs those titles. Blank leading rows are stringified and rows with a year title are
        checked, based on the cell each row holds in column self.start[1].
        '''
        first_column = self.start[1]
        # Cleared by the first row which is neither blank nor year titled
        titles_possible = True
        for row_index in range(self.start[0], self.end[0]):
            left_cell = self.table[row_index][first_column]

            if not (titles_possible and is_empty_cell(left_cell)):
                # Check for year titles in column or row
                if (isinstance(left_cell, basestring) and
                        re.search(allregex.year_regex, left_cell)):
                    self._check_stringify_year_row(row_index)
                else:
                    titles_possible = False
            else:
                # Look for empty cells leading titles
                self._stringify_row(row_index)
Example #16
0
    def _find_block_start(self, table, used_cells, possible_block_start, start_pos, end_pos):
        '''
        Finds the start of a block from a suggested start location. This location can be at a lower
        column but not a lower row. The function traverses columns until it finds a stopping
        condition or a repeat condition that restarts on the next column.

        Note this also finds the lowest row of block_end.

        Args:
            table: Rows of cells, indexed as table[row][column].
            used_cells: Per cell flags indexed like table.
            possible_block_start: (row, column) suggestion for the block start.
            start_pos: (row, column); only the column is read, as the left limit when walking
                back along a row.
            end_pos: (row, column); only the row is read, as the last row of each column scan.

        Returns:
            A tuple '(block_start, block_end)' of two element lists, both initialised as copies of
            possible_block_start.
        '''
        current_col = possible_block_start[1]
        block_start = list(possible_block_start)
        block_end = list(possible_block_start)
        repeat = True
        checked_all = False

        # Repeat until we've met satisfactory conditions for catching all edge cases or we've
        # checked all valid block locations
        while not checked_all and repeat:
            # Each pass of this loop examines the single column current_col
            block_end[0] = max(block_end[0], possible_block_start[0])
            block_end[1] = max(block_end[1], current_col)
            single_titled_block = True
            table_column = TableTranspose(table)[current_col]
            used_column = TableTranspose(used_cells)[current_col]
            # We need to find a non empty cell before we can stop
            blank_start = is_empty_cell(table_column[possible_block_start[0]])
            blank_exited = not blank_start
            # Unless we have assume_complete_blocks set to True
            if blank_start and self.assume_complete_blocks:
                # Found a blank? We're done
                repeat = False
                break

            #TODO refactor code below into new function for easier reading
            # Analyze the beginning columns
            for row_index in xrange(possible_block_start[0], end_pos[0] + 1):
                # Ensure we catch the edge case of the data reaching the edge of the table --
                # block_end should then equal end_pos
                if blank_exited:
                    block_end[0] = max(block_end[0], row_index)
                if row_index == end_pos[0] or used_column[row_index]:
                    # We've gone through the whole range
                    checked_all = True
                    repeat = False
                    break
                if not blank_exited:
                    blank_exited = not is_empty_cell(table_column[row_index])
                # single_titled_block is cleared at most once per column, on the first row for
                # which _single_length_title returns a falsy value
                # NOTE(review): the helper is not visible here -- presumably it tests whether the
                # row holds a title in this column only; confirm against its definition
                if single_titled_block and not self._single_length_title(table, row_index, current_col):
                    single_titled_block = False
                    # If we saw single length titles for several more than threshold rows, then we
                    # have a unique block before an actual content block
                    if self._above_blank_repeat_threshold(possible_block_start[0], row_index):
                        repeat = False
                        break
                # A blank cell moves the scan to the next column, but only when this row has a
                # cell beyond current_col; the break skips the update of repeat below
                if is_empty_cell(table_column[row_index]) and len(table[row_index]) > current_col + 1:
                    current_col += 1
                    break

                # Go find the left most column that's still valid
                table_row = table[row_index]
                used_row = used_cells[row_index]
                # Walk left from current_col down to start_pos[1], stopping at the first empty or
                # used cell
                for column_index in range(current_col, start_pos[1] - 1, -1):
                    if is_empty_cell(table_row[column_index]) or used_row[column_index]:
                        break
                    else:
                        block_start[1] = min(block_start[1], column_index)
                # Check if we've seen few enough cells to guess that we have a repeating title
                repeat = blank_start or self._below_blank_repeat_threshold(possible_block_start[0], row_index)

        return block_start, block_end
Example #17
0
    def _find_block_start(self, table, used_cells, possible_block_start,
                          start_pos, end_pos):
        '''
        Finds the start of a block from a suggested start location. This location can be at a lower
        column but not a lower row. The function traverses columns until it finds a stopping
        condition or a repeat condition that restarts on the next column.

        Note this also finds the lowest row of block_end.

        Args:
            table: Rows of cells, indexed as table[row][column].
            used_cells: Per cell flags indexed like table.
            possible_block_start: (row, column) suggestion for the block start.
            start_pos: (row, column); only the column is read, as the left limit when walking
                back along a row.
            end_pos: (row, column); only the row is read, as the last row of each column scan.

        Returns:
            A tuple '(block_start, block_end)' of two element lists, both initialised as copies of
            possible_block_start.
        '''
        current_col = possible_block_start[1]
        block_start = list(possible_block_start)
        block_end = list(possible_block_start)
        repeat = True
        checked_all = False

        # Repeat until we've met satisfactory conditions for catching all edge cases or we've
        # checked all valid block locations
        while not checked_all and repeat:
            # Each pass of this loop examines the single column current_col
            block_end[0] = max(block_end[0], possible_block_start[0])
            block_end[1] = max(block_end[1], current_col)
            single_titled_block = True
            table_column = TableTranspose(table)[current_col]
            used_column = TableTranspose(used_cells)[current_col]
            # We need to find a non empty cell before we can stop
            blank_start = is_empty_cell(table_column[possible_block_start[0]])
            blank_exited = not blank_start
            # Unless we have assume_complete_blocks set to True
            if blank_start and self.assume_complete_blocks:
                # Found a blank? We're done
                repeat = False
                break

            #TODO refactor code below into new function for easier reading
            # Analyze the beginning columns
            for row_index in xrange(possible_block_start[0], end_pos[0] + 1):
                # Ensure we catch the edge case of the data reaching the edge of the table --
                # block_end should then equal end_pos
                if blank_exited:
                    block_end[0] = max(block_end[0], row_index)
                if row_index == end_pos[0] or used_column[row_index]:
                    # We've gone through the whole range
                    checked_all = True
                    repeat = False
                    break
                if not blank_exited:
                    blank_exited = not is_empty_cell(table_column[row_index])
                # single_titled_block is cleared at most once per column, on the first row for
                # which _single_length_title returns a falsy value
                # NOTE(review): the helper is not visible here -- presumably it tests whether the
                # row holds a title in this column only; confirm against its definition
                if single_titled_block and not self._single_length_title(
                        table, row_index, current_col):
                    single_titled_block = False
                    # If we saw single length titles for several more than threshold rows, then we
                    # have a unique block before an actual content block
                    if self._above_blank_repeat_threshold(
                            possible_block_start[0], row_index):
                        repeat = False
                        break
                # A blank cell moves the scan to the next column, but only when this row has a
                # cell beyond current_col; the break skips the update of repeat below
                if is_empty_cell(table_column[row_index]) and len(
                        table[row_index]) > current_col + 1:
                    current_col += 1
                    break

                # Go find the left most column that's still valid
                table_row = table[row_index]
                used_row = used_cells[row_index]
                # Walk left from current_col down to start_pos[1], stopping at the first empty or
                # used cell
                for column_index in range(current_col, start_pos[1] - 1, -1):
                    if is_empty_cell(
                            table_row[column_index]) or used_row[column_index]:
                        break
                    else:
                        block_start[1] = min(block_start[1], column_index)
                # Check if we've seen few enough cells to guess that we have a repeating title
                repeat = blank_start or self._below_blank_repeat_threshold(
                    possible_block_start[0], row_index)

        return block_start, block_end
Example #18
0
    def _find_block_start(self, table, used_cells, possible_block_start,
                          start_pos, end_pos):
        '''
        Finds the start of a block from a suggested start location.
        This location can be at a lower column but not a lower row.

        Columns are examined one at a time, beginning with the suggested
        column. Each column is scanned downwards from the suggested row.
        A blank cell found after content moves the scan on to the next
        column, while every content cell is followed to the left (down to
        start_pos[1]) to lower the starting column of the block.

        Note this also finds the lowest row of block_end.

        Args:
            table: Rows of cells, indexed as table[row][column].
            used_cells: Per cell flags indexed like table.
            possible_block_start: (row, column) suggestion for the start.
            start_pos: (row, column); only the column is read, as the
                left limit of the search.
            end_pos: (row, column); only the row is read, as the last row
                of the search.

        Returns:
            A tuple (block_start, block_end) of two element lists.
        '''
        # Number of rows scanned in a column during which the column is
        # still guessed to be a repeating title
        blank_repeat_threshold = 3
        start_row = possible_block_start[0]
        current_col = possible_block_start[1]
        block_start = list(possible_block_start)
        block_end = list(possible_block_start)
        repeat = True
        checked_all = False
        # Repeat until we've met satisfactory conditions for
        # catching all edge cases or we've checked all valid
        # block locations
        while not checked_all and repeat:
            block_end[0] = max(block_end[0], start_row)
            block_end[1] = max(block_end[1], current_col)
            table_column = TableTranspose(table)[current_col]
            used_column = TableTranspose(used_cells)[current_col]
            # We need to find a non empty cell before we can stop
            blank_start = is_empty_cell(table_column[start_row])
            # Unless we have assume_complete_blocks set to True
            if blank_start and self.assume_complete_blocks:
                # Found a blank? We're done
                repeat = False
                break
            blank_exited = not blank_start
            #TODO refactor code below into new function for easier reading
            # Analyze the beginning columns
            for row_index in range(start_row, end_pos[0] + 1):
                # Ensure we catch the edge case of the data reaching the edge of
                # the table -- block_end should then equal end_pos
                if blank_exited:
                    block_end[0] = max(block_end[0], row_index)
                if row_index == end_pos[0] or used_column[row_index]:
                    # We've gone through the whole range
                    checked_all = True
                    break
                elif not blank_exited:
                    blank_exited = not is_empty_cell(table_column[row_index])
                elif is_empty_cell(table_column[row_index]):
                    # Blank after content: carry on with the next column.
                    # The repeat flag keeps the value set by the previous row.
                    current_col += 1
                    break
                else:
                    # Go find the left most column that's still valid
                    table_row = table[row_index]
                    used_row = used_cells[row_index]
                    for column_index in range(current_col, start_pos[1] - 1,
                                              -1):
                        if (is_empty_cell(table_row[column_index])
                                or used_row[column_index]):
                            break
                        block_start[1] = min(block_start[1], column_index)
                # Check if we've seen few enough cells to guess that we have a repeating title
                rows_seen = 1 + row_index - start_row
                repeat = blank_start or rows_seen <= blank_repeat_threshold

        return block_start, block_end