def _find_complete_block_bounds(self, table, used_cells, possible_block_start, start_pos,
                                end_pos):
    '''
    Finds the bounds of a solid block of non-empty, unused cells whose upper-left corner is
    the suggested start location.

    The first row is walked to the right until the column limit, a used cell or an empty
    cell is reached; that column becomes block_end[1]. The rows below are then walked
    downwards; block_end[0] becomes the first row holding a used or empty cell within
    columns [block_start[1], block_end[1]), or end_pos[0] when the row limit is reached.

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; truthy entries terminate the block.
        possible_block_start: (row, column) of the suggested upper-left cell.
        start_pos: Unused here; kept for a signature parallel to the other bound finders.
        end_pos: (row, column) limit of the search space.

    Returns:
        A tuple (block_start, block_end) of two-element [row, column] lists. block_start is
        always a copy of possible_block_start.
    '''
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    title_row = table[block_start[0]]
    title_used = used_cells[block_start[0]]

    # Find which column the titles end on
    for column_index in range(block_start[1], end_pos[1] + 1):
        # Ensure we catch the edge case of the data reaching the edge of
        # the table -- block_end should then equal end_pos
        block_end[1] = max(block_end[1], column_index)
        if (column_index == end_pos[1] or title_used[column_index] or
                is_empty_cell(title_row[column_index])):
            break

    # Find which row the block ends on
    for row_index in range(block_start[0] + 1, end_pos[0] + 1):
        block_end[0] = row_index
        # Stop if we reach the end of the table space
        if row_index == end_pos[0]:
            break
        table_row = table[row_index]
        # Look up the used flags of the row being examined. Reusing the title row's flags
        # here would make the check a no-op, since every column left of block_end[1] is
        # known to be unused in the title row.
        used_row = used_cells[row_index]
        if any(used_row[column_index] or is_empty_cell(table_row[column_index])
               for column_index in range(block_start[1], block_end[1])):
            break

    return block_start, block_end
def _find_block_end(self, table, used_cells, block_start, block_end, start_pos, end_pos):
    '''
    Finds the last column of a block from a start location and a suggested end location.

    The row at block_start[0] is walked to the right. The walk stops at the column limit,
    at a used cell, or at an empty cell whose column is empty for every row in
    [block_start[0], block_end[0]).

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; a truthy entry stops the walk.
        block_start: [row, column] of the block start; returned unchanged.
        block_end: [row, column] suggested end. Its column entry is raised IN PLACE.
        start_pos: Unused here; kept for a signature parallel to the other bound finders.
        end_pos: (row, column) limit of the search space; only the column is read.

    Returns:
        The tuple (block_start, block_end), the same objects that were passed in.
    '''
    table_row = table[block_start[0]]
    used_row = used_cells[block_start[0]]

    # Find which column the titles end on
    for column_index in range(block_start[1], end_pos[1] + 1):
        # Ensure we catch the edge case of the data reaching the edge of
        # the table -- block_end should then equal end_pos
        block_end[1] = max(block_end[1], column_index)
        if column_index == end_pos[1] or used_row[column_index]:
            break
        if is_empty_cell(table_row[column_index]):
            # A blank title only ends the block when the whole column below it is blank
            table_column = TableTranspose(table)[column_index]
            if all(is_empty_cell(table_column[row_index])
                   for row_index in range(block_start[0], block_end[0])):
                break

    return block_start, block_end
def _find_block_end(self, table, used_cells, block_start, block_end, start_pos, end_pos):
    '''
    Finds the last column of a block from a start location and a suggested end location.

    The row at block_start[0] is walked to the right. The walk stops at the column limit,
    at a used cell, or at an empty cell whose column is empty for every row in
    [block_start[0], block_end[0]).

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; a truthy entry stops the walk.
        block_start: [row, column] of the block start; returned unchanged.
        block_end: [row, column] suggested end. Its column entry is raised IN PLACE.
        start_pos: Unused here; kept for a signature parallel to the other bound finders.
        end_pos: (row, column) limit of the search space; only the column is read.

    Returns:
        The tuple (block_start, block_end), the same objects that were passed in.
    '''
    table_row = table[block_start[0]]
    used_row = used_cells[block_start[0]]

    # Find which column the titles end on
    for column_index in range(block_start[1], end_pos[1] + 1):
        # Ensure we catch the edge case of the data reaching the edge of
        # the table -- block_end should then equal end_pos
        block_end[1] = max(block_end[1], column_index)
        if column_index == end_pos[1] or used_row[column_index]:
            break
        if is_empty_cell(table_row[column_index]):
            # A blank title only ends the block when the whole column below it is blank
            table_column = TableTranspose(table)[column_index]
            if all(is_empty_cell(table_column[row_index])
                   for row_index in range(block_start[0], block_end[0])):
                break

    return block_start, block_end
def _find_block_start(self, table, used_cells, possible_block_start, start_pos, end_pos):
    '''
    Finds the start of a block from a suggested start location. This location can be at a
    lower column but not a lower row.

    Note this also finds the lowest row of block_end.

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; truthy entries act as boundaries.
        possible_block_start: (row, column) of the suggested start.
        start_pos: (row, column) lower limit of the search space; only the column is read.
        end_pos: (row, column) upper limit of the search space; only the row is read.

    Returns:
        A tuple (block_start, block_end) of two-element [row, column] lists.
    '''
    # Number of rows, counted from the suggested start, within which the column scan is
    # still allowed to restart on the next column
    blank_repeat_threshold = 3

    current_col = possible_block_start[1]
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    repeat = True
    checked_all = False
    # Repeat until we've met satisfactory conditions for
    # catching all edge cases or we've checked all valid
    # block locations
    while not checked_all and repeat:
        block_end[0] = max(block_end[0], possible_block_start[0])
        block_end[1] = max(block_end[1], current_col)

        table_column = TableTranspose(table)[current_col]
        used_column = TableTranspose(used_cells)[current_col]
        # We need to find a non empty cell before we can stop
        blank_start = is_empty_cell(table_column[possible_block_start[0]])
        # Unless we have assume_complete_blocks set to True
        if blank_start and self.assume_complete_blocks:
            # Found a blank? We're done
            repeat = False
            break
        blank_exited = not blank_start

        #TODO refactor code below into new function for easier reading
        # Analyze the beginning columns
        for row_index in range(possible_block_start[0], end_pos[0] + 1):
            # Ensure we catch the edge case of the data reaching the edge of
            # the table -- block_end should then equal end_pos
            if blank_exited:
                block_end[0] = max(block_end[0], row_index)
            if row_index == end_pos[0] or used_column[row_index]:
                # We've gone through the whole range
                checked_all = True
                break
            elif not blank_exited:
                # Still inside the leading run of blanks
                blank_exited = not is_empty_cell(table_column[row_index])
            elif is_empty_cell(table_column[row_index]):
                # Blank after content: restart the scan on the next column
                current_col += 1
                break
            else:
                # Go find the left most column that's still valid
                table_row = table[row_index]
                used_row = used_cells[row_index]
                for column_index in range(current_col, start_pos[1] - 1, -1):
                    if is_empty_cell(table_row[column_index]) or used_row[column_index]:
                        break
                    else:
                        block_start[1] = min(block_start[1], column_index)
            # Check if we've seen few enough cells to guess that we have a repeating title
            # NOTE(review): assumed to run once per non-breaking row -- confirm nesting
            repeat = (blank_start or
                      1 + row_index - possible_block_start[0] <= blank_repeat_threshold)
    return block_start, block_end
def _find_complete_block_bounds(self, table, used_cells, possible_block_start, start_pos,
                                end_pos):
    '''
    Finds the bounds of a solid block of non-empty, unused cells whose upper-left corner is
    the suggested start location.

    The first row is walked to the right until the column limit, a used cell or an empty
    cell is reached; that column becomes block_end[1]. The rows below are then walked
    downwards; block_end[0] becomes the first row holding a used or empty cell within
    columns [block_start[1], block_end[1]), or end_pos[0] when the row limit is reached.

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; truthy entries terminate the block.
        possible_block_start: (row, column) of the suggested upper-left cell.
        start_pos: Unused here; kept for a signature parallel to the other bound finders.
        end_pos: (row, column) limit of the search space.

    Returns:
        A tuple (block_start, block_end) of two-element [row, column] lists. block_start is
        always a copy of possible_block_start.
    '''
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    title_row = table[block_start[0]]
    title_used = used_cells[block_start[0]]

    # Find which column the titles end on
    for column_index in range(block_start[1], end_pos[1] + 1):
        # Ensure we catch the edge case of the data reaching the edge of
        # the table -- block_end should then equal end_pos
        block_end[1] = max(block_end[1], column_index)
        if (column_index == end_pos[1] or title_used[column_index] or
                is_empty_cell(title_row[column_index])):
            break

    # Find which row the block ends on
    for row_index in range(block_start[0] + 1, end_pos[0] + 1):
        block_end[0] = row_index
        # Stop if we reach the end of the table space
        if row_index == end_pos[0]:
            break
        table_row = table[row_index]
        # Look up the used flags of the row being examined. Reusing the title row's flags
        # here would make the check a no-op, since every column left of block_end[1] is
        # known to be unused in the title row.
        used_row = used_cells[row_index]
        if any(used_row[column_index] or is_empty_cell(table_row[column_index])
               for column_index in range(block_start[1], block_end[1])):
            break

    return block_start, block_end
def _find_valid_block(self, table, worksheet, flags, units, used_cells, start_pos, end_pos):
    '''
    Scans the search space row by row for the next cell that can anchor a block and builds
    the TableBlock for it.

    Candidate cells are non-empty, not flagged in used_cells and inside the rectangle
    spanned by start_pos and end_pos (both inclusive). A candidate whose bounds do not
    strictly grow in both directions, or whose TableBlock construction raises
    InvalidBlockError, is flagged as used and the scan continues.

    Returns:
        The first TableBlock constructed successfully, or None when the scan finishes
        without one.
    '''
    first_row = max(start_pos[0], 0)
    stop_row = min(end_pos[0] + 1, len(table))
    for row_index in range(first_row, stop_row):
        row_cells = table[row_index]
        row_used = used_cells[row_index]
        for column_index, cell in enumerate(row_cells):
            in_columns = start_pos[1] <= column_index <= end_pos[1]
            # Skip anything outside the window, already used, or empty
            if not in_columns or row_used[column_index] or is_empty_cell(cell):
                continue

            block_start, block_end = self._find_block_bounds(
                table, used_cells, (row_index, column_index), start_pos, end_pos)
            spans_rows = block_end[0] > block_start[0]
            if spans_rows and block_end[1] > block_start[1]:
                try:
                    found = TableBlock(table, used_cells, block_start, block_end,
                                       worksheet, flags, units, self.assume_complete_blocks)
                except InvalidBlockError:
                    pass
                else:
                    return found
            # Prevent infinite loops if something goes wrong
            used_cells[row_index][column_index] = True
    return None
def _find_valid_block(self, table, worksheet, flags, units, used_cells, start_pos, end_pos):
    '''
    Scans the search space row by row for the next cell that can anchor a block and builds
    the TableBlock for it.

    Candidate cells are non-empty, not flagged in used_cells and inside the rectangle
    spanned by start_pos and end_pos (both inclusive). A candidate whose bounds do not
    strictly grow in both directions, or whose TableBlock construction raises
    InvalidBlockError, is flagged as used and the scan continues.

    Returns:
        The first TableBlock constructed successfully, or None when the scan finishes
        without one.
    '''
    first_row = max(start_pos[0], 0)
    stop_row = min(end_pos[0] + 1, len(table))
    for row_index in range(first_row, stop_row):
        row_cells = table[row_index]
        row_used = used_cells[row_index]
        for column_index, cell in enumerate(row_cells):
            in_columns = start_pos[1] <= column_index <= end_pos[1]
            # Skip anything outside the window, already used, or empty
            if not in_columns or row_used[column_index] or is_empty_cell(cell):
                continue

            block_start, block_end = self._find_block_bounds(
                table, used_cells, (row_index, column_index), start_pos, end_pos)
            spans_rows = block_end[0] > block_start[0]
            if spans_rows and block_end[1] > block_start[1]:
                try:
                    found = TableBlock(table, used_cells, block_start, block_end,
                                       worksheet, flags, units, self.assume_complete_blocks)
                except InvalidBlockError:
                    pass
                else:
                    return found
            # Prevent infinite loops if something goes wrong
            used_cells[row_index][column_index] = True
    return None
def _check_interpret_cell(self, cell, prior_cell, row_index, column_index):
    '''
    Classifies a title cell and, where needed, substitutes a string for it.

    An empty cell is replaced by prior_cell; a non-empty, non-text cell is replaced by
    str(cell). Either substitution is reported through self.flag_change under the
    'interpreted' category. Text cells pass through untouched.

    Returns:
        A tuple of the form '(cell, changed)' where 'changed' indicates whether a
        substitution was made.
    '''
    position = (row_index, column_index)

    # If we find a blank cell, propagate the prior title
    if is_empty_cell(cell):
        self.flag_change(self.flags, 'interpreted', position, self.worksheet,
                         self.FLAGS['copied-title'])
        return prior_cell, True

    # Anything else that is not already text gets stringified
    if not is_text_cell(cell):
        self.flag_change(self.flags, 'interpreted', position, self.worksheet,
                         self.FLAGS['converted-to-string'])
        return str(cell), True

    return cell, False
def _check_years(self, cell, prior_year):
    '''
    Decides whether a cell counts as a year indicator.

    A blank cell counts only when prior_year is truthy, since a blank may stand for the
    previous cell's title but a blank seen before any year cannot. A non-blank cell counts
    when is_num_cell accepts it and its value lies strictly between 1900 and 2100.
    '''
    # Exclusive limits; numbers outside them should not be auto categorized to strings
    lowest, highest = 1900, 2100

    if not is_empty_cell(cell):
        return is_num_cell(cell) and lowest < cell < highest
    return bool(prior_year)
def _repair_column(self):
    '''
    Column-wise counterpart of _repair_row.

    Walks the columns in [self.start[1], self.end[1]) and inspects the cell each column has
    in row self.start[0]. While only title columns have been seen, a column whose cell is
    empty is passed to _stringify_column. A column whose cell is a string matching
    allregex.year_regex is passed to _check_stringify_year_column. Any other column ends
    the title region for the blank-cell rule.
    '''
    in_title_columns = True
    for column_index in range(self.start[1], self.end[1]):
        top_cell = TableTranspose(self.table)[column_index][self.start[0]]

        # Only blank-led columns inside the title region are stringified outright
        if in_title_columns and is_empty_cell(top_cell):
            self._stringify_column(column_index)
            continue

        # Check for year titles in column or row
        has_year_title = (isinstance(top_cell, basestring) and
                          re.search(allregex.year_regex, top_cell))
        if has_year_title:
            self._check_stringify_year_column(column_index)
        else:
            in_title_columns = False
def _repair_row(self):
    '''
    Looks for title rows whose leading cell suggests a missing or year-style title and
    hands them to the matching repair helper.

    Walks the rows in [self.start[0], self.end[0]) and inspects the cell each row has in
    column self.start[1]. While only title rows have been seen, a row whose cell is empty
    is passed to _stringify_row. A row whose cell is a string matching allregex.year_regex
    is passed to _check_stringify_year_row. Any other row ends the title region for the
    blank-cell rule.
    '''
    in_title_rows = True
    for row_index in range(self.start[0], self.end[0]):
        leading_cell = self.table[row_index][self.start[1]]

        # Look for empty cells leading titles
        if in_title_rows and is_empty_cell(leading_cell):
            self._stringify_row(row_index)
            continue

        # Check for year titles in column or row
        has_year_title = (isinstance(leading_cell, basestring) and
                          re.search(allregex.year_regex, leading_cell))
        if has_year_title:
            self._check_stringify_year_row(row_index)
        else:
            in_title_rows = False
def _find_block_start(self, table, used_cells, possible_block_start, start_pos, end_pos):
    '''
    Finds the start of a block from a suggested start location. This location can be at a lower
    column but not a lower row. The function traverses columns until it finds a stopping
    condition or a repeat condition that restarts on the next column.

    Note this also finds the lowest row of block_end.

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; truthy entries act as boundaries.
        possible_block_start: (row, column) of the suggested start.
        start_pos: (row, column) lower limit of the search space; only the column is read.
        end_pos: (row, column) upper limit of the search space; only the row is read.

    Returns:
        A tuple (block_start, block_end) of two-element [row, column] lists. Only the column
        of block_start can differ from possible_block_start.
    '''
    current_col = possible_block_start[1]
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    repeat = True
    checked_all = False
    # Repeat until we've met satisfactory conditions for catching all edge cases or we've
    # checked all valid block locations
    while not checked_all and repeat:
        block_end[0] = max(block_end[0], possible_block_start[0])
        block_end[1] = max(block_end[1], current_col)

        # Reset for every column that gets scanned
        single_titled_block = True
        table_column = TableTranspose(table)[current_col]
        used_column = TableTranspose(used_cells)[current_col]
        # We need to find a non empty cell before we can stop
        blank_start = is_empty_cell(table_column[possible_block_start[0]])
        blank_exited = not blank_start
        # Unless we have assume_complete_blocks set to True
        if blank_start and self.assume_complete_blocks:
            # Found a blank? We're done
            repeat = False
            break

        #TODO refactor code below into new function for easier reading
        # Analyze the beginning columns
        for row_index in xrange(possible_block_start[0], end_pos[0] + 1):
            # Ensure we catch the edge case of the data reaching the edge of the table --
            # block_end should then equal end_pos
            if blank_exited:
                block_end[0] = max(block_end[0], row_index)
            if row_index == end_pos[0] or used_column[row_index]:
                # We've gone through the whole range
                checked_all = True
                repeat = False
                break
            # Leave the leading run of blanks at the first non-empty cell
            if not blank_exited:
                blank_exited = not is_empty_cell(table_column[row_index])
            if single_titled_block and not self._single_length_title(table, row_index, current_col):
                single_titled_block = False
                # If we saw single length titles for several more than threshold rows, then we
                # have a unique block before an actual content block
                # NOTE(review): nesting of this check under the branch above is assumed -- confirm
                if self._above_blank_repeat_threshold(possible_block_start[0], row_index):
                    repeat = False
                    break
            # An empty cell restarts the scan one column to the right, provided this row has
            # a cell there
            if is_empty_cell(table_column[row_index]) and len(table[row_index]) > current_col + 1:
                current_col += 1
                break
            # Go find the left most column that's still valid
            table_row = table[row_index]
            used_row = used_cells[row_index]
            for column_index in range(current_col, start_pos[1] - 1, -1):
                if is_empty_cell(table_row[column_index]) or used_row[column_index]:
                    break
                else:
                    block_start[1] = min(block_start[1], column_index)
            # Check if we've seen few enough cells to guess that we have a repeating title
            # (only reached for rows that did not break out of the loop above)
            repeat = blank_start or self._below_blank_repeat_threshold(possible_block_start[0], row_index)
    return block_start, block_end
def _find_block_start(self, table, used_cells, possible_block_start, start_pos, end_pos):
    '''
    Finds the start of a block from a suggested start location. This location can be at a lower
    column but not a lower row. The function traverses columns until it finds a stopping
    condition or a repeat condition that restarts on the next column.

    Note this also finds the lowest row of block_end.

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; truthy entries act as boundaries.
        possible_block_start: (row, column) of the suggested start.
        start_pos: (row, column) lower limit of the search space; only the column is read.
        end_pos: (row, column) upper limit of the search space; only the row is read.

    Returns:
        A tuple (block_start, block_end) of two-element [row, column] lists. Only the column
        of block_start can differ from possible_block_start.
    '''
    current_col = possible_block_start[1]
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    repeat = True
    checked_all = False
    # Repeat until we've met satisfactory conditions for catching all edge cases or we've
    # checked all valid block locations
    while not checked_all and repeat:
        block_end[0] = max(block_end[0], possible_block_start[0])
        block_end[1] = max(block_end[1], current_col)

        # Reset for every column that gets scanned
        single_titled_block = True
        table_column = TableTranspose(table)[current_col]
        used_column = TableTranspose(used_cells)[current_col]
        # We need to find a non empty cell before we can stop
        blank_start = is_empty_cell(table_column[possible_block_start[0]])
        blank_exited = not blank_start
        # Unless we have assume_complete_blocks set to True
        if blank_start and self.assume_complete_blocks:
            # Found a blank? We're done
            repeat = False
            break

        #TODO refactor code below into new function for easier reading
        # Analyze the beginning columns
        for row_index in xrange(possible_block_start[0], end_pos[0] + 1):
            # Ensure we catch the edge case of the data reaching the edge of the table --
            # block_end should then equal end_pos
            if blank_exited:
                block_end[0] = max(block_end[0], row_index)
            if row_index == end_pos[0] or used_column[row_index]:
                # We've gone through the whole range
                checked_all = True
                repeat = False
                break
            # Leave the leading run of blanks at the first non-empty cell
            if not blank_exited:
                blank_exited = not is_empty_cell(table_column[row_index])
            if single_titled_block and not self._single_length_title(
                    table, row_index, current_col):
                single_titled_block = False
                # If we saw single length titles for several more than threshold rows, then we
                # have a unique block before an actual content block
                # NOTE(review): nesting of this check under the branch above is assumed -- confirm
                if self._above_blank_repeat_threshold(
                        possible_block_start[0], row_index):
                    repeat = False
                    break
            # An empty cell restarts the scan one column to the right, provided this row has
            # a cell there
            if is_empty_cell(table_column[row_index]) and len(
                    table[row_index]) > current_col + 1:
                current_col += 1
                break
            # Go find the left most column that's still valid
            table_row = table[row_index]
            used_row = used_cells[row_index]
            for column_index in range(current_col, start_pos[1] - 1, -1):
                if is_empty_cell(
                        table_row[column_index]) or used_row[column_index]:
                    break
                else:
                    block_start[1] = min(block_start[1], column_index)
            # Check if we've seen few enough cells to guess that we have a repeating title
            # (only reached for rows that did not break out of the loop above)
            repeat = blank_start or self._below_blank_repeat_threshold(
                possible_block_start[0], row_index)
    return block_start, block_end
def _find_block_start(self, table, used_cells, possible_block_start, start_pos, end_pos):
    '''
    Finds the start of a block from a suggested start location. This location can be at a
    lower column but not a lower row.

    Note this also finds the lowest row of block_end.

    Args:
        table: Row-indexable table of cells.
        used_cells: Structure indexed like table; truthy entries act as boundaries.
        possible_block_start: (row, column) of the suggested start.
        start_pos: (row, column) lower limit of the search space; only the column is read.
        end_pos: (row, column) upper limit of the search space; only the row is read.

    Returns:
        A tuple (block_start, block_end) of two-element [row, column] lists.
    '''
    # Number of rows, counted from the suggested start, within which the column scan is
    # still allowed to restart on the next column
    blank_repeat_threshold = 3

    current_col = possible_block_start[1]
    block_start = list(possible_block_start)
    block_end = list(possible_block_start)
    repeat = True
    checked_all = False
    # Repeat until we've met satisfactory conditions for
    # catching all edge cases or we've checked all valid
    # block locations
    while not checked_all and repeat:
        block_end[0] = max(block_end[0], possible_block_start[0])
        block_end[1] = max(block_end[1], current_col)

        table_column = TableTranspose(table)[current_col]
        used_column = TableTranspose(used_cells)[current_col]
        # We need to find a non empty cell before we can stop
        blank_start = is_empty_cell(table_column[possible_block_start[0]])
        # Unless we have assume_complete_blocks set to True
        if blank_start and self.assume_complete_blocks:
            # Found a blank? We're done
            repeat = False
            break
        blank_exited = not blank_start

        #TODO refactor code below into new function for easier reading
        # Analyze the beginning columns
        for row_index in range(possible_block_start[0], end_pos[0] + 1):
            # Ensure we catch the edge case of the data reaching the edge of
            # the table -- block_end should then equal end_pos
            if blank_exited:
                block_end[0] = max(block_end[0], row_index)
            if row_index == end_pos[0] or used_column[row_index]:
                # We've gone through the whole range
                checked_all = True
                break
            elif not blank_exited:
                # Still inside the leading run of blanks
                blank_exited = not is_empty_cell(table_column[row_index])
            elif is_empty_cell(table_column[row_index]):
                # Blank after content: restart the scan on the next column
                current_col += 1
                break
            else:
                # Go find the left most column that's still valid
                table_row = table[row_index]
                used_row = used_cells[row_index]
                for column_index in range(current_col, start_pos[1] - 1, -1):
                    if is_empty_cell(table_row[column_index]) or used_row[column_index]:
                        break
                    else:
                        block_start[1] = min(block_start[1], column_index)
            # Check if we've seen few enough cells to guess that we have a repeating title
            # NOTE(review): assumed to run once per non-breaking row -- confirm nesting
            repeat = (blank_start or
                      1 + row_index - possible_block_start[0] <= blank_repeat_threshold)
    return block_start, block_end