def graph_from_seeds(seeds, cell_source):
    """
    Create or update a networkx dependency graph from a list of seed cells.

    The graph is created when ``cell_source`` has no ``G`` attribute (an
    ExcelCompiler) and updated in place when it has one (a Spreadsheet).

    Starting from the seeds that carry a formula, each formula is translated
    with ``cell2code``, every ``RangeNode`` it references is resolved to a
    cell, a range or a named range, and an edge ``dependency -> dependent``
    is added to the graph.  Formula cells discovered on the way are queued
    and processed the same way until no work is left.

    Args:
        seeds: cells to start from.  Each must provide ``address()``,
            ``formula`` and ``sheet``.
        cell_source: object providing ``named_ranges``, ``pointers``,
            ``cells`` and ``range(...)``; a Spreadsheet additionally
            provides ``cellmap`` and ``G``, which are then mutated in place.

    Returns:
        The tuple ``(cellmap, G)``: the address -> cell mapping and the
        directed graph.
    """
    # NOTE(review): another `graph_from_seeds` is defined further down in this
    # file; if both live at module level the later one shadows this one at
    # import time -- confirm which variant is intended to be live.

    if hasattr(cell_source, 'G'):
        # cell_source is a Spreadsheet: reuse (and extend) its cellmap and graph
        cellmap = cell_source.cellmap
        cells = cellmap
        G = cell_source.G
        for c in seeds:
            G.add_node(c)
            cellmap[c.address()] = c
    else:
        # cell_source is an ExcelCompiler: build cellmap and graph from the seeds
        cellmap = {x.address(): x for x in seeds}
        cells = cell_source.cells
        G = networkx.DiGraph()
        # match the info in cellmap
        for c in cellmap.values():
            G.add_node(c)

    # cells to analyze: only formulas (the list is used as a stack)
    todo = [s for s in seeds if s.formula]
    names = cell_source.named_ranges

    while todo:
        c1 = todo.pop()
        cursheet = c1.sheet

        # ---- 1) look for the dependencies of c1 ----
        pystr, tree = cell2code(c1, names)
        # only the expression is stored here; compilation happens at the end
        # of this iteration
        c1.python_expression = pystr.replace('"', "'")

        if 'OFFSET' in c1.formula or 'INDEX' in c1.formula:
            # pointer names were already treated in ExcelCompiler
            if c1.address() not in cell_source.named_ranges:
                cell_source.pointers.add(c1.address())

        # all the cells/ranges this formula refers to, without duplicates
        deps = uniqueify([x for x in tree.nodes() if isinstance(x, RangeNode)])

        # ---- 2) connect the dependencies to c1 in the graph ----
        for dep in deps:
            dep_name = dep.tvalue.replace('$', '')

            # this is to avoid :A1 or A1: dep due to clean_pointers()
            # returning an ExcelError
            if dep_name.startswith(':') or dep_name.endswith(':'):
                dep_name = dep_name.replace(':', '')

            # if not a pointer, we need an absolute address
            if (dep.tsubtype != 'pointer' and dep_name not in names
                    and "!" not in dep_name and cursheet is not None):
                dep_name = cursheet + "!" + dep_name

            if dep_name in cellmap:
                # named ranges + ranges already parsed (previous iterations)
                origins = [cellmap[dep_name]]
                target = cellmap[c1.address()]

            elif is_range(dep_name) or (dep_name in names and is_range(names[dep_name])):
                # multi-cell range: create (or reuse) a range cell
                reference = names[dep_name] if dep_name in names else dep_name

                if 'OFFSET' in reference or 'INDEX' in reference:
                    start_end = prepare_pointer(reference, names, ref_cell=c1)
                    rng = cell_source.range(start_end)

                    if dep_name in names:
                        # dep is a pointer range
                        address = dep_name
                    elif c1.address() in names:
                        # c1 holds a pointer range
                        address = c1.address()
                    else:
                        # a pointer range with no name, its address will be its name
                        address = '%s:%s' % (start_end["start"], start_end["end"])
                        cell_source.pointers.add(address)
                else:
                    address = dep_name

                    # addresses of this range that are not yet in the graph
                    # (membership is tested on the mapping itself: `.keys()`
                    # would build a fresh list per test on Python 2)
                    range_addresses = list(resolve_range(reference, should_flatten=True)[0])
                    cellmap_add_addresses = [addr for addr in range_addresses
                                             if addr not in cellmap]

                    if cellmap_add_addresses:
                        # get row and col dimensions for the sheet, assuming
                        # the whole range is in one sheet
                        sheet_initial = split_address(cellmap_add_addresses[0])[0]
                        max_rows, max_cols = max_dimension(cellmap, sheet_initial)

                        # create empty cells that aren't in the cellmap
                        for addr in cellmap_add_addresses:
                            sheet_new, col_new, row_new = split_address(addr)

                            # if somehow a new sheet comes up in the range,
                            # get the new dimensions
                            if sheet_new != sheet_initial:
                                sheet_initial = sheet_new
                                max_rows, max_cols = max_dimension(cellmap, sheet_new)

                            # only add cells within the maximum bounds of the
                            # sheet to avoid too many evaluations for A:A or
                            # 1:1 ranges
                            if int(row_new) <= max_rows and int(col2num(col_new)) <= max_cols:
                                cell_new = Cell(addr, sheet_new, value="", should_eval='False')
                                cellmap[addr] = cell_new
                                G.add_node(cell_new)
                                # also registered on the cell_source, whose
                                # `cells` is read further down in this function
                                cell_source.cells[addr] = cell_new

                    rng = cell_source.range(reference)

                if address in cellmap:
                    virtual_cell = cellmap[address]
                else:
                    virtual_cell = Cell(address, None, value=rng, formula=reference,
                                        is_range=True, is_named_range=True)
                    # save the range
                    cellmap[address] = virtual_cell

                # Cell(A1:A10) -> c1 or Cell(ExampleName) -> c1
                G.add_node(virtual_cell)
                G.add_edge(virtual_cell, c1)

                # cells in the range should point to the range as their parent
                target = virtual_cell
                origins = []

                # could be better, but can't check on Exception types here...
                if len(list(rng.keys())) != 0:
                    origins = [cellmap[child] if child in cellmap else cells[child]
                               for child in rng.addresses]

            else:
                # not a range
                reference = names[dep_name] if dep_name in names else dep_name

                if reference in cells:
                    if dep_name in names:
                        # named single cell: insert a virtual cell between the
                        # referenced cell and c1
                        virtual_cell = Cell(dep_name, None, value=cells[reference].value,
                                            formula=reference, is_range=False,
                                            is_named_range=True)
                        G.add_node(virtual_cell)
                        G.add_edge(cells[reference], virtual_cell)
                        origins = [virtual_cell]
                    else:
                        origins = [cells[reference]]

                    cell = origins[0]
                    if cell.formula is not None and ('OFFSET' in cell.formula
                                                     or 'INDEX' in cell.formula):
                        cell_source.pointers.add(cell.address())
                else:
                    # unknown reference: keep a placeholder cell with no value
                    virtual_cell = Cell(dep_name, None, value=None, formula=None,
                                        is_range=False, is_named_range=True)
                    origins = [virtual_cell]

                target = c1

            # process each origin cell
            for c2 in flatten(origins):
                # if we haven't treated this cell already
                if c2.address() not in cellmap:
                    if c2.formula:
                        # cell with a formula, needs to be added to the todo list
                        todo.append(c2)
                    else:
                        # constant cell, no need for further processing, just
                        # remember to set the code
                        const_code, _ = cell2code(c2, names)
                        c2.python_expression = const_code
                        c2.compile()

                    # save in the cellmap and add to the graph
                    cellmap[c2.address()] = c2
                    G.add_node(c2)

                # add an edge from the cell to its parent (range or cell);
                # `target` is assigned in every branch above, the guard is
                # kept from the original code
                if target != []:
                    G.add_edge(c2, target)

        # cell compilation is done here because pointer ranges might update
        # python_expressions
        c1.compile()

    return (cellmap, G)
def read_cells(archive, ignore_sheets=[], ignore_hidden=False):
    """
    Read the cells of an XLSX archive into ``Cell`` objects.

    Every worksheet reported by ``detect_worksheets`` is parsed (unless its
    title is listed in ``ignore_sheets``).  For each ``<c>`` element that has
    children, the formula (``<f>``) and value (``<v>``) are extracted; shared
    formulas are translated to the address of the cell that uses them.  Cells
    with neither a formula nor a value are dropped.

    Args:
        archive: object whose ``read(path)`` returns the content of a member
            of the XLSX package.
        ignore_sheets: sheet titles to skip.  The default list is never
            mutated here.
        ignore_hidden: when true, cells located in columns flagged
            ``hidden="1"`` in the sheet's ``<cols>`` section are skipped.

    Returns:
        dict mapping ``"<sheet>!<address>"`` to the corresponding ``Cell``.
    """
    # NOTE(review): another `read_cells` is defined further down in this file;
    # if both live at module level the later one shadows this one at import
    # time -- confirm which variant is intended to be live.
    # `debug` is a module-level flag; it is only read here.
    print('___### Reading Cells from XLSX ###___')

    cells = {}

    cts = dict(read_content_types(archive))

    # source: https://bitbucket.org/openpyxl/openpyxl/src/93604327bce7aac5e8270674579af76d390e09c0/openpyxl/reader/excel.py?at=default&fileviewer=file-view-default
    strings_path = cts.get(SHARED_STRINGS)
    if strings_path is not None:
        if strings_path.startswith("/"):
            strings_path = strings_path[1:]
        shared_strings = read_string_table(archive.read(strings_path))
    else:
        shared_strings = []

    formula_tag = '{%s}f' % SHEET_MAIN_NS
    value_tag = '{%s}v' % SHEET_MAIN_NS

    for sheet in detect_worksheets(archive):
        sheet_name = sheet['title']

        if sheet_name in ignore_sheets:
            continue

        # shared-formula index ('si') -> (ref, Translator), per sheet
        function_map = {}

        # it is necessary to use cElementTree from xml module, otherwise
        # root.findall doesn't work as it should
        root = ET.fromstring(archive.read(sheet['path']))

        # Every hidden column span is recorded: a sheet may declare several
        # hidden <col> elements, and remembering only the last one would leave
        # cells of the earlier spans in the result.
        hidden_ranges = []
        nb_hidden = 0

        if ignore_hidden:
            for col in root.findall('.//{%s}cols/*' % SHEET_MAIN_NS):
                if col.attrib.get('hidden') == '1':
                    hidden_ranges.append((int(col.attrib['min']),
                                          int(col.attrib['max'])))

        for c in root.findall('.//{%s}c/*/..' % SHEET_MAIN_NS):
            # if no type assigned, assign 'number'
            cell_data_type = c.get('t', 'n')
            cell_address = c.attrib['r']

            if hidden_ranges:
                found = re.search(CELL_REF_RE, cell_address)
                col_index = col2num(found.group(1))
                if any(lo <= col_index <= hi for lo, hi in hidden_ranges):
                    nb_hidden += 1
                    continue

            cell_formula = None
            cell_value = None

            if debug:
                print('Cell %s!%s' % (sheet_name, cell_address))

            for child in c:
                # if no type assigned, assign 'number'
                child_data_type = child.get('t', 'n')

                if child.tag == formula_tag:
                    # the first cell of a shared formula has a 'ref' attribute
                    if 'ref' in child.attrib:
                        if debug:
                            print('*** Found definition of shared formula *** %s %s'
                                  % (child.text, child.attrib['ref']))

                        if "si" in child.attrib:
                            # translator of openpyxl needs a unicode argument
                            # that starts with '='
                            function_map[child.attrib['si']] = (
                                child.attrib['ref'],
                                Translator(unicode('=' + child.text), cell_address),
                            )

                    if child_data_type == 'shared':
                        if debug:
                            print('*** Found child %s of shared formula %s ***'
                                  % (cell_address, child.attrib['si']))

                        translator = function_map[child.attrib['si']][1]
                        translated = translator.translate_formula(cell_address)
                        # we need to get rid of the '='
                        cell_formula = translated[1:]
                    else:
                        cell_formula = child.text

                elif child.tag == value_tag:
                    if cell_data_type in ('s', 'str'):
                        # value is a string: normally an index into the shared
                        # string table; if the lookup fails the cell content
                        # is a string calculated from a formula
                        try:
                            cell_value = shared_strings[int(child.text)]
                        except (ValueError, TypeError, LookupError):
                            cell_value = child.text
                    elif cell_data_type == 'b':
                        cell_value = bool(int(child.text))
                    elif cell_data_type == 'n':
                        cell_value = _cast_number(child.text)

            if cell_formula is not None or cell_value is not None:
                if cell_formula is not None:
                    should_eval = 'always' if 'OFFSET' in cell_formula else 'normal'
                    cleaned_formula = cell_formula.replace(", ", ",")
                else:
                    should_eval = 'normal'
                    cleaned_formula = None

                if "!" in cell_address:
                    key = cell_address
                else:
                    key = sheet_name + "!" + cell_address

                cells[key] = Cell(cell_address, sheet_name, value=cell_value,
                                  formula=cleaned_formula, should_eval=should_eval)

        if nb_hidden > 0:
            print('Ignored %i hidden cells in sheet %s' % (nb_hidden, sheet_name))

    return cells
def read_cells(archive, ignore_sheets=[], ignore_hidden=False):
    """
    Read the cells of an XLSX archive into ``Cell`` objects and report the
    Excel functions used by their formulas.

    Every worksheet reported by ``detect_worksheets`` is parsed (unless its
    title is listed in ``ignore_sheets``).  For each ``<c>`` element that has
    children, the formula (``<f>``) and value (``<v>``) are extracted; shared
    formulas are translated to the address of the cell that uses them.  Cells
    with neither a formula nor a value are dropped.  The names of all
    functions called in the formulas are collected and printed, together with
    those that are not in ``existing``.

    Args:
        archive: object whose ``read(path)`` returns the content of a member
            of the XLSX package.
        ignore_sheets: sheet titles to skip.  The default list is never
            mutated here.
        ignore_hidden: when true, cells located in columns flagged
            ``hidden="1"`` in the sheet's ``<cols>`` section are skipped.

    Returns:
        dict mapping ``"<sheet>!<address>"`` to the corresponding ``Cell``.
    """
    # `debug` is a module-level flag; it is only read here.
    print('___### Reading Cells from XLSX ###___')

    cells = {}
    functions = set()

    cts = dict(read_content_types(archive))

    # source: https://bitbucket.org/openpyxl/openpyxl/src/93604327bce7aac5e8270674579af76d390e09c0/openpyxl/reader/excel.py?at=default&fileviewer=file-view-default
    strings_path = cts.get(SHARED_STRINGS)
    if strings_path is not None:
        if strings_path.startswith("/"):
            strings_path = strings_path[1:]
        shared_strings = read_string_table(archive.read(strings_path))
    else:
        shared_strings = []

    formula_tag = '{%s}f' % SHEET_MAIN_NS
    value_tag = '{%s}v' % SHEET_MAIN_NS
    # an upper-case identifier directly followed by '(' is a function call
    function_call_re = re.compile(r"([A-Z][A-Z0-9]*)\(")

    for sheet in detect_worksheets(archive):
        sheet_name = sheet['title']

        if sheet_name in ignore_sheets:
            continue

        # shared-formula index ('si') -> (ref, Translator), per sheet
        function_map = {}

        # it is necessary to use cElementTree from xml module, otherwise
        # root.findall doesn't work as it should
        root = fromstring(archive.read(sheet['path']))

        # Every hidden column span is recorded: a sheet may declare several
        # hidden <col> elements, and remembering only the last one would leave
        # cells of the earlier spans in the result.
        hidden_ranges = []
        nb_hidden = 0

        if ignore_hidden:
            for col in root.findall('.//{%s}cols/*' % SHEET_MAIN_NS):
                if col.attrib.get('hidden') == '1':
                    hidden_ranges.append((int(col.attrib['min']),
                                          int(col.attrib['max'])))

        for c in root.findall('.//{%s}c/*/..' % SHEET_MAIN_NS):
            # if no type assigned, assign 'number'
            cell_data_type = c.get('t', 'n')
            cell_address = c.attrib['r']

            if hidden_ranges:
                found = re.search(CELL_REF_RE, cell_address)
                col_index = col2num(found.group(1))
                if any(lo <= col_index <= hi for lo, hi in hidden_ranges):
                    nb_hidden += 1
                    continue

            cell_formula = None
            cell_value = None

            if debug:
                print('Cell', '%s!%s' % (sheet_name, cell_address))

            for child in c:
                # if no type assigned, assign 'number'
                child_data_type = child.get('t', 'n')

                if child.tag == formula_tag:
                    # the first cell of a shared formula has a 'ref' attribute
                    if 'ref' in child.attrib:
                        if debug:
                            print('*** Found definition of shared formula ***',
                                  child.text, child.attrib['ref'])

                        if "si" in child.attrib:
                            # translator of openpyxl needs a text argument
                            # that starts with '='
                            function_map[child.attrib['si']] = (
                                child.attrib['ref'],
                                Translator(str('=' + child.text), cell_address),
                            )

                    if child_data_type == 'shared':
                        if debug:
                            print('*** Found child %s of shared formula %s ***'
                                  % (cell_address, child.attrib['si']))

                        translator = function_map[child.attrib['si']][1]
                        translated = translator.translate_formula(cell_address)
                        # we need to get rid of the '='
                        cell_formula = translated[1:]
                    else:
                        cell_formula = child.text

                elif child.tag == value_tag:
                    if cell_data_type in ('s', 'str'):
                        # value is a string: normally an index into the shared
                        # string table; if the lookup fails the cell content
                        # is a string calculated from a formula
                        try:
                            cell_value = shared_strings[int(child.text)]
                        except (ValueError, TypeError, LookupError):
                            cell_value = child.text
                    elif cell_data_type == 'b':
                        cell_value = bool(int(child.text))
                    elif cell_data_type == 'n':
                        cell_value = _cast_number(child.text)

            if cell_formula is not None:
                # set.update consumes the matches immediately; the previous
                # `map(lambda ...)` is lazy on Python 3 and never ran, leaving
                # `functions` empty
                functions.update(function_call_re.findall(cell_formula))

            if cell_formula is not None or cell_value is not None:
                if cell_formula is not None:
                    should_eval = 'always' if 'OFFSET' in cell_formula else 'normal'
                    cleaned_formula = cell_formula.replace(", ", ",")
                else:
                    should_eval = 'normal'
                    cleaned_formula = None

                if "!" in cell_address:
                    key = cell_address
                else:
                    key = sheet_name + "!" + cell_address

                cells[key] = Cell(cell_address, sheet_name, value=cell_value,
                                  formula=cleaned_formula, should_eval=should_eval)

        if nb_hidden > 0:
            print('Ignored %i hidden cells in sheet %s' % (nb_hidden, sheet_name))

    print('Nb of different functions %i' % len(functions))
    print(functions)

    # NOTE(review): `existing` is not defined in this function; presumably a
    # module-level collection of the implemented function names -- confirm.
    for f in functions:
        if f not in existing:
            print('== Missing function: %s' % f)

    return cells
def graph_from_seeds(seeds, cell_source):
    """
    Create or update a networkx dependency graph from a list of seed cells.

    The graph is created when ``cell_source`` has no ``G`` attribute (an
    ExcelCompiler) and updated in place when it has one (a Spreadsheet).

    Starting from the seeds that carry a formula, each formula is translated
    with ``cell2code``, every ``RangeNode`` it references is resolved to a
    cell, a range or a named range, and an edge ``dependency -> dependent``
    is added to the graph.  Formula cells discovered on the way are queued
    and processed the same way until no work is left.

    Args:
        seeds: cells to start from.  Each must provide ``address()``,
            ``formula`` and ``sheet``.
        cell_source: object providing ``named_ranges``, ``pointers``,
            ``cells`` and ``Range(...)``; a Spreadsheet additionally
            provides ``cellmap`` and ``G``, which are then mutated in place.

    Returns:
        The tuple ``(cellmap, G)``: the address -> cell mapping and the
        directed graph.
    """
    # NOTE(review): an earlier `graph_from_seeds` (calling `cell_source.range`
    # instead of `cell_source.Range`) is defined above in this file; if both
    # live at module level this definition shadows it -- confirm which
    # variant is intended to be live.

    if hasattr(cell_source, 'G'):
        # cell_source is a Spreadsheet: reuse (and extend) its cellmap and graph
        cellmap = cell_source.cellmap
        cells = cellmap
        G = cell_source.G
        for c in seeds:
            G.add_node(c)
            cellmap[c.address()] = c
    else:
        # cell_source is an ExcelCompiler: build cellmap and graph from the seeds
        cellmap = {x.address(): x for x in seeds}
        cells = cell_source.cells
        G = networkx.DiGraph()
        # match the info in cellmap
        for c in cellmap.values():
            G.add_node(c)

    # cells to analyze: only formulas (the list is used as a stack)
    todo = [s for s in seeds if s.formula]
    names = cell_source.named_ranges

    while todo:
        c1 = todo.pop()
        cursheet = c1.sheet

        # ---- 1) look for the dependencies of c1 ----
        pystr, tree = cell2code(c1, names)
        # only the expression is stored here; compilation happens at the end
        # of this iteration
        c1.python_expression = pystr.replace('"', "'")

        if 'OFFSET' in c1.formula or 'INDEX' in c1.formula:
            # pointer names were already treated in ExcelCompiler
            if c1.address() not in cell_source.named_ranges:
                cell_source.pointers.add(c1.address())

        # all the cells/ranges this formula refers to, without duplicates
        deps = uniqueify([x for x in tree.nodes() if isinstance(x, RangeNode)])

        # ---- 2) connect the dependencies to c1 in the graph ----
        for dep in deps:
            dep_name = dep.tvalue.replace('$', '')

            # this is to avoid :A1 or A1: dep due to clean_pointers()
            # returning an ExcelError
            if dep_name.startswith(':') or dep_name.endswith(':'):
                dep_name = dep_name.replace(':', '')

            # if not a pointer, we need an absolute address
            if (dep.tsubtype != 'pointer' and dep_name not in names
                    and "!" not in dep_name and cursheet is not None):
                dep_name = cursheet + "!" + dep_name

            if dep_name in cellmap:
                # named ranges + ranges already parsed (previous iterations)
                origins = [cellmap[dep_name]]
                target = cellmap[c1.address()]

            elif is_range(dep_name) or (dep_name in names and is_range(names[dep_name])):
                # multi-cell range: create (or reuse) a range cell
                reference = names[dep_name] if dep_name in names else dep_name

                if 'OFFSET' in reference or 'INDEX' in reference:
                    start_end = prepare_pointer(reference, names, ref_cell=c1)
                    rng = cell_source.Range(start_end)

                    if dep_name in names:
                        # dep is a pointer range
                        address = dep_name
                    elif c1.address() in names:
                        # c1 holds a pointer range
                        address = c1.address()
                    else:
                        # a pointer range with no name, its address will be its name
                        address = '%s:%s' % (start_end["start"], start_end["end"])
                        cell_source.pointers.add(address)
                else:
                    address = dep_name

                    # addresses of this range that are not yet in the graph
                    # (membership is tested on the mapping itself: `.keys()`
                    # would build a fresh list per test on Python 2)
                    range_addresses = list(resolve_range(reference, should_flatten=True)[0])
                    cellmap_add_addresses = [addr for addr in range_addresses
                                             if addr not in cellmap]

                    if cellmap_add_addresses:
                        # get row and col dimensions for the sheet, assuming
                        # the whole range is in one sheet
                        sheet_initial = split_address(cellmap_add_addresses[0])[0]
                        max_rows, max_cols = max_dimension(cellmap, sheet_initial)

                        # create empty cells that aren't in the cellmap
                        for addr in cellmap_add_addresses:
                            sheet_new, col_new, row_new = split_address(addr)

                            # if somehow a new sheet comes up in the range,
                            # get the new dimensions
                            if sheet_new != sheet_initial:
                                sheet_initial = sheet_new
                                max_rows, max_cols = max_dimension(cellmap, sheet_new)

                            # only add cells within the maximum bounds of the
                            # sheet to avoid too many evaluations for A:A or
                            # 1:1 ranges
                            if int(row_new) <= max_rows and int(col2num(col_new)) <= max_cols:
                                cell_new = Cell(addr, sheet_new, value="", should_eval='False')
                                cellmap[addr] = cell_new
                                G.add_node(cell_new)
                                # also registered on the cell_source, whose
                                # `cells` is read further down in this function
                                cell_source.cells[addr] = cell_new

                    rng = cell_source.Range(reference)

                if address in cellmap:
                    virtual_cell = cellmap[address]
                else:
                    virtual_cell = Cell(address, None, value=rng, formula=reference,
                                        is_range=True, is_named_range=True)
                    # save the range
                    cellmap[address] = virtual_cell

                # Cell(A1:A10) -> c1 or Cell(ExampleName) -> c1
                G.add_node(virtual_cell)
                G.add_edge(virtual_cell, c1)

                # cells in the range should point to the range as their parent
                target = virtual_cell
                origins = []

                # could be better, but can't check on Exception types here...
                if len(list(rng.keys())) != 0:
                    origins = [cellmap[child] if child in cellmap else cells[child]
                               for child in rng.addresses]

            else:
                # not a range
                reference = names[dep_name] if dep_name in names else dep_name

                if reference in cells:
                    if dep_name in names:
                        # named single cell: insert a virtual cell between the
                        # referenced cell and c1
                        virtual_cell = Cell(dep_name, None, value=cells[reference].value,
                                            formula=reference, is_range=False,
                                            is_named_range=True)
                        G.add_node(virtual_cell)
                        G.add_edge(cells[reference], virtual_cell)
                        origins = [virtual_cell]
                    else:
                        origins = [cells[reference]]

                    cell = origins[0]
                    if cell.formula is not None and ('OFFSET' in cell.formula
                                                     or 'INDEX' in cell.formula):
                        cell_source.pointers.add(cell.address())
                else:
                    # unknown reference: keep a placeholder cell with no value
                    virtual_cell = Cell(dep_name, None, value=None, formula=None,
                                        is_range=False, is_named_range=True)
                    origins = [virtual_cell]

                target = c1

            # process each origin cell
            for c2 in flatten(origins):
                # if we haven't treated this cell already
                if c2.address() not in cellmap:
                    if c2.formula:
                        # cell with a formula, needs to be added to the todo list
                        todo.append(c2)
                    else:
                        # constant cell, no need for further processing, just
                        # remember to set the code
                        const_code, _ = cell2code(c2, names)
                        c2.python_expression = const_code
                        c2.compile()

                    # save in the cellmap and add to the graph
                    cellmap[c2.address()] = c2
                    G.add_node(c2)

                # add an edge from the cell to its parent (range or cell);
                # `target` is assigned in every branch above, the guard is
                # kept from the original code
                if target != []:
                    G.add_edge(c2, target)

        # cell compilation is done here because pointer ranges might update
        # python_expressions
        c1.compile()

    return (cellmap, G)