Esempio n. 1
0
def read_content_types(archive):
    """Read content types."""
    xml_source = archive.read(ARC_CONTENT_TYPES)
    root = fromstring(xml_source)
    contents_root = root.findall('{%s}Override' % CONTYPES_NS)
    for type in contents_root:
        yield type.get('ContentType'), type.get('PartName')
Esempio n. 2
0
def read_content_types(archive):
    """Read content types."""
    xml_source = archive.read(ARC_CONTENT_TYPES)
    root = fromstring(xml_source)
    contents_root = root.findall('{%s}Override' % CONTYPES_NS)
    for type in contents_root:
        yield type.get('ContentType'), type.get('PartName')
Esempio n. 3
0
def read_sheets(archive):
    """Read worksheet titles and ids for a workbook"""
    xml_source = archive.read(ARC_WORKBOOK)
    tree = fromstring(xml_source)
    for element in safe_iterator(tree, '{%s}sheet' % SHEET_MAIN_NS):
        attrib = element.attrib
        attrib['id'] = attrib["{%s}id" % REL_NS]
        del attrib["{%s}id" % REL_NS]
        if attrib['id']:
            yield attrib
Esempio n. 4
0
def read_sheets(archive):
    """Read worksheet titles and ids for a workbook"""
    xml_source = archive.read(ARC_WORKBOOK)
    tree = fromstring(xml_source)
    for element in safe_iterator(tree, '{%s}sheet' % SHEET_MAIN_NS):
        attrib = element.attrib
        attrib['id'] = attrib["{%s}id" % REL_NS]
        del attrib["{%s}id" % REL_NS]
        if attrib['id']:
            yield attrib
Esempio n. 5
0
def read_rels(archive):
    """Read relationships for a workbook"""
    xml_source = archive.read(ARC_WORKBOOK_RELS)
    tree = fromstring(xml_source)
    for element in safe_iterator(tree, '{%s}Relationship' % PKG_REL_NS):
        rId = element.get('Id')
        pth = element.get("Target")
        typ = element.get('Type')
        # normalise path
        if pth.startswith("/xl"):
            pth = pth.replace("/xl", "xl")
        elif not pth.startswith("xl") and not pth.startswith(".."):
            pth = "xl/" + pth
        yield rId, {'path': pth, 'type': typ}
Esempio n. 6
0
def read_rels(archive):
    """Read relationships for a workbook"""
    xml_source = archive.read(ARC_WORKBOOK_RELS)
    tree = fromstring(xml_source)
    for element in safe_iterator(tree, '{%s}Relationship' % PKG_REL_NS):
        rId = element.get('Id')
        pth = element.get("Target")
        typ = element.get('Type')
        # normalise path
        if pth.startswith("/xl"):
            pth = pth.replace("/xl", "xl")
        elif not pth.startswith("xl") and not pth.startswith(".."):
            pth = "xl/" + pth
        yield rId, {'path':pth, 'type':typ}
Esempio n. 7
0
def read_named_ranges(archive):

    root = fromstring(archive.read(ARC_WORKBOOK))

    dict = {}

    for name_node in safe_iterator(root, '{%s}definedName' % SHEET_MAIN_NS):
        name = name_node.get('name')
        # if name in dict:
        #     raise Exception('Named_range %s is defined in multiple sheets' % name)

        if not name_node.get('hidden'):
            if name_node.get('name') == 'tR':
                dict[name_node.get('name')] = 'Depreciation!A1:A1000'
            elif '!#REF' in name_node.text:
                dict[name_node.get('name')] = '#REF!'
            else:
                dict[name_node.get('name')] = name_node.text.replace('$','').replace(" ","")

    return dict
Esempio n. 8
0
def read_named_ranges(archive):

    root = fromstring(archive.read(ARC_WORKBOOK))

    dict = {}

    for name_node in safe_iterator(root, '{%s}definedName' % SHEET_MAIN_NS):
        name = name_node.get('name')
        # if name in dict:
        #     raise Exception('Named_range %s is defined in multiple sheets' % name)

        if not name_node.get('hidden'):
            if name_node.get('name') == 'tR':
                dict[name_node.get('name')] = 'Depreciation!A1:A1000'
            elif '!#REF' in name_node.text:
                dict[name_node.get('name')] = '#REF!'
            else:
                dict[name_node.get('name')] = name_node.text.replace(
                    '$', '').replace(" ", "")

    return dict
Esempio n. 9
0
def read_cells(archive, ignore_sheets=[], ignore_hidden=False):
    global debug

    print('___### Reading Cells from XLSX ###___')

    cells = {}

    functions = set()

    cts = dict(read_content_types(archive))

    strings_path = cts.get(
        SHARED_STRINGS
    )  # source: https://bitbucket.org/openpyxl/openpyxl/src/93604327bce7aac5e8270674579af76d390e09c0/openpyxl/reader/excel.py?at=default&fileviewer=file-view-default
    if strings_path is not None:
        if strings_path.startswith("/"):
            strings_path = strings_path[1:]
        shared_strings = read_string_table(archive.read(strings_path))
    else:
        shared_strings = []

    for sheet in detect_worksheets(archive):
        sheet_name = sheet['title']

        function_map = {}

        if sheet_name in ignore_sheets: continue

        root = fromstring(
            archive.read(sheet['path'])
        )  # it is necessary to use cElementTree from xml module, otherwise root.findall doesn't work as it should

        hidden_cols = False
        nb_hidden = 0

        if ignore_hidden:
            hidden_col_min = None
            hidden_col_max = None

            for col in root.findall('.//{%s}cols/*' % SHEET_MAIN_NS):
                if 'hidden' in col.attrib and col.attrib['hidden'] == '1':
                    hidden_cols = True
                    hidden_col_min = int(col.attrib['min'])
                    hidden_col_max = int(col.attrib['max'])

        for c in root.findall('.//{%s}c/*/..' % SHEET_MAIN_NS):
            cell_data_type = c.get('t',
                                   'n')  # if no type assigned, assign 'number'
            cell_address = c.attrib['r']

            skip = False

            if hidden_cols:
                found = re.search(CELL_REF_RE, cell_address)
                col = col2num(found.group(1))

                if col >= hidden_col_min and col <= hidden_col_max:
                    nb_hidden += 1
                    skip = True

            if not skip:
                cell = {
                    'a': '%s!%s' % (sheet_name, cell_address),
                    'f': None,
                    'v': None
                }
                if debug: print('Cell', cell['a'])
                for child in c:
                    child_data_type = child.get(
                        't', 'n')  # if no type assigned, assign 'number'

                    if child.tag == '{%s}f' % SHEET_MAIN_NS:
                        if 'ref' in child.attrib:  # the first cell of a shared formula has a 'ref' attribute
                            if debug:
                                print(
                                    '*** Found definition of shared formula ***',
                                    child.text, child.attrib['ref'])
                            if "si" in child.attrib:
                                function_map[child.attrib['si']] = (
                                    child.attrib['ref'],
                                    Translator(str('=' + child.text),
                                               cell_address)
                                )  # translator of openpyxl needs a unicode argument that starts with '='
                            # else:
                            #     print "Encountered cell with ref but not si: ", sheet_name, child.attrib['ref']
                        if child_data_type == 'shared':
                            if debug:
                                print(
                                    '*** Found child %s of shared formula %s ***'
                                    % (cell_address, child.attrib['si']))

                            ref = function_map[child.attrib['si']][0]
                            formula = function_map[child.attrib['si']][1]

                            translated = formula.translate_formula(
                                cell_address)
                            cell['f'] = translated[
                                1:]  # we need to get rid of the '='

                        else:
                            cell['f'] = child.text

                    elif child.tag == '{%s}v' % SHEET_MAIN_NS:
                        if cell_data_type == 's' or cell_data_type == 'str':  # value is a string
                            try:  # if it fails, it means that cell content is a string calculated from a formula
                                cell['v'] = shared_strings[int(child.text)]
                            except:
                                cell['v'] = child.text
                        elif cell_data_type == 'b':
                            cell['v'] = bool(int(child.text))
                        elif cell_data_type == 'n':
                            cell['v'] = _cast_number(child.text)

                    elif child.text is None:
                        continue

                if cell['f'] is not None:

                    pattern = re.compile(r"([A-Z][A-Z0-9]*)\(")
                    found = re.findall(pattern, cell['f'])

                    map(lambda x: functions.add(x), found)

                if cell['f'] is not None or cell['v'] is not None:
                    should_eval = 'always' if cell[
                        'f'] is not None and 'OFFSET' in cell['f'] else 'normal'

                    # cleaned_formula = cell['f']
                    cleaned_formula = cell['f'].replace(
                        ", ", ",") if cell['f'] is not None else None
                    if "!" in cell_address:
                        cells[cell_address] = Cell(cell_address,
                                                   sheet_name,
                                                   value=cell['v'],
                                                   formula=cleaned_formula,
                                                   should_eval=should_eval)
                    else:
                        cells[sheet_name + "!" + cell_address] = Cell(
                            cell_address,
                            sheet_name,
                            value=cell['v'],
                            formula=cleaned_formula,
                            should_eval=should_eval)

        if nb_hidden > 0:
            print('Ignored %i hidden cells in sheet %s' %
                  (nb_hidden, sheet_name))

    print('Nb of different functions %i' % len(functions))
    print(functions)

    for f in functions:
        if f not in existing:
            print('== Missing function: %s' % f)

    return cells
Esempio n. 10
0
def read_cells(archive, ignore_sheets = [], ignore_hidden = False):
    global debug

    print('___### Reading Cells from XLSX ###___')

    cells = {}

    functions = set()

    cts = dict(read_content_types(archive))

    strings_path = cts.get(SHARED_STRINGS) # source: https://bitbucket.org/openpyxl/openpyxl/src/93604327bce7aac5e8270674579af76d390e09c0/openpyxl/reader/excel.py?at=default&fileviewer=file-view-default
    if strings_path is not None:
        if strings_path.startswith("/"):
            strings_path = strings_path[1:]
        shared_strings = read_string_table(archive.read(strings_path))
    else:
        shared_strings = []

    for sheet in detect_worksheets(archive):
        sheet_name = sheet['title']

        function_map = {}

        if sheet_name in ignore_sheets: continue

        root = fromstring(archive.read(sheet['path'])) # it is necessary to use cElementTree from xml module, otherwise root.findall doesn't work as it should

        hidden_cols = False
        nb_hidden = 0

        if ignore_hidden:
            hidden_col_min = None
            hidden_col_max = None

            for col in root.findall('.//{%s}cols/*' % SHEET_MAIN_NS):
                if 'hidden' in col.attrib and col.attrib['hidden'] == '1':
                    hidden_cols = True
                    hidden_col_min = int(col.attrib['min'])
                    hidden_col_max = int(col.attrib['max'])

        for c in root.findall('.//{%s}c/*/..' % SHEET_MAIN_NS):
            cell_data_type = c.get('t', 'n') # if no type assigned, assign 'number'
            cell_address = c.attrib['r']

            skip = False

            if hidden_cols:
                found = re.search(CELL_REF_RE, cell_address)
                col = col2num(found.group(1))

                if col >= hidden_col_min and col <= hidden_col_max:
                    nb_hidden += 1
                    skip = True

            if not skip:
                cell = {'a': '%s!%s' % (sheet_name, cell_address), 'f': None, 'v': None}
                if debug: print('Cell', cell['a'])
                for child in c:
                    child_data_type = child.get('t', 'n') # if no type assigned, assign 'number'

                    if child.tag == '{%s}f' % SHEET_MAIN_NS :
                        if 'ref' in child.attrib: # the first cell of a shared formula has a 'ref' attribute
                            if debug: print('*** Found definition of shared formula ***', child.text, child.attrib['ref'])
                            if "si" in child.attrib:
                                function_map[child.attrib['si']] = (child.attrib['ref'], Translator(str('=' + child.text), cell_address)) # translator of openpyxl needs a unicode argument that starts with '='
                            # else:
                            #     print "Encountered cell with ref but not si: ", sheet_name, child.attrib['ref']
                        if child_data_type == 'shared':
                            if debug: print('*** Found child %s of shared formula %s ***' % (cell_address, child.attrib['si']))

                            ref = function_map[child.attrib['si']][0]
                            formula = function_map[child.attrib['si']][1]

                            translated = formula.translate_formula(cell_address)
                            cell['f'] = translated[1:] # we need to get rid of the '='

                        else:
                            cell['f'] = child.text

                    elif child.tag == '{%s}v' % SHEET_MAIN_NS :
                        if cell_data_type == 's' or cell_data_type == 'str': # value is a string
                            try: # if it fails, it means that cell content is a string calculated from a formula
                                cell['v'] = shared_strings[int(child.text)]
                            except:
                                cell['v'] = child.text
                        elif cell_data_type == 'b':
                            cell['v'] = bool(int(child.text))
                        elif cell_data_type == 'n':
                            cell['v'] = _cast_number(child.text)

                    elif child.text is None:
                        continue

                if cell['f'] is not None:

                    pattern = re.compile(r"([A-Z][A-Z0-9]*)\(")
                    found = re.findall(pattern, cell['f'])

                    map(lambda x: functions.add(x), found)

                if cell['f'] is not None or cell['v'] is not None:
                    should_eval = 'always' if cell['f'] is not None and 'OFFSET' in cell['f'] else 'normal'

                    # cleaned_formula = cell['f']
                    cleaned_formula = cell['f'].replace(", ", ",") if cell['f'] is not None else None
                    if "!" in cell_address:
                        cells[cell_address] = Cell(cell_address, sheet_name, value = cell['v'], formula = cleaned_formula, should_eval=should_eval)
                    else:
                        cells[sheet_name + "!" + cell_address] = Cell(cell_address, sheet_name, value = cell['v'], formula = cleaned_formula, should_eval=should_eval)

        if nb_hidden > 0:
            print('Ignored %i hidden cells in sheet %s' % (nb_hidden, sheet_name))

    print('Nb of different functions %i' % len(functions))
    print(functions)

    for f in functions:
        if f not in existing:
            print('== Missing function: %s' % f)

    return cells