Example #1
0
def parse_model(model):
    """Parse a model's Excel formula and return its operand ranges.

    The formula is tokenized with ``ExcelParser``. As a side effect the
    parser's pretty-printed output is written to stdout.

    :param model: An object whose ``formula`` attribute holds the formula
        handed to ``ExcelParser.parse``.
    :return: The value returned by ``ExcelParser.getOperandRanges()``.
    """

    parser = ExcelParser()
    # parse() is called for its effect on the parser; its return value was
    # previously bound to an unused local.
    parser.parse(model.formula)
    print(parser.prettyprint())
    return parser.getOperandRanges()
Example #2
0
def shunting_yard(expression, named_ranges, ref=None, tokenize_range=False):
    """
    Tokenize an excel formula expression into reverse polish notation

    Core algorithm taken from wikipedia with varargs extensions from
    http://www.kallisti.net.nz/blog/2008/02/extension-to-the-shunting-yard-algorithm-to-allow-variable-numbers-of-arguments-to-functions/


    The ref is the cell address which is passed down to the actual compiled python code.
    Range basic operations signature require this reference, so it has to be written during OperatorNode.emit()
    https://github.com/iOiurson/koala/blob/master/koala/ast/graph.py#L292.

    This is needed because Excel range basic operations (+, -, * ...) are applied on matching cells.

    Example:
    Cell C2 has the following formula 'A1:A3 + B1:B3'.
    The output will actually be A2 + B2, because the formula is relative to cell C2.

    :param expression: Formula text; a single leading '=' is stripped.
    :param named_ranges: Container queried with ``in``; 'range' operands
        whose value is found in it are re-tagged as 'named_range'.
    :param ref: Passed unchanged to every ``create_node`` call.
    :param tokenize_range: Forwarded to ``ExcelParser``. When falsy, tokens
        around a ':' are merged into single 'pointer' operand tokens before
        the shunting-yard pass.
    :return: List of the nodes built by ``create_node``, in output (RPN)
        order. Function nodes get a ``num_args`` attribute.
    :raises Exception: "Mismatched or misplaced parentheses" when brackets
        do not balance or an argument separator has no enclosing function.
    """

    # remove leading =
    if expression.startswith('='):
        expression = expression[1:]

    p = ExcelParser(tokenize_range=tokenize_range)
    p.parse(expression)

    # insert tokens for '(' and ')', to make things clearer below
    tokens = []
    for t in p.tokens.items:
        if t.ttype == "function" and t.tsubtype == "start":
            # Function token loses its 'start' marker; an explicit arglist
            # '(' token carries it instead.
            t.tsubtype = ""
            tokens.append(t)
            tokens.append(f_token('(', 'arglist', 'start'))
        elif t.ttype == "function" and t.tsubtype == "stop":
            # The parser's function-stop token is replaced, not kept.
            tokens.append(f_token(')', 'arglist', 'stop'))
        else:
            if t.ttype == "subexpression" and t.tsubtype == "start":
                t.tvalue = '('
            elif t.ttype == "subexpression" and t.tsubtype == "stop":
                t.tvalue = ')'
            elif (t.ttype == "operand" and t.tsubtype == "range"
                    and t.tvalue in named_ranges):
                t.tsubtype = "named_range"
            tokens.append(t)

    # http://office.microsoft.com/en-us/excel-help/calculation-operators-and-precedence-HP010078886.aspx
    # (lookup key, operator symbol, precedence); every operator is
    # registered as left-associative.
    # NOTE(review): the second entry is keyed by '' but its symbol is ' ' --
    # confirm which tvalue the parser emits for that operator.
    operator_table = (
        (':', ':', 8),
        ('', ' ', 8),
        (',', ',', 8),
        ('u-', 'u-', 7),  # unary negation
        ('%', '%', 6),
        ('^', '^', 5),
        ('*', '*', 4),
        ('/', '/', 4),
        ('+', '+', 3),
        ('-', '-', 3),
        ('&', '&', 2),
        ('=', '=', 1),
        ('<', '<', 1),
        ('>', '>', 1),
        ('<=', '<=', 1),
        ('>=', '>=', 1),
        ('<>', '<>', 1),
    )
    operators = {
        key: Operator(symbol, precedence, 'left')
        for key, symbol, precedence in operator_table
    }

    def operator_for(tok):
        # A '-' carried by a '-prefix' operator token is unary negation.
        if tok.ttype.endswith('-prefix') and tok.tvalue == "-":
            return operators['u-']
        return operators[tok.tvalue]

    output = collections.deque()
    stack = []
    were_values = []
    arg_count = []

    new_tokens = []

    # reconstruct expressions with ':' and replace the corresponding tokens by the reconstructed expression
    if not tokenize_range:
        for index, token in enumerate(tokens):
            new_tokens.append(token)

            if isinstance(token.tvalue, str):

                if token.tvalue.startswith(':'):  # example -> :OFFSET( or simply :A10
                    depth = 0
                    expr = ''

                    rev = reversed(tokens[:index])

                    for t in rev:  # going backwards, 'stop' starts, 'start' stops
                        if t.tsubtype == 'stop':
                            depth += 1
                        elif depth > 0 and t.tsubtype == 'start':
                            depth -= 1

                        expr = t.tvalue + expr

                        new_tokens.pop()

                        if depth == 0:
                            # these 2 lines are needed to remove INDEX()
                            new_tokens.pop()
                            new_tokens.pop()
                            expr = next(rev).tvalue + expr
                            break

                    expr += token.tvalue

                    depth = 0

                    if token.tvalue[1:] in ['OFFSET', 'INDEX']:
                        for t in tokens[(index + 1):]:
                            if t.tsubtype == 'start':
                                depth += 1
                            elif depth > 0 and t.tsubtype == 'stop':
                                depth -= 1

                            expr += t.tvalue

                            # Consumed tokens are removed from `tokens`, the
                            # list the outer loop is iterating, so they are
                            # not visited again.
                            tokens.remove(t)

                            if depth == 0:
                                break

                    new_tokens.append(f_token(expr, 'operand', 'pointer'))

                elif ':OFFSET' in token.tvalue or ':INDEX' in token.tvalue:  # example -> A1:OFFSET(
                    depth = 0
                    expr = ''

                    expr += token.tvalue

                    for t in tokens[(index + 1):]:
                        if t.tsubtype == 'start':
                            depth += 1
                        elif t.tsubtype == 'stop':
                            depth -= 1

                        expr += t.tvalue

                        # Same in-place removal as above: the folded tokens
                        # must not be seen again by the outer loop.
                        tokens.remove(t)

                        if depth == 0:
                            new_tokens.pop()
                            break

                    new_tokens.append(f_token(expr, 'operand', 'pointer'))

    tokens = new_tokens if new_tokens else tokens

    for t in tokens:

        if t.ttype == "operand":
            output.append(create_node(t, ref))
            if were_values:
                # The innermost open function has seen a value.
                were_values[-1] = True

        elif t.ttype == "function":
            stack.append(t)
            arg_count.append(0)
            if were_values:
                # A nested call counts as a value of the enclosing function.
                were_values[-1] = True
            were_values.append(False)

        elif t.ttype == "argument":

            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            # Checked before touching were_values/arg_count so a separator
            # outside any function call reports the intended error rather
            # than an IndexError from an empty list.
            if not stack or not were_values:
                raise Exception("Mismatched or misplaced parentheses")

            if were_values[-1]:
                arg_count[-1] += 1
            were_values[-1] = False

        elif t.ttype.startswith('operator'):

            o1 = operator_for(t)

            while stack and stack[-1].ttype.startswith('operator'):
                o2 = operator_for(stack[-1])

                if ((o1.associativity == "left"
                     and o1.precedence <= o2.precedence)
                        or (o1.associativity == "right"
                            and o1.precedence < o2.precedence)):
                    output.append(create_node(stack.pop(), ref))
                else:
                    break
            stack.append(t)

        elif t.tsubtype == "start":
            stack.append(t)

        elif t.tsubtype == "stop":

            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            if not stack:
                raise Exception("Mismatched or misplaced parentheses")
            stack.pop()  # discard the matching 'start' token

            if stack and stack[-1].ttype == "function":
                f = create_node(stack.pop(), ref)
                a = arg_count.pop()
                w = were_values.pop()
                if w:
                    # The last argument has no trailing separator.
                    a += 1
                f.num_args = a
                output.append(f)

    while stack:
        if stack[-1].tsubtype == "start" or stack[-1].tsubtype == "stop":
            raise Exception("Mismatched or misplaced parentheses")

        output.append(create_node(stack.pop(), ref))

    # convert to list
    return list(output)
Example #3
0
def shunting_yard(expression, named_ranges, ref=None, tokenize_range=False):
    """
    Tokenize an excel formula expression into reverse polish notation

    Core algorithm taken from wikipedia with varargs extensions from
    http://www.kallisti.net.nz/blog/2008/02/extension-to-the-shunting-yard-algorithm-to-allow-variable-numbers-of-arguments-to-functions/


    The ref is the cell address which is passed down to the actual compiled python code.
    Range basic operations signature require this reference, so it has to be written during OperatorNode.emit()
    https://github.com/iOiurson/koala/blob/master/koala/ast/graph.py#L292.

    This is needed because Excel range basic operations (+, -, * ...) are applied on matching cells.

    Example:
    Cell C2 has the following formula 'A1:A3 + B1:B3'.
    The output will actually be A2 + B2, because the formula is relative to cell C2.

    :param expression: Formula text; a single leading '=' is stripped.
    :param named_ranges: Container queried with ``in``; 'range' operands
        whose value is found in it are re-tagged as 'named_range'.
    :param ref: Passed unchanged to every ``create_node`` call.
    :param tokenize_range: Forwarded to ``ExcelParser``. When falsy, tokens
        around a ':' are merged into single 'pointer' operand tokens before
        the shunting-yard pass.
    :return: List of the nodes built by ``create_node``, in output (RPN)
        order. Function nodes get a ``num_args`` attribute.
    :raises Exception: "Mismatched or misplaced parentheses" when brackets
        do not balance or an argument separator has no enclosing function.
    """

    # remove leading =
    if expression.startswith('='):
        expression = expression[1:]

    p = ExcelParser(tokenize_range=tokenize_range)
    p.parse(expression)

    # insert tokens for '(' and ')', to make things clearer below
    tokens = []
    for t in p.tokens.items:
        if t.ttype == "function" and t.tsubtype == "start":
            # Function token loses its 'start' marker; an explicit arglist
            # '(' token carries it instead.
            t.tsubtype = ""
            tokens.append(t)
            tokens.append(f_token('(', 'arglist', 'start'))
        elif t.ttype == "function" and t.tsubtype == "stop":
            # The parser's function-stop token is replaced, not kept.
            tokens.append(f_token(')', 'arglist', 'stop'))
        else:
            if t.ttype == "subexpression" and t.tsubtype == "start":
                t.tvalue = '('
            elif t.ttype == "subexpression" and t.tsubtype == "stop":
                t.tvalue = ')'
            elif (t.ttype == "operand" and t.tsubtype == "range"
                    and t.tvalue in named_ranges):
                t.tsubtype = "named_range"
            tokens.append(t)

    # http://office.microsoft.com/en-us/excel-help/calculation-operators-and-precedence-HP010078886.aspx
    # (lookup key, operator symbol, precedence); every operator is
    # registered as left-associative.
    # NOTE(review): the second entry is keyed by '' but its symbol is ' ' --
    # confirm which tvalue the parser emits for that operator.
    operator_table = (
        (':', ':', 8),
        ('', ' ', 8),
        (',', ',', 8),
        ('u-', 'u-', 7),  # unary negation
        ('%', '%', 6),
        ('^', '^', 5),
        ('*', '*', 4),
        ('/', '/', 4),
        ('+', '+', 3),
        ('-', '-', 3),
        ('&', '&', 2),
        ('=', '=', 1),
        ('<', '<', 1),
        ('>', '>', 1),
        ('<=', '<=', 1),
        ('>=', '>=', 1),
        ('<>', '<>', 1),
    )
    operators = {
        key: Operator(symbol, precedence, 'left')
        for key, symbol, precedence in operator_table
    }

    def operator_for(tok):
        # A '-' carried by a '-prefix' operator token is unary negation.
        if tok.ttype.endswith('-prefix') and tok.tvalue == "-":
            return operators['u-']
        return operators[tok.tvalue]

    output = collections.deque()
    stack = []
    were_values = []
    arg_count = []

    new_tokens = []

    # reconstruct expressions with ':' and replace the corresponding tokens by the reconstructed expression
    if not tokenize_range:
        for index, token in enumerate(tokens):
            new_tokens.append(token)

            # six.string_types covers str/unicode on Python 2 and str on
            # Python 3; the bare name `unicode` is undefined on Python 3 and
            # raised NameError for any non-str tvalue.
            if isinstance(token.tvalue, six.string_types):

                if token.tvalue.startswith(':'):  # example -> :OFFSET( or simply :A10
                    depth = 0
                    expr = ''

                    rev = reversed(tokens[:index])

                    for t in rev:  # going backwards, 'stop' starts, 'start' stops
                        if t.tsubtype == 'stop':
                            depth += 1
                        elif depth > 0 and t.tsubtype == 'start':
                            depth -= 1

                        expr = t.tvalue + expr

                        new_tokens.pop()

                        if depth == 0:
                            # these 2 lines are needed to remove INDEX()
                            new_tokens.pop()
                            new_tokens.pop()
                            expr = six.next(rev).tvalue + expr
                            break

                    expr += token.tvalue

                    depth = 0

                    if token.tvalue[1:] in ['OFFSET', 'INDEX']:
                        for t in tokens[(index + 1):]:
                            if t.tsubtype == 'start':
                                depth += 1
                            elif depth > 0 and t.tsubtype == 'stop':
                                depth -= 1

                            expr += t.tvalue

                            # Consumed tokens are removed from `tokens`, the
                            # list the outer loop is iterating, so they are
                            # not visited again.
                            tokens.remove(t)

                            if depth == 0:
                                break

                    new_tokens.append(f_token(expr, 'operand', 'pointer'))

                elif ':OFFSET' in token.tvalue or ':INDEX' in token.tvalue:  # example -> A1:OFFSET(
                    depth = 0
                    expr = ''

                    expr += token.tvalue

                    for t in tokens[(index + 1):]:
                        if t.tsubtype == 'start':
                            depth += 1
                        elif t.tsubtype == 'stop':
                            depth -= 1

                        expr += t.tvalue

                        # Same in-place removal as above: the folded tokens
                        # must not be seen again by the outer loop.
                        tokens.remove(t)

                        if depth == 0:
                            new_tokens.pop()
                            break

                    new_tokens.append(f_token(expr, 'operand', 'pointer'))

    tokens = new_tokens if new_tokens else tokens

    for t in tokens:

        if t.ttype == "operand":
            output.append(create_node(t, ref))
            if were_values:
                # The innermost open function has seen a value.
                were_values[-1] = True

        elif t.ttype == "function":
            stack.append(t)
            arg_count.append(0)
            if were_values:
                # A nested call counts as a value of the enclosing function.
                were_values[-1] = True
            were_values.append(False)

        elif t.ttype == "argument":

            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            # Checked before touching were_values/arg_count so a separator
            # outside any function call reports the intended error rather
            # than an IndexError from an empty list.
            if not stack or not were_values:
                raise Exception("Mismatched or misplaced parentheses")

            if were_values[-1]:
                arg_count[-1] += 1
            were_values[-1] = False

        elif t.ttype.startswith('operator'):

            o1 = operator_for(t)

            while stack and stack[-1].ttype.startswith('operator'):
                o2 = operator_for(stack[-1])

                if ((o1.associativity == "left"
                     and o1.precedence <= o2.precedence)
                        or (o1.associativity == "right"
                            and o1.precedence < o2.precedence)):
                    output.append(create_node(stack.pop(), ref))
                else:
                    break
            stack.append(t)

        elif t.tsubtype == "start":
            stack.append(t)

        elif t.tsubtype == "stop":

            while stack and stack[-1].tsubtype != "start":
                output.append(create_node(stack.pop(), ref))

            if not stack:
                raise Exception("Mismatched or misplaced parentheses")
            stack.pop()  # discard the matching 'start' token

            if stack and stack[-1].ttype == "function":
                f = create_node(stack.pop(), ref)
                a = arg_count.pop()
                w = were_values.pop()
                if w:
                    # The last argument has no trailing separator.
                    a += 1
                f.num_args = a
                output.append(f)

    while stack:
        if stack[-1].tsubtype == "start" or stack[-1].tsubtype == "stop":
            raise Exception("Mismatched or misplaced parentheses")

        output.append(create_node(stack.pop(), ref))

    # convert to list
    return list(output)