def parse_article_id(tokens, i, node):
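    """Parse an article identifier starting at token i: a number with an
    optional dash suffix ('42', '42-1'), optionally prefixed with 'L.' and
    optionally followed by a capital letter and/or a multiplicative adverb
    ('L. 132-1 A bis'). The identifier is stored in node['id']; returns the
    index of the first token after it."""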
    node['id'] = ''

    # L. {articleId}
    if i + 1 < len(tokens) and tokens[i] == 'L' and tokens[i + 1] == '.':
        while (i < len(tokens)
               and not re.compile(r'\d+(-\d+)?').match(tokens[i])):
            node['id'] += tokens[i]
            i += 1

    if i < len(tokens) and re.compile(r'\d+(-\d+)?').match(tokens[i]):
        node['id'] += tokens[i]
        # skip {articleId} and the following space
        i += 1
        i = alinea_lexer.skip_spaces(tokens, i)

    # {articleId} {articleLetter}
    # FIXME: handle the {articleLetter}{multiplicativeAdverb} case?
    if i < len(tokens) and re.compile('^[A-Z]$').match(tokens[i]):
        node['id'] += ' ' + tokens[i]
        # skip {articleLetter} and the following space
        i += 1
        i = alinea_lexer.skip_spaces(tokens, i)

    i = parse_multiplicative_adverb(tokens, i, node)

    if not node['id'] or is_space(node['id']):
        del node['id']

    return i
def parse_definition_list(tokens, i, parent):
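    """Parse a list of definitions separated by ', ... à/au' or 'et' and
    attach them to parent, then parse the quoted contents announced by a
    trailing 'rédigé(e)(s)'. Returns the new token index."""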
    if i >= len(tokens):
        return i

    i = parse_definition(tokens, i, parent)
    i = alinea_lexer.skip_spaces(tokens, i)
    if ((i + 2 < len(tokens) and tokens[i] == u','
         and tokens[i + 2] in [u'à', u'au'])
            or (i + 2 < len(tokens) and tokens[i] == u'et')):
        i = parse_definition_list(tokens, i + 2, parent)
    i = alinea_lexer.skip_spaces(tokens, i)

    # est rédigé(es)
    # ainsi rédigé(es)
    # est ainsi rédigé(es)
    if ((i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé'))
            or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
        i += 6
        def_nodes = filter_nodes(
            parent, lambda x: 'type' in x and x['type'] in def_types)
        for def_node in def_nodes:
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_quote(tokens, i, def_node)

    return i
def parse_header2_definition(tokens, i, parent):
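    """Parse the definition of one or more header-2 items ('un 3°',
    'un ...°', 'des 1° à 4°') and their quoted contents, adding 'header2'
    nodes to parent. Returns the new token index."""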
    if i >= len(tokens):
        return i

    debug(parent, tokens, i, 'parse_header2_definition')

    # un ... ° ({articlePartRef})
    if (i + 6 < len(tokens) and tokens[i].lower() == u'un'
            and ''.join(tokens[i + 2:i + 5]) == u'...'
            and tokens[i + 6] == u'°'):
        node = create_node(parent, {'type': 'header2', 'children': []})
        # FIXME: should we simply ignore the 'order' field all together?
        node['order'] = '...'
        i += 8
        i = alinea_lexer.skip_spaces(tokens, i)
        if (i + 2 < len(tokens) and tokens[i] == u'ainsi'
                and tokens[i + 2] == u'rédigé'):
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_quote(tokens, i, node)
    # un {order}° ({orderLetter}) ({multiplicativeAdverb}) ({articlePartRef})
    elif (i + 2 < len(tokens) and tokens[i].lower() == u'un'
          and re.compile(r'\d+°').match(tokens[i + 2])):
        node = create_node(parent, {'type': 'header2', 'children': []})
        node['order'] = parse_int(tokens[i + 2])
        i += 4
        if i < len(tokens) and re.compile(r'[A-Z]').match(tokens[i]):
            node['subOrder'] = tokens[i]
            i += 2
        i = parse_multiplicative_adverb(tokens, i, node)
        i = parse_article_part_reference(tokens, i, node)
        i = alinea_lexer.skip_spaces(tokens, i)
        if (i + 2 < len(tokens) and tokens[i] == u'ainsi'
                and tokens[i + 2] == u'rédigé'):
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_quote(tokens, i, node)
    # des {start}° à {end}°
    elif (i + 6 < len(tokens) and tokens[i].lower() == u'des'
          and re.compile(r'\d+°').match(tokens[i + 2])
          and tokens[i + 4] == u'à'
          and re.compile(r'\d+°').match(tokens[i + 6])):
        start = parse_int(tokens[i + 2])
        end = parse_int(tokens[i + 6])
        i += 8
        # ainsi rédigés
        if ((i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé'))
                or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
            i = alinea_lexer.skip_to_quote_start(tokens, i + 4)
            i = parse_for_each(
                parse_quote, tokens, i,
                lambda: create_node(parent, {
                    'type': 'header2',
                    'children': []
                }))
    else:
        debug(parent, tokens, i, 'parse_header2_definition end')
        return i

    return i
def parse_reference_list(tokens, i, parent):
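    """Parse a list of references separated by ', ... à/au' or 'et' and
    attach them to parent. Returns the new token index."""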
    if i >= len(tokens):
        return i

    i = parse_reference(tokens, i, parent)
    i = alinea_lexer.skip_spaces(tokens, i)
    if ((i + 2 < len(tokens) and tokens[i] == u','
         and tokens[i + 2] in [u'à', u'au'])
            or (i + 2 < len(tokens) and tokens[i] == u'et')):
        i = parse_reference_list(tokens, i + 2, parent)
    i = alinea_lexer.skip_spaces(tokens, i)

    return i
def parse_alinea_definition(tokens, i, parent):
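    """Parse the definition of '{count} alinéa(s)' and, when announced by
    'rédigé(s)', their quoted contents, adding 'alinea' nodes to parent.
    Returns the new token index."""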
    if i >= len(tokens):
        return i

    debug(parent, tokens, i, 'parse_alinea_definition')

    # {count} alinéa(s)
    if (i + 2 < len(tokens) and is_number_word(tokens[i])
            and tokens[i + 2].startswith(u'alinéa')):
        count = word_to_number(tokens[i])
        i += 4
        # ainsi rédigé
        # est rédigé
        # est ainsi rédigé
        if ((i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé'))
                or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
            # we expect {count} definitions => {count} quotes, but they don't
            # always match, so for now we parse all of the available contents
            # FIXME: issue a warning because the expected count doesn't match?
            i = alinea_lexer.skip_spaces(tokens, i)
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_for_each(
                parse_quote, tokens, i,
                lambda: create_node(parent, {
                    'type': 'alinea',
                    'children': []
                }))
        else:
            node = create_node(parent, {'type': 'alinea', 'count': count})
    else:
        debug(parent, tokens, i, 'parse_alinea_definition none')
        return i

    debug(parent, tokens, i, 'parse_alinea_definition end')

    return i
def parse_article_definition(tokens, i, parent):
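    """Parse the definition of an article ('un article L. 42-1 ainsi
    rédigé : ...') and its quoted contents, adding an 'article' node to
    parent. Returns the new token index."""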
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'article',
        'children': [],
    })
    debug(parent, tokens, i, 'parse_article_definition')

    # un article
    if (i + 2 < len(tokens) and tokens[i] == u'un'
            and tokens[i + 2] == u'article'):
        i += 4
    # l'article
    elif (i + 2 < len(tokens) and tokens[i] == u'l'
          and tokens[i + 2] == u'article'):
        i += 4
    else:
        debug(parent, tokens, i, 'parse_article_definition none')
        remove_node(parent, node)
        return i

    i = parse_article_id(tokens, i, node)

    i = alinea_lexer.skip_spaces(tokens, i)
    if (i + 2 < len(tokens) and tokens[i] == u'ainsi'
            and tokens[i + 2] == u'rédigé'):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)

    debug(parent, tokens, i, 'parse_article_definition end')

    return i
def parse_title_definition(tokens, i, parent):
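    """Parse the definition of a title ('un titre III ainsi rédigé : ...')
    and its quoted contents, adding a 'title' node to parent. Returns the
    new token index."""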
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'title',
        'children': [],
    })

    debug(parent, tokens, i, 'parse_title_definition')

    # un titre {order}
    if (i + 4 < len(tokens) and tokens[i].lower() == u'un'
            and tokens[i + 2] == u'titre' and is_roman_number(tokens[i + 4])):
        node['order'] = parse_roman_number(tokens[i + 4])
        i += 6
        i = parse_multiplicative_adverb(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_title_definition none')
        remove_node(parent, node)
        return i

    i = alinea_lexer.skip_spaces(tokens, i)
    if (i + 2 < len(tokens) and tokens[i] == u'ainsi'
            and tokens[i + 2] == u'rédigé'):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)

    debug(parent, tokens, i, 'parse_title_definition end')

    return i
def parse_bill_header3(tokens, i, parent):
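    """Parse a level-3 bill header introduced by a letter ('a)' or
    'a (nouveau))') and the edit it contains, adding a 'bill-header3'
    node to parent. Returns the new token index."""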
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'bill-header3',
        'children': [],
    })

    debug(parent, tokens, i, 'parse_bill_header3')

    i = alinea_lexer.skip_spaces(tokens, i)
    match = (re.compile('([a-z]+)').match(tokens[i])
             if i < len(tokens) else None)
    if match and ((i + 1 < len(tokens) and tokens[i + 1] == u')')
                  or (i + 5 < len(tokens) and tokens[i + 2] == u'('
                      and tokens[i + 5] == u')')):
        # orders are 1-based: 'a' => 1, 'b' => 2, ...
        node['order'] = ord(match.group()[0]) - ord('a') + 1
        # skip '{letter}) ' or '{letter} (nouveau)) '
        if tokens[i + 1] == u')':
            i += 3
        else:
            i += 7
        # i = parse_edit(tokens, i, node)
    else:
        remove_node(parent, node)
        node = parent

    i = parse_edit(tokens, i, node)

    if node != parent and len(node['children']) == 0:
        remove_node(parent, node)

    debug(parent, tokens, i, 'parse_bill_header3 end')

    return i
def parse_bill_header2(tokens, i, parent):
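    """Parse a level-2 bill header introduced by '{number}°', then its
    edit and any level-3 sub-headers, adding a 'bill-header2' node to
    parent. Returns the new token index."""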
    if i >= len(tokens):
        return i

    node = create_node(parent, {
        'type': 'bill-header2',
        'order': 0,
        'children': [],
    })

    debug(parent, tokens, i, 'parse_bill_header2')

    i = alinea_lexer.skip_spaces(tokens, i)
    if i < len(tokens) and re.compile(r'\d+°').match(tokens[i]):
        debug(parent, tokens, i, 'parse_bill_header2 found article header-2')

        node['order'] = parse_int(tokens[i])
        # skip {number}°
        i = alinea_lexer.skip_to_next_word(tokens, i + 2)
    else:
        remove_node(parent, node)
        node = parent

    i = parse_edit(tokens, i, node)
    i = parse_for_each(parse_bill_header3, tokens, i, node)

    if node != parent and len(node['children']) == 0:
        remove_node(parent, node)

    debug(parent, tokens, i, 'parse_bill_header2 end')

    return i
def parse_bill_header1(tokens, i, parent):
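    """Parse a level-1 bill header introduced by '{romanNumber}.', then
    its edit and any level-2 sub-headers, adding a 'bill-header1' node to
    parent (falling back to raw article content when no edit is found).
    Returns the new token index."""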
    if i >= len(tokens):
        return i

    i = alinea_lexer.skip_spaces(tokens, i)

    node = create_node(parent, {
        'type': 'bill-header1',
        'order': 0,
        'children': [],
    })

    debug(parent, tokens, i, 'parse_bill_header1')

    # skip '{romanNumber}.'
    if (i + 1 < len(tokens) and is_roman_number(tokens[i])
            and tokens[i + 1] == u'.'):
        debug(parent, tokens, i, 'parse_bill_header1 found article header-1')
        node['order'] = parse_roman_number(tokens[i])
        i = alinea_lexer.skip_to_next_word(tokens, i + 2)

    j = i
    i = parse_edit(tokens, i, node)
    i = parse_for_each(parse_bill_header2, tokens, i, node)
    if i == j:
        i = parse_raw_article_content(tokens, i, node)
    if len(node['children']) == 0:
        remove_node(parent, node)
    else:
        node['order'] = len(
            [x for x in parent['children'] if x['type'] == node['type']])

    debug(parent, tokens, i, 'parse_bill_header1 end')

    return i
def parse_law_reference(tokens, i, parent):
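    """Parse a reference to a law or ordonnance ('la loi n° 78-17 du
    6 janvier 1978'), filling in 'lawId' and, when present, 'lawType' and
    'lawDate' on a 'law-reference' node. Returns the new token index."""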
    if i >= len(tokens):
        return i

    j = i

    node = create_node(parent, {
        'type': 'law-reference',
        'lawId': '',
        'children': [],
    })

    debug(parent, tokens, i, 'parse_law_reference')

    # de l'ordonnance
    # l'ordonnance
    if i + 4 < len(tokens) and (tokens[i + 2] == u'ordonnance'
                                or tokens[i + 4] == u'ordonnance'):
        node['lawType'] = 'ordonnance'
        i = alinea_lexer.skip_to_token(tokens, i, u'ordonnance') + 2
    # de la loi
    # la loi
    elif i + 4 < len(tokens) and (
        (tokens[i] == u'la' and tokens[i + 2] == u'loi') or
        (tokens[i] == u'de' and tokens[i + 4] == u'loi')):
        i = alinea_lexer.skip_to_token(tokens, i, u'loi') + 2
    else:
        remove_node(parent, node)
        return i

    if i < len(tokens) and tokens[i] == u'organique':
        node['lawType'] = 'organic'
        i += 2

    i = alinea_lexer.skip_to_token(tokens, i, u'n°') + 1
    # If we didn't find the "n°" token, the reference is incomplete and we forget about it.
    # FIXME: we might have to handle the "la même ordonnance" or "la même loi" incomplete reference cases.
    if i >= len(tokens):
        remove_node(parent, node)
        return j

    i = alinea_lexer.skip_spaces(tokens, i)
    if i >= len(tokens):
        remove_node(parent, node)
        return j
    node['lawId'] = tokens[i]
    # skip {lawId} and the following space
    i += 2

    if i + 6 < len(tokens) and tokens[i] == u'du':
        node['lawDate'] = tokens[i + 6] + u'-' + str(
            month_to_number(tokens[i + 4])) + u'-' + tokens[i + 2]
        # skip 'du {day} {month} {year}'
        i += 7


    debug(parent, tokens, i, 'parse_law_reference end')

    return i
def parse_multiplicative_adverb(tokens, i, node):
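    """Detect a latin multiplicative adverb ('bis', 'ter', ...) at token i
    and flag it on node (e.g. node['isBis'] = True). Returns the new token
    index."""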
    if i >= len(tokens):
        return i

    # try the longest adverbs first so that a longer adverb is not shadowed
    # by a shorter one it ends with
    adverbs = sorted(alinea_lexer.TOKEN_MULTIPLICATIVE_ADVERBS,
                     key=lambda s: -len(s))
    for adverb in adverbs:
        if tokens[i].endswith(adverb):
            node['is' + adverb.title()] = True
            # skip {multiplicativeAdverb} and the following space
            i += 1
            i = alinea_lexer.skip_spaces(tokens, i)
            return i
    return i
def parse_quote(tokens, i, parent):
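    """Parse a double-quoted span of tokens up to the closing quote or the
    end of the line, storing its text in a 'quote' node added to parent.
    Returns the new token index."""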
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'quote', 'words': ''})

    debug(parent, tokens, i, 'parse_quote')

    i = alinea_lexer.skip_spaces(tokens, i)

    # "
    if i < len(tokens) and tokens[i] == alinea_lexer.TOKEN_DOUBLE_QUOTE_OPEN:
        i += 1
    # # est rédigé(es)
    # # ainsi rédigé(es)
    # # est ainsi rédigé(es)
    # elif (i + 2 < len(tokens) and tokens[i + 2].startswith(u'rédigé')
    #     or (i + 4 < len(tokens) and tokens[i + 4].startswith(u'rédigé'))):
    #     i = alinea_lexer.skip_to_quote_start(tokens, i + 2) + 1
    else:
        remove_node(parent, node)
        return i

    while (i < len(tokens)
           and tokens[i] != alinea_lexer.TOKEN_DOUBLE_QUOTE_CLOSE
           and tokens[i] != alinea_lexer.TOKEN_NEW_LINE):
        node['words'] += tokens[i]
        i += 1

    # skip alinea_lexer.TOKEN_DOUBLE_QUOTE_CLOSE
    i += 1
    i = alinea_lexer.skip_spaces(tokens, i)

    debug(parent, tokens, i, 'parse_quote end')

    return i
def parse_header1_definition(tokens, i, parent):
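    """Parse the definition of a header-1 item ('un II ainsi rédigé : ...')
    and its quoted contents, adding a 'header1' node to parent. Returns
    the new token index."""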
    if i >= len(tokens):
        return i
    node = create_node(parent, {'type': 'header1', 'children': []})
    debug(parent, tokens, i, 'parse_header1_definition')
    # un {romanPartNumber}
    if (i + 2 < len(tokens) and tokens[i].lower() == u'un'
            and is_roman_number(tokens[i + 2])):
        node['order'] = parse_roman_number(tokens[i + 2])
        i += 4
        i = alinea_lexer.skip_spaces(tokens, i)
        if (i + 2 < len(tokens) and tokens[i] == u'ainsi'
                and tokens[i + 2] == u'rédigé'):
            i = alinea_lexer.skip_to_quote_start(tokens, i)
            i = parse_quote(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_header1_definition none')
        remove_node(parent, node)
        return i

    return i
def parse_words_definition(tokens, i, parent):
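    """Parse the definition of quoted words, numbers or references
    ('les mots : "..."', 'le nombre : "..."'), adding a 'words' node to
    parent. Returns the new token index."""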
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'words', 'children': []})
    debug(parent, tokens, i, 'parse_words_definition')

    j = i
    i = parse_position(tokens, i, node)
    # le mot
    # les mots
    # des mots
    if (i + 2 < len(tokens) and tokens[i].lower() in [u'le', u'les', u'des']
            and tokens[i + 2].startswith(u'mot')):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_for_each(parse_quote, tokens, i, node)
        # i = alinea_lexer.skip_spaces(tokens, i)
    # le nombre
    # le chiffre
    elif (i + 2 < len(tokens) and tokens[i].lower() == u'le'
          and tokens[i + 2] in [u'nombre', u'chiffre']):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    # "
    elif i < len(tokens) and tokens[i] == alinea_lexer.TOKEN_DOUBLE_QUOTE_OPEN:
        i = parse_for_each(parse_quote, tokens, i, node)
        i = alinea_lexer.skip_spaces(tokens, i)
    # la référence
    elif (i + 2 < len(tokens) and tokens[i] == u'la'
          and tokens[i + 2] == u'référence'):
        i = alinea_lexer.skip_to_quote_start(tokens, i)
        i = parse_quote(tokens, i, node)
    else:
        debug(parent, tokens, i, 'parse_words_definition none')
        remove_node(parent, node)
        return j
    debug(parent, tokens, i, 'parse_words_definition end')
    return i
def parse_article_reference(tokens, i, parent):
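    """Parse a reference to an article ("à l'article L. 42-1", 'le même
    article', ...), followed by an optional law, code, words or alinéa
    reference, adding an 'article-reference' node to parent. Returns the
    new token index."""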
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'article-reference', 'id': ''})

    debug(parent, tokens, i, 'parse_article_reference')

    j = i
    i = parse_position(tokens, i, node)
    # de l'article
    # à l'article
    if (i + 4 < len(tokens) and tokens[i].lower() in [u'de', u'à']
            and tokens[i + 2] == u'l' and tokens[i + 4] == u'article'):
        i += 5
        i = alinea_lexer.skip_spaces(tokens, i)
    # l'article
    elif (i + 2 < len(tokens) and tokens[i].lower() == u'l'
          and tokens[i + 1] == alinea_lexer.TOKEN_SINGLE_QUOTE
          and tokens[i + 2] == u'article'):
        i += 3
        i = alinea_lexer.skip_spaces(tokens, i)
    # elif tokens[i] == u'un' and tokens[i + 2] == u'article':
    #     i += 4
    # Article {articleNumber}
    elif i < len(tokens) and tokens[i].lower().startswith(u'article'):
        i += 1
        i = alinea_lexer.skip_spaces(tokens, i)
    # le même article
    elif (i + 4 < len(tokens) and tokens[i].lower() == u'le'
          and tokens[i + 2] == u'même' and tokens[i + 4] == u'article'):
        i += 6
        article_refs = filter_nodes(
            get_root(parent),
            lambda n: 'type' in n and n['type'] == 'article-reference')
        # the last one in order of traversal is the previous one in order of syntax
        # don't forget the current node is in the list too => -2 instead of -1
        article_ref = copy_node(article_refs[-2])
        push_node(parent, article_ref)
        remove_node(parent, node)
    else:
        remove_node(parent, node)
        return j

    i = parse_article_id(tokens, i, node)

    # i = parse_article_part_reference(tokens, i, node)
    # de la loi
    # de l'ordonnance
    # du code
    # les mots
    # l'alinéa
    i = parse_one_of([
        parse_law_reference, parse_code_reference, parse_words_reference,
        parse_alinea_reference
    ], tokens, i, node)

    # i = parse_quote(tokens, i, node)

    debug(parent, tokens, i, 'parse_article_reference end')

    return i
def parse_edit(tokens, i, parent):
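    """Parse a single edit: a reference list naming what is edited,
    followed by a verb phrase that determines 'editType' ('delete',
    'edit', 'replace', 'add' or 'rename') and the definition of the new
    contents. Returns the new token index."""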
    if i >= len(tokens):
        return i

    node = create_node(parent, {'type': 'edit'})

    debug(parent, tokens, i, 'parse_edit')

    r = i
    # i = parse_for_each(parse_reference, tokens, i, node)
    i = parse_reference_list(tokens, i, node)

    i = alinea_lexer.skip_spaces(tokens, i)

    # if we didn't find any reference as a subject and the subject/verb are not reversed
    if len(node['children']) == 0 and (
            i >= len(tokens) or tokens[i] not in (u'Est', u'Sont')):
        remove_node(parent, node)
        debug(parent, tokens, i, 'parse_edit none')
        return i
    # i = r

    i = alinea_lexer.skip_tokens(
        tokens, i,
        lambda t: t.lower() not in [u'est', u'sont', u'devient'] and t != u'.')
    if i + 2 >= len(tokens):
        remove_node(parent, node)
        debug(parent, tokens, i, 'parse_edit eof')
        return r

    # sont supprimés
    # sont supprimées
    # est supprimé
    # est supprimée
    # est abrogé
    # est abrogée
    # sont abrogés
    # sont abrogées
    if (tokens[i + 2].startswith(u'supprimé')
            or tokens[i + 2].startswith(u'abrogé')):
        node['editType'] = 'delete'
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # est ainsi rédigé
    # est ainsi rédigée
    # est ainsi modifié
    # est ainsi modifiée
    elif i + 4 < len(tokens) and (tokens[i + 4].startswith(u'rédigé')
                                  or tokens[i + 4].startswith(u'modifié')):
        node['editType'] = 'edit'
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        i = alinea_lexer.skip_spaces(tokens, i)
        i = parse_definition(tokens, i, node)
    # est remplacé par
    # est remplacée par
    # sont remplacés par
    # sont remplacées par
    elif tokens[i + 2].startswith(u'remplacé'):
        node['editType'] = 'replace'
        i += 6
        i = parse_definition(tokens, i, node)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # remplacer
    elif tokens[i].lower() == u'remplacer':
        node['editType'] = 'replace'
        i += 2
        # i = parse_definition(tokens, i, node)
        i = parse_reference(tokens, i, node)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        if i < len(tokens) and tokens[i].lower() == u'par':
            i += 2
            i = parse_definition(tokens, i, node)
            i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # est inséré
    # est insérée
    # sont insérés
    # sont insérées
    # est ajouté
    # est ajoutée
    # sont ajoutés
    # sont ajoutées
    elif (tokens[i + 2].startswith(u'inséré')
          or tokens[i + 2].startswith(u'ajouté')):
        node['editType'] = 'add'
        i += 4
        i = parse_definition(tokens, i, node)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # est ainsi rétabli
    elif i + 4 < len(tokens) and tokens[i + 4].startswith(u'rétabli'):
        node['editType'] = 'add'
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        i = alinea_lexer.skip_spaces(tokens, i)
        i = parse_definition(tokens, i, node)
    # est complété par
    elif tokens[i + 2] == u'complété':
        node['editType'] = 'add'
        i += 6
        # i = parse_definition(tokens, i, node)
        i = parse_definition_list(tokens, i, node)
        # i = alinea_lexer.skip_to_end_of_line(tokens, i)
    # devient
    elif tokens[i] == u'devient':
        node['editType'] = 'rename'
        i += 2
        i = parse_definition(tokens, i, node)
    else:
        i = r
        debug(parent, tokens, i, 'parse_edit remove')
        remove_node(parent, node)
        i = parse_raw_article_content(tokens, i, parent)
        i = alinea_lexer.skip_to_end_of_line(tokens, i)
        return i

    # We've parsed pretty much everything we could handle. At this point,
    # there should be no meaningful content left. But there might be trailing
    # spaces or punctuation (often "." or ";"), so we skip to the end of the
    # line.
    i = alinea_lexer.skip_to_end_of_line(tokens, i)

    debug(parent, tokens, i, 'parse_edit end')

    return i