Esempio n. 1
0
def spans(ptb_tree):
    '''Locate the quoted stretches of text in a PTB derivation.

    Returns a list of tuples (B, E, P) with P in ("``", "`"): the Bth token
    from the start and the Eth token from the end of the derivation delimit a
    P-quoted portion of the text.

    B is None when a close quote is met with no open quote pending (the span
    is assumed to start at the beginning of the string). E is None when an
    open quote is never closed (the span is assumed to run to the end of the
    string). Quote leaves themselves do not advance the token position.
    '''
    # Leaves that survive filtering while quotes are retained; this is the
    # sequence scanned below.
    kept = [node for node in leaves(ptb_tree)
            if not is_ignored(node, ignoring_quotes=False)]
    # Number of leaves left once quotes are filtered out as well (should be
    # equal to the CCG leaf count).
    # TODO: do this without incurring another full pass through the nodes list
    leaf_count = sum(1 for node in kept
                     if not is_ignored(node, ignoring_quotes=True))

    closer_for = {"``": "''", "`": "'"}
    found = []
    pending = []   # stack of (open quote, position) awaiting a close quote
    position = 0   # leaves seen so far that are not treated as quotes

    for node in kept:
        lex = node.lex

        # Open quote: remember where the span begins.
        if lex in ("``", "`"):
            pending.append((lex, position))
            continue

        # Only advance the position for a leaf corresponding to a CCGbank
        # leaf, i.e. anything not treated as a close quote. The check for
        # colon is to maintain derivation 21:61(24), which contains an
        # erroneously tagged single close quote.
        if node.tag in ("POS", ":") or lex not in ("''", "'"):
            position += 1
            continue

        # The span end is leaf_count-position, not that minus one: on reaching
        # the close quote we are already one position past the end of the
        # quoted span.
        end = leaf_count - position

        if pending:
            # Pair the close quote with the most recent open quote.
            opener, begin = pending.pop()
            if closer_for[opener] != lex:
                warn("Unbalanced quotes, abandoning.")
                break
            found.append((begin, end, opener))
        else:
            # Nothing pending: assume the quoted span starts from the
            # beginning of the string.
            if lex == "''":
                implied = "``"
            elif lex == "'":
                implied = "`"
            else:
                err("spans: should not reach")
            found.append((None, end, implied))

    # Open quotes still pending are assumed to continue to the end of the
    # string; they are reported most recently opened first.
    for opener, begin in reversed(pending):
        if opener in ("``", "`"):
            found.append((begin, None, opener))
        else:
            warn("Unexpected quote %s after exhausting input.", opener)

    return found
Esempio n. 2
0
def view_deriv(env, start_response):
    '''WSGI-style handler that renders one derivation of one document as HTML.

    The document and derivation numbers are read from env['selector.vars']
    (keys 'doc' and 'deriv'). A '200 OK' text/html response is started before
    the document is loaded, so the error page is also sent with that status.

    Yields a single page: layout(body) on success, or error_document() when
    either the document or the requested derivation is falsy.

    Side effect: resets the module-level node_index counter to 0, which
    html_node_repr increments while the tree is pretty-printed.
    '''
    global node_index
    node_index = 0

    start_response('200 OK', [('Content-type', 'text/html')])
    params = env['selector.vars']

    doc_id = int(params['doc'])
    deriv_id = int(params['deriv'])

    doc = GuessReader(os.path.join(CORPORA_PATH, 'chtb_%04d.fid' % doc_id))
    if not doc:
        yield error_document()
        return

    bundle = doc[deriv_id]
    if not bundle:
        yield error_document()
        return

    # Markup for one word; the mouse handlers show/hide the matching POS
    # element and highlight the matching tree node, all keyed by leaf number.
    word_markup = '''<span class="word"><span id="word%(index)d" onmouseover="$('pos').show();$('pos%(index)s').show();$('tree%(index)s').addClassName('highlighted');" onmouseout="$('tree%(index)s').removeClassName('highlighted');$('pos%(index)s').hide();$('pos').hide();">%(body)s</span></span>'''

    # The tree is rendered first, so html_node_repr hands out its ids before
    # the word and POS elements are produced.
    pieces = [
        '<div id="tree">',
        pprint(bundle.derivation,
               sep='&nbsp;',
               newline='<br/>',
               node_repr=html_node_repr),
        '</div>',
        '<div id="main">',
    ]

    # One word element per non-ignored leaf, numbered from 0.
    pieces.extend(
        word_markup % {'index': number, 'body': leaf.lex}
        for number, leaf in enumerate(
            leaves(bundle.derivation, lambda e: not is_ignored(e))))

    pieces.append(prev_next_links(doc, doc_id, deriv_id))
    pieces.append('</div>')

    # Hidden POS elements, numbered in the same leaf order as the words.
    pieces.append('<div id="pos">')
    pieces.append('<span id="pos_display">')
    pieces.extend(
        '<span id="pos%d" style="display:none">%s</span>' % (number, leaf.tag)
        for number, leaf in enumerate(
            leaves(bundle.derivation, lambda e: not is_ignored(e))))
    pieces.append('</span>')
    pieces.append('</div>')

    yield layout(''.join(pieces))
Esempio n. 3
0
def html_node_repr(node):
    '''Render a node as "(<span id=...>TAG LEX</span>)" for the HTML tree view.

    Ignored nodes all share the id "trace". Every other node receives the id
    "tree<N>", where N is the current value of the module-level node_index
    counter, which is then incremented.
    '''
    # The counter lives at module level so that successive calls continue the
    # same numbering.
    global node_index

    element_id = "trace"
    if not is_ignored(node):
        element_id = "tree%d" % node_index
        node_index += 1

    return '(<span id="%s">%s %s</span>)' % (element_id, node.tag, node.lex)
Esempio n. 4
0
def spans(ptb_tree):
    '''Locate the quoted stretches of text in a PTB derivation.

    Returns a list of tuples (B, E, P) with P in ("``", "`"): the Bth token
    from the start and the Eth token from the end of the derivation delimit a
    P-quoted portion of the text.

    B is None when a close quote is met with no open quote pending (the span
    is assumed to start at the beginning of the string). E is None when an
    open quote is never closed (the span is assumed to run to the end of the
    string). Quote leaves themselves do not advance the token position.
    '''
    # Leaves that survive filtering while quotes are retained; this is the
    # sequence scanned below.
    kept = [node for node in leaves(ptb_tree)
            if not is_ignored(node, ignoring_quotes=False)]
    # Number of leaves left once quotes are filtered out as well (should be
    # equal to the CCG leaf count).
    # TODO: do this without incurring another full pass through the nodes list
    leaf_count = sum(1 for node in kept
                     if not is_ignored(node, ignoring_quotes=True))

    closer_for = {"``": "''", "`": "'"}
    found = []
    pending = []   # stack of (open quote, position) awaiting a close quote
    position = 0   # leaves seen so far that are not treated as quotes

    for node in kept:
        lex = node.lex

        # Open quote: remember where the span begins.
        if lex in ("``", "`"):
            pending.append((lex, position))
            continue

        # Only advance the position for a leaf corresponding to a CCGbank
        # leaf, i.e. anything not treated as a close quote. The check for
        # colon is to maintain derivation 21:61(24), which contains an
        # erroneously tagged single close quote.
        if node.tag in ("POS", ":") or lex not in ("''", "'"):
            position += 1
            continue

        # The span end is leaf_count-position, not that minus one: on reaching
        # the close quote we are already one position past the end of the
        # quoted span.
        end = leaf_count - position

        if pending:
            # Pair the close quote with the most recent open quote.
            opener, begin = pending.pop()
            if closer_for[opener] != lex:
                warn("Unbalanced quotes, abandoning.")
                break
            found.append((begin, end, opener))
        else:
            # Nothing pending: assume the quoted span starts from the
            # beginning of the string.
            if lex == "''":
                implied = "``"
            elif lex == "'":
                implied = "`"
            else:
                err("spans: should not reach")
            found.append((None, end, implied))

    # Open quotes still pending are assumed to continue to the end of the
    # string; they are reported most recently opened first.
    for opener, begin in reversed(pending):
        if opener in ("``", "`"):
            found.append((begin, None, opener))
        else:
            warn("Unexpected quote %s after exhausting input.", opener)

    return found