def spans(ptb_tree):
    '''Locates the quoted portions of the given PTB derivation.

    Returns a list of tuples (B, E, P):
      P -- the opening quote lexeme, either "``" or "`"
      B -- the number of non-quote leaves seen before the open quote, or None
           if a close quote was met while no open quote was pending
      E -- the leaf count (excluding quotes) minus the number of non-quote
           leaves seen before the close quote, or None if the open quote was
           never closed

    If a close quote does not match the most recent open quote, a warning is
    issued and scanning stops; open quotes still pending at that point are
    reported with E = None.'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")

    tokens = [node for node in leaves(ptb_tree)
              if not is_ignored(node, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    # This total should be equal to the CCG leaf count.
    token_total = len([node for node in tokens
                       if not is_ignored(node, ignoring_quotes=True)])

    found = []
    pending = []  # stack of (open quote lexeme, position where it was seen)
    position = 0

    for node in tokens:
        # An open quote is remembered until its partner turns up.
        if node.lex in open_quotes:
            pending.append((node.lex, position))
            continue

        # Anything which is not a genuine close quote corresponds to a CCGbank
        # leaf, and only such leaves advance the position. The check for colon
        # is to maintain derivation 21:61(24), which contains an erroneously
        # tagged single close quote.
        if node.tag in ("POS", ":") or node.lex not in close_quotes:
            position += 1
            continue

        # On meeting the close quote we are already one position past the end
        # of the quoted span, so the distance from the end is the total minus
        # the position, not that minus one.
        from_end = token_total - position

        # Nothing pending: assume the quoted span starts at the beginning.
        if not pending:
            if node.lex == "''":
                quote_type = "``"
            elif node.lex == "'":
                quote_type = "`"
            else:
                err("spans: should not reach")
            found.append((None, from_end, quote_type))
            continue

        opener, start = pending.pop()
        mismatched = ((opener == "``" and node.lex != "''") or
                      (opener == "`" and node.lex != "'"))
        if mismatched:
            warn("Unbalanced quotes, abandoning.")
            break
        found.append((start, from_end, opener))

    # Open quotes left over are assumed to run to the end of the string.
    while pending:
        opener, start = pending.pop()
        if opener not in open_quotes:
            warn("Unexpected quote %s after exhausting input.", opener)
        else:
            found.append((start, None, opener))

    return found
def view_deriv(env, start_response):
    '''WSGI-style handler which renders a single derivation as an HTML page.

    The document and derivation numbers are read from env['selector.vars']
    (keys 'doc' and 'deriv'). The page consists of the pretty-printed tree,
    one span per non-ignored leaf showing its lexical item, the previous/next
    links, and one hidden span per non-ignored leaf holding its tag.

    Yields exactly one string: the laid-out page, or the error document when
    either the document or the requested derivation is falsy.

    NOTE: the '200 OK' response is started before any lookup happens, so the
    error document is also served with that status.'''
    global node_index
    # html_node_repr numbers the tree nodes through this module-level counter,
    # so it has to start from zero for every page.
    node_index = 0

    start_response('200 OK', [('Content-type', 'text/html')])

    params = env['selector.vars']
    doc_id = int(params['doc'])
    deriv_id = int(params['deriv'])

    doc = GuessReader(os.path.join(CORPORA_PATH, 'chtb_%04d.fid' % doc_id))
    if not doc:
        yield error_document()
        return

    bundle = doc[deriv_id]
    if not bundle:
        yield error_document()
        return

    def is_shown(node):
        return not is_ignored(node)

    word_markup = '''<span class="word"><span id="word%(index)d" onmouseover="$('pos').show();$('pos%(index)s').show();$('tree%(index)s').addClassName('highlighted');" onmouseout="$('tree%(index)s').removeClassName('highlighted');$('pos%(index)s').hide();$('pos').hide();">%(body)s</span></span>'''
    tag_markup = '<span id="pos%d" style="display:none">%s</span>'

    # Tree view
    parts = ['<div id="tree">']
    parts.append(pprint(bundle.derivation, sep=' ', newline='<br/>',
                        node_repr=html_node_repr))
    parts.append('</div>')

    # Sentence view: one hoverable span per shown leaf
    parts.append('<div id="main">')
    for n, leaf in enumerate(leaves(bundle.derivation, is_shown)):
        parts.append(word_markup % {'index': n, 'body': leaf.lex})
    parts.append(prev_next_links(doc, doc_id, deriv_id))
    parts.append('</div>')

    # Tag display: one initially hidden span per shown leaf
    parts.append('<div id="pos">')
    parts.append('<span id="pos_display">')
    for n, leaf in enumerate(leaves(bundle.derivation, is_shown)):
        parts.append(tag_markup % (n, leaf.tag))
    parts.append('</span>')
    parts.append('</div>')

    yield layout(''.join(parts))
def html_node_repr(node):
    '''Returns the HTML representation of a single node: its tag and lexical
    item wrapped in a span and enclosed in parentheses.

    An ignored node receives the span id "trace". Any other node receives
    "tree<N>", where N is the current value of the module-level counter
    node_index, which is then incremented.'''
    global node_index  # TODO: What's the proper way to do this in Python?

    markup = '(<span id="%s">%s %s</span>)'

    if not is_ignored(node):
        # Claim the next running number for this node.
        span_id = "tree%d" % node_index
        node_index += 1
    else:
        span_id = "trace"

    return markup % (span_id, node.tag, node.lex)
def spans(ptb_tree):
    '''Locates the quoted portions of the given PTB derivation.

    Returns a list of tuples (B, E, P):
      P -- the opening quote lexeme, either "``" or "`"
      B -- the number of non-quote leaves seen before the open quote, or None
           if a close quote was met while no open quote was pending
      E -- the leaf count (excluding quotes) minus the number of non-quote
           leaves seen before the close quote, or None if the open quote was
           never closed

    If a close quote does not match the most recent open quote, a warning is
    issued and scanning stops; open quotes still pending at that point are
    reported with E = None.'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")

    tokens = [node for node in leaves(ptb_tree)
              if not is_ignored(node, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    # This total should be equal to the CCG leaf count.
    token_total = len([node for node in tokens
                       if not is_ignored(node, ignoring_quotes=True)])

    found = []
    pending = []  # stack of (open quote lexeme, position where it was seen)
    position = 0

    for node in tokens:
        # An open quote is remembered until its partner turns up.
        if node.lex in open_quotes:
            pending.append((node.lex, position))
            continue

        # Anything which is not a genuine close quote corresponds to a CCGbank
        # leaf, and only such leaves advance the position. The check for colon
        # is to maintain derivation 21:61(24), which contains an erroneously
        # tagged single close quote.
        if node.tag in ("POS", ":") or node.lex not in close_quotes:
            position += 1
            continue

        # On meeting the close quote we are already one position past the end
        # of the quoted span, so the distance from the end is the total minus
        # the position, not that minus one.
        from_end = token_total - position

        # Nothing pending: assume the quoted span starts at the beginning.
        if not pending:
            if node.lex == "''":
                quote_type = "``"
            elif node.lex == "'":
                quote_type = "`"
            else:
                err("spans: should not reach")
            found.append((None, from_end, quote_type))
            continue

        opener, start = pending.pop()
        mismatched = ((opener == "``" and node.lex != "''") or
                      (opener == "`" and node.lex != "'"))
        if mismatched:
            warn("Unbalanced quotes, abandoning.")
            break
        found.append((start, from_end, opener))

    # Open quotes left over are assumed to run to the end of the string.
    while pending:
        opener, start = pending.pop()
        if opener not in open_quotes:
            warn("Unexpected quote %s after exhausting input.", opener)
        else:
            found.append((start, None, opener))

    return found