Example #1
    def attach_quotes(self, deriv, span_begin, span_end, quote_type, higher,
                      quotes):
        leaf_count = len(list(leaves(deriv)))

        first_index = 0 if (span_begin is None) else span_begin
        last_index = 0 if (span_end is None) else span_end

        begin_node = get_leaf(deriv, first_index, "forwards")
        end_node = get_leaf(deriv, last_index, "backwards")

        if end_node:
            end_node = self.punct_class.process_punct(deriv, end_node,
                                                      span_end)

        lca_node = lca(begin_node, end_node)
        if lca_node:
            deriv = self.insert_quotes(deriv, lca_node, higher)

        quote_indices = [None, None]
        for index, leaf in enumerate(leaves(deriv)):
            if str(leaf.cat) == 'LQU':
                quote_indices[0] = index
            elif str(leaf.cat) == 'RQU':
                quote_indices[1] = index - 2

        return deriv, quote_indices
Example #2
def view_deriv(env, start_response):
    global node_index
    node_index = 0

    start_response('200 OK', [('Content-type', 'text/html')])
    variables = env['selector.vars']

    doc_id, deriv_id = int(variables['doc']), int(variables['deriv'])
    filename = 'chtb_%04d.fid' % doc_id

    doc = GuessReader(os.path.join(CORPORA_PATH, filename))
    if doc:
        bundle = doc[deriv_id]

        body = ''
        if bundle:
            body += '<div id="tree">'
            body += pprint(bundle.derivation,
                           sep='&nbsp;',
                           newline='<br/>',
                           node_repr=html_node_repr)
            body += '</div>'

            body += '<div id="main">'
            for n, leaf in enumerate(
                    leaves(bundle.derivation, lambda e: not is_ignored(e))):
                body += (
                    '<span class="word">'
                    '<span id="word%(index)d"'
                    ''' onmouseover="$('pos').show();$('pos%(index)s').show();'''
                    '''$('tree%(index)s').addClassName('highlighted');"'''
                    ''' onmouseout="$('tree%(index)s').removeClassName('highlighted');'''
                    '''$('pos%(index)s').hide();$('pos').hide();">'''
                    '%(body)s</span></span>') % {
                        'index': n,
                        'body': leaf.lex
                    }

            body += prev_next_links(doc, doc_id, deriv_id)
            body += '</div>'

            body += '<div id="pos">'
            body += '<span id="pos_display">'
            for n, leaf in enumerate(
                    leaves(bundle.derivation, lambda e: not is_ignored(e))):
                body += '<span id="pos%d" style="display:none">%s</span>' % (
                    n, leaf.tag)
            body += '</span>'
            body += '</div>'

            yield layout(body)
        else:
            yield error_document()

    else:
        yield error_document()
Example #3
    def accept_derivation(self, bundle):
        for node in nodes(bundle.derivation):
            if is_np_internal_structure(node):
                node.kids = list(leaves(node))

        self.write_derivation(bundle)
Example #4
    def accept_derivation(self, bundle):
        self.words += [
                (
                    e.lex,
                    str(e.cat),
                    # e.tag
                ) for e in leaves(bundle.derivation) ]
Example #5
def find_coindexed_trace(parent, trace_node):
    index = get_trace_index_from_tag(trace_node.tag)
    for kid in leaves(parent):
        match = IndexRegex.match(kid.lex)
        if match and match.group(1) == index[1:]:
            return kid
    return None
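IndexRegex and get_trace_index_from_tag are defined elsewhere in this codebase; a minimal sketch of the matching convention the function assumes, with illustrative stand-ins rather than the real definitions:

import re

# Hypothetical stand-ins: a coindexed leaf's lex looks like "*T*-1", and the
# trace node's tag carries the same index, e.g. "NP-SBJ-1".
IndexRegex = re.compile(r'\*[A-Za-z*]+\*-(\d+)')

def get_trace_index_from_tag(tag):
    # "NP-SBJ-1" -> "-1"; find_coindexed_trace strips the leading "-" with
    # index[1:] before comparing against the regex group
    return tag[tag.rindex('-'):]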
Example #6
    def accept_derivation(self, bundle):
        self.words += [
            (
                e.lex,
                str(e.cat),
                # e.tag
            ) for e in leaves(bundle.derivation)
        ]
Example #7
def naive_label_derivation(root):
    '''Applies the markedup labelling algorithm to each leaf under _root_.'''
    for leaf in leaves(root):
        leaf.cat = label(leaf.cat, lex=leaf.lex)
        # pre-populate the outermost slot with the lexical item
        leaf.cat.slot.head.lex = leaf.lex
        
    return root
Example #8
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), with P in ("``", "`"), where the Bth token from the start and
the Eth token from the end of the given PTB derivation delimit a P-quoted portion of the text.'''
    
    leaf_nodes = [leaf for leaf in leaves(ptb_tree) if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [leaf for leaf in leaf_nodes if not is_ignored(leaf, ignoring_quotes=True)]
    leaf_count = len(leaf_nodes_without_quotes) # should be equal to the CCG leaf count
    
    result = []
    quote_stack = []
    index = 0
    
    for leaf in leaf_nodes:
        # Push open quote
        if leaf.lex in ("``", "`"):
            quote_stack.append( (leaf.lex, index) )
            
        elif (leaf.tag not in ("POS", ":")  # The check for colon is to maintain derivation 21:61(24), which contains
              and leaf.lex in ("''", "'")): # an erroneously tagged single close quote.
              
            # Pop open quote and match with close quote
            if quote_stack:
                open_quote, span_begin = quote_stack.pop()
                if (open_quote == "``" and leaf.lex != "''" or
                    open_quote == "`"  and leaf.lex != "'"):
                    warn("Unbalanced quotes, abandoning.")
                    break
                
                # We treat the span end index as leaf_count-index, not that minus one,
                # because when we encounter the close quote, we are already one index
                # past the end of the quoted span.
                result.append( (span_begin, leaf_count-index, open_quote) )
                
            # Quote stack is empty, assume quoted span starts from beginning of string
            else:
                if leaf.lex == "''":
                    quote_type = "``"
                elif leaf.lex == "'":
                    quote_type = "`"
                else:
                    err("spans: should not reach")
                    
                result.append( (None, leaf_count-index, quote_type) )
        
        # Only advance the index for a leaf corresponding to a CCGbank leaf        
        else:
            index += 1
                
    # While open quotes are still on the stack, assume quoted span continues to end of string
    while quote_stack:
        remaining_quote, span_begin = quote_stack.pop()
        if remaining_quote in ("``", "`"):
            result.append( (span_begin, None, remaining_quote) )
        else:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)
            
    return result
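A self-contained way to exercise spans, with stub definitions of leaves and is_ignored standing in for the real helpers from the surrounding package:

from collections import namedtuple

# Stubs: leaves() flattens a derivation to its tokens; is_ignored() hides
# quote tokens only when ignoring_quotes is set.
Leaf = namedtuple('Leaf', 'lex tag')

def leaves(tree):
    return iter(tree)

def is_ignored(leaf, ignoring_quotes):
    return ignoring_quotes and leaf.lex in ("``", "`", "''", "'")

tokens = [Leaf("``", "``"), Leaf("The", "DT"), Leaf("cat", "NN"),
          Leaf("''", "''"), Leaf("sat", "VBD")]
# leaf_count is 3 (quotes excluded) and the close quote is seen at index 2,
# so the quoted text starts 0 tokens from the start and ends 3-2 = 1 token
# from the end:
assert spans(tokens) == [(0, 1, "``")]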
Example #9
def ccg2latex(root, glosses=None, abbreviate=False):
    def comb_symbol(comb):
        return arrows.get(comb, 'uline')
    def cat_repr(cat, i):
        cat_str = str(cat)
        if abbreviate is not False:
            if isinstance(abbreviate, xrange):
                if isinstance(i, int):
                    if i in abbreviate:
                        cat_str = abbr(cat_str)
                else:
                    # i is a span of leaf indices (a list built by range()
                    # below); abbreviate only when the whole span lies
                    # inside the window
                    if all(j in abbreviate for j in i):
                        cat_str = abbr(cat_str)
                        cat_str = abbr(cat_str)
            else:
                cat_str = abbr(cat_str)

        return sanitise_category(cat_str)
        
    out = [r'\deriv{%d}{' % root.leaf_count()]
    all_leaves = list(leaves(root))
    
    # lex line
    if glosses is not None:
        leaf_bits = ("\\glosN{%s}{%s}" % (leaf.lex, gloss) for (leaf, gloss) in izip(all_leaves, glosses))
    else:
        leaf_bits = (("\\cjk{%s}" % leaf.lex) for leaf in all_leaves)
    out.append(' & '.join(leaf_bits) + '\\\\')
    
    # underlines line
    out.append( ' & '.join([r"\uline{1}"] * root.leaf_count()) + '\\\\' )
    # cats line
    out.append( (' & '.join(("\\cf{%s}"%cat_repr(leaf.cat, i) for i, leaf in enumerate(all_leaves)))) + '\\\\' )
    
    rows = []
    for l, r, p in pairs_postorder(root):
        rows.append( (min_leaf_id(p, root), p.cat, analyse(l.cat, r and r.cat, p.cat), p.leaf_count()) )
        
    grouped_subrows = group(rows)
        
    for subrows in grouped_subrows:
        subline = []
        subout = []
        last_span = 0 # holds the index of the rightmost span in this row
        
        for leftmost_leaf_id, cat, comb, span in subrows:
            subline.append( "&"*(leftmost_leaf_id - last_span) + (r"\%s{%s}" % (comb_symbol(comb), span)) )
            subout.append(  "&"*(leftmost_leaf_id - last_span) + (r"\mc{%d}{%s}" % (span,
                cat_repr(cat, range(leftmost_leaf_id, leftmost_leaf_id+span)))) )
            
            last_span = leftmost_leaf_id+span-1

        # write out underlines line
        out.append(' '.join(subline) + '\\\\')
        # write out cats line
        out.append(' '.join(subout) + '\\\\')

    out.append('}')
    return '\n'.join(out)
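For a two-leaf derivation whose single pair combines into NP, the emitted LaTeX has roughly this shape, assuming the \deriv, \cjk, \cf and \mc macros from the accompanying style file (\uline{2} is the fallback symbol comb_symbol produces when arrows has no entry for the combinator):

\deriv{2}{
\cjk{A} & \cjk{B}\\
\uline{1} & \uline{1}\\
\cf{NP/N} & \cf{N}\\
\uline{2}\\
\mc{2}{NP}\\
}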
Example #10
def make_derivation(deriv, assigned_id=None, leaf_id=0):
    '''Generates the body of the DOT representation.'''
    
    if deriv.is_leaf():
        if write_tree_indices:
            label = "%d %s" % (leaf_id, deriv.label_text())
        else:
            label = deriv.label_text()

        return '''%s [shape="none",height=0.17,label="%s"]\n''' % (assigned_id, label)
        
    else:
        ret = []
        root_id = assigned_id or get_id()

        # Declare the parent node once (the declaration does not depend on
        # the child being visited)
        if isinstance(deriv, (ccg.Leaf, ccg.Node)):
            comb_name = re.escape(Abbreviations.get(analyse(deriv.lch.cat, deriv.rch and deriv.rch.cat, deriv.cat), ''))

            if comb_name:
                shape_type = "record"
                label_text = "<o>%s|%s" % (deriv.label_text(), comb_name)
            else:
                shape_type = "box"
                label_text = deriv.label_text()

            ret.append('''%s [shape="%s",height=0.1,label="%s"]\n''' % (root_id, shape_type, label_text))

            for i, child in enumerate(deriv):
                child_id = get_id()

                if config.highlight_head_arrows and i == int(deriv.head_index):
                    ret.append("%s:o -> %s:o [color=red]\n" % (root_id, child_id))
                else:
                    ret.append("%s:o -> %s:o\n" % (root_id, child_id))

                ret.append(make_derivation(child, child_id, leaf_id=leaf_id))
                leaf_id += len(list(leaves(child)))

        else:
            ret.append('''%s [shape="box",height=0.1,label="%s"]\n''' % (root_id, deriv.label_text()))

            for child in deriv:
                child_id = get_id()
                ret.append("%s -> %s\n" % (root_id, child_id))
                ret.append(make_derivation(child, child_id, leaf_id=leaf_id))
                leaf_id += len(list(leaves(child)))

        return ''.join(ret)
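Since make_derivation generates only the body, a complete graph needs a digraph wrapper; a minimal sketch (make_dot is hypothetical, not part of the original module):

def make_dot(deriv):
    # get_id() is the same id source make_derivation itself uses
    return 'digraph derivation {\n%s}\n' % make_derivation(deriv, get_id())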
Example #11
    def accept_derivation(self, bundle):
        self.nderivs += 1
        self.nwords += len(bundle.derivation.text())
        for leaf in leaves(bundle.derivation):
            if self.is_trace(leaf):
                self.ecs += 1
                self.ec_types[base_tag(leaf.lex)] += 1
            else:
                self.tokens.add(leaf.lex)
Example #12
    def run_filters(self, filters, files):
        # If all given filters were not found or had wrong argument count, do nothing
        if not filters: return
        
        reader_args = {}
        if self.reader_class_name:
            try:
                reader_class = globals()[self.reader_class_name]
                info("Using reader class %s.", self.reader_class_name)
                
                reader_args['reader_class'] = reader_class
            except KeyError:
                raise RuntimeError("Reader class %s not found." % self.reader_class_name)
        
        for file in self.transform(files):
            if self.is_pair_spec(file):
                meta_reader = PairedReader
            else:
                meta_reader = DirFileGuessReader
                
            try:
                self.last_exceptions = []
                
                for derivation_bundle in meta_reader(file, verbose=self.verbose, **reader_args):
                    if self.verbose: info("Processing %s...", derivation_bundle.label())
                    try:
                        for filter in filters:
                            filter.context = derivation_bundle

                        # Guard each filter individually; accept_leaf may be
                        # None for some filters but not others
                        if any(f.accept_leaf is not None for f in filters):
                            for leaf in leaves(derivation_bundle.derivation):
                                for filter in filters:
                                    if filter.accept_leaf is None: continue
                                    filter.accept_leaf(leaf)

                                    if filter.accept_comb_and_slash_index is not None:
                                        try:
                                            for slash_index, comb in enumerate(applications_per_slash(leaf)):
                                                filter.accept_comb_and_slash_index(leaf, comb, slash_index)
                                        except AttributeError: # TODO: hacky and inefficient, need this to work for PTB too
                                            pass

                        for filter in filters:
                            filter.accept_derivation(derivation_bundle)
                            filter.context = None
                            
                    except IOError, e:
                        # If output is going to a pager, and the user requests an interrupt (^C)
                        # the filter fails with IOError: Broken pipe
                        # In that case, running filters on further derivations will continue to
                        # lead to 'Broken pipe', so just bail out
                        if e.errno == errno.EPIPE: return
                            
                    except Exception, e:
                        self.last_exceptions.append( (derivation_bundle, sys.exc_info()) )
                        
                        if self._break_on_exception:
                            raise FilterException(e, None)
Example #13
def is_np_internal_structure(node):
    # rule out things already tagged explicitly as coordination by tag.py
    if any(has_tags(kid, 'cC') for kid in node): return False

    return (node.tag.startswith('NP') and all(
        has_tags(kid, 'nN')
        or any(kid.tag.startswith(tag) for tag in NominalCategories)
        or kid.tag in ('PU', 'CC')
        or kid.tag.startswith('JJ')
        or kid.tag.startswith('CD')
        or kid.tag.startswith('OD')
        or has_tag(kid, '&')
        for kid in leaves(node)))
Example #14
def min_leaf_id(node, root):
    '''(Inefficiently) finds the leaf index of _node_ relative to _root_.'''
    cur = node
    while not cur.is_leaf():
        cur = cur[0]
        
    # cur is the left corner of _node_    
    for leaf_id, leaf in enumerate(leaves(root)):
        if leaf is cur:
            return leaf_id
Example #15
def is_np_internal_structure(node):
    # rule out things already tagged explicitly as coordination by tag.py
    if any(has_tags(kid, 'cC') for kid in node): return False
    
    return (node.tag.startswith('NP') and
            all(has_tags(kid, 'nN')
             or any(kid.tag.startswith(tag) for tag in NominalCategories)
             or kid.tag in ('PU', 'CC')
             or kid.tag.startswith('JJ')
             or kid.tag.startswith('CD')
             or kid.tag.startswith('OD')
             or has_tag(kid, '&') for kid in leaves(node)))
Example #16
def Precedes(candidate, node, context):
    if not node.is_leaf(): return False
    
    root = get_root(node)
    
    node_index = get_index_of_leaf(root, node)
    
    # islice(it, n) yields only the first n leaves; start at node_index+1
    # to walk the leaves *after* _node_
    for successor in islice(leaves(root), node_index + 1, None):
        if candidate.is_satisfied_by(successor, context): 
            return True
            
    return False
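A quick check of the islice convention the loop relies on: a lone stop argument yields only the first n items, while a (start, None) pair skips them and yields the rest.

from itertools import islice

assert list(islice(iter('abcde'), 2)) == ['a', 'b']            # first two
assert list(islice(iter('abcde'), 2, None)) == ['c', 'd', 'e'] # the rest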
Example #17
    def accept_derivation(self, bundle):
        print bundle.label(),

        error_found = False
        for i, leaf in enumerate(leaves(bundle.derivation)):
            if i in self.indices:
                # check rules starting from this leaf
                for comb, (l, r, p) in combinators_and_path_from_node(leaf):
                    if comb is None:
                        error_found = True
                        print i, rule_repr(l, r, p),

        if not error_found: print 'none',
Example #18
    def accept_derivation(self, bundle):
        global merge_verb_compounds
        if merge_verb_compounds:
            for node in nodes(bundle.derivation):
                if node.tag in self.MergedTags:
                    replace_kid(node.parent, node,
                                Leaf(node.tag,
                                     ''.join(kid.lex for kid in leaves(node)),
                                     node.parent))

        if normalise_foreign_names:
            for leaf in leaves(bundle.derivation):
                if self.is_candidate_foreign_name(leaf.lex):
                    kids = [Leaf(leaf.tag, bit, None)
                            for bit in leaf.lex.split(INTERPUNCT)]
                    replace_kid(leaf.parent, leaf, Node('NP-PN', kids))

        if self.accept(bundle.derivation):
            self.write_derivation(bundle)
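For context, candidate foreign names in the treebank join their transliterated parts with an interpunct (what INTERPUNCT presumably names), and the replacement rebuilds the leaf as an NP-PN over those parts. An illustrative, hypothetical before/after:

# A leaf whose lex is u'乔治·布什' ("George Bush") splits on INTERPUNCT into
# two sibling leaves under a fresh NP-PN node:
#   Leaf('NR', u'乔治·布什', parent)
#       -> Node('NP-PN', [Leaf('NR', u'乔治', None), Leaf('NR', u'布什', None)])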
Example #19
    def attach_quotes(self, deriv, span_begin, span_end, quote_type, higher, quotes):
        leaf_count = len(list(leaves(deriv)))

        first_index = 0 if (span_begin is None) else span_begin
        last_index = 0 if (span_end is None) else span_end

        begin_node = get_leaf(deriv, first_index, "forwards")
        end_node = get_leaf(deriv, last_index, "backwards")

        if end_node:
            end_node = self.punct_class.process_punct(deriv, end_node, span_end)

        lca_node = lca(begin_node, end_node)
        if lca_node:
            deriv = self.insert_quotes(deriv, lca_node, higher)

        quote_indices = [None, None]
        for index, leaf in enumerate(leaves(deriv)):
            if str(leaf.cat) == 'LQU':
                quote_indices[0] = index
            elif str(leaf.cat) == 'RQU':
                quote_indices[1] = index - 2

        return deriv, quote_indices
Example #20
    def attach_quotes(self, deriv, span_begin, span_end, quote_type, higher, quotes):
        '''Given a CCGbank derivation, a pair of indices denoting the span of quoted text, whether single or
double quotes are to be inserted, and quoting parameters, this does the insertion and returns a tuple (D, (b, e)),
where D is the new derivation (the root may have been changed through quote attachment) and the indices at which
the quotes have been inserted. Either b or e may be None to indicate that no opening or closing quote was inserted.'''

        do_left = quotes in ("both", "left")
        do_right = quotes in ("both", "right")
        
        first_index = 0 if (span_begin is None) else span_begin
        last_index = 0 if (span_end is None) else span_end
        
        leaf_count = len(list(leaves(deriv)))
        quoted_text = list(text_in_span(deriv, first_index, (leaf_count - last_index)))
        
        if (span_begin is not None) or (span_end is not None):
            if higher == "left":
                if do_right:
                    deriv = self.insert_quote(deriv, tokens=quoted_text, at=span_end, quote="end", quote_type=quote_type)
                if do_left:
                    deriv = self.insert_quote(deriv, tokens=quoted_text, at=span_begin, quote="begin", quote_type=quote_type)
            elif higher == "right":
                if do_left:
                    deriv = self.insert_quote(deriv, tokens=quoted_text, at=span_begin, quote="begin", quote_type=quote_type)
                if do_right:
                    deriv = self.insert_quote(deriv, tokens=quoted_text, at=span_end, quote="end", quote_type=quote_type)
                    
        quote_indices = []
        if (span_begin is not None) and do_left:
            quote_indices.append(span_begin)
        else:
            quote_indices.append( None ) 
            
        if (span_end is not None) and do_right:
            quote_indices.append(leaf_count - span_end - 1)
        else:
            quote_indices.append( None )
            
        return deriv, quote_indices
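A worked instance of the index convention, using hypothetical values: a five-leaf derivation quoted one token in from each edge, with quotes="both".

# leaf_count=5, span_begin=1, span_end=1:
#   opening quote index: span_begin                ->  1
#   closing quote index: leaf_count - span_end - 1 ->  5 - 1 - 1 = 3
# i.e. both returned indices count from the start of the leaf sequence.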
Example #21
def mkdeps(root, postprocessor=identity):
    for i, leaf in enumerate(leaves(root)):
        # Uniquify each leaf with an index
        leaf.lex += IndexSeparatorTemplate % i
        # Apply the left to right slash labelling 
        # (we abuse this to refer to slots, not slashes)
        leaf.cat.parg_labelled()
        # Populate the outermost (_) variable of each leaf
        leaf.cat.slot.head.lex = leaf.lex

    for (l, r, p) in pairs_postorder(root):
        _label_result(l, r, p)
            
    global unanalysed
    
    unaries = []

    for l, r, p in pairs_postorder(root):
        L, R, P = map(lambda x: x and x.cat, (l, r, p))
        comb = analyse(L, R, P)
        if not comb: debug("Unrecognised rule %s %s -> %s", L, R, P)
        
        unifier = []
        
        if config.debug:
            debug("%s %s %s (%s)", L, R, P, str(comb))

        if comb == 'fwd_appl': # [Xx/Yy]l Yy -> Xx
            unifier = unify(L.right, R)
            p.cat = L.left

        elif comb == 'bwd_appl': # Yy [Xx\Yy]r -> Xx
            unifier = unify(L, R.right)
            p.cat = R.left
                
        # Pro-drops which drop their outer argument
        # [(S_\NPy)_/NPx]_ -> [S_\NPy]_
        elif comb in ('object_prodrop', 'vp_vp_object_prodrop', 
            'yi_subject_prodrop', 'vp_modifier_subject_prodrop'):
            p.cat = L.left

        # [Xx/Yy]l [Yy/Zz]r -> [Xx/Zz]r
        elif comb == 'fwd_comp': # X/Y Y/Z -> X/Z
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right
            
        # [Yy\Zz]l [Xx\Yy]r -> [Xx\Zz]l
        elif comb == 'bwd_comp': # Y\Z X\Y -> X\Z
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)

            P.slot.var = fresh_var(prefix='K')
            
            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right
            
        elif comb in ('s_np_apposition', 'vp_np_apposition'): # { S[dcl], S[dcl]\NP } NPy -> NPy
            P.slot = R.slot # = copy_vars
            unifier = unify(P, R)
            
        # NP NP -> N/N
        elif comb == 'np_np_to_nfn_apposition':
            # do the same as NP NP -> NP, except fill in the vars Ny/Ny
            P.right.slot.var = fresh_var(prefix='N')
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)
            make_set_head_from(l, r, p)

        elif comb in ('conjoin', 'np_np_apposition'): # X X[conj] -> X
            make_set_head_from(l, r, p)

        elif comb in ('conj_absorb', 'conj_comma_absorb'): # conj X -> X[conj]
            copy_vars(frm=R, to=P)
            unify(P, R) # R.slot.head = P.slot.head
            
        elif comb == 'funny_conj': # conj X -> X
            p.cat = R
        
        elif comb == 'nongap_topicalisation': # {N, NP, S[dcl], QP}x -> [Sy/Sy]x
            P.slot = L.slot
            P.right.slot.var = fresh_var()
            P.left.slot = P.right.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        elif comb in ('np_gap_topicalisation', 's_gap_topicalisation', 'qp_gap_topicalisation'): # NPx -> [ Sy/(Sy/NPx)y ]y
            P.right.right.slot = L.slot
            P.slot.var = fresh_var()
            P.left.slot = P.right.left.slot = P.right.slot = P.slot
            
        elif comb == 'subject_prodrop': # (S[dcl]y\NPx)y -> S[dcl]y | [(S[dcl]y\NPx)y/NPz]y -> (S[dcl]y/NPz)y
            if P == parse_category(r'S[dcl]'):
                P.slot = L.slot
            elif P == parse_category(r'S[dcl]/NP'):
                P.slot = P.left.slot = L.slot
                P.right.slot = L.right.slot
            else:
                warn("Invalid parent category %s for subject prodrop.", P)
            
        elif comb == 'fwd_xcomp': # [Xx/Yy]l [Yy\Zz]r -> [Xx/Zz]r
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)
                
            P.slot.var = fresh_var(prefix='K')
            
            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        elif comb == 'bwd_xcomp': # [Yy/Zz]l [Xx\Yy]r -> [Xx/Zz]l
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)
        
            # P.slot = L.slot
            P.slot.var = fresh_var(prefix='K')
            
            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right
            
        elif comb == 'bwd_r1xcomp': # [(Yy/Zz)k/Ww]l [Xx\Yy]r -> [(Xx\Zz)k/Ww]l
            # TODO: where should P's lexical head come from? L or R?
            
            unifier = unify(L.left.left, R.right)
            p.cat._left._left = R.left
            p.cat._left._right = L.left.right
            p.cat._right = L.right

        elif comb in ('fwd_raise', 'bwd_raise'): # Xx -> [ Tf|(Tf|Xx)f ]f
            if P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/(S[dcl]\NP))'):
                # (S[dcl]y\NPz)y -> [ (S[dcl]f\NPg)f/((S[dcl]f\NPg)f\(S[dcl]y\NPz)y)f ]f
                P.left.slot.var = P.left.left.slot.var = P.right.slot.var = P.slot.var = fresh_var() # f 
                P.left.right.slot.var = fresh_var() # g
                
                copy_vars(frm=P.left, to=P.right.left)
                copy_vars(frm=L,      to=P.right.right)
                
                unifier = unify(L, P.right.right)
            elif P == parse_category(r'((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)'):
                # NPy -> [ ((S[dcl]v\NPw)v/QPz)v \ ( ((S[dcl]v\NPw)v/QPz)v/NPy )v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.right.slot = \
                    P.left.left.slot = P.left.left.left.slot = \
                    P.right.left.slot = P.right.left.left.slot = \
                    P.right.left.left.left.slot = P.slot # v
#                P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z
                P.left.left.right.slot.var = fresh_var('W')
                P.right.left.left.right.slot = P.left.left.right.slot # w
                
                unifier = unify(L, P.right.right)
            elif P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/QP)'):
                # QPy -> [ (S[dcl]v\NPz)v \ ((S[dcl]v\NPz)v/QPy)v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.left.left.slot = \
                    P.right.slot = P.right.left.slot = P.right.left.left.slot = P.slot # v
#                P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z
                
                unifier = unify(L, P.right.right)
            else:
                P.slot.var = fresh_var()

                P.right.left.slot = P.left.slot = P.right.slot = P.slot
                P.right.right.slot = L.slot

                unifier = unify(L, P.right.right)

        elif comb == 'np_typechange':
            P.slot = L.slot # = copy_vars
            unifier = unify(P, L)
            
        elif comb == 'lcp_np_typechange':
            P.slot = L.slot
            unifier = unify(P, L)
            
        elif comb in ('lcp_sfs_typechange', 'lcp_nfn_typechange'):
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            
            P.slot = L.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        elif comb == 'lcp_sbnpfsbnp_typechange':
            # [(Sy\NPz)y/(Sy\NPz)y]_
            P.left.slot.var = fresh_var()
            P.left.left.slot = P.right.left.slot = P.right.slot = P.left.slot
            
            register_unary(unaries, p, L.slot.head.lex)
        
        elif comb == 'null_relativiser_typechange': # Xy -> (Nf/Nf)y
            P.slot = L.slot
            
            if P == _NfN:
                P.left.slot.var = fresh_var()
                
                P.right.slot = P.left.slot
                
                register_unary(unaries, p, L.slot.head.lex)
                
            elif P == _NfNfNfN:
                P.left.slot.var = fresh_var()
                P.left.left.slot.var = fresh_var(prefix="G")
                
                P.left.right.slot = P.left.left.slot
                P.right.slot = P.left.slot
                
                register_unary(unaries, p, L.slot.head.lex)
            else:
                warn("Unhandled null relativiser typechange: %s -> %s", L, P)
            
        # [NP/NP]y -> NPy
        elif comb == 'de_nominalisation':
            P.slot = L.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        # {M, QP}y -> (Nf/Nf)y
        elif comb == 'measure_word_number_elision':
            P.slot = L.slot
            
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        elif comb == 'l_punct_absorb': # , X -> X[conj]
            # need to put conj feature back on parent
            p.cat = R.clone_adding_feature('conj')
            
        elif comb == 'r_punct_absorb':
            p.cat = L

        elif R and L == R and is_rooted_in(parse_category('S'), L): # VCD (stopgap)
            make_set_head_from(l, r, p)

        else:
            debug('Unhandled combinator %s (%s %s -> %s)', comb, L, R, P)
            unanalysed.add(comb)
            
            P.slot = R.slot if R else L.slot
            
        for (dest, src) in unifier:
            if isinstance(src, (basestring, list)):
                # Fake bidirectional unification:
                # -------------------------------
                # If variable X has been unified with value v,
                # rewrite all mentions of v in the output category to point to variable X
                # (v is uniquified by concatenating it with an ID, so this should hold)            
                for subcat in p.cat.nested_compound_categories():
                    if subcat.slot.head.lex == src:
                        subcat.slot = dest.slot
            
        if config.debug:
            debug("> %s" % p.cat)
            debug('---')
            
            if config.fail_on_unassigned_variables:
                assert no_unassigned_variables(p.cat), "Unassigned variables in %s" % p.cat
                
    if config.debug:
        debug('unaries: %s', unaries)
        
    # Collect deps from arguments
    deps = []
    for l in chain( leaves(root), unaries ):
        if config.debug: debug("%s %s", l, l.cat)
        
        C = l.cat
        while not C.is_leaf():
            arg = C.right
            if arg.slot.head.filler:
                #and not l.cat.left.slot == l.cat.right.slot):
        #        print "%s %s %s %s %s %s" % (C.slot.head.lex, C, arg.slot.head.lex, arg, l.cat, C.label)
                if C.label is None:
                    warn("Dependency generated on slash without label: %s %s", C, arg)
                deps.append( (C.slot.head.lex, arg.slot.head.lex, l.cat, C.label) )
            if is_modifier(C): break
            C = C.left

    # Produce dep pairs
    result = set()
    for depl, depr, head_cat, head_label in deps:
        for sdepl in set(seqify(depl)):
            for sdepr in set(seqify(depr)):
                if not (sdepl and sdepr):
                    debug("Dependency with None: %s %s", sdepl, sdepr)
                    continue
                    
                result.add( (postprocessor(sdepl), postprocessor(sdepr), head_cat, head_label) )
                
    if config.debug:
        for line in write_deps(result):
            debug(line)
    return result
Example #22
def is_np_internal_structure(node):
    return node.tag.startswith('NP') and node.count() > 1 and (
        all(kid.tag in ValidNPInternalTags for kid in leaves(node)))
Example #23
def caption_nwords(bundle):
    sys.stdout.write(str(len(list(leaves(bundle.derivation)))))
Example #24
def is_np_internal_structure(node):
    return node.tag.startswith('NP') and node.count() > 1 and (all(
        kid.tag in ValidNPInternalTags for kid in leaves(node)))
Example #25
    def run_filters(self, filters, files):
        # If all given filters were not found or had wrong argument count, do nothing
        if not filters: return

        reader_args = {}
        if self.reader_class_name:
            try:
                reader_class = globals()[self.reader_class_name]
                info("Using reader class %s.", self.reader_class_name)

                reader_args['reader_class'] = reader_class
            except KeyError:
                raise RuntimeError("Reader class %s not found." %
                                   self.reader_class_name)

        for file in self.transform(files):
            if self.is_pair_spec(file):
                meta_reader = PairedReader
            else:
                meta_reader = DirFileGuessReader

            try:
                self.last_exceptions = []

                for derivation_bundle in meta_reader(file,
                                                     verbose=self.verbose,
                                                     **reader_args):
                    if self.verbose:
                        info("Processing %s...", derivation_bundle.label())
                    try:
                        for filter in filters:
                            filter.context = derivation_bundle

                        # Guard each filter individually; accept_leaf may be
                        # None for some filters but not others
                        if any(f.accept_leaf is not None for f in filters):
                            for leaf in leaves(derivation_bundle.derivation):
                                for filter in filters:
                                    if filter.accept_leaf is None:
                                        continue
                                    filter.accept_leaf(leaf)

                                    if filter.accept_comb_and_slash_index is not None:
                                        try:
                                            for slash_index, comb in enumerate(
                                                    applications_per_slash(leaf)):
                                                filter.accept_comb_and_slash_index(
                                                    leaf, comb, slash_index)
                                        except AttributeError:  # TODO: hacky and inefficient, need this to work for PTB too
                                            pass

                        for filter in filters:
                            filter.accept_derivation(derivation_bundle)
                            filter.context = None

                    except IOError, e:
                        # If output is going to a pager, and the user requests an interrupt (^C)
                        # the filter fails with IOError: Broken pipe
                        # In that case, running filters on further derivations will continue to
                        # lead to 'Broken pipe', so just bail out
                        if e.errno == errno.EPIPE: return

                    except Exception, e:
                        self.last_exceptions.append(
                            (derivation_bundle, sys.exc_info()))

                        if self._break_on_exception:
                            raise FilterException(e, None)
Example #26
    def transformer(self, bundle):
        return " ".join(self.format(leaf) for leaf in leaves(bundle.derivation))
Example #27
def get_remapper_for(deriv_id):
    filespec = deriv_id_to_filespec(deriv_id, with_section_dir=False)
    reader = GuessReader( os.path.join('cn', filespec) )
    bundle = iter(reader).next()
    root = bundle.derivation
    return remapper(leaves(root))
Example #28
    def leaf_count(self):
        return len(list(leaves(self)))
Example #29
    def transformer(self, bundle):
        return " ".join(
            self.format(leaf) for leaf in leaves(bundle.derivation))
Example #30
def write_parg(bundle, deps):
    bits = ['<s id="%s"> %d' % (bundle.label(),
                                len(list(leaves(bundle.derivation))))]
    bits += write_deps(deps)
    bits.append(r'<\s>')
    
    return '\n'.join(bits)
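The result has roughly this shape; the id and the dependency rows below are placeholders, with the rows supplied by write_deps and <\s> closing the block:

# <s id="..."> 24     <- derivation label and its leaf count
# ... one dependency row per line, as produced by write_deps ...
# <\s>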
Example #31
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), with P in ("``", "`"), where the Bth token from the start and
the Eth token from the end of the given PTB derivation delimit a P-quoted portion of the text.'''

    leaf_nodes = [
        leaf for leaf in leaves(ptb_tree)
        if not is_ignored(leaf, ignoring_quotes=False)
    ]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [
        leaf for leaf in leaf_nodes
        if not is_ignored(leaf, ignoring_quotes=True)
    ]
    leaf_count = len(leaf_nodes_without_quotes)  # should be equal to the CCG leaf count

    result = []
    quote_stack = []
    index = 0

    for leaf in leaf_nodes:
        # Push open quote
        if leaf.lex in ("``", "`"):
            quote_stack.append((leaf.lex, index))

        # The check for colon is to maintain derivation 21:61(24), which
        # contains an erroneously tagged single close quote.
        elif leaf.tag not in ("POS", ":") and leaf.lex in ("''", "'"):

            # Pop open quote and match with close quote
            if quote_stack:
                open_quote, span_begin = quote_stack.pop()
                if (open_quote == "``" and leaf.lex != "''"
                        or open_quote == "`" and leaf.lex != "'"):
                    warn("Unbalanced quotes, abandoning.")
                    break

                # We treat the span end index as leaf_count-index, not that minus one,
                # because when we encounter the close quote, we are already one index
                # past the end of the quoted span.
                result.append((span_begin, leaf_count - index, open_quote))

            # Quote stack is empty, assume quoted span starts from beginning of string
            else:
                if leaf.lex == "''":
                    quote_type = "``"
                elif leaf.lex == "'":
                    quote_type = "`"
                else:
                    err("spans: should not reach")

                result.append((None, leaf_count - index, quote_type))

        # Only advance the index for a leaf corresponding to a CCGbank leaf
        else:
            index += 1

    # While open quotes are still on the stack, assume quoted span continues to end of string
    while quote_stack:
        remaining_quote, span_begin = quote_stack.pop()
        if remaining_quote in ("``", "`"):
            result.append((span_begin, None, remaining_quote))
        else:
            warn("Unexpected quote %s after exhausting input.",
                 remaining_quote)

    return result
Example #32
if __name__ == '__main__':
    try:
        import psyco
        psyco.full()
    except ImportError: pass
    
    from munge.ccg.parse import *

    file = "final/%s" % sys.argv[1]
    t = naive_label_derivation(parse_tree(
        open(file).readlines()[2*int(sys.argv[2])+1]))
    print t
    print "sent:"
    print "-----"
    print ' '.join(t.text())
    deps = mkdeps(t)
    
    print "deps:"
    print "-----"
    for l, r, cat, label in deps: print "%s|%s" % (l, r)
    
    print "leaves:"
    print "-------"
    for leaf in leaves(t):
        print leaf.lex, leaf.cat
        
    print "unhandled combs:"
    print "----------------"
    for comb in unanalysed:
        print comb
        
    print "finished:"
    print pprint(t)