Beispiel #1
0
    def output(self):
        '''Prints each of the four punctuation environments (e0..e3) with its
        (l, r, p) triples in descending frequency order plus examples; for the
        mirror-image pair of environments (index 0 vs 3) it also prints the
        alternative analysis when that one occurs no more often.'''
        # One printf-style heading per environment; the %s slots take l, r, p.
        headings = ("(%s ,) %s -> %s", "(, %s) %s -> %s", "%s (%s ,) -> %s", "%s (, %s) -> %s")
        for index, heading in enumerate(headings):
            # NOTE(review): assumes attributes e0..e3 (frequency dicts keyed by
            # (l, r, p)) and e0_examples..e3_examples exist -- confirm against
            # the collecting code.
            env_hash = getattr(self, "e%d" % index)
            print heading % ("X", "Y", "Z")
            print "-" * len(heading)

            examples_hash = getattr(self, "e%d_examples" % index)

            for (l, r, p), f in sorted_by_value_desc(env_hash):
                triple = heading % (l, r, p)
                print "% 10d [%28s] %-60s %s" % (
                    f,
                    analyse(C(l), C(r), C(p)),
                    triple,
                    " ".join(examples_hash[(l, r, p)]),
                )
                # Cross-reference the opposite environment (0 <-> 3): show the
                # alternative triple when its frequency is at most this one's.
                if index == 0 and (l, r, p) in self.e3 and self.e3[(l, r, p)] <= f:
                    alt_triple = headings[3] % (l, r, p)
                    alt_freq = self.e3[(l, r, p)]
                    print "* % 8d%32s%-60s %s" % (alt_freq, " " * 32, alt_triple, " ".join(self.e3_examples[(l, r, p)]))
                elif index == 3 and (l, r, p) in self.e0 and self.e0[(l, r, p)] <= f:
                    alt_triple = headings[0] % (l, r, p)
                    alt_freq = self.e0[(l, r, p)]
                    print "* % 8d%32s%-60s %s" % (alt_freq, " " * 32, alt_triple, " ".join(self.e0_examples[(l, r, p)]))
Beispiel #2
0
 def output(self):
     '''Prints the four punctuation environments e0..e3, each as a frequency-
     sorted table of (l, r, p) triples with examples, flagging the mirrored
     environment (0 <-> 3) when it is no more frequent.'''
     headings = ("(%s ,) %s -> %s",
                 "(, %s) %s -> %s",
                 "%s (%s ,) -> %s",
                 "%s (, %s) -> %s")
     for index, heading in enumerate(headings):
         # NOTE(review): relies on attributes e0..e3 and e0_examples..e3_examples
         # being populated elsewhere -- verify against the collection code.
         env_hash = getattr(self, 'e%d' % index)
         print heading % ('X', 'Y', 'Z')
         print "-" * len(heading)
         
         examples_hash = getattr(self, 'e%d_examples' % index)
         
         for (l, r, p), f in sorted_by_value_desc(env_hash):
             triple = heading % (l, r, p)
             print "% 10d [%28s] %-60s %s" % (f, analyse(C(l), C(r), C(p)), triple, ' '.join(examples_hash[(l, r, p)]))
             # For the mirror pair of environments, show the alternative
             # analysis when its frequency does not exceed this one's.
             if (index == 0 and (l, r, p) in self.e3 and self.e3[(l, r, p)] <= f):
                 alt_triple = headings[3] % (l, r, p)
                 alt_freq = self.e3[(l, r, p)]
                 print "* % 8d%32s%-60s %s" % (alt_freq, " "*32, alt_triple, 
                                                 ' '.join(self.e3_examples[(l, r, p)]))
             elif (index == 3 and (l, r, p) in self.e0 and self.e0[(l, r, p)] <= f):
                 alt_triple = headings[0] % (l, r, p)
                 alt_freq = self.e0[(l, r, p)]
                 print "* % 8d%32s%-60s %s" % (alt_freq, " "*32, alt_triple, 
                                                 ' '.join(self.e0_examples[(l, r, p)]))
Beispiel #3
0
 def output(self):
     as_left = {}
     as_right = {}
     for (l, r, p), f in self.envs.iteritems():
         if l == ',':
             as_left[(l, r, p)] = f
         else:
             as_right[(l, r, p)] = f
     
     print ", _ -> _"
     print "--------"
     for (l, r, p), f in sorted_by_value_desc(as_left):
         print "% 10d [%28s] %s %20s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
     print "_ , -> _"
     print "--------"
     for (l, r, p), f in sorted_by_value_desc(as_right):
         print "% 10d [%28s] %20s %s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
Beispiel #4
0
    def output(self):
        as_left = {}
        as_right = {}
        for (l, r, p), f in self.envs.iteritems():
            if l == ",":
                as_left[(l, r, p)] = f
            else:
                as_right[(l, r, p)] = f

        print ", _ -> _"
        print "--------"
        for (l, r, p), f in sorted_by_value_desc(as_left):
            print "% 10d [%28s] %s %20s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
        print "_ , -> _"
        print "--------"
        for (l, r, p), f in sorted_by_value_desc(as_right):
            print "% 10d [%28s] %20s %s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
Beispiel #5
0
def ccg2latex(root, glosses=None, abbreviate=False):
    '''Renders the derivation _root_ as LaTeX \deriv markup.

    glosses -- optional sequence of glosses zipped against the leaves.
    abbreviate -- False: no abbreviation; an xrange: abbreviate only
        categories whose leaf index (or whole span) lies inside it;
        any other truthy value: abbreviate every category.
    '''
    def comb_symbol(comb):
        # LaTeX macro name for a combinator's underline; plain uline if unknown.
        return arrows.get(comb, 'uline')
    def cat_repr(cat, i):
        # String form of _cat_, abbreviated when _i_ (a leaf index or a span)
        # falls within the requested abbreviation range.
        # NOTE(review): internal-node spans are passed as range(...) lists, not
        # xrange objects, so the isinstance(i, xrange) branch never fires for
        # them; also builtin xrange has no .start/.end attributes -- this may
        # rely on a project-local xrange replacement. Confirm.
        cat_str = str(cat)
        if abbreviate is not False:
            if isinstance(abbreviate, xrange):
                if isinstance(i, int):
                    if i in abbreviate:
                        cat_str = abbr(cat_str)
                elif isinstance(i, xrange):
                    if abbreviate.start <= i.start < i.end <= abbreviate.end:
                        cat_str = abbr(cat_str)
            else:
                cat_str = abbr(cat_str)

        return sanitise_category(cat_str)
        
    out = ['\deriv{%d}{' % root.leaf_count()]
    all_leaves = list(leaves(root))
    
    # lex line
    if glosses is not None:
        leaf_bits = ("\\glosN{%s}{%s}" % (leaf.lex, gloss) for (leaf, gloss) in izip(all_leaves, glosses))
    else:
        leaf_bits = (("\\cjk{%s}" % leaf.lex) for leaf in all_leaves)
    out.append(' & '.join(leaf_bits) + '\\\\')
    
    # underlines line
    out.append( ' & '.join(["\uline{1}"] * root.leaf_count()) + '\\\\' )
    # cats line
    out.append( (' & '.join(("\\cf{%s}"%cat_repr(leaf.cat, i) for i, leaf in enumerate(all_leaves)))) + '\\\\' )
    
    # One row per internal node, bottom-up:
    # (leftmost leaf id, category, combinator, span width).
    rows = []
    for l, r, p in pairs_postorder(root):
        rows.append( (min_leaf_id(p, root), p.cat, analyse(l.cat, r and r.cat, p.cat), p.leaf_count()) )
        
    grouped_subrows = group(rows)
        
    for subrows in grouped_subrows:
        subline = []
        subout = []
        last_span = 0 # holds the index of the rightmost span in this row
        
        for leftmost_leaf_id, cat, comb, span in subrows:
            # Pad with '&' to reach this cell's column, then emit the
            # combinator underline and the multi-column category cell.
            subline.append( "&"*(leftmost_leaf_id - last_span) + ("\%s{%s}" % (comb_symbol(comb), span)) )
            subout.append(  "&"*(leftmost_leaf_id - last_span) + ("\mc{%d}{%s}" % (span,
                cat_repr(cat, range(leftmost_leaf_id, leftmost_leaf_id+span)))) )
            
            last_span = leftmost_leaf_id+span-1

        # write out underlines line
        out.append(' '.join(subline) + '\\\\')
        # write out cats line
        out.append(' '.join(subout) + '\\\\')

    out.append('}')
    return '\n'.join(out)
Beispiel #6
0
    def output(self):
        as_left = {}
        as_right = {}
        
        for (side, l, r, p), f in self.envs.iteritems():
            if side == AnalyseAbsorption.LEFT:
                as_left[ (l, r, p) ] = f
            elif side == AnalyseAbsorption.RIGHT:
                as_right[ (l, r, p) ] = f

        print "(X ,) Y -> Z"
        print "--------"
        for (l, r, p), f in sorted_by_value_desc(as_left):
            print "% 10d [%28s] (%s ,) %s -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)

        print "X (, Y) -> Z"
        print "--------"
        for (l, r, p), f in sorted_by_value_desc(as_right):
            print "% 10d [%28s] %s (, %s) -> %s" % (f, analyse(C(l), C(r), C(p)), l, r, p)
Beispiel #7
0
    def accept_derivation(self, bundle):
        '''Counts left/right punctuation absorptions among the derivation's
        internal nodes.'''
        internal_nodes = (n for n in nodes(bundle.derivation) if not n.is_leaf())
        for node in internal_nodes:
            self.total += 1

            rule = analyse(node.lch.cat, node.rch and node.rch.cat, node.cat)
            if rule == 'l_punct_absorb':
                self.l += 1
            elif rule == 'r_punct_absorb':
                self.r += 1
            else:
                # Any other rule (including unrecognised ones) is lumped here.
                self.other += 1
Beispiel #8
0
 def accept_derivation(self, bundle):
     '''Accumulates counts of punctuation-absorption rules over one derivation.'''
     for node in nodes(bundle.derivation):
         # Leaves carry no rule application.
         if node.is_leaf():
             continue

         self.total += 1

         combinator = analyse(node.lch.cat, node.rch and node.rch.cat, node.cat)
         if combinator == 'l_punct_absorb':
             self.l += 1
         elif combinator == 'r_punct_absorb':
             self.r += 1
         else:
             self.other += 1
Beispiel #9
0
def percolate(deriv):
    '''This percolates mode changes made at the leaves up the derivation tree.

    Walks sibling pairs in level order; for each recognised combinator it
    copies slash modes from the children onto the parent category in place.'''
    for lch, rch in level_order_pairs(deriv):
        parent = lch.parent # (== rch.parent)
        
        if rch or parent: # If lch is not the root
            comb = analyse(lch.cat, rch and rch.cat, parent.cat)
            
            if str(comb) == 'bwd_r1xcomp':
                # (Y/a)/Z X\Y -> (X/a)/Z
                #      ^  ^       ^   ^
                #      |  |_______|   |
                #      |______________|
                
                copy_modes(lch.cat, parent.cat)
                copy_modes(rch.cat.left, parent.cat.left.left)
                
            elif str(comb).endswith('comp'): # is composition
                lmode = lch.cat.mode # lch and rch are both necessarily compound
                rmode = rch.cat.mode
                # also check that lmode and rmode include composition
                # maybe issue message saying broken derivation otherwise

                # The composed slash takes the more restrictive of the two modes.
                parent.cat.mode = mode_min(lmode, rmode)

                if comb in ("fwd_comp", "fwd_xcomp"): # X/Y Y/Z -> X/Z or X/Y Y\Z -> X\Z
                    # copy modes from arguments to result
                    copy_modes(lch.cat.left, parent.cat.left)
                    copy_modes(rch.cat.right, parent.cat.right)
                elif comb in ("bwd_comp", "bwd_xcomp"): # Y\Z X\Y -> X\Z or Y/Z X\Y -> X/Z
                    copy_modes(rch.cat.left, parent.cat.left)
                    copy_modes(lch.cat.right, parent.cat.right)

            elif str(comb).endswith('type'): # is type raising
                # uses default mode (which is :all)
                pass
    
            elif str(comb) == "r_punct_absorb": # X ; -> X
                copy_modes(lch.cat, parent.cat)

            elif str(comb).endswith('absorb') or str(comb) == "conjoin":
                # Left absorption and conjunction both keep the right child's category.
                copy_modes(rch.cat, parent.cat)

            else: # assume application
                if comb == "fwd_appl": # X/Y Y -> X
                    copy_modes(lch.cat.left, parent.cat)
                elif comb == "bwd_appl": # Y X\Y -> X
                    copy_modes(rch.cat.left, parent.cat)
Beispiel #10
0
def make_derivation(deriv, assigned_id=None, leaf_id=0):
    '''Generates the body of the DOT representation.

    assigned_id -- node identifier to reuse for _deriv_ (a fresh one is
        generated when None and _deriv_ is internal).
    leaf_id -- index of the leftmost leaf under _deriv_, used only for
        labelling when write_tree_indices is on.'''
    
    if deriv.is_leaf():
        if write_tree_indices:
            label = "%d %s" % (leaf_id, deriv.label_text())
        else:
            label = deriv.label_text()

        return '''%s [shape="none",height=0.17,label="%s"]\n''' % (assigned_id, label)
        
    else:
        ret = []
        root_id = assigned_id or get_id()

        for i, child in enumerate(deriv):
            child_id = get_id()

            # NOTE(review): this isinstance test is on _deriv_, not _child_,
            # so it is loop-invariant -- confirm that is intended.
            if isinstance(deriv, (ccg.Leaf, ccg.Node)):
                comb_name = re.escape(Abbreviations.get(analyse(deriv.lch.cat, deriv.rch and deriv.rch.cat, deriv.cat), ''))
                
                if comb_name:
                    # Record shape carries both the node label and the combinator.
                    shape_type = "record"
                    label_text = "<o>%s|%s" % (deriv.label_text(), comb_name)
                else:
                    shape_type = "box"
                    label_text = deriv.label_text()
                    
                ret.append('''%s [shape="%s",height=0.1,label="%s"]\n''' % (root_id, shape_type, label_text))

                if config.highlight_head_arrows and i == int(deriv.head_index):
                    ret.append("%s:o -> %s:o [color=red]\n" % (root_id, child_id))
                else:
                    ret.append("%s:o -> %s:o\n" % (root_id, child_id))
                    
                ret.append(make_derivation(child, child_id, leaf_id=leaf_id))
                # Advance past all leaves under this child for the next sibling.
                leaf_id += len(list(leaves(child)))
                
            else:
                ret.append('''%s [shape="box",height=0.1,label="%s"]\n''' % (root_id, deriv.label_text()))
                ret.append("%s -> %s\n" % (root_id, child_id)) 
                ret.append(make_derivation(child, child_id, leaf_id=leaf_id))
                leaf_id += len(list(leaves(child)))

        return ''.join(ret)
Beispiel #11
0
def applications_per_slash_with_path(orig_path,
                                     slash_count,
                                     examine_modes=False):
    '''Given a category, returns a list whose index _i_ is the rule which consumed its _i_th slash, or None
if it was not consumed.

    orig_path -- iterable of (l, r, was_flipped) steps towards the root.
    slash_count -- number of slashes to track.
    examine_modes -- forwarded to analyse().'''
    result = []

    for slash in xrange(slash_count):
        consumer = None  # the rule which consumed this slash, if any
        first = True

        # We need to copy the path for each slash, because in each iteration we label
        # the categories in-place.
        orig_path, path = tee(orig_path, 2)

        for (prev_l, prev_r,
             prev_was_flipped), (l, r, was_flipped) in each_pair(path):
            if first:
                # Label the starting category once, on whichever side the
                # path begins.
                if prev_was_flipped and prev_r:
                    prev_r.labelled()
                elif not prev_was_flipped:
                    prev_l.labelled()
                first = False

            cur = r if was_flipped else l
            prev_cur = prev_r if prev_was_flipped else prev_l

            rule = analyse(prev_l, prev_r, cur, examine_modes)
            label_result(cur, prev_cur, rule, prev_was_flipped)

            # Which child category did this rule consume?
            if rule == 'fwd_appl': consumed_category = prev_l
            elif rule == 'bwd_appl': consumed_category = prev_r
            elif rule in ('fwd_comp', 'bwd_comp', 'bwd_xcomp', 'fwd_xcomp'):
                consumed_category = prev_cur
            else:
                consumed_category = None

            if consumed_category and consumed_category.label == slash:
                consumer = rule
                break

        result.append(consumer)

    return result
Beispiel #12
0
def _label_result(l, r, p):
    '''Propagates slash labels from the children _l_/_r_ onto the parent _p_,
    according to which combinator produced _p_.'''
    L, R, P = l.cat, r.cat if r else None, p.cat
    app = analyse(L, R, P)
    # Unrecognised rule: nothing to propagate.
    if not app: return

    #    print '> %s %s %s %s' % (app, L, R, p.cat)
    if app == 'fwd_appl':  # X/Y Y
        if L.left.label is not None:
            P.labelled(L.left.label)
    elif app == 'bwd_appl':  # Y X\Y
        if R.left.label is not None:
            P.labelled(R.left.label)
    elif (app in ('fwd_raise', 'bwd_raise')
          or app.endswith('gap_topicalisation')):  # X -> T|(T|X)
        if L.label is not None:
            P.right.right.labelled(L.label)
    elif app in ('fwd_comp', 'fwd_xcomp'):  # assume left headed
        if L.left.label is not None:
            P.left.labelled(L.left.label)
    elif app in ('bwd_comp', 'bwd_xcomp'):
        if R.left.label is not None:
            P.left.labelled(R.left.label)
Beispiel #13
0
def applications_per_slash_with_path(orig_path, slash_count, examine_modes=False):
    '''Given a category, returns a list whose index _i_ is the rule which consumed its _i_th slash, or None
if it was not consumed.

    orig_path is an iterable of (l, r, was_flipped) steps towards the root;
    examine_modes is forwarded to analyse().'''
    result = []

    for slash in xrange(slash_count):
        consumer = None # the rule which consumed this slash, if any
        first = True
        
        # We need to copy the path for each slash, because in each iteration we label
        # the categories in-place.
        orig_path, path = tee(orig_path, 2)
        
        for (prev_l, prev_r, prev_was_flipped), (l, r, was_flipped) in each_pair(path):
            if first:
                # Label the starting category once, on the active side.
                if prev_was_flipped and prev_r:
                    prev_r.labelled()
                elif not prev_was_flipped:
                    prev_l.labelled()
                first = False

            cur      = r      if was_flipped      else l
            prev_cur = prev_r if prev_was_flipped else prev_l

            rule = analyse(prev_l, prev_r, cur, examine_modes)
            label_result(cur, prev_cur, rule, prev_was_flipped)

            # Determine which child category the rule consumed, if any.
            if   rule == 'fwd_appl': consumed_category = prev_l
            elif rule == 'bwd_appl': consumed_category = prev_r
            elif rule in ('fwd_comp', 'bwd_comp', 'bwd_xcomp', 'fwd_xcomp'): consumed_category = prev_cur
            else: consumed_category = None

            if consumed_category and consumed_category.label == slash:
                consumer = rule
                break

        result.append( consumer )

    return result
Beispiel #14
0
def filter_fn(node):
    '''True unless the rule at _node_ is punctuation absorption or funny conjunction.'''
    excluded = ('l_punct_absorb', 'r_punct_absorb', 'funny_conj')
    return analyse(node[0].category, node[1].category, node.category) not in excluded
Beispiel #15
0
 def output(self):
     for (l, r, p), freq in sorted_by_value_desc(self.counts):
         print "%8d | %15s  %-15s -> %-15s [%s]" % (freq, l, r, p, analyse(l, r, p))
Beispiel #16
0
def is_ucp(l, r, p):
    '''Detects unlike coordinated phrases: a coordinator as the left child,
    a [conj]-featured parent, and a parent differing from the right child.'''
    if r is None:
        return False

    is_coordinator = l in (conj, C('LCM'), C(','))
    return is_coordinator and p.has_feature('conj') and p != r

for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False
        
        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue
            
            lrp = map(lambda e: e and e.cat, (node[0], node[1] if node.count() > 0 else None, node))
            
            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))
            
            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
                has_ucp = True
            else:
                # Split unrecognised rules by type
                if r:
                    binary[rule_tuple] += 1
                else:
Beispiel #17
0
def applications_with_path(path):
    '''Yields a sequence of rule applications applied along a _path_ to the root.'''
    for previous, current in each_pair(path):
        prev_l, prev_r, _ = previous
        cur_l, cur_r, was_flipped = current
        # The parent category is whichever side of the current step is active.
        yield analyse(prev_l, prev_r, cur_r if was_flipped else cur_l)
Beispiel #18
0
    def fix_categories_starting_from(self, node, until):
        '''Adjusts category labels from _node_ to _until_ (not inclusive) to obtain the correct
CCG analysis.

        Walks parent links upward; whenever analyse() fails to recognise the
        rule at a binary parent, tries a series of in-place repairs
        (type-raising a child, coordination/absorption fixes, composition).'''
        while node is not until:
            # Only fix binary rules
            if (not node.parent) or node.parent.count() < 2: break

            l, r, p = node.parent[0], node.parent[1], node.parent
            L, R, P = (n.category for n in (l, r, p))
            debug("L: %s R: %s P: %s", L, R, P)

            applied_rule = analyse(L, R, P)
            debug("[ %s'%s' %s'%s' -> %s'%s' ] %s", L, ''.join(l.text()), R,
                  ''.join(r.text()), P, ''.join(p.text()), applied_rule)

            if applied_rule is None:
                debug("invalid rule %s %s -> %s", L, R, P)

                # First repair pass: type-raise one child so the rule becomes
                # a composition. Note this if/elif is separate from the chain
                # below, so a second repair may run on the same node.
                if R.is_complex() and R.left.is_complex(
                ) and L == R.left.right:
                    # L       (X|L)|Y -> X|Y becomes
                    # X|(X|L) (X|L)|Y -> X|Y
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                    node.parent[0] = Node(l.tag, [l],
                                          new_category,
                                          head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                elif L.is_complex() and L.left.is_complex(
                ) and R == L.left.right:
                    # (X|R)|Y R       -> X|Y  becomes
                    # (X|R)|Y X|(X|R) -> X|Y
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                    node.parent[1] = Node(r.tag, [r],
                                          new_category,
                                          head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # conj R -> P
                # Make P into R[conj]
                # L cannot be the comma category (,), otherwise we get a mis-analysis
                # in 2:22(5)
                if str(L) in ('conj', 'LCM'):
                    p.category = R.clone_adding_feature('conj')
                    debug("New category: %s", p.category)

                # L R[conj] -> P
                elif R.has_feature('conj'):
                    new_L = L.clone()

                    r.category = new_L.clone_adding_feature('conj')
                    p.category = new_L

                    debug("New category: %s", new_L)

                elif L.is_leaf():
                    # , R -> P[conj] becomes , R -> R[conj]
                    if P.has_feature('conj') and l.tag in (
                            'PU', 'CC'):  # treat as partial coordination
                        debug("Fixing coordination: %s" % P)
                        p.category = r.category.clone_adding_feature('conj')
                        debug("new parent category: %s" % p.category)

                    # , R -> P becomes , R -> R
                    elif l.tag == "PU" and not P.has_feature(
                            'conj'):  # treat as absorption
                        debug("Fixing left absorption: %s" % P)
                        p.category = r.category

                    # L       (X|L)|Y -> X|Y becomes
                    # X|(X|L) (X|L)|Y -> X|Y
                    elif R.is_complex() and R.left.is_complex(
                    ) and L == R.left.right:
                        T = R.left.left
                        new_category = typeraise(L, T, TR_FORWARD)  #T/(T|L)
                        node.parent[0] = Node(l.tag, [l],
                                              new_category,
                                              head_index=0)

                        new_parent_category = fcomp(new_category, R)
                        if new_parent_category:
                            debug("new parent category: %s",
                                  new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                elif R.is_leaf():
                    # R , -> P becomes R , -> R
                    if r.tag == "PU":  # treat as absorption
                        debug("Fixing right absorption: %s" % P)
                        p.category = l.category

                    # (X|R)|Y R       -> X|Y  becomes
                    # (X|R)|Y X|(X|R) -> X|Y
                    elif L.is_complex() and L.left.is_complex(
                    ) and R == L.left.right:
                        T = L.left.left
                        new_category = typeraise(R, T, TR_BACKWARD)  #T|(T/R)
                        node.parent[1] = Node(r.tag, [r],
                                              new_category,
                                              head_index=0)

                        new_parent_category = bxcomp(L, new_category)
                        if new_parent_category:
                            debug("new parent category: %s",
                                  new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                else:
                    new_parent_category = None

                    # try typeraising fix
                    # T/(T/X) (T\A)/X -> T can be fixed:
                    # (T\A)/((T\A)/X) (T\A)/X -> T\A
                    if self.is_topicalisation(L) and (L.right.right == R.right
                                                      and P == L.left
                                                      and P == R.left.left):
                        T_A = R.left
                        X = R.right

                        l.category = T_A / (T_A / X)
                        new_parent_category = T_A

                    # (X|X)|Z Y       -> X becomes
                    # (X|X)|Z X|(X|X) -> X|Z
                    elif L.is_complex() and L.left.is_complex(
                    ) and R == L.left.right:
                        T = L.left.left
                        new_category = typeraise(
                            R, R, TR_BACKWARD, strip_features=False)  #T/(T|L)
                        node.parent[1] = Node(r.tag, [r],
                                              new_category,
                                              head_index=0)

                        new_parent_category = bxcomp(L, new_category)
                        if new_parent_category:
                            debug("new parent category: %s",
                                  new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                    # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                    elif self.is_verbal_category(
                            L) and L.is_complex() and L.left.is_complex():
                        T = L.left.right
                        new_category = typeraise(R, T, TR_BACKWARD)
                        debug('Trying out %s', new_category)

                        if bxcomp(L, new_category):
                            node.parent[1] = Node(r.tag, [r],
                                                  new_category,
                                                  head_index=0)
                            new_parent_category = bxcomp(L, new_category)

                    # Last ditch: try all of the composition rules to generalise over L R -> P
                    if not new_parent_category:
                        # having fxcomp creates bad categories in NP(IP DEC) construction (1:97(3))
                        # but, we need fxcomp to create the gap NP-TPC NP-SBJ(*T*) VP, so allow it when the rhs doesn't look like the DEC category
                        new_parent_category = (
                            fcomp(L, R)
                            or bcomp(L, R, when=not self.is_relativiser(R))
                            or bxcomp(
                                L, R, when=not self.is_relativiser(R)
                            )  #or bxcomp2(L, R, when=self.is_verbal_category(L)) 
                            or fxcomp(L, R, when=not self.is_relativiser(R)))

                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category
                    else:
                        debug("couldn't fix, skipping")

            node = node.parent
            debug('')
Beispiel #19
0
def combinators_and_path_from_node(node):
    '''Yields (combinator, (l, r, p)) for each step on the category path from
    _node_ up to the root.'''
    for (parent_l, parent_r, _), (cur_l, cur_r, was_flipped) in each_pair(category_path_to_root(node)):
        result = cur_r if was_flipped else cur_l
        yield analyse(parent_l, parent_r, result), (parent_l, parent_r, result)
Beispiel #20
0
 def output(self):
     for (l, r, p), freq in sorted_by_value_desc(self.counts):
         print "%8d | %15s  %-15s -> %-15s [%s]" % (freq, l, r, p,
                                                    analyse(l, r, p))
Beispiel #21
0
def combinators_and_path_from_node(node):
    '''Walks from _node_ to the root, yielding each applied rule together with
    its (l, r, p) triple.'''
    path = category_path_to_root(node)
    for (pl, pr, _), (cl, cr, flipped) in each_pair(path):
        triple = (pl, pr, cr if flipped else cl)
        yield analyse(*triple), triple
Beispiel #22
0
    def fix_categories_starting_from(self, node, until):
        '''Adjusts category labels from _node_ to _until_ (not inclusive) to obtain the correct
CCG analysis.

        Moves upward through parents; at each binary parent whose rule
        analyse() cannot recognise, applies in-place repairs (type-raising,
        coordination/absorption fixes, then generalised composition).'''
        while node is not until:
            # Only fix binary rules
            if (not node.parent) or node.parent.count() < 2: break

            l, r, p = node.parent[0], node.parent[1], node.parent
            L, R, P = (n.category for n in (l, r, p))
            debug("L: %s R: %s P: %s", L, R, P)

            applied_rule = analyse(L, R, P)
            debug("[ %s'%s' %s'%s' -> %s'%s' ] %s",
                L, ''.join(l.text()), R, ''.join(r.text()), P, ''.join(p.text()),
                applied_rule)

            if applied_rule is None:
                debug("invalid rule %s %s -> %s", L, R, P)
                
                # Repair pass 1: type-raise a child to turn the rule into a
                # composition. This if/elif is independent of the chain below,
                # so a second repair can also fire for the same node.
                if R.is_complex() and R.left.is_complex() and L == R.left.right:
                    # L       (X|L)|Y -> X|Y becomes
                    # X|(X|L) (X|L)|Y -> X|Y
                    T = R.left.left
                    new_category = typeraise(L, T, TR_FORWARD)#T/(T|L)
                    node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                    new_parent_category = fcomp(new_category, R)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)
                
                elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                    # (X|R)|Y R       -> X|Y  becomes
                    # (X|R)|Y X|(X|R) -> X|Y
                    T = L.left.left
                    new_category = typeraise(R, T, TR_BACKWARD)#T|(T/R)
                    node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                    new_parent_category = bxcomp(L, new_category)
                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category

                    debug("New category: %s", new_category)

                # conj R -> P
                # Make P into R[conj]
                # L cannot be the comma category (,), otherwise we get a mis-analysis
                # in 2:22(5)
                if str(L) in ('conj', 'LCM'):
                    p.category = R.clone_adding_feature('conj')
                    debug("New category: %s", p.category)

                # L R[conj] -> P
                elif R.has_feature('conj'):
                    new_L = L.clone()

                    r.category = new_L.clone_adding_feature('conj')
                    p.category = new_L

                    debug("New category: %s", new_L)

                elif L.is_leaf():
                    # , R -> P[conj] becomes , R -> R[conj]
                    if P.has_feature('conj') and l.tag in ('PU', 'CC'): # treat as partial coordination
                        debug("Fixing coordination: %s" % P)
                        p.category = r.category.clone_adding_feature('conj')
                        debug("new parent category: %s" % p.category)
                        
                    # , R -> P becomes , R -> R
                    elif l.tag == "PU" and not P.has_feature('conj'): # treat as absorption
                        debug("Fixing left absorption: %s" % P)
                        p.category = r.category

                    # L       (X|L)|Y -> X|Y becomes
                    # X|(X|L) (X|L)|Y -> X|Y
                    elif R.is_complex() and R.left.is_complex() and L == R.left.right:
                        T = R.left.left
                        new_category = typeraise(L, T, TR_FORWARD)#T/(T|L)
                        node.parent[0] = Node(l.tag, [l], new_category, head_index=0)

                        new_parent_category = fcomp(new_category, R)
                        if new_parent_category:
                            debug("new parent category: %s", new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)
                        
                elif R.is_leaf():
                    # R , -> P becomes R , -> R
                    if r.tag == "PU": # treat as absorption
                        debug("Fixing right absorption: %s" % P)
                        p.category = l.category

                    # (X|R)|Y R       -> X|Y  becomes
                    # (X|R)|Y X|(X|R) -> X|Y
                    elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                        T = L.left.left
                        new_category = typeraise(R, T, TR_BACKWARD)#T|(T/R)
                        node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                        new_parent_category = bxcomp(L, new_category)
                        if new_parent_category:
                            debug("new parent category: %s", new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)

                else:
                    new_parent_category = None
                    
                    # try typeraising fix
                    # T/(T/X) (T\A)/X -> T can be fixed:
                    # (T\A)/((T\A)/X) (T\A)/X -> T\A
                    if self.is_topicalisation(L) and (
                        L.right.right == R.right and
                        P == L.left and P == R.left.left):
                        T_A = R.left
                        X = R.right

                        l.category = T_A/(T_A/X)
                        new_parent_category = T_A
                        
                    # (X|X)|Z Y       -> X becomes
                    # (X|X)|Z X|(X|X) -> X|Z
                    elif L.is_complex() and L.left.is_complex() and R == L.left.right:
                        T = L.left.left
                        new_category = typeraise(R, R, TR_BACKWARD, strip_features=False)#T/(T|L)
                        node.parent[1] = Node(r.tag, [r], new_category, head_index=0)

                        new_parent_category = bxcomp(L, new_category)
                        if new_parent_category:
                            debug("new parent category: %s", new_parent_category)
                            p.category = new_parent_category

                        debug("New category: %s", new_category)
                                            
                    # Generalise over right modifiers of verbal categories (S[dcl]\X)$
                    elif self.is_verbal_category(L) and L.is_complex() and L.left.is_complex():
                        T = L.left.right
                        new_category = typeraise(R, T, TR_BACKWARD)
                        debug('Trying out %s', new_category)
                        
                        if bxcomp(L, new_category):
                            node.parent[1] = Node(r.tag, [r], new_category, head_index=0)
                            new_parent_category = bxcomp(L, new_category)

                    # Last ditch: try all of the composition rules to generalise over L R -> P
                    if not new_parent_category:
                        # having fxcomp creates bad categories in NP(IP DEC) construction (1:97(3))
                        # but, we need fxcomp to create the gap NP-TPC NP-SBJ(*T*) VP, so allow it when the rhs doesn't look like the DEC category
                        new_parent_category = (fcomp(L, R) or bcomp(L, R, when=not self.is_relativiser(R)) 
                                            or bxcomp(L, R, when=not self.is_relativiser(R)) #or bxcomp2(L, R, when=self.is_verbal_category(L)) 
                                            or fxcomp(L, R, when=not self.is_relativiser(R)))

                    if new_parent_category:
                        debug("new parent category: %s", new_parent_category)
                        p.category = new_parent_category
                    else:
                        debug("couldn't fix, skipping")

            node = node.parent
            debug('')
Beispiel #23
0
def applications_with_path(path):
    '''Yields a sequence of rule applications applied along a _path_ to the root.'''
    for previous, current in each_pair(path):
        prev_l, prev_r, _ = previous
        cur_l, cur_r, was_flipped = current
        # The parent at this step is whichever current sibling is not the
        # node we descended through, as indicated by the flip flag.
        if was_flipped:
            parent = cur_r
        else:
            parent = cur_l
        yield analyse(prev_l, prev_r, parent)
Beispiel #24
0
def filter_fn(node):
    '''Rejects nodes licensed by punctuation absorption or the funny-conj rule.'''
    excluded = ('l_punct_absorb', 'r_punct_absorb', 'funny_conj')
    combinator = analyse(node[0].category, node[1].category, node.category)
    return combinator not in excluded
Beispiel #25
0
 def match_callback(self, match_node, bundle):
     '''Tallies the combinator which licensed _match_node_; unrecognised
     combinations are bucketed under 'other_typechange'.'''
     combinator = analyse(match_node.lch.cat, match_node.rch.cat, match_node.cat)
     if combinator is None:
         combinator = 'other_typechange'
     self.counts[combinator] += 1
Beispiel #26
0
    def match_callback(self, match_node, bundle):
        '''Counts the rule analysis of the matched node, using the key
        'other_typechange' when the analysis yields no combinator.'''
        key = analyse(match_node.lch.cat, match_node.rch.cat, match_node.cat)
        self.counts['other_typechange' if key is None else key] += 1
Beispiel #27
0
    if r is None: return False

    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r


for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False

        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue

            lrp = map(lambda e: e and e.cat,
                      (node[0], node[1] if node.count() > 0 else None, node))

            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))

            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
                has_ucp = True
            else:
                # Split unrecognised rules by type
                if r:
                    binary[rule_tuple] += 1
                else:
Beispiel #28
0
def mkdeps(root, postprocessor=identity):
    '''Extracts dependency tuples from the CCG derivation rooted at _root_.

    Works in three phases: (1) uniquify and label each leaf so its category
    slots carry its lexical item; (2) walk the internal nodes bottom-up,
    unifying category variables according to the combinator which licensed
    each local tree; (3) read one (lexical item, argument item, category,
    slash label) tuple off every filled argument slot, applying
    _postprocessor_ to each lexical item. Returns the set of such tuples.
    '''
    for i, leaf in enumerate(leaves(root)):
        # Uniquify each leaf with an index
        leaf.lex += IndexSeparatorTemplate % i
        # Apply the left to right slash labelling 
        # (we abuse this to refer to slots, not slashes)
        leaf.cat.parg_labelled()
        # Populate the outermost (_) variable of each leaf
        leaf.cat.slot.head.lex = leaf.lex

    # First pass over internal nodes: delegate result labelling to _label_result.
    for (l, r, p) in pairs_postorder(root):
        _label_result(l, r, p)

    # Module-level set accumulating combinator names the dispatch below
    # could not handle (added to in the final else branch).
    global unanalysed

    # Nodes rewritten by unary rules; these contribute dependencies
    # alongside the leaves in the collection phase below.
    unaries = []

    # Second pass: dispatch on the licensing combinator and unify the
    # category variables of each local tree (l r -> p) in postorder.
    for l, r, p in pairs_postorder(root):
        L, R, P = map(lambda x: x and x.cat, (l, r, p))
        comb = analyse(L, R, P)
        if not comb: debug("Unrecognised rule %s %s -> %s", L, R, P)

        unifier = []

        if config.debug:
            debug("%s %s %s (%s)", L, R, P, str(comb))

        if comb == 'fwd_appl': # [Xx/Yy]l Yy -> Xx
            unifier = unify(L.right, R)
            p.cat = L.left

        elif comb == 'bwd_appl': # Yy [Xx\Yy]r -> Xx
            unifier = unify(L, R.right)
            p.cat = R.left

        # Pro-drops which drop their outer argument
        # [(S_\NPy)_/NPx]_ -> [S_\NPy]_
        elif comb in ('object_prodrop', 'vp_vp_object_prodrop', 
            'yi_subject_prodrop', 'vp_modifier_subject_prodrop'):
            p.cat = L.left

        # [Xx/Yy]l [Yy/Zz]r -> [Xx/Zz]r
        elif comb == 'fwd_comp': # X/Y Y/Z -> X/Z
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        # [Yy\Zz]l [Xx\Yy]r -> [Xx\Zz]l
        elif comb == 'bwd_comp': # Y\Z X\Y -> X\Z
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb in ('s_np_apposition', 'vp_np_apposition'): # { S[dcl], S[dcl]\NP } NPy -> NPy
            P.slot = R.slot # = copy_vars
            unifier = unify(P, R)

        # NP NP -> N/N
        elif comb == 'np_np_to_nfn_apposition':
            # do the same as NP NP -> NP, except fill in the vars Ny/Ny
            P.right.slot.var = fresh_var(prefix='N')
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)
            make_set_head_from(l, r, p)

        elif comb in ('conjoin', 'np_np_apposition'): # X X[conj] -> X
            make_set_head_from(l, r, p)

        elif comb in ('conj_absorb', 'conj_comma_absorb'): # conj X -> X[conj]
            copy_vars(frm=R, to=P)
            unify(P, R) # R.slot.head = P.slot.head

        elif comb == 'funny_conj': # conj X -> X
            p.cat = R

        elif comb == 'nongap_topicalisation': # {N, NP, S[dcl], QP}x -> [Sy/Sy]x
            P.slot = L.slot
            P.right.slot.var = fresh_var()
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb in ('np_gap_topicalisation', 's_gap_topicalisation', 'qp_gap_topicalisation'): # NPx -> [ Sy/(Sy/NPx)y ]y
            P.right.right.slot = L.slot
            P.slot.var = fresh_var()
            P.left.slot = P.right.left.slot = P.right.slot = P.slot

        elif comb == 'subject_prodrop': # (S[dcl]y\NPx)y -> S[dcl]y | [(S[dcl]y\NPx)y/NPz]y -> (S[dcl]y/NPz)y
            if P == parse_category(r'S[dcl]'):
                P.slot = L.slot
            elif P == parse_category(r'S[dcl]/NP'):
                P.slot = P.left.slot = L.slot
                P.right.slot = L.right.slot
            else:
                warn("Invalid parent category %s for subject prodrop.", P)

        elif comb == 'fwd_xcomp': # [Xx/Yy]l [Yy\Zz]r -> [Xx/Zz]r
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        elif comb == 'bwd_xcomp': # [Yy/Zz]l [Xx\Yy]r -> [Xx/Zz]l
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)

            # P.slot = L.slot
            P.slot.var = fresh_var(prefix='K')

            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right

        elif comb == 'bwd_r1xcomp': # [(Yy/Zz)k/Ww]l [Xx\Yy]r -> [(Xx\Zz)k/Ww]l
            # TODO: where should P's lexical head come from? L or R?

            unifier = unify(L.left.left, R.right)
            p.cat._left._left = R.left
            p.cat._left._right = L.left.right
            p.cat._right = L.right

        elif comb in ('fwd_raise', 'bwd_raise'): # Xx -> [ Tf|(Tf|Xx)f ]f
            # Three special-cased raised categories with hand-assigned
            # variable sharing, then the generic type-raising schema.
            if P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/(S[dcl]\NP))'):
                # (S[dcl]y\NPz)y -> [ (S[dcl]f\NPg)f/((S[dcl]f\NPg)f\(S[dcl]y\NPz)y)f ]f
                P.left.slot.var = P.left.left.slot.var = P.right.slot.var = P.slot.var = fresh_var() # f 
                P.left.right.slot.var = fresh_var() # g

                copy_vars(frm=P.left, to=P.right.left)
                copy_vars(frm=L,      to=P.right.right)

                unifier = unify(L, P.right.right)
            elif P == parse_category(r'((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)'):
                # NPy -> [ ((S[dcl]v\NPw)v/QPz)v \ ( ((S[dcl]v\NPw)v/QPz)v/NPy )v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.right.slot = \
                    P.left. left.slot = P.left. left.left.slot = \
                    P.right.left.slot = P.right.left.left.slot = \
                    P.right.left.left.left.slot = P.slot # v
#                P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z
                P.left.left.right.slot.var = fresh_var('W')
                P.right.left.left.right.slot = P.left.left.right.slot # w

                unifier = unify(L, P.right.right)
            elif P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/QP)'):
                # QPy -> [ (S[dcl]v\NPz)v \ ((S[dcl]v\NPz)v/QPy)v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.left.left.slot = \
                    P.right.slot = P.right.left.slot = P.right.left.left.slot = P.slot # v
#                P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z

                unifier = unify(L, P.right.right)
            else:
                P.slot.var = fresh_var()

                P.right.left.slot = P.left.slot = P.right.slot = P.slot
                P.right.right.slot = L.slot

                unifier = unify(L, P.right.right)

        elif comb == 'np_typechange':
            P.slot = L.slot # = copy_vars
            unifier = unify(P, L)

        elif comb == 'lcp_np_typechange':
            P.slot = L.slot
            unifier = unify(P, L)

        elif comb in ('lcp_sfs_typechange', 'lcp_nfn_typechange'):
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot

            P.slot = L.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'lcp_sbnpfsbnp_typechange':
            # [(Sy\NPz)y/(Sy\NPz)y]_
            P.left.slot.var = fresh_var()
            P.left.left.slot = P.right.left.slot = P.right.slot = P.left.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'null_relativiser_typechange': # Xy -> (Nf/Nf)y
            P.slot = L.slot

            if P == _NfN:
                P.left.slot.var = fresh_var()

                P.right.slot = P.left.slot

                register_unary(unaries, p, L.slot.head.lex)

            elif P == _NfNfNfN:
                P.left.slot.var = fresh_var()
                P.left.left.slot.var = fresh_var(prefix="G")

                P.left.right.slot = P.left.left.slot
                P.right.slot = P.left.slot

                register_unary(unaries, p, L.slot.head.lex)
            else:
                warn("Unhandled null relativiser typechange: %s -> %s", L, P)

        # [NP/NP]y -> NPy
        elif comb == 'de_nominalisation':
            P.slot = L.slot

            register_unary(unaries, p, L.slot.head.lex)

        # {M, QP}y -> (Nf/Nf)y
        elif comb == 'measure_word_number_elision':
            P.slot = L.slot

            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot

            register_unary(unaries, p, L.slot.head.lex)

        elif comb == 'l_punct_absorb': # , X -> X[conj]
            # need to put conj feature back on parent
            p.cat = R.clone_adding_feature('conj')

        elif comb == 'r_punct_absorb':
            p.cat = L

        elif R and L == R and is_rooted_in(parse_category('S'), L): # VCD (stopgap)
            make_set_head_from(l, r, p)

        else:
            debug('Unhandled combinator %s (%s %s -> %s)', comb, L, R, P)
            unanalysed.add(comb)

            P.slot = R.slot if R else L.slot

        for (dest, src) in unifier:
            if isinstance(src, (basestring, list)):
                # Fake bidirectional unification:
                # -------------------------------
                # If variable X has been unified with value v,
                # rewrite all mentions of v in the output category to point to variable X
                # (v is uniquified by concatenating it with an ID, so this should hold)            
                for subcat in p.cat.nested_compound_categories():
                    if subcat.slot.head.lex == src:
                        subcat.slot = dest.slot

        if config.debug:
            debug("> %s" % p.cat)
            debug('---')

            if config.fail_on_unassigned_variables:
                assert no_unassigned_variables(p.cat), "Unassigned variables in %s" % p.cat

    if config.debug:
        debug('unaries: %s', unaries)

    # Collect deps from arguments
    deps = []
    for l in chain( leaves(root), unaries ):
        if config.debug: debug("%s %s", l, l.cat)

        # Walk the spine of the category left-ward; each step exposes one
        # argument slot (the right child), stopping early at modifiers.
        C = l.cat
        while not C.is_leaf():
            arg = C.right
            if arg.slot.head.filler:
                #and not l.cat.left.slot == l.cat.right.slot):
        #        print "%s %s %s %s %s %s" % (C.slot.head.lex, C, arg.slot.head.lex, arg, l.cat, C.label)
                if C.label is None:
                    warn("Dependency generated on slash without label: %s %s", C, arg)
                deps.append( (C.slot.head.lex, arg.slot.head.lex, l.cat, C.label) )
            if is_modifier(C): break
            C = C.left

    # Produce dep pairs
    result = set()
    for depl, depr, head_cat, head_label in deps:
        # NOTE(review): seqify appears to expand an endpoint which may hold
        # several heads (e.g. from make_set_head_from) — confirm its contract.
        for sdepl in set(seqify(depl)):
            for sdepr in set(seqify(depr)):
                if not (sdepl and sdepr):
                    debug("Dependency with None: %s %s", sdepl, sdepr)
                    continue

                result.add( (postprocessor(sdepl), postprocessor(sdepr), head_cat, head_label) )

    if config.debug:
        for line in write_deps(result):
            debug(line)
    return result