def accept_derivation(self, bundle):
    """Flatten every NP-internal-structure node down to its leaves, then emit the derivation."""
    for candidate in nodes(bundle.derivation):
        if is_np_internal_structure(candidate):
            # replace the node's kids with the flat sequence of its leaves
            candidate.kids = list(leaves(candidate))
    self.write_derivation(bundle)
def accept_derivation(self, bundle):
    """Tally rule signatures over internal nodes; unary rules over a leaf get an extra tally."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        # unary rule whose single kid is a lexical item
        if node.count() == 1 and node[0].is_leaf():
            self.unary[self.signature(node)] += 1
        self.freqs[self.signature(node)] += 1
def accept_derivation(self, bundle):
    """For every IP-APP node, count the head word of its parent's second kid."""
    for node in nodes(bundle.derivation):
        if not node.tag.startswith('IP-APP'):
            continue
        sibling = node.parent[1]
        self.headfreqs[self.get_head(sibling)] += 1
def accept_derivation(self, bundle):
    """Count signatures of all internal nodes, tracking unary-over-leaf rules separately."""
    internal = (n for n in nodes(bundle.derivation) if not n.is_leaf())
    for node in internal:
        if node.count() == 1 and node[0].is_leaf():
            # single kid which is a lexical item: a unary rule
            self.unary[self.signature(node)] += 1
        self.freqs[self.signature(node)] += 1
def accept_derivation(self, bundle):
    """Classify each internal node by construction type and tally its kid-tag sequence."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        self.counts['total'] += 1
        tag_seq = ' '.join(kid.tag for kid in node)
        if is_predication(node):
            self.counts['predication'] += 1
            self.predication_kinds[tag_seq] += 1
        elif is_coordination(node):
            # coordination
            self.counts['coordination'] += 1
            self.coordination_kinds[tag_seq] += 1
        elif is_internal_structure(node):
            self.counts['structure'] += 1
        elif node[0].is_leaf():
            # head initial complementation
            self.counts['head-initial'] += 1
        elif node[-1].is_leaf():
            # head final complementation
            self.counts['head-final'] += 1
        elif is_apposition(node):
            self.counts['apposition'] += 1
            self.apposition_kinds[tag_seq] += 1
        elif is_modification(node):
            self.counts['modification'] += 1
            self.modification_kinds[tag_seq] += 1
        else:
            # everything else is treated as adjunction
            self.counts['adjunction'] += 1
            self.adjunction_kinds[tag_seq] += 1
def accept_derivation(self, bundle):
    """Count occurrences of each (left kid, right kid, parent) category triple."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        parent_cat = node.cat
        left_cat = node[0].cat
        # unary rules have no right kid
        right_cat = node[1].cat if node.count() > 1 else None
        self.counts[(left_cat, right_cat, parent_cat)] += 1
def accept(self, root):
    """Reject (return False) any derivation containing a category with a bad subcat index,
    recording the offending category's frequency."""
    for node in nodes(root):
        if has_bad_subcat(node.cat):
            self.bad_freqs[str(node.cat)] += 1
            return False
    return True
def accept_derivation(self, bundle):
    """Record which conjunction words (CC kids) occur under which coordination categories,
    maintaining both the forward and inverse frequency maps."""
    for node in nodes(bundle.derivation):
        if node.is_leaf() or not is_coordination(node):
            continue
        for cc in list(where(lambda kid: kid.tag == 'CC', node.kids)):
            self.conjs[base_tag(node.tag)][cc.lex] += 1
            self.inverse[cc.lex][base_tag(node.tag)] += 1
def accept_derivation(self, bundle):
    """Tally (left, right, parent) category triples for every internal node."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        lhs = node[0].cat
        rhs = node[1].cat if node.count() > 1 else None  # None for unary rules
        self.counts[(lhs, rhs, node.cat)] += 1
def accept_derivation(self, bundle): for node in nodes(bundle.derivation): if node.is_leaf(): continue if is_coordination(node): def get_tag(kid): if kid.tag in ('CC', 'PU'): return kid.lex else: return kid.tag print ' '.join(get_tag(kid) for kid in node)
def accept_derivation(self, bundle):
    """Tally how often a rule is a left punctuation absorption, a right one, or neither."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        self.total += 1
        # rch may be absent for unary rules, hence the short-circuit
        outcome = analyse(node.lch.cat, node.rch and node.rch.cat, node.cat)
        if outcome == 'l_punct_absorb':
            self.l += 1
        elif outcome == 'r_punct_absorb':
            self.r += 1
        else:
            self.other += 1
def accept_derivation(self, bundle):
    """Optionally merge verb compounds into single leaves and split interpunct-separated
    foreign names into NP-PN nodes, then emit the derivation if it passes self.accept."""
    global merge_verb_compounds
    if merge_verb_compounds:
        for node in nodes(bundle.derivation):
            if node.tag in self.MergedTags:
                # collapse the compound into one leaf carrying the concatenated lex
                merged_lex = ''.join(kid.lex for kid in leaves(node))
                replace_kid(node.parent, node,
                            Leaf(node.tag, merged_lex, node.parent))
    if normalise_foreign_names:
        for leaf in leaves(bundle.derivation):
            if self.is_candidate_foreign_name(leaf.lex):
                # split on the interpunct and re-parent the pieces under NP-PN
                pieces = [Leaf(leaf.tag, bit, None) for bit in leaf.lex.split(INTERPUNCT)]
                replace_kid(leaf.parent, leaf, Node('NP-PN', pieces))
    if self.accept(bundle.derivation):
        self.write_derivation(bundle)
def postprocess(root):
    """Apply post-labelling fixes to the derivation and return it.

    Currently: when the LCP->NP type-change is enabled, wrap LCP nodes that
    carry the 'l'/'r' tags in an extra NP layer.
    """
    use_lcp_to_np = config.lcp_to_np_typechange
    for node in nodes(root):
        # Exclude the conjuncts in LCP coordination: we want the LCP->NP promotion
        # to apply once to the result of the coordination
        if use_lcp_to_np and node.tag.startswith('LCP') and has_tags(node, 'lr'):
            # if we're in LCP coordination then we want to protect the conjuncts
            # from being converted
            new_node = Node('NP', [node])
            # NOTE(review): node.parent is reassigned *before* the replace_kid call
            # below, so replace_kid receives new_node (not the original parent) as
            # its parent argument -- confirm this ordering is intended.
            node.parent = new_node
            inherit_tag(new_node, node)
            replace_kid(node.parent, node, new_node)
    return root
def postprocess(root):
    """Apply post-labelling fixes to the derivation and return it.

    Currently: when the LCP->NP type-change is enabled, wrap LCP nodes that
    carry the 'l'/'r' tags in an extra NP layer.
    """
    use_lcp_to_np = config.lcp_to_np_typechange
    for node in nodes(root):
        # Exclude the conjuncts in LCP coordination: we want the LCP->NP promotion
        # to apply once to the result of the coordination
        if use_lcp_to_np and node.tag.startswith('LCP') and has_tags(node, 'lr'):
            # if we're in LCP coordination then we want to protect the conjuncts
            # from being converted
            new_node = Node('NP', [node])
            # NOTE(review): node.parent is reassigned *before* the replace_kid call
            # below, so replace_kid receives new_node (not the original parent) as
            # its parent argument -- confirm this ordering is intended.
            node.parent = new_node
            inherit_tag(new_node, node)
            replace_kid(node.parent, node, new_node)
    return root
def accept_derivation(self, bundle):
    """Index (head_index, parent category) pairs by kid-category signature,
    split into unary and binary rule tables with parallel frequency counts."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        entry = (int(node.head_index), str(node.cat))
        if node.count() == 1:
            key = str(node[0].cat)
            self.unary[key].add(entry)
            self.unary_freqs[(key, entry)] += 1
        else:
            key = (str(node[0].cat), str(node[1].cat))
            self.binary[key].add(entry)
            self.binary_freqs[(key, entry)] += 1
def accept_derivation(self, bundle): def kids_have_same_tag(node): def tags_are_equal(t1, t2): if t1[0] == 'V' and t2[0] == 'V': return True if t1[0] == 'N' and t2[0] == 'N': return True return t1 == t2 return all(tags_are_equal(node[0].tag, other.tag) for other in node[1:]) self.nderivs += 1 for node in nodes(bundle.derivation): if (node.count() > 1 and (not node.tag.startswith('NP')) and (not node.tag.startswith('ADJP')) and (not node.tag.startswith('FRAG')) and (not node.tag.startswith('FLR')) and (not base_tag(node.tag) in ('VCD', 'VRD', 'VCP', 'VNV', 'VPT', 'VSB')) and (not kids_have_same_tag(node)) and all(base_tag(kid.tag) in WordTags for kid in node)): self.nbad += 1 print node break
def multi_tgrep(deriv, query_callback_map):
    """Run several tgrep queries over one derivation, calling each query's callback
    on every node it matches (with captured context passed as keyword args)."""
    if not query_callback_map:
        raise RuntimeError('No query expressions given.')
    initialise()
    expressions = query_callback_map.keys()
    if _tgrep_debug:
        # dump the token stream of each query expression
        for expr in expressions:
            debug("Lexing %s", expr)
            lex.input(expr)
            for tok in iter(lex.token, None):
                debug("\t%s %s", tok.type, tok.value)
    parsed = [yacc.parse(expr) for expr in expressions]
    for node in nodes(deriv):
        for query_expr, query_str in izip(parsed, expressions):
            context = Context()
            if not query_expr.is_satisfied_by(node, context):
                continue
            callback = query_callback_map[query_str]
            if context:
                callback(node, **smash_key_case(context))
            else:
                callback(node)
def multi_tgrep(deriv, query_callback_map):
    """Evaluate a batch of tgrep queries against every node of a derivation,
    dispatching matches to the corresponding callbacks."""
    if not query_callback_map:
        raise RuntimeError('No query expressions given.')
    initialise()
    if _tgrep_debug:
        for expression in query_callback_map.keys():
            debug("Lexing %s", expression)
            lex.input(expression)
            for tok in iter(lex.token, None):
                debug("\t%s %s", tok.type, tok.value)
    # parse every query once up front
    queries = [yacc.parse(expression) for expression in query_callback_map.keys()]
    for node in nodes(deriv):
        for query_expr, query_str in izip(queries, query_callback_map.keys()):
            context = Context()
            if query_expr.is_satisfied_by(node, context):
                if context:
                    # pass captured context bindings as keyword arguments
                    query_callback_map[query_str](node, **smash_key_case(context))
                else:
                    query_callback_map[query_str](node)
# Tally recognised combinators vs. UCP (unlike coordinated phrase) rules across
# every derivation in the files named by the command-line glob.
total, with_unrecognised_rules = 0, 0
ucp_rules = defaultdict(lambda: 0)   # (l, r, p) rule triple (as strings) -> frequency
with_ucp = 0                         # number of derivations containing at least one UCP rule
unary, binary = defaultdict(lambda: 0), defaultdict(lambda: 0)

def is_ucp(l, r, p):
    """True if (l, r, p) looks like a UCP rule: a conjunction-like left category
    under a conj-featured parent that differs from the right category."""
    if r is None:
        return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False
        for node in nodes(bundle.derivation):
            if node.is_leaf():
                continue
            # BUG FIX: guard must be count() > 1, not > 0 -- every internal node has
            # at least one kid, so the old test indexed node[1] even for unary nodes.
            # Sibling code in this file uses count() > 1, and is_ucp's `r is None`
            # check is unreachable otherwise.
            lrp = map(lambda e: e and e.cat,
                      (node[0], node[1] if node.count() > 1 else None, node))
            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))
            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
                    has_ucp = True
def accept_derivation(self, bundle):
    """Accumulate branching-factor statistics: total kid count and internal-node count."""
    for node in nodes(bundle.derivation):
        if node.is_leaf():
            continue
        self.branches += node.count()
        self.internals += 1
def accept_derivation(self, bundle):
    """Count unary-branching nodes across the derivation and bump the sentence count."""
    unary_here = sum(1 for node in nodes(bundle.derivation) if node.count() == 1)
    self.unaries += unary_here
    self.nsents += 1
def preprocess(root):
    """Normalise a PCTB-style derivation tree in place before labelling.

    Applies a long sequence of tree-rewriting fixes for Chinese-treebank
    annotation errors and shape mismatches; each branch cites the derivation
    (section:doc(sent)) that motivated it. Returns the (mutated) root.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag
    for node in nodes(root):
        if node.is_leaf(): continue
        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP)
        # into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count() - 1:
                quoted_kids = node.kids[lqu:rqu + 1]
                del node.kids[lqu:rqu + 1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)
        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node.tag.startswith('VP') and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner
        # (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index - 1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index - 1]
                last_kid.kids.insert(0, maybe_pu)  # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and
        # no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'
        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')
        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN)
        # can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))
        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ'
              and node[1].tag == 'ADVP' and node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)
        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))
        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'
        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag      # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
        # elif False:
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]
        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'
        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3: continue
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)
    return root
def preprocess(root):
    """Normalise a PCTB-style derivation tree in place before labelling.

    Applies a long sequence of tree-rewriting fixes for Chinese-treebank
    annotation errors and shape mismatches; each branch cites the derivation
    (section:doc(sent)) that motivated it. Returns the (mutated) root.
    """
    # IP < PP PU -> PP < PP PU (20:58(1))
    if root.count() == 2 and root[1].tag == 'PU' and root[0].tag.startswith('PP'):
        root.tag = root[0].tag
    for node in nodes(root):
        if node.is_leaf(): continue
        if rewrite_lcp_as_np and node.tag.startswith('LCP'):
            node.tag = node.tag.replace('LCP', 'NP')
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # ---------------------
        # Where LPU, RPU are paired punctuation, reshape YP(LPU ... XP RPU YP)
        # into YP(XP(LPU ... XP) YP)
        if any(kid.lex in ("“", "「") for kid in leaf_kids(node)) and any(kid.lex in ("”", "」") for kid in leaf_kids(node)):
            lqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("“", "「"), node)
            rqu = first_index_such_that(lambda kid: kid.is_leaf() and kid.lex in ("”", "」"), node)
            if rqu != node.count()-1:
                quoted_kids = node.kids[lqu:rqu+1]
                del node.kids[lqu:rqu+1]
                last_nonpunct_kid, _ = get_nonpunct_element(quoted_kids, get_last=True)
                # Bad punctuation in 27:84(4) causes a mis-analysis, just ignore
                if last_nonpunct_kid:
                    quoted_node = Node(last_nonpunct_kid.tag, quoted_kids)
                    node.kids.insert(lqu, quoted_node)
        # CPTB/Chinese-specific fixes
        # ---------------------------
        # PP(P CP NP) in derivations like 5:11(3) should be PP(P NP(CP NP))
        if first_kid and first_kid.tag == "P" and node.count() > 2:
            last_tag = last_kid.tag
            rest = node.kids[1:]
            del node.kids[1:]
            node.kids.append(Node(last_tag, rest, node))
        # 2:12(3). DNP-PRD fixed by adding a layer of NP
        elif (node.tag.startswith('VP') and node.count() == 2 and
              node[0].tag.startswith('VC') and node[1].tag.startswith('DNP-PRD')):
            node[1] = Node('NP', [node[1]], node)
        # fix missing -OBJ tag from VP object complements (c.f. 31:18(4))
        elif (node.tag.startswith('VP') and node.count() >= 2 and
              node.tag.startswith('VP') and node[0].tag == 'VV' and node[-1].tag == 'NP'):
            node[-1].tag += "-OBJ"
        # fix bad annotation IP < IP (2:7(28)), VP < VP (0:1(5))
        elif any(is_repeated_unary_projection(xp, node) for xp in ('IP', 'VP', 'NP', 'CP')):
            node.kids = node[0].kids
        # treat DP-SBJ as QP-SBJ (6:37(9)): the rationale is that the determiner
        # (e.g. 每) acts as a specifier, just like a quantity
        elif node.tag == 'DP-SBJ':
            node.tag = 'QP-SBJ'
        # attach the PU preceding a PRN under the PRN
        elif last_kid and last_kid.tag == 'PRN' and last_kid.count() == 1:
            maybe_pu = node[last_kid_index-1]
            if maybe_pu.tag == 'PU':
                del node.kids[last_kid_index-1]
                last_kid.kids.insert(0, maybe_pu)  # prepend
        # DEG instead of DEC (29:34(3)). if there's a trace in DEG's sibling and
        # no DEC, then change DEG to DEC.
        elif node.tag == 'CP' and node.count() == 2 and node[0].tag == 'IP' and node[1].tag == 'DEG':
            if get_first(node[0], r'^/\*T\*/') and not get_first(node[0], r'/DEC/'):
                node[1].tag = 'DEC'
        elif node.tag.startswith('NP') and any(kid.tag.startswith('QP-APP') for kid in node):
            for kid in node:
                if kid.tag.startswith('QP-APP'):
                    kid.tag = kid.tag.replace('QP', 'NP')
        # NP(CP NP-APP NP-PN) -> NP(CP NP(NP-APP NP-PN)) so that NP(NP-APP NP-PN)
        # can receive NP internal structure-type analysis
        elif (node.tag.startswith('NP') and node.count() == 3 and node[0].tag.startswith('CP')
              and node[1].tag.startswith('NP-APP') and node[2].tag.startswith('NP-PN')):
            np_app, np_pn = node[1], node[2]
            del node.kids[1:]
            node.kids.append(Node(node.tag, [np_app, np_pn], node))
        # IP < NP-SBJ ADVP VP rather than IP < NP-SBJ VP(ADVP VP) (25:59(12), 6:92(19))
        elif (node.tag == 'IP' and node.count() == 3 and node[0].tag == 'NP-SBJ'
              and node[1].tag == 'ADVP' and node[2].tag == 'VP'):
            advp = node.kids.pop(1)
            # VP is the new node[1]
            # now replace node[1] with Node(node[1])
            node[1] = Node(node[1].tag, [advp, node[1]], node)
        # fixing DNP(PN DEG), which causes mis-tagging DNP(PN:l DEG:h)
        # only 3 cases: 23:61(5), 9:14(14), 21:3(11)
        elif node.tag == 'DNP' and node.count() == 2 and node[0].tag == 'PN' and node[1].tag == 'DEG':
            replace_kid(node, node[0], Node('NP', [node[0]]))
        elif is_vnv(node) and node.count() == 3:
            # Re-analyse VNV as coordination
            node[1].tag = 'CC'
        # fix mistaggings of the form ADVP < JJ (1:7(9)), NP < JJ (5:35(1))
        elif node.count() == 1:
            # fix IP < VP by adding *pro*
            if node.tag.startswith('IP') and node[0].tag.startswith('VP'):
                leaf = Leaf('-NONE-', '*pro*', None)
                pro = Node('NP-SBJ', [leaf])
                node.kids.insert(0, pro)
            elif node[0].tag == 'JJ':
                if node.tag.startswith('ADVP'):
                    node.tag = node.tag.replace('ADVP', 'ADJP')
                elif node.tag.startswith('NP'):
                    node.tag = node.tag.replace('NP', 'ADJP')
            # fix NP < VV
            elif node.tag == 'NP' and node[0].tag == 'VV':
                node.tag = node.tag.replace('NP', 'VP')
            # fix NP < ADJP < JJ (5:35(1))
            elif node.tag == 'NP' and node[0].tag == 'ADJP':
                replace_kid(node.parent, node, node[0])
            # fix projections NP < QP
            elif node[0].tag.startswith('QP') and node.tag.startswith('NP'):
                inherit_tag(node[0], node)  # copy PCTB tags from NP to QP
                node.tag = node[0].tag      # copy QP to parent, replacing NP
                node.kids = node[0].kids
            elif node[0].tag == 'IP' and node.tag == 'CP-APP':
                inherit_tag(node[0], node)
                node.tag = node[0].tag
                node.kids = node[0].kids
            # CLP < NN
            elif node[0].tag == 'NN' and node.tag == 'CLP':
                node[0].tag = 'M'
            elif node[0].tag == 'NN' and node.tag.startswith("VP"):
                node[0].tag = 'VV'
            elif node[0].tag == 'CP':
                if node.tag == 'NP-PRD':
                    node.kids = node[0].kids
                else:
                    # Rewrite NP < { CP < { CP < DEC } }
                    # (i.e. 比 报告 的 早 一点) so that it's headed by the 的
                    expr = r'''/CP/ < { /CP/ < /DEC/ }'''
                    if get_first(node[0], expr):
                        node.kids = node[0].kids
            elif node[0].tag in ('NP', 'NP-PN', 'VP', 'IP') and node.tag == 'PRN':
                node.kids = node[0].kids
            # ADVP < CS: shrink so that CS will be considered the head by binarise
            # CP < M: tagging error 7:14(8), 10:51(4), 11:13(32), 11:15(47)
            elif ((node.tag == 'ADVP' and node[0].tag == 'CS') or
                  (node[0].tag == 'M' and node.tag == 'CP')):
                replace_kid(node.parent, node, node[0])
            # fix NP<DNP so that it's headed by the DEC 8:38(18), 0:30(4)
            elif node.tag.startswith('NP') and node[0].tag.startswith('DNP'):
                node.kids = node[0].kids
            # elif node.tag == 'VP' and node[0].tag == 'NP-PRD':
            #     replace_kid(node.parent, node, node[0])
            # couple of noisy derivs like 10:35(80), 10:26(121), 11:37(3)
            # elif node.tag == 'VP' and node[0].tag.startswith('IP'):
            #     replace_kid(node.parent, node, node[0])
        # Reshape LB (long bei)
        # ---------------------
        elif first_kid and first_kid.tag == "LB":
            expr = r'''* < { /LB/=LB [ $ { * < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED }=IP | $ { /CP/=CP < { *=IP < /-(SBJ|OBJ|PN)/a=SBJ < /(V[PV]|VRD|VSB)/=PRED } } ] }'''
            top, ctx = get_first(node, expr, with_context=True)
            lb, sbj, pred, cp, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.cp, ctx.ip
            top.kids = [lb, Node('IP', [sbj, pred])]
            # top.kids = [lb, sbj, pred]
        # elif False:
        elif first_kid and first_kid.tag == "BA":
            expr = r'''* < { /BA/=LB $ { /IP/ < /NP/=SBJ < /VP/=PRED } }'''
            result = get_first(node, expr, with_context=True)
            if result:
                top, ctx = result
                lb, sbj, pred, ip = ctx.lb, ctx.sbj, ctx.pred, ctx.ip
                # top.kids = [lb, Node('IP', [sbj, pred])]
                top.kids = [lb, sbj, pred]
        # single mistagging CP-SBJ for CP in 24:58(1)
        elif node.tag == 'CP-SBJ':
            node.tag = 'CP'
        else:
            # Fix missing phrasal layer in NP < NN DEG (21:10(4))
            result = get_first(node, r'/DNP/=P < { /N[NRT]/=N $ /DEG/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix missing phrasal layer in LCP < NN LC (11:17(9))
            result = get_first(node, r'/LCP/=P < { /N[NRT]/=N $ /LC/ }', with_context=True)
            if result:
                p, ctx = result
                n = ctx.n
                replace_kid(p, n, Node('NP', [n]))
            # Fix wrongly attached DEC (5:26(6))
            result = get_first(node, r'/CP/=TOP < { /IP/=P < { /NP/ $ /VP/ $ /DEC/=DEC } }', with_context=True)
            if result:
                _, ctx = result
                top, p, dec = ctx.top, ctx.p, ctx.dec
                top.kids.append(dec)
                p.kids.remove(dec)
            result = get_first(node, r'*=PP < { /IP-TPC/=P <1 { /NP/=T < ^/\*PRO\*/ } <2 /VP/=S }', nonrecursive=True, with_context=True)
            if result:
                _, ctx = result
                pp, p, s = ctx.pp, ctx.p, ctx.s
                inherit_tag(s, p)
                replace_kid(pp, p, s)
            expr = r'''/VP/=VP <1 /VV/=V <2 { /IP-OBJ/ <1 /NP-SBJ/=SBJ <2 /VP/=PRED }'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                vp, v, sbj, pred = ctx.vp, ctx.v, ctx.sbj, ctx.pred
                del vp.kids
                if get_first(sbj, r'* < ^/\*PRO\*/'):
                    vp.kids = [v, pred]
                else:
                    vp.kids = [v, sbj, pred]
            expr = r'''/QP/=P <1 /CD/ <2 /CC/ <3 /CD/'''
            result = get_first(node, expr, with_context=True)
            if result:
                _, ctx = result
                p = ctx.p
                if p.count() <= 3: continue
                cd_cc_cd, rest = p.kids[0:3], p.kids[3:]
                del p.kids[0:3]
                new_node = Node('QP', cd_cc_cd)
                p.kids.insert(0, new_node)
    return root
# Tally recognised combinators vs. UCP (unlike coordinated phrase) rules.
# NOTE(review): this fragment appears to be a truncated duplicate of the tally
# script above -- it ends without setting has_ucp = True, so with_ucp would be
# incremented once per UCP rule rather than once per derivation. Verify against
# the original script.
with_ucp = 0
unary, binary = defaultdict(lambda: 0), defaultdict(lambda: 0)

def is_ucp(l, r, p):
    # a conjunction-like left category under a conj-featured parent that
    # differs from the right category
    if r is None: return False
    return l in (conj, C('LCM'), C(',')) and p.has_feature('conj') and p != r

for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False
        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue
            # NOTE(review): internal nodes always have count() >= 1, so node[1] is
            # indexed even for unary nodes here; sibling code uses count() > 1 --
            # this looks like it should be > 1 as well.
            lrp = map(lambda e: e and e.cat, (node[0], node[1] if node.count() > 0 else None, node))
            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))
            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
                ucp_rules[rule_tuple] += 1
                if not has_ucp:
                    with_ucp += 1
def Dominates(candidate, node, context):
    """True iff the candidate expression is satisfied by some internal node under `node`."""
    if node.is_leaf():
        return False
    return any(candidate.is_satisfied_by(descendant, context)
               for descendant in nodes(node))
def accept_derivation(self, bundle):
    """Collect every atomic category appearing anywhere in the derivation."""
    derivation = bundle.derivation
    for node in nodes(derivation):
        self.atoms.update(ListAtoms.get_atoms(node.cat))
def label(root):
    """Assign head/complement/adjunct role tags to every kid of every internal
    node in the derivation, after preprocess() and before postprocess().

    Tag letters used: h=head, l/r=left/right complement, a=adjunct, m=modifier,
    c/C=conjunct, n/N=NP-internal, p=PRN, &=ETC, @=argument cluster.
    The elif chain is order-sensitive: earlier constructions subsume later ones.
    """
    root = preprocess(root)
    for node in nodes(root):
        if node.is_leaf(): continue
        at_top = False
        if node.parent is None:
            at_top = True
        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        if first_kid is None: continue
        # first pass: tag kids whose role is determined by their own tag alone
        for kid in node:
            if has_modification_tag(kid):
                tag(kid, 'm')
            elif kid.tag == 'MSP':
                tag(kid, 'a')
            elif kid.tag == 'FLR':
                tag(kid, 'a')
            elif kid.tag == 'ETC':
                tag(kid, '&')
            else:
                tag_if_topicalisation(kid)
        if is_prn(node):
            # PRN tagging error in 10:49(69)
            if not first_kid: continue
            node.tag = first_kid.tag
            tag(node, 'p')
            tag(node[0], 'h')  # assume that the first PU introduces the PRN
        elif node.tag == 'FRAG':
            tag_adjunction(node, last_kid)
        # occasionally something that looks like CCG right absorption occurs
        # in the original annotation (0:23(8))
        elif is_right_absorption(node):
            pass
        elif is_predication(node):
            sbj_assigned = False
            vp_assigned = False
            for kid in reversed(node):
                if not sbj_assigned and (kid.tag.rfind('-SBJ') != -1 or
                        # TODO: we can get IP < NP-PN VP (0:40(5)). is this correct?
                        # exclude NP-PN-LOC (10:62(25))
                        (kid.tag.rfind('-PN') != -1 and kid.tag.rfind('-PN-LOC') == -1) or
                        # NP-APP VP in 11:31(88)
                        (kid.tag.rfind('-APP') != -1) or
                        kid.tag == "NP"):
                    tag(kid, 'l')  # TODO: is subject always left of predicate?
                    sbj_assigned = True
                elif not vp_assigned and kid.tag == 'VP':
                    tag(kid, 'h')
                    vp_assigned = True
                # elif _has_modification_tag(kid) and kid.tag.startswith('IP'):
                #     tag(kid, 'm')
                elif kid.tag not in ('PU', 'CC'):
                    tag(kid, 'a')
            if punct_cued_typechange:
                for i, kid in enumerate(node):
                    try:
                        # exclude IP-SBJ PU VP from having the PU tagged :h (1:53(9))
                        if (kid.tag.startswith('IP-') or kid.tag.startswith('CP-')) and \
                           kid.tag.find('-TPC') == -1 and \
                           kid.tag.find('-SBJ') == -1 and \
                           node[i+1].tag == 'PU':
                            tag(node[i+1], 'h')
                    except: continue
        elif node.count() == 1 and node.tag.startswith('VP') and is_verb_compound(node[0]):
            pass
        elif is_vpt(node):
            # fen de kai, da bu ying. vpt is head-final
            left = True
            for kid in node:
                if kid.tag.startswith("AD") or kid.tag.startswith("DER"):
                    tag(kid, 'h')
                    left = False
                elif left:
                    tag(kid, 'l')
                else:
                    tag(kid, 'r')
        elif is_vsb(node):
            # VSB is modifier+head, and hence is head-final
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a')  # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_vcd(node):
            pass
        elif is_vcp(node):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if kid.tag == "VC":
                    tag(kid, 'a')
        elif is_vrd(node) or is_vsb(node):
            # vrd is head-initial
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a')  # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_coordination(node, at_top=at_top):
            # coordination
            for kid in node:
                if kid.tag == "ETC":
                    tag(kid, '&')
                if kid.tag not in ('CC', 'PU', 'CSC'):
                    tag(kid, 'c')
        elif is_np_internal_structure(node):
            first = True
            for kid in reversed(node.kids):
                # if kid.tag.startswith('PRN'): continue
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    if first:
                        tag(kid, 'N')
                        first = False
                    else:
                        tag(kid, 'n')
                else:
                    pass
        # must be above is_coordination (it subsumes UCP)
        elif is_ucp(node):
            left_conjunct_tag = first_kid.tag
            # NOTE:
            # There are some cases where the UCP annotation is suspect (1:36(11))
            # we will obtain the wrong analysis in these cases because the UCP node
            # does not directly dominate its conjuncts
            old_tag = node.tag
            node.tag = left_conjunct_tag
            node.tag = inherit_tag_str(node.tag, old_tag)
            for kid in nodes(node):
                if kid.tag is None:
                    print kid
                    print kid.tag
                if kid.tag.startswith('UCP'):
                    kid.tag = left_conjunct_tag
            for kid in node:
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    tag(kid, 'C')
        # exclude VP < VV AS: we want to tag this
        elif (node.count() == 1) or is_verb_compound(node):
            pass
        elif ((first_kid.is_leaf()  # head initial complementation
               # quoted verb (see fix in _preprocess_ function)
               # or all((kid.is_leaf() and kid.tag in ('PU', 'VV')) for kid in first_kid)
               or satisfies_all(
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'PU' for kid in fkid),
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'VV' for kid in fkid))(first_kid)
               or is_verb_compound(first_kid)
               # HACK: to fix weird case of unary PP < P causing adjunction analysis
               # instead of head-initial (because of PP IP configuration) in 10:76(4)
               or first_kid.tag == 'PP' and first_kid.count() == 1 and first_kid[0].tag == "P")):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a')  # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        # head final complementation
        # This has to come after head initial complementation, otherwise we get
        # the tagging VP < VV:l AS:h
        elif (last_kid.is_leaf() or is_verb_compound(last_kid) or
              # lcp internal structure (cf 10:2(13)) is possible: despite the
              # structure (LCP (NP) (LCP)) this should be treated as head-final
              # complementation, not adjunction.
              is_lcp_internal_structure(last_kid)) and not any(first_kid.tag.find(ftag) != -1 for ftag in FunctionTags):
            if last_kid.tag.startswith('SP'):
                # Treat final 吗 as the head to get the type-change category
                # S[q]\S[dcl] (25:21(5))
                if has_question_tag(node):
                    tag(last_kid, 'h')
                else:
                    tag(last_kid, 'a')
            else:
                tag(last_kid, 'h')
            # cf 2:23(7),1:9(28), a number of derivations have
            # (CP(WHNP-1 CP(IP) DEC) XP) instead of the expected
            # (CP (WHNP-1) CP(IP DEC) XP)
            # This lets us treat what would otherwise be considered head-final
            # as an adjunction
            if last_kid.tag.startswith('DEC'):
                for kid in node[0:-1]:
                    if kid.tag.startswith('WHNP') or kid.tag.startswith('WHPP'):
                        tag(kid, 'a')
                    elif not (kid.tag.startswith('PU') or kid.tag.startswith('ADVP')):
                        # ADVP as sibling of IP in 11:39(63)
                        tag(kid, 'l')
            else:
                for kid in node[0:-1]:
                    if (last_kid.tag in VerbalCategories and (
                            is_postverbal_adjunct_tag(kid.tag) or
                            # exception added to account for direct modification
                            # of V{V,A} with ADVP (0:47(9))
                            kid.tag.startswith('ADVP'))):
                        tag(kid, 'a')  # treat aspect particles as adjuncts
                    elif not kid.tag.startswith('PU'):
                        tag(kid, 'l')
        # TODO: if this is below coordination, then NP(NP-APP PU NP) is considered
        # coordination instead of apposition (10:70(15))
        # actually, what happens if we get english-style apposition (NP1 , NP2)?
        elif is_apposition(node):
            if False:  # any(kid.tag == 'IP-APP' for kid in node):
                tag(last_kid, 'h')
            else:
                tag(last_kid, 'r')  # HACK: assume apposition is right-headed
            for kid in node:
                if not kid.tag.startswith('PU'):
                    # exclude CP-APP (see is_apposition() above)
                    if kid.tag.endswith('-APP') and not kid.tag.startswith('CP'):
                        tag(kid, 'A')
                    else:
                        tag(kid, 'a')
        elif is_modification(node):
            tag(last_kid, 'h')
            for kid in node[0:-1]:
                if has_modification_tag(kid):
                    tag(kid, 'm')
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'a')
                else:
                    tag_if_topicalisation(kid)
        elif is_argument_cluster(node):
            for kid in node:
                tag(kid, '@')
        else:
            # adjunction
            tag_adjunction(node, last_kid)
    root = postprocess(root)
    return root
def label(root):
    """Annotate every internal node of the derivation *root* with marker tags.

    Walks the (preprocessed) tree and, for each internal node, dispatches on
    its constituent type (PRN, FRAG, predication, verb compounds, coordination,
    NP-internal structure, UCP, head-initial/head-final complementation,
    apposition, modification, argument clusters, or plain adjunction) and
    marks each child via tag() with a single-character role, e.g.:
        h head; l/r left/right complement; a adjunct; m modifier;
        c/C conjunct (coordination/UCP); & ETC marker; n/N NP-internal
        structure; p parenthetical; A appositive; @ argument-cluster member.
    PRN and UCP nodes additionally have their own .tag rewritten in place.
    Returns the tree after postprocess().

    NOTE(review): branch order in the elif chain below is load-bearing in
    several places (see the inline comments); do not reorder casually.
    """
    root = preprocess(root)
    for node in nodes(root):
        if node.is_leaf(): continue

        # Only the derivation root has no parent; coordination detection
        # below behaves differently at the top of the tree.
        at_top = False
        if node.parent is None:
            at_top = True

        first_kid, first_kid_index = get_nonpunct_kid(node, get_last=False)
        last_kid, last_kid_index = get_nonpunct_kid(node, get_last=True)
        # Node with punctuation-only children: nothing to label.
        if first_kid is None: continue

        # First pass over the children: marker tags that apply regardless of
        # the node's constituent type.
        for kid in node:
            if has_modification_tag(kid):
                tag(kid, 'm')
            elif kid.tag == 'MSP':
                tag(kid, 'a')
            elif kid.tag == 'FLR':
                tag(kid, 'a')
            elif kid.tag == 'ETC':
                tag(kid, '&')
            else:
                tag_if_topicalisation(kid)

        if is_prn(node):
            # PRN tagging error in 10:49(69)
            if not first_kid: continue

            # The PRN node inherits the category of its first non-punctuation
            # child and is marked as a parenthetical.
            node.tag = first_kid.tag
            tag(node, 'p')
            tag(node[0], 'h') # assume that the first PU introduces the PRN
        elif node.tag == 'FRAG':
            tag_adjunction(node, last_kid)
        # occasionally something that looks like CCG right absorption occurs
        # in the original annotation (0:23(8))
        elif is_right_absorption(node):
            pass
        elif is_predication(node):
            # Scan right-to-left for the subject and the VP head; anything
            # else (bar PU and CC) becomes an adjunct.
            sbj_assigned = False
            vp_assigned = False
            for kid in reversed(node):
                if not sbj_assigned and (
                        kid.tag.rfind('-SBJ') != -1 or
                        # TODO: we can get IP < NP-PN VP (0:40(5)). is this correct?
                        # exclude NP-PN-LOC (10:62(25))
                        (kid.tag.rfind('-PN') != -1 and kid.tag.rfind('-PN-LOC') == -1) or
                        # NP-APP VP in 11:31(88)
                        (kid.tag.rfind('-APP') != -1) or
                        kid.tag == "NP"):
                    tag(kid, 'l') # TODO: is subject always left of predicate?
                    sbj_assigned = True
                elif not vp_assigned and kid.tag == 'VP':
                    tag(kid, 'h')
                    vp_assigned = True
                # elif _has_modification_tag(kid) and kid.tag.startswith('IP'):
                #     tag(kid, 'm')
                elif kid.tag not in ('PU', 'CC'):
                    tag(kid, 'a')

            if punct_cued_typechange:
                for i, kid in enumerate(node):
                    try:
                        # exclude IP-SBJ PU VP from having the PU tagged :h (1:53(9))
                        if (kid.tag.startswith('IP-') or kid.tag.startswith('CP-')) and \
                           kid.tag.find('-TPC') == -1 and \
                           kid.tag.find('-SBJ') == -1 and \
                           node[i+1].tag == 'PU':
                            tag(node[i + 1], 'h')
                    # NOTE(review): bare except -- presumably guarding the
                    # node[i+1] IndexError on the last child, but it also
                    # hides any other error; consider narrowing to IndexError.
                    except:
                        continue
        # exclude VP < VV AS: we want to tag this
        elif node.count() == 1 and node.tag.startswith(
                'VP') and is_verb_compound(node[0]):
            pass
        elif is_vpt(node):
            # fen de kai, da bu ying. vpt is head-final
            left = True
            for kid in node:
                if kid.tag.startswith("AD") or kid.tag.startswith("DER"):
                    tag(kid, 'h')
                    left = False
                elif left:
                    tag(kid, 'l')
                else:
                    tag(kid, 'r')
        elif is_vsb(node):
            # VSB is modifier+head, and hence is head-final
            # NOTE(review): the comment above says head-final, yet the first
            # kid is tagged h and the rest r -- verify which is intended.
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(
                        kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a') # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_vcd(node):
            pass
        elif is_vcp(node):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if kid.tag == "VC":
                    tag(kid, 'a')
        # NOTE(review): the is_vsb() disjunct here is unreachable -- the
        # earlier elif is_vsb(node) branch already claims those nodes.
        elif is_vrd(node) or is_vsb(node):
            # vrd is head-initial
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(
                        kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a') # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        elif is_coordination(node, at_top=at_top): # coordination
            # NOTE(review): two independent ifs (not if/elif), so an ETC kid
            # is tagged both '&' and 'c' here, unlike the UCP branch below --
            # confirm this is intended.
            for kid in node:
                if kid.tag == "ETC":
                    tag(kid, '&')
                if kid.tag not in ('CC', 'PU', 'CSC'):
                    tag(kid, 'c')
        elif is_np_internal_structure(node):
            # Rightmost non-CC/PU kid is the NP head (N); the rest are n.
            first = True
            for kid in reversed(node.kids):
                # if kid.tag.startswith('PRN'): continue
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    if first:
                        tag(kid, 'N')
                        first = False
                    else:
                        tag(kid, 'n')
                else:
                    pass
        # must be above is_coordination (it subsumes UCP)
        # NOTE(review): this branch is currently *below* is_coordination in
        # the chain, contradicting the comment above -- verify the ordering.
        elif is_ucp(node):
            left_conjunct_tag = first_kid.tag
            # NOTE:
            # There are some cases where the UCP annotation is suspect (1:36(11))
            # we will obtain the wrong analysis in these cases because the UCP node
            # does not directly dominate its conjuncts
            old_tag = node.tag
            node.tag = left_conjunct_tag
            node.tag = inherit_tag_str(node.tag, old_tag)

            # Rewrite any dominated UCP node to the left conjunct's category.
            for kid in nodes(node):
                if kid.tag is None:
                    # NOTE(review): Python 2 debug prints left in place.
                    print kid
                    print kid.tag
                if kid.tag.startswith('UCP'):
                    kid.tag = left_conjunct_tag

            for kid in node:
                if kid.tag == 'ETC':
                    tag(kid, '&')
                elif kid.tag not in ('CC', 'PU'):
                    tag(kid, 'C')
        # exclude VP < VV AS: we want to tag this
        elif (node.count() == 1) or is_verb_compound(node):
            pass
        elif ((first_kid.is_leaf() # head initial complementation
               # quoted verb (see fix in _preprocess_ function)
               # or all((kid.is_leaf() and kid.tag in ('PU', 'VV')) for kid in first_kid)
               or satisfies_all(
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'PU' for kid in fkid),
                   lambda fkid: any(kid.is_leaf() and kid.tag == 'VV' for kid in fkid))(first_kid)
               or is_verb_compound(first_kid)
               # HACK: to fix weird case of unary PP < P causing adjunction analysis instead of head-initial
               # (because of PP IP configuration) in 10:76(4)
               or first_kid.tag == 'PP' and first_kid.count() == 1 and first_kid[0].tag == "P")):
            tag(first_kid, 'h')
            for kid in node[1:]:
                if is_postverbal_adjunct_tag(
                        kid.tag) or kid.tag.startswith('ADVP'):
                    tag(kid, 'a') # treat aspect particles as adjuncts
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'r')
        # head final complementation
        # This has to come after head initial complementation, otherwise we get the tagging VP < VV:l AS:h
        elif (last_kid.is_leaf() or is_verb_compound(last_kid) or
              # lcp internal structure (cf 10:2(13)) is possible: despite the structure (LCP (NP) (LCP))
              # this should be treated as head-final complementation, not adjunction.
              is_lcp_internal_structure(last_kid)) and not any(
                  first_kid.tag.find(ftag) != -1 for ftag in FunctionTags):
            if last_kid.tag.startswith('SP'):
                # Treat final 吗 as the head to get the type-change category S[q]\S[dcl] (25:21(5))
                if has_question_tag(node):
                    tag(last_kid, 'h')
                else:
                    tag(last_kid, 'a')
            else:
                tag(last_kid, 'h')

            # cf 2:23(7),1:9(28), a number of derivations have (CP(WHNP-1 CP(IP) DEC) XP) instead of
            # the expected (CP (WHNP-1) CP(IP DEC) XP)
            # This lets us treat what would otherwise be considered head-final as an
            # adjunction
            if last_kid.tag.startswith('DEC'):
                for kid in node[0:-1]:
                    if kid.tag.startswith('WHNP') or kid.tag.startswith(
                            'WHPP'):
                        tag(kid, 'a')
                    elif not (kid.tag.startswith('PU')
                              or kid.tag.startswith('ADVP')):
                        # ADVP as sibling of IP in 11:39(63)
                        tag(kid, 'l')
            else:
                for kid in node[0:-1]:
                    if (last_kid.tag in VerbalCategories
                            and (is_postverbal_adjunct_tag(kid.tag) or
                                 # exception added to account for direct modification of V{V,A} with ADVP (0:47(9))
                                 kid.tag.startswith('ADVP'))):
                        tag(kid, 'a') # treat aspect particles as adjuncts
                    elif not kid.tag.startswith('PU'):
                        tag(kid, 'l')
        # TODO: if this is below coordination, then NP(NP-APP PU NP) is considered coordination instead of apposition (10:70(15))
        # actually, what happens if we get english-style apposition (NP1 , NP2)?
        elif is_apposition(node):
            # NOTE(review): first branch is permanently disabled (if False);
            # the commented condition suggests IP-APP was once head-final.
            if False: #any(kid.tag == 'IP-APP' for kid in node):
                tag(last_kid, 'h')
            else:
                tag(last_kid, 'r') # HACK: assume apposition is right-headed

            for kid in node:
                if not kid.tag.startswith('PU'):
                    # exclude CP-APP (see is_apposition() above)
                    if kid.tag.endswith(
                            '-APP') and not kid.tag.startswith('CP'):
                        tag(kid, 'A')
                    else:
                        tag(kid, 'a')
        elif is_modification(node):
            tag(last_kid, 'h')
            for kid in node[0:-1]:
                if has_modification_tag(kid):
                    tag(kid, 'm')
                elif not kid.tag.startswith('PU'):
                    tag(kid, 'a')
                else:
                    tag_if_topicalisation(kid)
        elif is_argument_cluster(node):
            for kid in node:
                tag(kid, '@')
        else: # adjunction
            tag_adjunction(node, last_kid)

    root = postprocess(root)
    return root