Esempio n. 1
0
def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher,
            quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    All three inputs are read eagerly into lists. For every sentence, each
    quote span reported by ``spans`` on the PTB derivation is attached to
    the CCGbank derivation through ``quoter.attach_quotes``. One line per
    sentence is then written to each output file.

    Arguments:
        ptb_file: input handed to ``PTBReader``.
        ccg_file: input handed to ``CCGbankReader``.
        deps_file: input handed to ``CCGbankDepsReader``.
        ccg_auto_out: path opened for writing; receives ``str()`` of each
            CCGbank bundle, one per line.
        ccg_parg_out: path opened for writing; receives ``str()`` of each
            (possibly adjusted) dependency entry, one per line.
        higher: passed through unchanged to ``quoter.attach_quotes``.
        quotes: passed through unchanged to ``quoter.attach_quotes``.
        quoter: object providing ``attach_quotes``; it must return a
            ``(new_tree, quote_indices)`` pair.
    '''
    # open() replaces the Python 2-only file() builtin; both outputs are
    # still opened (and truncated) before any input is read, in this order.
    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            penn_trees = list(PTBReader(ptb_file))
            ccg_trees = list(CCGbankReader(ccg_file))
            deps = list(CCGbankDepsReader(deps_file))

            matched_penn_trees = match_trees(penn_trees, ccg_trees)

            # NOTE(review): zip() stops at the shortest of the three
            # sequences, so surplus entries in any of them are silently
            # dropped -- confirm the inputs are always the same length.
            for (ptb_bundle, ccg_bundle, dep) in zip(matched_penn_trees,
                                                     ccg_trees, deps):
                dep = _reinstate_sentence_quotes(
                    ptb_bundle.derivation, ccg_bundle, dep, higher, quotes,
                    quoter)

                # Explicit writes instead of the Python 2-only
                # ``print >> stream`` statement; the emitted text is the same:
                # str(obj) followed by a newline.
                parg_out.write(str(dep) + '\n')
                ccg_out.write(str(ccg_bundle) + '\n')


def _reinstate_sentence_quotes(ptb_tree, ccg_bundle, dep, higher, quotes,
                               quoter):
    '''Attaches every quote span of ptb_tree to ccg_bundle and returns the adjusted deps.

    ``ccg_bundle.derivation`` is updated in place after each attachment.
    '''
    ccg_tree = ccg_bundle.derivation

    quote_spans = spans(ptb_tree)
    # A while-loop rather than a for-loop: the list of pending spans is
    # replaced on every iteration by fix_quote_spans.
    while quote_spans:
        span_start, span_end, quote_type = quote_spans.pop(0)
        if span_start is None and span_end is None:
            continue

        info("Reinstating quotes to %s (%s, %s)",
             ccg_bundle.label(), span_start, span_end)

        ccg_tree, quote_indices = quoter.attach_quotes(
            ccg_tree, span_start, span_end, quote_type, higher, quotes)
        # In case a new root has been installed, re-assign the new root to
        # the CCGbank bundle
        ccg_bundle.derivation = ccg_tree

        # Shift remaining quote span indices by the number of quotes that
        # have been inserted
        quote_spans = fix_quote_spans(quote_spans, quote_indices)
        dep = fix_dependencies(dep, quote_indices)

    return dep
Esempio n. 2
0
# Tallies in which every missing key starts at 0.
# NOTE(review): presumably keyed by rule; the code that updates them is not
# fully visible here -- confirm.
unrecognised_rules = defaultdict(int)
ucp_rules = defaultdict(int)
unary = defaultdict(int)
binary = defaultdict(int)

# Plain integer counters, all starting at zero.
total = 0
with_unrecognised_rules = 0
with_ucp = 0


def is_ucp(l, r, p):
    '''Tests whether the categories (l, r, p) match the UCP pattern checked below.

    The result is False when r is None. Otherwise the result is truthy only
    if l is one of ``conj``, ``C('LCM')`` or ``C(',')``, p reports the
    'conj' feature, and p differs from r.
    '''
    if r is None:
        return False

    left_candidates = (conj, C('LCM'), C(','))
    if l not in left_candidates:
        return False

    return p.has_feature('conj') and p != r


for file in glob(sys.argv[1]):
    for bundle in CCGbankReader(file):
        has_unrecognised_rules, has_ucp = False, False

        for node in nodes(bundle.derivation):
            if node.is_leaf(): continue

            lrp = map(lambda e: e and e.cat,
                      (node[0], node[1] if node.count() > 0 else None, node))

            comb = analyse(*lrp)
            l, r, p = lrp
            rule_tuple = (str(l), str(r), str(p))

            if comb:
                combs[comb] += 1
            elif is_ucp(*lrp):
Esempio n. 3
0
def load_ccgbank_tree(fn, deriv_no):
    '''Returns the derivation of the entry at zero-based position deriv_no in fn.

    Entries are read with ``CCGbankReader`` and reading stops at the first
    match. None is returned when no entry has that position.
    '''
    matching = (bundle.derivation
                for position, bundle in enumerate(CCGbankReader(fn))
                if position == deriv_no)
    return next(matching, None)