Example #1
    def fix_cat_for(self, leaf, slash_index, mode):
        key_category = re.sub(r'[-.*@]', '', str(leaf.cat))
        if key_category not in self.permitted_cats:
            warn("No entry in splitdef file for category %s", leaf.cat)
            return
            
        alternatives = self.permitted_cats[key_category]
        #print "All alternatives: %s" % alternatives
            
        old_modes = self.modes_for_cat(leaf.cat)
        
        def is_invalid_alternative(alt):
            alt_modes = self.modes_for_cat(alt)
            if len(alt_modes) != len(old_modes):
                warn("Replacement category %s has a different size from original category %s", alt, leaf.cat)

            # Compare modes slash by slash, ignoring the slash being fixed
            modes_for_comparison = list(zip(alt_modes, old_modes))
            del modes_for_comparison[slash_index]

            return str(leaf.cat) == str(alt) or \
                   any(ModeTier[alt_mode] < ModeTier[old_mode]
                       for (alt_mode, old_mode) in modes_for_comparison)
                   
        valids = list(reject(alternatives, is_invalid_alternative))
        if not valids:
            warn("No valid alternative for %s which preserves mode `%s' on slash %d", leaf.cat, mode, slash_index)
            return
            
        #print "Alternatives: %s" % valids
        alternative = min(valids, key=lambda e: self.permissiveness(e, slash_index))
        debug("%s `%s' -> %s", leaf.cat, leaf.lex, alternative)
        
        leaf.cat = alternative
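The selection above keeps only the alternatives that never tighten a mode outside the slash being fixed, then picks the least permissive survivor. A minimal sketch of that validity test, with a hypothetical ModeTier table standing in for the real mode machinery:

ModeTier = {'*': 0, 'x': 1, '.': 2}  # hypothetical tiers: higher = more permissive

def is_valid(alt_modes, old_modes, slash_index):
    # Valid iff, ignoring the slash being fixed, no mode becomes stricter.
    pairs = list(zip(alt_modes, old_modes))
    del pairs[slash_index]
    return all(ModeTier[a] >= ModeTier[o] for (a, o) in pairs)

print(is_valid(['x', '.', '*'], ['x', '.', '.'], 2))  # True: only slash 2 changed
print(is_valid(['*', '.', '.'], ['x', '.', '.'], 2))  # False: slash 0 became stricter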
Example #2
    def run(self, filters_to_run, files):
        '''Performs a processing run, given a list of filter names to run, and a list of file specifiers.'''
        filters = []

        for filter_name, args in filters_to_run:
            # For a no-args switch, optparse passes in None; we substitute an empty tuple for
            # consistency
            if not args: args = ()

            try:
                filter_class = self.available_filters_dict[filter_name]
                
                actual, expected = len(args), get_argcount_for_method(filter_class.__init__)
                if actual != expected:
                    warn("Skipping filter %s; %d arguments given, %d expected.", filter_name, actual, expected)
                    continue
                    
                filters.append(filter_class(*args))
            except KeyError:
                err("No filter with name `%s' found.", filter_name)
                
        # convert short notation in file specifiers to proper paths
        def expand_short_notation(fn):
            # short notation is 
            # corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
            m = re.match(r'([^:]+):(\d+),(\d+),(\d+)', fn)
            if m:
                corpus_dir, sec, doc, deriv = m.groups()
                return os.path.join(corpus_dir, 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv)))
            return fn
            
        files = [expand_short_notation(fn) for fn in files]

        self.run_filters(filters, files)
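The short-notation expansion is self-contained enough to try on its own:

import os
import re

def expand_short_notation(fn):
    # corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
    m = re.match(r'([^:]+):(\d+),(\d+),(\d+)', fn)
    if m:
        corpus_dir, sec, doc, deriv = m.groups()
        return os.path.join(corpus_dir, 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv)))
    return fn

print(expand_short_notation('corpus:5,51,6'))   # corpus/chtb_0551.fid:6
print(expand_short_notation('plain/file.fid'))  # unchanged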
Example #3
    def determine_sec_and_doc(self, filename):
        '''Determines the section and document number given a filename of the form "wsj_SSDD.mrg".'''
        matches = self.SecDocRegex.match(os.path.basename(filename))
        if matches and len(matches.groups()) == 2:
            return tuple(int(i) for i in matches.groups())
        else:
            warn("Skipping malformed section/document specifier: `%s'", filename)
            return 0, 0
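Since SecDocRegex itself does not appear in the snippet, here is a self-contained sketch with an assumed pattern for the "wsj_SSDD.mrg" naming convention:

import os
import re

SecDocRegex = re.compile(r'wsj_(\d{2})(\d{2})\.mrg')  # assumed pattern

def determine_sec_and_doc(filename):
    matches = SecDocRegex.match(os.path.basename(filename))
    if matches:
        return tuple(int(i) for i in matches.groups())
    return 0, 0

print(determine_sec_and_doc('/corpora/ptb/07/wsj_0751.mrg'))  # (7, 51)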
Example #4
    def is_satisfied_by(self, node, context):
        try:
            # Determine whether rhs matches the candidate node
            return self.op_func(self.rhs, node, context)
        except KeyError:
            warn("Invalid operator %s encountered.", self.operator)

        return False
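The try/except here is the usual dictionary-dispatch guard: an unknown operator surfaces as a KeyError and is downgraded to a warning plus a False result. A standalone sketch with a hypothetical operator table:

OPS = {
    '=':  lambda rhs, node: node == rhs,
    '!=': lambda rhs, node: node != rhs,
}

def is_satisfied_by(operator, rhs, node):
    try:
        return OPS[operator](rhs, node)
    except KeyError:
        print("Invalid operator %s encountered." % operator)
    return False

print(is_satisfied_by('=', 'NP', 'NP'))   # True
print(is_satisfied_by('~', 'NP', 'NP'))   # warns, then False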
Example #5
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), P in ("``", "`"), where the Bth token from the start, and the Eth token 
from the end of the given PTB derivation span a P-quoted portion of the text.'''
    
    leaf_nodes = [leaf for leaf in leaves(ptb_tree) if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [leaf for leaf in leaf_nodes if not is_ignored(leaf, ignoring_quotes=True)]
    leaf_count = len(leaf_nodes_without_quotes) # should be equal to the CCG leaf count
    
    result = []
    quote_stack = []
    index = 0
    
    for leaf in leaf_nodes:
        # Push open quote
        if leaf.lex in ("``", "`"):
            quote_stack.append( (leaf.lex, index) )
            
        elif (leaf.tag not in ("POS", ":")  # The check for colon is to maintain derivation 21:61(24), which contains
              and leaf.lex in ("''", "'")): # an erroneously tagged single close quote.
              
            # Pop open quote and match with close quote
            if quote_stack:
                open_quote, span_begin = quote_stack.pop()
                if (open_quote == "``" and leaf.lex != "''" or
                    open_quote == "`"  and leaf.lex != "'"):
                    warn("Unbalanced quotes, abandoning.")
                    break
                
                # We treat the span end index as leaf_count-index, not that minus one,
                # because when we encounter the close quote, we are already one index
                # past the end of the quoted span.
                result.append( (span_begin, leaf_count-index, open_quote) )
                
            # Quote stack is empty, assume quoted span starts from beginning of string
            else:
                if leaf.lex == "''":
                    quote_type = "``"
                elif leaf.lex == "'":
                    quote_type = "`"
                else:
                    err("spans: should not reach")
                    
                result.append( (None, leaf_count-index, quote_type) )
        
        # Only advance the index for a leaf corresponding to a CCGbank leaf        
        else:
            index += 1
                
    # While open quotes are still on the stack, assume quoted span continues to end of string
    while quote_stack:
        remaining_quote, span_begin = quote_stack.pop()
        if remaining_quote in ("``", "`"):
            result.append( (span_begin, None, remaining_quote) )
        else:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)
            
    return result
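To make the (B, E, P) convention concrete, here is a cut-down, self-contained version of the same stack-based matching over a flat token list (the real function walks PTB leaves and special-cases POS-tagged apostrophes):

def quote_spans(tokens):
    # B counts tokens from the start, E from the end; quote marks
    # themselves are not counted as tokens.
    words = [t for t in tokens if t not in ("``", "''", "`", "'")]
    result, stack, index = [], [], 0
    for tok in tokens:
        if tok in ("``", "`"):
            stack.append((tok, index))
        elif tok in ("''", "'"):
            if stack:
                open_quote, begin = stack.pop()
                result.append((begin, len(words) - index, open_quote))
            else:
                result.append((None, len(words) - index, "``" if tok == "''" else "`"))
        else:
            index += 1
    while stack:
        open_quote, begin = stack.pop()
        result.append((begin, None, open_quote))
    return result

print(quote_spans(['``', 'a', 'b', "''", 'c']))  # [(0, 1, '``')]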
Example #6
def load_requested_packages(module_names):
    '''Tries to load each module named in _module_names_, returning an array of the loadable module objects found in that module.'''
    loaded_modules = []
    
    for module in module_names:
        try:
            # Suppose we want to import A.B.C. When fromlist is any value but [], it returns A.B.C.
            # Otherwise, it only returns the topmost module, A.
            loaded_modules.append( __import__(module, fromlist=[module]) )
        except ImportError as e:
            warn("Couldn't import module %s (%s)", module, e)

    return loaded_modules
Example #7
    def determine_reader(self, preview):
        '''Applies each of the guessers to the document, returning the corresponding reader class 
if a guesser matches.'''
        for guesser in self.guessers:
            if guesser.identify(preview):
                return guesser.reader_class()
        else:
            warn("determine_reader: No reader could be guessed given context ``%s''; assuming %s",
                preview,
                self.default.reader_class())
            return self.default.reader_class()
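Note the for/else: because the loop body returns rather than breaks, the else clause runs exactly when no guesser matched. A tiny illustration of the construct:

def first_even(numbers, default=-1):
    for n in numbers:
        if n % 2 == 0:
            return n
    else:
        # Reached only when the loop finishes without returning or breaking.
        return default

print(first_even([3, 5, 8]))  # 8
print(first_even([3, 5, 7]))  # -1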
Example #8
    def relabel_relativiser(self, node):
        # Relabel the relativiser category (NP/NP)\S to (NP/NP)\(S|NP)
        
        result = get_first(node, r'*=S $ /(DEC|SP)/=REL', with_context=True, left_to_right=True)

        if result is not None:
            _, context = result
            s, relativiser = context.s, context.rel
            
            relativiser.category = relativiser.category.clone_with(right=s.category)
            debug("New rel category: %s", relativiser.category)

            return True
        else:
            warn("Couldn't find relativiser under %s", node)
            return False
Example #9
def get_available_filters_dict(loaded_modules):
    '''Given a list of module objects, returns a dictionary mapping from filter names to valid 
filter objects found in those modules' namespaces.'''
    
    filters_found = {}
    
    for module in loaded_modules:
        for symbol_name in dir(module):
            obj = getattr(module, symbol_name)
            
            # Only consider concrete classes which are strict subclasses of Filter
            # (isinstance(obj, type) replaces the Python 2-only types.TypeType check)
            if (isinstance(obj, type) and
                issubclass(obj, munge.proc.filter.Filter) and
                not obj.is_abstract() and
                obj is not munge.proc.filter.Filter):
                if symbol_name in filters_found:
                    warn("A filter named %s was already loaded; overwriting it with another filter of the same name.", symbol_name)
                    
                filters_found[symbol_name] = obj
    
    return filters_found
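A cut-down sketch of the same discovery-by-reflection, with a local base class standing in for munge.proc.filter.Filter:

import sys

class Filter(object):
    pass

class TraceFilter(Filter):
    pass

def available_filters(module):
    found = {}
    for name in dir(module):
        obj = getattr(module, name)
        # Keep classes that strictly subclass Filter
        if isinstance(obj, type) and issubclass(obj, Filter) and obj is not Filter:
            found[name] = obj
    return found

print(available_filters(sys.modules[__name__]))  # {'TraceFilter': <class ...>}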
Example #10
def match_trees(penn_trees, ccg_trees):
    '''Given a list of PTB trees and a list of CCGbank trees believed to belong to the same document file, this removes
those PTB trees which do not correspond to any CCGbank tree. We assume that the given CCGbank derivations are
a subsequence of the given PTB derivations.'''
    cur_ptb_index = 0
    result = []

    for ccg_bundle in ccg_trees:
        ccg_tree_matched = False

        while not ccg_tree_matched:
            if cur_ptb_index >= len(penn_trees): break

            ccg_text = ccg_bundle.derivation.text()
            # We want to compare the CCG text against the PTB text stripped of quotes
            ptb_text = penn_trees[cur_ptb_index].derivation.text(
                with_quotes=False)

            if ptb_text != ccg_text:
                warn("In document %s:", ccg_bundle.label())
                warn("\tCCG tokens: %s", ' '.join(ccg_text))
                warn("\tPTB tokens: %s", ' '.join(ptb_text))
            else:
                result.append(penn_trees[cur_ptb_index])
                ccg_tree_matched = True

            cur_ptb_index += 1

    return result
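Stripped of the tree machinery, the alignment is a plain ordered-subsequence match:

def match_subsequence(ptb_texts, ccg_texts):
    # Keep the PTB items that align, in order, with the CCG items.
    result, i = [], 0
    for ccg in ccg_texts:
        while i < len(ptb_texts):
            ptb = ptb_texts[i]
            i += 1
            if ptb == ccg:
                result.append(ptb)
                break
    return result

print(match_subsequence(['a', 'b', 'c', 'd'], ['b', 'd']))  # ['b', 'd']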
Example #11
    def __iter__(self):
        path, index = padded_rsplit(self.path, ':', 1)

        if not os.path.exists(path):
            warn("%s does not exist, so skipping.", path)
            # Returning before the first yield ends the generator immediately,
            # so the missing file really is skipped.
            return

        if self.reader_class:
            reader_arg = { 'reader': self.reader_class }
        else:
            reader_arg = {}
            
        if os.path.isdir(path):
            reader = MultiGuessReader(path, verbose=self.verbose, **reader_arg)
        else:
            if self.reader_class:
                reader = self.reader_class(self.path)
            else:
                reader = GuessReader(self.path)

        for deriv_bundle in reader:
            yield deriv_bundle
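The early return added above works because returning from a generator before the first yield simply produces an empty iteration:

def bundles(path_exists):
    if not path_exists:
        return  # generator ends immediately; nothing is yielded
    yield 'bundle'

print(list(bundles(False)))  # []
print(list(bundles(True)))   # ['bundle']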
Example #12
def main(argv):
    parser = OptionParser()

    register_builtin_switches(parser)                        
    opts, args = parser.parse_args(argv)
    
    if not all_required_args_present(opts):
        parser.print_help()
        sys.exit(1)
    
    quoter_class = {
        'span': SpanQuoter,
        'lca' : LCAQuoter
    }[opts.quote_method]
    punct_class = {
        'swap' : SwapComma,
        'shift': ShiftComma
    }.get(opts.punct_method, None)
    quoter = quoter_class(punct_class)
    
    remaining_args = args[1:]
    if not remaining_args:
        # If no sec/doc specifiers are given, assume 'all sections all documents'
        remaining_args.append(':')
        
    ptb_files_spec = parse_requested_derivs(remaining_args)
    
    for sec_glob, doc_glob in ptb_files_spec:
        for ptb_file in glob(os.path.join(opts.penn_in, sec_glob, "wsj_%s%s.mrg" % (sec_glob, doc_glob))):
            info("Processing %s", ptb_file)
            
            matches = PTBFileRegex.search(ptb_file)
            if matches and len(matches.groups()) == 2:
                sec, doc = matches.groups()
                
                ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec, "wsj_%s%s.auto" % (sec, doc))
                deps_file = os.path.join(opts.ccg_in, 'PARG', sec, "wsj_%s%s.parg" % (sec, doc))
                
                if not opts.quiet:
                    if not os.path.exists(ccg_file):
                        warn("No corresponding CCGbank file %s for Penn file %s", ccg_file, ptb_file)
                    if not os.path.exists(deps_file):
                        warn("No corresponding CCGbank dependency file %s for CCG file %s", deps_file, ccg_file)
                        
                ccg_auto_dir, ccg_parg_dir = [os.path.join(opts.outdir, part, sec) for part in ('AUTO', 'PARG')]
                if not os.path.exists(ccg_auto_dir): os.makedirs(ccg_auto_dir)
                if not os.path.exists(ccg_parg_dir): os.makedirs(ccg_parg_dir)
                
                ccg_auto_out, ccg_parg_out = (os.path.join(ccg_auto_dir, 'wsj_%s%s.auto' % (sec, doc)),
                                              os.path.join(ccg_parg_dir, 'wsj_%s%s.parg' % (sec, doc)))
                                              
                process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, 
                                     opts.higher, opts.quotes, quoter)
                
            else:
                warn("Could not extract section/document numbers from %s, so ignoring it", ptb_file)
Example #13
def _label_node(node, inside_np_internal_structure=False, do_shrink=True):
    # NP<NN shrinking happens unconditionally if use_bare_N is false
    do_np_shrink = (not use_bare_N) or inside_np_internal_structure
    
    if node.is_leaf(): return node
    elif node.count() == 1:
        node.head_index = 0

        # shrinkage rules (NP < NN shrinks to NN)
        if (do_shrink and
            ((do_np_shrink and
                ((node.tag.startswith('NP') 
                    and not has_tag(node, 'A')
                    and has_noun_tag(node[0])) or
                node[0].tag == 'AD')) or
            (node.tag.startswith('VP') or is_verb_compound(node)) and  # a handful of VRDs project a single child (11:29(4))
                (has_verbal_tag(node[0]) or 
                 matches(node[0], 'VPT', 'VSB', 'VRD', 'VCD', 'VNV', 'AD', 'PP', 'QP', 'LCP', 'NP')) or
            (node.tag.startswith('ADJP') and matches(node[0], 'JJ', 'AD', 'NN', 'OD')) # bad tagging 25:40(5), 31:37(6)
            ) or
            (node.tag.startswith('ADVP') and exactly_matches(node[0], 'AD', 'CS', 'NN')) or
            (matches(node, 'NP-MNR', 'NP-PRP') and has_noun_tag(node[0])) or
            # 8:1(5)
            (node.tag.startswith('NP-PN') and 
                node.tag.endswith(':a') and exactly_matches(node[0], 'NR')) or
            (node.tag.startswith('CLP') and exactly_matches(node[0], 'M')) or
            (node.tag.startswith('LCP') and exactly_matches(node[0], 'LC')) or
            # DT < OD found in 6:25(11)
            (node.tag.startswith('DP') and exactly_matches(node[0], 'DT', 'OD')) or
            # QP < AD in 24:68(8)
            (node.tag.startswith('QP') and matches(node[0], 'QP', 'M')) or
            # see head-initial case in tag.py (hack for unary PP < P)
            (node.tag.startswith('PP') and exactly_matches(node[0], 'P')) or
            # see bad tagging (WHNP CP DEC) in tag.py head-final case
            (node.tag.startswith('CP') and matches(node[0], 'IP')) or
            (node.tag.startswith('INTJ') and exactly_matches(node[0], 'IJ')) or
            (node.tag.startswith('LST') and exactly_matches(node[0], 'OD', 'CD')) or
            # the below is to fix a tagging error in 10:49(69)
            (node.tag.startswith('PRN') and exactly_matches(node[0], 'PU')) or
            # 0:15(5) LST < PU
            (node.tag.startswith('LST') and exactly_matches(node[0], 'PU')) or
            # unary DNP < QP in e.g. NP(DNP(QP(sanshi sui)) gongren) (5:51(6)) is meant to
            # suggest implicit 'de' but this causes the spurious QP -> N/N rule
            (node.tag.startswith('DNP') and matches(node[0], 'QP')) or
            # includes any tags of the form NP-X-PRD (see 10:67(32))
            # but excludes VP(VC NP-PRD), which we want to analyse with VC |- (S[dcl]\NP)/NP
            ( node.tag.startswith('NP') and node.tag.find('-PRD') != -1 and has_noun_tag(node[0]) and not node.parent.kids[0].tag.startswith('VC') ) or
            matches(node, 'FLR') or matches(node, 'FW')):
            
            replacement = node[0]
            inherit_tag(replacement, node, strip_marker=True)
            replace_kid(node.parent, node, node[0])
            return label_node(replacement)
            
        # NN for 25:61(7)
        elif (node.tag.startswith("QP") and exactly_matches(node[0], "OD", "CD", 'NN')):
            
            replacement = node[0]
            inherit_tag(replacement, node)
            replace_kid(node.parent, node, node[0])
            #replacement.tag = node.tag
            
            return label_node(replacement)
        
        # promotion rules (NP < PN shrinks to NP (with PN's lexical item and pos tag))
        # shrink NP-TMP < NT so that the NT lexical item gets the adjunct category
        elif ((node.tag.startswith('NP') and (exactly_matches(node[0], "PN") or matches(node[0], 'NT', 'DT'))) or
              # 21:2(6)
              (node.tag.startswith('ADVP') and exactly_matches(node[0], 'CC', 'PN')) or

              (node.tag.startswith('ADJP') and exactly_matches(node[0], 'PN', 'DT')) or
              # 28:82(8)
              (node.tag.startswith('DP') and matches(node[0], 'NN', 'PN')) or
              (matches(node, #'NP-PRD', 'NP-TTL-PRD', 'NP-PN-PRD',
                             'NP-LOC', 'NP-ADV',
                             'NP-PN-TMP', 'NP-PN-LOC', 'NP-TMP', 'NP-DIR', 'NP-PN-DIR')
                  and has_noun_tag(node[0]))):
                  
            replacement = node[0]
            inherit_tag(replacement, node)
            replace_kid(node.parent, node, node[0])
            #replacement.tag = node.tag
            
            return label_node(replacement)
        
        # one child nodes
        else:
            node.kids[0] = label_node(node.kids[0])
            return node
    
    elif is_S_NP_apposition(node):
        # When NP(IP-APP NP), shrinks the NP<NN so we can get a head-final
        # analysis.
        # Without the following check, fails on 5:95(17) where NP(IP-APP NN) 
        # instead of the usual NP(IP-APP NP)
        # However, we don't want to shrink unless node[1] is actually a unary
        # projection (otherwise we'd delete leaves like in 0:89(16))
        if not node[1].is_leaf() and node[1].count() == 1 and node[1][0].is_leaf():
            inherit_tag(node[1][0], node[1])
            node.kids[1] = node[1][0]
            
        return label_head_final(node)
    
    elif is_predication(node):
        return label_predication(node)
    elif is_prn(node):
        # although we want a head-initial analysis, we want a right-branching structure
        return label_adjunction(node, inside_np_internal_structure=True)
    elif is_apposition(node):
        return label_apposition(node, inside_np_internal_structure=True)
    elif is_np_structure(node):# and not node[0].tag.startswith('IP-APP'):
        return label_adjunction(node, inside_np_internal_structure=True) # TODO: misnomer
    elif is_np_internal_structure(node):
        return label_np_internal_structure(node)
    # 0:68(4) has both cases. If there are NP modifiers of a QP or an ADJP, we want them shrunk.
    elif node.kids[-1].tag in ('QP:h', 'ADJP:h'):
        return label_adjunction(node, inside_np_internal_structure=True)
    elif node.tag.startswith('VRD'):
        return label_head_initial(node)
    elif (is_adjunction(node)
       or is_verb_compound(node)
       or is_modification(node)):
        return label_adjunction(node)
    elif is_head_final(node):
        return label_head_final(node)
    elif is_head_initial(node):
        return label_head_initial(node)
    elif is_coordination(node) or is_ucp(node):
        return label_coordination(node, inside_np_internal_structure=True)
    else:
        warn("binarise: No known configuration for %s", node)
        return label_adjunction(node)
Example #14
def t_error(t):
    warn("Illegal character `%s' encountered.", t.value[0])
    t.lexer.skip(1)
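t_error is a PLY lexer error hook; a minimal lexer showing where it fits (requires the ply package):

import ply.lex as lex

tokens = ('NUMBER',)
t_NUMBER = r'\d+'
t_ignore = ' \t'

def t_error(t):
    print("Illegal character `%s' encountered." % t.value[0])
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input('12 $ 34')
for tok in lexer:
    print("%s %s" % (tok.type, tok.value))  # NUMBER 12, then NUMBER 34 (the $ is skipped)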
Example #15
def label(node, inside_np=False):
    '''
    Labels the descendants of _node_ and returns _node_.
    '''
    global BareN

    if node.category is None:
        node.category = ptb_to_cat(node)

        # if this matches the IP root with a *PRO* trace under it, then
        # we shouldn't map IP -> S, but rather IP -> S\NP
        if node.tag.startswith('NT'):  # map NT -> NP, not N
            node.category = NP
        elif has_noun_tag(node):
            node.category = BareN
        else:
            node.category = ptb_to_cat(node)

    if node.is_leaf():
        if not node.category:
            node.category = ptb_to_cat(node)
        return node

    # NP/NP (CP) -> NP
    elif is_cp_to_np_nominalisation(node):
        node[0].category = NPfNP
        node.kids[0] = label(node[0])

        return node

    # VSB is analysed as head-final
    elif node.tag.startswith('VSB'):
        node[1].category = node.category
        node.kids[1] = label(node[1])

        node[0].category = featureless(node.category) / featureless(
            node[1].category)
        node.kids[0] = label(node.kids[0])

        return node

    # VCD is treated like apposition
    elif node.tag.startswith('VCD'):
        if has_verbal_tag(node[0]):
            node[0].category = node.category
        else:
            node[0].category = ptb_to_cat(node[0])

        node.kids[0] = label(node[0])

        if has_verbal_tag(node[1]):
            node[1].category = node.category
        else:
            node[1].category = ptb_to_cat(node[1])

        node.kids[1] = label(node[1])

        return node

    elif node.tag.startswith('VRD'):
        return label_right_adjunction(node)

    # must be above is_apposition, because there exist NP-APP:a ETC:& cases
    elif is_etc(node):
        return label_etc_head_final(node)

    elif is_S_NP_apposition(node):
        return rename_category_while_labelling_with(
            label_head_final, node,
            BareN if node.category == NP else node.category)

    elif (node.count() == 1 or is_topicalisation(node)
          or is_topicalisation_without_gap(node) or is_apposition(node)
          or is_argument_cluster(node) or is_modification(node)):

        node.kids[0] = label(node[0])
        if node.count() > 1:
            node.kids[1] = label(node[1])

        return node

    elif is_partial_ucp(node):
        return label_partial_coordination(node, ucp=True)
    elif is_ucp(node):
        return label_coordination(node, ucp=True)

    elif is_predication(node):
        return label_predication(node)

    elif is_left_absorption(node):
        return label_left_absorption(node)
    elif is_right_absorption(node):
        return label_right_absorption(node)

    elif is_right_adjunction(node):  # (:h :a), for aspect particles
        return label_right_adjunction(node)

    elif is_partial_coordination(node):
        return label_partial_coordination(node)
    elif is_coordination(node):
        return label_coordination(node)

    elif is_np_structure(node):  # and not node[0].tag.startswith('IP-APP'):
        return rename_category_while_labelling_with(
            label_np_structure,
            node,
            BareN,
            when=lambda category: category == NP)

    elif is_np_internal_structure(node):
        return label_np_internal_structure(node)

    elif is_punctuation_headed(node):
        return label_head_final(node)

    elif is_adjunction(node):
        return label_adjunction(node)

    elif is_head_final(node):
        return label_head_final(node)
    elif is_head_initial(node):
        return label_head_initial(node)

    else:
        warn(
            "Node did not match any known patterns -- assuming adjunction: %s",
            node.__repr__(suppress_lex=True))
        return label_adjunction(node)
Example #16
def mkdeps(root, postprocessor=identity):
    for i, leaf in enumerate(leaves(root)):
        # Uniquify each leaf with an index
        leaf.lex += IndexSeparatorTemplate % i
        # Apply the left to right slash labelling 
        # (we abuse this to refer to slots, not slashes)
        leaf.cat.parg_labelled()
        # Populate the outermost (_) variable of each leaf
        leaf.cat.slot.head.lex = leaf.lex

    for (l, r, p) in pairs_postorder(root):
        _label_result(l, r, p)
            
    global unanalysed
    
    unaries = []

    for l, r, p in pairs_postorder(root):
        L, R, P = map(lambda x: x and x.cat, (l, r, p))
        comb = analyse(L, R, P)
        if not comb: debug("Unrecognised rule %s %s -> %s", L, R, P)
        
        unifier = []
        
        if config.debug:
            debug("%s %s %s (%s)", L, R, P, str(comb))

        if comb == 'fwd_appl': # [Xx/Yy]l Yy -> Xx
            unifier = unify(L.right, R)
            p.cat = L.left

        elif comb == 'bwd_appl': # Yy [Xx\Yy]r -> Xx
            unifier = unify(L, R.right)
            p.cat = R.left
                
        # Pro-drops which drop their outer argument
        # [(S_\NPy)_/NPx]_ -> [S_\NPy]_
        elif comb in ('object_prodrop', 'vp_vp_object_prodrop', 
            'yi_subject_prodrop', 'vp_modifier_subject_prodrop'):
            p.cat = L.left

        # [Xx/Yy]l [Yy/Zz]r -> [Xx/Zz]r
        elif comb == 'fwd_comp': # X/Y Y/Z -> X/Z
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)

            P.slot.var = fresh_var(prefix='K')

            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right
            
        # [Yy\Zz]l [Xx\Yy]r -> [Xx\Zz]l
        elif comb == 'bwd_comp': # Y\Z X\Y -> X\Z
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)

            P.slot.var = fresh_var(prefix='K')
            
            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right
            
        elif comb in ('s_np_apposition', 'vp_np_apposition'): # { S[dcl], S[dcl]\NP } NPy -> NPy
            P.slot = R.slot # = copy_vars
            unifier = unify(P, R)
            
        # NP NP -> N/N
        elif comb == 'np_np_to_nfn_apposition':
            # do the same as NP NP -> NP, except fill in the vars Ny/Ny
            P.right.slot.var = fresh_var(prefix='N')
            P.left.slot = P.right.slot

            register_unary(unaries, p, L.slot.head.lex)
            make_set_head_from(l, r, p)

        elif comb in ('conjoin', 'np_np_apposition'): # X X[conj] -> X
            make_set_head_from(l, r, p)

        elif comb in ('conj_absorb', 'conj_comma_absorb'): # conj X -> X[conj]
            copy_vars(frm=R, to=P)
            unify(P, R) # R.slot.head = P.slot.head
            
        elif comb == 'funny_conj': # conj X -> X
            p.cat = R
        
        elif comb == 'nongap_topicalisation': # {N, NP, S[dcl], QP}x -> [Sy/Sy]x
            P.slot = L.slot
            P.right.slot.var = fresh_var()
            P.left.slot = P.right.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        elif comb in ('np_gap_topicalisation', 's_gap_topicalisation', 'qp_gap_topicalisation'): # NPx -> [ Sy/(Sy/NPx)y ]y
            P.right.right.slot = L.slot
            P.slot.var = fresh_var()
            P.left.slot = P.right.left.slot = P.right.slot = P.slot
            
        elif comb == 'subject_prodrop': # (S[dcl]y\NPx)y -> S[dcl]y | [(S[dcl]y\NPx)y/NPz]y -> (S[dcl]y/NPz)y
            if P == parse_category(r'S[dcl]'):
                P.slot = L.slot
            elif P == parse_category(r'S[dcl]/NP'):
                P.slot = P.left.slot = L.slot
                P.right.slot = L.right.slot
            else:
                warn("Invalid parent category %s for subject prodrop.", P)
            
        elif comb == 'fwd_xcomp': # [Xx/Yy]l [Yy\Zz]r -> [Xx/Zz]r
            if is_rooted_in(Sdcl, L, respecting_features=True):
                P.slot = L.slot
            else:
                P.slot = R.slot # lexical head comes from R (Y/Z)
                
            P.slot.var = fresh_var(prefix='K')
            
            unifier = unify(L.right, R.left)
            p.cat._left = L.left
            p.cat._right = R.right

        elif comb == 'bwd_xcomp': # [Yy/Zz]l [Xx\Yy]r -> [Xx/Zz]l
            if is_rooted_in(Sdcl, R, respecting_features=True):
                P.slot = R.slot
            else:
                P.slot = L.slot # lexical head comes from L (Y\Z)
        
            # P.slot = L.slot
            P.slot.var = fresh_var(prefix='K')
            
            unifier = unify(R.right, L.left)
            p.cat._left = R.left
            p.cat._right = L.right
            
        elif comb == 'bwd_r1xcomp': # [(Yy/Zz)k/Ww]l [Xx\Yy]r -> [(Xx\Zz)k/Ww]l
            # TODO: where should P's lexical head come from? L or R?
            
            unifier = unify(L.left.left, R.right)
            p.cat._left._left = R.left
            p.cat._left._right = L.left.right
            p.cat._right = L.right

        elif comb in ('fwd_raise', 'bwd_raise'): # Xx -> [ Tf|(Tf|Xx)f ]f
            if P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/(S[dcl]\NP))'):
                # (S[dcl]y\NPz)y -> [ (S[dcl]f\NPg)f/((S[dcl]f\NPg)f\(S[dcl]y\NPz)y)f ]f
                P.left.slot.var = P.left.left.slot.var = P.right.slot.var = P.slot.var = fresh_var() # f 
                P.left.right.slot.var = fresh_var() # g
                
                copy_vars(frm=P.left, to=P.right.left)
                copy_vars(frm=L,      to=P.right.right)
                
                unifier = unify(L, P.right.right)
            elif P == parse_category(r'((S[dcl]\NP)/QP)\(((S[dcl]\NP)/QP)/NP)'):
                # NPy -> [ ((S[dcl]v\NPw)v/QPz)v \ ( ((S[dcl]v\NPw)v/QPz)v/NPy )v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.right.slot = \
                    P.left. left.slot = P.left. left.left.slot = \
                    P.right.left.slot = P.right.left.left.slot = \
                    P.right.left.left.left.slot = P.slot # v
#                P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z
                P.left.left.right.slot.var = fresh_var('W')
                P.right.left.left.right.slot = P.left.left.right.slot # w
                
                unifier = unify(L, P.right.right)
            elif P == parse_category(r'(S[dcl]\NP)\((S[dcl]\NP)/QP)'):
                # QPy -> [ (S[dcl]v\NPz)v \ ((S[dcl]v\NPz)v/QPy)v ]v
                P.slot.var = fresh_var()
                P.left.slot = P.left.left.slot = \
                    P.right.slot = P.right.left.slot = P.right.left.left.slot = P.slot # v
#                P.right.right.slot = fresh_var() # y
                P.right.right.slot = L.slot
                P.left.right.slot.var = fresh_var('Z')
                P.right.left.right.slot = P.left.right.slot # z
                
                unifier = unify(L, P.right.right)
            else:
                P.slot.var = fresh_var()

                P.right.left.slot = P.left.slot = P.right.slot = P.slot
                P.right.right.slot = L.slot

                unifier = unify(L, P.right.right)

        elif comb == 'np_typechange':
            P.slot = L.slot # = copy_vars
            unifier = unify(P, L)
            
        elif comb == 'lcp_np_typechange':
            P.slot = L.slot
            unifier = unify(P, L)
            
        elif comb in ('lcp_sfs_typechange', 'lcp_nfn_typechange'):
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            
            P.slot = L.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        elif comb == 'lcp_sbnpfsbnp_typechange':
            # [(Sy\NPz)y/(Sy\NPz)y]_
            P.left.slot.var = fresh_var()
            P.left.left.slot = P.right.left.slot = P.right.slot = P.left.slot
            
            register_unary(unaries, p, L.slot.head.lex)
        
        elif comb == 'null_relativiser_typechange': # Xy -> (Nf/Nf)y
            P.slot = L.slot
            
            if P == _NfN:
                P.left.slot.var = fresh_var()
                
                P.right.slot = P.left.slot
                
                register_unary(unaries, p, L.slot.head.lex)
                
            elif P == _NfNfNfN:
                P.left.slot.var = fresh_var()
                P.left.left.slot.var = fresh_var(prefix="G")
                
                P.left.right.slot = P.left.left.slot
                P.right.slot = P.left.slot
                
                register_unary(unaries, p, L.slot.head.lex)
            else:
                warn("Unhandled null relativiser typechange: %s -> %s", L, P)
            
        # [NP/NP]y -> NPy
        elif comb == 'de_nominalisation':
            P.slot = L.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        # {M, QP}y -> (Nf/Nf)y
        elif comb == 'measure_word_number_elision':
            P.slot = L.slot
            
            P.left.slot.var = fresh_var()
            P.right.slot = P.left.slot
            
            register_unary(unaries, p, L.slot.head.lex)
            
        elif comb == 'l_punct_absorb': # , X -> X[conj]
            # need to put conj feature back on parent
            p.cat = R.clone_adding_feature('conj')
            
        elif comb == 'r_punct_absorb':
            p.cat = L

        elif R and L == R and is_rooted_in(parse_category('S'), L): # VCD (stopgap)
            make_set_head_from(l, r, p)

        else:
            debug('Unhandled combinator %s (%s %s -> %s)', comb, L, R, P)
            unanalysed.add(comb)
            
            P.slot = R.slot if R else L.slot
            
        for (dest, src) in unifier:
            if isinstance(src, (basestring, list)):
                # Fake bidirectional unification:
                # -------------------------------
                # If variable X has been unified with value v,
                # rewrite all mentions of v in the output category to point to variable X
                # (v is uniquified by concatenating it with an ID, so this should hold)            
                for subcat in p.cat.nested_compound_categories():
                    if subcat.slot.head.lex == src:
                        subcat.slot = dest.slot
            
        if config.debug:
            debug("> %s" % p.cat)
            debug('---')
            
            if config.fail_on_unassigned_variables:
                assert no_unassigned_variables(p.cat), "Unassigned variables in %s" % p.cat
                
    if config.debug:
        debug('unaries: %s', unaries)
        
    # Collect deps from arguments
    deps = []
    for l in chain( leaves(root), unaries ):
        if config.debug: debug("%s %s", l, l.cat)
        
        C = l.cat
        while not C.is_leaf():
            arg = C.right
            if arg.slot.head.filler:
                #and not l.cat.left.slot == l.cat.right.slot):
        #        print "%s %s %s %s %s %s" % (C.slot.head.lex, C, arg.slot.head.lex, arg, l.cat, C.label)
                if C.label is None:
                    warn("Dependency generated on slash without label: %s %s", C, arg)
                deps.append( (C.slot.head.lex, arg.slot.head.lex, l.cat, C.label) )
            if is_modifier(C): break
            C = C.left

    # Produce dep pairs
    result = set()
    for depl, depr, head_cat, head_label in deps:
        for sdepl in set(seqify(depl)):
            for sdepr in set(seqify(depr)):
                if not (sdepl and sdepr):
                    debug("Dependency with None: %s %s", sdepl, sdepr)
                    continue
                    
                result.add( (postprocessor(sdepl), postprocessor(sdepr), head_cat, head_label) )
                
    if config.debug:
        for line in write_deps(result):
            debug(line)
    return result