Example #1
0
    def run(self, filters_to_run, files):
        '''Performs a processing run, given a list of filter names to run, and a list of file specifiers.

        filters_to_run is a sequence of (filter_name, args) pairs. A pair is skipped with an error
        when filter_name is not a key of self.available_filters_dict, and skipped with a warning
        when the number of args differs from what get_argcount_for_method reports for the
        filter's __init__.

        files is a sequence of file specifiers. Any specifier written in the short notation
        corpus:ss,dd,deriv is expanded to a path before everything is handed to self.run_filters.'''
        filters = []

        for filter_name, args in filters_to_run:
            # For a no-args switch, optparse passes in None; we substitute an empty tuple for
            # consistency
            if not args:
                args = ()

            # Only the lookup is guarded: a KeyError raised while inspecting or constructing
            # the filter is a different problem and must not be reported as an unknown filter.
            try:
                filter_class = self.available_filters_dict[filter_name]
            except KeyError:
                err("No filter with name `%s' found.", filter_name)
                continue

            actual = len(args)
            expected = get_argcount_for_method(filter_class.__init__)
            if actual != expected:
                warn("Skipping filter %s; %d arguments given, %d expected.", filter_name, actual, expected)
                continue

            filters.append(filter_class(*args))

        # convert short notation in file specifiers to proper paths
        def expand_short_notation(fn):
            # short notation is
            # corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
            m = re.match(r'([^:]+):(\d+),(\d+),(\d+)', fn)
            if m:
                corpus_dir, sec, doc, deriv = m.groups()
                return os.path.join(corpus_dir, 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv)))
            # Anything else is taken to be a path already and is passed through untouched
            return fn

        files = [expand_short_notation(spec) for spec in files]

        self.run_filters(filters, files)
Example #2
0
def spans(ptb_tree):
    '''Locates the quoted portions of the given PTB derivation.

Returns a list of tuples (B, E, P), one per quoted span, where P is the opening quote symbol
("``" or "`"), B is the number of non-quote leaves preceding the span and E is the number of
non-quote leaves from the end of the span to the end of the derivation. B is None when a close
quote has no pending open quote; E is None when an open quote is never closed.'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")
    # close quote -> the open quote it is allowed to pair with
    opener_for = {"''": "``", "'": "`"}

    kept_leaves = [leaf for leaf in leaves(ptb_tree) if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    # (this count should be equal to the CCG leaf count)
    total = len([leaf for leaf in kept_leaves if not is_ignored(leaf, ignoring_quotes=True)])

    found = []
    pending = []   # stack of (open quote, position at which it was seen)
    position = 0   # number of non-quote leaves consumed so far

    for leaf in kept_leaves:
        if leaf.lex in open_quotes:
            pending.append((leaf.lex, position))
            continue

        # The check for colon is to maintain derivation 21:61(24), which contains
        # an erroneously tagged single close quote.
        is_close_quote = leaf.tag not in ("POS", ":") and leaf.lex in close_quotes
        if not is_close_quote:
            # Only advance the position for a leaf corresponding to a CCGbank leaf
            position += 1
            continue

        implied_opener = opener_for.get(leaf.lex)

        if pending:
            opener, start = pending.pop()
            if opener != implied_opener:
                warn("Unbalanced quotes, abandoning.")
                break

            # The span end is total-position, not that minus one: by the time the close
            # quote is seen, we are already one position past the end of the quoted span.
            found.append((start, total - position, opener))
        else:
            # Nothing is pending, so assume the quoted span starts from the beginning of the string
            if implied_opener is None:
                err("spans: should not reach")
            found.append((None, total - position, implied_opener))

    # Whatever is still pending is assumed to run to the end of the string
    while pending:
        opener, start = pending.pop()
        if opener not in open_quotes:
            warn("Unexpected quote %s after exhausting input.", opener)
            continue
        found.append((start, None, opener))

    return found
Example #3
0
 def process(bundle):
     '''Computes the dependencies of bundle.derivation with get_deps.

     If get_deps raises, the failure is reported through err together with a traceback on
     stderr, and deps falls back to an empty list instead of the exception propagating.'''
     try:
         deps = get_deps(bundle.derivation)
     # Squelch! We need an empty PARG entry even if the process fails, otherwise AUTO and PARG are out of sync
     except Exception:
         err("Processing failed on derivation %s:", bundle.label())
         # Flush before printing the traceback so the two pieces of output stay in order
         sys.stderr.flush()
         traceback.print_exc()
         deps = []
Example #4
0
def all_required_args_present(opts):
    '''Checks opts for every entry of required_args and reports each one that is unset.

An argument counts as missing when the attribute is absent from opts or is None. Every missing
argument is reported through err, naming its switches. Returns True only if none were missing.'''
    all_present = True

    for attr_name, switches in required_args.iteritems():
        if getattr(opts, attr_name, None) is not None:
            continue

        # Keep going after a miss so that the user is told about every missing argument at once
        err("Argument %s is mandatory.", switches)
        all_present = False

    return all_present
Example #5
0
def all_required_args_present(opts):
    '''Checks opts for every entry of required_args and reports each one that is unset.

An argument counts as missing when the attribute is absent from opts or is None. Every missing
argument is reported through err, naming its switches. Returns True only if none were missing.'''
    all_present = True

    for attr_name, switches in required_args.iteritems():
        if getattr(opts, attr_name, None) is not None:
            continue

        # Keep going after a miss so that the user is told about every missing argument at once
        err("Argument %s is mandatory.", switches)
        all_present = False

    return all_present
Example #6
0
def main(argv):
    parser = OptionParser(
        conflict_handler='resolve')  # Intelligently resolve switch collisions
    parser.set_defaults(verbose=False,
                        filters_to_run=[],
                        packages=BuiltInPackages)

    # If any library loading switches (-l) are given, collect their names and remove them from argv
    argv, user_defined_libraries = filter_library_switches(argv)
    argv, autoloaded_libraries = filter_autoload_paths(argv)

    # Load built-in filters (those under BuiltInPackages)
    # Load user-requested filters (passed by -l on the command line)
    all_libraries = BuiltInPackages + user_defined_libraries + autoloaded_libraries
    tracer = TraceCore(libraries=all_libraries)

    # For each available filter, allow it to be invoked with switches on the command line
    for filter in tracer.available_filters_dict.values():
        add_filter_to_optparser(parser, filter)
    # Load built-in optparse switches
    register_builtin_switches(parser)

    if len(argv) <= 1:
        parser.print_help()
        sys.exit(1)

    # Perform option parse, check for user-requested filter classes
    opts, remaining_args = parser.parse_args(argv)
    # Done with parser
    parser.destroy()

    if opts.debug:
        config.set(debug=True)

    # Set verbose switch if given on command line
    tracer.verbose = opts.verbose
    tracer.break_on_exception = opts.break_on_exception

    # Set override Reader if given on command line
    tracer.reader_class_name = opts.reader_class_name

    # Take remaining arguments as input file names
    files = remaining_args[1:]  # remaining_args[0] seems to be sys.argv[0]

    # If switch -L was passed, dump out all available filter names and quit
    if opts.do_list_filters:
        tracer.list_filters()
        sys.exit(0)

    # Run requested filters
    try:
        tracer.run(opts.filters_to_run, files)
    except RuntimeError, e:
        err('RuntimeError: %s', e)
        sys.exit(1)
Example #7
0
 def get_op_func_for(self, operator):
     '''Returns the function implementing the given operator string.

     An exact entry in Operators takes precedence. Otherwise each regex in IntArgOperators is
     matched against the operator, and the first one that matches has its captured groups
     passed to the associated function maker. If nothing matches, an error is reported
     through err.'''
     if operator in Operators:
         return Operators[operator]

     for regex, op_func_maker in IntArgOperators.iteritems():
         matches = re.match(regex, operator)
         if matches:
             return op_func_maker(*matches.groups())

     # Report the operator that was actually looked up, rather than self.operator, which
     # need not be set (or need not be the same value) when this method is called.
     err('Invalid operator %s encountered.', operator)
Example #8
0
    def get_op_func_for(self, operator):
        '''Returns the function implementing the given operator string.

        An exact entry in Operators takes precedence. Otherwise each regex in IntArgOperators is
        matched against the operator, and the first one that matches has its captured groups
        passed to the associated function maker. If nothing matches, an error is reported
        through err.'''
        if operator in Operators:
            return Operators[operator]

        for regex, op_func_maker in IntArgOperators.iteritems():
            matches = re.match(regex, operator)
            if matches:
                return op_func_maker(*matches.groups())

        # Report the operator that was actually looked up, rather than self.operator, which
        # need not be set (or need not be the same value) when this method is called.
        err('Invalid operator %s encountered.', operator)
Example #9
0
def main(argv):
    parser = OptionParser(conflict_handler='resolve') # Intelligently resolve switch collisions
    parser.set_defaults(verbose=False, filters_to_run=[], packages=BuiltInPackages)

    # If any library loading switches (-l) are given, collect their names and remove them from argv
    argv, user_defined_libraries = filter_library_switches(argv)
    argv, autoloaded_libraries   = filter_autoload_paths(argv)
    
    # Load built-in filters (those under BuiltInPackages)
    # Load user-requested filters (passed by -l on the command line)
    all_libraries = BuiltInPackages + user_defined_libraries + autoloaded_libraries
    tracer = TraceCore(libraries=all_libraries)
    
    # For each available filter, allow it to be invoked with switches on the command line
    for filter in tracer.available_filters_dict.values(): add_filter_to_optparser(parser, filter)
    # Load built-in optparse switches
    register_builtin_switches(parser)

    if len(argv) <= 1:
        parser.print_help()
        sys.exit(1)
    
    # Perform option parse, check for user-requested filter classes
    opts, remaining_args = parser.parse_args(argv)
    # Done with parser
    parser.destroy()
    
    if opts.debug:
        config.set(debug=True)
            
    # Set verbose switch if given on command line
    tracer.verbose = opts.verbose
    tracer.break_on_exception = opts.break_on_exception
    
    # Set override Reader if given on command line
    tracer.reader_class_name = opts.reader_class_name
    
    # Take remaining arguments as input file names
    files = remaining_args[1:] # remaining_args[0] seems to be sys.argv[0]
    
    # If switch -L was passed, dump out all available filter names and quit
    if opts.do_list_filters:
        tracer.list_filters()
        sys.exit(0)
        
    # Run requested filters
    try:
        tracer.run(opts.filters_to_run, files)
    except RuntimeError, e:
        err('RuntimeError: %s', e)
        sys.exit(1)
Example #10
0
    def get_filter_by_switch(self, switch_name):
        '''Looks up a filter by its command-line switch and returns the filter's __name__.

        A switch starting with '--' is compared against each filter's long_opt, any other switch
        against its opt, in both cases with the leading dashes removed. When no filter matches,
        an error is reported through err and None is returned.'''
        if switch_name.startswith('--'):
            attr_name, wanted = 'long_opt', switch_name[2:]
        else:
            attr_name, wanted = 'opt', switch_name[1:]

        for filter_class in self.tracer.available_filters_dict.values():
            if getattr(filter_class, attr_name) == wanted:
                return filter_class.__name__

        err("No filter with switch %s found.", switch_name)
        return None
Example #11
0
    def run(self, filters_to_run, files):
        '''Performs a processing run, given a list of filter names to run, and a list of file specifiers.

        filters_to_run is a sequence of (filter_name, args) pairs. A pair is skipped with an error
        when filter_name is not a key of self.available_filters_dict, and skipped with a warning
        when the number of args differs from what get_argcount_for_method reports for the
        filter's __init__.

        files is a sequence of file specifiers. Any specifier written in the short notation
        corpus:ss,dd,deriv is expanded to a path before everything is handed to self.run_filters.'''
        filters = []

        for filter_name, args in filters_to_run:
            # For a no-args switch, optparse passes in None; we substitute an empty tuple for
            # consistency
            if not args:
                args = ()

            # Only the lookup is guarded: a KeyError raised while inspecting or constructing
            # the filter is a different problem and must not be reported as an unknown filter.
            try:
                filter_class = self.available_filters_dict[filter_name]
            except KeyError:
                err("No filter with name `%s' found.", filter_name)
                continue

            actual = len(args)
            expected = get_argcount_for_method(filter_class.__init__)
            if actual != expected:
                warn(
                    "Skipping filter %s; %d arguments given, %d expected.",
                    filter_name, actual, expected)
                continue

            filters.append(filter_class(*args))

        # convert short notation in file specifiers to proper paths
        def expand_short_notation(fn):
            # short notation is
            # corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
            m = re.match(r'([^:]+):(\d+),(\d+),(\d+)', fn)
            if m:
                corpus_dir, sec, doc, deriv = m.groups()
                return os.path.join(
                    corpus_dir,
                    'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv)))
            # Anything else is taken to be a path already and is passed through untouched
            return fn

        files = [expand_short_notation(spec) for spec in files]

        self.run_filters(filters, files)
Example #12
0
def write_dot_format(deriv, fn, format, label=""):
    '''Renders deriv through the dot program, writing the result to the file fn.

    The graph description comes from make_graph(deriv, label=label) and is fed to dot on its
    standard input; format is passed to dot's -T switch. The location of dot is looked up
    once with `which dot` and cached in the module-level dot_path. If dot cannot be found,
    an error is reported through err and nothing is written.

    Raises RuntimeError if dot cannot be started or exits with a non-zero return code.'''
    global dot_path
    if not dot_path:
        dot_path = os.popen('which dot').read().strip()
        if not dot_path:
            err('dot not found on this system. Ensure that dot is in the PATH.')
            return

    graph = make_graph(deriv, label=label)

    # The command is passed as an argument list rather than a shell string, so that output
    # paths containing spaces or shell metacharacters reach dot unchanged.
    cmd = [dot_path, '-T%s' % format, '-o', fn]

    # dot's diagnostics are discarded, as the former `2>/dev/null` did
    devnull = open(os.devnull, 'w')
    try:
        try:
            pipes = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=devnull, close_fds=True)
        except OSError:
            # Keep the exception type callers already handle for a failed dot invocation
            raise RuntimeError('dot could not be started: %s' % dot_path)

        # communicate() writes the graph, drains stdout, closes both pipes and waits for dot
        pipes.communicate(graph)
    finally:
        devnull.close()

    if pipes.returncode != 0:
        raise RuntimeError('dot terminated with non-zero return code: %d' % pipes.returncode)
Example #13
0
def p_full_regex(stk):
    # unlike the other productions, this returns a tuple of arguments to be splatted and
    # passed to a regex-taking node type (RE or REValue)
    '''
    full_regex : REGEX
               | REGEX REGEX_SPEC
    '''
    # NOTE(review): the docstring above is presumably the grammar rule read by the parser
    # generator, so it must stay exactly as written -- confirm before editing it.
    #
    # stk[1] is the regex including its slash delimiters; stk[2], when present, is a string of
    # specifier flags: 'a' turns anchor_at_start off and 'u' turns unicode on. The result,
    # stored in stk[0], is a (regex, kwargs) tuple.

    # Extract the regex between the slash delimiters
    regex = stk[1][1:-1].decode('u8')

    if len(stk) == 2:
        stk[0] = (regex, {'anchor_at_start': True})  # anchor_at_start = True
    elif len(stk) == 3:
        spec = stk[2]
        kwargs = {}
        if 'a' in spec:
            kwargs['anchor_at_start'] = False
        if 'u' in spec:
            kwargs['unicode'] = True

        # Validate flag by flag. A substring test against 'au' would reject valid
        # combinations that merely come in another order, such as 'ua'.
        unknown_flags = [flag for flag in spec if flag not in 'au']
        if unknown_flags:
            err('Invalid regex specifier %s.', spec)

        stk[0] = (regex, kwargs)
Example #14
0
def p_full_regex(stk):
    # unlike the other productions, this returns a tuple of arguments to be splatted and
    # passed to a regex-taking node type (RE or REValue)
    '''
    full_regex : REGEX
               | REGEX REGEX_SPEC
    '''
    # NOTE(review): the docstring above is presumably the grammar rule read by the parser
    # generator, so it must stay exactly as written -- confirm before editing it.
    #
    # stk[1] is the regex including its slash delimiters; stk[2], when present, is a string of
    # specifier flags: 'a' turns anchor_at_start off and 'u' turns unicode on. The result,
    # stored in stk[0], is a (regex, kwargs) tuple.

    # Extract the regex between the slash delimiters
    regex = stk[1][1:-1].decode('u8')

    if len(stk) == 2:
        stk[0] = (regex, {'anchor_at_start': True}) # anchor_at_start = True
    elif len(stk) == 3:
        spec = stk[2]
        kwargs = {}
        if 'a' in spec:
            kwargs['anchor_at_start'] = False
        if 'u' in spec:
            kwargs['unicode'] = True

        # Validate flag by flag. A substring test against 'au' would reject valid
        # combinations that merely come in another order, such as 'ua'.
        unknown_flags = [flag for flag in spec if flag not in 'au']
        if unknown_flags:
            err('Invalid regex specifier %s.', spec)

        stk[0] = (regex, kwargs)
Example #15
0
def spans(ptb_tree):
    '''Locates the quoted portions of the given PTB derivation.

Returns a list of tuples (B, E, P), one per quoted span, where P is the opening quote symbol
("``" or "`"), B is the number of non-quote leaves preceding the span and E is the number of
non-quote leaves from the end of the span to the end of the derivation. B is None when a close
quote has no pending open quote; E is None when an open quote is never closed.'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")
    # close quote -> the open quote it is allowed to pair with
    opener_for = {"''": "``", "'": "`"}

    kept_leaves = [
        leaf for leaf in leaves(ptb_tree)
        if not is_ignored(leaf, ignoring_quotes=False)
    ]
    # TODO: do this without incurring another full pass through the full nodes list
    # (this count should be equal to the CCG leaf count)
    total = len([
        leaf for leaf in kept_leaves
        if not is_ignored(leaf, ignoring_quotes=True)
    ])

    found = []
    pending = []  # stack of (open quote, position at which it was seen)
    position = 0  # number of non-quote leaves consumed so far

    for leaf in kept_leaves:
        if leaf.lex in open_quotes:
            pending.append((leaf.lex, position))
            continue

        # The check for colon is to maintain derivation 21:61(24), which contains
        # an erroneously tagged single close quote.
        is_close_quote = (leaf.tag not in ("POS", ":")
                          and leaf.lex in close_quotes)
        if not is_close_quote:
            # Only advance the position for a leaf corresponding to a CCGbank leaf
            position += 1
            continue

        implied_opener = opener_for.get(leaf.lex)

        if pending:
            opener, start = pending.pop()
            if opener != implied_opener:
                warn("Unbalanced quotes, abandoning.")
                break

            # The span end is total-position, not that minus one: by the time the close
            # quote is seen, we are already one position past the end of the quoted span.
            found.append((start, total - position, opener))
        else:
            # Nothing is pending, so assume the quoted span starts from the beginning of the string
            if implied_opener is None:
                err("spans: should not reach")
            found.append((None, total - position, implied_opener))

    # Whatever is still pending is assumed to run to the end of the string
    while pending:
        opener, start = pending.pop()
        if opener not in open_quotes:
            warn("Unexpected quote %s after exhausting input.", opener)
            continue
        found.append((start, None, opener))

    return found
Example #16
0
def p_error(stk):
    # Reports a syntax error through err, including stk in the message.
    # NOTE(review): going by its name, this is presumably the parser's error hook and stk the
    # offending token -- confirm against the parser setup.
    err("Syntax error encountered: %s", stk)
Example #17
0
def set_config_file(option, opt_string, value, parser, *args, **kwargs):
    '''Sets config.config_file to value, reporting an IOError through err instead of propagating it.

    Only value is used; the remaining parameters are accepted and ignored.
    NOTE(review): the signature looks like an optparse option callback -- confirm at the
    registration site.'''
    try:
        # NOTE(review): a plain attribute assignment cannot raise IOError, so config_file is
        # presumably a property that loads the file when set -- confirm.
        config.config_file = value
    except IOError, e:
        err("Couldn't load config file `%s': %s", value, e)
Example #18
0
                        # the filter fails with IOError: Broken pipe
                        # In that case, running filters on further derivations will continue to
                        # lead to 'Broken pipe', so just bail out
                        if e.errno == errno.EPIPE: return
                            
                    except Exception, e:
                        self.last_exceptions.append( (derivation_bundle, sys.exc_info()) )
                        
                        if self._break_on_exception:
                            raise FilterException(e, None)
                else:
                    if self.last_exceptions:
                        raise FilterException(e, None)
                        
            except FilterException, e:
                for bundle, exception in self.last_exceptions:
                    err("Processing failed on derivation %s of file %s:", bundle.label(), file)
                    sys.excepthook(*exception)
                    
            except IOError, e:
                for bundle, exception in self.last_exceptions:
                    err("Processing failed on derivation %s of file %s:", bundle.label(), file)
                    sys.excepthook(*exception)
                err("Processing failed with IOError: %s", e)
                raise

        for filter in filters:
            filter.output()
            if self.verbose:
                print >>sys.stderr, "---"
Example #19
0
                        # lead to 'Broken pipe', so just bail out
                        if e.errno == errno.EPIPE: return

                    except Exception, e:
                        self.last_exceptions.append(
                            (derivation_bundle, sys.exc_info()))

                        if self._break_on_exception:
                            raise FilterException(e, None)
                else:
                    if self.last_exceptions:
                        raise FilterException(e, None)

            except FilterException, e:
                for bundle, exception in self.last_exceptions:
                    err("Processing failed on derivation %s of file %s:",
                        bundle.label(), file)
                    sys.excepthook(*exception)

            except IOError, e:
                for bundle, exception in self.last_exceptions:
                    err("Processing failed on derivation %s of file %s:",
                        bundle.label(), file)
                    sys.excepthook(*exception)
                err("Processing failed with IOError: %s", e)
                raise

        for filter in filters:
            filter.output()
            if self.verbose:
                print >> sys.stderr, "---"
Example #20
0
def p_error(stk):
    # Reports a syntax error through err, including stk in the message.
    # NOTE(review): going by its name, this is presumably the parser's error hook and stk the
    # offending token -- confirm against the parser setup.
    err("Syntax error encountered: %s", stk)