def run(self, filters_to_run, files):
    '''Instantiates the requested filters and runs them over the given files.

    filters_to_run is iterated as (filter name, constructor arguments) pairs, and
    files is a list of file specifiers. A filter given the wrong number of arguments
    is skipped with a warning, and an unknown filter name is reported as an error;
    the filters which remain are passed to self.run_filters together with the
    expanded file specifiers.'''

    def expand_short_notation(spec):
        # short notation is
        #   corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
        # A specifier which does not start with that shape is passed through untouched.
        matched = re.match(r'([^:]+):(\d+),(\d+),(\d+)', spec)
        if not matched:
            return spec
        corpus_dir, sec, doc, deriv = matched.groups()
        doc_name = 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv))
        return os.path.join(corpus_dir, doc_name)

    instances = []
    for filter_name, args in filters_to_run:
        # For a no-args switch, optparse passes in None; we substitute an empty tuple for
        # consistency
        args = args or ()

        # NOTE(review): the constructor call is inside the try, so a KeyError raised by a
        # filter's own __init__ is also reported as an unknown filter name.
        try:
            filter_class = self.available_filters_dict[filter_name]
            given = len(args)
            wanted = get_argcount_for_method(filter_class.__init__)
            if given == wanted:
                instances.append(filter_class(*args))
            else:
                warn("Skipping filter %s; %d arguments given, %d expected.",
                     filter_name, given, wanted)
        except KeyError:
            err("No filter with name `%s' found.", filter_name)

    # convert short notation in file specifiers to proper paths
    expanded_files = [expand_short_notation(spec) for spec in files]

    self.run_filters(instances, expanded_files)
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), P in ("``", "`"), where the Bth token from
    the start, and the Eth token from the end of the given PTB derivation span a P-quoted
    portion of the text.

    B is None when a close quote is seen with no open quote pending (the span is taken to
    start at the beginning of the text), and E is None when an open quote is never closed
    (the span is taken to run to the end of the text).'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")
    # The open quote which each close quote is allowed to terminate
    opener_for = {"''": "``", "'": "`"}

    leaf_nodes = [leaf for leaf in leaves(ptb_tree)
                  if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [leaf for leaf in leaf_nodes
                                 if not is_ignored(leaf, ignoring_quotes=True)]
    # should be equal to the CCG leaf count
    leaf_count = len(leaf_nodes_without_quotes)

    result = []
    pending = []  # stack of (open quote, index at which it was encountered)
    index = 0

    for leaf in leaf_nodes:
        lex = leaf.lex

        # Push open quote
        if lex in open_quotes:
            pending.append((lex, index))
            continue

        # The check for colon is to maintain derivation 21:61(24), which contains
        # an erroneously tagged single close quote.
        is_close_quote = leaf.tag not in ("POS", ":") and lex in close_quotes
        if not is_close_quote:
            # Only advance the index for a leaf corresponding to a CCGbank leaf
            index += 1
            continue

        # We treat the span end index as leaf_count-index, not that minus one,
        # because when we encounter the close quote, we are already one index
        # past the end of the quoted span.
        span_end = leaf_count - index

        if pending:
            # Pop open quote and match with close quote
            open_quote, span_begin = pending.pop()
            if opener_for[lex] != open_quote:
                warn("Unbalanced quotes, abandoning.")
                break
            result.append((span_begin, span_end, open_quote))
        else:
            # Quote stack is empty, assume quoted span starts from beginning of string
            if lex == "''":
                quote_type = "``"
            elif lex == "'":
                quote_type = "`"
            else:
                err("spans: should not reach")
            result.append((None, span_end, quote_type))

    # While open quotes are still on the stack, assume quoted span continues to end of string
    while pending:
        remaining_quote, span_begin = pending.pop()
        if remaining_quote not in open_quotes:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)
            continue
        result.append((span_begin, None, remaining_quote))

    return result
def process(bundle):
    '''Computes the dependencies of the derivation held by the given bundle, falling back
    to an empty list of dependencies if the computation raises.'''
    # NOTE(review): deps is not used within the visible part of this function; the rest of
    # the body may lie outside this excerpt -- confirm before relying on this description.
    try:
        deps = get_deps(bundle.derivation)
    # Squelch! We need an empty PARG entry even if the process fails, otherwise AUTO and PARG are out of sync
    except Exception, e:
        # Name the failing derivation, then flush stderr before the traceback is printed
        err("Processing failed on derivation %s:", bundle.label())
        sys.stderr.flush()
        traceback.print_exc()
        # Stand in an empty dependency list for the failed derivation
        deps = []
def all_required_args_present(opts):
    '''Checks that every attribute named in required_args is set (not None) on opts.

    An error naming the corresponding switches is issued for each argument which is
    missing. Returns True only if none was missing.'''
    missing_switches = [arg_switches
                        for (required_arg, arg_switches) in required_args.iteritems()
                        if getattr(opts, required_arg, None) is None]

    for arg_switches in missing_switches:
        err("Argument %s is mandatory.", arg_switches)

    return not missing_switches
def main(argv):
    '''Command-line entry point.

    Builds the option parser, loads the built-in, user-requested and autoloaded filter
    libraries, then either lists the available filters or runs the requested filters
    over the files named on the command line. Exits with status 1 if no arguments were
    given or if the run raises RuntimeError.'''
    # Intelligently resolve switch collisions
    parser = OptionParser(conflict_handler='resolve')
    parser.set_defaults(verbose=False,
                        filters_to_run=[],
                        packages=BuiltInPackages)

    # If any library loading switches (-l) are given, collect their names and remove them from argv
    argv, user_defined_libraries = filter_library_switches(argv)
    argv, autoloaded_libraries = filter_autoload_paths(argv)

    # Built-in filters (those under BuiltInPackages) come first, followed by the
    # user-requested filters (passed by -l on the command line) and the autoloaded ones
    tracer = TraceCore(
        libraries=BuiltInPackages + user_defined_libraries + autoloaded_libraries)

    # Each available filter may be invoked with switches on the command line
    for filter_class in tracer.available_filters_dict.values():
        add_filter_to_optparser(parser, filter_class)

    # Load built-in optparse switches
    register_builtin_switches(parser)

    if len(argv) < 2:
        parser.print_help()
        sys.exit(1)

    # Perform option parse, check for user-requested filter classes
    opts, remaining_args = parser.parse_args(argv)
    # Done with parser
    parser.destroy()

    if opts.debug:
        config.set(debug=True)

    # Carry the command-line settings (verbosity, break-on-exception, override Reader)
    # over to the tracer
    tracer.verbose = opts.verbose
    tracer.break_on_exception = opts.break_on_exception
    tracer.reader_class_name = opts.reader_class_name

    # Take remaining arguments as input file names
    # (remaining_args[0] seems to be sys.argv[0])
    files = remaining_args[1:]

    # If switch -L was passed, dump out all available filter names and quit
    if opts.do_list_filters:
        tracer.list_filters()
        sys.exit(0)

    # Run requested filters
    try:
        tracer.run(opts.filters_to_run, files)
    except RuntimeError as run_error:
        err('RuntimeError: %s', run_error)
        sys.exit(1)
def get_op_func_for(self, operator):
    '''Returns the function which implements the given operator.

    The operator is first looked up directly in Operators. Failing that, it is matched
    against each pattern in IntArgOperators in turn; for the first pattern which matches,
    the associated factory is called with the captured groups and its result is
    returned. If nothing matches, an error naming the operator is issued.'''
    if operator in Operators:
        return Operators[operator]

    for regex, op_func_maker in IntArgOperators.iteritems():
        matches = re.match(regex, operator)
        if matches:
            return op_func_maker(*matches.groups())

    # Report the operator which was actually looked up: self.operator may not be set yet,
    # or may name a different operator from the one passed in.
    err('Invalid operator %s encountered.', operator)
def main(argv):
    '''Command-line entry point.

    Builds the option parser, loads the built-in, user-requested and autoloaded filter
    libraries, then either lists the available filters or runs the requested filters
    over the files named on the command line. Exits with status 1 if no arguments were
    given or if the run raises RuntimeError.'''
    # Intelligently resolve switch collisions
    parser = OptionParser(conflict_handler='resolve')
    parser.set_defaults(verbose=False,
                        filters_to_run=[],
                        packages=BuiltInPackages)

    # If any library loading switches (-l) are given, collect their names and remove them from argv
    argv, user_defined_libraries = filter_library_switches(argv)
    argv, autoloaded_libraries = filter_autoload_paths(argv)

    # Built-in filters (those under BuiltInPackages) come first, followed by the
    # user-requested filters (passed by -l on the command line) and the autoloaded ones
    tracer = TraceCore(
        libraries=BuiltInPackages + user_defined_libraries + autoloaded_libraries)

    # Each available filter may be invoked with switches on the command line
    for filter_class in tracer.available_filters_dict.values():
        add_filter_to_optparser(parser, filter_class)

    # Load built-in optparse switches
    register_builtin_switches(parser)

    if len(argv) < 2:
        parser.print_help()
        sys.exit(1)

    # Perform option parse, check for user-requested filter classes
    opts, remaining_args = parser.parse_args(argv)
    # Done with parser
    parser.destroy()

    if opts.debug:
        config.set(debug=True)

    # Carry the command-line settings (verbosity, break-on-exception, override Reader)
    # over to the tracer
    tracer.verbose = opts.verbose
    tracer.break_on_exception = opts.break_on_exception
    tracer.reader_class_name = opts.reader_class_name

    # Take remaining arguments as input file names
    # (remaining_args[0] seems to be sys.argv[0])
    files = remaining_args[1:]

    # If switch -L was passed, dump out all available filter names and quit
    if opts.do_list_filters:
        tracer.list_filters()
        sys.exit(0)

    # Run requested filters
    try:
        tracer.run(opts.filters_to_run, files)
    except RuntimeError as run_error:
        err('RuntimeError: %s', run_error)
        sys.exit(1)
def get_filter_by_switch(self, switch_name):
    '''Returns the class name of the filter whose short (-x) or long (--xyz) switch is
    switch_name. If no available filter uses that switch, an error is issued and None is
    returned.'''
    # A leading double dash selects the long form; anything else is treated as the short
    # form with a single leading character stripped.
    if switch_name.startswith('--'):
        attribute, wanted = 'long_opt', switch_name[2:]
    else:
        attribute, wanted = 'opt', switch_name[1:]

    for candidate in self.tracer.available_filters_dict.values():
        if getattr(candidate, attribute) == wanted:
            return candidate.__name__

    err("No filter with switch %s found.", switch_name)
    return None
def run(self, filters_to_run, files):
    '''Instantiates the requested filters and runs them over the given files.

    filters_to_run is iterated as (filter name, constructor arguments) pairs, and
    files is a list of file specifiers. A filter given the wrong number of arguments
    is skipped with a warning, and an unknown filter name is reported as an error;
    the filters which remain are passed to self.run_filters together with the
    expanded file specifiers.'''

    def expand_short_notation(spec):
        # short notation is
        #   corpus:ss,dd,deriv -> corpus/chtb_ssdd.fid:deriv
        # A specifier which does not start with that shape is passed through untouched.
        matched = re.match(r'([^:]+):(\d+),(\d+),(\d+)', spec)
        if not matched:
            return spec
        corpus_dir, sec, doc, deriv = matched.groups()
        doc_name = 'chtb_%02d%02d.fid:%d' % (int(sec), int(doc), int(deriv))
        return os.path.join(corpus_dir, doc_name)

    instances = []
    for filter_name, args in filters_to_run:
        # For a no-args switch, optparse passes in None; we substitute an empty tuple for
        # consistency
        args = args or ()

        # NOTE(review): the constructor call is inside the try, so a KeyError raised by a
        # filter's own __init__ is also reported as an unknown filter name.
        try:
            filter_class = self.available_filters_dict[filter_name]
            given = len(args)
            wanted = get_argcount_for_method(filter_class.__init__)
            if given == wanted:
                instances.append(filter_class(*args))
            else:
                warn("Skipping filter %s; %d arguments given, %d expected.",
                     filter_name, given, wanted)
        except KeyError:
            err("No filter with name `%s' found.", filter_name)

    # convert short notation in file specifiers to proper paths
    expanded_files = [expand_short_notation(spec) for spec in files]

    self.run_filters(instances, expanded_files)
def write_dot_format(deriv, fn, format, label=""):
    '''Renders the given derivation with dot, writing the output to the file fn.

    format is passed to dot as its -T output format, and label is passed through to
    make_graph. The location of dot is found with `which` on first use and kept in the
    global dot_path; if dot cannot be found, an error is issued and nothing is written.

    Raises RuntimeError if dot cannot be started or exits with a non-zero return code.'''
    global dot_path
    if not dot_path:
        which = os.popen('which dot')
        try:
            dot_path = which.read().strip()
        finally:
            # Do not leak the pipe to `which`
            which.close()

        if not dot_path:
            err('dot not found on this system. Ensure that dot is in the PATH.')
            return

    # Build the graph before starting dot, so that a failure here neither leaves a child
    # process behind nor creates the output file.
    graph = make_graph(deriv, label=label)

    # dot's stderr is discarded
    devnull = open(os.devnull, 'w')
    try:
        # The arguments are given as a list rather than as a shell command line, so that
        # a file name containing spaces or shell metacharacters reaches dot intact.
        try:
            dot = Popen([dot_path, '-T%s' % format, '-o', fn],
                        stdin=PIPE, stdout=PIPE, stderr=devnull, close_fds=True)
        except OSError as start_error:
            raise RuntimeError('dot could not be started: %s' % start_error)

        # communicate() writes the graph, drains stdout, closes both pipes and waits for
        # dot to exit.
        dot.communicate(graph)
    finally:
        devnull.close()

    if dot.returncode != 0:
        raise RuntimeError('dot terminated with non-zero return code: %d' % dot.returncode)
def p_full_regex(stk):
    # unlike the other productions, this returns a tuple of arguments to be splatted and
    # passed to a regex-taking node type (RE or REValue)
    #
    # The string below is the grammar rule for this production rather than documentation,
    # so it must stay the function's docstring and must not gain any prose.
    #
    # stk[1] is the regex token including its delimiters, and stk[2], when present, is the
    # specifier string. The result, stored in stk[0], is the pair (regex, kwargs):
    #   no specifier -> anchor_at_start=True
    #   'a'          -> anchor_at_start=False
    #   'u'          -> unicode=True
    '''
    full_regex : REGEX
               | REGEX REGEX_SPEC
    '''
    # Extract the regex between the slash delimiters
    regex = stk[1][1:-1].decode('u8')

    if len(stk) == 2:
        stk[0] = (regex, {'anchor_at_start': True})

    elif len(stk) == 3:
        spec = stk[2]
        kwargs = {}

        if 'a' in spec:
            kwargs['anchor_at_start'] = False
        if 'u' in spec:
            kwargs['unicode'] = True

        # Check the specifier one flag at a time: a substring test against 'au' would
        # reject the valid combination 'ua'.
        unknown_flags = [flag for flag in spec if flag not in 'au']
        if unknown_flags:
            err('Invalid regex specifier %s.', spec)

        stk[0] = (regex, kwargs)
def spans(ptb_tree):
    '''Returns a sequence of tuples (B, E, P), P in ("``", "`"), where the Bth token from
    the start, and the Eth token from the end of the given PTB derivation span a P-quoted
    portion of the text.

    B is None when a close quote is seen with no open quote pending (the span is taken to
    start at the beginning of the text), and E is None when an open quote is never closed
    (the span is taken to run to the end of the text).'''
    open_quotes = ("``", "`")
    close_quotes = ("''", "'")
    # The open quote which each close quote is allowed to terminate
    opener_for = {"''": "``", "'": "`"}

    leaf_nodes = [leaf for leaf in leaves(ptb_tree)
                  if not is_ignored(leaf, ignoring_quotes=False)]
    # TODO: do this without incurring another full pass through the full nodes list
    leaf_nodes_without_quotes = [leaf for leaf in leaf_nodes
                                 if not is_ignored(leaf, ignoring_quotes=True)]
    # should be equal to the CCG leaf count
    leaf_count = len(leaf_nodes_without_quotes)

    result = []
    pending = []  # stack of (open quote, index at which it was encountered)
    index = 0

    for leaf in leaf_nodes:
        lex = leaf.lex

        # Push open quote
        if lex in open_quotes:
            pending.append((lex, index))
            continue

        # The check for colon is to maintain derivation 21:61(24), which contains
        # an erroneously tagged single close quote.
        is_close_quote = leaf.tag not in ("POS", ":") and lex in close_quotes
        if not is_close_quote:
            # Only advance the index for a leaf corresponding to a CCGbank leaf
            index += 1
            continue

        # We treat the span end index as leaf_count-index, not that minus one,
        # because when we encounter the close quote, we are already one index
        # past the end of the quoted span.
        span_end = leaf_count - index

        if pending:
            # Pop open quote and match with close quote
            open_quote, span_begin = pending.pop()
            if opener_for[lex] != open_quote:
                warn("Unbalanced quotes, abandoning.")
                break
            result.append((span_begin, span_end, open_quote))
        else:
            # Quote stack is empty, assume quoted span starts from beginning of string
            if lex == "''":
                quote_type = "``"
            elif lex == "'":
                quote_type = "`"
            else:
                err("spans: should not reach")
            result.append((None, span_end, quote_type))

    # While open quotes are still on the stack, assume quoted span continues to end of string
    while pending:
        remaining_quote, span_begin = pending.pop()
        if remaining_quote not in open_quotes:
            warn("Unexpected quote %s after exhausting input.", remaining_quote)
            continue
        result.append((span_begin, None, remaining_quote))

    return result
def p_error(stk):
    # Reports a syntax error, including whatever value was passed in.
    # NOTE(review): presumably the parser generator's error hook, given the p_ prefix --
    # confirm. Documented in comments rather than a docstring, as the p_ functions in
    # this file use their docstrings for grammar rules.
    message = "Syntax error encountered: %s"
    err(message, stk)
def set_config_file(option, opt_string, value, parser, *args, **kwargs):
    '''Option callback which makes value the config file in use.

    Only value is used; the other parameters are accepted and ignored. An IOError raised
    by the assignment is reported as an error rather than propagated.'''
    try:
        config.config_file = value
    except IOError as load_error:
        err("Couldn't load config file `%s': %s", value, load_error)
# the filter fails with IOError: Broken pipe # In that case, running filters on further derivations will continue to # lead to 'Broken pipe', so just bail out if e.errno == errno.EPIPE: return except Exception, e: self.last_exceptions.append( (derivation_bundle, sys.exc_info()) ) if self._break_on_exception: raise FilterException(e, None) else: if self.last_exceptions: raise FilterException(e, None) except FilterException, e: for bundle, exception in self.last_exceptions: err("Processing failed on derivation %s of file %s:", bundle.label(), file) sys.excepthook(*exception) except IOError, e: for bundle, exception in self.last_exceptions: err("Processing failed on derivation %s of file %s:", bundle.label(), file) sys.excepthook(*exception) err("Processing failed with IOError: %s", e) raise for filter in filters: filter.output() if self.verbose: print >>sys.stderr, "---"
# lead to 'Broken pipe', so just bail out if e.errno == errno.EPIPE: return except Exception, e: self.last_exceptions.append( (derivation_bundle, sys.exc_info())) if self._break_on_exception: raise FilterException(e, None) else: if self.last_exceptions: raise FilterException(e, None) except FilterException, e: for bundle, exception in self.last_exceptions: err("Processing failed on derivation %s of file %s:", bundle.label(), file) sys.excepthook(*exception) except IOError, e: for bundle, exception in self.last_exceptions: err("Processing failed on derivation %s of file %s:", bundle.label(), file) sys.excepthook(*exception) err("Processing failed with IOError: %s", e) raise for filter in filters: filter.output() if self.verbose: print >> sys.stderr, "---"