def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out, higher, quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    The three inputs are read with PTBReader, CCGbankReader and CCGbankDepsReader.
    For each aligned (PTB, CCG, dependency) triple, every quote span reported by
    spans() for the PTB tree is attached to the CCG tree with quoter.attach_quotes,
    which also receives _higher_ and _quotes_ unchanged. Each resulting CCG bundle
    is printed to _ccg_auto_out_ and each adjusted dependency entry to
    _ccg_parg_out_.
    '''
    # Load every input before the outputs are opened: opening with 'w' truncates,
    # so reading first means a failed read does not leave empty output files behind
    # and an output path that coincides with an input path does not wipe the input
    # before it has been read.
    penn_trees = list(PTBReader(ptb_file))
    ccg_trees = list(CCGbankReader(ccg_file))
    deps = list(CCGbankDepsReader(deps_file))

    matched_penn_trees = match_trees(penn_trees, ccg_trees)

    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            # zip stops at the shortest of the three sequences
            for ptb_bundle, ccg_bundle, dep in zip(matched_penn_trees, ccg_trees, deps):
                ptb_tree, ccg_tree = ptb_bundle.derivation, ccg_bundle.derivation

                quote_spans = spans(ptb_tree)
                while quote_spans:
                    span_start, span_end, quote_type = quote_spans.pop(0)
                    # Nothing to attach when neither end of the span is known
                    if span_start is None and span_end is None:
                        continue

                    info("Reinstating quotes to %s (%s, %s)", ccg_bundle.label(), span_start, span_end)

                    ccg_tree, quote_indices = quoter.attach_quotes(
                        ccg_tree, span_start, span_end, quote_type, higher, quotes)
                    # In case a new root has been installed, re-assign the new root to the CCGbank bundle
                    ccg_bundle.derivation = ccg_tree

                    # Shift remaining quote span indices by the number of quotes that have been inserted
                    quote_spans = fix_quote_spans(quote_spans, quote_indices)
                    dep = fix_dependencies(dep, quote_indices)

                print >> parg_out, dep
                print >> ccg_out, ccg_bundle
def __iter__(self):
    '''Yields one Derivation per pair of bundles read in lockstep from leftdir and rightdir.'''
    left_bundles = self.reader(self.leftdir)
    right_bundles = self.reader(self.rightdir)

    # izip ends as soon as either reader is exhausted
    for lhs, rhs in izip(left_bundles, right_bundles):
        info("Processing %s/%s", lhs.label(), rhs.label())

        paired = Derivation(lhs, rhs)
        yield paired

        # Drop this generator's references to the pair before advancing to the next one
        del paired, lhs, rhs
def run_filters(self, filters, files):
    '''Feeds every derivation bundle read from _files_ through each filter in _filters_.

    Each entry produced by self.transform(files) is read with PairedReader when
    self.is_pair_spec(entry) holds, and with DirFileGuessReader otherwise. When
    self.reader_class_name is set, the class of that name is looked up in this
    module's globals and handed to the reader as the reader_class keyword.

    self.last_exceptions is reset for each file; a (bundle, exc_info) pair is
    appended to it when an exception is caught.

    Raises RuntimeError when self.reader_class_name names no global of this module,
    and FilterException when an exception is caught while self._break_on_exception
    is true.
    '''
    # NOTE(review): this block was reviewed with its indentation flattened. The nesting
    # below is a reconstruction (the only one in which every try has a handler) --
    # confirm it against the upstream file.

    # If all given filters were not found or had wrong argument count, do nothing
    if not filters: return
    reader_args = {}
    if self.reader_class_name:
        try:
            reader_class = globals()[self.reader_class_name]
            info("Using reader class %s.", self.reader_class_name)
            reader_args['reader_class'] = reader_class
        except KeyError:
            raise RuntimeError("Reader class %s not found." % self.reader_class_name)
    # NOTE(review): the loop variables `file` and `filter` shadow the builtins of the same name
    for file in self.transform(files):
        if self.is_pair_spec(file):
            meta_reader = PairedReader
        else:
            meta_reader = DirFileGuessReader
        try:
            self.last_exceptions = []
            for derivation_bundle in meta_reader(file, verbose=self.verbose, **reader_args):
                if self.verbose: info("Processing %s...", derivation_bundle.label())
                try:
                    # Let every filter see which bundle it is currently working on
                    for filter in filters:
                        filter.context = derivation_bundle
                    # NOTE(review): as laid out here, `filter` is whatever the loop above left
                    # behind (the last filter), so only that filter's accept_leaf is consulted
                    # before accept_leaf is called on every filter -- confirm intent.
                    if filter.accept_leaf is not None:
                        for leaf in leaves(derivation_bundle.derivation):
                            for filter in filters:
                                filter.accept_leaf(leaf)
                                if filter.accept_comb_and_slash_index is not None:
                                    try:
                                        for slash_index, comb in enumerate(applications_per_slash(leaf)):
                                            filter.accept_comb_and_slash_index(leaf, comb, slash_index)
                                    except AttributeError: # TODO: hacky and inefficient, need this to work for PTB too
                                        pass
                    # Hand the whole bundle to each filter, then clear its context
                    for filter in filters:
                        filter.accept_derivation(derivation_bundle)
                        filter.context = None
                except IOError, e:
                    # If output is going to a pager, and the user requests an interrupt (^C)
                    # the filter fails with IOError: Broken pipe
                    # In that case, running filters on further derivations will continue to
                    # lead to 'Broken pipe', so just bail out
                    # Any other IOError is dropped here and the next bundle is processed.
                    if e.errno == errno.EPIPE: return
        except Exception, e:
            # NOTE(review): as laid out here this handler sits outside the bundle loop, so
            # derivation_bundle is the bundle most recently produced by a reader, and is
            # unbound if the very first reader fails before yielding anything -- confirm.
            self.last_exceptions.append( (derivation_bundle, sys.exc_info()) )
            if self._break_on_exception:
                raise FilterException(e, None)
def main(argv):
    '''Parses the command line in _argv_ and runs process() on every matching PTB file.

    PTB files are located under opts.penn_in; for each one the corresponding CCGbank
    derivation and dependency files are expected under opts.ccg_in, and the results
    are written under the AUTO and PARG subdirectories of opts.outdir. The help text
    is printed and the program exits with status 1 when required switches are missing.
    '''
    parser = OptionParser()
    register_builtin_switches(parser)
    opts, args = parser.parse_args(argv)

    if not all_required_args_present(opts):
        parser.print_help()
        sys.exit(1)

    quoting_methods = { 'span': SpanQuoter, 'lca' : LCAQuoter }
    quoter_class = quoting_methods[opts.quote_method]
    punct_methods = { 'swap' : SwapComma, 'shift': ShiftComma }
    # An unrecognised punctuation method yields None
    punct_class = punct_methods.get(opts.punct_method)
    quoter = quoter_class(punct_class)

    # If no sec/doc specifiers are given, assume 'all sections all documents'
    deriv_specs = args[1:] or [':']

    for sec_glob, doc_glob in parse_requested_derivs(deriv_specs):
        mrg_pattern = os.path.join(opts.penn_in, sec_glob, "wsj_%s%s.mrg" % (sec_glob, doc_glob))

        for ptb_file in glob(mrg_pattern):
            info("Processing %s", ptb_file)

            matches = PTBFileRegex.search(ptb_file)
            if not (matches and len(matches.groups()) == 2):
                warn("Could not find, so ignoring %s", ptb_file)
                continue

            sec, doc = matches.groups()
            ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec, "wsj_%s%s.auto" % (sec, doc))
            deps_file = os.path.join(opts.ccg_in, 'PARG', sec, "wsj_%s%s.parg" % (sec, doc))

            # Missing inputs are only reported; processing is attempted regardless
            if not opts.quiet:
                if not os.path.exists(ccg_file):
                    warn("No corresponding CCGbank file %s for Penn file %s", ccg_file, ptb_file)
                if not os.path.exists(deps_file):
                    warn("No corresponding CCGbank dependency file %s for CCG file %s", deps_file, ccg_file)

            ccg_auto_dir = os.path.join(opts.outdir, 'AUTO', sec)
            ccg_parg_dir = os.path.join(opts.outdir, 'PARG', sec)
            for out_dir in (ccg_auto_dir, ccg_parg_dir):
                if not os.path.exists(out_dir):
                    os.makedirs(out_dir)

            ccg_auto_out = os.path.join(ccg_auto_dir, 'wsj_%s%s.auto' % (sec, doc))
            ccg_parg_out = os.path.join(ccg_parg_dir, 'wsj_%s%s.parg' % (sec, doc))

            process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out,
                    opts.higher, opts.quotes, quoter)
def __iter__(self):
    '''Yields every derivation bundle found under each path in self.sections.

    A path that is a directory is expanded to the entries directly inside it, visited
    in sorted order so that the sequence of bundles does not depend on the order in
    which the filesystem happens to list them. Any other path is read as a single
    document. Each document is read with self.reader.
    '''
    for section_path in self.sections:
        is_dir = os.path.isdir(section_path)
        if is_dir:
            # glob returns matches in arbitrary order; sort for a reproducible traversal
            doc_paths = sorted(glob(os.path.join(section_path, '*')))
        else:
            doc_paths = [section_path]

        for doc_path in doc_paths:
            # Only documents found by expanding a directory are announced
            if is_dir and self.verbose:
                info("Processing %s...", doc_path)

            reader = self.reader(doc_path)
            for deriv_bundle in reader:
                yield deriv_bundle
            # Drop the reader before moving on to the next document
            del reader
def process(ptb_file, ccg_file, deps_file, ccg_auto_out, ccg_parg_out,
            higher, quotes, quoter):
    '''Reinstates quotes given a PTB file and its corresponding CCGbank file and deps file.

    _ptb_file_, _ccg_file_ and _deps_file_ are read with PTBReader, CCGbankReader
    and CCGbankDepsReader respectively. Every quote span that spans() reports for
    a PTB tree is attached to the aligned CCG tree through quoter.attach_quotes
    (to which _higher_ and _quotes_ are passed through), and the dependency entry
    is adjusted to match. CCG bundles are printed to _ccg_auto_out_ and dependency
    entries to _ccg_parg_out_.
    '''
    # Read all inputs up front. Opening an output with 'w' truncates it, so doing
    # the reads first keeps a failed read from leaving empty outputs, and keeps an
    # output path that is also an input path from being emptied before it is read.
    penn_trees = list(PTBReader(ptb_file))
    ccg_trees = list(CCGbankReader(ccg_file))
    deps = list(CCGbankDepsReader(deps_file))

    matched_penn_trees = match_trees(penn_trees, ccg_trees)
    # zip ends with the shortest of the three sequences
    aligned = zip(matched_penn_trees, ccg_trees, deps)

    with open(ccg_auto_out, 'w') as ccg_out:
        with open(ccg_parg_out, 'w') as parg_out:
            for ptb_bundle, ccg_bundle, dep in aligned:
                ptb_tree = ptb_bundle.derivation
                ccg_tree = ccg_bundle.derivation

                quote_spans = spans(ptb_tree)
                while quote_spans:
                    span_start, span_end, quote_type = quote_spans.pop(0)
                    # A span with neither end known gives nothing to attach
                    if span_start is None and span_end is None:
                        continue

                    info("Reinstating quotes to %s (%s, %s)",
                         ccg_bundle.label(), span_start, span_end)

                    ccg_tree, quote_indices = quoter.attach_quotes(
                        ccg_tree, span_start, span_end, quote_type, higher,
                        quotes)
                    # In case a new root has been installed, re-assign the new root to the CCGbank bundle
                    ccg_bundle.derivation = ccg_tree

                    # Shift remaining quote span indices by the number of quotes that have been inserted
                    quote_spans = fix_quote_spans(quote_spans, quote_indices)
                    dep = fix_dependencies(dep, quote_indices)

                print >> parg_out, dep
                print >> ccg_out, ccg_bundle
def main(argv):
    '''Parses the command line in _argv_ and runs process() on every matching PTB file.

    PTB files are globbed under opts.penn_in. For each, the CCGbank derivation and
    dependency files are looked for under opts.ccg_in, output directories are
    created under opts.outdir as needed, and process() writes the results there.
    Prints the help text and exits with status 1 when required switches are missing.
    '''
    parser = OptionParser()
    register_builtin_switches(parser)
    opts, args = parser.parse_args(argv)

    if not all_required_args_present(opts):
        parser.print_help()
        sys.exit(1)

    quoter_class = {'span': SpanQuoter, 'lca': LCAQuoter}[opts.quote_method]
    # No punctuation handler (None) when the method is not one of the known ones
    punct_class = {'swap': SwapComma, 'shift': ShiftComma}.get(opts.punct_method)
    quoter = quoter_class(punct_class)

    # If no sec/doc specifiers are given, assume 'all sections all documents'
    requested = args[1:] or [':']
    ptb_files_spec = parse_requested_derivs(requested)

    for sec_glob, doc_glob in ptb_files_spec:
        ptb_pattern = os.path.join(opts.penn_in, sec_glob,
                                   "wsj_%s%s.mrg" % (sec_glob, doc_glob))

        for ptb_file in glob(ptb_pattern):
            info("Processing %s", ptb_file)

            matches = PTBFileRegex.search(ptb_file)
            recognised = matches and len(matches.groups()) == 2
            if not recognised:
                warn("Could not find, so ignoring %s", ptb_file)
                continue

            sec, doc = matches.groups()
            ccg_file = os.path.join(opts.ccg_in, 'AUTO', sec,
                                    "wsj_%s%s.auto" % (sec, doc))
            deps_file = os.path.join(opts.ccg_in, 'PARG', sec,
                                     "wsj_%s%s.parg" % (sec, doc))

            # Warn about absent inputs unless silenced; processing goes ahead either way
            if not opts.quiet:
                if not os.path.exists(ccg_file):
                    warn("No corresponding CCGbank file %s for Penn file %s",
                         ccg_file, ptb_file)
                if not os.path.exists(deps_file):
                    warn("No corresponding CCGbank dependency file %s for CCG file %s",
                         deps_file, ccg_file)

            ccg_auto_dir = os.path.join(opts.outdir, 'AUTO', sec)
            ccg_parg_dir = os.path.join(opts.outdir, 'PARG', sec)
            for target_dir in (ccg_auto_dir, ccg_parg_dir):
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)

            ccg_auto_out = os.path.join(ccg_auto_dir,
                                        'wsj_%s%s.auto' % (sec, doc))
            ccg_parg_out = os.path.join(ccg_parg_dir,
                                        'wsj_%s%s.parg' % (sec, doc))

            process(ptb_file, ccg_file, deps_file, ccg_auto_out,
                    ccg_parg_out, opts.higher, opts.quotes, quoter)
def run_filters(self, filters, files):
    '''Feeds every derivation bundle read from _files_ through each filter in _filters_.

    Each entry produced by self.transform(files) is read with PairedReader when
    self.is_pair_spec(entry) holds, and with DirFileGuessReader otherwise. When
    self.reader_class_name is set, the class of that name is looked up in this
    module's globals and handed to the reader as the reader_class keyword.

    self.last_exceptions is reset for each file; a (bundle, exc_info) pair is
    appended to it when an exception is caught.

    Raises RuntimeError when self.reader_class_name names no global of this module,
    and FilterException when an exception is caught while self._break_on_exception
    is true.
    '''
    # NOTE(review): this block was reviewed with its indentation flattened. The nesting
    # below is a reconstruction (the only one in which every try has a handler) --
    # confirm it against the upstream file.

    # If all given filters were not found or had wrong argument count, do nothing
    if not filters:
        return
    reader_args = {}
    if self.reader_class_name:
        try:
            reader_class = globals()[self.reader_class_name]
            info("Using reader class %s.", self.reader_class_name)
            reader_args['reader_class'] = reader_class
        except KeyError:
            raise RuntimeError("Reader class %s not found." %
                               self.reader_class_name)
    # NOTE(review): the loop variables `file` and `filter` shadow the builtins of the same name
    for file in self.transform(files):
        if self.is_pair_spec(file):
            meta_reader = PairedReader
        else:
            meta_reader = DirFileGuessReader
        try:
            self.last_exceptions = []
            for derivation_bundle in meta_reader(file,
                                                 verbose=self.verbose,
                                                 **reader_args):
                if self.verbose:
                    info("Processing %s...", derivation_bundle.label())
                try:
                    # Let every filter see which bundle it is currently working on
                    for filter in filters:
                        filter.context = derivation_bundle
                    # NOTE(review): as laid out here, `filter` is whatever the loop above left
                    # behind (the last filter), so only that filter's accept_leaf is consulted
                    # before accept_leaf is called on every filter -- confirm intent.
                    if filter.accept_leaf is not None:
                        for leaf in leaves(derivation_bundle.derivation):
                            for filter in filters:
                                filter.accept_leaf(leaf)
                                if filter.accept_comb_and_slash_index is not None:
                                    try:
                                        for slash_index, comb in enumerate(
                                                applications_per_slash(
                                                    leaf)):
                                            filter.accept_comb_and_slash_index(
                                                leaf, comb, slash_index)
                                    except AttributeError:  # TODO: hacky and inefficient, need this to work for PTB too
                                        pass
                    # Hand the whole bundle to each filter, then clear its context
                    for filter in filters:
                        filter.accept_derivation(derivation_bundle)
                        filter.context = None
                except IOError, e:
                    # If output is going to a pager, and the user requests an interrupt (^C)
                    # the filter fails with IOError: Broken pipe
                    # In that case, running filters on further derivations will continue to
                    # lead to 'Broken pipe', so just bail out
                    # Any other IOError is dropped here and the next bundle is processed.
                    if e.errno == errno.EPIPE:
                        return
        except Exception, e:
            # NOTE(review): as laid out here this handler sits outside the bundle loop, so
            # derivation_bundle is the bundle most recently produced by a reader, and is
            # unbound if the very first reader fails before yielding anything -- confirm.
            self.last_exceptions.append(
                (derivation_bundle, sys.exc_info()))
            if self._break_on_exception:
                raise FilterException(e, None)