class GenerateApp(App):
    """
    Generate empty documents.
    """
    ndocs_ap = ArgumentParser()
    ndocs_ap.add_argument('ndocs', nargs='?', metavar='COUNT', type=int, default=float('inf'),
                          help='The number of documents to generate (default: infinity)')
    arg_parsers = (ndocs_ap, OSTREAM_AP)

    def __call__(self):
        empty = io.BytesIO()
        writer = dr.Writer(empty, dr.Doc)
        writer.write(dr.Doc())
        empty = empty.getvalue()
        out = self.args.out_stream
        if six.PY3:
            out = out.buffer
        i = 0
        while i < self.args.ndocs:
            out.write(empty)
            i += 1
class ListStoresApp(App):
    """
    List the stores available in the corpus. Where multiple documents are input,
    also indicates the number of documents where they appear.
    """
    # Extend to list fields, and fields on stored types
    ls_arg_parser = ArgumentParser()
    ls_arg_parser.add_argument('-e', '--each-doc', dest='show_each', default=False, action='store_true',
                               help='List stores for each doc')
    arg_parsers = (ls_arg_parser, DESERIALISE_AP,)

    def __call__(self):
        counter = collections.defaultdict(int)
        for i, doc in enumerate(self.raw_stream_reader):
            names = list(get_store_names(doc))
            if self.args.show_each:
                print(' '.join(sorted(names)))
            for name in names:
                counter[name] += 1
        try:
            if i == 0:
                # a single document: no count column
                fmt = '{name}'
            else:
                fmt = '{name}\t{count}'
        except NameError:
            print("No documents found", file=sys.stderr)
            return
        for k, v in sorted(counter.items(), key=lambda tup: (-tup[1], tup[0])):
            print(fmt.format(name=k, count=v))
class KFoldsEvaluator(Evaluator):
    """Distribute to each of k folds"""
    ap = ArgumentParser()
    ap.add_argument('kfolds', type=int)
    arg_parsers = (ap,)

    def __call__(self, doc, ind):
        return ind % self.args.kfolds
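
# Illustrative note (not part of the app itself): with kfolds=3, documents at
# stream indices 0, 1, 2, 3, 4, 5 are assigned to folds 0, 1, 2, 0, 1, 2
# respectively, since the fold is simply the document index modulo kfolds.
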
class RandomEvaluator(Evaluator):
    """Shuffle the input randomly"""
    ap = ArgumentParser()
    ap.add_argument('--seed', dest='rand_seed', type=int, default=None)
    arg_parsers = (ap,)

    def __init__(self, argparser, args):
        super(RandomEvaluator, self).__init__(argparser, args)
        import random
        self.gen_random = random.Random(self.args.rand_seed).random

    def __call__(self, doc, ind):
        return self.gen_random()
class RenameApp(App):
    """
    Rename specified fields or stores.
    """
    # TODO: rename annotation classes
    rename_list_ap = ArgumentParser()
    rename_list_ap.add_argument('renames', nargs='+', type=RenameField,
                                help='Rename description of form [Class.]new_name=old_name')
    arg_parsers = (rename_list_ap, ISTREAM_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        rename_dict = collections.defaultdict(set)
        for klass, new, old in (args.renames or ()):
            rename_dict[klass].add((new, old))
        args.renames = dict(rename_dict)
        super(RenameApp, self).__init__(argparser, args)

    def __call__(self):
        # FIXME: externalise reflection methods
        reader, writer = self.stream_reader_writer
        for doc in reader:
            classes = {None: doc.__class__}
            classes.update((store.klass_name, store._klass)
                           for store in six.itervalues(doc._dr_stores))
            for klass_name, klass in six.iteritems(classes):
                try:
                    renames = self.args.renames[klass_name]
                except KeyError:
                    continue
                relevant = []
                for new, old in renames:
                    try:
                        del klass._dr_s2p[old]
                    except KeyError:
                        pass
                    else:
                        relevant.append((new, old))
                # s2p isn't used in Writer at present, but we'll update it just in case
                klass._dr_s2p.update(relevant)
                fields = klass._dr_fields.copy()
                fields.update(getattr(klass, '_dr_stores', ()))
                for new, old in relevant:
                    fields[old].serial = new
            writer.write(doc)
class SubsetApp(App):
    """
    Extract documents by non-negative index or slice (a generalisation of head).
    Behaviour is undefined for overlapping slices.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('slices', nargs='+', type=subset_type,
                            help='Non-negative slices in Python-like notation, e.g. 0, 5, :10, 5:10, 5:')
    arg_parsers = (arg_parser, ISTREAM_AP, OSTREAM_AP)

    @staticmethod
    def gen_subsets(it, *slices):
        if not slices:
            for obj in it:
                yield obj
            return
        starts = {sl.start for sl in slices}
        if None in starts:
            starts.add(0)
        stops = {sl.stop for sl in slices}
        if None in stops:
            pairs = enumerate(it)
        else:
            pairs = zip(range(max(stops)), it)
        yielding = False
        for i, obj in pairs:
            yielding = (yielding and i not in stops) or i in starts
            if yielding:
                yield obj

    def _run(self, *slices):
        # TODO: avoid deserialising
        writer = self.raw_stream_writer
        reader = self.raw_stream_reader
        for doc in self.gen_subsets(reader, *slices):
            writer.write(doc)

    def __call__(self):
        self._run(*self.args.slices)
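
# Illustrative example (not part of the app's test suite): given slices parsed
# from '1:3' and '5:', i.e. slice(1, 3) and slice(5, None),
#     list(SubsetApp.gen_subsets(iter('abcdefgh'), slice(1, 3), slice(5, None)))
# yields ['b', 'c', 'f', 'g', 'h']: output switches on at each start index and
# off at each stop index.
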
class SrcGenerator(App):
    """
    Generate source code for declaring types as instantiated in a given corpus,
    assuming headers are identical throughout.
    """
    srcgen_ap = ArgumentParser()
    add_subparsers(srcgen_ap, sorted(SrcGenLang.CLASSES.items()), 'gen_cls', title='target languages')
    srcgen_ap.add_argument('--doc-name', default='Document',
                           help='The name of the document class (default: %(default)r)')
    srcgen_ap.add_argument('--indent', default=' ',
                           help='The indent text (default: %(default)r)')
    arg_parsers = (srcgen_ap, DESERIALISE_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        super(SrcGenerator, self).__init__(argparser, args)
        self.generate = args.gen_cls(argparser, args)

    def __call__(self):
        doc = next(self.stream_reader)
        schema = doc._dr_rt.copy_to_schema()  # WARNING: using private
        self.generate(schema)
class SetFieldApp(App):
    """
    Set a named field on each document to a value.
    """
    field_name_ap = ArgumentParser()
    field_name_ap.add_argument('field_name', help='The field name to set')
    arg_parsers = (field_name_ap, get_evaluator_ap(), DESERIALISE_AP, OSTREAM_AP)

    def __call__(self):
        attr = self.args.field_name
        evaluator = self.evaluator
        reader, writer = self.stream_reader_writer
        for i, doc in enumerate(reader):
            if attr not in doc._dr_s2p:
                # TODO: externalise reflection methods
                doc._dr_s2p[attr] = attr
                doc._dr_fields[attr] = dr.Field(serial=attr)
            setattr(doc, attr, evaluator(doc, i))
            writer.write(doc)
class WriteConll(App):
    """
    Writes documents in CoNLL format, or a format which similarly lists fields
    separated by some delimiter.

    Example invocation:
      `cat docs.dr | dr conll --doc-class some.module.Document --norm -f pos --iob1 chunk.tag`

    For `--iob1 'chunk.tag'` to work, this assumes some.module.Document.drcli_decorate
    includes the following decoration:
      reverse_slices('chunks', 'tokens', 'span', all_attr='chunk')
    """
    annotations_ap = ArgumentParser()
    annotations_ap.add_argument('--tok-store', dest='get_tokens', default=attrgetter('tokens'), type=attrgetter,
                                help='Specify a particular Token store (default: tokens)')
    annotations_ap.add_argument('--sent-store', dest='get_sentences', default=attrgetter('sentences'), type=attrgetter,
                                help='Specify a particular Sentence store (default: sentences)')
    annotations_ap.add_argument('--sent-tok-slice', dest='get_sent_tok_slice', default=attrgetter('span'), type=attrgetter,
                                help='The field on Sentence objects which indicates its slice over tokens (default: span)')
    annotations_ap.add_argument('--ignore-sents', dest='get_sentences', action='store_const',
                                const=lambda doc: (_SuperSentence(),),
                                help='List all tokens as if in a single sentence')

    # TODO: use streams instead of string operations
    formatting_ap = ArgumentParser()
    formatting_ap.add_argument('--field-sep', dest='fmt_fields', default=fmt_separator('\t'), type=fmt_separator,
                               help='Separator between fields (default: tab)')
    formatting_ap.add_argument('--tok-sep', dest='fmt_toks', default=fmt_separator('\n'), type=fmt_separator,
                               help='Separator between tokens (default: newline)')
    formatting_ap.add_argument('--sent-sep', dest='fmt_sents', default=fmt_separator('\n\n'), type=fmt_separator,
                               help='Separator between sentences (default: double-newline)')
    formatting_ap.add_argument('--doc-sep', dest='fmt_docs', default=fmt_separator('\n\n#BEGIN-DOC\n\n'), type=fmt_separator,
                               help='Separator between documents (default: #BEGIN-DOC)')
    formatting_ap.add_argument('--candc', action=SetCandcAction, nargs=0,
                               help='Use default C&C tagger format')

    field_list_ap = ArgumentParser()
    field_list_ap.add_argument('--norm', dest='field_extractors', const=get_norm, action='append_const',
                               help='Output the normal token form')
    field_list_ap.add_argument('--raw', dest='field_extractors', const=get_raw, action='append_const',
                               help='Output the raw token form')
    field_list_ap.add_argument('-f', '--field', dest='field_extractors', type=attrgetter, action='append',
                               help='Output the specified field')
    field_list_ap.add_argument('--fn', dest='field_extractors', type=import_string, action='append',
                               help='Output the result of a function given a token')
    # Slice fields:
    field_list_ap.add_argument('--iob1', dest='field_extractors', action=_AppendSliceField,
                               slice_fmt=partial(_IOB, mode=_IOB.IOB1),
                               help='Outputs IOB1 given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)')
    field_list_ap.add_argument('--iob2', dest='field_extractors', action=_AppendSliceField,
                               slice_fmt=partial(_IOB, mode=_IOB.IOB2),
                               help='Outputs IOB2 given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)')
    field_list_ap.add_argument('--bilou', dest='field_extractors', action=_AppendSliceField,
                               slice_fmt=_BILOU,
                               help='Outputs BILOU given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)')
    field_list_ap.add_argument('--bmewo', dest='field_extractors', action=_AppendSliceField,
                               slice_fmt=partial(_BILOU, tags='BMEOW'),
                               help='Outputs BMEWO given the name of an attribute resulting from reverse_slices(.., all_attr=MY_ATTR)')
    # TODO: allow decorators to be specified on the command-line

    arg_parsers = (field_list_ap, formatting_ap, annotations_ap, DESERIALISE_AP)

    def __init__(self, argparser, args):
        if not args.field_extractors:
            argparser.error('At least one field extractor is required')
        if not hasattr(args, 'clean_field'):
            args.clean_field = lambda s: s
        super(WriteConll, self).__init__(argparser, args)

    def __call__(self):
        self.write_flattened(
            sys.stdout.write,
            self.args.fmt_docs(self.process_doc(doc) for doc in self.stream_reader))

    def write_flattened(self, write, iterable):
        for fragment in iterable:
            if isinstance(fragment, six.string_types):
                write(fragment)
            else:
                self.write_flattened(write, fragment)

    def process_doc(self, doc):
        token_store = self.args.get_tokens(doc)
        return self.args.fmt_sents(
            self.begin_sentence() or self.process_sent(sent, token_store)
            for sent in self.args.get_sentences(doc))

    def process_sent(self, sent, tok_store):
        return self.args.fmt_toks(
            self.process_tok(tok)
            for tok in tok_store[self.args.get_sent_tok_slice(sent)])

    def process_tok(self, tok):
        return self.args.fmt_fields(
            self.args.clean_field(str(extr(tok)))
            for extr in self.args.field_extractors)

    def begin_sentence(self):
        # TODO: should only need to do these checks once per instance
        for extr in self.args.field_extractors:
            f = getattr(extr, 'begin_sentence', None)
            if f:
                f()
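
# Tagging-scheme note (general CoNLL conventions, not specific to this corpus):
# IOB2 marks every chunk-initial token with B- and subsequent tokens with I-;
# IOB1 only uses B- when a chunk immediately follows another chunk of the same
# type; BILOU and BMEWO additionally mark chunk-final (L/E) and single-token
# (U/W) chunks, with O for tokens outside any chunk.
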
class CountApp(App):
    """
    Count the number of documents or annotations in named stores.

    Examples:
      %(prog)s            # display the number of documents found on standard input
      %(prog)s *.dr       # list the number of documents in each .dr file and their total
      %(prog)s -a         # display the number of elements in each store
      %(prog)s -s tokens  # display the total number of elements in the 'tokens' store
      %(prog)s -ds tokens # same with document count
      %(prog)s -ds tokens -s sentences  # same with number of 'sentences' elements
      %(prog)s -ea        # display the number of elements in each store per document
      %(prog)s -eac       # display the cumulative number of elements in each store per document
      %(prog)s -eacj      # the same with output in JSON rather than a table
      %(prog)s -tcv10     # every 10 documents, display the time and number of documents processed
      %(prog)s -aj --average --bytes  # display as JSON the average and total number of bytes consumed by each store
    """
    count_arg_parser = ArgumentParser()
    count_arg_parser.add_argument('-s', '--store', metavar='ATTR', dest='count_stores', action='append', default=[],
                                  help='Count the specified store')
    count_arg_parser.add_argument('-d', '--docs', dest='count_docs', action='store_true',
                                  help='Count the number of documents (default without stores specified)')
    count_arg_parser.add_argument('-a', '--all', dest='count_all', action='store_true',
                                  help='Count docs and elements in all stores found on the first document')
    count_arg_parser.add_argument('-v', '--every', dest='show_interval', type=int, metavar='N',
                                  help='Show counts every N docs')
    count_arg_parser.add_argument('-e', '--every1', dest='show_interval', action='store_const', const=1,
                                  help='Show counts every doc')
    count_arg_parser.add_argument('--bytes', dest='count_bytes', action='store_true', default=False,
                                  help='Count the number of bytes for each store, rather than the number of elements')
    count_arg_parser.add_argument('--no-subtotal', dest='show_subtotal', default=True, action='store_false',
                                  help='Hide the total count per input file')
    count_arg_parser.add_argument('--no-total', dest='show_total', default=True, action='store_false',
                                  help='Hide the total count across all documents')
    count_arg_parser.add_argument('--average', dest='show_average', default=False, action='store_true',
                                  help='Show an average size per document')
    count_arg_parser.add_argument('--no-header', dest='show_header', default=True, action='store_false',
                                  help='Hide the field names displayed by --fmt-table with more than one field output')
    count_arg_parser.add_argument('-c', '--cumulative', default=False, action='store_true',
                                  help='Show cumulative counts')
    count_arg_parser.add_argument('-t', '--timestamp', action='store_true', default=False,
                                  help='Output the time with each count')
    count_arg_parser.add_argument('--sep', dest='field_sep', default='\t',
                                  help='Output field separator (with --fmt-table)')
    count_arg_parser.add_argument('--fmt-table', dest='formatter_cls', action='store_const',
                                  const=CountTableFormatter, default=CountTableFormatter,
                                  help='Format output as a table (default)')
    count_arg_parser.add_argument('-j', '--fmt-json', dest='formatter_cls', action='store_const',
                                  const=CountJsonFormatter,
                                  help='Format output as JSON')
    count_arg_parser.add_argument('files', nargs='*', type=DrInputType,
                                  help='Specify files by name rather than standard input')
    arg_parsers = (count_arg_parser, ISTREAM_AP,)

    def __init__(self, argparser, args):
        if args.count_all and (args.count_docs or args.count_stores):
            argparser.error('--all flag may not be used in conjunction with --docs or store names')
        if not (args.count_docs or args.count_stores or args.count_all):
            args.count_docs = True
        if args.count_all:
            args.count_docs = True
        elif 1 == len(args.count_stores) + (1 if args.count_docs else 0):
            args.show_header = False
        if not args.files:
            args.files = [args.in_stream]
        if len(args.files) <= 1:
            args.show_subtotal = False
        if not (args.show_interval or args.show_header or args.show_total or args.show_subtotal or args.show_average):
            argparser.error('Nothing to display')
        if args.cumulative and not args.show_interval and not args.show_subtotal:
            argparser.error('--cumulative may not apply without --every or per-file subtotals')
        self.formatter = args.formatter_cls(args, sys.stdout)
        super(CountApp, self).__init__(argparser, args)

    def __call__(self):
        consts = CountFormatter
        unit = consts.COUNT_BYTES if self.args.count_bytes else consts.COUNT_ELEMENTS
        self.formatter.start()
        i = 0
        for in_file in self.args.files:
            if i and not self.args.cumulative:
                subtotals = [0] * len(extractors)
            for doc in read_raw_docs(in_file, on_end='break'):
                if not i:
                    names, extractors = self._get_counters(doc)
                    totals = [0] * len(extractors)
                    subtotals = [0] * len(extractors)
                    self.formatter.set_fields(names)
                doc_counts = [extract(doc) for extract in extractors]
                for j, c in enumerate(doc_counts):
                    subtotals[j] += c
                    totals[j] += c
                if self.args.show_interval and (i + 1) % self.args.show_interval == 0:
                    if self.args.cumulative:
                        self.formatter.add_row(totals, i, agg=consts.AGG_SUM, filename=in_file.name, unit=unit)
                    else:
                        self.formatter.add_row(doc_counts, i, filename=in_file.name, unit=unit)
                i += 1
            if self.args.show_subtotal:
                try:
                    self.formatter.add_row(subtotals, consts.FILE, agg=consts.AGG_SUM, filename=in_file.name, unit=unit)
                except NameError:
                    print("No documents to count", file=sys.stderr)
        try:
            if self.args.show_total:
                self.formatter.add_row(totals, consts.ALL, agg=consts.AGG_SUM, unit=unit)
            if self.args.show_average:
                self.formatter.add_row([x / i for x in totals], consts.ALL, agg=consts.AGG_AVG, unit=unit)
        except NameError:
            print("No documents to count", file=sys.stderr)
        self.formatter.finish()

    def _get_counters(self, doc):
        names = []
        extractors = []
        if self.args.count_all:
            self.args.count_stores = sorted(get_store_names(doc))
            if self.args.count_bytes:
                self.args.count_stores.insert(0, b'__meta__')
        else:
            self.args.count_stores = [name.encode('utf-8') for name in self.args.count_stores]
        if self.args.count_docs:
            names.append('docs')
            extractors.append(self._doc_counter)
        for store in self.args.count_stores:
            names.append(store.decode('utf-8'))
            extractors.append(self._make_store_counter(store))
        return names, extractors

    @staticmethod
    def _doc_counter(doc):
        return 1

    def _make_store_counter(self, attr):
        if not self.args.count_bytes:
            def count(doc):
                for name, klass, nelem in doc.stores:
                    if name == attr:
                        return nelem
                return 0
        else:
            # TODO: use wire count, relying on Joel's patches to msgpack-python
            def count(doc):
                if attr == b'__meta__':
                    return len(msgpack.packb(doc.doc))
                for i, (name, klass, nelem) in enumerate(doc.stores):
                    if name == attr:
                        return len(msgpack.packb(doc.instances[i]))
                return 0
        return count
class UpgradeVersionApp(App):
    """Upgrade wire format"""
    MAX_VERSION = 3
    ver_ap = ArgumentParser()
    ver_ap.add_argument('-t', '--target', dest='target_version', metavar='VERSION',
                        default=MAX_VERSION, type=int,
                        help='The target version number')
    # TODO: add arguments to save output to input file
    arg_parsers = (ver_ap, ISTREAM_AP, OSTREAM_AP)

    def __call__(self):
        unpacker = msgpack.Unpacker(self.args.in_stream, use_list=True, encoding=None)
        out = self.args.out_stream
        if six.PY3:
            out = out.buffer
        while self.process_doc(unpacker, out):
            pass

    def process_doc(self, messages, out):
        try:
            version = next(messages)
        except StopIteration:
            return False
        if not isinstance(version, int):
            # Put the first message back on:
            messages = itertools.chain((version,), messages)
            version = 1
        for version in range(version, self.args.target_version):
            messages = getattr(self, 'update_to_v{0}'.format(version + 1))(messages)
        msgpack.pack(self.args.target_version, out, use_bin_type=True)  # update functions do not output version
        for msg in messages:
            msgpack.pack(msg, out, use_bin_type=True)
        return True

    def update_to_v2(self, messages):
        """
        Performs the following changes:
        * Replaces is_slice value TRUE with NULL
        * Replaces slice stop from absolute to relative offset
        """
        # TODO: accept options to make certain fields self-pointers
        slice_fields = collections.defaultdict(set)
        meta_klass = None
        try:
            klasses = next(messages)
        except StopIteration as e:
            self._ended_early(e)
        for knum, (name, fields) in enumerate(klasses):
            if name == '__meta__':
                meta_klass = knum
            for fnum, fdef in enumerate(fields):
                if fdef.get(FieldType.IS_SLICE):
                    # None is the new True
                    fdef[FieldType.IS_SLICE] = None
                    slice_fields[knum].add(fnum)
        yield klasses  # changed
        del klasses

        try:
            stores = next(messages)
        except StopIteration as e:
            self._ended_early(e)
        yield stores  # unchanged

        for knum in itertools.chain((meta_klass,), (k for name, k, size in stores)):
            try:
                nbytes = next(messages)
                instances = next(messages)
            except StopIteration as e:
                self._ended_early(e)
            if knum not in slice_fields:
                # unchanged
                yield nbytes
                yield instances
                continue
            inst_iter = (instances,) if isinstance(instances, dict) else instances
            ksl_fields = slice_fields[knum]
            for instance in inst_iter:
                for f in ksl_fields:
                    val = instance.get(f)
                    if val:
                        instance[f] = (val[0], val[1] - val[0])  # changed
            yield len(msgpack.packb(instances))
            yield instances

    def _ended_early(self, exc):
        raise ValueError('Messages ended mid-document!')

    def _upgrade_obj_to_v2(self, obj):
        if isinstance(obj, list):
            for i, x in enumerate(obj):
                obj[i] = self._upgrade_obj_to_v2(x)
        elif isinstance(obj, dict):
            new_obj = {}
            for k, v in six.iteritems(obj):
                new_obj[self._upgrade_obj_to_v2(k)] = self._upgrade_obj_to_v2(v)
            obj = new_obj
        elif isinstance(obj, bytes):
            try:
                obj = obj.decode('utf-8')
            except UnicodeDecodeError:
                pass
        return obj

    def update_to_v3(self, messages):
        """
        Tries to decode as UTF-8 all values that were the old MessagePack string type.
        If they successfully decode, write them back out as a new MessagePack UTF-8
        type; otherwise write them out as a new MessagePack bytes type.
        """
        klasses = next(messages)
        assert isinstance(klasses, list)
        stores = next(messages)
        assert isinstance(stores, list)
        doc_instance_nbytes = next(messages)
        assert isinstance(doc_instance_nbytes, int)
        doc_instance = next(messages)
        assert isinstance(doc_instance, dict)
        all_instance_groups = []
        for i in range(len(stores)):
            instance_nbytes = next(messages)
            assert isinstance(instance_nbytes, int)
            instance_groups = next(messages)
            assert isinstance(instance_groups, list)
            all_instance_groups.append(instance_groups)

        klasses = self._upgrade_obj_to_v2(klasses)
        yield klasses
        stores = self._upgrade_obj_to_v2(stores)
        yield stores
        doc_instance = self._upgrade_obj_to_v2(doc_instance)
        yield len(msgpack.packb(doc_instance, use_bin_type=True))
        yield doc_instance
        for instance_groups in all_instance_groups:
            instance_groups = self._upgrade_obj_to_v2(instance_groups)
            yield len(msgpack.packb(instance_groups, use_bin_type=True))
            yield instance_groups
class SelectApp(App):
    """
    Select only (or remove) specified fields on each document.
    """
    field_list_ap = ArgumentParser()
    field_list_ap.add_argument('fields', nargs='+', type=SelectField,
                               help='Fields or stores to include (or exclude with -x). These are attributes on the '
                                    'document by default. When taking the form Class.field, Class objects will be '
                                    'similarly processed to retain or exclude given fields.')
    field_list_ap.add_argument('-x', '--exclude', action='store_true', default=False,
                               help='Treat all fields listed as those to exclude rather than to retain.')
    arg_parsers = (field_list_ap, ISTREAM_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        field_dict = collections.defaultdict(set)
        for klass, field in (args.fields or ()):
            field_dict[klass].add(field)
        args.doc_fields = field_dict[None]
        args.annot_fields = dict(field_dict)
        if args.exclude:
            self._perform = self._perform_exclude
        else:
            self._perform = self._perform_select
        super(SelectApp, self).__init__(argparser, args)

    def __call__(self):
        # FIXME: externalise reflection methods ... or avoid it by just deleting attributes
        reader, writer = self.stream_reader_writer
        for doc in reader:
            for store in six.itervalues(doc._dr_stores):
                try:
                    fields = self.args.annot_fields[store.klass_name]
                except KeyError:
                    continue
                self._perform(fields, store._klass._dr_s2p, store._klass._dr_fields)
            if self.args.doc_fields:
                self._perform(self.args.doc_fields, doc._dr_s2p, doc._dr_fields, doc._dr_stores)
            writer.write(doc)

    def _perform_exclude(self, fields, *attr_dicts):
        # FIXME: work for non-identity s2p maps, if necessary
        for attr_dict in attr_dicts:
            for f in fields:
                try:
                    del attr_dict[f]
                except KeyError:
                    pass

    def _perform_select(self, fields, *attr_dicts):
        # FIXME: work for non-identity s2p maps, if necessary
        for attr_dict in attr_dicts:
            for f in set(attr_dict) - fields:
                try:
                    del attr_dict[f]
                except KeyError:
                    pass
class DumpApp(App):
    """
    Debug: unpack the stream and pretty-print it.
    """
    dump_ap = ArgumentParser()
    dump_ap.add_argument('-m', '--human', dest='human_readable', action='store_true', default=False,
                         help='Reinterpret the messages to be more human-readable by integrating headers into content.')
    dump_ap.add_argument('-n', '--numbered', action='store_true', default=False,
                         help='In --human mode, add a \'#\' field to each annotation, indicating its ordinal index')
    dump_ap.add_argument('-d', '--headers', dest='hide_instances', action='store_true', default=False,
                         help='Show headers only, hiding any instances')
    dump_ap.add_argument('-r', '--reverse-pointers', action='store_true', default=False,
                         help='Show pointer and slice sources at their target sites, only if --human')
    dump_ap.add_argument('-j', '--json', dest='format', action='store_const', const='json', default='pprint',
                         help='Output valid JSON')
    arg_parsers = (dump_ap, ISTREAM_AP, OSTREAM_AP)

    def dump(self, obj):
        print(self.format(obj), file=self.args.out_stream)

    def __call__(self, encoding='utf-8'):
        if six.PY2 and isinstance(encoding, six.text_type):
            encoding = encoding.encode('utf-8')
        self.format = FORMATTERS[self.args.format]
        unpacker = msgpack.Unpacker(self.args.in_stream, encoding=encoding)
        if self.args.human_readable:
            unpacker = self._integrate_names(unpacker)
        elif self.args.hide_instances:
            unpacker = self._headers_only(unpacker)
        first = True
        for obj in unpacker:
            if self.args.format == 'json':
                print('[' if first else ',', file=self.args.out_stream)
            self.dump(obj)
            first = False
        if self.args.format == 'json':
            print(']', file=self.args.out_stream)

    def _headers_only(self, unpacker):
        for doc in read_raw_docs(unpacker):
            yield doc.version
            yield doc.klasses
            yield doc.stores

    def _integrate_names(self, unpacker):
        for i, doc in enumerate(read_raw_docs(unpacker)):
            obj = {}
            obj['__version__'] = doc.version
            store_defs = list(self._process_store_defs(doc.stores, doc.klasses))
            obj['__meta__'] = {
                'fields': dict(self._fields_to_dict(doc.klasses[META_TYPE][1], store_defs)),
                'item': self._process_annot(doc.doc, doc.klasses[META_TYPE][1]),
            }
            if self.args.numbered:
                obj['#'] = i
            for (store_name, store), instances in zip(store_defs, doc.instances):
                obj[store_name] = store
                if not self.args.hide_instances:
                    store['items'] = [self._process_annot(item, store['fields']) for item in instances]
                    if self.args.numbered:
                        for j, item in enumerate(store['items']):
                            item['#'] = j
                store['fields'] = dict(self._fields_to_dict(store['fields'], store_defs))
            if self.args.reverse_pointers:
                self._reverse_pointers_with_names(obj)
            yield obj

    def _process_store_defs(self, msg, types):
        for name, typ, size in msg:
            try:
                type_name, type_fields = types[typ]
            except IndexError:
                # for robustness to broken data
                type_name, type_fields = '??MissingType={0}'.format(typ), ()
            yield name, {'type': type_name, 'fields': type_fields, 'count': size}

    def _process_annot(self, msg, fields):
        return dict((fields[fnum][FieldType.NAME], val) for fnum, val in msg.items())

    TRAIT_NAMES = {
        FieldType.IS_SLICE: 'is slice',
        FieldType.IS_SELF_POINTER: 'is self-pointer',
        FieldType.IS_COLLECTION: 'is collection',
    }

    def _fields_to_dict(self, fields, store_defs, trait_names=TRAIT_NAMES):
        for field in fields:
            name = None
            traits = {}
            for k, v in field.items():
                if k == FieldType.NAME:
                    name = v
                elif k == FieldType.POINTER_TO:
                    traits['points to'], store_data = store_defs[v]
                elif k in trait_names:
                    traits[trait_names[k]] = v
                else:
                    traits[k] = v
            yield name, traits

    def _reverse_pointers_with_names(self, obj):
        for source_name, source_store in obj.items():
            if source_name in ('__version__', '#'):
                continue
            for source_field, source_desc in source_store.get('fields', {}).items():
                target_name = source_desc.get('points to')
                if target_name is None:
                    continue
                qual_field = '{}.{}'.format(source_name, source_field)
                target_items = obj[target_name]['items']
                is_slice = 'is slice' in source_desc
                if source_name == '__meta__':
                    source_items = [source_store['item']]
                else:
                    source_items = source_store['items']
                for i, source_item in enumerate(source_items):
                    pointers = source_item.get(source_field)
                    if not pointers:
                        continue
                    if is_slice:
                        for target in target_items[pointers[0]:pointers[0] + pointers[1]]:
                            target.setdefault(qual_field, []).append(i)
                    else:
                        if isinstance(pointers, list):
                            for j in pointers:
                                target_items[j].setdefault(qual_field, []).append(i)
                        else:
                            target_items[pointers].setdefault(qual_field, []).append(i)
class HackHeaderApp(App):
    """
    Debug: rewrite header components of given documents using Python literal input
    """
    hack_ap = ArgumentParser()
    hack_ap.add_argument('--klasses', default=None,
                         help='Overwrites the entire klasses header with the given list')
    hack_ap.add_argument('-k', '--klass', default=[], action='append',
                         help='Overwrites a klass definition, specified with <name|num>=[<new_name>,<field list>]')
    hack_ap.add_argument('-f', '--field', default=[], action='append',
                         help='Overwrites a field definition, specified with <klass-name|num>.<field-name|num>[+]=<map> '
                              '(use += for update semantics)')
    arg_parsers = (hack_ap, ISTREAM_AP, OSTREAM_AP)

    def __init__(self, argparser, args):
        super(HackHeaderApp, self).__init__(argparser, args)

        def parse(s, exp_type):
            try:
                res = ast.literal_eval(s)
            except (SyntaxError, ValueError):
                argparser.error('{0} is not a valid Python literal'.format(s))
            if exp_type is not None and type(res) != exp_type:
                argparser.error('{0} does not evaluate to type {1}'.format(s, exp_type))
            return res

        self.operations = []
        if args.klasses:
            self.operations.append((self._set_klasses, {'value': parse(args.klasses, list)}))
        for arg in args.klass:
            try:
                key, value = arg.split('=', 1)
            except ValueError:
                argparser.error('Expected <name>=<value>, got {0}'.format(arg))
            try:
                key = int(key)
            except ValueError:
                pass
            value = parse(value, list)
            if len(value) != 2:
                argparser.error('Expected a list of length 2, got {0}'.format(value))
            self.operations.append((self._set_klass, {'klass': key, 'value': value}))
        for arg in args.field:
            try:
                key, value = arg.split('=', 1)
                kname, fname = key.split('.')
            except ValueError:
                argparser.error('Expected <kname>.<fname>=<value>, got {0}'.format(arg))
            if fname.endswith('+'):
                fname = fname[:-1]
                update = True
            else:
                update = False
            try:
                kname = int(kname)
            except ValueError:
                pass
            try:
                fname = int(fname)
            except ValueError:
                pass
            value = parse(value, dict)
            self.operations.append((self._set_field, {'klass': kname, 'field': fname, 'value': value, 'update': update}))
        if not self.operations:
            argparser.error('Nothing to do!')

    def _set_klasses(self, klasses, stores, value):
        klasses[:] = value

    def _set_klass(self, klasses, stores, klass, value):
        if klass == len(klasses):
            klasses.append(value)
        for knum, (kname, fields) in enumerate(klasses):
            if klass in (knum, kname):
                klasses[knum] = value
                return
        raise ValueError('Could not find class {0}'.format(klass))

    def _set_field(self, klasses, stores, klass, field, value, update=False):
        for knum, (kname, fields) in enumerate(klasses):
            if klass not in (knum, kname):
                continue
            if field == len(fields):
                fields.append({})
            for fnum, fdef in enumerate(fields):
                fname = fdef.get(FieldType.NAME)
                if field in (fnum, fname):
                    if update:
                        fields[fnum].update(value)
                    else:
                        fields[fnum] = value
                    return
        raise ValueError('Could not find field {1} in class {0}'.format(klass, field))

    def __call__(self):
        writer = self.raw_stream_writer
        for doc in self.raw_stream_reader:
            for fn, kwargs in self.operations:
                fn(doc.klasses, doc.stores, **kwargs)
            writer.write(doc)
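
# Hypothetical usage sketch (the subcommand name and the numeric field keys
# depend on the installed CLI and wire schema; treat them as placeholders):
#   dr hackheader -k 'Token=["Tok", [{0: "norm"}]]'   # replace a klass definition by name
#   dr hackheader -f 'Token.norm+={2: "raw"}'         # merge a map into an existing field definition
# The [+]= form updates the existing field map instead of replacing it.
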
class ShellApp(App):
    """
    Loads the given input file into a Python shell as the variable `docs`

    Examples:
      %(prog)s -c 'for doc in docs: do_something()'  # executes the given code on `docs` read with automagic from standard input
      %(prog)s -o out.dr -c 'for doc in docs: do_something() and write_doc(doc)'  # same, writing the documents to out.dr
      %(prog)s path.dr  # open an interactive Python shell with `docs` read from path.dr with automagic
      %(prog)s --doc-class pkg.module.DocSchema path.dr  # same, but using the specified schema
    """
    SHELLS = ('ipython', 'bpython', 'python')
    ap = ArgumentParser()
    ap.add_argument('-s', '--shell', default=None,
                    help='One of {0} (default: try these in order)'.format(SHELLS))
    ap.add_argument('--doc-class', metavar='CLS', dest='doc_class', type=import_string, default=None,
                    help='Import path to the Document class for the input. If available, doc.{0}() will be called '
                         'for each document on the stream.'.format(DECORATE_METHOD))
    ap.add_argument('-o', '--out-file', type=argparse.FileType('wb'), default=None,
                    help='The output file, written to by `write_doc`')
    ap.add_argument('-c', '--code', default=None,
                    help='Execute the specified code (before opening an interactive session if -i is also used)')
    ap.add_argument('-i', '--interactive', default=False, action='store_true',
                    help='Use an interactive shell even if -c is supplied')
    ap.add_argument('in_file', type=DrInputType, nargs='?', default=None,
                    help='The input file')
    arg_parsers = (ap,)

    def __init__(self, argparser, args):
        args.interactive = args.interactive or args.code is None
        if args.interactive and not args.in_file:
            argparser.error('Cannot read documents from STDIN in interactive mode. Please provide a path to the documents.')
        if not args.in_file:
            import sys
            args.in_file = sys.stdin
        super(ShellApp, self).__init__(argparser, args)

    def __call__(self):
        local = self.build_locals()
        if self.args.code:
            exec(self.args.code, local)  # XXX: this is actually using globals, not locals
            if not self.args.interactive:
                return
        tmp = local
        local = self.run_startup()
        local.update(tmp)
        shells = [self.args.shell] if self.args.shell else self.SHELLS
        err = None
        for shell in shells:
            try:
                return getattr(self, 'run_' + shell)(local)
            except ImportError as e:
                err = e
        raise err

    def build_locals(self):
        res = {'__name__': '__main__'}
        from schwa import dr
        reader, schema = self.get_reader_and_schema(self.args.in_file)
        res.update({'dr': dr, 'docs': reader})
        if self.args.out_file:
            res['write_doc'] = dr.Writer(self.args.out_file, schema).write
        return res

    def run_startup(self):
        res = {'__name__': '__main__'}
        pythonrc = os.environ.get('PYTHONSTARTUP')
        if pythonrc and os.path.isfile(pythonrc):
            with open(pythonrc, 'rU') as f:
                try:
                    exec(f.read(), res)
                except NameError:
                    pass
        try:
            exec('import user', res)
        except ImportError:
            pass
        return res

    def run_ipython(self, local):
        try:
            from IPython.terminal.embed import TerminalInteractiveShell
            shell = TerminalInteractiveShell(user_ns=local)
            shell.mainloop()
        except ImportError:
            # IPython < 0.11
            # Explicitly pass an empty list as arguments, because otherwise
            # IPython would use sys.argv from this script.
            from IPython.Shell import IPShell
            shell = IPShell(argv=[], user_ns=local)
            shell.mainloop()

    def run_bpython(self, local):
        import bpython
        bpython.embed(locals_=local)

    def run_python(self, local):
        import code
        try:
            import readline
        except ImportError:
            pass
        else:
            import rlcompleter
            readline.set_completer(rlcompleter.Completer(local).complete)
            readline.parse_and_bind('tab:complete')
        code.interact(local=local)
class SplitApp(App):
    """
    Split a stream into k files, or a separate file for each key determined per doc.
    To perform stratified k-fold validation, first sort the corpus by the stratification label.
    If the evaluation returns a list, the document is written to each key in the list.
    """
    multioutput_ap = ArgumentParser()
    multioutput_ap.add_argument('-t', '--template', dest='path_tpl', default='fold{n:03d}.dr',
                                help='A template for output paths (default: %(default)s). '
                                     '{n} substitutes for fold number, {key} for evaluation output.')
    multioutput_ap.add_argument('--overwrite', action='store_true', default=False,
                                help='Overwrite an output file if it already exists.')
    multioutput_ap.add_argument('--sparse', action='store_true', default=False,
                                help='Use append mode to write files, and close the handle between writes')
    multioutput_ap.add_argument('--make-dirs', action='store_true', default=False,
                                help='Make directories when necessary')
    arg_parsers = (DESERIALISE_AP, multioutput_ap, get_evaluator_ap({'k': KFoldsEvaluator}),)

    def __init__(self, argparser, args):
        if '{' not in args.path_tpl:
            argparser.error('Output path template must include a substitution (e.g. {n:02d} or {key})')
        super(SplitApp, self).__init__(argparser, args)
        if self.args.sparse:
            if self.args.overwrite:
                argparser.error('--overwrite does not apply with --sparse')
            if isinstance(self.evaluator, KFoldsEvaluator):
                argparser.error('k-folds cannot be used with --sparse')
            if any(expr in args.path_tpl for expr in ('{n}', '{n!', '{n:')):
                # FIXME: use regexp
                argparser.error('--sparse must use filenames templated by key')

    def __call__(self):
        # TODO: clean up!!
        evaluator = self.evaluator
        if isinstance(evaluator, KFoldsEvaluator):
            # avoid full deserialisation
            # TODO: make more generic
            reader = self.raw_stream_reader
            from drapps.util import RawDocWriter
            make_writer = RawDocWriter
        else:
            reader, schema = self.get_reader_and_schema()
            make_writer = lambda out: dr.Writer(out, schema)

        if self.args.make_dirs:
            def fopen(path, mode):
                dirname = os.path.dirname(path)
                if not os.path.exists(dirname):
                    cur = ''
                    for part in dirname.split(os.path.sep):
                        cur += part
                        if part and not os.path.exists(cur):
                            os.mkdir(cur)
                        cur += os.path.sep
                return open(path, mode)
        else:
            fopen = open

        def new_writer(key):
            fold_num = len(writers)
            path = self.args.path_tpl.format(n=fold_num, key=key)
            if not self.args.overwrite and os.path.exists(path):
                print('Path {0} already exists. Use --overwrite to overwrite.'.format(path), file=sys.stderr)
                sys.exit(1)
            print('Writing fold {k} to {path}'.format(k=fold_num, path=path), file=sys.stderr)
            return make_writer(fopen(path, 'wb'))

        if self.args.sparse:
            get_writer = lambda key: make_writer(fopen(self.args.path_tpl.format(key=key), 'ab'))
        else:
            writers = {}

            def get_writer(key):
                try:
                    writer = writers[key]
                except KeyError:
                    writer = writers[key] = new_writer(key)
                return writer

        for i, doc in enumerate(reader):
            val = evaluator(doc, i)
            for key in val if isinstance(val, list) else (val,):
                writer = get_writer(key)
                writer.write(doc)
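
# Illustrative examples of the path template (file names shown are hypothetical):
#   --template 'fold{n:03d}.dr' with a 3-fold evaluator writes fold000.dr,
#   fold001.dr and fold002.dr; --template '{key}.dr' with an evaluator that
#   returns a label per document writes one file per distinct label.
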