def zmq_coroutine(context, dealer_url, doc_class=None, automagic=False):
    """Generator that serves documents arriving on a ZeroMQ REP socket.

    A REP socket is created from ``context`` and connected to
    ``dealer_url``.  The generator then loops forever: each received
    message is decoded with ``Reader`` and every document in it is
    yielded to the caller.  Whatever the caller passes back through
    ``send()`` is written to the reply; when nothing (or a falsy value)
    is sent back, the document that was yielded is written instead.
    Once all documents of a message have been handled, the serialised
    output is sent as the reply to that message.

    Args:
        context: object providing ``socket(kind)``; a ZeroMQ context.
        dealer_url: endpoint the REP socket connects to.
        doc_class: forwarded to ``Reader`` as its second argument.
        automagic: forwarded to ``Reader`` as its third argument.

    Yields:
        Each document produced by ``Reader`` for the current message.

    The socket is closed when the generator is closed, garbage
    collected, or terminated by an exception, so it is not left open
    after the consumer stops iterating.
    """
    # FIXME: reduce overhead of reader/writer creation
    ostream = io.BytesIO()
    socket = context.socket(zmq.REP)
    try:
        socket.connect(dealer_url)
        while True:
            msg = socket.recv()
            # A fresh BytesIO already starts at position 0, so no seek is needed.
            reader = Reader(io.BytesIO(msg), doc_class, automagic)
            writer = Writer(ostream, reader.doc_schema)
            for doc in reader:
                res = yield doc
                # Fall back to the original document when the caller sent
                # nothing (plain next()) or a falsy value.
                writer.write(res or doc)
            socket.send(ostream.getvalue())
            # Rewind before truncating: truncate() does not move the stream
            # position, and the buffer is reused for the next reply.
            ostream.seek(0)
            ostream.truncate(0)
    finally:
        # Runs on generator.close(), on garbage collection of a suspended
        # generator, and on any exception raised above.
        socket.close()
def zmq_coroutine(context, dealer_url, doc_class=None, automagic=False):
    """Generator that serves documents arriving on a ZeroMQ REP socket.

    A REP socket is created from ``context`` and connected to
    ``dealer_url``.  The generator then loops forever: each received
    message is decoded with ``Reader`` and every document in it is
    yielded to the caller.  Whatever the caller passes back through
    ``send()`` is written to the reply; when nothing (or a falsy value)
    is sent back, the document that was yielded is written instead.
    Once all documents of a message have been handled, the serialised
    output is sent as the reply to that message.

    Args:
        context: object providing ``socket(kind)``; a ZeroMQ context.
        dealer_url: endpoint the REP socket connects to.
        doc_class: forwarded to ``Reader`` as its second argument.
        automagic: forwarded to ``Reader`` as its third argument.

    Yields:
        Each document produced by ``Reader`` for the current message.

    The socket is closed when the generator is closed, garbage
    collected, or terminated by an exception, so it is not left open
    after the consumer stops iterating.
    """
    # FIXME: reduce overhead of reader/writer creation
    ostream = io.BytesIO()
    socket = context.socket(zmq.REP)
    try:
        socket.connect(dealer_url)
        while True:
            msg = socket.recv()
            # A fresh BytesIO already starts at position 0, so no seek is needed.
            reader = Reader(io.BytesIO(msg), doc_class, automagic)
            writer = Writer(ostream, reader.doc_schema)
            for doc in reader:
                res = yield doc
                # Fall back to the original document when the caller sent
                # nothing (plain next()) or a falsy value.
                writer.write(res or doc)
            socket.send(ostream.getvalue())
            # Rewind before truncating: truncate() does not move the stream
            # position, and the buffer is reused for the next reply.
            ostream.seek(0)
            ostream.truncate(0)
    finally:
        # Runs on generator.close(), on garbage collection of a suspended
        # generator, and on any exception raised above.
        socket.close()
def stream_coroutine(istream, ostream, doc_class=None, automagic=False):
    """Yield each document read from ``istream`` and write the result to ``ostream``.

    A ``Reader`` is built over ``istream`` (with ``doc_class`` and
    ``automagic`` passed straight through) and a ``Writer`` over
    ``ostream`` using the reader's ``doc_schema``.  Every document is
    yielded to the caller; the value passed back via ``send()`` is written
    out, or the yielded document itself when that value is falsy (for
    example ``None`` after a plain ``next()``).
    """
    source = Reader(istream, doc_class, automagic)
    sink = Writer(ostream, source.doc_schema)
    for original in source:
        replacement = yield original
        # Prefer what the caller handed back; otherwise pass through unchanged.
        sink.write(replacement if replacement else original)
def stream_coroutine(istream, ostream, doc_class=None, automagic=False):
    """Yield each document read from ``istream`` and write the result to ``ostream``.

    A ``Reader`` is built over ``istream`` (with ``doc_class`` and
    ``automagic`` passed straight through) and a ``Writer`` over
    ``ostream`` using the reader's ``doc_schema``.  Every document is
    yielded to the caller; the value passed back via ``send()`` is written
    out, or the yielded document itself when that value is falsy (for
    example ``None`` after a plain ``next()``).
    """
    source = Reader(istream, doc_class, automagic)
    sink = Writer(ostream, source.doc_schema)
    for original in source:
        replacement = yield original
        # Prefer what the caller handed back; otherwise pass through unchanged.
        sink.write(replacement if replacement else original)
def stream_reader_writer(self):
    """Return the reader from ``get_reader_and_schema`` with a matching writer.

    The writer is a ``Writer`` over ``self.args.out_stream`` constructed
    with the schema returned alongside the reader.

    Returns:
        A ``(reader, writer)`` tuple.
    """
    reader, doc_schema = self.get_reader_and_schema()
    writer = Writer(self.args.out_stream, doc_schema)
    return reader, writer
ntokens += 1 prev_raw = raw sentence = doc.sents.create(span=slice(sent_start, ntokens)) nsents += 1 paragraph = doc.paras.create(span=slice(paragraph_start, nsents)) return doc if __name__ == '__main__': # grep-like over HTTP response import argparse import sys sys.path.append('.') ap = argparse.ArgumentParser(description="Reads an input warc file, and converts it to docrep on stdout") ap.add_argument('input_warc') ap.add_argument('-a', '--annotations', default=None, help='FACC annotation file') ap.add_argument('-d', '--debug', default=None, help='Print debug log information') args = ap.parse_args() sys.stdout = sys.stdout.detach() # make stdout binary (for py3) writer = Writer(sys.stdout, Doc) log.info("Processing {}".format(args.input_warc)) debug = None if args.debug is not None: debug = open(args.debug, "a") try: for doc in process(args.input_warc, args.annotations, debug): writer.write(doc) except Exception as e: print(str(e), file=sys.stderr) sys.exit(1) log.info("Finished processing {}".format(args.input_warc))