Example #1
0
def zmq_coroutine(context, dealer_url, doc_class=None, automagic=False):
    """Generator-based coroutine that processes docs received on a REP socket.

    A ``zmq.REP`` socket is created from ``context`` and connected to
    ``dealer_url``. The coroutine then loops forever: each received message
    is decoded with ``Reader(istream, doc_class, automagic)`` and every doc
    it produces is yielded to the caller. The value the caller sends back
    into the generator is written to the reply; when nothing truthy is sent
    (for example when the caller just uses ``next()``), the yielded doc
    itself is written instead. After the last doc of a message, the
    accumulated output is sent back as the reply.

    The socket is closed when the generator is closed or terminated by an
    exception, so it does not outlive the coroutine.
    """
    # FIXME: reduce overhead of reader/writer creation
    ostream = io.BytesIO()
    socket = context.socket(zmq.REP)
    try:
        socket.connect(dealer_url)
        while True:
            msg = socket.recv()
            # A freshly constructed BytesIO already starts at position 0.
            istream = io.BytesIO(msg)
            reader = Reader(istream, doc_class, automagic)
            writer = Writer(ostream, reader.doc_schema)
            for doc in reader:
                res = yield doc
                writer.write(res or doc)
            socket.send(ostream.getvalue())
            # Reset the shared output buffer before handling the next message.
            ostream.seek(0)
            ostream.truncate(0)
    finally:
        # Runs on generator.close(), garbage collection, or any error above.
        socket.close()
Example #2
0
def zmq_coroutine(context, dealer_url, doc_class=None, automagic=False):
  """Generator-based coroutine that processes docs received on a REP socket.

  Creates a ``zmq.REP`` socket from ``context``, connects it to
  ``dealer_url`` and loops forever. Every received message is decoded with
  ``Reader(istream, doc_class, automagic)``; each doc is yielded to the
  caller, and the value sent back into the generator (or the doc itself
  when nothing truthy is sent) is written to the reply buffer. When the
  message has been fully consumed the buffer is sent as the reply.

  The socket is closed when the generator is closed or terminated by an
  exception, so it does not outlive the coroutine.
  """
  # FIXME: reduce overhead of reader/writer creation
  ostream = io.BytesIO()
  socket = context.socket(zmq.REP)
  try:
    socket.connect(dealer_url)
    while True:
      msg = socket.recv()
      # A freshly constructed BytesIO already starts at position 0.
      istream = io.BytesIO(msg)
      reader = Reader(istream, doc_class, automagic)
      writer = Writer(ostream, reader.doc_schema)
      for doc in reader:
        res = yield doc
        writer.write(res or doc)
      socket.send(ostream.getvalue())
      # Reset the shared output buffer before handling the next message.
      ostream.seek(0)
      ostream.truncate(0)
  finally:
    # Runs on generator.close(), garbage collection, or any error above.
    socket.close()
Example #3
0
def stream_coroutine(istream, ostream, doc_class=None, automagic=False):
    """Generator-based coroutine that pipes docs from ``istream`` to ``ostream``.

    Each doc produced by ``Reader(istream, doc_class, automagic)`` is yielded
    to the caller. Whatever the caller sends back into the generator is
    written out through a ``Writer`` built on the reader's ``doc_schema``;
    if nothing truthy is sent, the yielded doc itself is written.
    """
    source = Reader(istream, doc_class, automagic)
    sink = Writer(ostream, source.doc_schema)
    for original in source:
        replacement = yield original
        # Fall back to the unmodified doc when the caller sent nothing back.
        sink.write(replacement if replacement else original)
Example #4
0
def stream_coroutine(istream, ostream, doc_class=None, automagic=False):
  """Generator-based coroutine that pipes docs from ``istream`` to ``ostream``.

  Docs are read with ``Reader(istream, doc_class, automagic)`` and yielded
  one at a time. The value sent back into the generator is written with a
  ``Writer`` that uses the reader's ``doc_schema``; when that value is
  falsy (e.g. ``None``), the yielded doc is written instead.
  """
  doc_reader = Reader(istream, doc_class, automagic)
  doc_writer = Writer(ostream, doc_reader.doc_schema)
  for current in doc_reader:
    outgoing = yield current
    if not outgoing:
      # Nothing usable was sent back: emit the doc as it was read.
      outgoing = current
    doc_writer.write(outgoing)
Example #5
0
 def stream_reader_writer(self):
     """Return the doc reader paired with a ``Writer`` on ``self.args.out_stream``.

     The reader and schema both come from ``self.get_reader_and_schema()``;
     the writer is constructed with that same schema.
     """
     reader, doc_schema = self.get_reader_and_schema()
     out_writer = Writer(self.args.out_stream, doc_schema)
     return reader, out_writer
                    ntokens += 1
                    prev_raw = raw
                sentence = doc.sents.create(span=slice(sent_start, ntokens))
                nsents += 1
            paragraph = doc.paras.create(span=slice(paragraph_start, nsents))
    return doc

if __name__ == '__main__':
    # Command-line entry point: read one WARC file and write the docs
    # produced by process() to stdout in binary form.
    import argparse
    import sys
    sys.path.append('.')
    ap = argparse.ArgumentParser(description="Reads an input warc file, and converts it to docrep on stdout")
    ap.add_argument('input_warc')
    ap.add_argument('-a', '--annotations', default=None, help='FACC annotation file')
    # NOTE: the value of --debug is used below as the path of a file opened
    # in append mode and handed to process().
    ap.add_argument('-d', '--debug', default=None, help='Print debug log information')
    args = ap.parse_args()
    sys.stdout = sys.stdout.detach() # make stdout binary (for py3)
    writer = Writer(sys.stdout, Doc)
    log.info("Processing {}".format(args.input_warc))
    debug = None
    if args.debug is not None:
        debug = open(args.debug, "a")
    try:
        for doc in process(args.input_warc, args.annotations, debug):
            writer.write(doc)
    except Exception as e:
        # Report the failure on stderr and signal it through the exit status.
        print(str(e), file=sys.stderr)
        sys.exit(1)
    finally:
        # Close the debug file on success and on the sys.exit(1) path alike.
        if debug is not None:
            debug.close()
    log.info("Finished processing {}".format(args.input_warc))