def test(self):
        """Round-trip the stream produced by ``write`` and compare the bytes.

        Both documents are read back through an automagic ``dr.Reader``,
        checked field by field, and re-written with the schema the reader
        reported for each of them.  The rewritten stream must equal the
        original stream byte for byte.
        """
        orig = six.BytesIO()
        write(orig)
        orig.seek(0)

        reader = dr.Reader(orig, automagic=True)
        loaded = []
        for _ in range(2):
            # The schema is sampled right after each read, before the next one.
            loaded_doc = reader.next()
            loaded.append((loaded_doc, reader.doc_schema))
        with self.assertRaises(StopIteration):
            reader.next()
        (first_doc, first_schema), (second_doc, second_schema) = loaded

        rewritten = six.BytesIO()

        # First document: both stores exist but hold nothing.
        writer = dr.Writer(rewritten, first_schema)
        for store_name in ('tokens', 'sents'):
            self.assertTrue(hasattr(first_doc, store_name))
        self.assertEqual(len(first_doc.tokens), 0)
        self.assertEqual(len(first_doc.sents), 0)
        self.assertEqual(first_doc.adjectives, [])
        writer.write(first_doc)

        # Second document: five tokens, one sentence, two adjectives.
        writer = dr.Writer(rewritten, second_schema)
        for store_name in ('tokens', 'sents'):
            self.assertTrue(hasattr(second_doc, store_name))
        self.assertEqual(len(second_doc.tokens), 5)
        self.assertEqual(len(second_doc.sents), 1)
        expected_tokens = (
            ('The', slice(0, 3)),
            ('quick', slice(4, 9)),
            ('brown', slice(11, 16)),
            ('fox', slice(17, 20)),
            ('.', slice(20, 21)),
        )
        for index, (norm, span) in enumerate(expected_tokens):
            self.assertEqual(second_doc.tokens[index].norm, norm)
            self.assertEqual(second_doc.tokens[index].span, span)
        self.assertEqual(second_doc.sents[0].span, slice(0, 5))
        self.assertListEqual(second_doc.adjectives, second_doc.tokens[1:3])
        writer.write(second_doc)

        orig.seek(0)
        rewritten.seek(0)
        self.assertEqual(orig.getvalue(), rewritten.getvalue())
Esempio n. 2
0
    def test(self):
        """Serialise a two-sentence document and compare with fixed bytes.

        Ten tokens and two sentences are created on a ``Doc``, the document is
        written with ``dr.Writer`` and the output must equal the hand-written
        byte string below.
        """
        first_sentence_tokens = (
            (slice(0, 3), 'The'),
            (slice(4, 9), 'quick'),
            (slice(11, 16), 'brown'),
            (slice(17, 20), 'fox'),
            (slice(20, 21), '.'),
        )
        second_sentence_tokens = (
            (slice(22, 25), 'The'),
            (slice(26, 30), 'lazy'),
            (slice(31, 34), 'cat'),
            (slice(35, 38), 'too'),
            (slice(38, 39), '.'),
        )

        doc = Doc()
        # Creation order is kept: tokens of a sentence first, then the sentence.
        for span, raw in first_sentence_tokens:
            doc.tokens.create(span=span, raw=raw)
        doc.sents.create(span=slice(0, 5))
        for span, raw in second_sentence_tokens:
            doc.tokens.create(span=span, raw=raw)
        doc.sents.create(span=slice(5, 10))

        # Expected serialised form of the document, spelled out chunk by chunk.
        correct = (
            b'\x03'
            b'\x93'
            b'\x92'
            b'\xa8__meta__'
            b'\x90'
            b'\x92'
            b'\xa4Sent'
            b'\x92'
            b'\x81\x00\xa6number'
            b'\x83\x00\xa4span\x01\x01\x02\xc0'
            b'\x92'
            b'\xa5Token'
            b'\x92'
            b'\x81\x00\xa3raw'
            b'\x82\x00\xa4span\x02\xc0'
            b'\x92'
            b'\x93\xa5sents\x01\x02'
            b'\x93\xa6tokens\x02\x0a'
            b'\x01'
            b'\x80'
            b'\x0b'
            b'\x92'
            b'\x81\x01\x92\x00\x05'
            b'\x81\x01\x92\x05\x05'
            b'\x66'
            b'\x9a'
            b'\x82\x00\xa3The\x01\x92\x00\x03'
            b'\x82\x00\xa5quick\x01\x92\x04\x05'
            b'\x82\x00\xa5brown\x01\x92\x0b\x05'
            b'\x82\x00\xa3fox\x01\x92\x11\x03'
            b'\x82\x00\xa1.\x01\x92\x14\x01'
            b'\x82\x00\xa3The\x01\x92\x16\x03'
            b'\x82\x00\xa4lazy\x01\x92\x1a\x04'
            b'\x82\x00\xa3cat\x01\x92\x1f\x03'
            b'\x82\x00\xa3too\x01\x92\x23\x03'
            b'\x82\x00\xa1.\x01\x92\x26\x01'
        )

        buf = six.BytesIO()
        dr.Writer(buf, Doc).write(doc)

        self.assertEqual(buf.getvalue(), correct)
Esempio n. 3
0
 def build_locals(self):
     """Build the namespace dictionary for the session.

     The mapping always holds ``__name__`` (set to ``'__main__'``), the
     ``dr`` module and ``docs`` (the reader over ``self.args.in_file``).
     When ``self.args.out_file`` is truthy it also holds ``write_doc``, the
     bound ``write`` method of a ``dr.Writer`` on that file.
     """
     from schwa import dr
     reader, schema = self.get_reader_and_schema(self.args.in_file)
     namespace = {'__name__': '__main__', 'dr': dr, 'docs': reader}
     if self.args.out_file:
         out_writer = dr.Writer(self.args.out_file, schema)
         namespace['write_doc'] = out_writer.write
     return namespace
Esempio n. 4
0
    def __call__(self):
        """Distribute the documents of the input stream over output files.

        ``self.evaluator(doc, i)`` is called for every document and returns
        either a single key or a list of keys; the document is written once
        for each key.

        * With ``self.args.sparse`` the output path is
          ``self.args.path_tpl.format(key=key)`` and the file is opened in
          append mode for every single write.
        * Otherwise one writer per distinct key is created lazily and cached;
          its path is ``self.args.path_tpl.format(n=fold_num, key=key)`` where
          ``fold_num`` is the number of writers created so far.  The process
          exits with status 1 when the path already exists and
          ``self.args.overwrite`` is not set.

        With ``self.args.make_dirs`` missing parent directories of an output
        path are created before the file is opened.
        """
        # TODO: clean up!!
        evaluator = self.evaluator
        if isinstance(evaluator, KFoldsEvaluator):
            # avoid full deserialisation
            # TODO: make more generic
            reader = self.raw_stream_reader
            from drapps.util import RawDocWriter
            make_writer = RawDocWriter
        else:
            reader, schema = self.get_reader_and_schema()

            def make_writer(out):
                return dr.Writer(out, schema)

        if self.args.make_dirs:

            def fopen(path, mode):
                """Open ``path``, creating its parent directories if needed."""
                dirname = os.path.dirname(path)
                # An empty dirname means the current directory: nothing to
                # create, and os.makedirs('') would raise.
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                return open(path, mode)
        else:
            fopen = open

        def new_writer(key):
            """Create the writer for a key seen for the first time."""
            fold_num = len(writers)
            path = self.args.path_tpl.format(n=fold_num, key=key)
            if not self.args.overwrite and os.path.exists(path):
                print('Path {0} already exists. Use --overwrite to overwrite.'.
                      format(path),
                      file=sys.stderr)
                sys.exit(1)
            print('Writing fold {k} to {path}'.format(k=fold_num, path=path),
                  file=sys.stderr)
            return make_writer(fopen(path, 'wb'))

        # NOTE(review): none of the files opened below is closed explicitly;
        # flushing relies on the file objects being garbage collected.
        if self.args.sparse:

            def get_writer(key):
                return make_writer(
                    fopen(self.args.path_tpl.format(key=key), 'ab'))
        else:
            writers = {}

            def get_writer(key):
                try:
                    writer = writers[key]
                except KeyError:
                    writer = writers[key] = new_writer(key)
                return writer

        for i, doc in enumerate(reader):
            val = evaluator(doc, i)
            # A non-list result is treated as one single key.
            for key in val if isinstance(val, list) else (val, ):
                writer = get_writer(key)
                writer.write(doc)
Esempio n. 5
0
def write_read(doc, out_schema, in_schema=None):
    """Write ``doc`` with ``out_schema`` and read it back with ``in_schema``.

    ``in_schema`` defaults to ``out_schema``.  Both schemas are printed
    before they are used.  Returns the first document read back from the
    in-memory stream.
    """
    in_schema = out_schema if in_schema is None else in_schema

    print('Writing {0}'.format(out_schema))
    buf = six.BytesIO()
    writer = dr.Writer(buf, out_schema)
    writer.write(doc)

    buf.seek(0)
    print('Reading {0}'.format(in_schema))
    reader = dr.Reader(buf, in_schema)
    return reader.next()
Esempio n. 6
0
  def test_exception_message(self):
    """Check the error raised when a stream is read with a mismatched schema.

    A document written with the ``DocToken`` schema is read back with
    ``DocTok``; the reader must raise ``ReaderException`` with a message
    matching the pattern below.
    """
    doc = DocToken()
    t = doc.tokens.create()
    t.raw = 'meow'

    stream = io.BytesIO()
    writer = dr.Writer(stream, DocToken)
    writer.write(doc)

    stream.seek(0)
    reader = dr.Reader(stream, DocTok)
    # assertRaisesRegexp is a deprecated alias that Python 3.12 removed, while
    # Python 2.7 only knows the old spelling: prefer the new name, fall back
    # to the old one.
    assert_raises_regex = getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp
    with assert_raises_regex(ReaderException, r"Store u?'tokens' points to annotation type u?'.*Tok' but the store on the stream points to a lazy type \(u?'Token'\)\."):
      next(reader)
Esempio n. 7
0
    def __call__(self):
        """Write ``self.args.ndocs`` serialised empty documents to the output.

        One empty ``dr.Doc`` is serialised once and its bytes are written
        repeatedly.  On Python 3 the bytes go to the ``buffer`` attribute of
        ``self.args.out_stream``.
        """
        buf = io.BytesIO()
        dr.Writer(buf, dr.Doc).write(dr.Doc())
        empty_doc = buf.getvalue()

        out = self.args.out_stream.buffer if six.PY3 else self.args.out_stream
        written = 0
        while written < self.args.ndocs:
            out.write(empty_doc)
            written += 1
def write(out):
    """Write two ``Doc`` instances to ``out`` with a single ``dr.Writer``.

    The first document is left empty.  The second one holds five tokens, one
    sentence spanning them, and ``adjectives`` set to the tokens at indices
    1 and 2.
    """
    empty_doc = Doc()

    full_doc = Doc()
    token_specs = (
        (slice(0, 3), 'The'),
        (slice(4, 9), 'quick'),
        (slice(11, 16), 'brown'),
        (slice(17, 20), 'fox'),
        (slice(20, 21), '.'),
    )
    for span, norm in token_specs:
        full_doc.tokens.create(span=span, norm=norm)
    full_doc.sents.create(span=slice(0, 5))
    full_doc.adjectives = full_doc.tokens[1:3]

    writer = dr.Writer(out, Doc)
    for doc in (empty_doc, full_doc):
        writer.write(doc)
def create_stream():
    """Return an in-memory stream holding two documents, rewound to 0.

    Each document gets one ``xs`` item per name: ``hello``, ``world``, ``.``
    for the first and ``how``, ``are``, ``you``, ``?`` for the second.
    """
    stream = six.BytesIO()
    writer = dr.Writer(stream, Doc)

    names_per_doc = (
        ('hello', 'world', '.'),
        ('how', 'are', 'you', '?'),
    )
    for names in names_per_doc:
        doc = Doc()
        for name in names:
            doc.xs.create(name=name)
        writer.write(doc)

    stream.seek(0)
    return stream
Esempio n. 10
0
    def __call__(self):
        """Write the input documents to the output stream sorted by key.

        Every document is passed to ``self.evaluator(doc, i)`` to obtain its
        sort key and is re-serialised into a scratch buffer.  The
        ``(key, bytes)`` pairs are sorted and the bytes are then written to
        ``self.args.out_stream`` (its ``buffer`` attribute on Python 3).
        """
        reader, schema = self.get_reader_and_schema()
        tmp_out = io.BytesIO()
        tmp_writer = dr.Writer(tmp_out, schema)
        evaluator = self.evaluator
        items = []
        for i, doc in enumerate(reader):
            # TODO: avoid re-serialising
            doc_key = evaluator(doc, i)
            tmp_writer.write(doc)
            doc_data = tmp_out.getvalue()
            # truncate() does not move the stream position, so rewind first;
            # otherwise the next document is written after a gap that BytesIO
            # fills with NUL bytes, corrupting every document but the first.
            tmp_out.seek(0)
            tmp_out.truncate()
            items.append((doc_key, doc_data))

        items.sort()

        out = self.args.out_stream
        if six.PY3:
            out = out.buffer
        for _, doc_data in items:
            out.write(doc_data)
Esempio n. 11
0
    def __call__(self):
        """Serialise every document from ``self.iter_doc_reps()`` to a file.

        Documents are written with a ``dr.Writer`` using the ``WikiDoc``
        schema to ``self.out_path``.  Throughput and a time estimate based on
        ``WK_PAGES_EST`` are logged after 10000 documents and then every
        100000 documents.  On any failure the index of the last document
        taken from the iterator is logged and the exception is re-raised.
        """
        WK_PAGES_EST = 4630000

        # The writer emits binary data, so the file must be opened in binary
        # mode; text mode rejects bytes on Python 3.
        with open(self.out_path, 'wb') as f:
            i = 0
            writer = dr.Writer(f, WikiDoc)
            try:
                log.info('Processing docs...')
                start_time = time()
                for i, doc in enumerate(self.iter_doc_reps()):
                    if i == 10000 or (i % 100000 == 0 and i > 0):
                        dps = (i + 1) / float(time() - start_time)
                        eta = datetime.timedelta(seconds=int(WK_PAGES_EST /
                                                             dps))
                        log.info(
                            'Processed %i documents... %.2f d/s (eta: %s)', i,
                            dps, eta)

                    writer.write(doc)
            except BaseException:
                # Log where processing stopped, then let the error propagate.
                log.error('Failed on doc: %i', i)
                raise
Esempio n. 12
0
def serialise(doc, doc_klass):
    """Return the bytes ``dr.Writer`` produces for ``doc`` with ``doc_klass``."""
    buf = six.BytesIO()
    writer = dr.Writer(buf, doc_klass)
    writer.write(doc)
    return buf.getvalue()
Esempio n. 13
0
def serialize(doc, schema):
    """Write ``doc`` with ``schema`` into memory and return the raw bytes."""
    stream = six.BytesIO()
    dr.Writer(stream, schema).write(doc)
    return stream.getvalue()