def text_to_dr(self, text):
    from schwa import dr
    # Tokenise the raw text into docrep form using the external schwa tokenizer.
    tokenizer = Popen([self.tokenizer_path, '-p', 'docrep'],
                      cwd=self.ner_package_path, stdout=PIPE, stdin=PIPE)
    tok_dr, err = tokenizer.communicate(text)
    if not tok_dr or err:
        raise Exception("Schwa tokenizer failed while processing document")

    # Feed the tokenised document to the long-running tagger process.
    self.tagger_process.stdin.write(tok_dr)
    self.tagger_process.stdin.flush()
    try:
        # The tagger reports the size of its output (in bytes) on stderr.
        status = self.tagger_process.stderr.readline().strip()
        if self.tagger_process.poll() is not None or status == '' or status == '0':
            raise Exception("Schwa tagger failed while processing document")
        try:
            result_sz = int(status)
        except ValueError:
            # A non-numeric status line is an error message from the tagger.
            schwa_error = Exception(status)
            raise Exception("Schwa tagger error while processing document", schwa_error)
        try:
            result_bytes = self.tagger_process.stdout.read(result_sz)
            result = dr.Reader(StringIO(result_bytes), self.schema).read()
            return result
        except Exception as e:
            # Python 2 three-argument raise: preserve the original traceback.
            raise Exception("Failed to deserialise schwa tagger output", e), None, sys.exc_info()[2]
    except:
        # Any failure may leave the tagger in an inconsistent state; restart it
        # before propagating the error so subsequent calls get a clean process.
        self.initialise_tagger()
        raise

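# A minimal sketch of the initialise_tagger() helper that text_to_dr() relies on.
# It is not part of the original source: self.tagger_path is a hypothetical
# attribute, and the tagger's command-line arguments are assumptions; only the
# pipe wiring (stdin/stdout/stderr) is implied by the calls in text_to_dr() above.
def initialise_tagger(self):
    from subprocess import Popen, PIPE
    # Launch a persistent tagger process so each document avoids startup cost.
    self.tagger_process = Popen(
        [self.tagger_path],                # hypothetical path to the tagger binary
        cwd=self.ner_package_path,
        stdin=PIPE, stdout=PIPE, stderr=PIPE)
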
def test_nonempty__iter(self):
    stream = create_stream()
    count = 0
    reader = dr.Reader(stream, Doc)
    for doc in reader:
        self.assertIsNotNone(doc)
        count += 1
    self.assertEqual(count, 2)

def test_empty__iter(self):
    stream = six.BytesIO()
    count = 0
    reader = dr.Reader(stream, Doc)
    for doc in reader:
        self.assertIsNotNone(doc)
        count += 1
    self.assertEqual(count, 0)

def write_read(doc, out_schema, in_schema=None):
    # Serialise doc with out_schema and immediately deserialise it with in_schema
    # (defaulting to the same schema), returning the round-tripped document.
    if in_schema is None:
        in_schema = out_schema
    print('Writing {0}'.format(out_schema))
    f = six.BytesIO()
    dr.Writer(f, out_schema).write(doc)
    f.seek(0)
    print('Reading {0}'.format(in_schema))
    return dr.Reader(f, in_schema).next()

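# Illustrative only: a usage sketch for write_read(), assuming the Doc class
# defined elsewhere in this test module accepts construction with no arguments.
# It round-trips an empty document through docrep serialisation.
def example_write_read_roundtrip():
    doc = Doc()
    copy = write_read(doc, Doc.schema())
    assert copy is not None
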
def test_nonempty__read(self):
    stream = create_stream()
    reader = dr.Reader(stream, Doc)
    doc = reader.read()
    self.assertIsNotNone(doc)
    doc = reader.read()
    self.assertIsNotNone(doc)
    doc = reader.read()
    self.assertIsNone(doc)

def test_exception_message(self):
    doc = DocToken()
    t = doc.tokens.create()
    t.raw = 'meow'
    stream = io.BytesIO()
    writer = dr.Writer(stream, DocToken)
    writer.write(doc)

    stream.seek(0)
    reader = dr.Reader(stream, DocTok)
    with self.assertRaisesRegexp(ReaderException, r"Store u?'tokens' points to annotation type u?'.*Tok' but the store on the stream points to a lazy type \(u?'Token'\)\."):
        doc = next(reader)

def test_schema(self):
    orig = six.BytesIO()
    write(orig)
    orig.seek(0)

    reader = dr.Reader(orig, automagic=True)
    reader.next()
    reader.next()
    doc1_schema = reader.doc_schema
    with self.assertRaises(StopIteration):
        reader.next()

    # The following works if reader.doc_schema is replaced with docs[0]._dr_rt.copy_to_schema()
    self.assertSchemaEqual(Doc.schema(), doc1_schema)

def test(self):
    orig = six.BytesIO()
    write(orig)
    orig.seek(0)

    # Read both documents back with automagic schemas, capturing the schema the
    # reader inferred for each.
    reader = dr.Reader(orig, automagic=True)
    doc0 = reader.next()
    doc0_schema = reader.doc_schema
    doc1 = reader.next()
    doc1_schema = reader.doc_schema
    with self.assertRaises(StopIteration):
        reader.next()

    rewritten = six.BytesIO()

    # The first document is empty.
    writer = dr.Writer(rewritten, doc0_schema)
    doc = doc0
    self.assertTrue(hasattr(doc, 'tokens'))
    self.assertTrue(hasattr(doc, 'sents'))
    self.assertEqual(len(doc.tokens), 0)
    self.assertEqual(len(doc.sents), 0)
    self.assertEqual(doc.adjectives, [])
    writer.write(doc)

    # The second document contains one sentence of five tokens.
    writer = dr.Writer(rewritten, doc1_schema)
    doc = doc1
    self.assertTrue(hasattr(doc, 'tokens'))
    self.assertTrue(hasattr(doc, 'sents'))
    self.assertEqual(len(doc.tokens), 5)
    self.assertEqual(len(doc.sents), 1)
    self.assertEqual(doc.tokens[0].norm, 'The')
    self.assertEqual(doc.tokens[0].span, slice(0, 3))
    self.assertEqual(doc.tokens[1].norm, 'quick')
    self.assertEqual(doc.tokens[1].span, slice(4, 9))
    self.assertEqual(doc.tokens[2].norm, 'brown')
    self.assertEqual(doc.tokens[2].span, slice(11, 16))
    self.assertEqual(doc.tokens[3].norm, 'fox')
    self.assertEqual(doc.tokens[3].span, slice(17, 20))
    self.assertEqual(doc.tokens[4].norm, '.')
    self.assertEqual(doc.tokens[4].span, slice(20, 21))
    self.assertEqual(doc.sents[0].span, slice(0, 5))
    self.assertListEqual(doc.adjectives, doc.tokens[1:3])
    writer.write(doc)

    # Rewriting with the automagic schemas should reproduce the original bytes.
    orig.seek(0)
    rewritten.seek(0)
    orig = orig.getvalue()
    rewritten = rewritten.getvalue()
    self.assertEqual(orig, rewritten)

def labeled_text_for_file(self, path):
    log.info('Processing entity embeddings: %s...', path)
    instances = []
    with open(path, 'rb') as f:
        reader = dr.Reader(f, Doc.schema())
        for doc in reader:
            #if doc.name not in self.entities:
            #    continue
            # Take every token of the first section, plus the first 100 tokens
            # of each subsequent section.
            tokens = ([t.norm for t in doc.tokens[doc.sections[0].span]]
                      + [t.norm for s in doc.sections[1:] for t in doc.tokens[s.span][:100]])
            #tokens = [t.norm for t in doc.tokens[doc.sections[0].span]]
            #tokens = [t.norm for t in doc.tokens[:500]]
            if MIN_COUNT and len(tokens) < MIN_COUNT:
                # Pad short documents up to MIN_COUNT tokens.
                tokens = (['__NULL__'] * (MIN_COUNT - len(tokens))) + tokens
            instances.append(LabeledText(tokens, [ENTITY_PREFIX + doc.name]))
    return instances

def test_dr_fields_and_dr_stores(self):
    orig = six.BytesIO()
    write(orig)
    orig.seek(0)

    reader = dr.Reader(orig, automagic=True)
    doc = reader.next()
    doc = reader.next()
    self.assertTupleEqual(('adjectives', 'empty'), tuple(doc._dr_fields))
    self.assertTupleEqual(('sents', 'tokens'), tuple(doc._dr_stores))

    t = doc.tokens[0]
    self.assertTupleEqual(('empty', 'norm', 'span'), tuple(t.__class__._dr_fields))
    self.assertTupleEqual((), tuple(t.__class__._dr_stores))

    s = doc.sents[0]
    self.assertTupleEqual(('span', ), tuple(s.__class__._dr_fields))
    self.assertTupleEqual((), tuple(s.__class__._dr_stores))

    self.assertEqual("Token(norm=" + repr('The') + ", span=slice(0, 3))", repr(t))
    self.assertEqual('Sent(span=slice(0, 5))', repr(s))

def test_empty__read(self):
    stream = six.BytesIO()
    reader = dr.Reader(stream, Doc)
    doc = reader.read()
    self.assertIsNone(doc)