Example #1
    def text_to_dr(self, text):
        from io import BytesIO
        from subprocess import Popen, PIPE
        from schwa import dr

        # Tokenise the raw text into docrep form via the external schwa
        # tokenizer. The pipes are binary, so `text` must be bytes.
        tokenizer = Popen([
            self.tokenizer_path,
            '-p', 'docrep'
        ], cwd=self.ner_package_path, stdout=PIPE, stdin=PIPE, stderr=PIPE)

        tok_dr, err = tokenizer.communicate(text)
        if not tok_dr or err:
            raise Exception("Schwa tokenizer failed while processing document")

        # Feed the tokenised document to the long-running tagger process.
        self.tagger_process.stdin.write(tok_dr)
        self.tagger_process.stdin.flush()

        try:
            # The tagger reports the byte size of its output on stderr;
            # an empty or zero status means it failed.
            status = self.tagger_process.stderr.readline().strip().decode('utf-8')
            if self.tagger_process.poll() is not None or status in ('', '0'):
                raise Exception("Schwa tagger failed while processing document")

            try:
                result_sz = int(status)
            except ValueError:
                schwa_error = Exception(status)
                raise Exception("Schwa tagger error while processing document", schwa_error)

            try:
                result_bytes = self.tagger_process.stdout.read(result_sz)
                return dr.Reader(BytesIO(result_bytes), self.schema).read()
            except Exception as e:
                raise Exception("Failed to deserialise schwa tagger output", e)
        except Exception:
            # The tagger is now in an unknown state; restart it before
            # re-raising.
            self.initialise_tagger()
            raise
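This method assumes a long-lived self.tagger_process and an initialise_tagger method that the snippet does not show. A minimal sketch of what that setup might look like; self.tagger_path is a hypothetical attribute, named by analogy with self.tokenizer_path:

    def initialise_tagger(self):
        # Hypothetical reconstruction: spawn the schwa tagger as a
        # persistent subprocess with binary pipes on stdin/stdout/stderr,
        # matching how text_to_dr talks to it above.
        from subprocess import Popen, PIPE
        self.tagger_process = Popen([
            self.tagger_path,  # assumed attribute; not in the original snippet
        ], cwd=self.ner_package_path, stdin=PIPE, stdout=PIPE, stderr=PIPE)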
    def test_nonempty__iter(self):
        stream = create_stream()
        count = 0
        reader = dr.Reader(stream, Doc)
        for doc in reader:
            self.assertIsNotNone(doc)
            count += 1
        self.assertEqual(count, 2)

    def test_empty__iter(self):
        stream = six.BytesIO()
        count = 0
        reader = dr.Reader(stream, Doc)
        for doc in reader:
            self.assertIsNotNone(doc)
            count += 1
        self.assertEqual(count, 0)
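These iterator tests depend on a create_stream() helper that does not appear on this page. A rough sketch under the assumption that it serialises two documents into an in-memory buffer (the non-empty test above expects exactly two) and rewinds it:

import six
from schwa import dr

def create_stream():
    # Hypothetical reconstruction of the missing helper; assumes the
    # same Doc schema used by the tests is in scope.
    f = six.BytesIO()
    writer = dr.Writer(f, Doc)
    writer.write(Doc())  # first document
    writer.write(Doc())  # second document
    f.seek(0)
    return f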
Example #4
def write_read(doc, out_schema, in_schema=None):
    if in_schema is None:
        in_schema = out_schema
    print('Writing {0}'.format(out_schema))
    f = six.BytesIO()
    dr.Writer(f, out_schema).write(doc)
    f.seek(0)
    print('Reading {0}'.format(in_schema))
    return next(dr.Reader(f, in_schema))
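A typical round trip through this helper, assuming a Doc schema with a tokens store like the one exercised elsewhere on this page:

doc = Doc()
t = doc.tokens.create()
t.norm = 'meow'
copy = write_read(doc, Doc)  # write and read back under the same schema
assert copy.tokens[0].norm == 'meow'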
    def test_nonempty__read(self):
        stream = create_stream()
        reader = dr.Reader(stream, Doc)
        doc = reader.read()
        self.assertIsNotNone(doc)
        doc = reader.read()
        self.assertIsNotNone(doc)
        doc = reader.read()
        self.assertIsNone(doc)
    def test_exception_message(self):
        doc = DocToken()
        t = doc.tokens.create()
        t.raw = 'meow'

        stream = io.BytesIO()
        writer = dr.Writer(stream, DocToken)
        writer.write(doc)

        stream.seek(0)
        reader = dr.Reader(stream, DocTok)
        with self.assertRaisesRegexp(ReaderException, r"Store u?'tokens' points to annotation type u?'.*Tok' but the store on the stream points to a lazy type \(u?'Token'\)\."):
            doc = next(reader)
    def test_schema(self):
        orig = six.BytesIO()
        write(orig)
        orig.seek(0)

        reader = dr.Reader(orig, automagic=True)
        next(reader)
        next(reader)
        doc1_schema = reader.doc_schema
        with self.assertRaises(StopIteration):
            next(reader)

        # The following works if reader.doc_schema is replaced with docs[0]._dr_rt.copy_to_schema()
        self.assertSchemaEqual(Doc.schema(), doc1_schema)
    def test(self):
        orig = six.BytesIO()
        write(orig)
        orig.seek(0)

        reader = dr.Reader(orig, automagic=True)
        doc0 = next(reader)
        doc0_schema = reader.doc_schema
        doc1 = next(reader)
        doc1_schema = reader.doc_schema
        with self.assertRaises(StopIteration):
            next(reader)

        rewritten = six.BytesIO()

        writer = dr.Writer(rewritten, doc0_schema)
        doc = doc0
        self.assertTrue(hasattr(doc, 'tokens'))
        self.assertTrue(hasattr(doc, 'sents'))
        self.assertEqual(len(doc.tokens), 0)
        self.assertEqual(len(doc.sents), 0)
        self.assertEqual(doc.adjectives, [])
        writer.write(doc)

        writer = dr.Writer(rewritten, doc1_schema)
        doc = doc1
        self.assertTrue(hasattr(doc, 'tokens'))
        self.assertTrue(hasattr(doc, 'sents'))
        self.assertEqual(len(doc.tokens), 5)
        self.assertEqual(len(doc.sents), 1)
        self.assertEqual(doc.tokens[0].norm, 'The')
        self.assertEqual(doc.tokens[0].span, slice(0, 3))
        self.assertEqual(doc.tokens[1].norm, 'quick')
        self.assertEqual(doc.tokens[1].span, slice(4, 9))
        self.assertEqual(doc.tokens[2].norm, 'brown')
        self.assertEqual(doc.tokens[2].span, slice(11, 16))
        self.assertEqual(doc.tokens[3].norm, 'fox')
        self.assertEqual(doc.tokens[3].span, slice(17, 20))
        self.assertEqual(doc.tokens[4].norm, '.')
        self.assertEqual(doc.tokens[4].span, slice(20, 21))
        self.assertEqual(doc.sents[0].span, slice(0, 5))
        self.assertListEqual(doc.adjectives, doc.tokens[1:3])
        writer.write(doc)

        orig.seek(0)
        rewritten.seek(0)
        orig = orig.getvalue()
        rewritten = rewritten.getvalue()
        self.assertEqual(orig, rewritten)
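The write(orig) helper these tests call is also not shown, but the assertions pin down what it must produce: an empty first document, then a document holding the five tokens of 'The quick brown fox.' with one sentence and two adjective pointers. A rough reconstruction under those assumptions (the 'empty' fields are implied by the _dr_fields assertions in the next test):

import six
from schwa import dr

class Token(dr.Ann):
    empty = dr.Field()
    norm = dr.Field()
    span = dr.Slice()        # byte offsets into the raw text

class Sent(dr.Ann):
    span = dr.Slice(Token)   # token offsets

class Doc(dr.Doc):
    adjectives = dr.Pointers(Token)
    empty = dr.Field()
    tokens = dr.Store(Token)
    sents = dr.Store(Sent)

def write(stream):
    writer = dr.Writer(stream, Doc)
    writer.write(Doc())  # doc0: an empty document

    doc = Doc()          # doc1: 'The quick brown fox.'
    for norm, start, stop in [('The', 0, 3), ('quick', 4, 9),
                              ('brown', 11, 16), ('fox', 17, 20),
                              ('.', 20, 21)]:
        t = doc.tokens.create()
        t.norm = norm
        t.span = slice(start, stop)
    s = doc.sents.create()
    s.span = slice(0, 5)
    doc.adjectives = doc.tokens[1:3]
    writer.write(doc)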
Example #9
    def labeled_text_for_file(self, path):
        log.info('Processing entity embeddings: %s...', path)

        instances = []
        with open(path, 'rb') as f:
            reader = dr.Reader(f, Doc.schema())
            for doc in reader:
                #if doc.name not in self.entities:
                #    continue
                tokens = [t.norm for t in doc.tokens[doc.sections[0].span]] + [t.norm for s in doc.sections[1:] for t in doc.tokens[s.span][:100]]
                #tokens = [t.norm for t in doc.tokens[doc.sections[0].span]]
                #tokens = [t.norm for t in doc.tokens[:500]]
                if MIN_COUNT and len(tokens) < MIN_COUNT:
                    # Left-pad with null tokens so every instance has at
                    # least MIN_COUNT tokens.
                    tokens = (['__NULL__'] * (MIN_COUNT - len(tokens))) + tokens

                instances.append(LabeledText(tokens, [ENTITY_PREFIX + doc.name]))

        return instances
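LabeledText is not defined in this snippet; from its use it looks like a simple (tokens, labels) pair of the kind consumed by doc2vec-style trainers. A minimal stand-in, purely an assumption:

from collections import namedtuple

# Hypothetical stand-in for the LabeledText container used above:
# a list of token strings plus a list of string labels.
LabeledText = namedtuple('LabeledText', ['tokens', 'labels'])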
    def test_dr_fields_and_dr_stores(self):
        orig = six.BytesIO()
        write(orig)
        orig.seek(0)

        reader = dr.Reader(orig, automagic=True)
        doc = next(reader)
        doc = next(reader)
        self.assertTupleEqual(('adjectives', 'empty'), tuple(doc._dr_fields))
        self.assertTupleEqual(('sents', 'tokens'), tuple(doc._dr_stores))

        t = doc.tokens[0]
        self.assertTupleEqual(('empty', 'norm', 'span'),
                              tuple(t.__class__._dr_fields))
        self.assertTupleEqual((), tuple(t.__class__._dr_stores))

        s = doc.sents[0]
        self.assertTupleEqual(('span', ), tuple(s.__class__._dr_fields))
        self.assertTupleEqual((), tuple(s.__class__._dr_stores))

        self.assertEqual("Token(norm=" + repr('The') + ", span=slice(0, 3))",
                         repr(t))
        self.assertEqual('Sent(span=slice(0, 5))', repr(s))
    def test_empty__read(self):
        stream = six.BytesIO()
        reader = dr.Reader(stream, Doc)
        doc = reader.read()
        self.assertIsNone(doc)