Example #1
"""Run the IEPY preprocessing pipeline.

Usage:
    preprocess.py
    preprocess.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
import logging

from docopt import docopt

import iepy

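# iepy.setup() must run before the iepy.data / iepy.preprocess imports
# below: it configures the IEPY instance those modules rely on.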
iepy.setup(__file__)
from iepy.data.db import DocumentManager
from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner

if __name__ == '__main__':
    logger = logging.getLogger('preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    opts = docopt(__doc__, version="0.1")
    docs = DocumentManager()
    pipeline = PreProcessPipeline(
        [StanfordPreprocess(),
         SyntacticSegmenterRunner(increment=True)], docs)
    pipeline.process_everything()
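The same pipeline idea works with the segmenter alone. If documents have already been through the Stanford steps and only segments need rebuilding, a single-step pipeline should suffice; a minimal sketch along the lines of Example #1 (reading increment=True as "skip documents that already have segments" is an assumption):

import iepy

iepy.setup(__file__)
from iepy.data.db import DocumentManager
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner

# Segmentation-only pipeline over every document in the database.
pipeline = PreProcessPipeline(
    [SyntacticSegmenterRunner(increment=True)], DocumentManager())
pipeline.process_everything()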
Example #2
def setUp(self):
    self.doc = IEDocFactory()
    super(TestDocumentSegmenter, self).setUp()
    self.segmenter = SyntacticSegmenterRunner()
Example #3
class TestDocumentSegmenter(ManagerTestCase):

    ManagerClass = TextSegment

    def setUp(self):
        self.doc = IEDocFactory()
        super(TestDocumentSegmenter, self).setUp()
        self.segmenter = SyntacticSegmenterRunner()

    def set_doc_length(self, n):
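        # Build a document of n dummy tokens forming a single sentence.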
        self.doc.tokens = ["x"] * n
        self.doc.offsets = list(range(n))
        self.doc.postags = ["tag"] * n
        self.doc.sentences = [0]

    def add_entities(self, positions):
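        # Register occurrences of one shared entity at the given token
        # positions; a bare int means an occurrence of length 1.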
        e1 = EntityFactory()
        for p in positions:
            if isinstance(p, tuple):
                start, length = p
            else:
                start, length = p, 1
            EntityOccurrenceFactory(document=self.doc,
                                    entity=e1,
                                    offset=start,
                                    offset_end=start + length,
                                    alias="AB")

    def test_no_entities(self):
        self.set_doc_length(100)
        raws = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(raws, [])

    def test_sentence_segmenter_limits(self):
        self.set_doc_length(100)
        self.add_entities([1, 2, 22, 23, 35, 61, 80])
        self.doc.sentences = [0, 20, 50]
        raws = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(len(raws), 3)
        s = raws[0]
        self.assertEqual(s.offset, 0)
        self.assertEqual(s.offset_end, 20)
        self.assertEqual(len(s.entity_occurrences), 2)
        s = raws[1]
        self.assertEqual(s.offset, 20)
        self.assertEqual(s.offset_end, 50)
        self.assertEqual(len(s.entity_occurrences), 3)
        s = raws[2]
        self.assertEqual(s.offset, 50)
        self.assertEqual(s.offset_end, len(self.doc.tokens))
        self.assertEqual(len(s.entity_occurrences), 2)

    def test_sentence_segmenter_requires_2_entities(self):
        self.set_doc_length(100)
        self.add_entities([1, 2, 22])
        self.doc.sentences = [0, 20, 50]
        raws = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(len(raws), 1)
        s = raws[0]
        self.assertEqual(s.offset, 0)
        self.assertEqual(s.offset_end, 20)
        self.assertEqual(len(s.entity_occurrences), 2)
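The two tests above pin down the segmentation rule: sentence starts plus the document length define candidate token spans, and a span becomes a segment only if it contains at least two entity occurrences. A standalone sketch of that rule, as a hypothetical reimplementation for illustration (not IEPY's actual code):

def sentence_spans(sentence_starts, doc_length):
    # Starts [0, 20, 50] on a 100-token document yield the candidate
    # spans (0, 20), (20, 50) and (50, 100).
    bounds = list(sentence_starts) + [doc_length]
    return list(zip(bounds, bounds[1:]))

def keep_segments(spans, entity_offsets, minimum=2):
    # Keep only spans containing at least `minimum` entity occurrences.
    kept = []
    for start, end in spans:
        inside = [o for o in entity_offsets if start <= o < end]
        if len(inside) >= minimum:
            kept.append((start, end))
    return kept

spans = sentence_spans([0, 20, 50], 100)
print(keep_segments(spans, [1, 2, 22, 23, 35, 61, 80]))  # three segments
print(keep_segments(spans, [1, 2, 22]))                  # only (0, 20)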
Example #4
def start_preprocess(docs, increment_ner):
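    # Chain Stanford preprocessing (optionally re-running NER) with
    # syntactic segmentation and run it over every document in `docs`.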
    pipeline = PreProcessPipeline([
        StanfordPreprocess(increment_ner),
        SyntacticSegmenterRunner(increment=True)
    ], docs)
    pipeline.process_everything()
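A hypothetical docopt entry point for start_preprocess, in the style of Example #1; the --increment-ner flag name is an assumption derived from the increment_ner parameter, not confirmed by the source:

from docopt import docopt

from iepy.data.db import DocumentManager

if __name__ == '__main__':
    # "--increment-ner" is a hypothetical flag; it must appear in this
    # script's docopt usage string for the lookup below to work.
    opts = docopt(__doc__, version="0.1")
    start_preprocess(DocumentManager(), opts.get('--increment-ner', False))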