"""Run the IEPY pre-processing pipeline over every stored document.

Usage:
    preprocess.py
    preprocess.py -h | --help | --version

Options:
  -h --help     Show this screen
  --version     Version number
"""
import logging

from docopt import docopt

import iepy
# iepy.setup must run before importing iepy.data/iepy.preprocess modules,
# since they rely on the configured Django settings.
iepy.setup(__file__)

from iepy.data.db import DocumentManager
from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner


if __name__ == '__main__':
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # docopt expects the version as a string; it is printed verbatim
    # when the user passes --version.
    opts = docopt(__doc__, version='0.1')
    docs = DocumentManager()
    # Tokenize/tag/NER every document, then build syntactic segments
    # (incrementally, so already-segmented documents are extended).
    pipeline = PreProcessPipeline(
        [StanfordPreprocess(), SyntacticSegmenterRunner(increment=True)],
        docs)
    pipeline.process_everything()
def setUp(self):
    """Prepare one test fixture: a document, base-class state, a segmenter.

    NOTE(review): the document factory is invoked *before* the base-class
    setUp — presumably deliberate; confirm before reordering.
    """
    # Fresh (unsaved) document fixture for each test.
    self.doc = IEDocFactory()
    super(TestDocumentSegmenter, self).setUp()
    # Segmenter under test; each test calls build_syntactic_segments on it.
    self.segmenter = SyntacticSegmenterRunner()
class TestDocumentSegmenter(ManagerTestCase):
    """Exercises SyntacticSegmenterRunner.build_syntactic_segments.

    Each test builds a synthetic document (dummy tokens, sentence
    boundaries and entity occurrences) and checks which sentence
    segments the segmenter produces.
    """

    ManagerClass = TextSegment

    def setUp(self):
        self.doc = IEDocFactory()
        super(TestDocumentSegmenter, self).setUp()
        self.segmenter = SyntacticSegmenterRunner()

    def set_doc_length(self, n):
        """Fill the fixture document with n dummy tokens in one sentence."""
        self.doc.tokens = ["x" for _ in range(n)]
        self.doc.offsets = [i for i in range(n)]
        self.doc.postags = ["tag" for _ in range(n)]
        self.doc.sentences = [0]

    def add_entities(self, positions):
        """Attach occurrences of a single entity at the given positions.

        Each position is either a token index (length 1 assumed) or a
        (start, length) tuple.
        """
        entity = EntityFactory()
        for item in positions:
            start, length = item if isinstance(item, tuple) else (item, 1)
            EntityOccurrenceFactory(
                document=self.doc,
                entity=entity,
                offset=start,
                offset_end=start + length,
                alias="AB",
            )

    def test_no_entities(self):
        # A document without entity occurrences yields no segments at all.
        self.set_doc_length(100)
        segments = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(segments, [])

    def test_sentence_segmenter_limits(self):
        # Three sentences, each holding >= 2 occurrences -> three segments
        # whose limits match the sentence boundaries.
        self.set_doc_length(100)
        self.add_entities([1, 2, 22, 23, 35, 61, 80])
        self.doc.sentences = [0, 20, 50]
        segments = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(len(segments), 3)
        expected = [
            (0, 20, 2),
            (20, 50, 3),
            (50, len(self.doc.tokens), 2),
        ]
        for segment, (start, end, eo_count) in zip(segments, expected):
            self.assertEqual(segment.offset, start)
            self.assertEqual(segment.offset_end, end)
            self.assertEqual(len(segment.entity_occurrences), eo_count)

    def test_sentence_segmenter_requires_2_entities(self):
        # Sentences with fewer than two occurrences produce no segment.
        self.set_doc_length(100)
        self.add_entities([1, 2, 22])
        self.doc.sentences = [0, 20, 50]
        segments = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(len(segments), 1)
        only = segments[0]
        self.assertEqual(only.offset, 0)
        self.assertEqual(only.offset_end, 20)
        self.assertEqual(len(only.entity_occurrences), 2)
def start_preprocess(docs, increment_ner):
    """Run the complete pre-processing pipeline over *docs*.

    Args:
        docs: document manager/iterable handed to PreProcessPipeline.
        increment_ner: forwarded to StanfordPreprocess (incremental NER).
    """
    steps = [
        StanfordPreprocess(increment_ner),
        SyntacticSegmenterRunner(increment=True),
    ]
    PreProcessPipeline(steps, docs).process_everything()
class TestDocumentSegmenter(ManagerTestCase):
    """Tests for the syntactic (sentence-based) segmenter.

    Builds synthetic documents with controllable token counts, sentence
    boundaries and entity occurrences, then asserts on the segments
    returned by SyntacticSegmenterRunner.build_syntactic_segments.
    """

    ManagerClass = TextSegment

    def setUp(self):
        self.doc = IEDocFactory()
        super(TestDocumentSegmenter, self).setUp()
        self.segmenter = SyntacticSegmenterRunner()

    def set_doc_length(self, n):
        """Populate the document with n placeholder tokens (one sentence)."""
        size = n
        self.doc.tokens = ["x"] * size
        self.doc.offsets = list(range(size))
        self.doc.postags = ["tag"] * size
        self.doc.sentences = [0]

    def add_entities(self, positions):
        """Create occurrences of one shared entity at each position.

        A plain int means a single-token occurrence; a (start, length)
        tuple spans several tokens.
        """
        shared_entity = EntityFactory()
        for pos in positions:
            if isinstance(pos, tuple):
                begin, span = pos
            else:
                begin, span = pos, 1
            EntityOccurrenceFactory(
                document=self.doc, entity=shared_entity,
                offset=begin, offset_end=begin + span, alias="AB")

    def _check_segment(self, segment, start, end, eo_count):
        # Shared assertion helper: segment limits and occurrence count.
        self.assertEqual(segment.offset, start)
        self.assertEqual(segment.offset_end, end)
        self.assertEqual(len(segment.entity_occurrences), eo_count)

    def test_no_entities(self):
        self.set_doc_length(100)
        result = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(result, [])

    def test_sentence_segmenter_limits(self):
        self.set_doc_length(100)
        self.add_entities([1, 2, 22, 23, 35, 61, 80])
        self.doc.sentences = [0, 20, 50]
        result = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(len(result), 3)
        self._check_segment(result[0], 0, 20, 2)
        self._check_segment(result[1], 20, 50, 3)
        self._check_segment(result[2], 50, len(self.doc.tokens), 2)

    def test_sentence_segmenter_requires_2_entities(self):
        self.set_doc_length(100)
        self.add_entities([1, 2, 22])
        self.doc.sentences = [0, 20, 50]
        result = self.segmenter.build_syntactic_segments(self.doc)
        self.assertEqual(len(result), 1)
        self._check_segment(result[0], 0, 20, 2)