def start_preprocess(docs, increment_ner):
    """Run the whole preprocessing pipeline over ``docs``.

    The pipeline is built with, in order, a ``TokenizeSentencerRunner``
    (which receives ``increment_ner`` as its only positional argument), a
    ``ResolucionesNERRunner`` and a ``SociedadesNERRunner``; afterwards
    ``process_everything()`` is invoked on it. Nothing is returned.
    """
    step_runners = [
        TokenizeSentencerRunner(increment_ner),
        ResolucionesNERRunner(),
        SociedadesNERRunner(),
    ]
    PreProcessPipeline(step_runners, docs).process_everything()
 def test_process_step_in_batch_does_not_call_docs_save(self):
     # Running one step over the whole batch must not call save() on any
     # of the documents handed to the pipeline.
     step_runner = mock.Mock(wraps=lambda x: x)
     documents = [mock.Mock() for _ in range(5)]
     pipeline = PreProcessPipeline([step_runner], documents)
     pipeline.process_step_in_batch(step_runner)
     for document in documents:
         self.assertFalse(document.save.called)
 def test_process_step_in_batch_does_nothing_with_previous_steps_runner(self):
     # Batch-processing the second runner must leave the first runner of
     # the pipeline completely untouched.
     earlier_runner = mock.Mock(wraps=lambda x: x)
     later_runner = mock.Mock(wraps=lambda x: x)
     documents = [object() for _ in range(5)]
     pipeline = PreProcessPipeline([earlier_runner, later_runner], documents)
     pipeline.process_step_in_batch(later_runner)
     self.assertFalse(earlier_runner.called)
 def test_walk_document_applies_all_step_runners_again_if_they_were_already_run(self):
     # Walking the very same document twice must invoke the runner twice.
     runner = mock.MagicMock()
     pipeline = PreProcessPipeline([runner], [])
     document = object()
     for _ in range(2):
         pipeline.walk_document(document)
     self.assertEqual(runner.call_count, 2)
Example #5
0
 def test_process_step_in_batch_does_not_call_docs_save(self):
     # process_step_in_batch is expected to apply the runner only; none of
     # the five mocked documents may see a save() call.
     passthrough_runner = mock.Mock(wraps=lambda x: x)
     mocked_docs = [mock.Mock() for _ in range(5)]
     pipeline = PreProcessPipeline([passthrough_runner], mocked_docs)
     pipeline.process_step_in_batch(passthrough_runner)
     for mocked_doc in mocked_docs:
         self.assertFalse(mocked_doc.save.called)
Example #6
0
 def test_process_step_in_batch_does_nothing_with_previous_steps_runner(self):
     # Only the runner passed to process_step_in_batch may be called; the
     # runner that precedes it in the pipeline must stay uncalled.
     previous_runner = mock.Mock(wraps=lambda x: x)
     target_runner = mock.Mock(wraps=lambda x: x)
     documents = [object() for _ in range(5)]
     pipeline = PreProcessPipeline([previous_runner, target_runner], documents)
     pipeline.process_step_in_batch(target_runner)
     self.assertFalse(previous_runner.called)
Example #7
0
 def test_walk_document_applies_all_step_runners_again_if_they_were_already_run(self):
     # A second walk over the same document re-applies the runner, so the
     # runner ends up with exactly two calls.
     only_runner = mock.MagicMock()
     pipeline = PreProcessPipeline([only_runner], [])
     document = object()
     for _ in range(2):
         pipeline.walk_document(document)
     self.assertEqual(only_runner.call_count, 2)
Example #8
0
 def test_process_step_in_batch_applies_runner_to_all_documents(self):
     # The mock wraps a plain lambda on purpose, so that the runner does
     # not expose a "step" attribute.
     wrapped_runner = mock.Mock(wraps=lambda x: x)
     documents = [object() for _ in range(5)]
     pipeline = PreProcessPipeline([wrapped_runner], documents)
     pipeline.process_step_in_batch(wrapped_runner)
     self.assertEqual(wrapped_runner.call_count, len(documents))
     # One call per document, in the same order as the documents list.
     expected_calls = [mock.call(document) for document in documents]
     self.assertEqual(wrapped_runner.call_args_list, expected_calls)
 def test_process_step_in_batch_applies_runner_to_all_documents(self):
     # Wrapping an identity lambda keeps the runner free of a "step"
     # attribute, which is what this test relies on.
     identity_runner = mock.Mock(wraps=lambda x: x)
     all_documents = [object() for _ in range(5)]
     pipeline = PreProcessPipeline([identity_runner], all_documents)
     pipeline.process_step_in_batch(identity_runner)
     self.assertEqual(identity_runner.call_count, len(all_documents))
     # Every document must have been passed to the runner, in order.
     calls_per_document = [mock.call(doc) for doc in all_documents]
     self.assertEqual(identity_runner.call_args_list, calls_per_document)
Example #10
0
 def test_walk_document_applies_all_step_runners_to_the_given_doc(self):
     # Each runner appends its own number to doc.call_order when called,
     # so the final list shows both that they ran and in which order.
     first_runner = mock.MagicMock()
     first_runner.side_effect = lambda x: x.call_order.append(1)
     second_runner = mock.MagicMock()
     second_runner.side_effect = lambda x: x.call_order.append(2)
     document = mock.MagicMock()
     document.call_order = []
     pipeline = PreProcessPipeline([first_runner, second_runner], [])
     pipeline.walk_document(document)
     for runner in (first_runner, second_runner):
         runner.assert_called_once_with(document)
     self.assertEqual(document.call_order, [1, 2])
 def test_walk_document_applies_all_step_runners_to_the_given_doc(self):
     # The runners leave a trace (1, then 2) on the document they get, so
     # the assertion on call_order checks the order of application.
     runner_one = mock.MagicMock()
     runner_one.side_effect = lambda x: x.call_order.append(1)
     runner_two = mock.MagicMock()
     runner_two.side_effect = lambda x: x.call_order.append(2)
     walked_doc = mock.MagicMock()
     walked_doc.call_order = []
     pipeline = PreProcessPipeline([runner_one, runner_two], [])
     pipeline.walk_document(walked_doc)
     runner_one.assert_called_once_with(walked_doc)
     runner_two.assert_called_once_with(walked_doc)
     expected_order = [1, 2]
     self.assertEqual(walked_doc.call_order, expected_order)
 def test_process_everythin_calls_successively_process_step_in_batch(self):
     # process_step_in_batch is replaced by a mock that records the runner
     # it receives; process_everything must call it once per runner, in
     # the order in which the runners were given to the pipeline.
     first_runner = mock.Mock(wraps=lambda x: x)
     second_runner = mock.Mock(wraps=lambda x: x)
     documents = [object() for _ in range(5)]
     pipeline = PreProcessPipeline([first_runner, second_runner], documents)
     with mock.patch.object(pipeline, 'process_step_in_batch') as batch_mock:
         pipeline.call_order = []
         batch_mock.side_effect = lambda r: pipeline.call_order.append(r)
         pipeline.process_everything()
         self.assertEqual(batch_mock.call_count, 2)
         expected_calls = [mock.call(first_runner), mock.call(second_runner)]
         self.assertEqual(batch_mock.call_args_list, expected_calls)
     self.assertEqual(pipeline.call_order, [first_runner, second_runner])
Example #13
0
 def test_process_everythin_calls_successively_process_step_in_batch(self):
     # With process_step_in_batch mocked out, process_everything must
     # delegate to it exactly twice: first with runner one, then with
     # runner two. The recorded call_order double-checks the sequence.
     runner_one = mock.Mock(wraps=lambda x: x)
     runner_two = mock.Mock(wraps=lambda x: x)
     documents = [object() for _ in range(5)]
     pipeline = PreProcessPipeline([runner_one, runner_two], documents)
     with mock.patch.object(pipeline, 'process_step_in_batch') as patched_batch:
         pipeline.call_order = []
         patched_batch.side_effect = lambda r: pipeline.call_order.append(r)
         pipeline.process_everything()
         self.assertEqual(patched_batch.call_count, 2)
         calls_in_order = [mock.call(runner_one), mock.call(runner_two)]
         self.assertEqual(patched_batch.call_args_list, calls_in_order)
     self.assertEqual(pipeline.call_order, [runner_one, runner_two])
 def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
     # The runner carries a "step" attribute, so the pipeline is expected
     # to ask the documents manager which documents lack that step instead
     # of running over all of them.
     runner = mock.MagicMock(step=PreProcessSteps.tokenization,
                             override=False, increment=False)
     every_doc = [object() for _ in range(5)]
     # patch_object is a helper of the test case (not visible here).
     self.patch_object(DocumentManager, '__iter__', return_value=every_doc)
     lacking_lookup = self.patch_object(
         DocumentManager, 'get_documents_lacking_preprocess',
         return_value=every_doc[:2])
     # The manager holds 5 documents, yet only the first 2 are reported as
     # lacking the preprocess step.
     pipeline = PreProcessPipeline([runner], DocumentManager())
     pipeline.process_step_in_batch(runner)
     lacking_lookup.assert_called_once_with(runner.step)
     self.assertNotEqual(runner.call_count, 5)
     self.assertEqual(runner.call_count, 2)
     expected_calls = [mock.call(doc) for doc in every_doc[:2]]
     self.assertEqual(runner.call_args_list, expected_calls)
Example #15
0
 def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
     # A runner exposing "step" must only be applied to the documents that
     # the manager reports as lacking that preprocess step.
     runner = mock.MagicMock(step=PreProcessSteps.tokenization,
                             override=False, increment=False)
     every_doc = [object() for _ in range(5)]
     manager = mock.MagicMock()
     manager.__iter__.return_value = every_doc
     manager.get_documents_lacking_preprocess.side_effect = lambda x: every_doc[:2]
     # Iterating the manager would yield 5 documents, while the "lacking"
     # query answers with just the first 2 of them.
     pipeline = PreProcessPipeline([runner], manager)
     pipeline.process_step_in_batch(runner)
     lacking_query = manager.get_documents_lacking_preprocess
     lacking_query.assert_called_once_with(runner.step)
     self.assertNotEqual(runner.call_count, 5)
     self.assertEqual(runner.call_count, 2)
     expected_calls = [mock.call(doc) for doc in every_doc[:2]]
     self.assertEqual(runner.call_args_list, expected_calls)
 def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
     # When the runner declares a "step", the batch is restricted to the
     # documents returned by get_documents_lacking_preprocess.
     tokenizer_runner = mock.MagicMock(
         step=PreProcessSteps.tokenization, override=False, increment=False)
     stored_docs = [object() for _ in range(5)]
     # patch_object is provided by the test case itself (not shown here).
     self.patch_object(DocumentManager, '__iter__', return_value=stored_docs)
     patched_lacking = self.patch_object(
         DocumentManager,
         'get_documents_lacking_preprocess',
         return_value=stored_docs[:2])
     # 5 documents are stored, but only 2 of them are returned as lacking
     # the step, so only those 2 may reach the runner.
     pipeline = PreProcessPipeline([tokenizer_runner], DocumentManager())
     pipeline.process_step_in_batch(tokenizer_runner)
     patched_lacking.assert_called_once_with(tokenizer_runner.step)
     self.assertNotEqual(tokenizer_runner.call_count, 5)
     self.assertEqual(tokenizer_runner.call_count, 2)
     calls_for_lacking = [mock.call(doc) for doc in stored_docs[:2]]
     self.assertEqual(tokenizer_runner.call_args_list, calls_for_lacking)
 def test_process_step_in_batch_filter_docs_to_apply_if_has_attr_step(self):
     # The runner has a "step" attribute, therefore the pipeline should
     # query the manager for the documents lacking it and run only on
     # those.
     tokenizer_runner = mock.MagicMock(
         step=PreProcessSteps.tokenization, override=False)
     stored_docs = [object() for _ in range(5)]
     manager = mock.MagicMock()
     manager.__iter__.return_value = stored_docs
     manager.get_documents_lacking_preprocess.side_effect = (
         lambda x: stored_docs[:2])
     # The manager iterates over 5 documents, but answers the "lacking"
     # query with only the first 2.
     pipeline = PreProcessPipeline([tokenizer_runner], manager)
     pipeline.process_step_in_batch(tokenizer_runner)
     lacking_query = manager.get_documents_lacking_preprocess
     lacking_query.assert_called_once_with(tokenizer_runner.step)
     self.assertNotEqual(tokenizer_runner.call_count, 5)
     self.assertEqual(tokenizer_runner.call_count, 2)
     calls_for_lacking = [mock.call(doc) for doc in stored_docs[:2]]
     self.assertEqual(tokenizer_runner.call_args_list, calls_for_lacking)
 def test_walk_document_itself_does_not_save_the_document(self):
     # walk_document applies the runners but must never call save() on
     # the document it walks.
     runner = mock.MagicMock()
     pipeline = PreProcessPipeline([runner], [])
     document = mock.MagicMock()
     pipeline.walk_document(document)
     self.assertEqual(document.save.call_count, 0)
Example #19
0
    preprocess.py
    preprocess.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
import logging

from docopt import docopt

import iepy
iepy.setup(__file__)
from iepy.data.db import DocumentManager
from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner


if __name__ == '__main__':
    # Logging: the 'preprocess' logger and the root configuration are both
    # set to INFO; records are printed as the bare message.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # Command line handling; the version shown is the one of the iepy package.
    opts = docopt(__doc__, version=iepy.__version__)

    # Build the pipeline over the document manager and run every step.
    # The manager is created before the step runners, as in the original
    # evaluation order.
    docs = DocumentManager()
    pipeline = PreProcessPipeline(
        [StanfordPreprocess(), SyntacticSegmenterRunner(increment=True)],
        docs,
    )
    pipeline.process_everything()
Example #20
0
    preprocess.py -h | --help | --version

Options:
  -h --help             Show this screen
  --version             Version number
"""
import logging

from docopt import docopt

import iepy

iepy.setup(__file__)
from iepy.data.db import DocumentManager
from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner

if __name__ == '__main__':
    # Logging: the 'preprocess' logger and the root configuration are both
    # set to INFO; records carry timestamp, logger name and level.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Command line handling (usage text is the module docstring).
    opts = docopt(__doc__, version=0.1)

    # Build the pipeline over the document manager and run every step.
    # The manager is created before the step runners, as in the original
    # evaluation order.
    docs = DocumentManager()
    pipeline = PreProcessPipeline([
        StanfordPreprocess(),
        SyntacticSegmenterRunner(increment=True),
    ], docs)
    pipeline.process_everything()
Example #21
0
        doc.text = get_body_text(raw)
        doc.save()


# Entity kinds handled by the literal NER of this script.
CUSTOM_ENTITIES = [
    u'PERSON',
    u'DISEASE',
    u'SYMPTOM',
    u'MEDICAL_TEST',
]
# Text files given to the literal NER together with the kinds above.
# NOTE(review): presumably one file per kind, in the same order -- confirm
# against LiteralNERRunner.
CUSTOM_ENTITIES_FILES = [
    u'examples/tvseries/notable_people.txt',
    u'examples/tvseries/disease.txt',
    u'examples/tvseries/symptom.txt',
    u'examples/tvseries/diagnostic_test.txt',
]

if __name__ == '__main__':
    # Logging: the 'preprocess' logger and the root configuration are both
    # set to INFO; records carry timestamp, logger name and level.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Command line handling (usage text is the module docstring).
    opts = docopt(__doc__, version=0.1)

    docs = DocumentManager()
    # Make sure the custom entity kinds exist before any runner is built.
    EntityManager.ensure_kinds(CUSTOM_ENTITIES)

    # Steps are listed in the order in which they are handed to the
    # pipeline; the NER step combines the literal NER (custom kinds and
    # files) with the Stanford NER.
    pipeline = PreProcessPipeline(
        [
            media_wiki_to_txt,
            TokenizeSentencerRunner(),
            StanfordTaggerRunner(),
            NoOverlapCombinedNERRunner(
                ners=[
                    LiteralNERRunner(CUSTOM_ENTITIES, CUSTOM_ENTITIES_FILES),
                    StanfordNERRunner(),
                ]
            ),
            SyntacticSegmenterRunner(increment=True),
        ],
        docs,
    )
    pipeline.process_everything()
Example #22
0
def start_preprocess(docs, increment_ner):
    """Run the whole preprocessing pipeline over ``docs``.

    The pipeline consists of a ``StanfordPreprocess`` step, which receives
    ``increment_ner`` as its only positional argument, followed by a
    ``SyntacticSegmenterRunner`` created with ``increment=True``. After
    building it, ``process_everything()`` is called. Nothing is returned.
    """
    step_runners = [
        StanfordPreprocess(increment_ner),
        SyntacticSegmenterRunner(increment=True),
    ]
    PreProcessPipeline(step_runners, docs).process_everything()
Example #23
0
def start_preprocess(docs, increment_ner):
    """Build the preprocess pipeline for ``docs`` and process everything.

    Steps, in order: ``StanfordPreprocess(increment_ner)`` and
    ``SyntacticSegmenterRunner(increment=True)``. The function returns
    ``None``; its effect is whatever ``process_everything()`` does.
    """
    stanford_step = StanfordPreprocess(increment_ner)
    segmenter_step = SyntacticSegmenterRunner(increment=True)
    preprocess_pipeline = PreProcessPipeline(
        [stanford_step, segmenter_step], docs)
    preprocess_pipeline.process_everything()
Example #24
0
 def test_walk_document_itself_does_not_save_the_document(self):
     # Saving is not walk_document's job: after walking a mocked document
     # its save() must have zero calls.
     only_runner = mock.MagicMock()
     pipeline = PreProcessPipeline([only_runner], [])
     walked_doc = mock.MagicMock()
     pipeline.walk_document(walked_doc)
     self.assertEqual(walked_doc.save.call_count, 0)