def test_pdf(self):
        """Index test/test.pdf with the PDF OCR plugin and check the extracted data.

        The document is removed from the search index right after indexing,
        so the assertions below work on the data returned by the ETL run
        and a failing assertion does not leave the test document behind.
        """

        etl_file = Connector_File()

        filename = os.path.dirname(
            os.path.realpath(__file__)) + '/test/test.pdf'

        # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)
        _, data = etl_file.index_file(
            filename=filename, additional_plugins=['enhance_pdf_ocr'])

        # delete from search index
        etl_delete = Delete()
        etl_delete.delete(filename)

        # check extracted content type: either the plain string or the list
        # that also names the embedded images is accepted
        # (assertIn reports the actual value on failure, assertTrue would not)
        self.assertIn(
            data['content_type_ss'],
            ('application/pdf',
             ['application/pdf', 'image/jpeg', 'image/png']))

        # check content type group which is mapped to this content type (result of plugin enhance_contenttype_group.py)
        self.assertIn(
            data['content_type_group_ss'],
            (['Text document'],
             ['Text document', 'Image', 'Image']))

        # check extracted title (result of plugin enhance_extract_text_tika_server.py)
        self.assertEqual(data['title_txt'], 'TestPDFtitle')

        # check extracted content of PDF text (result of plugin enhance_extract_text_tika_server.py)
        self.assertIn('TestPDFContent1 on TestPDFPage1', data['content_txt'])
        self.assertIn('TestPDFContent2 on TestPDFPage2', data['content_txt'])

        # check OCR of embedded images in PDF (result of plugin enhance_pdf_ocr.py)
        for expected_ocr_text in ('TestPDFOCRImage1Content1',
                                  'TestPDFOCRImage1Content2',
                                  'TestPDFOCRImage2Content1',
                                  'TestPDFOCRImage2Content2'):
            self.assertIn(expected_ocr_text, data['ocr_t'])

        # check if a plugin threw an exception
        self.assertEqual(len(data['etl_error_plugins_ss']), 0)
# Example #2
# 0
    def test_warc(self):
        """Index test/example.warc and check a document contained in the archive.

        The contained document is read back from the index through the
        exporter, and both the archive file and the contained document are
        deleted from the index afterwards.
        """

        etl_file = Connector_File()
        exporter = export_solr()

        filename = os.path.dirname(
            os.path.realpath(__file__)) + '/test/example.warc'

        contained_doc_id = 'http://example.com/<urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>'
        fields = ['id', 'title_txt', 'content_type_ss', 'content_txt']

        # run ETL of example.warc with configured plugins and warc extractor;
        # the return value is not needed because the contained document is
        # fetched back from the index below
        etl_file.index_file(filename=filename)

        try:
            data = exporter.get_data(contained_doc_id, fields)
        finally:
            # delete from search index, even if reading the document back failed
            etl_delete = Delete()
            etl_delete.delete(filename)
            etl_delete.delete(contained_doc_id)

        self.assertEqual(data['title_txt'], ['Example Domain'])

        self.assertEqual(data['content_type_ss'], ['text/html; charset=UTF-8'])

        # assertIn shows the actual content in the failure message
        self.assertIn(
            'This domain is established to be used for illustrative examples in documents.',
            data['content_txt'][0])
# Example #3
# 0
    broker = os.getenv('OPEN_SEMANTIC_ETL_MQ_BROKER')

# Celery application for the ETL tasks; `broker` is assigned earlier in this module
app = Celery('etl.tasks', broker=broker)

# Declare a single queue named 'tasks' (exchange and routing key of the same name).
# 'x-max-priority' is handed to the broker as a queue argument
# NOTE(review): on RabbitMQ this enables message priorities up to 10 - confirm for the deployed broker
app.conf.CELERY_QUEUES = [
    Queue('tasks',
          Exchange('tasks'),
          routing_key='tasks',
          queue_arguments={'x-max-priority': 10})
]

# Old-style (uppercase) Celery settings; meanings as given in Celery's configuration reference:
# replace a worker child process after it has executed one task
app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1
# prefetch only one message per worker process at a time
app.conf.CELERYD_PREFETCH_MULTIPLIER = 1
# acknowledge a message after its task has been executed instead of just before
app.conf.CELERY_ACKS_LATE = True

# Module-level instances created once at import time; etl_delete is used by
# the delete task below, the other two are not referenced in the visible part
etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()

#
# Delete document with URI from index
#


@app.task(name='etl.delete')
def delete(uri) -> None:
    """Celery task 'etl.delete': delete the document with the given URI from the index.

    Delegates to the module-level Delete instance (etl_delete).

    Args:
        uri: URI of the document to delete; passed on unchanged.
    """
    etl_delete.delete(uri=uri)


#
# Index a file
# ETL connectors
from etl import ETL
from etl_delete import Delete
from etl_file import Connector_File
from etl_web import Connector_Web
from etl_rss import Connector_RSS


# Module-level output flags; they are not referenced in the visible part of this module
verbose = True
quiet = False

# Celery application for the ETL tasks (no broker argument is passed here)
app = Celery('etl.tasks')
# Old-style Celery setting; per Celery's configuration reference a worker
# child process is replaced after it has executed one task
app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1

# Module-level instances created once at import time; etl_delete is used by
# the delete task below, the other two are not referenced in the visible part
etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()


#
# Delete document with URI from index
#

@app.task(name='etl.delete')
def delete(uri) -> None:
	"""Celery task 'etl.delete': delete the document with the given URI from the index.

	Delegates to the module-level Delete instance (etl_delete).

	Args:
		uri: URI of the document to delete; passed on unchanged.
	"""
	etl_delete.delete(uri=uri)


#
# Index a file