def test_pdf(self):
    """Index test/test.pdf with the PDF OCR plugin and check the extracted fields.

    The file is run through ``Connector_File.index_file`` with the additional
    plugin ``enhance_pdf_ocr`` and removed from the search index again before
    any assertion runs, so a failing check does not leave the test document
    behind. The returned ``data`` dict is then checked for content type,
    content type group, title, text content, OCR text and plugin errors.
    """
    etl_file = Connector_File()

    filename = os.path.dirname(
        os.path.realpath(__file__)) + '/test/test.pdf'

    # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)
    # (the first element of the returned pair is not needed by this test)
    _parameters, data = etl_file.index_file(
        filename=filename, additional_plugins=['enhance_pdf_ocr'])

    # delete from search index
    etl_delete = Delete()
    etl_delete.delete(filename)

    # check extracted content type
    # (either the plain value or the list including the embedded image types is accepted)
    self.assertIn(
        data['content_type_ss'],
        ('application/pdf',
         ['application/pdf', 'image/jpeg', 'image/png']))

    # check content type group which is mapped to this content type (result of plugin enhance_contenttype_group.py)
    self.assertIn(
        data['content_type_group_ss'],
        (['Text document'],
         ['Text document', 'Image', 'Image']))

    # check extracted title (result of plugin enhance_extract_text_tika_server.py)
    self.assertEqual(data['title_txt'], 'TestPDFtitle')

    # check extracted content of PDF text (result of plugin enhance_extract_text_tika_server.py)
    self.assertIn(
        'TestPDFContent1 on TestPDFPage1', data['content_txt'])
    self.assertIn(
        'TestPDFContent2 on TestPDFPage2', data['content_txt'])

    # check OCR of embedded images in PDF (result of plugin enhance_pdf_ocr.py)
    for expected_ocr_text in (
            'TestPDFOCRImage1Content1',
            'TestPDFOCRImage1Content2',
            'TestPDFOCRImage2Content1',
            'TestPDFOCRImage2Content2'):
        self.assertIn(expected_ocr_text, data['ocr_t'])

    # check if a plugin threw an exception
    failed_plugins = data['etl_error_plugins_ss']
    self.assertEqual(
        len(failed_plugins), 0,
        'plugins reported errors: {!r}'.format(failed_plugins))
def test_warc(self):
    """Index test/example.warc and check the document contained in the archive.

    After indexing, the contained document is read back through
    ``export_solr.get_data`` by its id. Both the WARC file and the contained
    document are deleted from the search index before the assertions run,
    and also when reading the document back fails.
    """
    etl_file = Connector_File()
    exporter = export_solr()

    filename = os.path.dirname(
        os.path.realpath(__file__)) + '/test/example.warc'

    # run ETL of example.warc with configured plugins and warc extractor
    # (the return value is not used: the contained document is fetched by id below)
    etl_file.index_file(filename=filename)

    contained_doc_id = 'http://example.com/<urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>'

    fields = ['id', 'title_txt', 'content_type_ss', 'content_txt']

    try:
        data = exporter.get_data(contained_doc_id, fields)
    finally:
        # delete from search index, even if reading the document back raised,
        # so a failed run does not leave test documents in the index
        etl_delete = Delete()
        etl_delete.delete(filename)
        etl_delete.delete(contained_doc_id)

    self.assertEqual(data['title_txt'], ['Example Domain'])

    self.assertEqual(data['content_type_ss'], ['text/html; charset=UTF-8'])

    self.assertIn(
        'This domain is established to be used for illustrative examples in documents.',
        data['content_txt'][0])
# Broker URL for the message queue, read from the environment.
# os.getenv returns None when OPEN_SEMANTIC_ETL_MQ_BROKER is not set;
# that None is passed on to Celery unchanged.
broker = os.getenv('OPEN_SEMANTIC_ETL_MQ_BROKER')

# Celery application for the 'etl.tasks' module
app = Celery('etl.tasks', broker=broker)

# Single queue 'tasks' on exchange 'tasks'; the queue argument
# 'x-max-priority' declares it with a maximum priority of 10
# (NOTE(review): this argument is broker-specific - confirm the deployed broker honours it)
app.conf.CELERY_QUEUES = [
    Queue('tasks', Exchange('tasks'), routing_key='tasks',
          queue_arguments={'x-max-priority': 10})
]

# Worker settings (old-style uppercase Celery setting names):
# one task per child process, prefetch multiplier of 1, late acknowledgement.
# NOTE(review): presumably chosen so long-running ETL tasks are not reserved
# ahead of time and worker memory is released after each task - confirm.
app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1
app.conf.CELERYD_PREFETCH_MULTIPLIER = 1
app.conf.CELERY_ACKS_LATE = True

# Connector instances created once at import time and shared by the tasks of this module
etl_delete = Delete()
etl_web = Connector_Web()
etl_rss = Connector_RSS()


#
# Delete document with URI from index
#

@app.task(name='etl.delete')
def delete(uri):
    """Celery task 'etl.delete': delete the document with the given URI from the index.

    Delegates to the module-level ``etl_delete`` instance and returns nothing.
    """
    etl_delete.delete(uri=uri)


#
# Index a file
# ETL connectors from etl import ETL from etl_delete import Delete from etl_file import Connector_File from etl_web import Connector_Web from etl_rss import Connector_RSS verbose = True quiet = False app = Celery('etl.tasks') app.conf.CELERYD_MAX_TASKS_PER_CHILD = 1 etl_delete = Delete() etl_web = Connector_Web() etl_rss = Connector_RSS() # # Delete document with URI from index # @app.task(name='etl.delete') def delete(uri): etl_delete.delete(uri=uri) # # Index a file