def notify_role(role, subject, html):
    """Send an HTML notification e-mail to the given role."""
    if role.email is None:
        log.error("Role does not have E-Mail: %r", role)
        return
    sender = '%s <%s>' % (get_config('APP_TITLE'), get_config('MAIL_FROM'))
    message = Message(subject='[%s] %s' % (get_config('APP_TITLE'), subject),
                      sender=sender,
                      recipients=[role.email])
    message.html = html
    mail.send(message)
def graph_metadata():
    """Describe the graph backend: node labels, edge types and styling."""
    graph = get_graph()
    if graph is None:
        return {'active': False}
    skip_labels = ['Collection', BASE_NODE]
    labels = [label for label in graph.node_labels if label not in skip_labels]
    rel_types = [rt for rt in graph.relationship_types if rt != 'PART_OF']
    return {
        'active': True,
        'labels': labels,
        'types': rel_types,
        'icons': get_config('GRAPH_ICONS'),
        'colors': get_config('GRAPH_COLORS')
    }
def session(self):
    """Lazily build an authenticated requests session for the ID host.

    Performs the Django-style CSRF login dance once and caches the
    resulting session on the instance.
    """
    if not hasattr(self, '_session'):
        username = get_config('ID_USERNAME')
        password = get_config('ID_PASSWORD')
        sess = requests.Session()
        res = sess.get(urljoin(self.host, '/accounts/login/'))
        form = {
            'csrfmiddlewaretoken': sess.cookies['csrftoken'],
            'username': username,
            'password': password
        }
        # The Referer header is required for Django's CSRF check.
        sess.post(res.url, data=form, headers={'Referer': res.url})
        self._session = sess
    return self._session
def metadata():
    """Return instance configuration for the client application."""
    enable_cache(server_side=False)
    schemata = {}
    for schema_id, schema in resolver.store.items():
        # Normalise schema URIs to always carry a trailing fragment marker.
        if not schema_id.endswith('#'):
            schema_id = schema_id + '#'
        schemata[schema_id] = {
            'id': schema_id,
            'title': schema.get('title'),
            'faIcon': schema.get('faIcon'),
            'plural': schema.get('plural', schema.get('title')),
            'description': schema.get('description'),
            'inline': schema.get('inline', False)
        }
    app_info = {
        'title': get_app_title(),
        'url': get_app_url(),
        'samples': get_config('SAMPLE_SEARCHES')
    }
    return jsonify({
        'status': 'ok',
        'app': app_info,
        'fields': Metadata.facets(),
        'categories': Collection.CATEGORIES,
        'countries': COUNTRY_NAMES,
        'languages': LANGUAGE_NAMES,
        'schemata': schemata
    })
def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via OCR.

    Returns the recognised text, or None if the image cannot be opened.
    Results are cached keyed on the data and language set.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        # Cache hit: skip the expensive OCR pass entirely.
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # BUG FIX: the format string was a bare "%", which is not a
        # valid placeholder and garbled the log message.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    text = extractor.ocr_image(img)
    extractor.clear()
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def get_languages():
    """Map the active language codes (from config) to display names."""
    active = [code.lower().strip() for code in get_config("LANGUAGES")]
    return dict((code, label)
                for code, label in LANGUAGE_NAMES.items()
                if code in active)
def _extract_image_page(pdf_file, page, languages=None):
    """Render one PDF page to a grayscale bitmap and OCR it.

    This is a somewhat hacky way of working around some of the formats
    and compression mechanisms not supported in pdfminer: pdftoppm
    rasterises the page, then the image goes through the regular OCR
    path.
    """
    args = [
        get_config('PDFTOPPM_BIN'),
        pdf_file,
        '-singlefile',
        '-gray',
        '-f', str(page)
    ]
    rendered = subprocess.check_output(args)
    return extract_image_data(rendered, languages=languages)
def generate_pdf_version(self, html_path):
    """Convert an HTML file to a PDF via wkhtmltopdf (WebKit)."""
    fh, out_path = mkstemp(suffix='.pdf')
    os.close(fh)
    args = [
        get_config('WKHTMLTOPDF_BIN'),
        '--disable-javascript',
        '--no-outline',
        '--no-images',
        '--quiet',
        html_path,
        out_path
    ]
    subprocess.call(args)
    return out_path
def generate_pdf_alternative(self, meta, local_path):
    """Convert DjVu book to PDF."""
    out_path = make_tempfile(meta.file_name, suffix='pdf')
    command = [
        get_config('DDJVU_BIN'),
        '-format=pdf',
        '-quality=85',
        '-skip',
        local_path,
        out_path
    ]
    log.debug('Converting DJVU book: %r', ' '.join(command))
    subprocess.call(command, stderr=subprocess.STDOUT)
    return out_path
def handle_html(self, meta, html_path):
    """Convert an HTML document to PDF via wkhtmltopdf (WebKit).

    The generated PDF is ingested as an alternative representation;
    the temporary output file is always cleaned up.
    """
    out_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
        args = [wkhtmltopdf, '--disable-javascript', '--no-outline',
                '--no-images', '--quiet', html_path, out_path]
        subprocess.call(args)
        if not os.path.isfile(out_path):
            # BUG FIX: the message was passed as ("...%r", meta) --
            # two constructor arguments -- instead of interpolating
            # meta into the string.
            raise IngestorException("Could not convert document: %r" % meta)
        self.extract_pdf_alternative(meta, out_path)
    finally:
        remove_tempfile(out_path)
def angular_templates():
    """Collect Angular template files from the static and custom dirs.

    Yields (relative path, decoded contents) pairs; templates found in
    later directories override earlier ones with the same path.
    """
    templates = {}
    search_dirs = [current_app.static_folder]
    search_dirs.extend(get_config('CUSTOM_TEMPLATES_DIR'))
    for base_dir in search_dirs:
        for subset in ['templates', 'help']:
            subset_dir = os.path.join(base_dir, subset)
            for root, _, file_names in os.walk(subset_dir):
                for file_name in file_names:
                    file_path = os.path.join(root, file_name)
                    # Key by path relative to the template directory.
                    rel_path = file_path[len(base_dir) + 1:]
                    with open(file_path, 'rb') as fh:
                        templates[rel_path] = fh.read().decode('utf-8')
    return templates.items()
def check_role_alerts(role):
    """Run every stored alert for a role and e-mail any new results."""
    alerts = Alert.by_role(role).all()
    if not len(alerts):
        return
    log.info('Alerting %r, %d alerts...', role, len(alerts))
    for alert in alerts:
        search = documents_query(alert.query, newer_than=alert.notified_at)
        results = execute_documents_alert_query(alert.query, search)
        if results['total'] == 0:
            continue
        log.info('Found: %d new results for: %r', results['total'],
                 alert.query)
        # Record the notification time before sending.
        alert.update()
        try:
            subject = '%s (%s new results)' % (alert.label, results['total'])
            html = render_template('alert.html',
                                   alert=alert,
                                   results=results,
                                   role=role,
                                   qs=make_document_query(alert),
                                   app_title=get_config('APP_TITLE'),
                                   app_url=get_config('APP_BASEURL'))
            notify_role(role, subject, html)
        except Exception as ex:
            # Best-effort: a failure to notify one alert must not
            # block the others.
            log.exception(ex)
    db.session.commit()
def handle_html(self, meta, html_path):
    """Convert an HTML document to PDF via wkhtmltopdf (WebKit).

    The generated PDF is ingested as an alternative representation;
    the temporary output file is always cleaned up.
    """
    out_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
        args = [
            wkhtmltopdf, '--disable-javascript', '--no-outline',
            '--no-images', '--quiet', html_path, out_path
        ]
        subprocess.call(args)
        if not os.path.isfile(out_path):
            # BUG FIX: the message was passed as ("...%r", meta) --
            # two constructor arguments -- instead of interpolating
            # meta into the string.
            raise IngestorException("Could not convert document: %r" % meta)
        self.extract_pdf_alternative(meta, out_path)
    finally:
        remove_tempfile(out_path)
def enable_cache(vary_user=False, vary=None, server_side=True):
    """Configure HTTP caching for the current request.

    Computes an ETag from the normalised query string (and, optionally,
    the requesting user's roles) and raises NotModified when the client
    already holds a matching ETag.
    """
    args = sorted(set(request.args.items()))
    # jquery where is your god now?!?
    # Drop jQuery's "_" cache-busting parameter. BUG FIX: the previous
    # tuple-unpacking lambda (`lambda (k, v): ...`) is Python-2-only
    # syntax; index into the pair instead. list() preserves the eager
    # Python-2 filter() semantics under Python 3's lazy filter.
    args = list(filter(lambda item: item[0] != '_', args))
    cache_parts = [args, vary]
    if vary_user:
        cache_parts.extend(request.auth_roles)
    request._http_cache = get_config('CACHE')
    request._http_etag = cache_hash(*cache_parts)
    request._http_server = server_side
    if request.if_none_match == request._http_etag:
        raise NotModified()
def ingest(self, meta, local_path):
    """Convert an image into a PDF and ingest both representations.

    The temporary PDF is always removed, even when conversion or
    extraction fails part-way through.
    """
    # BUG FIX: mkstemp() used to run inside the try block -- if it
    # raised, the finally clause hit a NameError on pdf_path. Create
    # the temp file first, then guard the work.
    fh, pdf_path = mkstemp(suffix='.pdf')
    os.close(fh)
    try:
        meta.title = meta.file_name
        convert = get_config('CONVERT_BIN')
        args = [convert, local_path, '-density', '300', '-define',
                'pdf:fit-page=A4', pdf_path]
        subprocess.call(args)
        # (Removed the dead `pdf_path is None` check: mkstemp always
        # returns a string path.)
        if not os.path.isfile(pdf_path):
            raise IngestorException("Could not convert image: %r" % meta)
        self.store_pdf(meta, pdf_path)
        self.extract_pdf(meta, pdf_path)
    finally:
        if os.path.isfile(pdf_path):
            os.unlink(pdf_path)
def ingest(self, meta, local_path):
    """Rasterise an image into a PDF and run the PDF ingest pipeline."""
    pdf_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        meta.title = meta.file_name
        if not self.check_image_size(meta, local_path):
            return
        args = [
            get_config('CONVERT_BIN'),
            local_path,
            '-density', '300',
            '-define', 'pdf:fit-page=A4',
            pdf_path
        ]
        subprocess.call(args)
        if not os.path.isfile(pdf_path):
            raise ImageIngestorException("Could not convert image: %r" % meta)
        self.store_pdf(meta, pdf_path)
        self.extract_pdf(meta, pdf_path)
    finally:
        remove_tempfile(pdf_path)
def generate_pdf_alternative(self, meta, local_path):
    """Convert LibreOffice-supported documents to PDF.

    Runs soffice headless with a throw-away user-installation profile
    (so concurrent conversions don't fight over a profile lock) and
    returns the path of the produced file inside a temporary work
    directory, or None when no output was generated.
    """
    work_dir = mkdtemp()
    instance_dir = mkdtemp()
    try:
        soffice = get_config('SOFFICE_BIN')
        # BUG FIX: the -env argument was wrapped in literal double
        # quotes. subprocess.call() with an argument list does not use
        # a shell, so the quotes were handed to soffice verbatim and
        # broke the option.
        instance_path = '-env:UserInstallation=file://%s' % instance_dir
        args = [soffice, '--convert-to', 'pdf', '--nofirststartwizard',
                instance_path, '--norestore', '--nologo', '--nodefault',
                '--nolockcheck', '--invisible', '--outdir', work_dir,
                '--headless', local_path]
        log.debug('Converting document: %r', ' '.join(args))
        subprocess.call(args)
        # Return the first (only) converted file, if any.
        for out_file in os.listdir(work_dir):
            return os.path.join(work_dir, out_file)
    finally:
        shutil.rmtree(instance_dir)
def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via OCR.

    Results are cached keyed on the data and language set.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        # Cache hit: skip the expensive OCR pass entirely.
        return text
    img = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    text = extractor.ocr_image(img)
    # BUG FIX: release the Tesseract handle once the page has been
    # recognised -- the other variant of this helper in the file
    # already does so, and omitting it leaks extractor resources.
    extractor.clear()
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text
def _convert_page(interpreter, page, device, page_no, path, languages):
    """Extract the text of one PDF page, falling back to OCR if needed.

    If this returns None or an empty string, it'll trigger OCR upstream.
    """
    text_content = []
    ocr_required = False
    try:
        interpreter.process_page(page)
        layout = device.get_result()
        for text_obj in _find_objects(layout._objs, (LTTextBox, LTTextLine)):
            text = text_obj.get_text()
            if text is None:
                continue
            text = text.strip()
            if len(text):
                text_content.append(text)
        # Generous try/catch because pdfminers image support is
        # horrible.
        page_area = float(layout.width * layout.height)
        for image_obj in _find_objects(layout._objs, LTImage):
            image_area = float(image_obj.width * image_obj.height)
            page_portion = image_area / page_area
            # Go for OCR if an image makes up more than 70% of the page.
            if page_portion > 0.7:
                ocr_required = True
    except Exception as ex:
        log.exception(ex)
        ocr_required = True
    if ocr_required and get_config("OCR_PDF_PAGES"):
        log.info("Using OCR for %r, p.%s", path, page_no)
        ocr_text = _extract_image_page(path, page_no, languages)
        # BUG FIX: the OCR helper can return None (e.g. for an
        # unreadable image), which previously made the join below
        # raise a TypeError.
        if ocr_text is not None:
            text_content.append(ocr_text)
    text = "\n".join(text_content)
    log.debug("Extracted %d characters of text from %r, p.%s",
              len(text), path, page_no)
    return text.strip()
def iter_table(self, local_path, table_name):
    """Yield each row of an Access table as a dict, via mdb-export."""
    mdb_export = get_config('MDB_EXPORT_BIN')
    args = [mdb_export, '-b', 'strip', local_path, table_name]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    try:
        for row in DictReader(proc.stdout):
            yield row
    finally:
        # BUG FIX: the pipe was left open and the child process never
        # reaped, leaking a file descriptor and a zombie process per
        # exported table (also if the consumer abandons the generator).
        proc.stdout.close()
        proc.wait()
def get_language_whitelist():
    """Return the configured language codes, normalised to lowercase."""
    whitelist = []
    for code in get_config('LANGUAGES'):
        whitelist.append(code.lower().strip())
    return whitelist
def unpack(self, meta, local_path, temp_dir):
    """Extract an archive into temp_dir using 7zip."""
    args = [get_config('SEVENZ_BIN'), 'x', local_path, '-y', '-r',
            '-bb0', '-bd', '-oc:%s' % temp_dir]
    # BUG FIX: `print x` is Python-2-only syntax; the call form prints
    # the same single string on both Python 2 and 3.
    # TODO(review): this looks like leftover debug output -- consider
    # switching to log.debug, matching the other converters in the file.
    print(' '.join(args))
    subprocess.call(args, stderr=subprocess.STDOUT)
def get_tables(self, local_path):
    """List the table names contained in an Access database file."""
    mdb_tables = get_config('MDB_TABLES_BIN')
    raw = subprocess.check_output([mdb_tables, local_path])
    names = []
    for token in raw.split(' '):
        token = token.strip()
        if len(token):
            names.append(token)
    return names
def host(self):
    """Base URL of the Investigative Dashboard instance."""
    default_host = 'https://investigativedashboard.org/'
    return get_config('ID_HOST', default_host)
def get_languages(self, meta):
    """Combine the document's languages with the configured default.

    Returns a de-duplicated list (order unspecified).
    """
    languages = set(meta.languages)
    languages.add(get_config('DEFAULT_LANGUAGE'))
    return list(languages)