class ServiceDocumentConverter(DocumentConverter):
    """Provides helpers for UNO document conversion via HTTP."""

    # URL of the conversion service; None when UNOSERVICE_URL is not set.
    SERVICE_URL = env.get('UNOSERVICE_URL')

    @classmethod
    def is_available(cls):
        """Return True when a conversion service URL has been configured."""
        return cls.SERVICE_URL is not None

    def _document_to_pdf(self, file_path, result, work_path):
        """Converts an office document to PDF.

        The source file is uploaded to ``SERVICE_URL`` as a multipart form
        field named ``file`` and the streamed response body is written to
        ``work_path``.

        Args:
            file_path: Path of the document to convert; read in binary mode.
            result: Object exposing ``file_name`` and ``mime_type``. Falsy
                values fall back to ``'data'`` and ``DEFAULT`` respectively.
            work_path: Directory in which the PDF is written. The output
                file is named after the basename of ``file_path`` with
                ``.pdf`` appended.

        Returns:
            The path of the written PDF file.

        Raises:
            ProcessingException: If the service answers with HTTP 400 (the
                response body becomes the message), or if every attempt
                yielded by ``service_retries()`` fails.
        """
        log.info('Converting [%s] to PDF...', result.file_name)
        out_path = os.path.basename(file_path)
        out_path = join_path(work_path, '%s.pdf' % out_path)
        file_name = result.file_name or 'data'
        mime_type = result.mime_type or DEFAULT
        for attempt in service_retries():
            # The upload and download handles use distinct names on purpose:
            # sharing one name made the cleanup close the (already closed)
            # output file and leak the input handle on every attempt.
            with open(file_path, 'rb') as source_fh:
                try:
                    files = {'file': (file_name, source_fh, mime_type)}
                    res = requests.post(self.SERVICE_URL,
                                        files=files,
                                        timeout=(5, 305),
                                        stream=True)
                    res.raise_for_status()
                    with open(out_path, 'wb') as target_fh:
                        for chunk in res.iter_content(chunk_size=None):
                            target_fh.write(chunk)
                    return out_path
                except RequestException as exc:
                    if isinstance(exc, HTTPError):
                        # A 400 is treated as a final verdict on the
                        # document, so it is not retried.
                        if exc.response.status_code == 400:
                            raise ProcessingException(exc.response.text)
                    log.error("Conversion failed: %s", exc)
                    backoff(failures=attempt)
        raise ProcessingException("Document could not be converted to PDF.")
def find_command(self, name):
    """Resolve the command `name`, allowing an override via configuration.

    The override key is `name` with a ``_BIN`` suffix, dashes replaced by
    underscores, and upper-cased. When `env.get` has no value for that key,
    the result of `find_executable(name)` is returned instead.
    """
    env_key = ('%s_BIN' % name).replace('-', '_').upper()
    discovered = find_executable(name)
    return env.get(env_key, discovered)
# Show error messages to the user.
DEBUG = env.to_bool('ALEPH_DEBUG', False)

# Propose HTTP caching to the user agents (default: enabled unless DEBUG).
CACHE = env.to_bool('ALEPH_CACHE', not DEBUG)

# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool('ALEPH_MAINTENANCE', False)

# Unit test context.
TESTING = False


###############################################################################
# General instance information

APP_TITLE = env.get('ALEPH_APP_TITLE', lazy_gettext('Aleph'))
APP_DESCRIPTION = env.get('ALEPH_APP_DESCRIPTION', '')
APP_NAME = env.get('ALEPH_APP_NAME', 'aleph')
APP_UI_URL = env.get('ALEPH_UI_URL', 'http://localhost:8080/')
APP_LOGO = env.get('ALEPH_LOGO', '/static/logo.png')
APP_FAVICON = env.get('ALEPH_FAVICON', '/static/logo.png')

# Show a system-wide banner in the user interface.
APP_BANNER = env.get('ALEPH_APP_BANNER')

# Force HTTPS here:
URL_SCHEME = env.get('ALEPH_URL_SCHEME', 'http')

# Shown on the home page as a few sample queries; the translated defaults
# apply only when ALEPH_SAMPLE_SEARCHES is not provided.
SAMPLE_SEARCHES = env.to_list(
    'ALEPH_SAMPLE_SEARCHES',
    [
        lazy_gettext('TeliaSonera'),
        lazy_gettext('Vladimir Putin'),
    ],
)
from servicelayer import settings as sls
from flask_babel import lazy_gettext

# Show error messages to the user.
DEBUG = env.to_bool('ALEPH_DEBUG', False)

# Propose HTTP caching to the user agents (default: enabled unless DEBUG).
CACHE = env.to_bool('ALEPH_CACHE', not DEBUG)

# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool('ALEPH_MAINTENANCE', False)

# Unit test context.
TESTING = False


###############################################################################
# General instance information

APP_TITLE = env.get('ALEPH_APP_TITLE', lazy_gettext('Aleph'))
APP_DESCRIPTION = env.get('ALEPH_APP_DESCRIPTION', '')
APP_NAME = env.get('ALEPH_APP_NAME', 'aleph')
APP_UI_URL = env.get('ALEPH_UI_URL', 'http://localhost:8080/')
APP_LOGO = env.get('ALEPH_LOGO', '/static/logo.png')
APP_FAVICON = env.get('ALEPH_FAVICON', '/static/logo.png')

# Show a system-wide banner in the user interface.
APP_BANNER = env.get('ALEPH_APP_BANNER')

# Force HTTPS here:
URL_SCHEME = env.get('ALEPH_URL_SCHEME', 'http')

# Shown on the home page as a few sample queries; the translated defaults
# apply only when ALEPH_SAMPLE_SEARCHES is not provided.
SAMPLE_SEARCHES = env.to_list(
    'ALEPH_SAMPLE_SEARCHES',
    [
        lazy_gettext('TeliaSonera'),
        lazy_gettext('Vladimir Putin'),
    ],
)
from servicelayer import env
from flask_babel import lazy_gettext

# Show error messages to the user.
DEBUG = env.to_bool('ALEPH_DEBUG', False)

# Propose HTTP caching to the user agents (default: enabled unless DEBUG).
CACHE = env.to_bool('ALEPH_CACHE', not DEBUG)

# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool('ALEPH_MAINTENANCE', False)

# Unit test context.
TESTING = False


###############################################################################
# General instance information

APP_TITLE = env.get('ALEPH_APP_TITLE', lazy_gettext('Aleph'))
APP_DESCRIPTION = env.get('ALEPH_APP_DESCRIPTION', '')
APP_NAME = env.get('ALEPH_APP_NAME', 'aleph')
APP_UI_URL = env.get('ALEPH_UI_URL', 'http://localhost:8080/')
APP_LOGO = env.get('ALEPH_LOGO', '/static/logo.png')
APP_FAVICON = env.get('ALEPH_FAVICON', '/static/favicon.png')

# Show a system-wide banner in the user interface.
APP_BANNER = env.get('ALEPH_APP_BANNER')

# Shown on the home page as a few sample queries; the translated defaults
# apply only when ALEPH_SAMPLE_SEARCHES is not provided.
SAMPLE_SEARCHES = env.to_list(
    'ALEPH_SAMPLE_SEARCHES',
    [
        lazy_gettext('TeliaSonera'),
        lazy_gettext('Vladimir Putin'),
    ],
)

# Force HTTPS here:
FORCE_HTTPS = env.to_bool('ALEPH_FORCE_HTTPS', False)
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. INGESTORS_CONVERT_DOCUMENT_URL takes
# precedence over UNOSERVICE_URL, which in turn overrides the built-in
# default address.
CONVERT_URL = env.get(
    "INGESTORS_CONVERT_DOCUMENT_URL",
    env.get("UNOSERVICE_URL", "http://convert-document:3000/convert"),
)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get(
    "INGESTORS_GEONAMES_PATH",
    "/ingestors/data/geonames.txt",
)

# FastText lid model file
LID_MODEL_PATH = env.get(
    "INGESTORS_LID_MODEL_PATH",
    "/ingestors/data/lid.176.ftz",
)

# Disable entity extraction
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# List available NER models
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
# Entries listed here default to just "ara" unless INGESTORS_NER_DISABLE
# is provided.
NER_DISABLE = set(env.to_list("INGESTORS_NER_DISABLE", ["ara"]))
NER_DEFAULT_MODEL = "xx"
import multiprocessing
from servicelayer import env

# Thread count: INGEST_THREADS if provided, otherwise the CPU count capped
# at eight.
NUM_THREADS = env.to_int(
    'INGEST_THREADS',
    min(8, multiprocessing.cpu_count()),
)

# Retry limit, overridable through INGEST_RETRIES.
MAX_RETRIES = env.to_int('INGEST_RETRIES', 3)

# Address of the UNO conversion service (no default is supplied here).
UNOSERVICE_URL = env.get('UNOSERVICE_URL')
import os
import pkg_resources
import multiprocessing
from servicelayer import env
from servicelayer import settings as sls


###############################################################################
# Core configuration

# Version string of the installed 'memorious' distribution.
VERSION = pkg_resources.get_distribution('memorious').version
APP_NAME = env.get('MEMORIOUS_APP_NAME', 'memorious')

# Enable debug logging etc.
DEBUG = env.to_bool('MEMORIOUS_DEBUG', default=False)
TESTING = False

# Base operating path: MEMORIOUS_BASE_PATH, or a 'data' directory under the
# current working directory when that is not provided.
BASE_PATH = env.get(
    'MEMORIOUS_BASE_PATH',
    os.path.join(os.getcwd(), 'data'),
)

# Override servicelayer archive if undefined
if not sls.ARCHIVE_PATH:
    sls.ARCHIVE_PATH = os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try and run scrapers in a way that only acquires new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 1)
# Show error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)

# Profile requests
PROFILE = env.to_bool("ALEPH_PROFILE", False)

# Propose HTTP caching to the user agents (default: enabled unless DEBUG).
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)

# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)

# Unit test context.
TESTING = False


###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
# The ALEPH_LOGO_AR variant falls back to the regular logo.
APP_LOGO_AR = env.get("ALEPH_LOGO_AR", APP_LOGO)
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/favicon.png")

# Show a system-wide banner in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# Force HTTPS here: the default is derived from whether the UI URL starts
# with "https" (case-insensitively); ALEPH_FORCE_HTTPS overrides it.
FORCE_HTTPS = env.to_bool(
    "ALEPH_FORCE_HTTPS",
    APP_UI_URL.lower().startswith("https"),
)
# The scheme follows FORCE_HTTPS unless ALEPH_URL_SCHEME is given.
PREFERRED_URL_SCHEME = env.get(
    "ALEPH_URL_SCHEME",
    "https" if FORCE_HTTPS else "http",
)
# Apply HTTPS rules to the UI URL:
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. INGESTORS_CONVERT_DOCUMENT_URL takes
# precedence over UNOSERVICE_URL, which in turn overrides the built-in
# default address.
CONVERT_URL = env.get(
    'INGESTORS_CONVERT_DOCUMENT_URL',
    env.get('UNOSERVICE_URL', 'http://convert-document:3000/convert'),
)
CONVERT_TIMEOUT = env.to_int('INGESTORS_CONVERT_TIMEOUT', 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool('INGESTORS_OCR_VISION_API', False)

# Geonames data file
GEONAMES_PATH = env.get(
    'INGESTORS_GEONAMES_PATH',
    '/ingestors/data/geonames.txt',
)

# FastText lid model file
LID_MODEL_PATH = env.get(
    'INGESTORS_LID_MODEL_PATH',
    '/ingestors/data/lid.176.ftz',
)

# Disable entity extraction
ANALYZE_ENTITIES = env.to_bool('INGESTORS_ANALYZE_ENTITIES', True)

# List available NER models
NER_MODELS = set(env.to_list('INGESTORS_NER_MODELS', ['eng']))
NER_DEFAULT_MODEL = 'xx'

# Use the environment variable set in aleph.env; the value already present
# on the ftmstore settings module is kept when it is not provided.
sts.DATABASE_URI = env.get('ALEPH_DATABASE_URI', sts.DATABASE_URI)
# Show error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)

# Profile requests
PROFILE = env.to_bool("ALEPH_PROFILE", False)

# Propose HTTP caching to the user agents (default: enabled unless DEBUG).
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)

# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)

# Unit test context.
TESTING = False


###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_DESCRIPTION = env.get("ALEPH_APP_DESCRIPTION", "")
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/favicon.png")

# Show a system-wide banner in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# Shown on the home page as a few sample queries; the translated defaults
# apply only when ALEPH_SAMPLE_SEARCHES is not provided.
SAMPLE_SEARCHES = env.to_list(
    "ALEPH_SAMPLE_SEARCHES",
    [
        lazy_gettext("TeliaSonera"),
        lazy_gettext("Vladimir Putin"),
    ],
)

# Force HTTPS here:
FORCE_HTTPS = env.to_bool("ALEPH_FORCE_HTTPS", False)
def find_command(self, name):
    """Look up the command `name`, preferring a configured override.

    The configuration key is built from `name` plus a ``_BIN`` suffix, with
    dashes turned into underscores and the whole key upper-cased. If
    `env.get` yields nothing for that key, the `find_executable(name)`
    result is used.
    """
    override_key = "_".join((name, "BIN")).replace("-", "_").upper()
    located = find_executable(name)
    return env.get(override_key, located)
import os
import pkg_resources
from servicelayer import env
from servicelayer import settings as sls


###############################################################################
# Core configuration

# Version string of the installed "memorious" distribution.
VERSION = pkg_resources.get_distribution("memorious").version
APP_NAME = env.get("MEMORIOUS_APP_NAME", "memorious")

# Enable debug logging etc.
DEBUG = env.to_bool("MEMORIOUS_DEBUG", default=False)
TESTING = False

# Base operating path: MEMORIOUS_BASE_PATH, or a "data" directory under the
# current working directory when that is not provided.
BASE_PATH = env.get(
    "MEMORIOUS_BASE_PATH",
    os.path.join(os.getcwd(), "data"),
)

# Override servicelayer archive if undefined
if not sls.ARCHIVE_PATH:
    sls.ARCHIVE_PATH = os.path.join(BASE_PATH, "archive")

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get("MEMORIOUS_CONFIG_PATH")

# Try and run scrapers in a way that only acquires new data
INCREMENTAL = env.to_bool("MEMORIOUS_INCREMENTAL", default=True)

# Continue running the crawler even when we encounter an error
CONTINUE_ON_ERROR = env.to_bool("MEMORIOUS_CONTINUE_ON_ERROR", default=False)

# How many days until an incremental crawl expires
import multiprocessing
from servicelayer import env

# Redis cache
# URL format: redis://localhost:6379/0
REDIS_URL = env.get("REDIS_URL")
# NOTE(review): presumably a duration in seconds; 84700 is close to, but not
# equal to, the 86400 seconds in a day -- confirm whether that is intended.
REDIS_SHORT = 84700
REDIS_LONG = REDIS_SHORT * 200
# Default expiry is seven times REDIS_SHORT unless REDIS_EXPIRE is provided.
REDIS_EXPIRE = env.to_int("REDIS_EXPIRE", REDIS_SHORT * 7)
REDIS_PREFIX = "sla"

# Persistent database tags
TAGS_DATABASE_URI = env.get("TAGS_DATABASE_URI", "sqlite://")

# Worker
WORKER_RETRY = env.to_int("WORKER_RETRY", 3)
WORKER_THREADS = env.to_int(
    "WORKER_THREADS",
    multiprocessing.cpu_count(),
)
WORKER_REPORTING = env.to_bool("WORKER_REPORTING", True)

# Amazon client credentials
AWS_KEY_ID = env.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = env.get("AWS_SECRET_ACCESS_KEY")
AWS_REGION = env.get("AWS_REGION", "eu-west-1")

# S3 compatible Minio host if using Minio for storage
ARCHIVE_ENDPOINT_URL = env.get("ARCHIVE_ENDPOINT_URL")

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get("ARCHIVE_TYPE", "file")
ARCHIVE_BUCKET = env.get("ARCHIVE_BUCKET")
ARCHIVE_PATH = env.get("ARCHIVE_PATH")
# The publication bucket falls back to the archive bucket.
PUBLICATION_BUCKET = env.get("PUBLICATION_BUCKET", ARCHIVE_BUCKET)
import multiprocessing
from servicelayer import env

# Redis cache
REDIS_URL = env.get('REDIS_URL')
# NOTE(review): presumably a duration in seconds; 84700 is close to, but not
# equal to, the 86400 seconds in a day -- confirm whether that is intended.
REDIS_SHORT = 84700
REDIS_LONG = REDIS_SHORT * 200
# Default expiry is seven times REDIS_SHORT unless REDIS_EXPIRE is provided.
REDIS_EXPIRE = env.to_int('REDIS_EXPIRE', REDIS_SHORT * 7)
REDIS_PREFIX = 'sla'

# Worker
WORKER_RETRY = env.to_int('WORKER_RETRY', 3)
# Thread count: WORKER_THREADS if provided, otherwise the CPU count capped
# at eight.
WORKER_THREADS = env.to_int(
    'WORKER_THREADS',
    min(8, multiprocessing.cpu_count()),
)

# Amazon client credentials
AWS_KEY_ID = env.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_KEY = env.get('AWS_SECRET_ACCESS_KEY')
AWS_REGION = env.get('AWS_REGION', 'eu-west-1')

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get('ARCHIVE_TYPE', 'file')
ARCHIVE_BUCKET = env.get('ARCHIVE_BUCKET')
ARCHIVE_PATH = env.get('ARCHIVE_PATH')