import multiprocessing

from servicelayer import env

# Worker threads: default to the CPU count, capped at eight, and allow an
# environment override.
NUM_THREADS = env.to_int('INGEST_THREADS', min(8, multiprocessing.cpu_count()))
# How often a failed ingest task is retried before giving up.
MAX_RETRIES = env.to_int('INGEST_RETRIES', 3)
# Document conversion service endpoint (no default; None when unset).
UNOSERVICE_URL = env.get('UNOSERVICE_URL')
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. The INGESTORS_ variable takes precedence over
# the legacy UNOSERVICE_URL setting.
CONVERT_URL = env.get(
    "INGESTORS_CONVERT_DOCUMENT_URL",
    env.get("UNOSERVICE_URL", "http://convert-document:3000/convert"),
)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get("INGESTORS_GEONAMES_PATH", "/ingestors/data/geonames.txt")

# FastText lid model file
LID_MODEL_PATH = env.get("INGESTORS_LID_MODEL_PATH", "/ingestors/data/lid.176.ftz")

# Disable entity extraction
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# NER model selection: available models, disabled models, and the
# multilingual fallback.
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
NER_DISABLE = set(env.to_list("INGESTORS_NER_DISABLE", ["ara"]))
NER_DEFAULT_MODEL = "xx"
# Base operating path (os/multiprocessing presumably imported earlier in the file).
BASE_PATH = env.get('MEMORIOUS_BASE_PATH', os.path.join(os.getcwd(), 'data'))

# Override servicelayer archive if undefined
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try and run scrapers in a way that only acquires new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 1)

# Rate limits: database inserts per minute, and HTTP requests per host
# per minute.
DB_RATE_LIMIT = env.to_int('MEMORIOUS_DB_RATE_LIMIT', 6000)
HTTP_RATE_LIMIT = env.to_int('MEMORIOUS_HTTP_RATE_LIMIT', 120)

# How many seconds to wait before trying to run scheduled crawlers
SCHEDULER_INTERVAL = env.to_int('MEMORIOUS_SCHEDULER_INTERVAL', 60)

# Scheduled tasks at the same time.
# NOTE(review): max() enforces a *floor* of 20, not a cap — confirm intent
# against the "Max scheduled" wording.
MAX_SCHEDULED = max(
    env.to_int('MEMORIOUS_MAX_SCHEDULED', multiprocessing.cpu_count()), 20)  # noqa
DEFAULT_LANGUAGE = env.get('ALEPH_DEFAULT_LANGUAGE', 'en')

# User interface languages, normalised to stripped lower case.
UI_LANGUAGES = env.to_list('ALEPH_UI_LANGUAGES', ['ru', 'es', 'de', 'en', 'ar'])
UI_LANGUAGES = [lang.lower().strip() for lang in UI_LANGUAGES]

# Result high-lighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2019-06-22'

# Maximum number of entities to return per property when expanding entities
MAX_EXPAND_ENTITIES = env.to_int('ALEPH_MAX_EXPAND_ENTITIES', 200)

# API rate limiting (req/min for anonymous users)
API_RATE_LIMIT = env.to_int('ALEPH_API_RATE_LIMIT', 30)
API_RATE_WINDOW = 15  # minutes

# Mini-CMS pages directory (APP_DIR is defined elsewhere in this file).
PAGES_PATH = env.get('ALEPH_PAGES_PATH', os.path.join(APP_DIR, 'pages'))

##############################################################################
# E-mail settings
MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. The INGESTORS_ variable takes precedence over
# the legacy UNOSERVICE_URL setting.
CONVERT_URL = env.get(
    'INGESTORS_CONVERT_DOCUMENT_URL',
    env.get('UNOSERVICE_URL', 'http://convert-document:3000/convert'),
)
CONVERT_TIMEOUT = env.to_int('INGESTORS_CONVERT_TIMEOUT', 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool('INGESTORS_OCR_VISION_API', False)

# Geonames data file
GEONAMES_PATH = env.get('INGESTORS_GEONAMES_PATH', '/ingestors/data/geonames.txt')

# FastText lid model file
LID_MODEL_PATH = env.get('INGESTORS_LID_MODEL_PATH', '/ingestors/data/lid.176.ftz')

# Disable entity extraction
ANALYZE_ENTITIES = env.to_bool('INGESTORS_ANALYZE_ENTITIES', True)

# NER model selection: available models plus the multilingual fallback.
NER_MODELS = set(env.to_list('INGESTORS_NER_MODELS', ['eng']))
NER_DEFAULT_MODEL = 'xx'

# Use the environment variable set in aleph.env
sts.DATABASE_URI = env.get('ALEPH_DATABASE_URI', sts.DATABASE_URI)
# Handler is one of: keycloak, google, cognito, azure (or a plugin)
OAUTH_MIGRATE_SUB = env.to_bool("ALEPH_OAUTH_MIGRATE_SUB", True)
OAUTH_HANDLER = env.get("ALEPH_OAUTH_HANDLER", "oidc")
OAUTH_KEY = env.get("ALEPH_OAUTH_KEY")
OAUTH_SECRET = env.get("ALEPH_OAUTH_SECRET")
OAUTH_SCOPE = env.get("ALEPH_OAUTH_SCOPE", "openid email profile")
OAUTH_METADATA_URL = env.get("ALEPH_OAUTH_METADATA_URL")
OAUTH_TOKEN_METHOD = env.get("ALEPH_OAUTH_TOKEN_METHOD", "POST")
OAUTH_ADMIN_GROUP = env.get("ALEPH_OAUTH_ADMIN_GROUP", "superuser")

# No authentication. Everyone is admin.
SINGLE_USER = env.to_bool("ALEPH_SINGLE_USER")

# Default session duration; much longer when authentication is disabled.
SESSION_EXPIRE = env.to_int(
    "ALEPH_SESSION_EXPIRE", 800_000 if SINGLE_USER else 60_000
)

# Disable password-based authentication for SSO settings:
# (OAUTH is defined elsewhere in this file.)
PASSWORD_LOGIN = env.to_bool("ALEPH_PASSWORD_LOGIN", not OAUTH)

# Roles that haven't logged in since X months will stop receiving notifications.
ROLE_INACTIVE = timedelta(days=env.to_int("ALEPH_ROLE_INACTIVE", 6 * 30))

###############################################################################
# Content processing options

DEFAULT_LANGUAGE = env.get("ALEPH_DEFAULT_LANGUAGE", "en")

# User interface
UI_LANGUAGES = ["ru", "es", "de", "en", "ar"]
# User interface languages, normalised to stripped lower case.
UI_LANGUAGES = env.to_list("ALEPH_UI_LANGUAGES", ["ru", "es", "de", "en", "ar"])
UI_LANGUAGES = [lang.lower().strip() for lang in UI_LANGUAGES]

# Document processing pipeline
INGEST_PIPELINE = env.to_list("ALEPH_INGEST_PIPELINE", ["analyze"])

# Result high-lighting
RESULT_HIGHLIGHT = env.to_bool("ALEPH_RESULT_HIGHLIGHT", True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = "2019-06-22"

# Maximum number of entities to return per property when expanding entities
MAX_EXPAND_ENTITIES = env.to_int("ALEPH_MAX_EXPAND_ENTITIES", 200)

# API rate limiting (req/min for anonymous users)
API_RATE_LIMIT = env.to_int("ALEPH_API_RATE_LIMIT", 30)
API_RATE_WINDOW = 15  # minutes

# Mini-CMS pages directory (APP_DIR is defined elsewhere in this file).
PAGES_PATH = env.get("ALEPH_PAGES_PATH", os.path.join(APP_DIR, "pages"))

##############################################################################
# E-mail settings
MAIL_FROM = env.get("ALEPH_MAIL_FROM", "*****@*****.**")
MAIL_SERVER = env.get("ALEPH_MAIL_HOST", "localhost")
# Base operating path
BASE_PATH = env.get('MEMORIOUS_BASE_PATH', os.path.join(os.getcwd(), 'data'))

# Override servicelayer archive if undefined
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try and run scrapers in a way that only acquires new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 60)

# How many seconds to wait before trying to run scheduled crawlers
SCHEDULER_INTERVAL = env.to_int('MEMORIOUS_SCHEDULER_INTERVAL', 60)

# How many threads to use for execution
THREADS = env.to_int('MEMORIOUS_THREADS', min(8, multiprocessing.cpu_count()))

# Max scheduled tasks at the same time
MAX_SCHEDULED = env.to_int('MEMORIOUS_MAX_SCHEDULED', THREADS)

# HTTP request configuration
HTTP_CACHE = env.to_bool('MEMORIOUS_HTTP_CACHE', default=True)

# HTTP user agent default
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.1)'
# The default BASE_PATH on the right-hand side is presumably assigned just
# above this chunk — not visible here.
BASE_PATH = env.get("MEMORIOUS_BASE_PATH", BASE_PATH)

# Override servicelayer archive if undefined
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, "archive")

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get("MEMORIOUS_CONFIG_PATH")

# Try and run scrapers in a way that only acquires new data
INCREMENTAL = env.to_bool("MEMORIOUS_INCREMENTAL", default=True)

# Continue running the crawler even when we encounter an error
CONTINUE_ON_ERROR = env.to_bool("MEMORIOUS_CONTINUE_ON_ERROR", default=False)

# How many days until an incremental crawl expires
EXPIRE = env.to_int("MEMORIOUS_EXPIRE", 1)

# Rate limits: database inserts per minute, and HTTP requests per host
# per minute.
DB_RATE_LIMIT = env.to_int("MEMORIOUS_DB_RATE_LIMIT", 6000)
HTTP_RATE_LIMIT = env.to_int("MEMORIOUS_HTTP_RATE_LIMIT", 120)

# Max number of tasks in a stage's task queue
MAX_QUEUE_LENGTH = env.to_int("MEMORIOUS_MAX_QUEUE_LENGTH", 50000)

# HTTP request configuration
HTTP_CACHE = env.to_bool("MEMORIOUS_HTTP_CACHE", default=True)

# HTTP request timeout, in seconds (kept as a float for requests).
HTTP_TIMEOUT = float(env.to_int("MEMORIOUS_HTTP_TIMEOUT", 30))
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. The INGESTORS_ variable takes precedence over
# the legacy UNOSERVICE_URL setting.
CONVERT_URL = env.get(
    "INGESTORS_CONVERT_DOCUMENT_URL",
    env.get("UNOSERVICE_URL", "http://convert-document:3000/convert"),
)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get("INGESTORS_GEONAMES_PATH", "/ingestors/data/geonames.txt")

# FastText lid model file
LID_MODEL_PATH = env.get("INGESTORS_LID_MODEL_PATH", "/ingestors/data/lid.176.ftz")

# Disable entity extraction
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# NER model selection: available models plus the multilingual fallback.
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
NER_DEFAULT_MODEL = "xx"

# Use the environment variable set in aleph.env
sts.DATABASE_URI = env.get("ALEPH_DATABASE_URI", sts.DATABASE_URI)
import multiprocessing

from servicelayer import env

# Redis cache
# URL format: redis://localhost:6379/0
REDIS_URL = env.get("REDIS_URL")
# NOTE(review): 84700 is close to, but not exactly, one day (86400 s) —
# confirm the intended value.
REDIS_SHORT = 84700
REDIS_LONG = REDIS_SHORT * 200
REDIS_EXPIRE = env.to_int("REDIS_EXPIRE", REDIS_SHORT * 7)
REDIS_PREFIX = "sla"

# Persistent database tags
TAGS_DATABASE_URI = env.get("TAGS_DATABASE_URI", "sqlite://")

# Worker
WORKER_RETRY = env.to_int("WORKER_RETRY", 3)
WORKER_THREADS = env.to_int("WORKER_THREADS", multiprocessing.cpu_count())
WORKER_REPORTING = env.to_bool("WORKER_REPORTING", True)

# Amazon client credentials
AWS_KEY_ID = env.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = env.get("AWS_SECRET_ACCESS_KEY")
AWS_REGION = env.get("AWS_REGION", "eu-west-1")

# S3 compatible Minio host if using Minio for storage
ARCHIVE_ENDPOINT_URL = env.get("ARCHIVE_ENDPOINT_URL")

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get("ARCHIVE_TYPE", "file")
ARCHIVE_BUCKET = env.get("ARCHIVE_BUCKET")
ARCHIVE_PATH = env.get("ARCHIVE_PATH")
PUBLICATION_BUCKET = env.get("PUBLICATION_BUCKET", ARCHIVE_BUCKET)
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Remote debugging (debugpy) configuration for cli.py process():
# whether to start the server, where it binds/listens, and whether to block
# until a client attaches.
DEBUGPY_PROCESS = env.to_bool("INGESTORS_DEBUGPY_PROCESS", False)
DEBUGPY_ADDRESS = env.get("INGESTORS_DEBUGPY_ADDRESS", "0.0.0.0")
DEBUGPY_PORT = env.to_int("INGESTORS_DEBUGPY_PORT", 5678)
DEBUGPY_WAIT_FOR_CLIENT = env.to_bool("INGESTORS_DEBUGPY_WAIT_FOR_CLIENT", False)

# Document conversion service. The INGESTORS_ variable takes precedence over
# the legacy UNOSERVICE_URL setting.
CONVERT_URL = env.get(
    "INGESTORS_CONVERT_DOCUMENT_URL",
    env.get("UNOSERVICE_URL", "http://convert-document:3000/convert"),
)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Enable Google Cloud Translation API
TRANSLATION_API = env.to_bool("INGESTORS_TRANSLATION_API", False)

# White list of language IDs for languages that should be translated.
# An empty white list is considered a wildcard, allowing all languages to be
# translated.
# NOTE(review): the default here is None rather than a list — confirm that
# downstream consumers handle a None value.
TRANSLATION_LANGUAGE_WHITE_LIST = env.to_list(
    "INGESTORS_TRANSLATION_LANGUAGE_WHITE_LIST", None
)
import multiprocessing

from servicelayer import env

# Redis cache
REDIS_URL = env.get('REDIS_URL')
REDIS_SHORT = 84700
REDIS_LONG = REDIS_SHORT * 200
REDIS_EXPIRE = env.to_int('REDIS_EXPIRE', REDIS_SHORT * 7)
REDIS_PREFIX = 'sla'

# Worker thread count: CPU count capped at eight, with environment override.
WORKER_RETRY = env.to_int('WORKER_RETRY', 3)
WORKER_THREADS = env.to_int('WORKER_THREADS', min(8, multiprocessing.cpu_count()))

# Amazon client credentials
AWS_KEY_ID = env.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_KEY = env.get('AWS_SECRET_ACCESS_KEY')
AWS_REGION = env.get('AWS_REGION', 'eu-west-1')

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get('ARCHIVE_TYPE', 'file')
ARCHIVE_BUCKET = env.get('ARCHIVE_BUCKET')
ARCHIVE_PATH = env.get('ARCHIVE_PATH')
# Result high-lighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2018-12-09'

##############################################################################
# E-mail settings
MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
MAIL_USERNAME = env.get('ALEPH_MAIL_USERNAME')
MAIL_PASSWORD = env.get('ALEPH_MAIL_PASSWORD')
MAIL_USE_SSL = env.to_bool('ALEPH_MAIL_SSL', True)
MAIL_PORT = env.to_int('ALEPH_MAIL_PORT', 465)

###############################################################################
# Database, search index and queue processing.

DATABASE_URI = env.get('ALEPH_DATABASE_URI')
SQLALCHEMY_TRACK_MODIFICATIONS = False
# Alembic migration scripts live next to this module.
ALEMBIC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'migrate'))

ELASTICSEARCH_URL = env.get('ALEPH_ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_TIMEOUT = env.to_int('ELASTICSEARCH_TIMEOUT', 30)

# Index naming: write to one index, read from possibly several.
INDEX_PREFIX = env.get('ALEPH_INDEX_PREFIX', APP_NAME)
INDEX_WRITE = env.get('ALEPH_INDEX_WRITE', 'v1')
INDEX_READ = env.to_list('ALEPH_INDEX_READ', [INDEX_WRITE])
# Result high-lighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2019-06-22'

##############################################################################
# E-mail settings
MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
MAIL_USERNAME = env.get('ALEPH_MAIL_USERNAME')
MAIL_PASSWORD = env.get('ALEPH_MAIL_PASSWORD')
MAIL_USE_SSL = env.to_bool('ALEPH_MAIL_SSL', True)
MAIL_PORT = env.to_int('ALEPH_MAIL_PORT', 465)

###############################################################################
# Database, search index and queue processing.

QUEUE_RETRY = env.to_int('ALEPH_QUEUE_RETRY', 3)
DATABASE_URI = env.get('ALEPH_DATABASE_URI')
SQLALCHEMY_TRACK_MODIFICATIONS = False
# Alembic migration scripts live next to this module.
ALEMBIC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'migrate'))

ELASTICSEARCH_URL = env.get('ALEPH_ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_TIMEOUT = env.to_int('ELASTICSEARCH_TIMEOUT', 30)
INDEX_PREFIX = env.get('ALEPH_INDEX_PREFIX', APP_NAME)
# Result high-lighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2018-12-09'

##############################################################################
# E-mail settings
MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
MAIL_USERNAME = env.get('ALEPH_MAIL_USERNAME')
MAIL_PASSWORD = env.get('ALEPH_MAIL_PASSWORD')
MAIL_USE_SSL = env.to_bool('ALEPH_MAIL_SSL', True)
MAIL_PORT = env.to_int('ALEPH_MAIL_PORT', 465)

###############################################################################
# Database, search index and queue processing.

DATABASE_URI = env.get('ALEPH_DATABASE_URI')
SQLALCHEMY_TRACK_MODIFICATIONS = False
# Alembic migration scripts live next to this module.
ALEMBIC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'migrate'))

ELASTICSEARCH_URL = env.get('ALEPH_ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_TIMEOUT = env.to_int('ELASTICSEARCH_TIMEOUT', 30)
INDEX_PREFIX = env.get('ALEPH_INDEX_PREFIX', APP_NAME)
INDEX_WRITE = env.get('ALEPH_INDEX_WRITE', 'v1')
# Base operating path
BASE_PATH = env.get('MEMORIOUS_BASE_PATH', os.path.join(os.getcwd(), 'data'))

# Override servicelayer archive if undefined
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try and run scrapers in a way that only acquires new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 60)

# Rate limits: database inserts per minute, and HTTP requests per host
# per minute.
DB_RATE_LIMIT = env.to_int('MEMORIOUS_DB_RATE_LIMIT', 6000)
HTTP_RATE_LIMIT = env.to_int('MEMORIOUS_HTTP_RATE_LIMIT', 120)  # noqa

# How many seconds to wait before trying to run scheduled crawlers
SCHEDULER_INTERVAL = env.to_int('MEMORIOUS_SCHEDULER_INTERVAL', 60)

# How many threads to use for execution
THREADS = env.to_int('MEMORIOUS_THREADS', min(8, multiprocessing.cpu_count()))

# Scheduled tasks at the same time.
# NOTE(review): max() enforces a *floor* of 20, not a cap — confirm intent
# against the "Max scheduled" wording.
MAX_SCHEDULED = max(env.to_int('MEMORIOUS_MAX_SCHEDULED', THREADS), 20)