def _get_store_for_reading(download: dict) -> KeyValueStore:
    """Return the store from which to read the content for the given download."""
    download = decode_object_from_bytes_if_needed(download)

    config = get_config()
    if config['mediawords'].get('read_all_downloads_from_s3', False):
        return _get_amazon_s3_store()

    path = download.get('path', 's3:')

    match = re.search(r'^([\w]+):', path)
    location = match.group(1) if match else 's3'
    location = location.lower()

    if location == 'content':
        download_store = _get_inline_store()
    elif location == 'postgresql':
        download_store = _get_postgresql_store()
    elif location in ('s3', 'amazon_s3'):
        download_store = _get_amazon_s3_store()
    elif location in ('gridfs', 'tar'):
        # these are old storage formats that we moved to postgresql
        download_store = _get_postgresql_store()
    else:
        downloads_id = download.get('downloads_id', '(no downloads_id)')
        raise McDBIDownloadsException(
            "Location '%s' is unknown for download %s" % (location, downloads_id))

    assert download_store is not None
    return download_store
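# Illustrative sketch (standalone, not part of the module above): the download "path"
# prefix is what selects the read store. Paths without a recognizable "<location>:"
# prefix fall back to 's3'.
import re

for example_path in ('content:inline body', 'postgresql:raw_downloads', 'tar:1234/5678', 's3:', 'foo'):
    prefix_match = re.search(r'^([\w]+):', example_path)
    print(example_path, '->', (prefix_match.group(1) if prefix_match else 's3').lower())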
def fetch_content(db: DatabaseHandler, download: dict) -> str:
    """Fetch the content for the given download from the configured content store."""
    download = decode_object_from_bytes_if_needed(download)

    if 'downloads_id' not in download:
        raise McDBIDownloadsException("downloads_id not in download")

    if not download_successful(download):
        raise McDBIDownloadsException(
            "attempt to fetch content for unsuccessful download: %d" % (download['downloads_id']))

    store = _get_store_for_reading(download)

    content_bytes = store.fetch_content(db, download['downloads_id'], download['path'])
    content = content_bytes.decode()

    # horrible hack to fix old content that is not stored in unicode
    config = get_config()
    ascii_hack_downloads_id = config['mediawords'].get('ascii_hack_downloads_id', 0)
    if download['downloads_id'] < ascii_hack_downloads_id:
        # this matches all non-printable-ascii characters. python re does not support POSIX
        # character classes like [[:ascii:]]
        content = re.sub(r'[^ -~]', ' ', content)

    return content
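# Illustrative sketch (standalone): the "ascii hack" regex above replaces every character
# outside the printable ASCII range (space through '~') with a space.
import re

legacy_content = 'Media Cloud \u2014 caf\u00e9'
print(re.sub(r'[^ -~]', ' ', legacy_content))  # the em dash and the accented character come back as spaces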
def test_set_config_file():
    root_path = mc_root_path()

    # Test with .yml.dist
    mediawords_yml_dist_path = os.path.join(root_path, 'mediawords.yml.dist')
    assert os.path.isfile(mediawords_yml_dist_path)
    old_config = get_config()
    set_config_file(mediawords_yml_dist_path)
    set_config(old_config)

    # Test with .yml
    mediawords_yml_path = os.path.join(root_path, 'mediawords.yml')
    assert os.path.isfile(mediawords_yml_path)
    old_config = get_config()
    set_config_file(mediawords_yml_path)
    set_config(old_config)
def _get_store_for_writing() -> KeyValueStore:
    """Get MultipleStoresStore for writing downloads."""
    global _store_for_writing
    if _store_for_writing is not None:
        return _store_for_writing

    config = get_config()

    # Early sanity check on configuration
    download_storage_locations = config['mediawords'].get('download_storage_locations', [])
    if len(download_storage_locations) == 0:
        raise McDBIDownloadsException("No download stores are configured.")

    stores = []
    for location in download_storage_locations:
        location = location.lower()

        if location == 'databaseinline':
            raise McDBIDownloadsException("databaseinline location is not valid for storage")
        elif location == 'postgresql':
            store = PostgreSQLStore(table=RAW_DOWNLOADS_POSTGRESQL_KVS_TABLE_NAME)
        elif location in ('s3', 'amazon', 'amazon_s3'):
            store = _get_amazon_s3_store()
        else:
            raise McDBIDownloadsException("store location '" + location + "' is not valid")

        if store is None:
            raise McDBIDownloadsException("store location '" + location + "' is not configured")

        stores.append(store)

    _store_for_writing = MultipleStoresStore(stores_for_writing=stores)
    return _store_for_writing
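# Hypothetical mediawords.yml excerpt (illustrative only; key names are taken from the
# lookups above) that would configure writes to both PostgreSQL and Amazon S3:
#
#   mediawords:
#       download_storage_locations:
#           - postgresql
#           - s3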
def __get_large_work_mem(self) -> str:
    config = get_config()
    if 'large_work_mem' in config['mediawords']:
        work_mem = config['mediawords']['large_work_mem']
    else:
        work_mem = self.__get_current_work_mem()
    return work_mem
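# Hypothetical mediawords.yml excerpt (illustrative only; the key name is taken from the
# lookup above, the value is a placeholder). When 'large_work_mem' is absent, the current
# session's work_mem is reused instead:
#
#   mediawords:
#       large_work_mem: 3GB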
def _get_amazon_s3_store() -> KeyValueStore:
    """Get lazy initialized amazon s3 store, with credentials from mediawords.yml."""
    global _amazon_s3_store

    if _amazon_s3_store:
        return _amazon_s3_store

    config = get_config()
    if 'amazon_s3' not in config:
        raise McDBIDownloadsException("Amazon S3 download store is not configured.")

    store_params = {
        'access_key_id': config['amazon_s3']['downloads']['access_key_id'],
        'secret_access_key': config['amazon_s3']['downloads']['secret_access_key'],
        'bucket_name': config['amazon_s3']['downloads']['bucket_name'],
        'directory_name': config['amazon_s3']['downloads']['directory_name'],
    }

    if config['mediawords'].get('cache_s3_downloads', False):
        store_params['cache_table'] = S3_RAW_DOWNLOADS_CACHE_TABLE_NAME
        _amazon_s3_store = CachedAmazonS3Store(**store_params)
    else:
        _amazon_s3_store = AmazonS3Store(**store_params)

    return _amazon_s3_store
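# Hypothetical mediawords.yml excerpt (illustrative only; key names are taken from the
# reads above, all values are placeholders):
#
#   amazon_s3:
#       downloads:
#           access_key_id: AKIAXXXXXXXXXXXXXXXX
#           secret_access_key: <secret>
#           bucket_name: mediacloud-downloads
#           directory_name: downloads
#
#   mediawords:
#       cache_s3_downloads: yes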
def test_set_config_file_nonexistent():
    old_config = get_config()

    tempdir = tempfile.mkdtemp()
    nonexistent_config = os.path.join(tempdir, 'nonexistent_configuration.yml')
    assert os.path.exists(nonexistent_config) is False

    with pytest.raises(McConfigException):
        set_config_file(nonexistent_config)

    set_config(old_config)
def rotate_supervisor_logs():
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    l.info('Running logrotate...')
    subprocess.check_call([
        'logrotate',
        '--verbose',
        '--state', logrotate_state_file,
        logrotate_temp_config_path,
    ])

    l.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
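# Illustrative sketch (assumed values; the directory and constants are placeholders): with
# __LOG_MAX_SIZE = 134217728 (128 MB) and __OLD_LOG_COUNT = 7, the temporary config written
# above would render roughly as:
#
#   /mediacloud/data/supervisor_logs/*.log {
#       size 134217728
#       rotate 7
#       copytruncate
#       compress
#       missingok
#       notifempty
#   }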
def __init__(self):
    self.api_key = None
    self.api_version = '1.1'
    self.retry_limit = 5
    self.ratelimit_info = defaultdict(dict)

    config = get_config()

    self.ua = mediawords.util.web.user_agent.UserAgent()
    self.ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256])

    if 'associated_press' in config:
        self.api_key = config['associated_press'].get('apikey')

    if self.api_key is None:
        raise McAPMissingAPIKey("API key configuration data missing for associated_press.")
def _get_postgresql_store() -> KeyValueStore:
    """Get lazy initialized postgresql store, with credentials from mediawords.yml."""
    global _postgresql_store

    if _postgresql_store is not None:
        return _postgresql_store

    config = get_config()

    _postgresql_store = PostgreSQLStore(table=RAW_DOWNLOADS_POSTGRESQL_KVS_TABLE_NAME)

    if config['mediawords'].get('fallback_postgresql_downloads_to_s3', False):
        _postgresql_store = MultipleStoresStore(
            stores_for_reading=[_postgresql_store, _get_amazon_s3_store()],
            stores_for_writing=[_postgresql_store])

    return _postgresql_store
def test_run_block_with_large_work_mem(self):
    normal_work_mem = 256  # MB
    large_work_mem = 512  # MB

    old_large_work_mem = None
    config = get_config()
    if 'large_work_mem' in config['mediawords']:
        old_large_work_mem = config['mediawords']['large_work_mem']

    config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
    set_config(config)

    self.__db.query("SET work_mem TO %s", ('%sMB' % normal_work_mem,))

    current_work_mem = int(self.__db.query("""
        SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
    """).flat()[0])
    assert current_work_mem == normal_work_mem * 1024

    def __test_run_block_with_large_work_mem_inner():
        self.__db.execute_with_large_work_mem("""
            INSERT INTO execute_large_work_mem (work_mem)
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """)

    self.__db.query('CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)')

    self.__db.run_block_with_large_work_mem(__test_run_block_with_large_work_mem_inner)

    statement_work_mem = int(self.__db.query("""
        SELECT work_mem FROM execute_large_work_mem
    """).flat()[0])
    assert statement_work_mem == large_work_mem * 1024

    current_work_mem = int(self.__db.query("""
        SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
    """).flat()[0])
    assert current_work_mem == normal_work_mem * 1024

    config['mediawords']['large_work_mem'] = old_large_work_mem
    set_config(config)
def __should_continue_with_outdated_schema(self, current_schema_version: int, target_schema_version: int) -> bool:
    """Schema is outdated / too new; return True if MC should continue nevertheless, False otherwise."""
    config = get_config()

    config_ignore_schema_version = config["mediawords"]["ignore_schema_version"] or False

    if config_ignore_schema_version and self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
        l.warn("""
            The current Media Cloud database schema is older than the schema present in mediawords.sql,
            but %s is set so continuing anyway.
        """ % self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
        return True
    else:
        l.warn("""
            ################################

            The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

            The database schema currently running in the database is %(current_schema_version)s,
            and the schema version in the mediawords.sql is %(target_schema_version)s.

            Please run:

                ./script/mediawords_upgrade_db.py --import

            to automatically upgrade the database schema to the latest version.

            If you want to connect to the Media Cloud database anyway (ignoring the schema version),
            set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

            ################################
        """ % {
            "current_schema_version": current_schema_version,
            "target_schema_version": target_schema_version,
            "IGNORE_SCHEMA_VERSION_ENV_VARIABLE": self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
        })
        return False
def setUp(self):
    l.info("Looking for test database credentials...")
    test_database = None
    config = get_config()
    for database in config['database']:
        if database['label'] == 'test':
            test_database = database
            break
    assert test_database is not None

    l.info("Connecting to test database '%s' via DatabaseHandler class..." % test_database['db'])
    self.__db = DatabaseHandler(
        host=test_database['host'],
        port=test_database['port'],
        username=test_database['user'],
        password=test_database['pass'],
        database=test_database['db'])

    l.info("Preparing test table 'kardashians'...")
    self.__db.query("""
        CREATE TEMPORARY TABLE kardashians (
            id SERIAL PRIMARY KEY NOT NULL,
            name VARCHAR UNIQUE NOT NULL,   -- UNIQUE to test find_or_create()
            surname TEXT NOT NULL,
            dob DATE NOT NULL,
            married_to_kanye BOOL NOT NULL DEFAULT 'f'
        )
    """)
    self.__db.query("""
        INSERT INTO kardashians (name, surname, dob, married_to_kanye) VALUES
        ('Kris', 'Jenner', '1955-11-05'::DATE, 'f'),          -- id=1
        ('Caitlyn', 'Jenner', '1949-10-28'::DATE, 'f'),       -- id=2
        ('Kourtney', 'Kardashian', '1979-04-18'::DATE, 'f'),  -- id=3
        ('Kim', 'Kardashian', '1980-10-21'::DATE, 't'),       -- id=4
        ('Khloé', 'Kardashian', '1984-06-27'::DATE, 'f'),     -- id=5
        ('Rob', 'Kardashian', '1987-03-17'::DATE, 'f'),       -- id=6
        ('Kendall', 'Jenner', '1995-11-03'::DATE, 'f'),       -- id=7
        ('Kylie', 'Jenner', '1997-08-10'::DATE, 'f')          -- id=8
    """)
def test_get_config():
    config = get_config()
    assert 'database' in config
    assert 'mediawords' in config
    assert 'data_dir' in config['mediawords']
def setUp(self) -> None:
    """Set self.config and assign dummy values for amazon_s3."""
    self.config = get_config()
    self.save_config = copy.deepcopy(self.config)

    self._setup_amazon_s3_config()