Example #1
0
def _get_store_for_reading(download: dict) -> KeyValueStore:
    """Return the store from which to read the content for the given download.

    The store is selected from the location prefix of the download's 'path'
    (e.g. "postgresql:...", "s3:..."); downloads with no path default to S3.

    Raises:
        McDBIDownloadsException: if the location prefix is not recognized.
    """
    download = decode_object_from_bytes_if_needed(download)

    config = get_config()

    # Global override: force every read through S3 regardless of the path.
    if config['mediawords'].get('read_all_downloads_from_s3', False):
        return _get_amazon_s3_store()

    path = download.get('path', 's3:')

    # Location is everything before the first colon, e.g. "content:..."
    match = re.search(r'^([\w]+):', path)
    location = match.group(1) if match else 's3'
    location = location.lower()

    if location == 'content':
        download_store = _get_inline_store()
    elif location == 'postgresql':
        download_store = _get_postgresql_store()
    elif location in ('s3', 'amazon_s3'):
        download_store = _get_amazon_s3_store()
    elif location == 'gridfs' or location == 'tar':
        # these are old storage formats that we moved to postgresql
        download_store = _get_postgresql_store()
    else:
        downloads_id = download.get('downloads_id', '(no downloads_id)')
        # BUG FIX: the original raised with an unformatted template and a list
        # ("Location 'location' is unknown for download %d", [downloads_id]),
        # so neither the location nor the id ever appeared in the message.
        raise McDBIDownloadsException(
            "Location '%s' is unknown for download %s" % (location, downloads_id))

    assert download_store is not None

    return download_store
Example #2
0
def fetch_content(db: DatabaseHandler, download: dict) -> str:
    """Fetch the content for the given download from the configured content store."""

    download = decode_object_from_bytes_if_needed(download)

    if 'downloads_id' not in download:
        raise McDBIDownloadsException("downloads_id not in download")

    if not download_successful(download):
        raise McDBIDownloadsException(
            "attempt to fetch content for unsuccessful download: %d" %
            (download['downloads_id']))

    downloads_id = download['downloads_id']

    reader = _get_store_for_reading(download)
    raw_bytes = reader.fetch_content(db, downloads_id, download['path'])
    content = raw_bytes.decode()

    # horrible hack to fix old content that is not stored in unicode
    threshold = get_config()['mediawords'].get('ascii_hack_downloads_id', 0)
    if downloads_id < threshold:
        # this matches all non-printable-ascii characters.  python re does not support POSIX character
        # classes like [[:ascii:]]
        content = re.sub(r'[^ -~]', ' ', content)

    return content
Example #3
0
def test_set_config_file():
    """Round-trip set_config_file() against both shipped configuration files."""
    root_path = mc_root_path()

    # Exercise .yml.dist first, then .yml, restoring the old config each time.
    for config_filename in ('mediawords.yml.dist', 'mediawords.yml'):
        config_path = os.path.join(root_path, config_filename)
        assert os.path.isfile(config_path)
        saved_config = get_config()
        set_config_file(config_path)
        set_config(saved_config)
Example #4
0
def fetch_content(db: DatabaseHandler, download: dict) -> str:
    """Fetch the content for the given download from the configured content store."""

    download = decode_object_from_bytes_if_needed(download)

    if 'downloads_id' not in download:
        raise McDBIDownloadsException("downloads_id not in download")

    if not download_successful(download):
        raise McDBIDownloadsException(
            "attempt to fetch content for unsuccessful download: %d" % (download['downloads_id']))

    downloads_id = download['downloads_id']

    reader = _get_store_for_reading(download)
    raw_bytes = reader.fetch_content(db, downloads_id, download['path'])
    content = raw_bytes.decode()

    # horrible hack to fix old content that is not stored in unicode
    threshold = get_config()['mediawords'].get('ascii_hack_downloads_id', 0)
    if downloads_id < threshold:
        # this matches all non-printable-ascii characters.  python re does not support POSIX character
        # classes like [[:ascii:]]
        content = re.sub(r'[^ -~]', ' ', content)

    return content
Example #5
0
def test_set_config_file():
    """Round-trip set_config_file() against both shipped configuration files."""
    root_path = mc_root_path()

    # Exercise .yml.dist first, then .yml, restoring the old config each time.
    for config_filename in ('mediawords.yml.dist', 'mediawords.yml'):
        config_path = os.path.join(root_path, config_filename)
        assert os.path.isfile(config_path)
        saved_config = get_config()
        set_config_file(config_path)
        set_config(saved_config)
Example #6
0
def _get_store_for_writing() -> KeyValueStore:
    """Get MultiStoresStore for writing downloads."""
    global _store_for_writing

    # Lazily build the composite store on first use, then reuse it.
    if _store_for_writing is None:
        config = get_config()

        # Early sanity check on configuration
        locations = config['mediawords'].get('download_storage_locations', [])
        if len(locations) == 0:
            raise McDBIDownloadsException("No download stores are configured.")

        stores = []
        for raw_location in locations:
            location = raw_location.lower()

            if location == 'databaseinline':
                raise McDBIDownloadsException("databaseinline location is not valid for storage")

            if location == 'postgresql':
                store = PostgreSQLStore(table=RAW_DOWNLOADS_POSTGRESQL_KVS_TABLE_NAME)
            elif location in ('s3', 'amazon', 'amazon_s3'):
                store = _get_amazon_s3_store()
            else:
                raise McDBIDownloadsException("store location '" + location + "' is not valid")

            if store is None:
                raise McDBIDownloadsException("store location '" + location + "' is not configured")

            stores.append(store)

        _store_for_writing = MultipleStoresStore(stores_for_writing=stores)

    return _store_for_writing
Example #7
0
 def __get_large_work_mem(self) -> str:
     """Return the configured 'large_work_mem' setting, falling back to the current work_mem."""
     mediawords_config = get_config()['mediawords']
     if 'large_work_mem' in mediawords_config:
         return mediawords_config['large_work_mem']
     return self.__get_current_work_mem()
Example #8
0
def _get_store_for_reading(download: dict) -> KeyValueStore:
    """Return the store from which to read the content for the given download.

    The store is selected from the location prefix of the download's 'path'
    (e.g. "postgresql:...", "s3:..."); downloads with no path default to S3.

    Raises:
        McDBIDownloadsException: if the location prefix is not recognized.
    """
    download = decode_object_from_bytes_if_needed(download)

    config = get_config()

    # Global override: force every read through S3 regardless of the path.
    if config['mediawords'].get('read_all_downloads_from_s3', False):
        return _get_amazon_s3_store()

    path = download.get('path', 's3:')

    # Location is everything before the first colon, e.g. "content:..."
    match = re.search(r'^([\w]+):', path)
    location = match.group(1) if match else 's3'
    location = location.lower()

    if location == 'content':
        download_store = _get_inline_store()
    elif location == 'postgresql':
        download_store = _get_postgresql_store()
    elif location in ('s3', 'amazon_s3'):
        download_store = _get_amazon_s3_store()
    elif location == 'gridfs' or location == 'tar':
        # these are old storage formats that we moved to postgresql
        download_store = _get_postgresql_store()
    else:
        downloads_id = download.get('downloads_id', '(no downloads_id)')
        # BUG FIX: the original raised with an unformatted template and a list
        # ("Location 'location' is unknown for download %d", [downloads_id]),
        # so neither the location nor the id ever appeared in the message.
        raise McDBIDownloadsException(
            "Location '%s' is unknown for download %s" % (location, downloads_id))

    assert download_store is not None

    return download_store
Example #9
0
def _get_amazon_s3_store() -> KeyValueStore:
    """Get lazy initialized amazon s3 store, with credentials from mediawords.yml."""
    global _amazon_s3_store

    if not _amazon_s3_store:
        config = get_config()

        if 'amazon_s3' not in config:
            raise McDBIDownloadsException(
                "Amazon S3 download store is not configured.")

        downloads_config = config['amazon_s3']['downloads']
        store_params = dict(
            access_key_id=downloads_config['access_key_id'],
            secret_access_key=downloads_config['secret_access_key'],
            bucket_name=downloads_config['bucket_name'],
            directory_name=downloads_config['directory_name'],
        )

        # Optionally wrap the S3 store with a local PostgreSQL cache table.
        if config['mediawords'].get('cache_s3_downloads', False):
            store_params['cache_table'] = S3_RAW_DOWNLOADS_CACHE_TABLE_NAME
            _amazon_s3_store = CachedAmazonS3Store(**store_params)
        else:
            _amazon_s3_store = AmazonS3Store(**store_params)

    return _amazon_s3_store
Example #10
0
def _get_amazon_s3_store() -> KeyValueStore:
    """Get lazy initialized amazon s3 store, with credentials from mediawords.yml."""
    global _amazon_s3_store

    if not _amazon_s3_store:
        config = get_config()

        if 'amazon_s3' not in config:
            raise McDBIDownloadsException("Amazon S3 download store is not configured.")

        downloads_config = config['amazon_s3']['downloads']
        store_params = dict(
            access_key_id=downloads_config['access_key_id'],
            secret_access_key=downloads_config['secret_access_key'],
            bucket_name=downloads_config['bucket_name'],
            directory_name=downloads_config['directory_name'],
        )

        # Optionally wrap the S3 store with a local PostgreSQL cache table.
        if config['mediawords'].get('cache_s3_downloads', False):
            store_params['cache_table'] = S3_RAW_DOWNLOADS_CACHE_TABLE_NAME
            _amazon_s3_store = CachedAmazonS3Store(**store_params)
        else:
            _amazon_s3_store = AmazonS3Store(**store_params)

    return _amazon_s3_store
Example #11
0
def test_set_config_file_nonexistent():
    """set_config_file() must raise McConfigException for a path that does not exist."""
    saved_config = get_config()

    missing_path = os.path.join(tempfile.mkdtemp(), 'nonexistent_configuration.yml')
    assert not os.path.exists(missing_path)

    with pytest.raises(McConfigException):
        set_config_file(missing_path)

    # Restore whatever configuration was active before the test.
    set_config(saved_config)
Example #12
0
def test_set_config_file_nonexistent():
    """set_config_file() must raise McConfigException for a path that does not exist."""
    saved_config = get_config()

    missing_path = os.path.join(tempfile.mkdtemp(), 'nonexistent_configuration.yml')
    assert not os.path.exists(missing_path)

    with pytest.raises(McConfigException):
        set_config_file(missing_path)

    # Restore whatever configuration was active before the test.
    set_config(saved_config)
Example #13
0
def rotate_supervisor_logs():
    """Rotate Supervisor child logs by generating a throw-away logrotate config and running logrotate.

    Reads the child log directory from the 'supervisor' config section, writes
    a temporary logrotate configuration for '*.log' files in that directory,
    and invokes the system `logrotate` binary against it.

    Raises:
        Exception: if the supervisor logs directory does not exist.
        subprocess.CalledProcessError: if logrotate exits with a non-zero status.
    """
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    # logrotate keeps its rotation bookkeeping next to the logs it rotates.
    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    # BUG FIX: debug message used to say "logtorate".
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    # BUG FIX: clean up the temporary config even if logrotate fails, so
    # repeated failures don't accumulate temp files.
    try:
        with os.fdopen(logrotate_temp_fd, 'w') as tmp:
            tmp.write(logrotate_config)

        l.info('Running logrotate...')
        subprocess.check_call([
            'logrotate',
            '--verbose',
            '--state', logrotate_state_file,
            logrotate_temp_config_path
        ])
    finally:
        l.debug('Cleaning up temporary logrotate config...')
        os.unlink(logrotate_temp_config_path)
Example #14
0
    def __init__(self):
        """Initialize the AP client; requires an 'associated_press' API key in the config."""
        self.api_version = '1.1'
        self.retry_limit = 5
        self.ratelimit_info = defaultdict(dict)

        # User agent with exponential backoff between retries.
        self.ua = mediawords.util.web.user_agent.UserAgent()
        self.ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256])

        config = get_config()
        if 'associated_press' in config:
            self.api_key = config['associated_press'].get('apikey')
        else:
            self.api_key = None

        if self.api_key is None:
            raise McAPMissingAPIKey(
                "API key configuration data missing for associated_press.")
Example #15
0
def _get_postgresql_store() -> KeyValueStore:
    """Get lazy initialized postgresql store, with credentials from mediawords.yml."""
    global _postgresql_store

    if _postgresql_store is None:
        config = get_config()

        pg_store = PostgreSQLStore(table=RAW_DOWNLOADS_POSTGRESQL_KVS_TABLE_NAME)

        # Optionally read-fall-back to S3 for rows that were migrated there.
        if config['mediawords'].get('fallback_postgresql_downloads_to_s3', False):
            pg_store = MultipleStoresStore(
                stores_for_reading=[pg_store, _get_amazon_s3_store()],
                stores_for_writing=[pg_store])

        _postgresql_store = pg_store

    return _postgresql_store
Example #16
0
def _get_postgresql_store() -> KeyValueStore:
    """Get lazy initialized postgresql store, with credentials from mediawords.yml."""
    global _postgresql_store

    if _postgresql_store is None:
        config = get_config()

        pg_store = PostgreSQLStore(table=RAW_DOWNLOADS_POSTGRESQL_KVS_TABLE_NAME)

        # Optionally read-fall-back to S3 for rows that were migrated there.
        if config['mediawords'].get('fallback_postgresql_downloads_to_s3', False):
            pg_store = MultipleStoresStore(
                stores_for_reading=[pg_store, _get_amazon_s3_store()],
                stores_for_writing=[pg_store])

        _postgresql_store = pg_store

    return _postgresql_store
Example #17
0
def _get_store_for_writing() -> KeyValueStore:
    """Get MultiStoresStore for writing downloads."""
    global _store_for_writing

    # Lazily build the composite store on first use, then reuse it.
    if _store_for_writing is None:
        config = get_config()

        # Early sanity check on configuration
        locations = config['mediawords'].get('download_storage_locations', [])
        if len(locations) == 0:
            raise McDBIDownloadsException("No download stores are configured.")

        stores = []
        for raw_location in locations:
            location = raw_location.lower()

            if location == 'databaseinline':
                raise McDBIDownloadsException(
                    "databaseinline location is not valid for storage")

            if location == 'postgresql':
                store = PostgreSQLStore(
                    table=RAW_DOWNLOADS_POSTGRESQL_KVS_TABLE_NAME)
            elif location in ('s3', 'amazon', 'amazon_s3'):
                store = _get_amazon_s3_store()
            else:
                raise McDBIDownloadsException("store location '" + location +
                                              "' is not valid")

            if store is None:
                raise McDBIDownloadsException("store location '" + location +
                                              "' is not configured")

            stores.append(store)

        _store_for_writing = MultipleStoresStore(stores_for_writing=stores)

    return _store_for_writing
Example #18
0
    def test_run_block_with_large_work_mem(self):
        """Verify run_block_with_large_work_mem() raises work_mem only inside the block.

        Sets 'large_work_mem' in the config, runs an INSERT inside the block and
        asserts work_mem was the large value during the block and the normal
        value before and after. Restores the original config when done.
        """
        normal_work_mem = 256  # MB
        large_work_mem = 512  # MB

        # Remember the previous setting so it can be restored at the end.
        old_large_work_mem = None
        config = get_config()
        if 'large_work_mem' in config['mediawords']:
            old_large_work_mem = config['mediawords']['large_work_mem']

        config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
        set_config(config)

        self.__db.query("SET work_mem TO %s", ('%sMB' % normal_work_mem,))

        # pg_settings reports work_mem in kB, hence the * 1024 comparisons below.
        current_work_mem = int(self.__db.query("""
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """).flat()[0])
        assert current_work_mem == normal_work_mem * 1024

        def __test_run_block_with_large_work_mem_inner():
            # Record the work_mem that is in effect while the block runs.
            self.__db.execute_with_large_work_mem("""
                INSERT INTO execute_large_work_mem (work_mem)
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """)

        self.__db.query('CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)')
        self.__db.run_block_with_large_work_mem(__test_run_block_with_large_work_mem_inner)

        # Inside the block, work_mem must have been the large value.
        statement_work_mem = int(self.__db.query("""
            SELECT work_mem FROM execute_large_work_mem
        """).flat()[0])
        assert statement_work_mem == large_work_mem * 1024

        # After the block, work_mem must be back to normal.
        current_work_mem = int(self.__db.query("""
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """).flat()[0])
        assert current_work_mem == normal_work_mem * 1024

        config['mediawords']['large_work_mem'] = old_large_work_mem
        set_config(config)
Example #19
0
    def __should_continue_with_outdated_schema(self, current_schema_version: int, target_schema_version: int) -> bool:
        """Schema is outdated / too new; returns 1 if MC should continue nevertheless, 0 otherwise.

        Continuing is allowed only when BOTH the 'ignore_schema_version' config
        flag is truthy AND the ignore-schema-version environment variable is set;
        otherwise an upgrade notice is logged and False is returned.
        """
        config = get_config()
        # `or False` normalizes a missing/None config value to a plain boolean.
        config_ignore_schema_version = config["mediawords"]["ignore_schema_version"] or False

        if config_ignore_schema_version and self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
            l.warn("""
                The current Media Cloud database schema is older than the schema present in mediawords.sql,
                but %s is set so continuing anyway.
            """ % self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
            return True
        else:
            # Tell the operator how to upgrade (or how to bypass the check).
            l.warn("""
                ################################

                The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

                The database schema currently running in the database is %(current_schema_version)s,
                and the schema version in the mediawords.sql is %(target_schema_version)s.

                Please run:

                    ./script/mediawords_upgrade_db.py --import

                to automatically upgrade the database schema to the latest version.

                If you want to connect to the Media Cloud database anyway (ignoring the schema version),
                set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

                ################################

            """ % {
                "current_schema_version": current_schema_version,
                "target_schema_version": target_schema_version,
                "IGNORE_SCHEMA_VERSION_ENV_VARIABLE": self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
            })
            return False
Example #20
0
    def setUp(self):
        """Connect to the 'test' database and create a temporary fixture table.

        Finds the database entry labelled 'test' in the config, opens a
        DatabaseHandler connection to it (stored on self.__db), and creates a
        temporary 'kardashians' table pre-populated with eight rows.
        """
        l.info("Looking for test database credentials...")
        test_database = None
        config = get_config()
        # Pick the config entry whose label is 'test'; refuse to run otherwise.
        for database in config['database']:
            if database['label'] == 'test':
                test_database = database
                break
        assert test_database is not None

        l.info("Connecting to test database '%s' via DatabaseHandler class..." % test_database['db'])
        self.__db = DatabaseHandler(host=test_database['host'],
                                    port=test_database['port'],
                                    username=test_database['user'],
                                    password=test_database['pass'],
                                    database=test_database['db'])

        # TEMPORARY table: dropped automatically when the connection closes.
        l.info("Preparing test table 'kardashians'...")
        self.__db.query("""
            CREATE TEMPORARY TABLE kardashians (
                id SERIAL PRIMARY KEY NOT NULL,
                name VARCHAR UNIQUE NOT NULL,   -- UNIQUE to test find_or_create()
                surname TEXT NOT NULL,
                dob DATE NOT NULL,
                married_to_kanye BOOL NOT NULL DEFAULT 'f'
            )
        """)
        self.__db.query("""
            INSERT INTO kardashians (name, surname, dob, married_to_kanye) VALUES
            ('Kris', 'Jenner', '1955-11-05'::DATE, 'f'),          -- id=1
            ('Caitlyn', 'Jenner', '1949-10-28'::DATE, 'f'),       -- id=2
            ('Kourtney', 'Kardashian', '1979-04-18'::DATE, 'f'),  -- id=3
            ('Kim', 'Kardashian', '1980-10-21'::DATE, 't'),       -- id=4
            ('Khloé', 'Kardashian', '1984-06-27'::DATE, 'f'),     -- id=5
            ('Rob', 'Kardashian', '1987-03-17'::DATE, 'f'),       -- id=6
            ('Kendall', 'Jenner', '1995-11-03'::DATE, 'f'),       -- id=7
            ('Kylie', 'Jenner', '1997-08-10'::DATE, 'f')          -- id=8
        """)
Example #21
0
def test_get_config():
    """Sanity-check that the loaded configuration contains the expected keys."""
    config = get_config()
    for top_level_key in ('database', 'mediawords'):
        assert top_level_key in config
    assert 'data_dir' in config['mediawords']
Example #22
0
def test_get_config():
    """Sanity-check that the loaded configuration contains the expected keys."""
    config = get_config()
    for top_level_key in ('database', 'mediawords'):
        assert top_level_key in config
    assert 'data_dir' in config['mediawords']
Example #23
0
    def setUp(self) -> None:
        """Set self.config and assign dummy values for amazon_s3."""
        live_config = get_config()
        self.config = live_config
        # Keep a pristine deep copy so the original config can be restored later.
        self.save_config = copy.deepcopy(live_config)

        self._setup_amazon_s3_config()
    def setUp(self) -> None:
        """Set self.config and assign dummy values for amazon_s3."""
        live_config = get_config()
        self.config = live_config
        # Keep a pristine deep copy so the original config can be restored later.
        self.save_config = copy.deepcopy(live_config)

        self._setup_amazon_s3_config()