Ejemplo n.º 1
0
def _get_store(db: DatabaseHandler, object_type: str) -> "mediawords.key_value_store.KeyValueStore":
    """Build the key-value store selected by the MC_PUBLIC_STORE_TYPE environment variable.

    Production systems should use s3, because urls generated by this module will only work for the s3 store.
    The postgresql store is only for testing.

    :param db: database handle; only used to resolve the directory name for the s3 store.
    :param object_type: object type; becomes the table name suffix (postgresql) or part of the
        directory name (s3).
    :return: a PostgreSQLStore or an AmazonS3Store, depending on MC_PUBLIC_STORE_TYPE.
    :raises McPublicStoreUnknownType: if MC_PUBLIC_STORE_TYPE is neither 'postgresql' nor 's3'.
    """
    store_type = env_value("MC_PUBLIC_STORE_TYPE")

    if store_type == 'postgresql':
        return PostgreSQLStore(table='public_store.%s' % object_type)

    if store_type == 's3':
        access_key_id = env_value("MC_PUBLIC_AMAZON_S3_ACCESS_KEY_ID")
        secret_access_key = env_value("MC_PUBLIC_AMAZON_S3_SECRET_ACCESS_KEY")
        bucket_name = env_value("MC_PUBLIC_AMAZON_S3_BUCKET_NAME")
        directory_name = _get_directory_name(db, object_type)

        return AmazonS3Store(
            access_key_id=access_key_id,
            secret_access_key=secret_access_key,
            bucket_name=bucket_name,
            directory_name=directory_name,
            compression_method=mediawords.key_value_store.KeyValueStore.Compression.GZIP,
        )

    # The exception must be raised, not returned: returning it would hand callers an exception
    # object that they would then try to use as a store.
    raise McPublicStoreUnknownType(f'unknown value for MC_PUBLIC_STORE_TYPE: {store_type}')
Ejemplo n.º 2
0
def test_env_value_required():
    """An unset variable raises by default and yields None when required=False."""
    missing_name = random_string(length=16)

    # Default lookup treats the variable as required, so an unset name must fail.
    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=missing_name)

    optional_result = env_value(name=missing_name, required=False)
    assert optional_result is None
Ejemplo n.º 3
0
def test_env_value_empty_string():
    """A variable set to '' raises by default and is returned as '' with allow_empty_string=True."""
    blank_name = random_string(length=16)
    os.environ[blank_name] = ''

    # Without allow_empty_string, an empty value is reported the same way as an unset one.
    with pytest.raises(McConfigEnvironmentVariableUnsetException):
        env_value(name=blank_name)

    permitted_result = env_value(name=blank_name, allow_empty_string=True)
    assert permitted_result == ''
Ejemplo n.º 4
0
def get_object_hash(object_id: str) -> int:
    """Hash the object_id with a salt so that it is not discoverable.

    The salt is read from MC_PUBLIC_STORE_SALT and the MD5 digest of "<salt>-<object_id>" is
    interpreted as an integer.

    :param object_id: identifier of the object to hash.
    :return: the full MD5 value as an int, or only its low bits when MC_PUBLIC_STORE_TYPE is
        'postgresql'.
    """
    salt = env_value('MC_PUBLIC_STORE_SALT')
    store_type = env_value('MC_PUBLIC_STORE_TYPE')

    salted_key = f"{salt}-{object_id}"
    digest_hex = hashlib.md5(salted_key.encode('utf-8')).hexdigest()
    full_hash = int(digest_hex, 16)

    if store_type != 'postgresql':
        return full_hash

    # The postgresql store cannot hold the whole hash, so keep only the low bits.
    # NOTE(review): the mask has 15 hex digits, i.e. 60 bits, not the 64 bits the previous
    # comment claimed. Presumably intentional so the value fits a signed 64-bit column -- confirm
    # before widening, since changing the mask would change every stored hash.
    return full_hash & 0xFFFFFFFFFFFFFFF
Ejemplo n.º 5
0
def _get_s3_store() -> "AmazonS3Store":
    """Get the amazon s3 store.

    All connection settings are read from the MC_PUBLIC_AMAZON_S3_* environment variables.

    :return: AmazonS3Store configured from the environment.
    """
    access_key_id = env_value("MC_PUBLIC_AMAZON_S3_ACCESS_KEY_ID")
    secret_access_key = env_value("MC_PUBLIC_AMAZON_S3_SECRET_ACCESS_KEY")
    bucket_name = env_value("MC_PUBLIC_AMAZON_S3_BUCKET_NAME")
    directory_name = env_value("MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME")

    # The compression method has to be passed as a keyword argument ("name=value"); the previous
    # "name: Type" form is not valid call syntax.
    # NOTE(review): GZIP is chosen to match the other public-store factory (_get_store) -- confirm.
    return AmazonS3Store(
        access_key_id=access_key_id,
        secret_access_key=secret_access_key,
        bucket_name=bucket_name,
        directory_name=directory_name,
        compression_method=mediawords.key_value_store.KeyValueStore.Compression.GZIP,
    )
Ejemplo n.º 6
0
 def parallel_get_num_parallel() -> int:
     """Parallel connection count.

     Read from MC_USERAGENT_PARALLEL_GET_NUM_PARALLEL; falls back to 10 when the lookup
     returns None.
     """
     configured = env_value('MC_USERAGENT_PARALLEL_GET_NUM_PARALLEL', required=False)
     return 10 if configured is None else int(configured)
Ejemplo n.º 7
0
 def parallel_get_per_domain_timeout() -> int:
     """Per-domain timeout, in seconds.

     Read from MC_USERAGENT_PARALLEL_GET_PER_DOMAIN_TIMEOUT; any falsy result (None or an
     empty string) falls back to 1.
     """
     configured = env_value('MC_USERAGENT_PARALLEL_GET_PER_DOMAIN_TIMEOUT', required=False)
     if configured:
         return int(configured)
     return 1
Ejemplo n.º 8
0
def test_env_value():
    """A variable that is set is returned verbatim by env_value()."""
    name = random_string(length=16)
    expected = random_string(length=16)
    os.environ[name] = expected

    actual = env_value(name=name)
    assert actual == expected
Ejemplo n.º 9
0
 def unsubscribe_address() -> str:
     """Email to which unsubscribe/account deletion requests should be sent.

     Uses MC_EMAIL_UNSUBSCRIBE when it is set and contains an '@'; otherwise returns the
     built-in fallback address.
     """
     configured = env_value('MC_EMAIL_UNSUBSCRIBE', required=False, allow_empty_string=True)
     if configured is not None and '@' in configured:
         return configured
     return '*****@*****.**'
Ejemplo n.º 10
0
 def read_all_from_s3() -> bool:
     """Whether or not to read all non-inline downloads from S3.

     Read from MC_DOWNLOADS_READ_ALL_FROM_S3 and parsed as an integer flag. An unset or empty
     variable means False.

     :raises ValueError: if the variable holds a non-empty value that is not an integer.
     """
     value = env_value('MC_DOWNLOADS_READ_ALL_FROM_S3',
                       required=False,
                       allow_empty_string=True)
     # The lookup explicitly allows '', and int('') raises ValueError, so an empty value has to
     # be treated like an unset one instead of being passed to int().
     if not value:
         return False
     return bool(int(value))
Ejemplo n.º 11
0
 def cache_s3() -> bool:
     """Whether to enable local Amazon S3 download cache.

     Read from MC_DOWNLOADS_CACHE_S3 and parsed as an integer flag. An unset or empty variable
     means False.

     :raises ValueError: if the variable holds a non-empty value that is not an integer.
     """
     value = env_value('MC_DOWNLOADS_CACHE_S3',
                       required=False,
                       allow_empty_string=True)
     # The lookup explicitly allows '', and int('') raises ValueError, so an empty value has to
     # be treated like an unset one instead of being passed to int().
     if not value:
         return False
     return bool(int(value))
Ejemplo n.º 12
0
 def blacklist_url_pattern() -> Optional[Pattern]:
     """URL pattern for which we should fail all of the HTTP(s) requests.

     Read from MC_USERAGENT_BLACKLIST_URL_PATTERN and compiled case-insensitively with Unicode
     matching; returns None when the variable is unset or empty.
     """
     raw_pattern = env_value('MC_USERAGENT_BLACKLIST_URL_PATTERN', required=False, allow_empty_string=True)
     if not raw_pattern:
         return None
     return re.compile(raw_pattern, flags=re.IGNORECASE | re.UNICODE)
Ejemplo n.º 13
0
    def fallback_postgresql_to_s3() -> bool:
        """Whether to fallback PostgreSQL downloads to Amazon S3.

        If the download doesn't exist in PostgreSQL storage, S3 will be tried instead.

        Read from MC_DOWNLOADS_FALLBACK_POSTGRESQL_TO_S3 and parsed as an integer flag. An unset
        or empty variable means False.

        :raises ValueError: if the variable holds a non-empty value that is not an integer.
        """
        value = env_value('MC_DOWNLOADS_FALLBACK_POSTGRESQL_TO_S3', required=False, allow_empty_string=True)
        # The lookup explicitly allows '', and int('') raises ValueError, so an empty value has
        # to be treated like an unset one instead of being passed to int().
        if not value:
            return False
        return bool(int(value))
Ejemplo n.º 14
0
def _get_api_key() -> str:
    """Fetch the bw api key or use the cached one.

    To get a bw api key, you have to make an api call with the user and password, but the api key only lasts for
    a year, so we just get it and then cache it in a static variable, assuming that each run time will restart at least
    once a year.

    Credentials are read from MC_BRANDWATCH_USER and MC_BRANDWATCH_PASSWORD.

    :return: the access token from the Brandwatch OAuth response.
    :raises McPostsBWTwitterDataException: if the request fails or the response has no 'access_token'.
    """
    # The key is cached as an attribute on the function object itself.
    if hasattr(_get_api_key, "api_key"):
        return _get_api_key.api_key

    user = env_value('MC_BRANDWATCH_USER')
    password = env_value('MC_BRANDWATCH_PASSWORD')

    # Only the user name is logged; the password must never be written to the logs.
    log.debug(f"user: {user}")

    ua = _get_user_agent()

    url = (
        "https://api.brandwatch.com/oauth/token?username=%s&grant_type=api-password&client_id=brandwatch-api-client"
        % (quote(user)))

    request = Request(method='POST', url=url)
    request.set_content_type(
        'application/x-www-form-urlencoded; charset=utf-8')
    # The password travels in the POST body rather than in the URL.
    request.set_content({'password': password})

    response = ua.request(request)

    if not response.is_success():
        raise McPostsBWTwitterDataException("error fetching posts: " +
                                            response.decoded_content())

    # Named response_body rather than "json" so the standard module name is not shadowed.
    response_body = response.decoded_content()

    data = dict(decode_json(response_body))

    try:
        _get_api_key.api_key = data['access_token']
    except KeyError as ex:
        # A plain dict lookup can only fail with KeyError, so there is no need for a bare except
        # that would also swallow KeyboardInterrupt / SystemExit.
        raise McPostsBWTwitterDataException(
            "error parsing ouath response: '%s'" % response_body) from ex

    return _get_api_key.api_key
Ejemplo n.º 15
0
 def storage_locations() -> List[str]:
     """Download storage locations.

     Read from MC_DOWNLOADS_STORAGE_LOCATIONS as a ';'-separated list; each entry is stripped
     of surrounding whitespace. Defaults to ['postgresql'] when the variable is unset.

     :return: list of location names; empty when the value is blank.
     """
     value = env_value('MC_DOWNLOADS_STORAGE_LOCATIONS', required=False)
     if value is None:
         value = 'postgresql'
     locations = [location.strip() for location in value.split(';')]
     # str.split() always returns at least one element, so a blank value shows up as [''] and
     # never as []. The previous "len(locations) == 0" check could therefore never match.
     if locations == ['']:
         locations = []
     return locations
Ejemplo n.º 16
0
 def topic_alert_emails() -> List[str]:
     """List of emails to which to send all topic alerts.

     Read from MC_TOPICS_BASE_TOPIC_ALERT_EMAILS as a ','-separated list; each entry is
     stripped of surrounding whitespace. A built-in default list is used when the variable is
     unset.

     :return: list of email addresses; empty when the variable is set to a blank value.
     """
     emails = env_value('MC_TOPICS_BASE_TOPIC_ALERT_EMAILS', required=False, allow_empty_string=True)
     if emails is None:
         emails = "[email protected], [email protected]"
     emails = [email.strip() for email in emails.split(',')]
     # str.split() always returns at least one element, so an empty value shows up as [''] and
     # never as []. The previous "len(emails) == 0" check could therefore never match.
     if emails == ['']:
         emails = []
     return emails
Ejemplo n.º 17
0
def _get_directory_name(db, object_type: str) -> str:
    """Get the directory name either from the env var or from the database.

    :param db: database handle, used only when the environment variable is not set.
    :param object_type: object type, appended to the directory name as a sub-path.
    :return: "<directory name>/<object_type>".
    """
    # MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME should be unique for production to prevent overwriting
    try:
        base_directory = env_value("MC_PUBLIC_AMAZON_S3_DIRECTORY_NAME")
    except McConfigEnvironmentVariableUnsetException:
        # No directory configured: fall back to the test directory name kept in the database.
        base_directory = _get_test_directory_name_from_db(db)

    return f'{base_directory}/{object_type}'
Ejemplo n.º 18
0
 def _default_path_prefix(self) -> str:
     """Return the value of the MC_PODCAST_TRANSCRIPTS_PATH_PREFIX environment variable."""
     path_prefix = env_value(name='MC_PODCAST_TRANSCRIPTS_PATH_PREFIX')
     return path_prefix
Ejemplo n.º 19
0
 def directory_name() -> str:
     """Directory name (prefix).

     Read from MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME; an empty string is accepted.
     """
     prefix = env_value('MC_DOWNLOADS_AMAZON_S3_DIRECTORY_NAME', allow_empty_string=True)
     return prefix
Ejemplo n.º 20
0
 def bucket_name() -> str:
     """Bucket name, read from MC_DOWNLOADS_AMAZON_S3_BUCKET_NAME."""
     bucket = env_value('MC_DOWNLOADS_AMAZON_S3_BUCKET_NAME')
     return bucket
Ejemplo n.º 21
0
 def secret_access_key() -> str:
     """Secret access key, read from MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY."""
     secret = env_value('MC_DOWNLOADS_AMAZON_S3_SECRET_ACCESS_KEY')
     return secret
Ejemplo n.º 22
0
 def access_key_id() -> str:
     """Access key ID, read from MC_DOWNLOADS_AMAZON_S3_ACCESS_KEY_ID."""
     key_id = env_value('MC_DOWNLOADS_AMAZON_S3_ACCESS_KEY_ID')
     return key_id
Ejemplo n.º 23
0
 def email_from_address() -> str:
     """'From:' email address when sending emails.

     Read from MC_EMAIL_FROM_ADDRESS; the built-in fallback address is returned when the
     lookup yields None.
     """
     configured = env_value('MC_EMAIL_FROM_ADDRESS', required=False)
     return '*****@*****.**' if configured is None else configured
Ejemplo n.º 24
0
 def parallel_get_timeout() -> int:
     """Connection timeout, in seconds.

     Read from MC_USERAGENT_PARALLEL_GET_TIMEOUT; falls back to 90 when the lookup returns
     None.
     """
     configured = env_value('MC_USERAGENT_PARALLEL_GET_TIMEOUT', required=False)
     return 90 if configured is None else int(configured)
Ejemplo n.º 25
0
 def authenticated_domains() -> List[AuthenticatedDomain]:
     """List of authenticated domains.

     The raw value of MC_USERAGENT_AUTHENTICATED_DOMAINS (possibly None or empty) is handed
     to _authenticated_domains_from_json() for parsing.
     """
     domains_json = env_value('MC_USERAGENT_AUTHENTICATED_DOMAINS', required=False, allow_empty_string=True)
     return _authenticated_domains_from_json(domains_json)
Ejemplo n.º 26
0
 def univision_client_id() -> Optional[str]:
     """Univision API client ID.

     Read from MC_UNIVISION_CLIENT_ID; the variable is optional and may be empty.
     """
     client_id = env_value(name='MC_UNIVISION_CLIENT_ID', required=False, allow_empty_string=True)
     return client_id
Ejemplo n.º 27
0
 def univision_client_secret() -> Optional[str]:
     """Univision API client secret (secret key).

     Read from MC_UNIVISION_CLIENT_SECRET; the variable is optional and may be empty.
     """
     client_secret = env_value(name='MC_UNIVISION_CLIENT_SECRET', required=False, allow_empty_string=True)
     return client_secret
Ejemplo n.º 28
0
    def tag_set() -> str:
        """NYTLabels tag set name, read from MC_NYTLABELS_TAG_SET.

        NOTE(review): the previous docstring described the version tag and looked copy-pasted
        from version_tag(); confirm what this tag set is used for.
        """
        tag_set_name = env_value('MC_NYTLABELS_TAG_SET')
        return tag_set_name
Ejemplo n.º 29
0
 def _default_bucket_name(self) -> str:
     """Return the value of the MC_PODCAST_TRANSCRIPTS_BUCKET_NAME environment variable."""
     bucket = env_value(name='MC_PODCAST_TRANSCRIPTS_BUCKET_NAME')
     return bucket
Ejemplo n.º 30
0
    def version_tag() -> str:
        """NYTLabels version tag, e.g. "nyt_labeller_v1.0.0".

        Will be added under "geocoder_version" tag set.

        Read from MC_NYTLABELS_VERSION_TAG.
        """
        tag = env_value('MC_NYTLABELS_VERSION_TAG')
        return tag