Example #1
    def __get_large_work_mem(self) -> str:
        config = py_get_config()
        if 'large_work_mem' in config['mediawords']:
            work_mem = config['mediawords']['large_work_mem']
        else:
            work_mem = self.__get_current_work_mem()
        return work_mem
Example #2
    def annotator_is_enabled(self) -> bool:
        config = py_get_config()

        if config.get('nytlabels', {}).get('enabled', False):
            return True
        else:
            return False
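A minimal standalone sketch of the same pattern, using a hand-written dict in place of whatever py_get_config() returns: the chained .get() calls fall back to an empty dict and then to False, so a missing 'nytlabels' section simply reports the annotator as disabled.

config = {'nytlabels': {'enabled': True, 'annotator_url': 'http://localhost:8080/predict.json'}}

def annotator_is_enabled(config: dict) -> bool:
    # Chained .get() with defaults: a missing section or key means "disabled".
    return bool(config.get('nytlabels', {}).get('enabled', False))

assert annotator_is_enabled(config) is True
assert annotator_is_enabled({}) is False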
Example #3
    def __get_domain_http_auth_lookup() -> Dict[str, Dict[str, str]]:
        """Read the mediawords.crawler_authenticated_domains list from mediawords.yml and generate a lookup hash with
        the host domain as the key and the user:password credentials as the value."""
        config = py_get_config()
        domain_http_auth_lookup = {}

        domains = None
        if 'crawler_authenticated_domains' in config['mediawords']:
            domains = config['mediawords']['crawler_authenticated_domains']

        if domains is not None:
            for domain in domains:

                if 'domain' not in domain:
                    raise McCrawlerAuthenticatedDomainsException(
                        '"domain" is not present in HTTP auth configuration.')
                if 'user' not in domain:
                    raise McCrawlerAuthenticatedDomainsException(
                        '"user" is not present in HTTP auth configuration.')
                if 'password' not in domain:
                    raise McCrawlerAuthenticatedDomainsException(
                        '"password" is not present in HTTP auth configuration.'
                    )

                domain_http_auth_lookup[domain['domain'].lower()] = domain

        return domain_http_auth_lookup
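As a rough sketch of the lookup this method builds, assume a hand-written crawler_authenticated_domains fragment (the real one lives in mediawords.yml); each entry is keyed by its lower-cased domain and kept whole so both user and password stay available:

domains = [
    {'domain': 'Example.com', 'user': 'alice', 'password': 'secret'},
    {'domain': 'news.example.org', 'user': 'bob', 'password': 'hunter2'},
]

domain_http_auth_lookup = {}
for domain in domains:
    # The method above raises if 'domain', 'user' or 'password' is missing.
    domain_http_auth_lookup[domain['domain'].lower()] = domain

assert domain_http_auth_lookup['example.com']['user'] == 'alice'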
Example #4
    def __init__(self,
                 to: Union[str, List[str]],
                 subject: str,
                 text_body: str,
                 html_body: Optional[str] = None,
                 cc: Optional[Union[str, List[str]]] = None,
                 bcc: Optional[Union[str, List[str]]] = None):
        """Email message constructor."""

        config = py_get_config()
        self.from_ = config['mail']['from_address']

        self.subject = decode_object_from_bytes_if_needed(subject)
        self.text_body = decode_object_from_bytes_if_needed(text_body)
        self.html_body = decode_object_from_bytes_if_needed(html_body)

        self.to = decode_object_from_bytes_if_needed(to)
        if isinstance(self.to, str):
            self.to = [self.to]

        self.cc = decode_object_from_bytes_if_needed(cc)
        if isinstance(self.cc, str):
            self.cc = [self.cc]

        self.bcc = decode_object_from_bytes_if_needed(bcc)
        if isinstance(self.bcc, str):
            self.bcc = [self.bcc]
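The one non-obvious step is normalizing to/cc/bcc so that a single address and a list of addresses end up in the same shape; a self-contained sketch of that normalization, with hypothetical addresses:

from typing import List, Optional, Union

def as_address_list(value: Optional[Union[str, List[str]]]) -> Optional[List[str]]:
    # A lone string becomes a one-element list; None and lists pass through unchanged.
    if isinstance(value, str):
        return [value]
    return value

assert as_address_list('a@example.com') == ['a@example.com']
assert as_address_list(['a@example.com', 'b@example.com']) == ['a@example.com', 'b@example.com']
assert as_address_list(None) is None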
Example #5
    def __init__(self):
        """Constructor."""

        # "requests" session to carry the cookie pool around
        self.__session = requests.Session()

        config = py_get_config()
        self.__session.headers.update({
            'From': config['mediawords']['owner'],
            'User-Agent': config['mediawords']['user_agent'],
            'Accept-Charset': 'utf-8',

            # MC_REWRITE_TO_PYTHON:
            #
            # Disable keep-alive (and fancy requests' connection pooling) because the rudimentary HTTP server used
            # for Perl unit tests doesn't support it (but then maybe we don't want keep-alive anyway)
            'Connection': 'close',
        })

        self.set_max_redirect(self.__DEFAULT_MAX_REDIRECT)

        self.__timeout = None
        self.set_timeout(self.__DEFAULT_TIMEOUT)

        self.__max_size = None
        self.set_max_size(self.__DEFAULT_MAX_SIZE)

        # Disable retries by default; if client wants those, it should call
        # timing() itself, e.g. set it to '1,2,4,8'
        self.__timing = None
        self.set_timing(None)
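A standalone sketch of the same session setup using the requests library directly; the From and User-Agent values below are placeholders for what config['mediawords'] would supply.

import requests

session = requests.Session()
session.headers.update({
    'From': 'owner@example.com',          # placeholder for config['mediawords']['owner']
    'User-Agent': 'mediacloud-bot/1.0',   # placeholder for config['mediawords']['user_agent']
    'Accept-Charset': 'utf-8',
    'Connection': 'close',                # keep-alive disabled, as in the constructor above
})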
Example #6
    def _request_for_text(self, text: str) -> Request:

        text = decode_object_from_bytes_if_needed(text)

        # CLIFF annotator URL
        config = py_get_config()
        url = config.get('nytlabels', {}).get('annotator_url', None)
        if url is None:
            raise McNYTLabelsAnnotatorException(
                "Unable to determine NYTLabels annotator URL to use.")

        # Create JSON request
        log.debug("Converting text to JSON request...")
        try:
            text_json = encode_json({'text': text})
        except Exception as ex:
            # Not critical, might happen to some stories, no need to shut down the annotator
            raise McNYTLabelsAnnotatorException(
                "Unable to encode text to a JSON request: %(exception)s\nText: %(text)s"
                % {
                    'exception': str(ex),
                    'text': text,
                })
        log.debug("Done converting text to JSON request.")

        request = Request(method='POST', url=url)
        request.set_content_type('application/json; charset=utf-8')
        request.set_content(text_json)

        return request
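A rough standalone sketch of the request-body construction, with the standard json module standing in for the project's encode_json() helper and a generic exception in place of McNYTLabelsAnnotatorException:

import json

text = 'Some story text to annotate.'

try:
    text_json = json.dumps({'text': text})
except Exception as ex:
    # Not critical for the annotator as a whole; only this story fails.
    raise RuntimeError("Unable to encode text to a JSON request: %s\nText: %s" % (ex, text))

# text_json would then be POSTed with Content-Type "application/json; charset=utf-8".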
Example #7
    def __blacklist_request_if_needed(request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        blacklist_url_pattern = None
        if 'blacklist_url_pattern' in config['mediawords']:
            blacklist_url_pattern = config['mediawords'][
                'blacklist_url_pattern']

        if blacklist_url_pattern is not None and len(
                blacklist_url_pattern) > 0:
            if re.search(pattern=blacklist_url_pattern,
                         string=url,
                         flags=re.IGNORECASE | re.UNICODE):
                request.set_url("http://blacklistedsite.localhost/%s" % url)

        return request
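A self-contained sketch of the rewrite itself, with an assumed blacklist pattern (the real one comes from mediawords.blacklist_url_pattern in mediawords.yml):

import re

blacklist_url_pattern = r'\bexample\.com\b'   # assumed pattern, for illustration only
url = 'http://www.example.com/secret-page'

if blacklist_url_pattern and re.search(pattern=blacklist_url_pattern,
                                       string=url,
                                       flags=re.IGNORECASE | re.UNICODE):
    url = "http://blacklistedsite.localhost/%s" % url

print(url)   # http://blacklistedsite.localhost/http://www.example.com/secret-page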
Example #8
def get_test_s3_credentials() -> Union[dict, None]:
    """Return test Amazon S3 credentials as a dictionary or None if credentials are not configured."""

    config = py_get_config()

    credentials = None

    # Environment variables
    if os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID') is not None:
        credentials = {
            'access_key_id': os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID', None),
            'secret_access_key': os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY', None),
            'bucket_name': os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME', None),
            'directory_name': os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME', None),
        }

    # mediawords.yml
    elif 'amazon_s3' in config and 'test' in config['amazon_s3']:
        credentials = copy.deepcopy(config['amazon_s3']['test'])

    # We want to be able to run S3 tests in parallel
    if credentials is not None:
        credentials['directory_name'] = credentials['directory_name'] + '-' + random_string(64)

    return credentials
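The environment-variable branch can be exercised without a mediawords.yml at all; a sketch with dummy values for the variables the helper reads (the names are taken from the code above):

import os

os.environ['MC_AMAZON_S3_TEST_ACCESS_KEY_ID'] = 'AKIAEXAMPLE'        # dummy values
os.environ['MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY'] = 'not-a-real-key'
os.environ['MC_AMAZON_S3_TEST_BUCKET_NAME'] = 'mc-test-bucket'
os.environ['MC_AMAZON_S3_TEST_DIRECTORY_NAME'] = 'unit-tests'

credentials = {
    'access_key_id': os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID'),
    'secret_access_key': os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY'),
    'bucket_name': os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME'),
    'directory_name': os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME'),
}
assert credentials['bucket_name'] == 'mc-test-bucket'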
Example #9
    def __get_large_work_mem(self) -> str:
        config = py_get_config()
        if 'large_work_mem' in config['mediawords']:
            work_mem = config['mediawords']['large_work_mem']
        else:
            work_mem = self.__get_current_work_mem()
        return work_mem
Example #10
    def __init__(self):
        """Constructor."""

        # "requests" session to carry the cookie pool around
        self.__session = requests.Session()

        config = py_get_config()
        self.__session.headers.update({
            'From': config['mediawords']['owner'],
            'User-Agent': config['mediawords']['user_agent'],
            'Accept-Charset': 'utf-8',

            # MC_REWRITE_TO_PYTHON:
            #
            # Disable keep-alive (and fancy requests' connection pooling) because the rudimentary HTTP server used
            # for Perl unit tests doesn't support it (but then maybe we don't want keep-alive anyway)
            'Connection': 'close',
        })

        self.set_max_redirect(self.__DEFAULT_MAX_REDIRECT)

        self.__timeout = None
        self.set_timeout(self.__DEFAULT_TIMEOUT)

        self.__max_size = None
        self.set_max_size(self.__DEFAULT_MAX_SIZE)

        # Disable retries by default; if client wants those, it should call
        # timing() itself, e.g. set it to '1,2,4,8'
        self.__timing = None
        self.set_timing(None)
Example #11
def get_test_s3_credentials() -> Union[dict, None]:
    """Return test Amazon S3 credentials as a dictionary or None if credentials are not configured."""

    config = py_get_config()

    credentials = None

    # Environment variables
    if os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID') is not None:
        credentials = {
            'access_key_id': os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID', None),
            'secret_access_key': os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY', None),
            'bucket_name': os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME', None),
            'directory_name': os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME', None),
        }

    # mediawords.yml
    elif 'amazon_s3' in config and 'test' in config['amazon_s3']:
        credentials = copy.deepcopy(config['amazon_s3']['test'])

    # We want to be able to run S3 tests in parallel
    if credentials is not None:
        credentials['directory_name'] = credentials['directory_name'] + '-' + random_string(64)

    return credentials
Example #12
    def __get_domain_http_auth_lookup() -> Dict[str, Dict[str, str]]:
        """Read the mediawords.crawler_authenticated_domains list from mediawords.yml and generate a lookup hash with
        the host domain as the key and the user:password credentials as the value."""
        config = py_get_config()
        domain_http_auth_lookup = {}

        domains = None
        if 'crawler_authenticated_domains' in config['mediawords']:
            domains = config['mediawords']['crawler_authenticated_domains']

        if domains is not None:
            for domain in domains:

                if 'domain' not in domain:
                    raise McCrawlerAuthenticatedDomainsException(
                        '"domain" is not present in HTTP auth configuration.'
                    )
                if 'user' not in domain:
                    raise McCrawlerAuthenticatedDomainsException(
                        '"user" is not present in HTTP auth configuration.'
                    )
                if 'password' not in domain:
                    raise McCrawlerAuthenticatedDomainsException(
                        '"password" is not present in HTTP auth configuration.'
                    )

                domain_http_auth_lookup[domain['domain'].lower()] = domain

        return domain_http_auth_lookup
Example #13
    def __init__(self, job_class: Type[AbstractJob]):
        """Return job broker (Celery app object) prepared for the specific job class."""

        if job_class is None:
            raise McJobBrokerAppException("Job class is None.")

        queue_name = job_class.queue_name()
        if queue_name is None:
            raise McJobBrokerAppException("Queue name is None.")
        if len(queue_name) == 0:
            raise McJobBrokerAppException("Queue name is empty.")

        config = py_get_config()
        rabbitmq_config = config.get('job_manager',
                                     {}).get('rabbitmq',
                                             {}).get('client', None)
        if rabbitmq_config is None:
            raise McJobBrokerAppException(
                "No supported job broker is configured.")

        broker_uri = 'amqp://%(username)s:%(password)s@%(hostname)s:%(port)d/%(vhost)s' % {
            'username': rabbitmq_config['username'],
            'password': rabbitmq_config['password'],
            'hostname': rabbitmq_config['hostname'],
            'port': int(rabbitmq_config['port']),
            'vhost': rabbitmq_config['vhost'],
        }

        super().__init__(queue_name, broker=broker_uri)

        self.conf.broker_connection_timeout = int(rabbitmq_config['timeout'])

        worker_concurrency = config.get('celery', {}).get(
            job_class.__name__, {}).get('worker_concurrency', 1)
        self.conf.worker_concurrency = worker_concurrency

        # Fetch only one job at a time
        self.conf.worker_prefetch_multiplier = 1

        self.conf.worker_max_tasks_per_child = 1000

        queue = Queue(name=queue_name,
                      exchange=Exchange(queue_name),
                      routing_key=queue_name,
                      queue_arguments={
                          'x-max-priority': 3,
                          'x-queue-mode': 'lazy',
                      })
        self.conf.task_queues = [queue]

        def __route_task(name, args, kwargs, options, task=None, **kw):
            return {'queue': name, 'exchange': name, 'routing_key': name}

        self.conf.task_routes = (__route_task, )

        task = job_class()
        self.__task = self.register_task(task.celery_task())

        self.__job_class = job_class
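The broker URI construction in isolation, with made-up credentials standing in for the job_manager.rabbitmq.client block of mediawords.yml:

rabbitmq_config = {
    'username': 'mediacloud',   # placeholder values for illustration
    'password': 'secret',
    'hostname': 'localhost',
    'port': 5672,
    'vhost': '/mediacloud',
    'timeout': 60,
}

broker_uri = 'amqp://%(username)s:%(password)s@%(hostname)s:%(port)d/%(vhost)s' % {
    'username': rabbitmq_config['username'],
    'password': rabbitmq_config['password'],
    'hostname': rabbitmq_config['hostname'],
    'port': int(rabbitmq_config['port']),
    'vhost': rabbitmq_config['vhost'],
}

print(broker_uri)   # amqp://mediacloud:secret@localhost:5672//mediacloud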
Example #14
    def __init__(self, job_class: Type[AbstractJob]):
        """Return job broker (Celery app object) prepared for the specific job class."""

        if job_class is None:
            raise McJobBrokerAppException("Job class is None.")

        queue_name = job_class.queue_name()
        if queue_name is None:
            raise McJobBrokerAppException("Queue name is None.")
        if len(queue_name) == 0:
            raise McJobBrokerAppException("Queue name is empty.")

        config = py_get_config()
        rabbitmq_config = config.get('job_manager', {}).get('rabbitmq', {}).get('client', None)
        if rabbitmq_config is None:
            raise McJobBrokerAppException("No supported job broker is configured.")

        broker_uri = 'amqp://%(username)s:%(password)s@%(hostname)s:%(port)d/%(vhost)s' % {
            'username': rabbitmq_config['username'],
            'password': rabbitmq_config['password'],
            'hostname': rabbitmq_config['hostname'],
            'port': int(rabbitmq_config['port']),
            'vhost': rabbitmq_config['vhost'],
        }

        super().__init__(queue_name, broker=broker_uri)

        self.conf.broker_connection_timeout = int(rabbitmq_config['timeout'])

        # Concurrency is done by Supervisor, not Celery itself
        self.conf.worker_concurrency = 1

        # Fetch only one job at a time
        self.conf.worker_prefetch_multiplier = 1

        self.conf.worker_max_tasks_per_child = 1000

        queue = Queue(name=queue_name,
                      exchange=Exchange(queue_name),
                      routing_key=queue_name,
                      queue_arguments={
                          'x-max-priority': 3,
                          'x-queue-mode': 'lazy',
                      })
        self.conf.task_queues = [queue]

        # noinspection PyUnusedLocal
        def __route_task(name, args_, kwargs_, options_, task_=None, **kw_):
            return {'queue': name, 'exchange': name, 'routing_key': name}

        self.conf.task_routes = (__route_task,)

        task = job_class()
        self.__task = self.register_task(task.celery_task())

        self.__job_class = job_class
Example #15
    def __should_continue_with_outdated_schema(
            self, current_schema_version: int,
            target_schema_version: int) -> bool:
        """Schema is outdated / too new; returns 1 if MC should continue nevertheless, 0 otherwise"""
        config = py_get_config()

        config_ignore_schema_version = False
        if 'ignore_schema_version' in config['mediawords']:
            config_ignore_schema_version = config["mediawords"][
                "ignore_schema_version"]

        if config_ignore_schema_version or self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
            log.warning("""
                The current Media Cloud database schema is older than the schema present in mediawords.sql,
                but %s is set so continuing anyway.
            """ % self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
            return True
        else:
            log.warning(
                """
                ################################

                The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

                The database schema currently running in the database is %(current_schema_version)s,
                and the schema version in the mediawords.sql is %(target_schema_version)s.

                Please run:

                    ./script/run_in_env.sh ./script/mediawords_upgrade_db.pl --import

                to automatically upgrade the database schema to the latest version.

                If you want to connect to the Media Cloud database anyway (ignoring the schema version),
                set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

                or

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/run_in_env.sh ./script/your_script.pl

                ################################

            """ % {
                    "current_schema_version":
                    current_schema_version,
                    "target_schema_version":
                    target_schema_version,
                    "IGNORE_SCHEMA_VERSION_ENV_VARIABLE":
                    self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
                })
            return False
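Stripped of the log messages, the decision reduces to one boolean drawn from two sources; a tiny sketch with a hypothetical environment-variable name (the real one is the class's __IGNORE_SCHEMA_VERSION_ENV_VARIABLE constant):

import os

IGNORE_SCHEMA_VERSION_ENV_VARIABLE = 'MC_IGNORE_SCHEMA_VERSION'   # assumed name

config_ignore_schema_version = False   # would come from mediawords.ignore_schema_version
should_continue = bool(config_ignore_schema_version
                       or IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ)
print(should_continue)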
Example #16
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'],
                                             'logs', 'http_request.log')

        with open(http_request_log_path, 'a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (
                sql_now(),
                url,
            ))

            # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt to chmod the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (
                http_request_log_path,
                str(ex),
            ))
            pass
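The non-blocking flock() retry loop is the reusable part; a self-contained, Unix-only sketch of it writing one line to a throwaway file:

import errno
import fcntl
import os
import tempfile
import time

log_path = os.path.join(tempfile.gettempdir(), 'http_request_sketch.log')

with open(log_path, 'a') as f:
    while True:
        try:
            # Non-blocking exclusive lock; EAGAIN means another process holds it.
            fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
            break
        except IOError as e:
            if e.errno != errno.EAGAIN:
                raise
            time.sleep(0.1)

    f.write("2018-01-01 00:00:00 http://www.example.com/\n")
    fcntl.flock(f, fcntl.LOCK_UN)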
Example #17
def rotate_supervisor_logs():
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = py_get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception(
            'Supervisor logs directory does not exist at path: %s' %
            supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(
        suffix='.conf', prefix='logrotate')
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    l.info('Running logrotate...')
    subprocess.check_call([
        'logrotate', '--verbose', '--state', logrotate_state_file,
        logrotate_temp_config_path
    ])

    l.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
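Leaving the actual logrotate call aside, the pattern is: render a config string, write it to a mkstemp() temp file, hand the path to the external tool, and unlink it afterwards. A sketch of the temp-file handling with assumed size and rotation values:

import os
import tempfile

LOG_MAX_SIZE = 100 * 1024 * 1024   # assumed: rotate logs bigger than 100 MB
OLD_LOG_COUNT = 7                  # assumed: keep 7 rotated copies

logrotate_config = '''
/var/log/supervisor/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {'log_max_size': LOG_MAX_SIZE, 'old_log_count': OLD_LOG_COUNT}

fd, temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
with os.fdopen(fd, 'w') as tmp:
    tmp.write(logrotate_config)

# subprocess.check_call(['logrotate', '--verbose', '--state', state_file, temp_config_path])
os.unlink(temp_config_path)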
Example #18
    def setUpClass(cls) -> None:
        """Create a fresh template data from mediawords.sql.

        The template database will be used to execute the
        'create database mediacloud_test template mediacloud_test_template' functionality to create a fresh database
        for each individual unit test.  Recreating from a template is much faster than creating a database from
        scratch from our large schema.
        """
        super().setUpClass()

        config = py_get_config()

        db_config = list(
            filter(lambda x: x['label'] == cls.TEST_DB_LABEL,
                   config['database']))
        if len(db_config) < 1:
            raise McTestDatabaseTestCaseException(
                "Unable to find %s database in mediawords.yml" %
                cls.TEST_DB_LABEL)

        cls.db_name = (db_config[0])['db']

        cls.template_db_name = config['mediawords'].get(
            'test_template_db_name', None)
        if cls.template_db_name is not None:
            log.warning("use existing test db template: %s" %
                        cls.template_db_name)
            return

        log.info("create test db template")

        cls.template_db_name = cls.db_name + '_template'

        # we insert this db name directly into sql, so be paranoid about what is in it
        if re.search('[^a-z0-9_]', cls.db_name, flags=re.I) is not None:
            raise McTestDatabaseTestCaseException("Illegal table name: " +
                                                  cls.db_name)

        # mediacloud_test should already exist, so we have to connect to it to create the template database
        db = connect_to_db(label=cls.TEST_DB_LABEL,
                           do_not_check_schema_version=True)

        cls.__kill_connections_to_database(db=db,
                                           database_name=cls.template_db_name)

        db.query("DROP DATABASE IF EXISTS {}".format(cls.template_db_name))
        db.query("CREATE DATABASE {}".format(cls.template_db_name))
        db.disconnect()
        recreate_db(label=cls.TEST_DB_LABEL, is_template=True)
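Because the database name is interpolated straight into the DROP/CREATE statements, the method whitelists its characters first; the check in isolation:

import re

db_name = 'mediacloud_test'   # would normally come from mediawords.yml

# Only letters, digits and underscores are allowed before the name is pasted into SQL.
if re.search('[^a-z0-9_]', db_name, flags=re.I) is not None:
    raise ValueError("Illegal database name: " + db_name)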
Example #19
def rotate_supervisor_logs():
    root_path = mc_root_path()
    log.debug('Media Cloud root path: %s' % root_path)

    config = py_get_config()
    child_log_dir = config['supervisor']['childlogdir']
    log.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    log.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    log.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    log.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    log.info('Running logrotate...')
    subprocess.check_call([
        'logrotate',
        '--verbose',
        '--state', logrotate_state_file,
        logrotate_temp_config_path
    ])

    log.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
Example #20
    def _request_for_text(self, text: str) -> Request:

        text = decode_object_from_bytes_if_needed(text)

        # CLIFF annotator URL
        config = py_get_config()
        url = config.get('cliff', {}).get('annotator_url', None)
        if url is None:
            raise McCLIFFAnnotatorException(
                "Unable to determine CLIFF annotator URL to use.")

        request = Request(method='POST', url=url)
        request.set_content_type(
            'application/x-www-form-urlencoded; charset=utf-8')
        request.set_content({'q': text})

        return request
Example #21
    def __should_continue_with_outdated_schema(current_schema_version: int, target_schema_version: int) -> bool:
        """Schema is outdated / too new; returns 1 if MC should continue nevertheless, 0 otherwise"""
        config = py_get_config()

        config_ignore_schema_version = False
        if 'ignore_schema_version' in config['mediawords']:
            config_ignore_schema_version = config["mediawords"]["ignore_schema_version"]

        if config_ignore_schema_version or DatabaseHandler.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
            log.warning("""
                The current Media Cloud database schema is older than the schema present in mediawords.sql,
                but %s is set so continuing anyway.
            """ % DatabaseHandler.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
            return True
        else:
            log.warning("""
                ################################

                The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

                The database schema currently running in the database is %(current_schema_version)s,
                and the schema version in the mediawords.sql is %(target_schema_version)s.

                Please run:

                    ./script/run_in_env.sh ./script/upgrade_db.pl --import

                to automatically upgrade the database schema to the latest version.

                If you want to connect to the Media Cloud database anyway (ignoring the schema version),
                set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

                or

                    %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/run_in_env.sh ./script/your_script.pl

                ################################

            """ % {
                "current_schema_version": current_schema_version,
                "target_schema_version": target_schema_version,
                "IGNORE_SCHEMA_VERSION_ENV_VARIABLE": DatabaseHandler.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
            })
            return False
Example #22
    def __log_request(request: Request) -> None:
        """Log HTTP request."""
        # FIXME use Python's logging facilities

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

        with open(http_request_log_path, encoding='utf-8', mode='a') as f:

            while True:
                try:
                    fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                    break
                except IOError as e:
                    # raise on unrelated IOErrors
                    if e.errno != errno.EAGAIN:
                        raise
                    else:
                        log.warning("Waiting for HTTP request log lock...")
                        time.sleep(0.1)

            f.write("%s %s\n" % (sql_now(), url,))

            # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

            fcntl.flock(f, fcntl.LOCK_UN)

        # Processes from various users (web service, workers, ...) will want to write to the same file
        try:
            os.chmod(http_request_log_path, 0o666)
        except PermissionError as ex:
            # Web server process might attempt to chmod the file without the appropriate permissions
            log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
            pass
Example #23
    def test_run_block_with_large_work_mem(self):
        normal_work_mem = 256  # MB
        large_work_mem = 512  # MB

        old_large_work_mem = None
        config = py_get_config()
        if 'large_work_mem' in config['mediawords']:
            old_large_work_mem = config['mediawords']['large_work_mem']

        config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
        py_set_config(config)

        self.db().query("SET work_mem TO %s", ('%sMB' % normal_work_mem, ))

        current_work_mem = int(self.db().query("""
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """).flat()[0])
        assert current_work_mem == normal_work_mem * 1024

        def __test_run_block_with_large_work_mem_inner():
            self.db().execute_with_large_work_mem("""
                INSERT INTO execute_large_work_mem (work_mem)
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """)

        self.db().query(
            'CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)'
        )
        self.db().run_block_with_large_work_mem(
            __test_run_block_with_large_work_mem_inner)

        statement_work_mem = int(self.db().query("""
            SELECT work_mem FROM execute_large_work_mem
        """).flat()[0])
        assert statement_work_mem == large_work_mem * 1024

        current_work_mem = int(self.db().query("""
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """).flat()[0])
        assert current_work_mem == normal_work_mem * 1024

        config['mediawords']['large_work_mem'] = old_large_work_mem
        py_set_config(config)
Example #24
    def setUp(self) -> None:
        """Create a fresh testing database for each unit test.

        The first time this function is called within a given process, it will create a template database from
        mediawords.sql.  For each test, it will create a new test database using the postgres
        'create database mediacloud_test template mediacloud_test_template' functionality.  Recreating each
        unit test database from the template is much faster than recreating from mediawords.sql.
        """
        test_db_label = 'test'
        config = py_get_config()
        db_config = list(filter(lambda x: x['label'] == test_db_label, config['database']))
        if len(db_config) < 1:
            raise McTestDatabaseTestCaseException("Unable to find %s database in mediawords.yml" % [test_db_label])
        db_name = (db_config[0])['db']
        template_db_name = db_name + '_template'

        if re.search('[^a-z0-9_]', db_name, flags=re.I) is not None:
            raise McTestDatabaseTestCaseException("Illegal table name: " + db_name)

        if not TestDatabaseWithSchemaTestCase._template_db_created:
            log.info("create test db template")
            # mediacloud_test should already exist, so we have to connect to it to create the template database
            db = connect_to_db(label=test_db_label, do_not_check_schema_version=True)
            db.query("drop database if exists %s" % (template_db_name,))
            db.query("create database %s" % (template_db_name,))
            db.disconnect()
            recreate_db(label=test_db_label, is_template=True)
            TestDatabaseWithSchemaTestCase._template_db_created = True

        # now connect to the template database to execute the create command for the test database
        log.info("recreate test db template")

        db = connect_to_db(label=test_db_label, is_template=True)
        db.query("drop database if exists %s" % (db_name,))
        db.query("create database %s template %s" % (db_name, template_db_name))

        db.disconnect()

        db = connect_to_db(label=test_db_label)

        force_using_test_database()
        self.__db = db
Example #25
    def _create_database_handler():
        log.info("Looking for test database credentials...")
        test_database = None
        config = py_get_config()
        for database in config['database']:
            if database['label'] == 'test':
                test_database = database
                break
        assert test_database is not None

        log.info(
            "Connecting to test database '%s' via DatabaseHandler class..." %
            test_database['db'])
        db = DatabaseHandler(host=test_database['host'],
                             port=test_database['port'],
                             username=test_database['user'],
                             password=test_database['pass'],
                             database=test_database['db'])

        return db
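Finding the 'test' entry is a plain scan over config['database']; a sketch with an invented config fragment in place of py_get_config():

config = {'database': [
    {'label': 'production', 'host': 'db', 'port': 5432, 'user': 'mc', 'pass': 'x', 'db': 'mediacloud'},
    {'label': 'test', 'host': 'localhost', 'port': 5432, 'user': 'mc', 'pass': 'x', 'db': 'mediacloud_test'},
]}

test_database = None
for database in config['database']:
    if database['label'] == 'test':
        test_database = database
        break

assert test_database is not None
assert test_database['db'] == 'mediacloud_test'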
Example #26
    def create_database_handler() -> DatabaseHandler:
        log.info("Looking for test database credentials...")
        test_database = None
        config = py_get_config()
        for database in config['database']:
            if database['label'] == 'test':
                test_database = database
                break
        assert test_database is not None

        log.info("Connecting to test database '%s' via DatabaseHandler class..." % test_database['db'])
        db = DatabaseHandler(
            host=test_database['host'],
            port=test_database['port'],
            username=test_database['user'],
            password=test_database['pass'],
            database=test_database['db']
        )

        return db
Example #27
    def setUpClass(cls) -> None:
        """Create a fresh template data from mediawords.sql.

        The template database will be used to execute the
        'create database mediacloud_test template mediacloud_test_template' functionality to create a fresh database
        for each individual unit test.  Recreating from a template is much faster than creating a database from
        scratch from our large schema.
        """
        log.info("create test db template")

        config = py_get_config()
        db_config = list(
            filter(lambda x: x['label'] == cls.TEST_DB_LABEL,
                   config['database']))
        if len(db_config) < 1:
            raise McTestDatabaseTestCaseException(
                "Unable to find %s database in mediawords.yml" %
                cls.TEST_DB_LABEL)

        cls.db_name = (db_config[0])['db']
        cls.template_db_name = cls.db_name + '_template'

        # we only want to run this once per test suite for all database test cases, so this needs to be a global
        global _template_db_created
        if _template_db_created:
            return

        # we insert this db name directly into sql, so be paranoid about what is in it
        if re.search('[^a-z0-9_]', cls.db_name, flags=re.I) is not None:
            raise McTestDatabaseTestCaseException("Illegal table name: " +
                                                  cls.db_name)

        # mediacloud_test should already exist, so we have to connect to it to create the template database
        db = connect_to_db(label=cls.TEST_DB_LABEL,
                           do_not_check_schema_version=True)
        db.query("drop database if exists %s" % (cls.template_db_name, ))
        db.query("create database %s" % (cls.template_db_name, ))
        db.disconnect()
        recreate_db(label=cls.TEST_DB_LABEL, is_template=True)

        _template_db_created = True
Example #28
    def test_run_block_with_large_work_mem(self):
        normal_work_mem = 256  # MB
        large_work_mem = 512  # MB

        old_large_work_mem = None
        config = py_get_config()
        if 'large_work_mem' in config['mediawords']:
            old_large_work_mem = config['mediawords']['large_work_mem']

        config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
        py_set_config(config)

        self.db().query("SET work_mem TO %s", ('%sMB' % normal_work_mem,))

        current_work_mem = int(self.db().query("""
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """).flat()[0])
        assert current_work_mem == normal_work_mem * 1024

        def __test_run_block_with_large_work_mem_inner():
            self.db().execute_with_large_work_mem("""
                INSERT INTO execute_large_work_mem (work_mem)
                SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
            """)

        self.db().query('CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)')
        self.db().run_block_with_large_work_mem(__test_run_block_with_large_work_mem_inner)

        statement_work_mem = int(self.db().query("""
            SELECT work_mem FROM execute_large_work_mem
        """).flat()[0])
        assert statement_work_mem == large_work_mem * 1024

        current_work_mem = int(self.db().query("""
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """).flat()[0])
        assert current_work_mem == normal_work_mem * 1024

        config['mediawords']['large_work_mem'] = old_large_work_mem
        py_set_config(config)
Example #29
    def setUpClass(cls) -> None:
        """Create a fresh template data from mediawords.sql.

        The template database will be used to execute the
        'create database mediacloud_test template mediacloud_test_template' functionality to create a fresh database
        for each individual unit test.  Recreating from a template is much faster than creating a database from
        scratch from our large schema.
        """
        super().setUpClass()

        config = py_get_config()

        db_config = list(filter(lambda x: x['label'] == cls.TEST_DB_LABEL, config['database']))
        if len(db_config) < 1:
            raise McTestDatabaseTestCaseException("Unable to find %s database in mediawords.yml" % cls.TEST_DB_LABEL)

        cls.db_name = (db_config[0])['db']

        cls.template_db_name = config['mediawords'].get('test_template_db_name', None)
        if cls.template_db_name is not None:
            log.warning("use existing test db template: %s" % cls.template_db_name)
            return

        log.info("create test db template")

        cls.template_db_name = cls.db_name + '_template'

        # we insert this db name directly into sql, so be paranoid about what is in it
        if re.search('[^a-z0-9_]', cls.db_name, flags=re.I) is not None:
            raise McTestDatabaseTestCaseException("Illegal table name: " + cls.db_name)

        # mediacloud_test should already exist, so we have to connect to it to create the template database
        db = connect_to_db(label=cls.TEST_DB_LABEL, do_not_check_schema_version=True)

        cls.__kill_connections_to_database(db=db, database_name=cls.template_db_name)

        db.query("DROP DATABASE IF EXISTS {}".format(cls.template_db_name))
        db.query("CREATE DATABASE {}".format(cls.template_db_name))
        db.disconnect()
        recreate_db(label=cls.TEST_DB_LABEL, is_template=True)
Example #30
    def __blacklist_request_if_needed(request: Request) -> Request:
        """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
        # FIXME there should be a better way to block those unwanted requests

        if request is None:
            raise McRequestException("Request is None.")

        url = request.url()
        if url is None:
            raise McRequestException("URL is None.")
        if len(url) == 0:
            raise McRequestException("URL is empty.")

        config = py_get_config()

        blacklist_url_pattern = None
        if 'blacklist_url_pattern' in config['mediawords']:
            blacklist_url_pattern = config['mediawords']['blacklist_url_pattern']

        if blacklist_url_pattern is not None and len(blacklist_url_pattern) > 0:
            if re.search(pattern=blacklist_url_pattern, string=url, flags=re.IGNORECASE | re.UNICODE) is not None:
                request.set_url("http://0.0.0.1/%s" % url)

        return request
Example #31
    def _tags_for_annotation(
            self, annotation: Union[dict, list]) -> List[JSONAnnotator.Tag]:

        annotation = decode_object_from_bytes_if_needed(annotation)

        config = py_get_config()

        nytlabels_config = config.get('nytlabels', None)
        if nytlabels_config is None:
            raise McNYTLabelsAnnotatorException("NYTLabels is not configured.")

        nytlabels_labels_tag_set = nytlabels_config.get(
            'nytlabels_labels_tag_set', None)
        if nytlabels_labels_tag_set is None:
            raise McNYTLabelsAnnotatorException(
                "NYTLabels labels tag set is unset in configuration.")

        nytlabels_version_tag = nytlabels_config.get('nytlabels_version_tag',
                                                     None)
        if nytlabels_version_tag is None:
            raise McNYTLabelsAnnotatorException(
                "NYTLabels version tag is unset in configuration.")

        tags = list()

        tags.append(
            JSONAnnotator.Tag(tag_sets_name=self.__NYTLABELS_VERSION_TAG_SET,
                              tag_sets_label=self.__NYTLABELS_VERSION_TAG_SET,
                              tag_sets_description=
                              'NYTLabels version the story was tagged with',
                              tags_name=nytlabels_version_tag,
                              tags_label=nytlabels_version_tag,
                              tags_description="Story was tagged with '%s'" %
                              nytlabels_version_tag))

        descriptors600 = annotation.get('descriptors600', None)
        if descriptors600 is not None and len(descriptors600) > 0:

            for descriptor in descriptors600:

                label = descriptor['label']
                score = float(descriptor['score'])

                if score > self.__NYTLABELS_SCORE_THRESHOLD:
                    tags.append(
                        JSONAnnotator.Tag(
                            tag_sets_name=nytlabels_labels_tag_set,
                            tag_sets_label=nytlabels_labels_tag_set,
                            tag_sets_description='NYTLabels labels',

                            # e.g. "hurricanes and tropical storms"
                            tags_name=label,
                            tags_label=label,
                            tags_description=label))

                else:
                    log.debug((
                        "Skipping label '%(label)s' because its score %(score)2.6f"
                        "is lower than the threshold %(threshold)2.6f" % {
                            'label': label,
                            'score': score,
                            'threshold': self.__NYTLABELS_SCORE_THRESHOLD,
                        }))

        return tags
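The per-descriptor filtering comes down to a float comparison against the score threshold; a standalone sketch with sample descriptors and an assumed threshold value (the real one is a class constant):

NYTLABELS_SCORE_THRESHOLD = 0.2   # assumed value for illustration

descriptors600 = [
    {'label': 'hurricanes and tropical storms', 'score': '0.89'},
    {'label': 'weather', 'score': '0.02'},
]

kept_labels = [d['label'] for d in descriptors600
               if float(d['score']) > NYTLABELS_SCORE_THRESHOLD]
assert kept_labels == ['hurricanes and tropical storms']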
Example #32
    def test_nyt_labels_annotator(self):
        media = self.db().create(table='media',
                                 insert_hash={
                                     'name': "test medium",
                                     'url': "url://test/medium",
                                 })

        story = self.db().create(table='stories',
                                 insert_hash={
                                     'media_id': media['media_id'],
                                     'url': 'url://story/a',
                                     'guid': 'guid://story/a',
                                     'title': 'story a',
                                     'description': 'description a',
                                     'publish_date': sql_now(),
                                     'collect_date': sql_now(),
                                     'full_text_rss': True,
                                 })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences',
                         insert_hash={
                             'stories_id': stories_id,
                             'sentence_number': 1,
                             'sentence':
                             'I hope that the CLIFF annotator is working.',
                             'media_id': media['media_id'],
                             'publish_date': sql_now(),
                             'language': 'en'
                         })

        def __nyt_labels_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(),
                                               stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        annotation_exists = self.db().query(
            """
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {
                'object_id': stories_id
            }).hash()
        assert annotation_exists is not None

        story_tags = self.db().query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
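The configuration injection around the mocked annotator is the reusable bit: deep-copy the live config, overwrite one section, push it with py_set_config(), and push the untouched original back afterwards. A sketch with a plain dict standing in for the real config (the URL is a placeholder):

import copy

config = {'nytlabels': {'enabled': False}}   # stand-in for py_get_config()

new_config = copy.deepcopy(config)
new_config['nytlabels'] = {
    'enabled': True,
    'annotator_url': 'http://localhost:8080/predict.json',   # placeholder URL
}
# py_set_config(new_config) would go here, followed by the code under test...

# ...and then the untouched original is restored so later tests are unaffected:
# py_set_config(config)
assert config['nytlabels']['enabled'] is False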
Example #33
def send_email(message: Message) -> bool:
    """Send email to someone.

    Returns True on success, False on failure.

    Raises on programming error."""

    if message is None:
        raise McSendEmailException('Message is None.')

    if not message.from_:
        raise McSendEmailException("'from' is unset.")
    if message.to and (not isinstance(message.to, list)):
        raise McSendEmailException("'to' is not a list.")
    if message.cc and (not isinstance(message.cc, list)):
        raise McSendEmailException("'cc' is not a list.")
    if message.bcc and (not isinstance(message.bcc, list)):
        raise McSendEmailException("'bcc' is not a list.")

    if not (len(message.to) > 0 or len(message.cc) > 0
            or len(message.bcc) > 0):
        raise McSendEmailException("No one to send the email to.")

    if not message.subject:
        raise McSendEmailException("'subject' is unset.")

    if not (message.text_body or message.html_body):
        raise McSendEmailException("No message body.")

    try:

        # Create message
        mime_message = MIMEMultipart('alternative')
        mime_message['Subject'] = '[Media Cloud] %s' % message.subject
        mime_message['From'] = message.from_
        if message.to:
            mime_message['To'] = ', '.join(message.to)
        else:
            mime_message['To'] = 'undisclosed recipients'
        if message.cc:
            mime_message['Cc'] = ', '.join(message.cc)
        if message.bcc:
            mime_message['Bcc'] = ', '.join(message.bcc)

        if message.text_body:
            message_part = MIMEText(message.text_body, 'plain', 'utf-8')
            mime_message.attach(message_part)

        # HTML gets attached last, thus making it a preferred part as per RFC
        if message.html_body:
            message_part = MIMEText(message.html_body, 'html', 'utf-8')
            mime_message.attach(message_part)

        if test_mode_is_enabled():
            log.info("Test mode is enabled, not actually sending any email.")
            log.debug("Omitted email:\n\n%s" % mime_message.as_string())

        else:

            # Connect to SMTP
            config = py_get_config()
            smtp_config = config['mail']['smtp']

            smtp = smtplib.SMTP(host=smtp_config['host'],
                                port=smtp_config['port'])
            if smtp_config['starttls']:
                smtp.starttls()
            if smtp_config['username'] and smtp_config['password']:
                smtp.login(user=smtp_config['username'],
                           password=smtp_config['password'])

            # Send message
            refused_recipients = smtp.sendmail(mime_message['From'],
                                               mime_message['To'],
                                               mime_message.as_string())
            if len(refused_recipients):
                log.warning(
                    "Unable to send email to the following recipients: %s" %
                    str(refused_recipients))

            smtp.quit()

    except Exception as ex:
        log.warning('Unable to send email to %s: %s' % (message.to, str(ex)))
        return False

    return True
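The MIME assembly on its own, with placeholder addresses; the HTML part is attached last so capable mail clients prefer it over the plain-text part:

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

mime_message = MIMEMultipart('alternative')
mime_message['Subject'] = '[Media Cloud] Test subject'
mime_message['From'] = 'noreply@example.com'   # placeholder addresses
mime_message['To'] = 'someone@example.com'

mime_message.attach(MIMEText('Plain text body.', 'plain', 'utf-8'))
mime_message.attach(MIMEText('<p>HTML body.</p>', 'html', 'utf-8'))   # last part is preferred

print(mime_message.as_string())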
Example #34
    def parallel_get(urls: List[str]) -> List[Response]:
        """GET multiple URLs in parallel."""

        # FIXME doesn't respect timing() and other object properties

        urls = decode_object_from_bytes_if_needed(urls)

        # Original implementation didn't raise on undefined / empty list of URLs
        if urls is None:
            return []
        if len(urls) == 0:
            return []

        # Remove duplicates from list while maintaining order because:
        # 1) We don't want to fetch the same URL twice
        # 2) URLs are being used as unique dictionary IDs later on
        urls_before_removing_duplicates = urls.copy()
        urls = list(OrderedDict.fromkeys(urls))
        if len(urls) != len(urls_before_removing_duplicates):
            log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

        # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
        # get() in a fork should be able to come up with a reasonable Response object for it
        for url in urls:
            if not is_http_url(url):
                raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

        config = py_get_config()

        if 'web_store_num_parallel' not in config['mediawords']:
            raise McParallelGetException('"web_store_num_parallel" is not set.')
        num_parallel = config['mediawords']['web_store_num_parallel']

        if 'web_store_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_timeout" is not set.')
        timeout = config['mediawords']['web_store_timeout']

        if 'web_store_per_domain_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_per_domain_timeout" is not set.')
        per_domain_timeout = config['mediawords']['web_store_per_domain_timeout']

        url_stack = UserAgent.__get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

        start_time = time.time()

        url_blocks = {}
        while len(url_stack) > 0:
            block_i = len(url_stack) % num_parallel

            if block_i not in url_blocks:
                url_blocks[block_i] = []

            url_blocks[block_i].append(url_stack.pop())

        pool = multiprocessing.Pool(processes=num_parallel)

        all_results = []
        for i, url_block in url_blocks.items():
            result = pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
            all_results.append(result)

        all_responses = []
        for result in all_results:
            responses = result.get()
            all_responses = all_responses + responses

        # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
        pool.close()
        pool.join()
        pool.terminate()

        # Sort URLs in parameter order
        # (if URLs weren't split into blocks, we could probably use map_async)
        response_url_map = {}
        for response in all_responses:
            url = response.scheduled_url.url
            response_url_map[url] = response.response

        sorted_responses = []
        for url in urls:
            if url not in response_url_map:
                raise McParallelGetException("URL %s is not in the response URL map %s." % (url, response_url_map,))

            sorted_responses.append(response_url_map[url])

        if len(urls) != len(sorted_responses):
            raise McParallelGetException(
                "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
            )

        return sorted_responses
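Two pieces are easy to lift out and test on their own: the order-preserving de-duplication via OrderedDict.fromkeys() and the round-robin split of the remaining URLs into num_parallel blocks. A toy sketch:

from collections import OrderedDict

urls = ['http://a.example/1', 'http://b.example/2', 'http://a.example/1', 'http://c.example/3']
urls = list(OrderedDict.fromkeys(urls))   # drop duplicates, keep first-seen order

num_parallel = 2
url_stack = list(urls)
url_blocks = {}
while len(url_stack) > 0:
    block_i = len(url_stack) % num_parallel
    url_blocks.setdefault(block_i, []).append(url_stack.pop())

print(url_blocks)   # each block would be handed to one worker process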
Example #35
    def _tags_for_annotation(
            self, annotation: Union[dict, list]) -> List[JSONAnnotator.Tag]:

        annotation = decode_object_from_bytes_if_needed(annotation)

        config = py_get_config()

        cliff_config = config.get('cliff', None)
        if cliff_config is None:
            raise McCLIFFAnnotatorException("CLIFF is not configured.")

        cliff_version_tag = cliff_config.get('cliff_version_tag', None)
        if cliff_version_tag is None:
            raise McCLIFFAnnotatorException(
                "CLIFF version tag is unset in configuration.")

        cliff_geonames_tag_set = cliff_config.get('cliff_geonames_tag_set',
                                                  None)
        if cliff_geonames_tag_set is None:
            raise McCLIFFAnnotatorException(
                "CLIFF geographical names tag set is unset in configuration.")

        cliff_organizations_tag_set = cliff_config.get(
            'cliff_organizations_tag_set', None)
        if cliff_organizations_tag_set is None:
            raise McCLIFFAnnotatorException(
                "CLIFF organizations tag set is unset in configuration.")

        cliff_people_tag_set = cliff_config.get('cliff_people_tag_set', None)
        if cliff_people_tag_set is None:
            raise McCLIFFAnnotatorException(
                "CLIFF people tag set is unset in configuration.")

        tags = list()

        tags.append(
            JSONAnnotator.Tag(
                tag_sets_name=self.__CLIFF_VERSION_TAG_SET,
                tag_sets_label=self.__CLIFF_VERSION_TAG_SET,
                tag_sets_description='CLIFF version the story was tagged with',
                tags_name=cliff_version_tag,
                tags_label=cliff_version_tag,
                tags_description="Story was tagged with '%s'" %
                cliff_version_tag))

        results = annotation.get('results', None)
        if results is None or len(results) == 0:
            return tags

        organizations = results.get('organizations', None)
        if organizations is not None:
            for organization in organizations:
                tags.append(
                    JSONAnnotator.Tag(
                        tag_sets_name=cliff_organizations_tag_set,
                        tag_sets_label=cliff_organizations_tag_set,
                        tag_sets_description='CLIFF organizations',

                        # e.g. "United Nations"
                        tags_name=organization['name'],
                        tags_label=organization['name'],
                        tags_description=organization['name']))

        people = results.get('people', None)
        if people is not None:
            for person in people:
                tags.append(
                    JSONAnnotator.Tag(
                        tag_sets_name=cliff_people_tag_set,
                        tag_sets_label=cliff_people_tag_set,
                        tag_sets_description='CLIFF people',

                        # e.g. "Einstein"
                        tags_name=person['name'],
                        tags_label=person['name'],
                        tags_description=person['name']))

        places = results.get('places', None)
        if places is not None:
            focus = places.get('focus', None)
            if focus is not None:

                countries = focus.get('countries', None)
                if countries is not None:

                    for country in countries:
                        tags.append(
                            JSONAnnotator.Tag(
                                tag_sets_name=cliff_geonames_tag_set,
                                tag_sets_label=cliff_geonames_tag_set,
                                tag_sets_description='CLIFF geographical names',

                                # e.g. "geonames_6252001"
                                tags_name=self.__CLIFF_GEONAMES_TAG_PREFIX +
                                str(country['id']),

                                # e.g. "United States"
                                tags_label=country['name'],

                                # e.g. "United States | A | US"
                                tags_description=
                                '%(name)s | %(feature)s | %(country)s' % {
                                    'name': country['name'],
                                    'feature': country['featureClass'],
                                    'country': country['countryCode'],
                                }))

                states = focus.get('states', None)
                if states is not None:

                    for state in states:
                        tags.append(
                            JSONAnnotator.Tag(
                                tag_sets_name=cliff_geonames_tag_set,
                                tag_sets_label=cliff_geonames_tag_set,
                                tag_sets_description='CLIFF geographical names',

                                # e.g. "geonames_4273857"
                                tags_name=self.__CLIFF_GEONAMES_TAG_PREFIX +
                                str(state['id']),

                                # e.g. "Kansas"
                                tags_label=state['name'],

                                # e.g. "Kansas | A | KS | US"
                                tags_description=('%(name)s | %(feature)s | '
                                                  '%(state)s | %(country)s') %
                                {
                                    'name': state['name'],
                                    'feature': state['featureClass'],
                                    'state': state['stateCode'],
                                    'country': state['countryCode'],
                                }))

                cities = focus.get('cities', None)
                if cities is not None:

                    for city in cities:
                        tags.append(
                            JSONAnnotator.Tag(
                                tag_sets_name=cliff_geonames_tag_set,
                                tag_sets_label=cliff_geonames_tag_set,
                                tag_sets_description='CLIFF geographical names',

                                # e.g. "geonames_4273857"
                                tags_name=self.__CLIFF_GEONAMES_TAG_PREFIX +
                                str(city['id']),

                                # e.g. "Kansas"
                                tags_label=city['name'],

                                # e.g. "Kansas | A | KS | US"
                                tags_description=('%(name)s | %(feature)s | '
                                                  '%(state)s | %(country)s') %
                                {
                                    'name': city['name'],
                                    'feature': city['featureClass'],
                                    'state': city['stateCode'],
                                    'country': city['countryCode'],
                                }))

        return tags
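A rough sketch of the annotation structure that _tags_for_annotation() above expects, reconstructed from the keys it reads; all values are illustrative placeholders, not real CLIFF output.

# Illustrative annotation dict matching the lookups performed above.
sample_annotation = {
    'results': {
        'organizations': [
            {'name': 'United Nations'},
        ],
        'people': [
            {'name': 'Einstein'},
        ],
        'places': {
            'focus': {
                'countries': [
                    {'id': 6252001, 'name': 'United States',
                     'featureClass': 'A', 'countryCode': 'US'},
                ],
                'states': [
                    {'id': 4273857, 'name': 'Kansas', 'featureClass': 'A',
                     'stateCode': 'KS', 'countryCode': 'US'},
                ],
                'cities': [],
            },
        },
    },
}

# tags = annotator._tags_for_annotation(sample_annotation)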
Ejemplo n.º 36
0
    def test_nyt_labels_annotator(self):
        media = self.db().create(table='media', insert_hash={
            'name': "test medium",
            'url': "url://test/medium",
        })

        story = self.db().create(table='stories', insert_hash={
            'media_id': media['media_id'],
            'url': 'url://story/a',
            'guid': 'guid://story/a',
            'title': 'story a',
            'description': 'description a',
            'publish_date': sql_now(),
            'collect_date': sql_now(),
            'full_text_rss': True,
        })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences', insert_hash={
            'stories_id': stories_id,
            'sentence_number': 1,
            'sentence': 'I hope that the CLIFF annotator is working.',
            'media_id': media['media_id'],
            'publish_date': sql_now(),
            'language': 'en'
        })

        def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        annotation_exists = self.db().query("""
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {'object_id': stories_id}).hash()
        assert annotation_exists is not None

        story_tags = self.db().query("""
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {'stories_id': stories_id}).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
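The test above patches the global configuration with py_set_config() and restores it afterwards. A hypothetical helper (not part of the codebase) that wraps that pattern could look like this; it assumes py_get_config() / py_set_config() are in scope, as they are in the test.

import copy
from contextlib import contextmanager


@contextmanager
def temporary_config(section: str, values: dict):
    """Temporarily replace one mediawords.yml section, restoring it on exit.

    Hypothetical helper, assuming py_get_config() / py_set_config() as used
    in the test above.
    """
    original_config = py_get_config()
    patched_config = copy.deepcopy(original_config)
    patched_config[section] = values
    py_set_config(patched_config)
    try:
        yield
    finally:
        py_set_config(original_config)

# with temporary_config('nytlabels', {'enabled': True, 'annotator_url': annotator_url}):
#     nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)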
Ejemplo n.º 37
0
def get_similarweb_client():
    config = py_get_config()
    return SimilarWebClient(api_key=config['similarweb']['api_key'])
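A defensive variant of the factory above, shown only as a sketch (not project code): it fails with a clear message when the 'similarweb' section or its API key is missing from mediawords.yml.

def get_similarweb_client_checked():
    """Hypothetical variant of get_similarweb_client() with explicit checks."""
    config = py_get_config()
    similarweb_config = config.get('similarweb', None)
    if similarweb_config is None or 'api_key' not in similarweb_config:
        raise ValueError("'similarweb' / 'api_key' is not set in mediawords.yml.")
    return SimilarWebClient(api_key=similarweb_config['api_key'])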
Ejemplo n.º 38
0
def connect_to_db(
        label: typing.Optional[str] = None,
        do_not_check_schema_version: bool = False,
        is_template: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL.

    Arguments:
    label - db config section label for mediawords.yml
    do_not_check_schema_version - if false, throw an error if the versions in mediawords.yml and the db do not match
    is_template - if true, connect to a db called <db_name>_template instead of <db_name>

    """
    label = decode_str_from_bytes_if_needed(label)

    # If this is Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException("No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException("No default connection settings found.")

        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException("Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    if is_template:
        database = database + "_template"

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex)
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' % {'db_statement_timeout': db_statement_timeout})

    return ret
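A usage sketch for connect_to_db() above, assuming a mediawords.yml with at least one configured database; the query() / hash() accessors follow the DatabaseHandler usage visible in the test example earlier on this page.

# Connect using the default (first) configured database and run a sanity query.
db = connect_to_db()
row = db.query("SELECT 1 AS ok").hash()
assert row['ok'] == 1

# Connecting to the template database of the 'test' connection instead:
# test_template_db = connect_to_db(label='test', is_template=True)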
Ejemplo n.º 39
0
    def parallel_get(urls: List[str]) -> List[Response]:
        """GET multiple URLs in parallel."""

        # FIXME doesn't respect timing() and other object properties

        def __get_url_domain(url_: str) -> str:

            if not is_http_url(url_):
                return url_

            host = get_url_host(url_)

            name_parts = host.split('.')

            n = len(name_parts) - 1

            # for country domains, use last three parts of name
            if re.search(pattern=r"\...$", string=host):
                domain = '.'.join(
                    [name_parts[n - 2], name_parts[n - 1], name_parts[n]])

            elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)",
                           string=host):
                domain = url_

            else:
                domain = '.'.join([name_parts[n - 1], name_parts[n]])

            return domain.lower()

        def __get_scheduled_urls(
                urls_: List[str],
                per_domain_timeout_: int) -> List[_ParallelGetScheduledURL]:
            """Schedule the URLs by adding a { time => $time } field to each URL to make sure we obey the
            'per_domain_timeout'. Sort requests by ascending time."""
            domain_urls = {}

            for url_ in urls_:
                domain = __get_url_domain(url_=url_)
                if domain not in domain_urls:
                    domain_urls[domain] = []
                domain_urls[domain].append(url_)

            scheduled_urls = []

            for domain, urls_in_domain in domain_urls.items():
                time_ = 0
                for domain_url in urls_in_domain:
                    domain_url = _ParallelGetScheduledURL(url=domain_url,
                                                          time_=time_)
                    scheduled_urls.append(domain_url)

                    if time_ % 5 == 0:  # FIXME why 5?
                        time_ = time_ + per_domain_timeout_

            scheduled_urls = sorted(scheduled_urls, key=lambda x: x.time)

            return scheduled_urls

        # ---

        urls = decode_object_from_bytes_if_needed(urls)

        # Original implementation didn't raise on undefined / empty list of URLs
        if urls is None:
            return []
        if len(urls) == 0:
            return []

        # Remove duplicates from list while maintaining order because:
        # 1) We don't want to fetch the same URL twice
        # 2) URLs are being used as unique dictionary IDs later on
        urls_before_removing_duplicates = urls.copy()
        urls = list(OrderedDict.fromkeys(urls))
        if len(urls) != len(urls_before_removing_duplicates):
            log.warning("Some of the URLs are duplicate; URLs: %s" %
                        str(urls_before_removing_duplicates))

        # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
        # get() in a fork should be able to come up with a reasonable Response object for it
        for url in urls:
            if not is_http_url(url):
                raise McParallelGetException(
                    "URL %s is not a valid URL; URLs: %s" % (
                        url,
                        str(urls),
                    ))

        config = py_get_config()

        if 'web_store_num_parallel' not in config['mediawords']:
            raise McParallelGetException(
                '"web_store_num_parallel" is not set.')
        num_parallel = config['mediawords']['web_store_num_parallel']

        if 'web_store_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_timeout" is not set.')
        timeout = config['mediawords']['web_store_timeout']

        if 'web_store_per_domain_timeout' not in config['mediawords']:
            raise McParallelGetException(
                '"web_store_per_domain_timeout" is not set.')
        per_domain_timeout = config['mediawords'][
            'web_store_per_domain_timeout']

        url_stack = __get_scheduled_urls(
            urls_=urls, per_domain_timeout_=per_domain_timeout)

        start_time = time.time()

        url_blocks = {}
        while len(url_stack) > 0:
            block_i = len(url_stack) % num_parallel

            if block_i not in url_blocks:
                url_blocks[block_i] = []

            url_blocks[block_i].append(url_stack.pop())

        pool = multiprocessing.Pool(processes=num_parallel)

        all_results = []
        for i, url_block in url_blocks.items():
            result = pool.apply_async(_parallel_get_web_store,
                                      args=(
                                          url_block,
                                          start_time,
                                          timeout,
                                      ))
            all_results.append(result)

        all_responses = []
        for result in all_results:
            responses = result.get()
            all_responses = all_responses + responses

        # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
        pool.close()
        pool.join()
        pool.terminate()

        # Sort URLs in parameter order
        # (if URLs weren't split into blocks, we could probably use map_async)
        response_url_map = {}
        for response in all_responses:
            url = response.scheduled_url.url
            response_url_map[url] = response.response

        sorted_responses = []
        for url in urls:
            if url not in response_url_map:
                raise McParallelGetException(
                    "URL %s is not in the response URL map %s." % (
                        url,
                        response_url_map,
                    ))

            sorted_responses.append(response_url_map[url])

        if len(urls) != len(sorted_responses):
            raise McParallelGetException(
                "Response count doesn't match URL count; responses: %s; URLs: %s"
                % (
                    sorted_responses,
                    urls,
                ))

        return sorted_responses
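To make the per-domain scheduling above concrete, here is the bucketing that __get_url_domain() produces for a few representative hosts (inputs chosen to exercise each branch; they are not taken from this page).

# Generic TLDs keep the last two name parts, two-letter country TLDs keep the
# last three, and localhost/blogspot.com/wordpress.com hosts fall back to the
# full URL so that every blog gets its own per-domain schedule.
expected_domains = {
    'http://www.example.com/page': 'example.com',
    'http://news.bbc.co.uk/story': 'bbc.co.uk',
    'http://someone.blogspot.com/post': 'http://someone.blogspot.com/post',
}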
Ejemplo n.º 40
0
    def parallel_get(urls: List[str]) -> List[Response]:
        """GET multiple URLs in parallel."""

        # FIXME doesn't respect timing() and other object properties

        urls = decode_object_from_bytes_if_needed(urls)

        # Original implementation didn't raise on undefined / empty list of URLs
        if urls is None:
            return []
        if len(urls) == 0:
            return []

        # Remove duplicates from list while maintaining order because:
        # 1) We don't want to fetch the same URL twice
        # 2) URLs are being used as unique dictionary IDs later on
        urls_before_removing_duplicates = urls.copy()
        urls = list(OrderedDict.fromkeys(urls))
        if len(urls) != len(urls_before_removing_duplicates):
            log.warning("Some of the URLs are duplicate; URLs: %s" %
                        str(urls_before_removing_duplicates))

        # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
        # get() in a fork should be able to come up with a reasonable Response object for it
        for url in urls:
            if not is_http_url(url):
                raise McParallelGetException(
                    "URL %s is not a valid URL; URLs: %s" % (
                        url,
                        str(urls),
                    ))

        config = py_get_config()

        if 'web_store_num_parallel' not in config['mediawords']:
            raise McParallelGetException(
                '"web_store_num_parallel" is not set.')
        num_parallel = config['mediawords']['web_store_num_parallel']

        if 'web_store_timeout' not in config['mediawords']:
            raise McParallelGetException('"web_store_timeout" is not set.')
        timeout = config['mediawords']['web_store_timeout']

        if 'web_store_per_domain_timeout' not in config['mediawords']:
            raise McParallelGetException(
                '"web_store_per_domain_timeout" is not set.')
        per_domain_timeout = config['mediawords'][
            'web_store_per_domain_timeout']

        url_stack = UserAgent.__get_scheduled_urls(
            urls_=urls, per_domain_timeout_=per_domain_timeout)

        start_time = time.time()

        url_blocks = {}
        while len(url_stack) > 0:
            block_i = len(url_stack) % num_parallel

            if block_i not in url_blocks:
                url_blocks[block_i] = []

            url_blocks[block_i].append(url_stack.pop())

        pool = multiprocessing.Pool(processes=num_parallel)

        all_results = []
        for i, url_block in url_blocks.items():
            result = pool.apply_async(_parallel_get_web_store,
                                      args=(
                                          url_block,
                                          start_time,
                                          timeout,
                                      ))
            all_results.append(result)

        all_responses = []
        for result in all_results:
            responses = result.get()
            all_responses = all_responses + responses

        # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
        pool.close()
        pool.join()
        pool.terminate()

        # Sort URLs in parameter order
        # (if URLs weren't split into blocks, we could probably use map_async)
        response_url_map = {}
        for response in all_responses:
            url = response.scheduled_url.url
            response_url_map[url] = response.response

        sorted_responses = []
        for url in urls:
            if url not in response_url_map:
                raise McParallelGetException(
                    "URL %s is not in the response URL map %s." % (
                        url,
                        response_url_map,
                    ))

            sorted_responses.append(response_url_map[url])

        if len(urls) != len(sorted_responses):
            raise McParallelGetException(
                "Response count doesn't match URL count; responses: %s; URLs: %s"
                % (
                    sorted_responses,
                    urls,
                ))

        return sorted_responses
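The block-splitting loop in both parallel_get() variants assigns URLs to blocks round-robin via len(url_stack) % num_parallel; a small standalone worked illustration with arbitrary URL placeholders:

num_parallel = 3
url_stack = ['u1', 'u2', 'u3', 'u4', 'u5', 'u6', 'u7']

url_blocks = {}
while len(url_stack) > 0:
    # block_i cycles 1, 0, 2, 1, 0, 2, 1 as the stack shrinks
    block_i = len(url_stack) % num_parallel
    url_blocks.setdefault(block_i, []).append(url_stack.pop())

# url_blocks == {1: ['u7', 'u4', 'u1'], 0: ['u6', 'u3'], 2: ['u5', 'u2']}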
Ejemplo n.º 41
0
def connect_to_db(
        label: str = None,
        do_not_check_schema_version: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL."""

    label = decode_object_from_bytes_if_needed(label)

    # If this is Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException(
                "No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException(
                "No default connection settings found.")

        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException(
            "Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version)
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s"
            % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex)
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' %
                  {'db_statement_timeout': db_statement_timeout})

    # Reset the session variable in case the database connection is being reused due to pooling
    ret.query("""
        DO $$
        BEGIN
        PERFORM enable_story_triggers();
        EXCEPTION
        WHEN undefined_function THEN
            -- This exception will be raised if the database is uninitialized at this point.
            -- So, don't emit any kind of error because of a non-existent function.
            NULL;
        WHEN OTHERS THEN
            -- Forward the exception
            RAISE;
        END
        $$;
    """)

    return ret
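Both connect_to_db() variants read the same keys from the parsed configuration; an illustrative shape of that configuration follows (labels, names and values are placeholders, not real settings or credentials).

sample_config = {
    'database': [
        # The first entry is used when no label is passed.
        {'label': 'mediacloud', 'host': 'localhost', 'port': 5432,
         'user': 'mediaclouduser', 'pass': 'PASSWORD', 'db': 'mediacloud'},
        # The 'test' entry is forced when using_test_database() is true.
        {'label': 'test', 'host': 'localhost', 'port': 5432,
         'user': 'mediaclouduser', 'pass': 'PASSWORD', 'db': 'mediacloud_test'},
    ],
    'mediawords': {
        # Optional; passed to PostgreSQL's statement_timeout (milliseconds).
        'db_statement_timeout': 600000,
    },
}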