def __get_large_work_mem(self) -> str:
    config = py_get_config()
    if 'large_work_mem' in config['mediawords']:
        work_mem = config['mediawords']['large_work_mem']
    else:
        work_mem = self.__get_current_work_mem()
    return work_mem
def annotator_is_enabled(self) -> bool:
    config = py_get_config()
    if config.get('nytlabels', {}).get('enabled', False):
        return True
    else:
        return False
def __get_domain_http_auth_lookup() -> Dict[str, Dict[str, str]]:
    """Read the mediawords.crawler_authenticated_domains list from mediawords.yml and generate a lookup hash with
    the host domain as the key and the user:password credentials as the value."""
    config = py_get_config()
    domain_http_auth_lookup = {}

    domains = None
    if 'crawler_authenticated_domains' in config['mediawords']:
        domains = config['mediawords']['crawler_authenticated_domains']

    if domains is not None:
        for domain in domains:

            if 'domain' not in domain:
                raise McCrawlerAuthenticatedDomainsException('"domain" is not present in HTTP auth configuration.')
            if 'user' not in domain:
                raise McCrawlerAuthenticatedDomainsException('"user" is not present in HTTP auth configuration.')
            if 'password' not in domain:
                raise McCrawlerAuthenticatedDomainsException('"password" is not present in HTTP auth configuration.')

            domain_http_auth_lookup[domain['domain'].lower()] = domain

    return domain_http_auth_lookup
def __init__(self,
             to: Union[str, List[str]],
             subject: str,
             text_body: str,
             html_body: Optional[str] = None,
             cc: Optional[Union[str, List[str]]] = None,
             bcc: Optional[Union[str, List[str]]] = None):
    """Email message constructor."""
    config = py_get_config()

    self.from_ = config['mail']['from_address']

    self.subject = decode_object_from_bytes_if_needed(subject)
    self.text_body = decode_object_from_bytes_if_needed(text_body)
    self.html_body = decode_object_from_bytes_if_needed(html_body)

    self.to = decode_object_from_bytes_if_needed(to)
    if isinstance(self.to, str):
        self.to = [self.to]

    self.cc = decode_object_from_bytes_if_needed(cc)
    if isinstance(self.cc, str):
        self.cc = [self.cc]

    self.bcc = decode_object_from_bytes_if_needed(bcc)
    if isinstance(self.bcc, str):
        self.bcc = [self.bcc]
def __init__(self): """Constructor.""" # "requests" session to carry the cookie pool around self.__session = requests.Session() config = py_get_config() self.__session.headers.update({ 'From': config['mediawords']['owner'], 'User-Agent': config['mediawords']['user_agent'], 'Accept-Charset': 'utf-8', # MC_REWRITE_TO_PYTHON: # # Disable keep-alive (and fancy requests' connection pooling) because rudimentary HTTP server used for Perl # unit tests doesn't support it (but then maybe we don't want keep-alive anyway) 'Connection': 'close', }) self.set_max_redirect(self.__DEFAULT_MAX_REDIRECT) self.__timeout = None self.set_timeout(self.__DEFAULT_TIMEOUT) self.__max_size = None self.set_max_size(self.__DEFAULT_MAX_SIZE) # Disable retries by default; if client wants those, it should call # timing() itself, e.g. set it to '1,2,4,8' self.__timing = None self.set_timing(None)
def _request_for_text(self, text: str) -> Request:
    text = decode_object_from_bytes_if_needed(text)

    # NYTLabels annotator URL
    config = py_get_config()
    url = config.get('nytlabels', {}).get('annotator_url', None)
    if url is None:
        raise McNYTLabelsAnnotatorException("Unable to determine NYTLabels annotator URL to use.")

    # Create JSON request
    log.debug("Converting text to JSON request...")
    try:
        text_json = encode_json({'text': text})
    except Exception as ex:
        # Not critical, might happen to some stories, no need to shut down the annotator
        raise McNYTLabelsAnnotatorException(
            "Unable to encode text to a JSON request: %(exception)s\nText: %(text)s" % {
                'exception': str(ex),
                'text': text,
            }
        )
    log.debug("Done converting text to JSON request.")

    request = Request(method='POST', url=url)
    request.set_content_type('application/json; charset=utf-8')
    request.set_content(text_json)

    return request
def __blacklist_request_if_needed(request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    blacklist_url_pattern = None
    if 'blacklist_url_pattern' in config['mediawords']:
        blacklist_url_pattern = config['mediawords']['blacklist_url_pattern']

    if blacklist_url_pattern is not None and len(blacklist_url_pattern) > 0:
        if re.search(pattern=blacklist_url_pattern, string=url, flags=re.IGNORECASE | re.UNICODE):
            request.set_url("http://blacklistedsite.localhost/%s" % url)

    return request
def get_test_s3_credentials() -> Union[dict, None]:
    """Return test Amazon S3 credentials as a dictionary or None if credentials are not configured."""
    config = py_get_config()

    credentials = None

    # Environment variables
    if os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID') is not None:
        credentials = {
            'access_key_id': os.getenv('MC_AMAZON_S3_TEST_ACCESS_KEY_ID', None),
            'secret_access_key': os.getenv('MC_AMAZON_S3_TEST_SECRET_ACCESS_KEY', None),
            'bucket_name': os.getenv('MC_AMAZON_S3_TEST_BUCKET_NAME', None),
            'directory_name': os.getenv('MC_AMAZON_S3_TEST_DIRECTORY_NAME', None),
        }

    # mediawords.yml
    elif 'amazon_s3' in config and 'test' in config['amazon_s3']:
        credentials = copy.deepcopy(config['amazon_s3']['test'])

    # We want to be able to run S3 tests in parallel
    if credentials is not None:
        credentials['directory_name'] = credentials['directory_name'] + '-' + random_string(64)

    return credentials
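# A minimal usage sketch (not part of the original source): callers are expected to treat a None
# return value as "S3 test credentials are not configured" and skip the S3 tests; the skip
# mechanism shown here is hypothetical.
test_credentials = get_test_s3_credentials()
if test_credentials is None:
    log.warning("Amazon S3 test credentials are not configured; skipping S3 tests.")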
def __init__(self, job_class: Type[AbstractJob]):
    """Return job broker (Celery app object) prepared for the specific job class."""

    if job_class is None:
        raise McJobBrokerAppException("Job class is None.")

    queue_name = job_class.queue_name()
    if queue_name is None:
        raise McJobBrokerAppException("Queue name is None.")
    if len(queue_name) == 0:
        raise McJobBrokerAppException("Queue name is empty.")

    config = py_get_config()
    rabbitmq_config = config.get('job_manager', {}).get('rabbitmq', {}).get('client', None)
    if rabbitmq_config is None:
        raise McJobBrokerAppException("No supported job broker is configured.")

    broker_uri = 'amqp://%(username)s:%(password)s@%(hostname)s:%(port)d/%(vhost)s' % {
        'username': rabbitmq_config['username'],
        'password': rabbitmq_config['password'],
        'hostname': rabbitmq_config['hostname'],
        'port': int(rabbitmq_config['port']),
        'vhost': rabbitmq_config['vhost'],
    }

    super().__init__(queue_name, broker=broker_uri)

    self.conf.broker_connection_timeout = int(rabbitmq_config['timeout'])

    worker_concurrency = config.get('celery', {}).get(job_class.__name__, {}).get('worker_concurrency', 1)
    self.conf.worker_concurrency = worker_concurrency

    # Fetch only one job at a time
    self.conf.worker_prefetch_multiplier = 1

    self.conf.worker_max_tasks_per_child = 1000

    queue = Queue(
        name=queue_name,
        exchange=Exchange(queue_name),
        routing_key=queue_name,
        queue_arguments={
            'x-max-priority': 3,
            'x-queue-mode': 'lazy',
        },
    )
    self.conf.task_queues = [queue]

    def __route_task(name, args, kwargs, options, task=None, **kw):
        return {'queue': name, 'exchange': name, 'routing_key': name}

    self.conf.task_routes = (__route_task,)

    task = job_class()
    self.__task = self.register_task(task.celery_task())
    self.__job_class = job_class
def __init__(self, job_class: Type[AbstractJob]):
    """Return job broker (Celery app object) prepared for the specific job class."""

    if job_class is None:
        raise McJobBrokerAppException("Job class is None.")

    queue_name = job_class.queue_name()
    if queue_name is None:
        raise McJobBrokerAppException("Queue name is None.")
    if len(queue_name) == 0:
        raise McJobBrokerAppException("Queue name is empty.")

    config = py_get_config()
    rabbitmq_config = config.get('job_manager', {}).get('rabbitmq', {}).get('client', None)
    if rabbitmq_config is None:
        raise McJobBrokerAppException("No supported job broker is configured.")

    broker_uri = 'amqp://%(username)s:%(password)s@%(hostname)s:%(port)d/%(vhost)s' % {
        'username': rabbitmq_config['username'],
        'password': rabbitmq_config['password'],
        'hostname': rabbitmq_config['hostname'],
        'port': int(rabbitmq_config['port']),
        'vhost': rabbitmq_config['vhost'],
    }

    super().__init__(queue_name, broker=broker_uri)

    self.conf.broker_connection_timeout = int(rabbitmq_config['timeout'])

    # Concurrency is done by Supervisor, not Celery itself
    self.conf.worker_concurrency = 1

    # Fetch only one job at a time
    self.conf.worker_prefetch_multiplier = 1

    self.conf.worker_max_tasks_per_child = 1000

    queue = Queue(
        name=queue_name,
        exchange=Exchange(queue_name),
        routing_key=queue_name,
        queue_arguments={
            'x-max-priority': 3,
            'x-queue-mode': 'lazy',
        },
    )
    self.conf.task_queues = [queue]

    # noinspection PyUnusedLocal
    def __route_task(name, args_, kwargs_, options_, task_=None, **kw_):
        return {'queue': name, 'exchange': name, 'routing_key': name}

    self.conf.task_routes = (__route_task,)

    task = job_class()
    self.__task = self.register_task(task.celery_task())
    self.__job_class = job_class
def __should_continue_with_outdated_schema(self, current_schema_version: int, target_schema_version: int) -> bool:
    """Schema is outdated / too new; returns 1 if MC should continue nevertheless, 0 otherwise"""
    config = py_get_config()

    config_ignore_schema_version = False
    if 'ignore_schema_version' in config['mediawords']:
        config_ignore_schema_version = config["mediawords"]["ignore_schema_version"]

    if config_ignore_schema_version or self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
        log.warning("""
            The current Media Cloud database schema is older than the schema present in mediawords.sql,
            but %s is set so continuing anyway.
        """ % self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
        return True
    else:
        log.warning("""
            ################################

            The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

            The database schema currently running in the database is %(current_schema_version)s,
            and the schema version in the mediawords.sql is %(target_schema_version)s.

            Please run:

                ./script/run_in_env.sh ./script/mediawords_upgrade_db.pl --import

            to automatically upgrade the database schema to the latest version.

            If you want to connect to the Media Cloud database anyway (ignoring the schema version),
            set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

            or

                %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/run_in_env.sh ./script/your_script.pl

            ################################
        """ % {
            "current_schema_version": current_schema_version,
            "target_schema_version": target_schema_version,
            "IGNORE_SCHEMA_VERSION_ENV_VARIABLE": self.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
        })
        return False
def __log_request(request: Request) -> None:
    """Log HTTP request."""
    # FIXME use Python's logging facilities

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

    with open(http_request_log_path, 'a') as f:

        while True:
            try:
                fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except IOError as e:
                # raise on unrelated IOErrors
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    log.warning("Waiting for HTTP request log lock...")
                    time.sleep(0.1)

        f.write("%s %s\n" % (sql_now(), url,))

        # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

        fcntl.flock(f, fcntl.LOCK_UN)

    # Processes from various users (web service, workers, ...) will want to write to the same file
    try:
        os.chmod(http_request_log_path, 0o666)
    except PermissionError as ex:
        # Web server process might attempt to chmod the file without the appropriate permissions
        log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
        pass
def rotate_supervisor_logs():
    root_path = mc_root_path()
    l.debug('Media Cloud root path: %s' % root_path)

    config = py_get_config()
    child_log_dir = config['supervisor']['childlogdir']
    l.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    l.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    l.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    l.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    l.info('Running logrotate...')
    subprocess.check_call(['logrotate', '--verbose', '--state', logrotate_state_file, logrotate_temp_config_path])

    l.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
def setUpClass(cls) -> None:
    """Create a fresh template database from mediawords.sql.

    The template database will be used to execute the
    'create database mediacloud_test template mediacloud_test_template' functionality to create a fresh database
    for each individual unit test.  Recreating from a template is much faster than creating a database from
    scratch from our large schema.
    """
    super().setUpClass()

    config = py_get_config()
    db_config = list(filter(lambda x: x['label'] == cls.TEST_DB_LABEL, config['database']))
    if len(db_config) < 1:
        raise McTestDatabaseTestCaseException("Unable to find %s database in mediawords.yml" % cls.TEST_DB_LABEL)

    cls.db_name = (db_config[0])['db']

    cls.template_db_name = config['mediawords'].get('test_template_db_name', None)
    if cls.template_db_name is not None:
        log.warning("use existing test db template: %s" % cls.template_db_name)
        return

    log.info("create test db template")
    cls.template_db_name = cls.db_name + '_template'

    # we insert this db name directly into sql, so be paranoid about what is in it
    if re.search('[^a-z0-9_]', cls.db_name, flags=re.I) is not None:
        raise McTestDatabaseTestCaseException("Illegal table name: " + cls.db_name)

    # mediacloud_test should already exist, so we have to connect to it to create the template database
    db = connect_to_db(label=cls.TEST_DB_LABEL, do_not_check_schema_version=True)
    cls.__kill_connections_to_database(db=db, database_name=cls.template_db_name)
    db.query("DROP DATABASE IF EXISTS {}".format(cls.template_db_name))
    db.query("CREATE DATABASE {}".format(cls.template_db_name))
    db.disconnect()

    recreate_db(label=cls.TEST_DB_LABEL, is_template=True)
def rotate_supervisor_logs():
    root_path = mc_root_path()
    log.debug('Media Cloud root path: %s' % root_path)

    config = py_get_config()
    child_log_dir = config['supervisor']['childlogdir']
    log.debug('Child log directory: %s' % child_log_dir)

    supervisor_logs_dir = os.path.join(root_path, child_log_dir)
    log.info('Supervisor logs path: %s' % supervisor_logs_dir)

    logrotate_state_file = os.path.join(supervisor_logs_dir, 'logrotate.state')
    log.debug('logrotate state file: %s' % logrotate_state_file)

    if not os.path.isdir(supervisor_logs_dir):
        raise Exception('Supervisor logs directory does not exist at path: %s' % supervisor_logs_dir)

    logrotate_config = '''
%(supervisor_logs_dir)s/*.log {
    size %(log_max_size)d
    rotate %(old_log_count)d
    copytruncate
    compress
    missingok
    notifempty
}
''' % {
        'supervisor_logs_dir': supervisor_logs_dir,
        'log_max_size': __LOG_MAX_SIZE,
        'old_log_count': __OLD_LOG_COUNT,
    }

    logrotate_temp_fd, logrotate_temp_config_path = tempfile.mkstemp(suffix='.conf', prefix='logrotate')
    log.debug('Temporary logrotate config path: %s' % logrotate_temp_config_path)

    with os.fdopen(logrotate_temp_fd, 'w') as tmp:
        tmp.write(logrotate_config)

    log.info('Running logrotate...')
    subprocess.check_call(['logrotate', '--verbose', '--state', logrotate_state_file, logrotate_temp_config_path])

    log.debug('Cleaning up temporary logrotate config...')
    os.unlink(logrotate_temp_config_path)
def _request_for_text(self, text: str) -> Request:
    text = decode_object_from_bytes_if_needed(text)

    # CLIFF annotator URL
    config = py_get_config()
    url = config.get('cliff', {}).get('annotator_url', None)
    if url is None:
        raise McCLIFFAnnotatorException("Unable to determine CLIFF annotator URL to use.")

    request = Request(method='POST', url=url)
    request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
    request.set_content({'q': text})

    return request
def __should_continue_with_outdated_schema(current_schema_version: int, target_schema_version: int) -> bool:
    """Schema is outdated / too new; returns 1 if MC should continue nevertheless, 0 otherwise"""
    config = py_get_config()

    config_ignore_schema_version = False
    if 'ignore_schema_version' in config['mediawords']:
        config_ignore_schema_version = config["mediawords"]["ignore_schema_version"]

    if config_ignore_schema_version or DatabaseHandler.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE in os.environ:
        log.warning("""
            The current Media Cloud database schema is older than the schema present in mediawords.sql,
            but %s is set so continuing anyway.
        """ % DatabaseHandler.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE)
        return True
    else:
        log.warning("""
            ################################

            The current Media Cloud database schema is not the same as the schema present in mediawords.sql.

            The database schema currently running in the database is %(current_schema_version)s,
            and the schema version in the mediawords.sql is %(target_schema_version)s.

            Please run:

                ./script/run_in_env.sh ./script/upgrade_db.pl --import

            to automatically upgrade the database schema to the latest version.

            If you want to connect to the Media Cloud database anyway (ignoring the schema version),
            set the %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s environment variable as such:

                %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/your_script.py

            or

                %(IGNORE_SCHEMA_VERSION_ENV_VARIABLE)s=1 ./script/run_in_env.sh ./script/your_script.pl

            ################################
        """ % {
            "current_schema_version": current_schema_version,
            "target_schema_version": target_schema_version,
            "IGNORE_SCHEMA_VERSION_ENV_VARIABLE": DatabaseHandler.__IGNORE_SCHEMA_VERSION_ENV_VARIABLE,
        })
        return False
def __log_request(request: Request) -> None:
    """Log HTTP request."""
    # FIXME use Python's logging facilities

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log')

    with open(http_request_log_path, encoding='utf-8', mode='a') as f:

        while True:
            try:
                fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
                break
            except IOError as e:
                # raise on unrelated IOErrors
                if e.errno != errno.EAGAIN:
                    raise
                else:
                    log.warning("Waiting for HTTP request log lock...")
                    time.sleep(0.1)

        f.write("%s %s\n" % (sql_now(), url,))

        # Doesn't write "invalidating blacklist url <...>" because it's apparent from the URL itself

        fcntl.flock(f, fcntl.LOCK_UN)

    # Processes from various users (web service, workers, ...) will want to write to the same file
    try:
        os.chmod(http_request_log_path, 0o666)
    except PermissionError as ex:
        # Web server process might attempt to chmod the file without the appropriate permissions
        log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),))
        pass
def test_run_block_with_large_work_mem(self):
    normal_work_mem = 256  # MB
    large_work_mem = 512  # MB

    old_large_work_mem = None
    config = py_get_config()
    if 'large_work_mem' in config['mediawords']:
        old_large_work_mem = config['mediawords']['large_work_mem']

    config['mediawords']['large_work_mem'] = '%dMB' % large_work_mem
    py_set_config(config)

    self.db().query("SET work_mem TO %s", ('%sMB' % normal_work_mem,))

    current_work_mem = int(self.db().query("""
        SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
    """).flat()[0])
    assert current_work_mem == normal_work_mem * 1024

    def __test_run_block_with_large_work_mem_inner():
        self.db().execute_with_large_work_mem("""
            INSERT INTO execute_large_work_mem (work_mem)
            SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
        """)

    self.db().query('CREATE TEMPORARY TABLE execute_large_work_mem (work_mem INT NOT NULL)')

    self.db().run_block_with_large_work_mem(__test_run_block_with_large_work_mem_inner)

    statement_work_mem = int(self.db().query("""
        SELECT work_mem FROM execute_large_work_mem
    """).flat()[0])
    assert statement_work_mem == large_work_mem * 1024

    current_work_mem = int(self.db().query("""
        SELECT setting::INT FROM pg_settings WHERE name = 'work_mem'
    """).flat()[0])
    assert current_work_mem == normal_work_mem * 1024

    config['mediawords']['large_work_mem'] = old_large_work_mem
    py_set_config(config)
def setUp(self) -> None:
    """Create a fresh testing database for each unit test.

    The first time this function is called within a given process, it will create a template database from
    mediawords.sql.  For each test, it will create a new test database using the postgres
    'create database mediacloud_test template mediacloud_test_template' functionality.  Recreating each unit test
    database from the template is much faster than recreating from mediawords.sql.
    """
    test_db_label = 'test'

    config = py_get_config()
    db_config = list(filter(lambda x: x['label'] == test_db_label, config['database']))
    if len(db_config) < 1:
        raise McTestDatabaseTestCaseException("Unable to find %s database in mediawords.yml" % [test_db_label])

    db_name = (db_config[0])['db']
    template_db_name = db_name + '_template'

    if re.search('[^a-z0-9_]', db_name, flags=re.I) is not None:
        raise McTestDatabaseTestCaseException("Illegal table name: " + db_name)

    if not TestDatabaseWithSchemaTestCase._template_db_created:
        log.info("create test db template")

        # mediacloud_test should already exist, so we have to connect to it to create the template database
        db = connect_to_db(label=test_db_label, do_not_check_schema_version=True)
        db.query("drop database if exists %s" % (template_db_name,))
        db.query("create database %s" % (template_db_name,))
        db.disconnect()

        recreate_db(label=test_db_label, is_template=True)
        TestDatabaseWithSchemaTestCase._template_db_created = True

    # now connect to the template database to execute the create command for the test database
    log.info("recreate test db template")
    db = connect_to_db(label=test_db_label, is_template=True)
    db.query("drop database if exists %s" % (db_name,))
    db.query("create database %s template %s" % (db_name, template_db_name))
    db.disconnect()

    db = connect_to_db(label=test_db_label)
    force_using_test_database()
    self.__db = db
def _create_database_handler():
    log.info("Looking for test database credentials...")
    test_database = None
    config = py_get_config()
    for database in config['database']:
        if database['label'] == 'test':
            test_database = database
            break
    assert test_database is not None

    log.info("Connecting to test database '%s' via DatabaseHandler class..." % test_database['db'])
    db = DatabaseHandler(
        host=test_database['host'],
        port=test_database['port'],
        username=test_database['user'],
        password=test_database['pass'],
        database=test_database['db']
    )

    return db
def create_database_handler() -> DatabaseHandler:
    log.info("Looking for test database credentials...")
    test_database = None
    config = py_get_config()
    for database in config['database']:
        if database['label'] == 'test':
            test_database = database
            break
    assert test_database is not None

    log.info("Connecting to test database '%s' via DatabaseHandler class..." % test_database['db'])
    db = DatabaseHandler(
        host=test_database['host'],
        port=test_database['port'],
        username=test_database['user'],
        password=test_database['pass'],
        database=test_database['db']
    )

    return db
def setUpClass(cls) -> None:
    """Create a fresh template database from mediawords.sql.

    The template database will be used to execute the
    'create database mediacloud_test template mediacloud_test_template' functionality to create a fresh database
    for each individual unit test.  Recreating from a template is much faster than creating a database from
    scratch from our large schema.
    """
    log.info("create test db template")

    config = py_get_config()
    db_config = list(filter(lambda x: x['label'] == cls.TEST_DB_LABEL, config['database']))
    if len(db_config) < 1:
        raise McTestDatabaseTestCaseException("Unable to find %s database in mediawords.yml" % cls.TEST_DB_LABEL)

    cls.db_name = (db_config[0])['db']
    cls.template_db_name = cls.db_name + '_template'

    # we only want to run this once per test suite for all database test cases, so this needs to be a global
    global _template_db_created
    if _template_db_created:
        return

    # we insert this db name directly into sql, so be paranoid about what is in it
    if re.search('[^a-z0-9_]', cls.db_name, flags=re.I) is not None:
        raise McTestDatabaseTestCaseException("Illegal table name: " + cls.db_name)

    # mediacloud_test should already exist, so we have to connect to it to create the template database
    db = connect_to_db(label=cls.TEST_DB_LABEL, do_not_check_schema_version=True)
    db.query("drop database if exists %s" % (cls.template_db_name,))
    db.query("create database %s" % (cls.template_db_name,))
    db.disconnect()

    recreate_db(label=cls.TEST_DB_LABEL, is_template=True)

    _template_db_created = True
def __blacklist_request_if_needed(request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests

    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if len(url) == 0:
        raise McRequestException("URL is empty.")

    config = py_get_config()

    blacklist_url_pattern = None
    if 'blacklist_url_pattern' in config['mediawords']:
        blacklist_url_pattern = config['mediawords']['blacklist_url_pattern']

    if blacklist_url_pattern is not None and len(blacklist_url_pattern) > 0:
        if re.search(pattern=blacklist_url_pattern, string=url, flags=re.IGNORECASE | re.UNICODE) is not None:
            request.set_url("http://0.0.0.1/%s" % url)

    return request
def _tags_for_annotation(self, annotation: Union[dict, list]) -> List[JSONAnnotator.Tag]:
    annotation = decode_object_from_bytes_if_needed(annotation)

    config = py_get_config()

    nytlabels_config = config.get('nytlabels', None)
    if nytlabels_config is None:
        raise McNYTLabelsAnnotatorException("NYTLabels is not configured.")

    nytlabels_labels_tag_set = nytlabels_config.get('nytlabels_labels_tag_set', None)
    if nytlabels_labels_tag_set is None:
        raise McNYTLabelsAnnotatorException("NYTLabels labels tag set is unset in configuration.")

    nytlabels_version_tag = nytlabels_config.get('nytlabels_version_tag', None)
    if nytlabels_version_tag is None:
        raise McNYTLabelsAnnotatorException("NYTLabels version tag is unset in configuration.")

    tags = list()

    tags.append(JSONAnnotator.Tag(
        tag_sets_name=self.__NYTLABELS_VERSION_TAG_SET,
        tag_sets_label=self.__NYTLABELS_VERSION_TAG_SET,
        tag_sets_description='NYTLabels version the story was tagged with',
        tags_name=nytlabels_version_tag,
        tags_label=nytlabels_version_tag,
        tags_description="Story was tagged with '%s'" % nytlabels_version_tag,
    ))

    descriptors600 = annotation.get('descriptors600', None)
    if descriptors600 is not None and len(descriptors600) > 0:

        for descriptor in descriptors600:

            label = descriptor['label']
            score = float(descriptor['score'])

            if score > self.__NYTLABELS_SCORE_THRESHOLD:
                tags.append(JSONAnnotator.Tag(
                    tag_sets_name=nytlabels_labels_tag_set,
                    tag_sets_label=nytlabels_labels_tag_set,
                    tag_sets_description='NYTLabels labels',

                    # e.g. "hurricanes and tropical storms"
                    tags_name=label,
                    tags_label=label,
                    tags_description=label,
                ))

            else:
                log.debug(
                    "Skipping label '%(label)s' because its score %(score)2.6f "
                    "is lower than the threshold %(threshold)2.6f" % {
                        'label': label,
                        'score': score,
                        'threshold': self.__NYTLABELS_SCORE_THRESHOLD,
                    }
                )

    return tags
def test_nyt_labels_annotator(self):
    media = self.db().create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = self.db().create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    self.db().create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en'
    })

    def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(self.__sample_nyt_labels_response())
        return response

    pages = {
        '/predict.json': {
            'callback': __nyt_labels_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/predict.json' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Inject NYTLabels credentials into configuration
    config = py_get_config()
    new_config = copy.deepcopy(config)
    new_config['nytlabels'] = {
        'enabled': True,
        'annotator_url': annotator_url,
    }
    py_set_config(new_config)

    nytlabels = NYTLabelsAnnotator()
    nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
    nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

    hs.stop()

    # Reset configuration
    py_set_config(config)

    annotation_exists = self.db().query("""
        SELECT 1
        FROM nytlabels_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None

    story_tags = self.db().query("""
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
    """, {'stories_id': stories_id}).hashes()

    expected_tags = self.__expected_tags()

    assert story_tags == expected_tags
def send_email(message: Message) -> bool:
    """Send email to someone. Returns True on success, False on failure. Raises on programming error."""

    if message is None:
        raise McSendEmailException('Message is None.')

    if not message.from_:
        raise McSendEmailException("'from' is unset.")
    if message.to and (not isinstance(message.to, list)):
        raise McSendEmailException("'to' is not a list.")
    if message.cc and (not isinstance(message.cc, list)):
        raise McSendEmailException("'cc' is not a list.")
    if message.bcc and (not isinstance(message.bcc, list)):
        raise McSendEmailException("'bcc' is not a list.")

    if not (len(message.to) > 0 or len(message.cc) > 0 or len(message.bcc) > 0):
        raise McSendEmailException("No one to send the email to.")

    if not message.subject:
        raise McSendEmailException("'subject' is unset.")
    if not (message.text_body or message.html_body):
        raise McSendEmailException("No message body.")

    try:

        # Create message
        mime_message = MIMEMultipart('alternative')

        mime_message['Subject'] = '[Media Cloud] %s' % message.subject
        mime_message['From'] = message.from_

        if message.to:
            mime_message['To'] = ', '.join(message.to)
        else:
            mime_message['To'] = 'undisclosed recipients'

        if message.cc:
            mime_message['Cc'] = ', '.join(message.cc)
        if message.bcc:
            mime_message['Bcc'] = ', '.join(message.bcc)

        if message.text_body:
            message_part = MIMEText(message.text_body, 'plain', 'utf-8')
            mime_message.attach(message_part)

        # HTML gets attached last, thus making it a preferred part as per RFC
        if message.html_body:
            message_part = MIMEText(message.html_body, 'html', 'utf-8')
            mime_message.attach(message_part)

        if test_mode_is_enabled():

            log.info("Test mode is enabled, not actually sending any email.")
            log.debug("Omitted email:\n\n%s" % mime_message.as_string())

        else:

            # Connect to SMTP
            config = py_get_config()
            smtp_config = config['mail']['smtp']
            smtp = smtplib.SMTP(host=smtp_config['host'], port=smtp_config['port'])

            if smtp_config['starttls']:
                smtp.starttls()

            if smtp_config['username'] and smtp_config['password']:
                smtp.login(user=smtp_config['username'], password=smtp_config['password'])

            # Send message
            refused_recipients = smtp.sendmail(mime_message['From'], mime_message['To'], mime_message.as_string())
            if len(refused_recipients):
                log.warning("Unable to send email to the following recipients: %s" % str(refused_recipients))

            smtp.quit()

    except Exception as ex:
        log.warning('Unable to send email to %s: %s' % (message.to, str(ex)))
        return False

    return True
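# A minimal usage sketch (not part of the original source), combining the Message constructor shown
# earlier with send_email(); the recipient address and message text below are hypothetical.
report_message = Message(
    to='someone@example.com',
    subject='Monthly report',
    text_body='The monthly report is ready.',
)
if not send_email(report_message):
    log.warning("Sending the report email failed.")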
def parallel_get(urls: List[str]) -> List[Response]:
    """GET multiple URLs in parallel."""

    # FIXME doesn't respect timing() and other object properties

    urls = decode_object_from_bytes_if_needed(urls)

    # Original implementation didn't raise on undefined / empty list of URLs
    if urls is None:
        return []
    if len(urls) == 0:
        return []

    # Remove duplicates from list while maintaining order because:
    # 1) We don't want to fetch the same URL twice
    # 2) URLs are being used as unique dictionary IDs later on
    urls_before_removing_duplicates = urls.copy()
    urls = list(OrderedDict.fromkeys(urls))
    if len(urls) != len(urls_before_removing_duplicates):
        log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

    # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
    # get() in a fork should be able to come up with a reasonable Response object for it
    for url in urls:
        if not is_http_url(url):
            raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

    config = py_get_config()

    if 'web_store_num_parallel' not in config['mediawords']:
        raise McParallelGetException('"web_store_num_parallel" is not set.')
    num_parallel = config['mediawords']['web_store_num_parallel']

    if 'web_store_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_timeout" is not set.')
    timeout = config['mediawords']['web_store_timeout']

    if 'web_store_per_domain_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_per_domain_timeout" is not set.')
    per_domain_timeout = config['mediawords']['web_store_per_domain_timeout']

    url_stack = UserAgent.__get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

    start_time = time.time()

    url_blocks = {}
    while len(url_stack) > 0:
        block_i = len(url_stack) % num_parallel

        if block_i not in url_blocks:
            url_blocks[block_i] = []

        url_blocks[block_i].append(url_stack.pop())

    pool = multiprocessing.Pool(processes=num_parallel)

    all_results = []
    for i, url_block in url_blocks.items():
        result = pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
        all_results.append(result)

    all_responses = []
    for result in all_results:
        responses = result.get()
        all_responses = all_responses + responses

    # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
    pool.close()
    pool.join()
    pool.terminate()

    # Sort URLs in parameter order
    # (if URLs weren't split into blocks, we could probably use map_async)
    response_url_map = {}
    for response in all_responses:
        url = response.scheduled_url.url
        response_url_map[url] = response.response

    sorted_responses = []
    for url in urls:
        if url not in response_url_map:
            raise McParallelGetException("URL %s is not in the response URL map %s." % (url, response_url_map,))
        sorted_responses.append(response_url_map[url])

    if len(urls) != len(sorted_responses):
        raise McParallelGetException(
            "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
        )

    return sorted_responses
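# A minimal usage sketch (not part of the original source), assuming parallel_get() is exposed as a
# UserAgent method (the name-mangled __get_scheduled_urls call above suggests it lives in that class);
# the URLs below are hypothetical. Responses are returned in the same order as the requested URLs.
ua = UserAgent()
responses = ua.parallel_get([
    'http://www.example.com/page-1',
    'http://www.example.com/page-2',
])
for response in responses:
    log.info("Got response: %s" % str(response))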
def _tags_for_annotation(self, annotation: Union[dict, list]) -> List[JSONAnnotator.Tag]:
    annotation = decode_object_from_bytes_if_needed(annotation)

    config = py_get_config()

    cliff_config = config.get('cliff', None)
    if cliff_config is None:
        raise McCLIFFAnnotatorException("CLIFF is not configured.")

    cliff_version_tag = cliff_config.get('cliff_version_tag', None)
    if cliff_version_tag is None:
        raise McCLIFFAnnotatorException("CLIFF version tag is unset in configuration.")

    cliff_geonames_tag_set = cliff_config.get('cliff_geonames_tag_set', None)
    if cliff_geonames_tag_set is None:
        raise McCLIFFAnnotatorException("CLIFF geographical names tag set is unset in configuration.")

    cliff_organizations_tag_set = cliff_config.get('cliff_organizations_tag_set', None)
    if cliff_organizations_tag_set is None:
        raise McCLIFFAnnotatorException("CLIFF organizations tag set is unset in configuration.")

    cliff_people_tag_set = cliff_config.get('cliff_people_tag_set', None)
    if cliff_people_tag_set is None:
        raise McCLIFFAnnotatorException("CLIFF people tag set is unset in configuration.")

    tags = list()

    tags.append(JSONAnnotator.Tag(
        tag_sets_name=self.__CLIFF_VERSION_TAG_SET,
        tag_sets_label=self.__CLIFF_VERSION_TAG_SET,
        tag_sets_description='CLIFF version the story was tagged with',
        tags_name=cliff_version_tag,
        tags_label=cliff_version_tag,
        tags_description="Story was tagged with '%s'" % cliff_version_tag,
    ))

    results = annotation.get('results', None)
    if results is None or len(results) == 0:
        return tags

    organizations = results.get('organizations', None)
    if organizations is not None:
        for organization in organizations:
            tags.append(JSONAnnotator.Tag(
                tag_sets_name=cliff_organizations_tag_set,
                tag_sets_label=cliff_organizations_tag_set,
                tag_sets_description='CLIFF organizations',

                # e.g. "United Nations"
                tags_name=organization['name'],
                tags_label=organization['name'],
                tags_description=organization['name'],
            ))

    people = results.get('people', None)
    if people is not None:
        for person in people:
            tags.append(JSONAnnotator.Tag(
                tag_sets_name=cliff_people_tag_set,
                tag_sets_label=cliff_people_tag_set,
                tag_sets_description='CLIFF people',

                # e.g. "Einstein"
                tags_name=person['name'],
                tags_label=person['name'],
                tags_description=person['name'],
            ))

    places = results.get('places', None)
    if places is not None:
        focus = places.get('focus', None)
        if focus is not None:

            countries = focus.get('countries', None)
            if countries is not None:
                for country in countries:
                    tags.append(JSONAnnotator.Tag(
                        tag_sets_name=cliff_geonames_tag_set,
                        tag_sets_label=cliff_geonames_tag_set,
                        tag_sets_description='CLIFF geographical names',

                        # e.g. "geonames_6252001"
                        tags_name=self.__CLIFF_GEONAMES_TAG_PREFIX + str(country['id']),

                        # e.g. "United States"
                        tags_label=country['name'],

                        # e.g. "United States | A | US"
                        tags_description='%(name)s | %(feature)s | %(country)s' % {
                            'name': country['name'],
                            'feature': country['featureClass'],
                            'country': country['countryCode'],
                        },
                    ))

            states = focus.get('states', None)
            if states is not None:
                for state in states:
                    tags.append(JSONAnnotator.Tag(
                        tag_sets_name=cliff_geonames_tag_set,
                        tag_sets_label=cliff_geonames_tag_set,
                        tag_sets_description='CLIFF geographical names',

                        # e.g. "geonames_4273857"
                        tags_name=self.__CLIFF_GEONAMES_TAG_PREFIX + str(state['id']),

                        # e.g. "Kansas"
                        tags_label=state['name'],

                        # e.g. "Kansas | A | KS | US"
                        tags_description='%(name)s | %(feature)s | %(state)s | %(country)s' % {
                            'name': state['name'],
                            'feature': state['featureClass'],
                            'state': state['stateCode'],
                            'country': state['countryCode'],
                        },
                    ))

            cities = focus.get('cities', None)
            if cities is not None:
                for city in cities:
                    tags.append(JSONAnnotator.Tag(
                        tag_sets_name=cliff_geonames_tag_set,
                        tag_sets_label=cliff_geonames_tag_set,
                        tag_sets_description='CLIFF geographical names',

                        # e.g. "geonames_4273857"
                        tags_name=self.__CLIFF_GEONAMES_TAG_PREFIX + str(city['id']),

                        # e.g. "Kansas"
                        tags_label=city['name'],

                        # e.g. "Kansas | A | KS | US"
                        tags_description='%(name)s | %(feature)s | %(state)s | %(country)s' % {
                            'name': city['name'],
                            'feature': city['featureClass'],
                            'state': city['stateCode'],
                            'country': city['countryCode'],
                        },
                    ))

    return tags
def get_similarweb_client():
    config = py_get_config()
    return SimilarWebClient(api_key=config['similarweb']['api_key'])
def connect_to_db(label: typing.Optional[str] = None,
                  do_not_check_schema_version: bool = False,
                  is_template: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL.

    Arguments:
    label - db config section label for mediawords.yml
    do_not_check_schema_version - if false, throw an error if the versions in mediawords.yml and the db do not match
    is_template - if true, connect to a db called <db_name>_template instead of <db_name>

    """
    label = decode_str_from_bytes_if_needed(label)

    # If this is Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException("No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException("No default connection settings found.")

        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException("Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    if is_template:
        database = database + "_template"

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex),
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' % {'db_statement_timeout': db_statement_timeout})

    return ret
def parallel_get(urls: List[str]) -> List[Response]:
    """GET multiple URLs in parallel."""

    # FIXME doesn't respect timing() and other object properties

    def __get_url_domain(url_: str) -> str:

        if not is_http_url(url_):
            return url_

        host = get_url_host(url_)

        name_parts = host.split('.')

        n = len(name_parts) - 1

        # for country domains, use last three parts of name
        if re.search(pattern=r"\...$", string=host):
            domain = '.'.join([name_parts[n - 2], name_parts[n - 1], name_parts[0]])

        elif re.search(pattern=r"(localhost|blogspot\.com|wordpress\.com)", string=host):
            domain = url_

        else:
            domain = '.'.join([name_parts[n - 1], name_parts[n]])

        return domain.lower()

    def __get_scheduled_urls(urls_: List[str], per_domain_timeout_: int) -> List[_ParallelGetScheduledURL]:
        """Schedule the URLs by adding a { time => $time } field to each URL to make sure we obey the
        'per_domain_timeout'. Sort requests by ascending time."""
        domain_urls = {}

        for url_ in urls_:
            domain = __get_url_domain(url_=url_)
            if domain not in domain_urls:
                domain_urls[domain] = []
            domain_urls[domain].append(url_)

        scheduled_urls = []

        for domain, urls_in_domain in domain_urls.items():

            time_ = 0
            for domain_url in urls_in_domain:
                domain_url = _ParallelGetScheduledURL(url=domain_url, time_=time_)
                scheduled_urls.append(domain_url)

                if time_ % 5 == 0:  # FIXME why 5?
                    time_ = time_ + per_domain_timeout_

        scheduled_urls = sorted(scheduled_urls, key=lambda x: x.time)

        return scheduled_urls

    # ---

    urls = decode_object_from_bytes_if_needed(urls)

    # Original implementation didn't raise on undefined / empty list of URLs
    if urls is None:
        return []
    if len(urls) == 0:
        return []

    # Remove duplicates from list while maintaining order because:
    # 1) We don't want to fetch the same URL twice
    # 2) URLs are being used as unique dictionary IDs later on
    urls_before_removing_duplicates = urls.copy()
    urls = list(OrderedDict.fromkeys(urls))
    if len(urls) != len(urls_before_removing_duplicates):
        log.warning("Some of the URLs are duplicate; URLs: %s" % str(urls_before_removing_duplicates))

    # Raise on one or more invalid URLs because we consider it a caller's problem; if URL at least looks valid,
    # get() in a fork should be able to come up with a reasonable Response object for it
    for url in urls:
        if not is_http_url(url):
            raise McParallelGetException("URL %s is not a valid URL; URLs: %s" % (url, str(urls),))

    config = py_get_config()

    if 'web_store_num_parallel' not in config['mediawords']:
        raise McParallelGetException('"web_store_num_parallel" is not set.')
    num_parallel = config['mediawords']['web_store_num_parallel']

    if 'web_store_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_timeout" is not set.')
    timeout = config['mediawords']['web_store_timeout']

    if 'web_store_per_domain_timeout' not in config['mediawords']:
        raise McParallelGetException('"web_store_per_domain_timeout" is not set.')
    per_domain_timeout = config['mediawords']['web_store_per_domain_timeout']

    url_stack = __get_scheduled_urls(urls_=urls, per_domain_timeout_=per_domain_timeout)

    start_time = time.time()

    url_blocks = {}
    while len(url_stack) > 0:
        block_i = len(url_stack) % num_parallel

        if block_i not in url_blocks:
            url_blocks[block_i] = []

        url_blocks[block_i].append(url_stack.pop())

    pool = multiprocessing.Pool(processes=num_parallel)

    all_results = []
    for i, url_block in url_blocks.items():
        result = pool.apply_async(_parallel_get_web_store, args=(url_block, start_time, timeout,))
        all_results.append(result)

    all_responses = []
    for result in all_results:
        responses = result.get()
        all_responses = all_responses + responses

    # No timeouts here because we trust the workers to timeout by themselves (by UserAgent)
    pool.close()
    pool.join()
    pool.terminate()

    # Sort URLs in parameter order
    # (if URLs weren't split into blocks, we could probably use map_async)
    response_url_map = {}
    for response in all_responses:
        url = response.scheduled_url.url
        response_url_map[url] = response.response

    sorted_responses = []
    for url in urls:
        if url not in response_url_map:
            raise McParallelGetException("URL %s is not in the response URL map %s." % (url, response_url_map,))
        sorted_responses.append(response_url_map[url])

    if len(urls) != len(sorted_responses):
        raise McParallelGetException(
            "Response count doesn't match URL count; responses: %s; URLs: %s" % (sorted_responses, urls,)
        )

    return sorted_responses
def connect_to_db(label: str = None, do_not_check_schema_version: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL."""

    label = decode_object_from_bytes_if_needed(label)

    # If this is Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException("No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException("No default connection settings found.")

        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException("Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex),
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' % {'db_statement_timeout': db_statement_timeout})

    # Reset the session variable in case the database connection is being reused due to pooling
    ret.query("""
        DO $$
        BEGIN
            PERFORM enable_story_triggers();
        EXCEPTION
            WHEN undefined_function THEN
                -- This exception will be raised if the database is uninitialized at this point.
                -- So, don't emit any kind of error because of an non-existent function.
                NULL;
            WHEN OTHERS THEN
                -- Forward the exception
                RAISE;
        END
        $$;
    """)

    return ret
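# A minimal usage sketch (not part of the original source), assuming a 'test' section exists in the
# mediawords.yml database configuration; the media count query is only an illustration.
db = connect_to_db(label='test', do_not_check_schema_version=False)
media_count = db.query("SELECT COUNT(*) FROM media").flat()[0]
log.info("Media count: %d" % media_count)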