def __init__(self, user_agent_config: Optional[UserAgentConfig] = None):
    """Constructor.

    :param user_agent_config: Optional user agent configuration; when unset,
        falls back to the shared CommonConfig.user_agent() configuration.
    """
    # PEP 484: a parameter defaulting to None is Optional; the annotation now
    # says so explicitly instead of the implicit-Optional form.
    self._user_agent_config = user_agent_config
    if not self._user_agent_config:
        self._user_agent_config = CommonConfig.user_agent()

    # "requests" session to carry the cookie pool around
    self.__session = requests.Session()

    self.__session.headers.update({
        'From': self.__OWNER,
        'User-Agent': self.__USER_AGENT,
        'Accept-Charset': 'utf-8',

        # MC_REWRITE_TO_PYTHON:
        #
        # Disable keep-alive (and fancy requests' connection pooling) because rudimentary HTTP server used for Perl
        # unit tests doesn't support it (but then maybe we don't want keep-alive anyway)
        'Connection': 'close',
    })

    self.set_max_redirect(self.__DEFAULT_MAX_REDIRECT)

    # Initialize attributes first so the setters below validate/overwrite them
    self.__timeout = None
    self.set_timeout(self.__DEFAULT_TIMEOUT)

    self.__max_size = None
    self.set_max_size(self.__DEFAULT_MAX_SIZE)

    # Disable retries by default; if client wants those, it should call
    # timing() itself, e.g. set it to '1,2,4,8'
    self.__timing = None
    self.set_timing(None)
def __init__(self,
             to: Union[str, List[str]],
             subject: str,
             text_body: str,
             html_body: Optional[str] = None,
             cc: Optional[Union[str, List[str]]] = None,
             bcc: Optional[Union[str, List[str]]] = None):
    """Email message constructor."""

    def as_recipient_list(value):
        # A single address may arrive as a bare string; normalize it into a
        # one-element list so downstream code can always iterate recipients.
        return [value] if isinstance(value, str) else value

    self.from_ = CommonConfig.email_from_address()

    self.subject = decode_object_from_bytes_if_needed(subject)
    self.text_body = decode_object_from_bytes_if_needed(text_body)
    self.html_body = decode_object_from_bytes_if_needed(html_body)

    self.to = as_recipient_list(decode_object_from_bytes_if_needed(to))
    self.cc = as_recipient_list(decode_object_from_bytes_if_needed(cc))
    self.bcc = as_recipient_list(decode_object_from_bytes_if_needed(bcc))
def connect_to_db() -> DatabaseHandler:
    """Connect to PostgreSQL.

    Retries the connection up to the configured number of attempts, sleeping
    between attempts. On success, returns a DatabaseHandler; when all attempts
    are exhausted, calls fatal_error() (which terminates the process) instead
    of raising, so callers can't mistake a connectivity outage for bad input.

    :return: Connected DatabaseHandler instance.
    """
    db_config = CommonConfig.database()
    retries_config = db_config.retries()

    # Fixed assert message: the check requires a positive attempt count, and
    # the config key is max_attempts (the old text said "max_tries ... negative").
    assert retries_config.max_attempts() > 0, "max_attempts must be positive."

    db = None

    for attempt in range(1, retries_config.max_attempts() + 1):
        try:
            log.debug("Connecting to PostgreSQL...")

            db = DatabaseHandler(
                host=db_config.hostname(),
                port=db_config.port(),
                username=db_config.username(),
                password=db_config.password(),
                database=db_config.database_name(),
            )
            if not db:
                raise ValueError("Returned value is None.")

            # Return the database handler upon successful connection
            break

        except Exception as ex:
            error_message = (
                f"Unable to connect to "
                f"{db_config.username()}@{db_config.hostname()}:{db_config.port()}"
                f"/{db_config.database_name()}: {ex}"
            )

            log.error(error_message)

            if attempt < retries_config.max_attempts():
                log.info(
                    f"Will retry for #{attempt} time in {retries_config.sleep_between_attempts()} seconds..."
                )
                time.sleep(retries_config.sleep_between_attempts())

            else:
                log.info("Out of retries, giving up and exiting...")

                # Don't throw any exceptions because they might be caught by
                # the try-catch block, and so the caller will just assume that
                # there was something wrong with the input data and proceed
                # with processing next item in the job queue (e.g. the next
                # story). Instead, just quit and wait for someone to restart
                # the whole app that requires database access.
                fatal_error(error_message)

    return db
def __init__(self, queue_name: str):
    """Return job broker (Celery app object) prepared for the specific queue name."""

    assert queue_name, "Queue name is empty."

    self.__queue_name = queue_name

    config = CommonConfig()

    # AMQP broker URI built from the RabbitMQ configuration
    rabbitmq_config = config.rabbitmq()
    amqp_uri = (
        f"amqp://{rabbitmq_config.username()}:{rabbitmq_config.password()}"
        f"@{rabbitmq_config.hostname()}:{rabbitmq_config.port()}"
        f"/{rabbitmq_config.vhost()}"
    )

    # Task results get stored in PostgreSQL
    db_config = CommonConfig.database()
    backend_uri = (
        f"db+postgresql+psycopg2://{db_config.username()}:{db_config.password()}"
        f"@{db_config.hostname()}:{db_config.port()}"
        f"/{db_config.database_name()}"
    )

    self.__app = celery.Celery(queue_name, broker=amqp_uri, backend=backend_uri)

    self.__app.conf.broker_connection_timeout = rabbitmq_config.timeout()

    # Concurrency is done by us, not Celery itself
    self.__app.conf.worker_concurrency = 1

    self.__app.conf.broker_heartbeat = 0

    # https://tech.labs.oliverwyman.com/blog/2015/04/30/making-celery-play-nice-with-rabbitmq-and-bigwig/
    self.__app.conf.broker_transport_options = {'confirm_publish': True}

    self.__app.conf.database_table_names = {
        'task': 'celery_tasks',
        'group': 'celery_groups',
    }

    # Fetch only one job at a time
    self.__app.conf.worker_prefetch_multiplier = 1

    self.__app.conf.worker_max_tasks_per_child = 1000

    self.__app.conf.task_queues = [
        Queue(
            name=queue_name,
            exchange=Exchange(queue_name),
            routing_key=queue_name,
            queue_arguments={
                'x-max-priority': 3,
                'x-queue-mode': 'lazy',
            },
        ),
    ]

    # noinspection PyUnusedLocal
    def route_task(name, args_, kwargs_, options_, task_=None, **kw_):
        # Every task goes to the queue/exchange named after it
        return {
            'queue': name,
            'exchange': name,
            'routing_key': name,
        }

    self.__app.conf.task_routes = (route_task,)
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(
                f"Params is not a dictionary: {params}")

    if content:
        # (fixed error message typo: "not a string not a dictionary")
        if not isinstance(content, (str, dict)):
            raise McSolrRequestInvalidParamsException(
                f"Content is not a string nor a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    # Responses can be arbitrarily large, so don't cap the download size
    ua.set_max_size(None)

    # Remediate CVE-2017-12629
    #
    # NOTE(review): only the GET "q" parameter is inspected here; a "q" key
    # inside POST `content` dict would not be caught — confirm whether callers
    # can pass untrusted queries via `content`.
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException(
            "XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(
                f"Content-Type is not set; falling back to '{fallback_content_type}'"
            )
            content_type = fallback_content_type

        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length', value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(
            f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
def send_email(message: Message) -> bool:
    """Send email to someone.

    Returns True on success, False on failure. Raises on programming error.

    :param message: Message object with at least one recipient in to/cc/bcc.
    :return: True if the message was handed to the SMTP server (or test mode
             swallowed it), False if sending failed.
    """
    if message is None:
        raise McSendEmailException('Message is None.')

    if not message.from_:
        raise McSendEmailException("'from' is unset.")
    if message.to and (not isinstance(message.to, list)):
        raise McSendEmailException("'to' is not a list.")
    if message.cc and (not isinstance(message.cc, list)):
        raise McSendEmailException("'cc' is not a list.")
    if message.bcc and (not isinstance(message.bcc, list)):
        raise McSendEmailException("'bcc' is not a list.")

    # 'cc' / 'bcc' may be None (Message allows that), so test truthiness
    # instead of len() which would raise TypeError on None
    if not (message.to or message.cc or message.bcc):
        raise McSendEmailException("No one to send the email to.")

    if not message.subject:
        raise McSendEmailException("'subject' is unset.")
    if not (message.text_body or message.html_body):
        raise McSendEmailException("No message body.")

    try:
        # Create message
        mime_message = MIMEMultipart('alternative')
        mime_message['Subject'] = '[Media Cloud] %s' % message.subject
        mime_message['From'] = message.from_
        if message.to:
            mime_message['To'] = ', '.join(message.to)
        else:
            mime_message['To'] = 'undisclosed recipients'
        if message.cc:
            mime_message['Cc'] = ', '.join(message.cc)
        if message.bcc:
            mime_message['Bcc'] = ', '.join(message.bcc)

        if message.text_body:
            message_part = MIMEText(message.text_body, 'plain', 'utf-8')
            mime_message.attach(message_part)

        unsubscribe_address = CommonConfig.smtp().unsubscribe_address()
        mime_message.add_header(
            'List-Unsubscribe',
            f'mailto:{unsubscribe_address}?subject=Delete%20account%20and%20unsubscribe')

        # HTML gets attached last, thus making it a preferred part as per RFC
        if message.html_body:
            message_part = MIMEText(message.html_body, 'html', 'utf-8')
            mime_message.attach(message_part)

        if test_mode_is_enabled():

            log.info("Test mode is enabled, not actually sending any email.")
            log.debug("Omitted email:\n\n%s" % mime_message.as_string())

        else:

            # Envelope recipients drive actual SMTP delivery, not the "To:"
            # header. Passing the comma-joined header string (as the old code
            # did) makes smtplib treat it as a single address, and Cc/Bcc
            # recipients would never receive the message.
            recipients = (message.to or []) + (message.cc or []) + (message.bcc or [])

            # Connect to SMTP; the context manager guarantees QUIT/close even
            # if sendmail() raises
            with smtplib.SMTP(
                    host=CommonConfig.smtp().hostname(),
                    port=CommonConfig.smtp().port(),
            ) as smtp:

                # Send message
                refused_recipients = smtp.sendmail(message.from_, recipients, mime_message.as_string())
                if len(refused_recipients):
                    log.warning("Unable to send email to the following recipients: %s" % str(refused_recipients))

    except Exception as ex:
        log.warning('Unable to send email to %s: %s' % (message.to, str(ex)))
        return False

    return True
def __init__(self, queue_name: str, rabbitmq_config: Optional[RabbitMQConfig] = None):
    """
    Create job broker object.

    :param queue_name: Queue name.
    """
    queue_name = decode_object_from_bytes_if_needed(queue_name)

    assert queue_name, "Queue name is empty."

    self.__queue_name = queue_name

    config = CommonConfig()

    # Fall back to the shared configuration when no RabbitMQ config was injected
    if not rabbitmq_config:
        rabbitmq_config = config.rabbitmq()

    # AMQP broker URI
    amqp_uri = (
        f"amqp://{rabbitmq_config.username()}:{rabbitmq_config.password()}"
        f"@{rabbitmq_config.hostname()}:{rabbitmq_config.port()}"
        f"/{rabbitmq_config.vhost()}"
    )

    # Task results get stored in PostgreSQL
    db_config = CommonConfig.database()
    backend_uri = (
        f"db+postgresql+psycopg2://{db_config.username()}:{db_config.password()}"
        f"@{db_config.hostname()}:{db_config.port()}"
        f"/{db_config.database_name()}"
    )

    self.__app = celery.Celery(queue_name, broker=amqp_uri, backend=backend_uri)

    self.__app.conf.broker_connection_timeout = rabbitmq_config.timeout()

    # Concurrency is done by us, not Celery itself
    self.__app.conf.worker_concurrency = 1

    self.__app.conf.broker_heartbeat = 0

    # Acknowledge tasks after they get run, not before
    self.__app.conf.task_acks_late = 1

    # https://tech.labs.oliverwyman.com/blog/2015/04/30/making-celery-play-nice-with-rabbitmq-and-bigwig/
    self.__app.conf.broker_transport_options = {'confirm_publish': True}

    self.__app.conf.database_table_names = {
        'task': 'celery_tasks',
        'group': 'celery_groups',
    }

    # Fetch only one job at a time
    self.__app.conf.worker_prefetch_multiplier = 1

    self.__app.conf.worker_max_tasks_per_child = 1000

    # Publisher retries: enabled only when a retry policy is configured
    retries_config = rabbitmq_config.retries()
    if retries_config:
        self.__app.task_publish_retry = True
        self.__app.task_publish_retry_policy = {
            'max_retries': retries_config.max_retries(),
            'interval_start': retries_config.interval_start(),
            'interval_step': retries_config.interval_step(),
            'interval_max': retries_config.interval_max(),
        }
    else:
        self.__app.task_publish_retry = False

    self.__app.conf.task_queues = [
        Queue(
            name=queue_name,
            exchange=Exchange(queue_name),
            routing_key=queue_name,
            queue_arguments={
                'x-max-priority': 3,
                'x-queue-mode': 'lazy',
            },
        ),
    ]

    # noinspection PyUnusedLocal
    def route_task(name, args_, kwargs_, options_, task_=None, **kw_):
        # Every task goes to the queue/exchange named after it
        return {
            'queue': name,
            'exchange': name,
            'routing_key': name,
        }

    self.__app.conf.task_routes = (route_task,)
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.

    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    config = config or CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)
    assert api_url_hostname, f"API URL hostname is not set for URL {api_url}"
    assert api_url_port, f"API URL port is not set for URL {api_url}"

    port_is_open = wait_for_tcp_port_to_open(
        port=api_url_port,
        hostname=api_url_hostname,
        retries=EXTRACTOR_SERVICE_TIMEOUT,
    )
    if not port_is_open:
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            f"Extractor service at {api_url} didn't come up in {EXTRACTOR_SERVICE_TIMEOUT} seconds, exiting..."
        )

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(encode_json({'html': content}))

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for attempt in range(1, EXTRACT_RETRIES + 1):

        if attempt > 1:
            log.warning(f"Retrying #{attempt}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break

        log.error(f"Extraction attempt {attempt} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response
def _default_amazon_s3_downloads_config() -> AmazonS3DownloadsConfig:
    """Return the Amazon S3 downloads configuration from the shared common config."""
    s3_config = CommonConfig.amazon_s3_downloads()
    return s3_config
def _default_download_storage_config() -> DownloadStorageConfig:
    """Return the download storage configuration from the shared common config."""
    storage_config = CommonConfig.download_storage()
    return storage_config