コード例 #1
0
ファイル: __init__.py プロジェクト: robpotter89/backend
    def __init__(self, user_agent_config: UserAgentConfig = None):
        """Initialize the user agent, falling back to the common configuration.

        :param user_agent_config: Optional user agent configuration; when unset,
                                  CommonConfig.user_agent() is used instead.
        """
        self._user_agent_config = user_agent_config or CommonConfig.user_agent()

        # "requests" session to carry the cookie pool around
        self.__session = requests.Session()

        default_headers = {
            'From': self.__OWNER,
            'User-Agent': self.__USER_AGENT,
            'Accept-Charset': 'utf-8',

            # MC_REWRITE_TO_PYTHON:
            #
            # Keep-alive (and requests' fancy connection pooling) is disabled because the
            # rudimentary HTTP server used for Perl unit tests doesn't support it (and maybe
            # we don't want keep-alive anyway)
            'Connection': 'close',
        }
        self.__session.headers.update(default_headers)

        self.set_max_redirect(self.__DEFAULT_MAX_REDIRECT)

        self.__timeout = None
        self.set_timeout(self.__DEFAULT_TIMEOUT)

        self.__max_size = None
        self.set_max_size(self.__DEFAULT_MAX_SIZE)

        # Retries are off by default; a client that wants them should call
        # timing() itself, e.g. set it to '1,2,4,8'
        self.__timing = None
        self.set_timing(None)
コード例 #2
0
ファイル: mail.py プロジェクト: vishalbelsare/mediacloud
    def __init__(self,
                 to: Union[str, List[str]],
                 subject: str,
                 text_body: str,
                 html_body: Optional[str] = None,
                 cc: Optional[Union[str, List[str]]] = None,
                 bcc: Optional[Union[str, List[str]]] = None):
        """Email message constructor."""

        def _as_recipient_list(value):
            # Decode bytes if needed, then wrap a single address (str) into a
            # one-element list; lists and None pass through unchanged.
            value = decode_object_from_bytes_if_needed(value)
            if isinstance(value, str):
                value = [value]
            return value

        self.from_ = CommonConfig.email_from_address()

        self.subject = decode_object_from_bytes_if_needed(subject)
        self.text_body = decode_object_from_bytes_if_needed(text_body)
        self.html_body = decode_object_from_bytes_if_needed(html_body)

        self.to = _as_recipient_list(to)
        self.cc = _as_recipient_list(cc)
        self.bcc = _as_recipient_list(bcc)
コード例 #3
0
def connect_to_db() -> DatabaseHandler:
    """Connect to PostgreSQL, retrying as configured.

    Tries up to retries_config.max_attempts() times, sleeping between attempts.
    When all attempts fail, fatal_error() is called (deliberately, instead of
    raising -- see the comment below), so on total failure this function never
    returns to the caller.

    :return: Connected DatabaseHandler instance.
    """

    db_config = CommonConfig.database()
    retries_config = db_config.retries()

    # Message previously said "can't be negative" although zero is rejected too.
    assert retries_config.max_attempts() > 0, "max_attempts() must be positive."

    db = None

    for attempt in range(1, retries_config.max_attempts() + 1):

        try:

            log.debug("Connecting to PostgreSQL...")

            # Note: the constructor either returns a valid instance or raises;
            # a separate "is None" check here would be unreachable.
            db = DatabaseHandler(
                host=db_config.hostname(),
                port=db_config.port(),
                username=db_config.username(),
                password=db_config.password(),
                database=db_config.database_name(),
            )

            # Return the database handler upon successful connection
            break

        except Exception as ex:

            error_message = "Unable to connect to %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': db_config.username(),
                'host': db_config.hostname(),
                'port': db_config.port(),
                'database': db_config.database_name(),
                'exception': str(ex),
            }

            log.error(error_message)

            if attempt < retries_config.max_attempts():
                log.info(
                    f"Will retry for #{attempt} time in {retries_config.sleep_between_attempts()} seconds..."
                )
                time.sleep(retries_config.sleep_between_attempts())

            else:
                log.info("Out of retries, giving up and exiting...")

                # Don't throw any exceptions because they might be caught by
                # the try-catch block, and so the caller will just assume that
                # there was something wrong with the input data and proceed
                # with processing next item in the job queue (e.g. the next
                # story). Instead, just quit and wait for someone to restart
                # the whole app that requires database access.
                fatal_error(error_message)

    return db
コード例 #4
0
    def __init__(self, queue_name: str):
        """Return job broker (Celery app object) prepared for the specific queue name."""

        assert queue_name, "Queue name is empty."
        self.__queue_name = queue_name

        config = CommonConfig()

        rabbitmq_config = config.rabbitmq()
        broker_uri = (
            f"amqp://{rabbitmq_config.username()}:{rabbitmq_config.password()}@"
            f"{rabbitmq_config.hostname()}:{rabbitmq_config.port()}/{rabbitmq_config.vhost()}"
        )

        db_config = CommonConfig.database()
        result_backend_url = (
            f"db+postgresql+psycopg2://{db_config.username()}:{db_config.password()}@"
            f"{db_config.hostname()}:{db_config.port()}/{db_config.database_name()}"
        )

        self.__app = celery.Celery(queue_name, broker=broker_uri, backend=result_backend_url)

        self.__app.conf.broker_connection_timeout = rabbitmq_config.timeout()

        # Concurrency is managed by us, not by Celery itself
        self.__app.conf.worker_concurrency = 1

        self.__app.conf.broker_heartbeat = 0

        # https://tech.labs.oliverwyman.com/blog/2015/04/30/making-celery-play-nice-with-rabbitmq-and-bigwig/
        self.__app.conf.broker_transport_options = {'confirm_publish': True}

        self.__app.conf.database_table_names = {
            'task': 'celery_tasks',
            'group': 'celery_groups',
        }

        # Only one job gets fetched at a time
        self.__app.conf.worker_prefetch_multiplier = 1

        self.__app.conf.worker_max_tasks_per_child = 1000

        job_queue = Queue(
            name=queue_name,
            exchange=Exchange(queue_name),
            routing_key=queue_name,
            queue_arguments={
                'x-max-priority': 3,
                'x-queue-mode': 'lazy',
            },
        )
        self.__app.conf.task_queues = [job_queue]

        # noinspection PyUnusedLocal
        def __route_to_own_queue(name, args_, kwargs_, options_, task_=None, **kw_):
            # Every task is routed to the queue / exchange named after the task itself
            return {
                'queue': name,
                'exchange': name,
                'routing_key': name,
            }

        self.__app.conf.task_routes = (__route_to_own_queue,)
コード例 #5
0
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params and not isinstance(params, dict):
        raise McSolrRequestInvalidParamsException(
            f"Params is not a dictionary: {params}")

    # BUGFIX: error message used to read "not a string not a dictionary"
    if content and not isinstance(content, (str, dict)):
        raise McSolrRequestInvalidParamsException(
            f"Content is not a string nor a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629 (Solr XML external entity / RCE via "xmlparser")
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException(
            "XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(
                f"Content-Type is not set; falling back to '{fallback_content_type}'"
            )
            content_type = fallback_content_type

        # Dictionary content is sent form-encoded
        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length',
                           value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(
            f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
コード例 #6
0
ファイル: mail.py プロジェクト: vishalbelsare/mediacloud
def send_email(message: Message) -> bool:
    """Send email to someone.

    Returns True on success, False on failure.

    Raises on programming error."""

    if message is None:
        raise McSendEmailException('Message is None.')

    if not message.from_:
        raise McSendEmailException("'from' is unset.")
    if message.to and (not isinstance(message.to, list)):
        raise McSendEmailException("'to' is not a list.")
    if message.cc and (not isinstance(message.cc, list)):
        raise McSendEmailException("'cc' is not a list.")
    if message.bcc and (not isinstance(message.bcc, list)):
        raise McSendEmailException("'bcc' is not a list.")

    # BUGFIX: 'to' / 'cc' / 'bcc' may be None here; len(None) used to raise
    # TypeError instead of the intended exception.
    if not (message.to or message.cc or message.bcc):
        raise McSendEmailException("No one to send the email to.")

    if not message.subject:
        raise McSendEmailException("'subject' is unset.")

    if not (message.text_body or message.html_body):
        raise McSendEmailException("No message body.")

    try:

        # Create message
        mime_message = MIMEMultipart('alternative')
        mime_message['Subject'] = '[Media Cloud] %s' % message.subject
        mime_message['From'] = message.from_
        if message.to:
            mime_message['To'] = ', '.join(message.to)
        else:
            mime_message['To'] = 'undisclosed recipients'
        if message.cc:
            mime_message['Cc'] = ', '.join(message.cc)

        # BUGFIX: deliberately do NOT add a "Bcc:" header -- Bcc recipients go
        # into the SMTP envelope only; a header would leak their addresses to
        # all other recipients.

        if message.text_body:
            message_part = MIMEText(message.text_body, 'plain', 'utf-8')
            mime_message.attach(message_part)

        unsubscribe_address = CommonConfig.smtp().unsubscribe_address()

        mime_message.add_header(
            'List-Unsubscribe',
            f'mailto:{unsubscribe_address}?subject=Delete%20account%20and%20unsubscribe')

        # HTML gets attached last, thus making it a preferred part as per RFC
        if message.html_body:
            message_part = MIMEText(message.html_body, 'html', 'utf-8')
            mime_message.attach(message_part)

        # BUGFIX: delivery is driven by this envelope list; previously only the
        # "To" *header string* was passed to sendmail(), so Cc / Bcc recipients
        # never received the email and multiple "To" addresses were treated as
        # a single malformed address.
        envelope_recipients = (message.to or []) + (message.cc or []) + (message.bcc or [])

        if test_mode_is_enabled():
            log.info("Test mode is enabled, not actually sending any email.")
            log.debug("Omitted email:\n\n%s" % mime_message.as_string())

        else:

            # Connect to SMTP
            smtp = smtplib.SMTP(
                host=CommonConfig.smtp().hostname(),
                port=CommonConfig.smtp().port(),
            )

            try:
                # Send message
                refused_recipients = smtp.sendmail(mime_message['From'], envelope_recipients,
                                                   mime_message.as_string())
                if refused_recipients:
                    log.warning("Unable to send email to the following recipients: %s" % str(refused_recipients))
            finally:
                # Always close the connection, even if sendmail() raised
                smtp.quit()

    except Exception as ex:
        log.warning('Unable to send email to %s: %s' % (message.to, str(ex)))
        return False

    return True
コード例 #7
0
    def __init__(self,
                 queue_name: str,
                 rabbitmq_config: Optional[RabbitMQConfig] = None):
        """
        Create job broker object.

        :param queue_name: Queue name.
        :param rabbitmq_config: Optional RabbitMQ configuration; when unset,
                                CommonConfig's RabbitMQ configuration is used.
        """

        queue_name = decode_object_from_bytes_if_needed(queue_name)

        assert queue_name, "Queue name is empty."

        self.__queue_name = queue_name

        config = CommonConfig()

        if not rabbitmq_config:
            rabbitmq_config = config.rabbitmq()

        # AMQP broker URI, built from the RabbitMQ configuration
        broker_uri = 'amqp://{username}:{password}@{hostname}:{port}/{vhost}'.format(
            username=rabbitmq_config.username(),
            password=rabbitmq_config.password(),
            hostname=rabbitmq_config.hostname(),
            port=rabbitmq_config.port(),
            vhost=rabbitmq_config.vhost(),
        )

        # Task results get stored in PostgreSQL (SQLAlchemy result backend)
        db_config = CommonConfig.database()
        result_backend_url = 'db+postgresql+psycopg2://{username}:{password}@{hostname}:{port}/{database}'.format(
            username=db_config.username(),
            password=db_config.password(),
            hostname=db_config.hostname(),
            port=db_config.port(),
            database=db_config.database_name(),
        )

        self.__app = celery.Celery(queue_name,
                                   broker=broker_uri,
                                   backend=result_backend_url)

        self.__app.conf.broker_connection_timeout = rabbitmq_config.timeout()

        # Concurrency is done by us, not Celery itself
        self.__app.conf.worker_concurrency = 1

        self.__app.conf.broker_heartbeat = 0

        # Acknowledge tasks after they get run, not before
        self.__app.conf.task_acks_late = 1

        # https://tech.labs.oliverwyman.com/blog/2015/04/30/making-celery-play-nice-with-rabbitmq-and-bigwig/
        self.__app.conf.broker_transport_options = {'confirm_publish': True}

        self.__app.conf.database_table_names = {
            'task': 'celery_tasks',
            'group': 'celery_groups',
        }

        # Fetch only one job at a time
        self.__app.conf.worker_prefetch_multiplier = 1

        self.__app.conf.worker_max_tasks_per_child = 1000

        # Publish-retry policy, taken from the RabbitMQ configuration when present
        retries_config = rabbitmq_config.retries()
        if retries_config:
            # NOTE(review): these are set on the Celery app object itself, not on
            # self.__app.conf like every other setting in this method -- verify that
            # Celery actually picks up task_publish_retry / task_publish_retry_policy
            # from the app object; if not, these lines are silent no-ops.
            self.__app.task_publish_retry = True
            self.__app.task_publish_retry_policy = {
                'max_retries': retries_config.max_retries(),
                'interval_start': retries_config.interval_start(),
                'interval_step': retries_config.interval_step(),
                'interval_max': retries_config.interval_max(),
            }

        else:
            self.__app.task_publish_retry = False

        # Single lazy, priority-enabled queue named after the broker's queue name
        queue = Queue(
            name=queue_name,
            exchange=Exchange(queue_name),
            routing_key=queue_name,
            queue_arguments={
                'x-max-priority': 3,
                'x-queue-mode': 'lazy',
            },
        )
        self.__app.conf.task_queues = [queue]

        # Route every task to the queue / exchange named after the task itself
        # noinspection PyUnusedLocal
        def __route_task(name, args_, kwargs_, options_, task_=None, **kw_):
            return {
                'queue': name,
                'exchange': name,
                'routing_key': name,
            }

        self.__app.conf.task_routes = (__route_task, )
コード例 #8
0
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.
    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    if config is None:
        config = CommonConfig()

    user_agent = UserAgent()
    api_url = config.extractor_api_url()

    # Give the extraction itself up to a minute to finish
    user_agent.set_timeout(EXTRACT_TIMEOUT)

    # The extractor service might still be starting up somewhere, so wait for its
    # HTTP port to become open first
    parsed_api_url = furl(api_url)
    extractor_host = str(parsed_api_url.host)
    extractor_port = int(parsed_api_url.port)
    assert extractor_host, f"API URL hostname is not set for URL {api_url}"
    assert extractor_port, f"API URL port is not set for URL {api_url}"

    port_is_open = wait_for_tcp_port_to_open(
        port=extractor_port,
        hostname=extractor_host,
        retries=EXTRACTOR_SERVICE_TIMEOUT,
    )
    if not port_is_open:
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            f"Extractor service at {api_url} didn't come up in {EXTRACTOR_SERVICE_TIMEOUT} seconds, exiting..."
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    for retry in range(EXTRACT_RETRIES):

        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = user_agent.request(http_request)
        if http_response.is_success():
            # Got a usable response; stop retrying
            break

        log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    else:
        # Loop finished without a successful response
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response
コード例 #9
0
def _default_amazon_s3_downloads_config() -> AmazonS3DownloadsConfig:
    """Return the Amazon S3 downloads configuration from the common config."""
    s3_config = CommonConfig.amazon_s3_downloads()
    return s3_config
コード例 #10
0
def _default_download_storage_config() -> DownloadStorageConfig:
    """Return the download storage configuration from the common config."""
    storage_config = CommonConfig.download_storage()
    return storage_config