Example #1
class _SitemapWebClient(AbstractWebClient):
    # Some webservers might be generating huge sitemaps on the fly, which is why this timeout is rather long.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__ua',
    ]

    def __init__(self):
        self.__ua = UserAgent()
        self.__ua.set_timeout(self.__HTTP_REQUEST_TIMEOUT)

    def set_max_response_data_length(self,
                                     max_response_data_length: int) -> None:
        self.__ua.set_max_size(max_response_data_length)

    def get(self, url: str) -> AbstractWebClientResponse:
        ua_response = self.__ua.get(url)

        if ua_response.is_success():
            return _SitemapWebClientResponse(ua_response=ua_response)
        else:
            return WebClientErrorResponse(
                message=ua_response.status_line(),
                retryable=ua_response.code() in RETRYABLE_HTTP_STATUS_CODES,
            )
Example #2
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Optional[Response]:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        try:
            db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)
        except McTupleAlreadyMovedError as ex:
            # Some attempts to set the download's row to "fetching" fail with:
            #
            #   "tuple to be locked was already moved to another partition due to concurrent update"
            #
            # If that happens, we assume that some other fetcher instance somehow got to the download first and do
            # nothing
            log.warning(f"Some other fetcher got to download {download['downloads_id']} first: {ex}")
            return None
        except Exception as ex:
            # Raise further on misc. errors
            raise ex

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #3
def _get_feed_url_from_google_podcasts_url(url: str) -> str:
    """
    Given a Google Podcasts URL, try to determine an RSS feed URL from it.

    :param url: Google Podcasts URL, e.g. https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94
                bWwvcG9kY2FzdC54bWw&ved=0CAAQ4aUDahcKEwiot6W5hrnnAhUAAAAAHQAAAAAQAQ&hl=lt
    :return: RSS feed URL that Google Podcasts uses, or original URL if it's not a Google Podcasts URL / feed URL can't
             be determined.
    """

    uri = furl(url)

    if uri.host != 'podcasts.google.com':
        log.debug(f"URL '{url}' is not Google Podcasts URL.")
        return url

    if 'feed' not in uri.args:
        log.error(f"URL '{url}' doesn't have 'feed' parameter.")

    # Remove the rest of the arguments because they might lead to an episode page which doesn't have "data-feed"
    args = list(uri.args.keys())
    for arg in args:
        if arg != 'feed':
            del uri.args[arg]

    url = str(uri.url)

    ua = UserAgent()
    res = ua.get(url)
    if not res.is_success():
        log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
        return url

    html = res.decoded_content()

    # check whether this is an individual episode URL rather than the show's Google Podcasts homepage; the feed URL
    # doesn't appear on individual episode pages, so we need to spider to the show's Google Podcasts homepage to get it
    if '/episode/' in url:
        show_homepage = url.split('/episode/')[0]
        res = ua.get(show_homepage)
        if not res.is_success():
            log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
            return show_homepage
        else:
            html = res.decoded_content()

    # get show's feed URL from its Google Podcasts homepage
    match = re.search(r'c-data id="i3" jsdata=".*(https?://.+?);2', html, flags=re.IGNORECASE)
    if not match:
        log.error(f"Feed URL was not found in Google Podcasts feed page.")
        return url

    feed_url = match.group(1)

    log.info(f"Resolved Google Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
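The sample URL in the docstring suggests that the 'feed' query parameter is simply the feed URL itself in URL-safe base64. The function above deliberately scrapes the page instead of relying on that encoding; purely as an illustration, a minimal stdlib sketch (not part of the original code, and assuming the parameter is always plain base64) could decode it directly:

import base64
from urllib.parse import parse_qs, urlsplit

def feed_url_from_feed_param(url: str) -> str:
    # Pull the base64-encoded 'feed' parameter out of the query string.
    feed_param = parse_qs(urlsplit(url).query)['feed'][0]
    # Re-pad to a multiple of four characters before decoding.
    padded = feed_param + '=' * (-len(feed_param) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8')

# Prints https://www.residentadvisor.net/xml/podcast.xml (the docstring's sample URL, joined back together).
print(feed_url_from_feed_param(
    'https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94bWwvcG9kY2FzdC54bWw'
))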
Example #4
    def _get_content_from_api(self, query: str, start_date: datetime,
                              end_date: datetime) -> str:
        """Fetch the posts data from the CH API and return the HTTP response content."""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        return response.decoded_content()
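The Crimson Hexagon request URL above is assembled with % interpolation; an equivalent construction with urlencode() (a sketch only, with placeholder values) avoids hand-escaping the query string:

from urllib.parse import urlencode

api_key = 'API_KEY'        # placeholder
ch_monitor_id = 12345      # placeholder monitor ID
start_arg, end_arg = '2020-01-01', '2020-01-02'

url = 'https://api.crimsonhexagon.com/api/monitor/posts?' + urlencode({
    'auth': api_key,
    'id': ch_monitor_id,
    'start': start_arg,
    'end': end_arg,
    'extendLimit': 'true',
})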
Example #5
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
    """Wait for Solr to start and collections to become available, if needed."""

    # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

    connected = False

    for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1):

        if retry > 0:
            log.debug(f"Retrying Solr connection ({retry})...")

        try:

            ua = UserAgent()
            ua.set_timeout(1)
            response = ua.get(sample_select_url)

            if not response.is_success():
                raise Exception(f"Unable to connect: {response.status_line()}")

            if not response.decoded_content():
                raise Exception("Response is empty.")

            try:
                result = response.decoded_json()
            except Exception as ex:
                raise Exception(f"Unable to decode response: {ex}")

            if not isinstance(result, dict):
                raise Exception(
                    f"Result is not a dictionary: {response.decoded_content()}"
                )

            if 'response' not in result:
                raise Exception(
                    f"Response doesn't have 'response' key: {response.decoded_content()}"
                )

        except Exception as ex:

            log.warning(f"Solr is down, will retry: {ex}")
            time.sleep(1)

        else:
            log.debug("Solr is up!")
            connected = True
            break

    if not connected:
        raise McSolrRequestDidNotStartInTimeException(
            f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up"
        )
Example #6
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads', object_id=download['downloads_id'], update_hash=download)

        ua = UserAgent()
        url_with_credentials = self._api_request_url_with_signature_from_config(api_url=download['url'])
        request = Request(method='GET', url=url_with_credentials)
        response = ua.request(request)

        return response
Example #7
    def fetch_posts(self, query: dict, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch tweets from archive.org that match the given query for the given day."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        enc_query = urlencode({
            'q': query,
            'date_from': start_arg,
            'date_to': end_arg
        })

        url = "https://searchtweets.archivelab.org/export?" + enc_query

        log.debug("archive.org url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsArchiveTwitterDataException(
                "error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        # sometimes we get null characters, which choke the csv module
        decoded_content = decoded_content.replace('\x00', '')

        meta_tweets = []
        lines = decoded_content.splitlines()[1:]
        for row in csv.reader(lines, delimiter="\t"):
            fields = 'user_name user_screen_name lang text timestamp_ms url'.split(
                ' ')
            meta_tweet = {}
            for i, field in enumerate(fields):
                meta_tweet[field] = row[i] if i < len(row) else ''

            if 'url' not in meta_tweet or meta_tweet['url'] == '':
                log.warning("meta_tweet '%s' does not have a url" % str(row))
                continue

            meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

            meta_tweets.append(meta_tweet)

        add_tweets_to_meta_tweets(meta_tweets)

        return meta_tweets
Example #8
def _get_user_agent() -> UserAgent:
    """Get a properly configured user agent."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    return ua
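The [1, 2, 4, ..., 512] list passed to set_timing() is a power-of-two backoff schedule. If the retry count ever needs to change, the same schedule can be generated instead of typed out (a small sketch, assuming set_timing() accepts any list of integer delays in seconds):

# Ten retry delays doubling from 1 to 512 seconds.
backoff_schedule = [2 ** i for i in range(10)]
assert backoff_schedule == [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]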
Example #9
def get_url_retry_on_client_errors(url: str,
                                   ua: UserAgent,
                                   retry_count: int = 5,
                                   sleep_between_retries: int = 1) -> Response:
    """Fetch URL, retry on client errors (which, as per implementation, might be request timeouts too)."""
    assert retry_count > 0, "Retry count must be positive."

    response = None
    for retry in range(0, retry_count):
        log.info("Fetching URL {}...".format(url))
        response = ua.get(url)
        if response.is_success():
            return response
        else:
            log.warning("Request for URL {} failed: {}".format(url, response.message()))

            if response.error_is_client_side():
                log.info("Retrying URL {} in {} seconds...".format(url, sleep_between_retries))
                time.sleep(sleep_between_retries)

            else:
                log.info("Not retrying for URL {}".format(url))
                return response

    log.info("Giving up on URL {}".format(url))
    return response
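A hedged usage sketch of the retry helper above; the URL and UserAgent settings are illustrative, and the import statements are omitted because the listing doesn't show the module paths:

# Assumes UserAgent and get_url_retry_on_client_errors are importable from their modules.
ua = UserAgent()
ua.set_timeout(30)

response = get_url_retry_on_client_errors(
    url='https://example.com/robots.txt',  # illustrative URL
    ua=ua,
    retry_count=3,
    sleep_between_retries=2,
)
if response.is_success():
    body = response.decoded_content()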
Example #10
def _get_feed_url_from_google_podcasts_url(url: str) -> str:
    """
    Given a Google Podcasts URL, try to determine an RSS feed URL from it.

    :param url: Google Podcasts URL, e.g. https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94
                bWwvcG9kY2FzdC54bWw&ved=0CAAQ4aUDahcKEwiot6W5hrnnAhUAAAAAHQAAAAAQAQ&hl=lt
    :return: RSS feed URL that Google Podcasts uses, or original URL if it's not a Google Podcasts URL / feed URL can't
             be determined.
    """

    uri = furl(url)

    if uri.host != 'podcasts.google.com':
        log.debug(f"URL '{url}' is not Google Podcasts URL.")
        return url

    if 'feed' not in uri.args:
        log.error(f"URL '{url}' doesn't have 'feed' parameter.")

    # Remove the rest of the arguments because they might lead to an episode page which doesn't have "data-feed"
    args = list(uri.args.keys())
    for arg in args:
        if arg != 'feed':
            del uri.args[arg]

    url = str(uri.url)

    ua = UserAgent()
    res = ua.get(url)
    if not res.is_success():
        log.error(
            f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
        return url

    html = res.decoded_content()

    # <div jsname="<...>" jscontroller="<...>" jsaction="<...>" data-feed="<...>">
    match = re.search(r'data-feed="(https?://.+?)"', html, flags=re.IGNORECASE)
    if not match:
        log.error(f"Feed URL was not found in Google Podcasts feed page.")
        return url

    feed_url = match.group(1)

    log.info(f"Resolved Google Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
Example #11
    def __init__(self, ap_config: Optional[APCrawlerConfig] = None):

        self.api_key = None
        self.api_version = '1.1'
        self.retry_limit = 5
        self.ratelimit_info = dict()
        self.ua = UserAgent()
        self.ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256])

        if not ap_config:
            ap_config = APCrawlerConfig()

        self.api_key = ap_config.api_key()

        if not self.api_key:
            raise McAPMissingAPIKey(
                "API key configuration data missing for associated_press.")
Example #12
    def fetch_download(self, db: DatabaseHandler, download: dict) -> Response:
        download = decode_object_from_bytes_if_needed(download)

        url = self._download_url(download=download)
        if not is_http_url(url):
            raise McCrawlerFetcherSoftError(f"URL is not HTTP(s): {url}")

        download['download_time'] = sql_now()
        download['state'] = 'fetching'

        db.update_by_id(table='downloads',
                        object_id=download['downloads_id'],
                        update_hash=download)

        ua = UserAgent()
        response = ua.get_follow_http_html_redirects(url)

        return response
Example #13
    def __init__(self, db: mediawords.db.DatabaseHandler, domain_timeout: typing.Optional[int]=None) -> None:
        """
        Add database handler and domain_timeout to UserAgent object.

        If domain_timeout is not specified, use mediawords.throttles_user_agent_domain_timeout from mediawords.yml.
        If not present in mediawords.yml, use _DEFAULT_DOMAIN_TIMEOUT.
        """
        self.db = db
        self.domain_timeout = domain_timeout

        if self.domain_timeout is None:
            config = mediawords.util.config.get_config()
            if 'throttled_user_agent_domain_timeout' in config['mediawords']:
                self.domain_timeout = int(config['mediawords']['throttled_user_agent_domain_timeout'])
            if self.domain_timeout is None or self.domain_timeout < 1:
                self.domain_timeout = _DEFAULT_DOMAIN_TIMEOUT

        UserAgent.__init__(self)
Example #14
def fetch_meta_tweets_from_ch(query: str, day: str) -> list:
    """Fetch day of tweets from crimson hexagon"""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config[
            'crimson_hexagon']:
        raise McFetchTopicTweetsConfigException(
            "no key in mediawords.yml at //crimson_hexagon/key.")

    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
        % (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " +
                                              response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " +
                                              str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets
Example #15
    def test_api_request(self):
        """Make an API request, see if it succeeds."""

        credentials = self.univision_credentials()

        handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())
        api_request_url = handler._api_request_url_with_signature_from_config(api_url=credentials.url)
        assert api_request_url, 'API request URL is not empty'

        ua = UserAgent()
        ua.set_timeout(30)

        response = ua.get(api_request_url)
        assert response.is_success(), 'API request was successful'

        json_string = response.decoded_content()
        assert json_string, 'JSON response is not empty'

        json = response.decoded_json()
        assert json.get('status', None) == 'success', "JSON response was successful"
        assert 'data' in json, 'JSON response has "data" key'
Example #16
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on ch api using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config[
                'crimson_hexagon']:
            raise McFetchTopicTweetsConfigException(
                "no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " +
                                                  response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " +
                                                  str(data))

        return data
Example #17
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on ch api using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
            raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
               (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

        return data
Example #18
def solr_request(path: str,
                 params: SolrParams = None,
                 content: Union[str, SolrParams] = None,
                 content_type: Optional[str] = None,
                 config: Optional[CommonConfig] = None) -> str:
    """
    Send a request to Solr.

    :param path: Solr path to call, e.g. 'select'.
    :param params: Query parameters to add to the path.
    :param content: String or dictionary content to send via POST request.
    :param content_type: Content-Type for the POST content.
    :param config: (testing) Configuration object
    :return: Raw response content on success, raise exception on error.
    """
    path = decode_object_from_bytes_if_needed(path)
    params = decode_object_from_bytes_if_needed(params)
    content = decode_object_from_bytes_if_needed(content)
    content_type = decode_object_from_bytes_if_needed(content_type)

    if not path:
        raise McSolrRequestInvalidParamsException("Path is unset.")

    if params:
        if not isinstance(params, dict):
            raise McSolrRequestInvalidParamsException(
                f"Params is not a dictionary: {params}")

    if content:
        if not (isinstance(content, str) or isinstance(content, dict)):
            raise McSolrRequestInvalidParamsException(
                f"Content is not a string not a dictionary: {content}")

    if not config:
        config = CommonConfig()

    solr_url = config.solr_url()

    if not params:
        params = {}

    abs_uri = furl(f"{solr_url}/mediacloud/{path}")
    abs_uri = abs_uri.set(params)
    abs_url = str(abs_uri)

    ua = UserAgent()
    ua.set_timeout(__QUERY_HTTP_TIMEOUT)
    ua.set_max_size(None)

    # Remediate CVE-2017-12629
    q_param = str(params.get('q', ''))
    if 'xmlparser' in q_param.lower():
        raise McSolrRequestQueryErrorException(
            "XML queries are not supported.")

    # Solr might still be starting up so wait for it to expose the collections list
    __wait_for_solr_to_start(config=config)

    if content:

        if not content_type:
            fallback_content_type = 'text/plain; charset=utf-8'
            log.warning(
                f"Content-Type is not set; falling back to '{fallback_content_type}'"
            )
            content_type = fallback_content_type

        if isinstance(content, dict):
            content = urlencode(content, doseq=True)

        content_encoded = content.encode('utf-8', errors='replace')

        request = Request(method='POST', url=abs_url)
        request.set_header(name='Content-Type', value=content_type)
        request.set_header(name='Content-Length',
                           value=str(len(content_encoded)))
        request.set_content(content_encoded)

    else:

        request = Request(method='GET', url=abs_url)

    log.debug(f"Sending Solr request: {request}")

    response = ua.request(request)

    if not response.is_success():
        error_message = __solr_error_message_from_response(response=response)
        raise McSolrRequestQueryErrorException(
            f"Error fetching Solr response: {error_message}")

    return response.decoded_content()
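A hedged usage sketch of solr_request(); the import is omitted (the module path isn't part of the listing) and the 'media_id' field name is an assumption rather than something shown in the code above:

# GET request: params end up in the query string of /mediacloud/select.
results_json = solr_request(
    path='select',
    params={'q': 'media_id:1', 'rows': 10, 'wt': 'json'},
)

# POST request: a dict body gets urlencoded; if content_type is not given,
# the function falls back to 'text/plain; charset=utf-8' and logs a warning.
counts_json = solr_request(
    path='select',
    content={'q': 'media_id:1', 'rows': 0},
)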
Example #19
def extract_article_html_from_page_html(content: str, config: Optional[CommonConfig] = None) -> Dict[str, str]:
    """
    Using full page HTML as a parameter, extract part of HTML that contains the news article.
    :param content: Full page HTML.
    :param config: Optional CommonConfig object, useful for testing.
    :return: Dictionary with HTML that contains the news article content ("extracted_html" key) and extractor version
             tag ("extractor_version" key).
    """
    content = decode_object_from_bytes_if_needed(content)

    if not config:
        config = CommonConfig()

    ua = UserAgent()
    api_url = config.extractor_api_url()

    # Wait up to a minute for extraction to finish
    ua.set_timeout(EXTRACT_TIMEOUT)

    # Wait for the extractor's HTTP port to become open as the service might be still starting up somewhere
    api_uri = furl(api_url)
    api_url_hostname = str(api_uri.host)
    api_url_port = int(api_uri.port)
    assert api_url_hostname, f"API URL hostname is not set for URL {api_url}"
    assert api_url_port, f"API URL port is not set for URL {api_url}"

    if not wait_for_tcp_port_to_open(
            port=api_url_port,
            hostname=api_url_hostname,
            retries=EXTRACTOR_SERVICE_TIMEOUT,
    ):
        # Instead of throwing an exception, just crash the whole application
        # because there's no point in continuing on running it whatsoever:
        #
        # 1) If the extractor service didn't come up in a given time, it won't
        #    suddenly show up
        # 2) If it's a test that's doing the extraction, it can't do its job
        #    and should fail one way or another; exit(1) is just one of the
        #    ways how it can fail
        # 3) If it's some production code that needs something to get
        #    extracted, and if we were to throw an exception instead of doing
        #    exit(1), the caller might treat this exception as a failure to
        #    extract this one specific input HTML file, and so it might
        #    mis-extract a bunch of stories that way (making it hard for us to
        #    spot the problem and time-consuming to fix it later (e.g. there
        #    would be a need to manually re-extract a million of stories))
        #
        # A better solution instead of exit(1) might be to throw different
        # kinds of exceptions and handle them appropriately in the caller, but
        # with the Perl-Python codebase that's a bit hard to do.
        fatal_error(
            "Extractor service at {url} didn't come up in {timeout} seconds, exiting...".format(
                url=api_url,
                timeout=EXTRACTOR_SERVICE_TIMEOUT,
            )
        )

    request_json = encode_json({'html': content})

    http_request = Request(method='POST', url=api_url)
    http_request.set_content_type('application/json; charset=utf-8')
    http_request.set_content(request_json)

    # Try extracting multiple times
    #
    # UserAgent's set_timing() would only retry on retryable HTTP status codes and doesn't retry on connection errors by
    # default as such retries might have side effects, e.g. an API getting called multiple times. So, we retry
    # extracting the content a couple of times manually.
    http_response = None
    extraction_succeeded = False
    for retry in range(EXTRACT_RETRIES):

        if retry > 0:
            log.warning(f"Retrying #{retry + 1}...")

        http_response = ua.request(http_request)
        if http_response.is_success():
            extraction_succeeded = True
            break
        else:
            log.error(f"Extraction attempt {retry + 1} failed: {http_response.decoded_content()}")

    if not extraction_succeeded:
        raise McExtractArticleFromPageException(
            f"Extraction of {len(content)} characters; failed; last error: {http_response.decoded_content()}"
        )

    response = http_response.decoded_json()

    assert 'extracted_html' in response, "Response is expected to have 'extracted_html' key."
    assert 'extractor_version' in response, "Response is expected to have 'extractor_version' key."

    return response
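A hedged usage sketch of the extractor call above; the page HTML is a stand-in, and the import is omitted because the module path isn't part of the listing:

page_html = "<html><body><article><p>Hello, world.</p></article></body></html>"

result = extract_article_html_from_page_html(content=page_html)

# Per the docstring, the returned dictionary carries these two keys.
article_html = result['extracted_html']
extractor_version = result['extractor_version']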
Example #20
    def fetch_posts(self, query: str, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch day of tweets from crimson hexagon"""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.warning("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                post = {
                    'post_id': mt['tweet_id'],
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': mt['tweet']['created_at'],
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
Example #21
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(url_link_rel_canonical) > 0:
            log.debug(
                (
                    'Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                    '(original URL: %(url)s): %(url_link_rel_canonical)s'
                ) % {
                    "url_after_redirects": url_after_redirects,
                    "url": url,
                    "url_link_rel_canonical": url_link_rel_canonical,
                }
            )

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {key: urls[key] for key in urls.keys() if not is_homepage_url(urls[key])}

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [x for x in distinct_urls if not re.search(pattern=invalid_url_variant_regex, string=x)]

    return distinct_urls
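A hedged usage sketch of all_url_variants(); 'db' is assumed to be an already connected DatabaseHandler, and the URL is illustrative:

variants = all_url_variants(
    db=db,
    url='http://m.example.com/2020/01/story.html?utm_source=feed#comments',
)

# The list holds the original URL plus its redirected, normalized and
# <link rel="canonical" /> variants, deduplicated.
for variant in variants:
    print(variant)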
Example #22
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but that might happen with some stories so we're just die()ing here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info("Annotating %d characters of text..." % len(text))

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it."
                    % (
                        text_length,
                        self.__TEXT_LENGTH_LIMIT,
                    ))
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                "Unable to create annotator request for text '%s': %s" % (
                    text,
                    str(ex),
                ))

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        url = request.url()
        uri = furl(url)
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {url}"
        assert port, f"API URL port is not set for URL {url}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                "Annotator service at {url} didn't come up in {timeout} seconds, exiting..."
                .format(
                    url=url,
                    timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
                ))

        log.debug("Sending request to %s..." % request.url())
        response = ua.request(request)
        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning("Request failed: %s" % response.decoded_content())

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    "The request timed out, giving up; text length: %d; text: %s"
                    % (
                        len(text),
                        text,
                    ))

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error("User agent error: %s: %s" % (
                    response.status_line(),
                    results_string,
                ))

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error('%s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        'Annotator service was unable to process the download: %s'
                        % results_string)

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error('Unknown HTTP response: %s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                "Annotator returned nothing for text: %s" % text)

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error("Unable to parse JSON response: %s\nJSON string: %s" %
                        (
                            str(ex),
                            results_string,
                        ))
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                "Unable to determine whether response is valid: %s\nJSON string: %s"
                % (str(ex), results_string))
        if not response_is_valid:
            fatal_error("Annotator response is invalid for JSON string: %s" %
                        results_string)

        log.info("Done annotating %d characters of text." % len(text))

        return results
Example #23
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass if
    something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values might be either strings of lists of strings if
                   multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Parameters should be a dictionary.")

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):

        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # some stories consistently return empty content, so just raise a soft error and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = f"API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- break from the retry loop
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")
    return data
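A hedged usage sketch of _api_request(); the empty node and the 'engagement' field illustrate a Graph API URL-engagement lookup and are assumptions, not something taken from the listing, and the configuration wiring is likewise assumed:

config = FacebookConfig()

# Illustrative call only -- node and fields are assumptions about how a URL's
# share counts would be looked up through the Graph API.
data = _api_request(
    node='',
    params={'id': 'https://example.com/story.html', 'fields': 'engagement'},
    config=config,
)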
Example #24
def all_url_variants(db: DatabaseHandler, url: str) -> List[str]:
    """Given the URL, return all URL variants that we can think of:

    1) Normal URL (the one passed as a parameter)
    2) URL after redirects (i.e., fetch the URL, see if it gets redirected somewhere)
    3) Canonical URL (after removing #fragments, session IDs, tracking parameters, etc.)
    4) Canonical URL after redirects (do the redirect check first, then strip the tracking parameters from the URL)
    5) URL from <link rel="canonical" /> (if any)
    6) Any alternative URLs from topic_merged_stories or topic_links"""

    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McAllURLVariantsException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        log.warning("URL %s is not a valid HTTP URL." % url)
        return [
            url,
        ]

    # Get URL after HTTP / HTML redirects
    ua = UserAgent()
    response = ua.get_follow_http_html_redirects(url)
    url_after_redirects = response.request().url()
    data_after_redirects = response.decoded_content()

    urls = {

        # Normal URL (don't touch anything)
        'normal': url,

        # Normal URL after redirects
        'after_redirects': url_after_redirects,

        # Canonical URL
        'normalized': normalize_url(url),

        # Canonical URL after redirects
        'after_redirects_normalized': normalize_url(url_after_redirects),
    }

    # If <link rel="canonical" /> is present, try that one too
    if data_after_redirects is not None:
        url_link_rel_canonical = link_canonical_url_from_html(
            html=data_after_redirects, base_url=url_after_redirects)
        if url_link_rel_canonical is not None and len(
                url_link_rel_canonical) > 0:
            log.debug(
                ('Found <link rel="canonical" /> for URL %(url_after_redirects)s '
                 '(original URL: %(url)s): %(url_link_rel_canonical)s') % {
                     "url_after_redirects": url_after_redirects,
                     "url": url,
                     "url_link_rel_canonical": url_link_rel_canonical,
                 })

            urls['after_redirects_canonical'] = url_link_rel_canonical

    # If URL gets redirected to the homepage (e.g.
    # http://m.wired.com/threatlevel/2011/12/sopa-watered-down-amendment/ leads
    # to http://www.wired.com/), don't use those redirects
    if not is_homepage_url(url):
        urls = {
            key: urls[key]
            for key in urls.keys() if not is_homepage_url(urls[key])
        }

    distinct_urls = list(set(urls.values()))

    topic_urls = __get_topic_url_variants(db=db, urls=distinct_urls)

    distinct_urls = distinct_urls + topic_urls
    distinct_urls = list(set(distinct_urls))

    # Remove URLs that can't be variants of the initial URL
    for invalid_url_variant_regex in __INVALID_URL_VARIANT_REGEXES:
        distinct_urls = [
            x for x in distinct_urls
            if not re.search(pattern=invalid_url_variant_regex, string=x)
        ]

    return distinct_urls
Example #25
def sitemap_useragent() -> UserAgent:
    ua = UserAgent()
    ua.set_max_size(__MAX_SITEMAP_SIZE)
    return ua
Example #26
class AssociatedPressAPI:
    """Object used to interface with the Associated Press API and to return data from
    various API endpoints.
    """
    def __init__(self, ap_config: Optional[APCrawlerConfig] = None):

        self.api_key = None
        self.api_version = '1.1'
        self.retry_limit = 5
        self.ratelimit_info = dict()
        self.ua = UserAgent()
        self.ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256])

        if not ap_config:
            ap_config = APCrawlerConfig()

        self.api_key = ap_config.api_key()

        if not self.api_key:
            raise McAPMissingAPIKey(
                "API key configuration data missing for associated_press.")

    def feed(self, **kwargs) -> dict:
        """Feed API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Search-and-Feed/#feed)

        METHOD: GET

        ENDPOINT PARAMETERS:

        q: Query Expression

        include, exclude: Parameters used to customize the fields returned in the response.

        text_links: Specifies the format of the text renditions (stories, captions, scripts and shotlists) to return in
        the response.  For stories, the valid value is nitf (NITF) or anpa (ANPA 1312). For captions, scripts and
        shotlists, the valid value is nitf (NITF).  The value of all returns all available formats (this is the
        default).

        page_size: The maximum number of items to return per page. The default is 10 items with a maximum of 100 per
        page.

        versions: Specifies whether to return all available versions of the content item and all ANPA filings or only
        the latest (the same story in the ANPA format may be filed multiple times; for example, with a different
        category code).

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.

        """
        url = 'https://api.ap.org/media/v/content/feed'
        api_method = 'feed'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        feed_data = self._make_request(url, params)
        return json.loads(feed_data)['data']

    def search(self, **kwargs) -> dict:
        """Feed API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Search-and-Feed/#search)

        METHOD: GET

        ENDPOINT PARAMETERS:

        q: Query Expression

        include, exclude: Parameters used to customize the fields returned in the response.

        text_links: Specifies the format of the text renditions (stories, captions, scripts and shotlists) to return in
        the response.  For stories, the valid value is nitf (NITF) or anpa (ANPA 1312). For captions, scripts and
        shotlists, the valid value is nitf (NITF).  The value of all returns all available formats (this is the
        default).

        sort: The sort order of the returned results. By default, the results are sorted by relevance (meta.score) - the
        most relevant items first, regardless of the time period. Valid options are:

            versioncreated: desc. The latest items first (reverse chronological order).  versioncreated: asc. The oldest
            items first (chronological order).

        page_size: The maximum number of items to return per page. The default is 10 items with a maximum of 100 per
        page.

        page: The requested page number within the set of search results. Page numbers start at 1.

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.

        """
        url = 'https://api.ap.org/media/v/content/search'
        api_method = 'search'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        search_data = self._make_request(url, params)
        return json.loads(search_data)['data']

    def content(self, path, **kwargs) -> Optional[str]:
        """Content API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Content-Item/)
        Example: https://api.ap.org/media/v[{version}]/content/{item_id}?apikey={apikey}[{optional_parameters}]

        METHOD: GET

        ENDPOINT PARAMETERS:

        qt: Unknown. They are present in the feed response but don't appear to be in the documentation

        et: Unknown. Same as above.

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.
        """

        url = 'https://api.ap.org/media/v/content/{}'.format(path)
        api_method = 'item'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        content_data = self._make_request(url, params)
        return content_data

    def _make_request(self, url: str, params: dict = None) -> Optional[str]:
        """Internal method for making API requests"""

        retries = self.retry_limit

        # Begin making request and retry up to retry limit
        while retries:

            log.debug("Making request to {} with parameters {}".format(
                url, params))

            try:
                response = requests.get(url, params=params, timeout=30)
            except Exception as e:
                log.warning(
                    "Encountered an exception while making request to {}. Exception info: {}"
                    .format(url, e))
            else:
                if response.status_code == 200:
                    log.debug("Successfully retrieved {}".format(url))
                    self._update_ratelimit_info(response.headers)
                    return response.content
                elif response.status_code == 403:
                    log.warning(
                        "Received a 403 (forbidden) response for {} -- skipping."
                        .format(url))
                    return None
                else:
                    print(response.content)
                    log.warning(
                        "Received HTTP status code {} when fetching {}".format(
                            response.status_code, url))

            retries -= 1

            if retries == 0:
                raise McAPFetchError(
                    "Could not fetch {} after {} attempts. Giving up.".format(
                        url, self.retry_limit))

            wait_time = (self.retry_limit - retries)**2
            log.info(
                "Exponentially backing off for {} seconds.".format(wait_time))
            time.sleep(wait_time)

        return None

    def _check_ratelimit(self, api_method: str) -> None:
        """Check the endpoint rate limit before making an API call to that endpoint and to wait if necessary"""
        if api_method in self.ratelimit_info:
            current_window_remaining = float(
                self.ratelimit_info[api_method].get('current_window_remaining',
                                                    None))
            next_window = float(self.ratelimit_info[api_method].get(
                'next_window', None))
            if current_window_remaining < 1 and next_window > time.time():
                wait_time = math.ceil(next_window - time.time())
                if wait_time > 0:
                    log.info(
                        'Rate limit for {}. Sleeping {} before next API call'.
                        format(api_method, wait_time))
                    time.sleep(wait_time)

    def _update_ratelimit_info(self, headers):
        """Internal method to update rate limit information for an API endpoint"""
        api_method = headers['x-mediaapi-Q-name']
        calls_used, window_limit = [
            int(x) for x in headers['x-mediaapi-Q-used'].split('/')
        ]

        if api_method not in self.ratelimit_info:
            self.ratelimit_info[api_method] = dict()

        self.ratelimit_info[api_method]['next_window'] = math.ceil(
            int(headers['x-mediaapi-Q-secondsLeft']) + time.time())
        self.ratelimit_info[api_method]['current_window_limit'] = window_limit
        self.ratelimit_info[api_method][
            'current_window_remaining'] = window_limit - calls_used
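
A minimal standalone sketch of the rate-limit bookkeeping above, using a made-up set of response headers purely for illustration (the header names are the ones read by _update_ratelimit_info(); the values are invented):

import math
import time

# Hypothetical headers, shaped like the ones _update_ratelimit_info() parses.
headers = {
    'x-mediaapi-Q-name': 'item',
    'x-mediaapi-Q-used': '10/10',      # calls used / calls allowed in the current window
    'x-mediaapi-Q-secondsLeft': '42',  # seconds until the window resets
}

calls_used, window_limit = [int(x) for x in headers['x-mediaapi-Q-used'].split('/')]
next_window = math.ceil(int(headers['x-mediaapi-Q-secondsLeft']) + time.time())
current_window_remaining = window_limit - calls_used

# Same wait rule as _check_ratelimit(): if the window is exhausted and hasn't reset yet, sleep it out.
if current_window_remaining < 1 and next_window > time.time():
    wait_time = math.ceil(next_window - time.time())
    print(f"Rate limit for 'item': would sleep {wait_time} seconds before the next call")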
Beispiel #27
0
    def __init__(self):
        self.__ua = UserAgent()
        self.__ua.set_timeout(self.__HTTP_REQUEST_TIMEOUT)
Beispiel #28
0
def _get_feed_url_from_itunes_podcasts_url(url: str) -> str:
    """
    Given an iTunes Podcasts URL, try to determine an RSS feed URL from it.

    :param url: iTunes Podcasts URL, e.g. https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008
    :return: RSS feed URL that iTunes Podcasts uses, or the original URL if it's not an iTunes Podcasts URL or the feed
             URL can't be determined.
    """

    uri = furl(url)

    if uri.host not in {'podcasts.apple.com', 'itunes.apple.com'}:
        log.debug(f"URL '{url}' is not iTunes Podcasts URL.")
        return url

    # https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008
    itunes_id = None
    for segment in reversed(uri.path.segments):
        match = re.match(r'^id(\d+?)$', segment)
        if match:
            itunes_id = match.group(1)
            break

    if not itunes_id:
        log.error(f"Unable to determine iTunes ID from URL '{url}'")
        return url

    ua = UserAgent()
    res = ua.get(
        f"https://itunes.apple.com/lookup?id={itunes_id}&entity=podcast")
    if not res.is_success():
        log.error(
            f"Unable to fetch iTunes Podcasts feed URL: {res.status_line()}")
        return url

    try:
        res_dict = res.decoded_json()
        if not isinstance(res_dict, dict):
            raise Exception("Result is not a dictionary")
    except Exception as ex:
        log.error(f"Unable to decode iTunes Podcasts feed JSON: {ex}")
        return url

    if res_dict.get('resultCount', None) != 1:
        log.error("Result count is not 1")
        return url

    results = res_dict.get('results', None)
    if not results:
        log.error("'results' not found in JSON response")
        return url

    if len(results) != 1:
        log.error("'results' is expected to have a single list item")
        return url

    feed_url = results[0].get('feedUrl', None)
    if not feed_url:
        log.error("'feedUrl' was not found in first row of 'results'")
        return url

    log.info(f"Resolved iTunes Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
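
A quick sketch of the ID-extraction step above, run against the example URL from the docstring; it uses the same furl/regex approach as the function, just outside the module context:

import re
from furl import furl

url = 'https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008'

# Walk the path segments from the end and take the first one shaped like "id<digits>".
itunes_id = None
for segment in reversed(furl(url).path.segments):
    match = re.match(r'^id(\d+?)$', segment)
    if match:
        itunes_id = match.group(1)
        break

print(itunes_id)  # '1364954186'
# The ID is then looked up via https://itunes.apple.com/lookup?id=<ID>&entity=podcast,
# and the show's RSS feed is the 'feedUrl' field of the single 'results' entry.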
Beispiel #29
0
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but empty text for a story almost certainly means something went wrong
            # upstream, so raise early here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info(f"Annotating {len(text)} characters of text...")

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    f"Text length ({text_length}) has exceeded the request text length limit "
                    f"({self.__TEXT_LENGTH_LIMIT}), so it will be truncated.")
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                f"Unable to create annotator request for text '{text}': {ex}")

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {request.url()}"
        assert port, f"API URL port is not set for URL {request.url()}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
                f"exiting...")

        log.debug(f"Sending request to {request.url()}...")

        # Try requesting a few times because sometimes it throws a connection error, e.g.:
        #
        #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
        #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
        #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
        #   'Connection reset by peer'))
        #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
        #   ConnectionResetError(104, 'Connection reset by peer'))
        response = None
        retries = 60
        sleep_between_retries = 1
        for retry in range(1, retries + 1):

            if retry > 1:
                log.warning(f"Retrying ({retry} / {retries})...")

            response = ua.request(request)

            if response.is_success():
                break
            else:
                if response.error_is_client_side():
                    log.error(
                        f"Request failed on the client side: {response.decoded_content()}"
                    )
                    time.sleep(sleep_between_retries)
                else:
                    break

        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we're to blame for making a malformed request, or whether it's an extraction
            # error
            log.warning(f"Request failed: {response.decoded_content()}")

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    f"The request timed out, giving up; text length: {len(text)}; text: {text}"
                )

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error(
                    f"User agent error: {response.status_line()}: {results_string}"
                )

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Request was not a POST, or the POST body was empty
                    fatal_error(f'{response.status_line()}: {results_string}')

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        f'Annotator service was unable to process the download: {results_string}'
                    )

                else:
                    # Shut down the extractor on responses we haven't accounted for
                    fatal_error(
                        f'Unknown HTTP response: {response.status_line()}: {results_string}'
                    )

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                f"Annotator returned nothing for text: {text}")

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably the remote service that's broken, which is why we call
            # fatal_error() here
            fatal_error(
                f"Unable to parse JSON response: {ex}\nJSON string: {results_string}"
            )
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}"
            )
        if not response_is_valid:
            fatal_error(
                f"Annotator response is invalid for JSON string: {results_string}"
            )

        log.info(f"Done annotating {len(text)} characters of text.")

        return results
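
The status-code handling at the end of the method amounts to a small decision table. Below is a minimal standalone sketch of that routing, with plain return values standing in for raising McJSONAnnotationFetcherException versus calling fatal_error(); the function name and return strings are made up for illustration:

from http import HTTPStatus

def route_annotator_error(http_status_code: int, error_is_client_side: bool) -> str:
    """Mirror the error routing above: 'raise' maps to McJSONAnnotationFetcherException, 'fatal' to fatal_error()."""
    if http_status_code == HTTPStatus.REQUEST_TIMEOUT.value:
        return 'raise'   # the annotator got stuck on our input; don't retry
    if error_is_client_side:
        return 'fatal'   # the request likely never reached the service at all
    if http_status_code in (HTTPStatus.METHOD_NOT_ALLOWED.value, HTTPStatus.BAD_REQUEST.value):
        return 'fatal'   # not a POST / empty POST, i.e. a malformed request on our side
    if http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
        return 'raise'   # processing error; gets caught and logged into the database
    return 'fatal'       # anything else is unexpected

print(route_annotator_error(HTTPStatus.INTERNAL_SERVER_ERROR.value, error_is_client_side=False))  # 'raise'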