Example #1
def _get_feed_url_from_google_podcasts_url(url: str) -> str:
    """
    Given a Google Podcasts URL, try to determine an RSS feed URL from it.

    :param url: Google Podcasts URL, e.g. https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94
                bWwvcG9kY2FzdC54bWw&ved=0CAAQ4aUDahcKEwiot6W5hrnnAhUAAAAAHQAAAAAQAQ&hl=lt
    :return: RSS feed URL that Google Podcasts uses, or the original URL if it's not a Google Podcasts URL / the feed
             URL can't be determined.
    """

    uri = furl(url)

    if uri.host != 'podcasts.google.com':
        log.debug(f"URL '{url}' is not Google Podcasts URL.")
        return url

    if 'feed' not in uri.args:
        log.error(f"URL '{url}' doesn't have a 'feed' parameter.")
        return url

    # Remove the rest of the arguments because they might lead to an episode page which doesn't have "data-feed"
    args = list(uri.args.keys())
    for arg in args:
        if arg != 'feed':
            del uri.args[arg]

    url = str(uri.url)

    ua = UserAgent()
    res = ua.get(url)
    if not res.is_success():
        log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
        return url

    html = res.decoded_content()

    # check whether this is an individual episode URL rather than the show's Google Podcasts homepage; the feed URL
    # doesn't appear on individual episode pages, so we need to spider to the show's Google Podcasts homepage to get it
    if '/episode/' in url:
        show_homepage = url.split('/episode/')[0]
        res = ua.get(show_homepage)
        if not res.is_success():
            log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
            return show_homepage
        else:
            html = res.decoded_content()

    # get show's feed URL from its Google Podcasts homepage
    match = re.search(r'c-data id="i3" jsdata=".*(https?://.+?);2', html, flags=re.IGNORECASE)
    if not match:
        log.error(f"Feed URL was not found in Google Podcasts feed page.")
        return url

    feed_url = match.group(1)

    log.info(f"Resolved Google Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
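
A side note on Example #1: the 'feed' query parameter in these URLs appears to be the feed URL itself, base64-encoded (the docstring's sample value decodes to a residentadvisor.net feed URL), so a minimal sketch can often recover the feed without any HTTP fetch. The helper below is illustrative only, not part of the original code:

import base64
from urllib.parse import urlparse, parse_qs

def decode_feed_param(url: str) -> str:
    # parse_qs returns a list of values per key; take the first 'feed' value.
    feed_b64 = parse_qs(urlparse(url).query)['feed'][0]
    # Normalize URL-safe base64 and pad to a multiple of 4 before decoding.
    feed_b64 = feed_b64.replace('-', '+').replace('_', '/')
    feed_b64 += '=' * (-len(feed_b64) % 4)
    return base64.b64decode(feed_b64).decode('utf-8')

print(decode_feed_param(
    'https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94bWwvcG9kY2FzdC54bWw'
))
# -> https://www.residentadvisor.net/xml/podcast.xml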
Example #2
    def _get_content_from_api(self, query: str, start_date: datetime,
                              end_date: datetime) -> str:
        """Fetch the posts data from the Crimson Hexagon API and return the HTTP response content."""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        return response.decoded_content()
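
A note on the URL construction in this example: the query string is built with % interpolation, which does not escape the values. A sketch of the same request URL built with the stdlib's urlencode instead (all values below are placeholders):

from urllib.parse import urlencode

params = {
    'auth': 'API_KEY',   # placeholder, not a real key
    'id': 12345,         # placeholder monitor ID
    'start': '2020-01-01',
    'end': '2020-01-02',
    'extendLimit': 'true',
}
url = 'https://api.crimsonhexagon.com/api/monitor/posts?' + urlencode(params)
# urlencode percent-escapes each value, which plain %-interpolation does not.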
Example #3
class _SitemapWebClient(AbstractWebClient):
    # Some web servers might be generating huge sitemaps on the fly, which is why this timeout is rather big.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__ua',
    ]

    def __init__(self):
        self.__ua = UserAgent()
        self.__ua.set_timeout(self.__HTTP_REQUEST_TIMEOUT)

    def set_max_response_data_length(self,
                                     max_response_data_length: int) -> None:
        self.__ua.set_max_size(max_response_data_length)

    def get(self, url: str) -> AbstractWebClientResponse:
        ua_response = self.__ua.get(url)

        if ua_response.is_success():
            return _SitemapWebClientResponse(ua_response=ua_response)
        else:
            return WebClientErrorResponse(
                message=ua_response.status_line(),
                retryable=ua_response.code() in RETRYABLE_HTTP_STATUS_CODES,
            )
Example #4
def get_url_retry_on_client_errors(url: str,
                                   ua: UserAgent,
                                   retry_count: int = 5,
                                   sleep_between_retries: int = 1) -> Response:
    """Fetch URL, retry on client errors (which, as per implementation, might be request timeouts too)."""
    assert retry_count > 0, "Retry count must be positive."

    response = None
    for retry in range(0, retry_count):
        log.info("Fetching URL {}...".format(url))
        response = ua.get(url)
        if response.is_success():
            return response
        else:
            log.warning("Request for URL {} failed: {}".format(url, response.message()))

            if response.error_is_client_side():
                log.info("Retrying URL {} in {} seconds...".format(url, sleep_between_retries))
                time.sleep(sleep_between_retries)

            else:
                log.info("Not retrying for URL {}".format(url))
                return response

    log.info("Giving up on URL {}".format(url))
    return response
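
How this helper might be called, as a sketch; the UserAgent import path is an assumption based on the surrounding Media Cloud code and may differ:

from mediawords.util.web.user_agent import UserAgent  # assumed import path

ua = UserAgent()
ua.set_timeout(30)

response = get_url_retry_on_client_errors(
    url='https://example.com/robots.txt',
    ua=ua,
    retry_count=3,
    sleep_between_retries=2,
)
if response.is_success():
    print(response.decoded_content())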
Example #5
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
    """Wait for Solr to start and collections to become available, if needed."""

    # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

    connected = False

    for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1):

        if retry > 0:
            log.debug(f"Retrying Solr connection ({retry})...")

        try:

            ua = UserAgent()
            ua.set_timeout(1)
            response = ua.get(sample_select_url)

            if not response.is_success():
                raise Exception(f"Unable to connect: {response.status_line()}")

            if not response.decoded_content():
                raise Exception("Response is empty.")

            try:
                result = response.decoded_json()
            except Exception as ex:
                raise Exception(f"Unable to decode response: {ex}")

            if not isinstance(result, dict):
                raise Exception(
                    f"Result is not a dictionary: {response.decoded_content()}"
                )

            if 'response' not in result:
                raise Exception(
                    f"Response doesn't have 'response' key: {response.decoded_content()}"
                )

        except Exception as ex:

            log.warning(f"Solr is down, will retry: {ex}")
            time.sleep(1)

        else:
            log.debug("Solr is up!")
            connected = True
            break

    if not connected:
        raise McSolrRequestDidNotStartInTimeException(
            f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up"
        )
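
The same readiness pattern reduced to a standalone sketch using the third-party 'requests' library (an assumption; not what the code above uses). The Solr URL and collection name below are placeholders:

import time
import requests

def wait_for_solr(solr_url: str, collection: str, timeout_s: int = 60) -> None:
    # Query a bogus term (as above) instead of *:* to keep the request cheap.
    select_url = f"{solr_url}/{collection}/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"
    for _ in range(timeout_s):
        try:
            result = requests.get(select_url, timeout=1).json()
            if 'response' in result:
                return  # Solr is up and the collection answers queries.
        except Exception:
            pass
        time.sleep(1)
    raise TimeoutError(f"Solr did not start within {timeout_s} seconds")

# wait_for_solr('http://localhost:8983/solr', 'mediacloud')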
Example #6
    def fetch_posts(self, query: dict, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch tweets from archive.org that match the given query for the given day."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        enc_query = urlencode({
            'q': query,
            'date_from': start_arg,
            'date_to': end_arg
        })

        url = "https://searchtweets.archivelab.org/export?" + enc_query

        log.debug("archive.org url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsArchiveTwitterDataException(
                "error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        # sometimes we get null characters, which choke the csv module
        decoded_content = decoded_content.replace('\x00', '')

        meta_tweets = []
        lines = decoded_content.splitlines()[1:]
        for row in csv.reader(lines, delimiter="\t"):
            fields = 'user_name user_screen_name lang text timestamp_ms url'.split(' ')
            meta_tweet = {}
            for i, field in enumerate(fields):
                meta_tweet[field] = row[i] if i < len(row) else ''

            if 'url' not in meta_tweet or meta_tweet['url'] == '':
                log.warning("meta_tweet '%s' does not have a url" % str(row))
                continue

            meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

            meta_tweets.append(meta_tweet)

        add_tweets_to_meta_tweets(meta_tweets)

        return meta_tweets
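
A quick illustration of why the null characters are stripped above: on at least some Python versions, csv.reader raises "line contains NUL" when the input contains \x00 bytes, so the content is scrubbed first:

import csv

raw = 'user_name\tlang\x00\ttext'
cleaned = raw.replace('\x00', '')
print(next(csv.reader([cleaned], delimiter='\t')))  # ['user_name', 'lang', 'text']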
Example #7
def fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config[
            'crimson_hexagon']:
        raise McFetchTopicTweetsConfigException(
            "no key in mediawords.yml at //crimson_hexagon/key.")

    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
        % (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " +
                                              response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " +
                                              str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets
Example #8
def _get_feed_url_from_google_podcasts_url(url: str) -> str:
    """
    Given a Google Podcasts URL, try to determine an RSS feed URL from it.

    :param url: Google Podcasts URL, e.g. https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94
                bWwvcG9kY2FzdC54bWw&ved=0CAAQ4aUDahcKEwiot6W5hrnnAhUAAAAAHQAAAAAQAQ&hl=lt
    :return: RSS feed URL that Google Podcasts uses, or the original URL if it's not a Google Podcasts URL / the feed
             URL can't be determined.
    """

    uri = furl(url)

    if uri.host != 'podcasts.google.com':
        log.debug(f"URL '{url}' is not Google Podcasts URL.")
        return url

    if 'feed' not in uri.args:
        log.error(f"URL '{url}' doesn't have a 'feed' parameter.")
        return url

    # Remove the rest of the arguments because they might lead to an episode page which doesn't have "data-feed"
    args = list(uri.args.keys())
    for arg in args:
        if arg != 'feed':
            del uri.args[arg]

    url = str(uri.url)

    ua = UserAgent()
    res = ua.get(url)
    if not res.is_success():
        log.error(
            f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
        return url

    html = res.decoded_content()

    # <div jsname="<...>" jscontroller="<...>" jsaction="<...>" data-feed="<...>">
    match = re.search(r'data-feed="(https?://.+?)"', html, flags=re.IGNORECASE)
    if not match:
        log.error(f"Feed URL was not found in Google Podcasts feed page.")
        return url

    feed_url = match.group(1)

    log.info(f"Resolved Google Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
Example #9
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on the CH API using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config[
                'crimson_hexagon']:
            raise McFetchTopicTweetsConfigException(
                "no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " +
                                                  response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " +
                                                  str(data))

        return data
Example #10
    def test_api_request(self):
        """Make an API request, see if it succeeds."""

        credentials = self.univision_credentials()

        handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())
        api_request_url = handler._api_request_url_with_signature_from_config(api_url=credentials.url)
        assert api_request_url, 'API request URL is not empty'

        ua = UserAgent()
        ua.set_timeout(30)

        response = ua.get(api_request_url)
        assert response.is_success(), 'API request was successful'

        json_string = response.decoded_content()
        assert json_string, 'JSON response is not empty'

        json = response.decoded_json()
        assert json.get('status', None) == 'success', "JSON response was successful"
        assert 'data' in json, 'JSON response has "data" key'
Example #11
    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on the CH API using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
            raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
               (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

        return data
Example #12
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass if
    something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values might be either strings or lists of strings if
                   multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Parameters should be a dictionary.")

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):

        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # some stories consistently return empty content, so just raise a soft failure and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = f"API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- return the data
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")
    return data
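
The furl-based URL assembly in this example, reduced to a self-contained sketch. The Graph API version and node name are placeholders, and 'APP_ID|APP_SECRET' stands in for the app-access-token format built above:

from furl import furl

api_uri = furl('https://graph.facebook.com/v5.0')  # placeholder endpoint/version
api_uri.path.segments.append('url_metadata')       # hypothetical node name
api_uri = api_uri.add({'url': 'https://example.com/story.html'})
api_uri = api_uri.add({'access_token': 'APP_ID|APP_SECRET'})  # placeholder credentials
print(api_uri.url)
# e.g. https://graph.facebook.com/v5.0/url_metadata?url=...&access_token=...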
Example #13
def _get_feed_url_from_itunes_podcasts_url(url: str) -> str:
    """
    Given an iTunes Podcasts URL, try to determine an RSS feed URL from it.

    :param url: iTunes Podcasts URL, e.g. https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008
    :return: RSS feed URL that iTunes Podcasts uses, or the original URL if it's not an iTunes Podcasts URL / the feed
             URL can't be determined.
    """

    uri = furl(url)

    if uri.host not in {'podcasts.apple.com', 'itunes.apple.com'}:
        log.debug(f"URL '{url}' is not iTunes Podcasts URL.")
        return url

    # https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008
    itunes_id = None
    for segment in reversed(uri.path.segments):
        match = re.match(r'^id(\d+?)$', segment)
        if match:
            itunes_id = match.group(1)
            break

    if not itunes_id:
        log.error(f"Unable to determine iTunes ID from URL '{url}'")
        return url

    ua = UserAgent()
    res = ua.get(
        f"https://itunes.apple.com/lookup?id={itunes_id}&entity=podcast")
    if not res.is_success():
        log.error(
            f"Unable to fetch iTunes Podcasts feed URL: {res.status_line()}")
        return url

    try:
        res_dict = res.decoded_json()
        if not isinstance(res_dict, dict):
            raise Exception("Result is not a dictionary")
    except Exception as ex:
        log.error(f"Unable to decode iTunes Podcasts feed JSON: {ex}")
        return url

    if res_dict.get('resultCount', None) != 1:
        log.error("Result count is not 1")
        return url

    results = res_dict.get('results', None)
    if not results:
        log.error("'results' not found in JSON response")
        return url

    if len(results) != 1:
        log.error("'results' is expected to have a single list item")
        return url

    feed_url = results[0].get('feedUrl', None)
    if not feed_url:
        log.error("'feedUrl' was not found in first row of 'results'")
        return url

    log.info(f"Resolved iTunes Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
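
The iTunes Lookup call in this example as a standalone sketch using 'requests' (an assumption; the original uses UserAgent). The ID is the one from the docstring example:

import requests

res = requests.get(
    'https://itunes.apple.com/lookup',
    params={'id': '1364954186', 'entity': 'podcast'},
    timeout=30,
)
data = res.json()
if data.get('resultCount') == 1:
    print(data['results'][0].get('feedUrl'))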
Example #14
    def fetch_posts(self, query: str, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch day of tweets from crimson hexagon"""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(decode_json(decoded_content))

        if 'status' not in data or not data['status'] == 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.warning("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                post = {
                    'post_id': mt['tweet_id'],
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': mt['tweet']['created_at'],
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts