def _get_feed_url_from_google_podcasts_url(url: str) -> str:
    """
    Given a Google Podcasts URL, try to determine an RSS feed URL from it.

    :param url: Google Podcasts URL, e.g.
        https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94bWwvcG9kY2FzdC54bWw&ved=0CAAQ4aUDahcKEwiot6W5hrnnAhUAAAAAHQAAAAAQAQ&hl=lt
    :return: RSS feed URL that Google Podcasts uses, or the original URL if it's not a Google Podcasts URL / the feed
        URL can't be determined.
    """
    uri = furl(url)

    if uri.host != 'podcasts.google.com':
        log.debug(f"URL '{url}' is not a Google Podcasts URL.")
        return url

    if 'feed' not in uri.args:
        log.error(f"URL '{url}' doesn't have a 'feed' parameter.")

    # Remove the rest of the arguments because they might lead to an episode page which doesn't have the feed URL
    for arg in list(uri.args.keys()):
        if arg != 'feed':
            del uri.args[arg]

    url = str(uri.url)

    ua = UserAgent()
    res = ua.get(url)
    if not res.is_success():
        log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
        return url

    html = res.decoded_content()

    # Check whether this is an individual episode URL rather than the show's Google Podcasts homepage; the feed URL
    # doesn't appear on individual episode pages, so we need to spider to the show's Google Podcasts homepage to
    # get it.
    if '/episode/' in url:
        show_homepage = url.split('/episode/')[0]
        res = ua.get(show_homepage)
        if not res.is_success():
            log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
            return show_homepage
        else:
            html = res.decoded_content()

    # Get the show's feed URL from its Google Podcasts homepage
    match = re.search(r'c-data id="i3" jsdata=".*(https?://.+?);2', html, flags=re.IGNORECASE)
    if not match:
        log.error("Feed URL was not found in Google Podcasts feed page.")
        return url

    feed_url = match.group(1)

    log.info(f"Resolved Google Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
def _get_content_from_api(self, query: str, start_date: datetime.datetime, end_date: datetime.datetime) -> str:
    """Fetch the posts data from the CH API and return the HTTP response content."""
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    return response.decoded_content()
class _SitemapWebClient(AbstractWebClient):
    # Some webservers might be generating huge sitemaps on the fly, which is why this timeout is rather long.
    __HTTP_REQUEST_TIMEOUT = 60

    __slots__ = [
        '__ua',
    ]

    def __init__(self):
        self.__ua = UserAgent()
        self.__ua.set_timeout(self.__HTTP_REQUEST_TIMEOUT)

    def set_max_response_data_length(self, max_response_data_length: int) -> None:
        self.__ua.set_max_size(max_response_data_length)

    def get(self, url: str) -> AbstractWebClientResponse:
        ua_response = self.__ua.get(url)

        if ua_response.is_success():
            return _SitemapWebClientResponse(ua_response=ua_response)
        else:
            return WebClientErrorResponse(
                message=ua_response.status_line(),
                retryable=ua_response.code() in RETRYABLE_HTTP_STATUS_CODES,
            )
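# A minimal usage sketch for _SitemapWebClient (not part of the original module): the
# client is meant to be handed to whichever sitemap-parsing entry point consumes an
# AbstractWebClient. The 10 MB cap and the sitemap URL are hypothetical placeholders.
web_client = _SitemapWebClient()
web_client.set_max_response_data_length(10 * 1024 * 1024)

response = web_client.get('https://example.com/sitemap.xml')
if isinstance(response, WebClientErrorResponse):
    # Accessor names on WebClientErrorResponse aren't shown above, so none are assumed here.
    log.error("Sitemap fetch failed.")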
def get_url_retry_on_client_errors(url: str,
                                   ua: UserAgent,
                                   retry_count: int = 5,
                                   sleep_between_retries: int = 1) -> Response:
    """Fetch URL, retry on client errors (which, as per implementation, might be request timeouts too)."""
    assert retry_count > 0, "Retry count must be positive."

    response = None
    for retry in range(retry_count):
        log.info("Fetching URL {}...".format(url))
        response = ua.get(url)

        if response.is_success():
            return response
        else:
            log.warning("Request for URL {} failed: {}".format(url, response.message()))

            if response.error_is_client_side():
                log.info("Retrying URL {} in {} seconds...".format(url, sleep_between_retries))
                time.sleep(sleep_between_retries)
            else:
                log.info("Not retrying for URL {}".format(url))
                return response

    log.info("Giving up on URL {}".format(url))
    return response
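# A short usage sketch for get_url_retry_on_client_errors() (not part of the original
# module); the robots.txt URL is a hypothetical placeholder.
ua = UserAgent()
ua.set_timeout(30)

response = get_url_retry_on_client_errors(url='https://example.com/robots.txt', ua=ua, retry_count=3)
if response.is_success():
    robots_txt_content = response.decoded_content()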
def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None:
    """Wait for Solr to start and collections to become available, if needed."""

    # Search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason
    sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json"

    connected = False

    for retry in range(0, __SOLR_STARTUP_TIMEOUT + 1):
        if retry > 0:
            log.debug(f"Retrying Solr connection ({retry})...")

        try:
            ua = UserAgent()
            ua.set_timeout(1)

            response = ua.get(sample_select_url)
            if not response.is_success():
                raise Exception(f"Unable to connect: {response.status_line()}")

            if not response.decoded_content():
                raise Exception("Response is empty.")

            try:
                result = response.decoded_json()
            except Exception as ex:
                raise Exception(f"Unable to decode response: {ex}")

            if not isinstance(result, dict):
                raise Exception(f"Result is not a dictionary: {response.decoded_content()}")

            if 'response' not in result:
                raise Exception(f"Response doesn't have 'response' key: {response.decoded_content()}")

        except Exception as ex:
            log.warning(f"Solr is down, will retry: {ex}")
            time.sleep(1)

        else:
            log.debug("Solr is up!")
            connected = True
            break

    if not connected:
        raise McSolrRequestDidNotStartInTimeException(
            f"Solr is still down after {__SOLR_STARTUP_TIMEOUT} retries, giving up"
        )
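# For reference, a successful Solr /select response decodes to roughly this shape (a
# sketch; 'responseHeader' contents vary by Solr version and query parameters):
#
#     {
#         "responseHeader": {"status": 0, "QTime": 1},
#         "response": {"numFound": 0, "start": 0, "docs": []}
#     }
#
# which is why the health check above only asserts that the decoded result is a dict
# with a 'response' key.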
def fetch_posts(self, query: dict, start_date: datetime.datetime, end_date: datetime.datetime) -> list:
    """Fetch tweets from archive.org that match the given query for the given day."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    enc_query = urlencode({'q': query, 'date_from': start_arg, 'date_to': end_arg})

    url = "https://searchtweets.archivelab.org/export?" + enc_query

    log.debug("archive.org url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsArchiveTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    # Sometimes we get null characters, which choke the csv module
    decoded_content = decoded_content.replace('\x00', '')

    meta_tweets = []
    fields = 'user_name user_screen_name lang text timestamp_ms url'.split(' ')

    # Skip the TSV header line
    lines = decoded_content.splitlines()[1:]

    for row in csv.reader(lines, delimiter="\t"):
        meta_tweet = {}
        for i, field in enumerate(fields):
            meta_tweet[field] = row[i] if i < len(row) else ''

        if 'url' not in meta_tweet or meta_tweet['url'] == '':
            log.warning("meta_tweet '%s' does not have a url" % str(row))
            continue

        meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

        meta_tweets.append(meta_tweet)

    add_tweets_to_meta_tweets(meta_tweets)

    return meta_tweets
def fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")

    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets
def _get_feed_url_from_google_podcasts_url(url: str) -> str:
    """
    Given a Google Podcasts URL, try to determine an RSS feed URL from it.

    :param url: Google Podcasts URL, e.g.
        https://podcasts.google.com/?feed=aHR0cHM6Ly93d3cucmVzaWRlbnRhZHZpc29yLm5ldC94bWwvcG9kY2FzdC54bWw&ved=0CAAQ4aUDahcKEwiot6W5hrnnAhUAAAAAHQAAAAAQAQ&hl=lt
    :return: RSS feed URL that Google Podcasts uses, or the original URL if it's not a Google Podcasts URL / the feed
        URL can't be determined.
    """
    uri = furl(url)

    if uri.host != 'podcasts.google.com':
        log.debug(f"URL '{url}' is not a Google Podcasts URL.")
        return url

    if 'feed' not in uri.args:
        log.error(f"URL '{url}' doesn't have a 'feed' parameter.")

    # Remove the rest of the arguments because they might lead to an episode page which doesn't have "data-feed"
    for arg in list(uri.args.keys()):
        if arg != 'feed':
            del uri.args[arg]

    url = str(uri.url)

    ua = UserAgent()
    res = ua.get(url)
    if not res.is_success():
        log.error(f"Unable to fetch Google Podcasts feed URL: {res.status_line()}")
        return url

    html = res.decoded_content()

    # <div jsname="<...>" jscontroller="<...>" jsaction="<...>" data-feed="<...>">
    match = re.search(r'data-feed="(https?://.+?)"', html, flags=re.IGNORECASE)
    if not match:
        log.error("Feed URL was not found in Google Podcasts feed page.")
        return url

    feed_url = match.group(1)

    log.info(f"Resolved Google Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
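# Side note (not from the original function): in the docstring's example URL, the 'feed'
# argument is just a base64 encoding of the feed URL itself -- 'aHR0cHM6Ly...' decodes to
# https://www.residentadvisor.net/xml/podcast.xml -- so a sketch like the one below could
# resolve such URLs without any HTTP fetch. Whether every Google Podcasts URL encodes the
# feed this way is an assumption, and _decode_feed_param is a hypothetical helper name.
import base64

def _decode_feed_param(feed_param: str) -> str:
    """Hypothetical helper: decode a base64-encoded 'feed' query argument."""
    # Restore the padding that URL-embedded base64 strings usually drop.
    padded = feed_param + '=' * (-len(feed_param) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8')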
def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
    """Implement fetch_posts on the CH API using the config data from mediawords.yml."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException("no key in mediawords.yml at //crimson_hexagon/key.")

    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " + str(data))

    return data
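# For reference, the checks above and the callers that read data['posts'] imply this
# rough shape for a successful CH monitor/posts response (a sketch reconstructed from
# the keys this module actually reads; real responses carry more fields per post):
#
#     {
#         "status": "success",
#         "posts": [
#             {"url": "http://twitter.com/<user>/status/<id>", ...},
#             ...
#         ]
#     }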
def test_api_request(self):
    """Make an API request, see if it succeeds."""
    credentials = self.univision_credentials()

    handler = DownloadFeedUnivisionHandler(crawler_config=self._mock_crawler_config())
    api_request_url = handler._api_request_url_with_signature_from_config(api_url=credentials.url)
    assert api_request_url, 'API request URL is not empty'

    ua = UserAgent()
    ua.set_timeout(30)

    response = ua.get(api_request_url)
    assert response.is_success(), 'API request was successful'

    json_string = response.decoded_content()
    assert json_string, 'JSON response is not empty'

    json = response.decoded_json()
    assert json.get('status', None) == 'success', "JSON response was successful"
    assert 'data' in json, 'JSON response has "data" key'
def _api_request(node: str, params: Dict[str, Union[str, List[str]]], config: FacebookConfig) -> Union[dict, list]:
    """
    Make a Facebook API request.

    Return successful or failed API response if we were able to make a request. Throw McFacebookException subclass if
    something went wrong.

    :param node: Facebook API node to call.
    :param params: Dictionary of parameters to pass to the API; values might be either strings or lists of strings if
        multiple values with the same key have to be passed.
    :param config: Facebook configuration object.
    :return: API response.
    """
    node = decode_object_from_bytes_if_needed(node)
    params = decode_object_from_bytes_if_needed(params)

    if node is None:
        raise McFacebookInvalidParametersException("Node is undefined (node might be an empty string).")

    if not isinstance(params, dict):
        raise McFacebookInvalidParametersException("Params is not a dict.")

    if not config.is_enabled():
        raise McFacebookInvalidConfigurationException("Facebook API is not enabled.")

    if not config.api_endpoint():
        raise McFacebookInvalidConfigurationException("Facebook API endpoint URL is not configured.")

    api_uri = furl(config.api_endpoint())
    api_uri.path.segments.append(node)

    for key, values in params.items():
        if key is None or values is None:
            raise McFacebookInvalidParametersException("Both 'key' and 'value' must be defined.")

        if isinstance(values, str):
            # A single value
            api_uri = api_uri.add({key: values})

        elif isinstance(values, list):
            # Multiple values for the same key
            for value in values:
                api_uri = api_uri.add({key: value})

        else:
            raise McFacebookInvalidParametersException("Values is neither a string nor a list.")

    log.debug(f"Facebook API final URL (pre-authentication): {api_uri.url}")

    app_id = config.app_id()
    app_secret = config.app_secret()

    if not (app_id and app_secret):
        raise McFacebookInvalidConfigurationException("Both app ID and app secret must be set.")

    access_token = f"{app_id}|{app_secret}"
    api_uri = api_uri.add({'access_token': access_token})

    # Last API error to set as an exception message if we run out of retries
    last_api_error = None
    data = None

    for retry in range(1, __FACEBOOK_GRAPH_API_RETRY_COUNT + 1):
        if retry > 1:
            log.warning(f"Retrying #{retry}...")

        ua = UserAgent()
        ua.set_timeout(__FACEBOOK_API_HTTP_TIMEOUT)

        try:
            response = ua.get(api_uri.url)
        except Exception as ex:
            # UserAgent dying should be pretty rare, so if it does die, it means that we probably have messed up
            # something in the code or arguments
            raise McFacebookInvalidParametersException(f"UserAgent died while trying to fetch Facebook API URL: {ex}")

        decoded_content = response.decoded_content()

        if not decoded_content:
            # Some stories consistently return empty content, so just return a soft error and move on
            raise McFacebookSoftFailureException("Decoded content is empty.")

        try:
            data = decode_json(decoded_content)
        except Exception as ex:

            if 'something went wrong' in decoded_content:
                # Occasionally Facebook returns a "something went wrong" 500 page on which we'd like to retry the
                # request
                last_api_error = "API responded with 'Something went wrong', will retry"
                log.error(last_api_error)
                continue

            else:
                # If we can't seem to decode JSON and it's not a "something went wrong" issue, we should give up
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message=f"Unable to decode JSON response: {ex}",
                )

        if response.is_success():
            # Response was successful and we managed to decode JSON -- break from the retry loop
            return data

        else:
            if 'error' not in data:
                # More likely than not it's our problem so consider it a hard failure
                raise McFacebookUnexpectedAPIResponseException(
                    response=decoded_content,
                    error_message="No 'error' key but HTTP status is not 2xx",
                )

            error = data['error']
            error_code = error.get('code', -1)
            error_message = error.get('message', 'unknown message')

            if error_code in __FACEBOOK_GRAPH_API_RETRYABLE_ERROR_CODES:
                # Retryable error
                last_api_error = (
                    f"Retryable error {error_code}: {error_message}, "
                    f"will retry in {config.seconds_to_wait_between_retries()} seconds"
                )
                log.error(last_api_error)
                time.sleep(config.seconds_to_wait_between_retries())
                continue

            else:
                # Non-retryable error
                log.error(f"Non-retryable error {error_code}: {error_message}")
                return data

    # At this point, we've retried the request for some time but nothing worked
    log.error(f"Ran out of retries; last error: {last_api_error}")

    return data
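# A minimal usage sketch for _api_request() (not from the original module). The empty
# node plus 'id'/'fields' parameters follow the public Graph API convention for looking
# up a URL object's engagement counts; whether this module calls it that way is an
# assumption, and the story URL is a hypothetical placeholder.
config = FacebookConfig()
response = _api_request(
    node='',  # empty node: query the Graph API root for a URL object
    params={'id': 'https://example.com/story.html', 'fields': 'engagement'},
    config=config,
)
if isinstance(response, dict) and 'engagement' in response:
    share_count = response['engagement'].get('share_count', 0)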
def _get_feed_url_from_itunes_podcasts_url(url: str) -> str:
    """
    Given an iTunes Podcasts URL, try to determine an RSS feed URL from it.

    :param url: iTunes Podcasts URL, e.g. https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008
    :return: RSS feed URL that iTunes Podcasts uses, or the original URL if it's not an iTunes Podcasts URL / the feed
        URL can't be determined.
    """
    uri = furl(url)

    if uri.host not in {'podcasts.apple.com', 'itunes.apple.com'}:
        log.debug(f"URL '{url}' is not an iTunes Podcasts URL.")
        return url

    # https://podcasts.apple.com/lt/podcast/blah-blah/id1364954186?i=1000455255008
    itunes_id = None
    for segment in reversed(uri.path.segments):
        match = re.match(r'^id(\d+?)$', segment)
        if match:
            itunes_id = match.group(1)
            break

    if not itunes_id:
        log.error(f"Unable to determine iTunes ID from URL '{url}'")
        return url

    ua = UserAgent()
    res = ua.get(f"https://itunes.apple.com/lookup?id={itunes_id}&entity=podcast")
    if not res.is_success():
        log.error(f"Unable to fetch iTunes Podcasts feed URL: {res.status_line()}")
        return url

    try:
        res_dict = res.decoded_json()
        if not isinstance(res_dict, dict):
            raise Exception("Result is not a dictionary")
    except Exception as ex:
        log.error(f"Unable to decode iTunes Podcasts feed JSON: {ex}")
        return url

    if res_dict.get('resultCount', None) != 1:
        log.error("Result count is not 1")
        return url

    results = res_dict.get('results', None)
    if not results:
        log.error("'results' not found in JSON response")
        return url

    if len(results) != 1:
        log.error("'results' is expected to have a single list item")
        return url

    feed_url = results[0].get('feedUrl', None)
    if not feed_url:
        log.error("'feedUrl' was not found in first row of 'results'")
        return url

    log.info(f"Resolved iTunes Podcasts URL '{url}' as '{feed_url}'")

    return feed_url
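# For reference, a successful iTunes Lookup API response for a podcast has roughly this
# shape (a sketch; the feed URL shown is a hypothetical placeholder, and real entries
# carry many more fields than the one read above):
#
#     {
#         "resultCount": 1,
#         "results": [
#             {"kind": "podcast", "feedUrl": "https://example.com/podcast.xml", ...}
#         ]
#     }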
def fetch_posts(self, query: str, start_date: datetime.datetime, end_date: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    log.debug("crimson_hexagon_twitter.fetch_posts")

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = TopicsMineConfig()
    api_key = config.crimson_hexagon_api_key()

    end_date = end_date + datetime.timedelta(days=1)

    start_arg = start_date.strftime('%Y-%m-%d')
    end_arg = end_date.strftime('%Y-%m-%d')

    url = ("https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true" %
           (api_key, ch_monitor_id, start_arg, end_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McPostsCHTwitterDataException("error fetching posts: " + response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(decode_json(decoded_content))

    if 'status' not in data or not data['status'] == 'success':
        raise McPostsCHTwitterDataException("Unknown response status: " + str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    add_tweets_to_meta_tweets(meta_tweets)

    posts = []
    for mt in meta_tweets:
        log.warning("mt: %d" % mt['tweet_id'])
        if 'tweet' in mt:
            post = {
                'post_id': mt['tweet_id'],
                'data': mt,
                'content': mt['tweet']['text'],
                'publish_date': mt['tweet']['created_at'],
                'author': mt['tweet']['user']['screen_name'],
                'channel': mt['tweet']['user']['screen_name'],
                'url': mt['url'],
            }
            posts.append(post)

    return posts