Example #1

    def _get_content_from_api(self, query: str, start_date: datetime.datetime,
                              end_date: datetime.datetime) -> str:
        """Fetch posts data from the Crimson Hexagon API and return the HTTP response content."""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        return response.decoded_content()
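
A minimal standalone sketch of the date-window arithmetic above (stdlib only; the dates are made up). The API's end parameter appears to be exclusive, which is why the method formats end_date plus one day:

import datetime

start_date = datetime.datetime(2020, 1, 1)
end_date = datetime.datetime(2020, 1, 7)

start_arg = start_date.strftime('%Y-%m-%d')
end_arg = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

assert (start_arg, end_arg) == ('2020-01-01', '2020-01-08')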
Example #2
def _get_user_agent() -> UserAgent:
    """Get a properly configured user agent."""
    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    return ua
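
A hypothetical usage sketch for the helper above; UserAgent is the shared wrapper these examples configure, and the URL is a placeholder:

ua = _get_user_agent()
response = ua.get("https://example.com/api/endpoint")  # placeholder URL
if not response.is_success():
    raise RuntimeError("error fetching: " + response.decoded_content())
content = response.decoded_content()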
Example #3

    def fetch_posts(self, query: dict, start_date: datetime.datetime,
                    end_date: datetime.datetime) -> list:
        """Fetch tweets from archive.org that match the given query for the given day."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        enc_query = urlencode({
            'q': query,
            'date_from': start_arg,
            'date_to': end_arg
        })

        url = "https://searchtweets.archivelab.org/export?" + enc_query

        log.debug("archive.org url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsArchiveTwitterDataException(
                "error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        # sometimes we get null characters, which choke the csv module
        decoded_content = decoded_content.replace('\x00', '')

        meta_tweets = []
        lines = decoded_content.splitlines()[1:]
        for row in csv.reader(lines, delimiter="\t"):
            fields = 'user_name user_screen_name lang text timestamp_ms url'.split()
            meta_tweet = {}
            for i, field in enumerate(fields):
                meta_tweet[field] = row[i] if i < len(row) else ''

            if 'url' not in meta_tweet or meta_tweet['url'] == '':
                log.warning("meta_tweet '%s' does not have a url" % str(row))
                continue

            meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

            meta_tweets.append(meta_tweet)

        add_tweets_to_meta_tweets(meta_tweets)

        return meta_tweets
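
A standalone sketch of the TSV-row parsing above, run against one made-up line; the field order is the one fetch_posts assumes:

import csv

fields = 'user_name user_screen_name lang text timestamp_ms url'.split()
sample_line = "Jane Doe\tjdoe\ten\thello world\t1588000000000\thttps://twitter.com/jdoe/status/12345"
row = next(csv.reader([sample_line], delimiter="\t"))
# Missing trailing fields default to '' just as in fetch_posts
meta_tweet = {field: (row[i] if i < len(row) else '') for i, field in enumerate(fields)}
assert meta_tweet['user_screen_name'] == 'jdoe'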
Example #4
def fetch_meta_tweets_from_ch(query: str, day: datetime.datetime) -> list:
    """Fetch a day of tweets from Crimson Hexagon."""
    ch_monitor_id = int(query)

    ua = UserAgent()
    ua.set_max_size(100 * 1024 * 1024)
    ua.set_timeout(90)
    ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

    config = mediawords.util.config.get_config()
    if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
        raise McFetchTopicTweetsConfigException(
            "no key in mediawords.yml at //crimson_hexagon/key.")

    key = config['crimson_hexagon']['key']

    next_day = day + datetime.timedelta(days=1)

    day_arg = day.strftime('%Y-%m-%d')
    next_day_arg = next_day.strftime('%Y-%m-%d')

    url = (
        "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
        % (key, ch_monitor_id, day_arg, next_day_arg))

    log.debug("crimson hexagon url: " + url)

    response = ua.get(url)

    if not response.is_success():
        raise McFetchTopicTweetsDataException("error fetching posts: " +
                                              response.decoded_content())

    decoded_content = response.decoded_content()

    data = dict(mediawords.util.parse_json.decode_json(decoded_content))

    if 'status' not in data or data['status'] != 'success':
        raise McFetchTopicTweetsDataException("Unknown response status: " +
                                              str(data))

    meta_tweets = data['posts']

    for mt in meta_tweets:
        mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

    return meta_tweets
Example #5

    def fetch_posts(ch_monitor_id: int, day: datetime.datetime) -> dict:
        """Implement fetch_posts on ch api using the config data from mediawords.yml."""
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = mediawords.util.config.get_config()
        if 'crimson_hexagon' not in config or 'key' not in config['crimson_hexagon']:
            raise McFetchTopicTweetsConfigException(
                "no key in mediawords.yml at //crimson_hexagon/key.")

        key = config['crimson_hexagon']['key']

        next_day = day + datetime.timedelta(days=1)

        day_arg = day.strftime('%Y-%m-%d')
        next_day_arg = next_day.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (key, ch_monitor_id, day_arg, next_day_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McFetchTopicTweetsDataException("error fetching posts: " +
                                                  response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(mediawords.util.parse_json.decode_json(decoded_content))

        if 'status' not in data or data['status'] != 'success':
            raise McFetchTopicTweetsDataException("Unknown response status: " +
                                                  str(data))

        return data
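
The same status check as a standalone sketch against a canned payload; json.loads stands in here for mediawords.util.parse_json.decode_json:

import json

decoded_content = '{"status": "success", "posts": []}'
data = dict(json.loads(decoded_content))
if 'status' not in data or data['status'] != 'success':
    raise RuntimeError("Unknown response status: " + str(data))
meta_tweets = data['posts']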
Example #7
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but empty text for a story usually means something went wrong upstream, so raise here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info("Annotating %d characters of text..." % len(text))

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    "Text length (%d) has exceeded the request text length limit (%d) so I will truncate it."
                    % (
                        text_length,
                        self.__TEXT_LENGTH_LIMIT,
                    ))
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                "Unable to create annotator request for text '%s': %s" % (
                    text,
                    str(ex),
                ))

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {request.url()}"
        assert port, f"API URL port is not set for URL {request.url()}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                "Annotator service at {url} didn't come up in {timeout} seconds, exiting..."
                .format(
                    url=request.url(),
                    timeout=self.__ANNOTATOR_SERVICE_TIMEOUT,
                ))

        log.debug("Sending request to %s..." % request.url())
        response = ua.request(request)
        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning("Request failed: %s" % response.decoded_content())

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    "The request timed out, giving up; text length: %d; text: %s"
                    % (
                        len(text),
                        text,
                    ))

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error("User agent error: %s: %s" % (
                    response.status_line(),
                    results_string,
                ))

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error('%s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        'Annotator service was unable to process the download: %s'
                        % results_string)

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error('Unknown HTTP response: %s: %s' % (
                        response.status_line(),
                        results_string,
                    ))

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                "Annotator returned nothing for text: %s" % text)

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error("Unable to parse JSON response: %s\nJSON string: %s" %
                        (
                            str(ex),
                            results_string,
                        ))
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                "Unable to determine whether response is valid: %s\nJSON string: %s"
                % (str(ex), results_string))
        if not response_is_valid:
            fatal_error("Annotator response is invalid for JSON string: %s" %
                        results_string)

        log.info("Done annotating %d characters of text." % len(text))

        return results
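
A condensed sketch of the server-side error triage above, mapping each HTTP status the method distinguishes onto its outcome (HTTPStatus is from the stdlib):

from http import HTTPStatus

def annotator_error_action(status_code: int) -> str:
    if status_code == HTTPStatus.REQUEST_TIMEOUT.value:
        return 'raise'  # stuck annotator request; don't retry
    if status_code in (HTTPStatus.METHOD_NOT_ALLOWED.value, HTTPStatus.BAD_REQUEST.value):
        return 'fatal'  # malformed request on our side; crash the process
    if status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
        return 'raise'  # processing error; raise so it gets logged into a database
    return 'fatal'      # unconfigured response; shut down the extractor

assert annotator_error_action(408) == 'raise'
assert annotator_error_action(500) == 'raise'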
Example #8
class AssociatedPressAPI:
    """Object used to interface with the Associated Press API and to return data from
    various API endpoints.
    """
    def __init__(self, ap_config: Optional[APCrawlerConfig] = None):

        self.api_key = None
        self.api_version = '1.1'
        self.retry_limit = 5
        self.ratelimit_info = dict()
        self.ua = UserAgent()
        self.ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256])

        if not ap_config:
            ap_config = APCrawlerConfig()

        self.api_key = ap_config.api_key()

        if not self.api_key:
            raise McAPMissingAPIKey(
                "API key configuration data missing for associated_press.")

    def feed(self, **kwargs) -> dict:
        """Feed API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Search-and-Feed/#feed)

        METHOD: GET

        ENDPOINT PARAMETERS:

        q: Query Expression

        include, exclude: Parameters used to customize the fields returned in the response.

        text_links: Specifies the format of the text renditions (stories, captions, scripts and shotlists) to return in
        the response.  For stories, the valid value is nitf (NITF) or anpa (ANPA 1312). For captions, scripts and
        shotlists, the valid value is nitf (NITF).  The value of all returns all available formats (this is the
        default).

        page_size: The maximum number of items to return per page. The default is 10 items with a maximum of 100 per
        page.

        versions: Specifies whether to return all available versions of the content item and all ANPA filings or only
        the latest (the same story in the ANPA format may be filed multiple times; for example, with a different
        category code).

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.

        """
        url = 'https://api.ap.org/media/v/content/feed'
        api_method = 'feed'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        feed_data = self._make_request(url, params)
        return json.loads(feed_data)['data']

    def search(self, **kwargs) -> dict:
        """Feed API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Search-and-Feed/#search)

        METHOD: GET

        ENDPOINT PARAMETERS:

        q: Query Expression

        include, exclude: Parameters used to customize the fields returned in the response.

        text_links: Specifies the format of the text renditions (stories, captions, scripts and shotlists) to return in
        the response.  For stories, the valid value is nitf (NITF) or anpa (ANPA 1312). For captions, scripts and
        shotlists, the valid value is nitf (NITF).  The value of all returns all available formats (this is the
        default).

        sort: The sort order of the returned results. By default, the results are sorted by relevance (meta.score) - the
        most relevant items first, regardless of the time period. Valid options are:

            versioncreated: desc. The latest items first (reverse chronological order).  versioncreated: asc. The oldest
            items first (chronological order).

        page_size: The maximum number of items to return per page. The default is 10 items with a maximum of 100 per
        page.

        page: The requested page number within the set of search results. Page numbers start at 1.

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.

        """
        url = 'https://api.ap.org/media/v/content/search'
        api_method = 'search'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        search_data = self._make_request(url, params)
        return json.loads(search_data)['data']

    def content(self, path, **kwargs) -> Optional[str]:
        """Content API endpoint (Documentation: https://api.ap.org/media/v/docs/api/Content-Item/)
        Example: https://api.ap.org/media/v[{version}]/content/{item_id}?apikey={apikey}[{optional_parameters}]

        METHOD: GET

        ENDPOINT PARAMETERS:

        qt: Unknown. They are present in the feed response but don't appear to be in the documentation

        et: Unknown. Same as above.

        REQUEST HEADERS:

        Accept-Encoding: Compresses the response to the gzip format. The valid value is gzip.
        """

        url = 'https://api.ap.org/media/v/content/{}'.format(path)
        api_method = 'item'
        params = {'apikey': self.api_key}
        params.update(kwargs)
        self._check_ratelimit(api_method)
        content_data = self._make_request(url, params)
        return content_data

    def _make_request(self, url: str, params: dict = None) -> Optional[str]:
        """Internal method for making API requests"""

        retries = self.retry_limit

        # Begin making request and retry up to retry limit
        while retries:

            log.debug("Making request to {} with parameters {}".format(
                url, params))

            try:
                response = requests.get(url, params=params, timeout=30)
            except Exception as e:
                log.warning(
                    "Encountered an exception while making request to {}. Exception info: {}"
                    .format(url, e))
            else:
                if response.status_code == 200:
                    log.debug("Successfully retrieved {}".format(url))
                    self._update_ratelimit_info(response.headers)
                    return response.content
                elif response.status_code == 403:
                    log.warning(
                        "Received a 403 (forbidden) response for {} -- skipping."
                        .format(url))
                    return None
                else:
                    log.debug(response.content)
                    log.warning(
                        "Received HTTP status code {} when fetching {}".format(
                            response.status_code, url))

            retries -= 1

            if retries == 0:
                raise McAPFetchError(
                    "Could not fetch {} after {} attempts. Giving up.".format(
                        url, self.retry_limit))

            wait_time = (self.retry_limit - retries)**2
            log.info(
                "Exponentially backing off for {} seconds.".format(wait_time))
            time.sleep(wait_time)

        return None

    def _check_ratelimit(self, api_method: str) -> None:
        """Check the endpoint rate limit before making an API call to that endpoint and to wait if necessary"""
        if api_method in self.ratelimit_info:
            # Default to 0 rather than None so that float() never receives None
            current_window_remaining = float(
                self.ratelimit_info[api_method].get('current_window_remaining', 0))
            next_window = float(
                self.ratelimit_info[api_method].get('next_window', 0))
            if current_window_remaining < 1 and next_window > time.time():
                wait_time = math.ceil(next_window - time.time())
                if wait_time > 0:
                    log.info(
                        'Rate limit for {}. Sleeping {} before next API call'.
                        format(api_method, wait_time))
                    time.sleep(wait_time)

    def _update_ratelimit_info(self, headers):
        """Internal method to update rate limit information for an API endpoint"""
        api_method = headers['x-mediaapi-Q-name']
        calls_used, window_limit = [
            int(x) for x in headers['x-mediaapi-Q-used'].split('/')
        ]

        if api_method not in self.ratelimit_info:
            self.ratelimit_info[api_method] = dict()

        self.ratelimit_info[api_method]['next_window'] = math.ceil(
            int(headers['x-mediaapi-Q-secondsLeft']) + time.time())
        self.ratelimit_info[api_method]['current_window_limit'] = window_limit
        self.ratelimit_info[api_method]['current_window_remaining'] = window_limit - calls_used
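
A hypothetical usage sketch for the class above; the keyword arguments map directly onto the documented endpoint parameters, and the item path would normally come from a feed or search response:

api = AssociatedPressAPI()
feed = api.feed(page_size=100)                    # parsed 'data' dict
results = api.search(q='election', page_size=10)  # parsed 'data' dict
item = api.content('abc123')                      # raw payload, or None on a 403 ('abc123' is made up)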
Example #9
    def __annotate_text(self, text: str) -> Union[dict, list]:
        """Fetch JSON annotation for text, decode it into dictionary / list."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            fatal_error("Text is None.")

        if len(text) == 0:
            # Annotators accept empty strings, but empty text for a story usually means something went wrong upstream, so raise here
            raise McJSONAnnotationFetcherException("Text is empty.")

        log.info(f"Annotating {len(text)} characters of text...")

        # Trim the text because that's what the annotator will do, and if the text is empty, we want to fail early
        # without making a request to the annotator at all
        text = text.strip()

        if self.__TEXT_LENGTH_LIMIT > 0:
            text_length = len(text)
            if text_length > self.__TEXT_LENGTH_LIMIT:
                log.warning(
                    f"Text length ({text_length}) has exceeded the request text length limit"
                    f"({self.__TEXT_LENGTH_LIMIT}) so I will truncate it.")
                text = text[:self.__TEXT_LENGTH_LIMIT]

        # Make a request
        ua = UserAgent()
        ua.set_timing([1, 2, 4, 8])
        ua.set_timeout(self.__HTTP_TIMEOUT)
        ua.set_max_size(None)

        request = None
        try:
            request = self._request_for_text(text=text)
            if request is None:
                raise McJSONAnnotationFetcherException(
                    "Returned request is None.")
        except Exception as ex:
            # Assume that this is some sort of a programming error too
            fatal_error(
                f"Unable to create annotator request for text '{text}': {ex}")

        # Wait for the service's HTTP port to become open as the service might be
        # still starting up somewhere
        uri = furl(request.url())
        hostname = str(uri.host)
        port = int(uri.port)
        assert hostname, f"URL hostname is not set for URL {request.url()}"
        assert port, f"API URL port is not set for URL {request.url()}"

        if not wait_for_tcp_port_to_open(
                port=port,
                hostname=hostname,
                retries=self.__ANNOTATOR_SERVICE_TIMEOUT,
        ):
            # Instead of throwing an exception, just crash the whole application
            # because there's no point in continuing on running it whatsoever.
            fatal_error(
                f"Annotator service at {request.url()} didn't come up in {self.__ANNOTATOR_SERVICE_TIMEOUT} seconds, "
                f"exiting...")

        log.debug(f"Sending request to {request.url()}...")

        # Try requesting a few times because sometimes it throws a connection error, e.g.:
        #
        #   WARNING mediawords.util.web.user_agent: Client-side error while processing request <PreparedRequest [POST]>:
        #   ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
        #   WARNING mediawords.annotator.fetcher: Request failed: ('Connection aborted.', ConnectionResetError(104,
        #   'Connection reset by peer'))
        #   ERROR mediawords.util.process: User agent error: 400 Client-side error: ('Connection aborted.',
        #   ConnectionResetError(104, 'Connection reset by peer'))
        response = None
        retries = 60
        sleep_between_retries = 1
        for retry in range(1, retries + 1):

            if retry > 1:
                log.warning(f"Retrying ({retry} / {retries})...")

            response = ua.request(request)

            if response.is_success():
                break
            else:
                if response.error_is_client_side():
                    log.error(
                        f"Request failed on the client side: {response.decoded_content()}"
                    )
                    time.sleep(sleep_between_retries)
                else:
                    break

        log.debug("Response received.")

        # Force UTF-8 encoding on the response because the server might not always
        # return correct "Content-Type"
        results_string = response.decoded_utf8_content()

        if not response.is_success():
            # Error; determine whether we should be blamed for making a malformed
            # request, or is it an extraction error
            log.warning(f"Request failed: {response.decoded_content()}")

            if response.code() == HTTPStatus.REQUEST_TIMEOUT.value:
                # Raise on request timeouts without retrying anything because those usually mean that we posted
                # something funky to the annotator service and it got stuck
                raise McJSONAnnotationFetcherException(
                    f"The request timed out, giving up; text length: {len(text)}; text: {text}"
                )

            if response.error_is_client_side():
                # Error was generated by the user agent client code; likely didn't reach server at all (timeout,
                # unresponsive host, etc.)
                fatal_error(
                    f"User agent error: {response.status_line()}: {results_string}"
                )

            else:

                # Error was generated by server
                http_status_code = response.code()

                if http_status_code == HTTPStatus.METHOD_NOT_ALLOWED.value \
                        or http_status_code == HTTPStatus.BAD_REQUEST.value:
                    # Not POST, empty POST
                    fatal_error(f'{response.status_line()}: {results_string}')

                elif http_status_code == HTTPStatus.INTERNAL_SERVER_ERROR.value:
                    # Processing error -- raise so that the error gets caught and logged into a database
                    raise McJSONAnnotationFetcherException(
                        f'Annotator service was unable to process the download: {results_string}'
                    )

                else:
                    # Shutdown the extractor on unconfigured responses
                    fatal_error(
                        f'Unknown HTTP response: {response.status_line()}: {results_string}'
                    )

        if results_string is None or len(results_string) == 0:
            raise McJSONAnnotationFetcherException(
                f"Annotator returned nothing for text: {text}")

        log.debug("Parsing response's JSON...")
        results = None
        try:
            results = decode_json(results_string)
            if results is None:
                raise McJSONAnnotationFetcherException(
                    "Returned JSON is None.")
        except Exception as ex:
            # If the JSON is invalid, it's probably something broken with the remote service, so that's why we do
            # fatal_error() here
            fatal_error(
                f"Unable to parse JSON response: {ex}\nJSON string: {results_string}"
            )
        log.debug("Done parsing response's JSON.")

        response_is_valid = False
        try:
            response_is_valid = self._fetched_annotation_is_valid(results)
        except Exception as ex:
            fatal_error(
                f"Unable to determine whether response is valid: {ex}\nJSON string: {results_string}"
            )
        if not response_is_valid:
            fatal_error(
                f"Annotator response is invalid for JSON string: {results_string}"
            )

        log.info(f"Done annotating {len(text)} characters of text.")

        return results
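
The client-side retry loop above, extracted as a self-contained sketch; do_request stands in for ua.request(request):

import logging
import time

log = logging.getLogger(__name__)

def request_with_retries(do_request, retries: int = 60, sleep_between_retries: int = 1):
    """Retry only client-side failures (e.g. connection resets); return the last response."""
    response = None
    for retry in range(1, retries + 1):
        if retry > 1:
            log.warning(f"Retrying ({retry} / {retries})...")
        response = do_request()
        if response.is_success():
            break
        if not response.error_is_client_side():
            break  # server-side errors are left to the caller's triage
        time.sleep(sleep_between_retries)
    return response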
Example #10
    def fetch_posts(self, query: str, start_date: datetime.datetime,
                    end_date: datetime.datetime) -> list:
        """Fetch tweets from Crimson Hexagon for the given date range."""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(decode_json(decoded_content))

        if 'status' not in data or data['status'] != 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.warning("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                post = {
                    'post_id': mt['tweet_id'],
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': mt['tweet']['created_at'],
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
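
The meta_tweet-to-post mapping above, shown on one made-up record with the nested 'tweet' structure that add_tweets_to_meta_tweets is expected to fill in:

mt = {
    'tweet_id': 12345,
    'url': 'https://twitter.com/jdoe/status/12345',
    'tweet': {
        'text': 'hello world',
        'created_at': 'Wed Jan 01 00:00:00 +0000 2020',
        'user': {'screen_name': 'jdoe'},
    },
}

post = {
    'post_id': mt['tweet_id'],
    'data': mt,
    'content': mt['tweet']['text'],
    'publish_date': mt['tweet']['created_at'],
    'author': mt['tweet']['user']['screen_name'],
    'channel': mt['tweet']['user']['screen_name'],
    'url': mt['url'],
}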