    def fetch_posts(self, query: str, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch tweets from archive.org that match the given query within the given date range."""
        # configure the HTTP client: 100 MB max download, 90 s timeout, and
        # exponentially increasing delays between retries
        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        # make end_date inclusive by extending the range one day
        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        enc_query = urlencode({
            'q': query,
            'date_from': start_arg,
            'date_to': end_arg
        })

        url = "https://searchtweets.archivelab.org/export?" + enc_query

        log.debug("archive.org url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsArchiveTwitterDataException(
                "error fetching posts: " + response.decoded_content())

        decoded_content = response.decoded_content()

        # sometimes we get null characters, which choke the csv module
        decoded_content = decoded_content.replace('\x00', '')

        meta_tweets = []
        # skip the header row, then parse the tab-separated values
        lines = decoded_content.splitlines()[1:]
        fields = ['user_name', 'user_screen_name', 'lang', 'text', 'timestamp_ms', 'url']
        for row in csv.reader(lines, delimiter="\t"):
            meta_tweet = {}
            for i, field in enumerate(fields):
                meta_tweet[field] = row[i] if i < len(row) else ''

            if not meta_tweet['url']:
                log.warning("meta_tweet '%s' does not have a url" % str(row))
                continue

            meta_tweet['tweet_id'] = get_tweet_id_from_url(meta_tweet['url'])

            meta_tweets.append(meta_tweet)

        add_tweets_to_meta_tweets(meta_tweets)

        return meta_tweets
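# All of these examples depend on a get_tweet_id_from_url() helper that the
# listing does not show. A minimal sketch (an assumption, not the project's
# actual implementation) could pull the numeric status ID out of a tweet URL:
import re

def get_tweet_id_from_url(url: str) -> int:
    """Parse the numeric tweet ID from a twitter.com status URL."""
    match = re.search(r'twitter\.com/[^/]+/status(?:es)?/(\d+)', url)
    if match is None:
        # McTwitterUrlException is assumed to be importable from the package
        raise McTwitterUrlException("Unable to parse tweet ID from URL: " + url)
    return int(match.group(1))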
Example #2
    def fetch_posts_from_api(self, query: str, start_date: datetime,
                             end_date: datetime) -> list:
        """Fetch day of tweets from crimson hexagon and twitter."""
        meta_tweets = []
        next_cursor = None
        while True:
            data = self._fetch_posts_from_api_single_page(
                query, start_date, end_date, next_cursor)

            # validate the response before touching data['results'], otherwise a
            # bad response would raise a KeyError instead of a useful exception
            if 'results' not in data:
                raise McPostsBWTwitterDataException("Unknown response status: " +
                                                    str(data))

            meta_tweets = meta_tweets + data['results']
            if 'nextCursor' in data:
                next_cursor = data['nextCursor']
            else:
                break

        for mt in meta_tweets:
            try:
                mt['tweet_id'] = get_tweet_id_from_url(mt['url'])
            except McTwitterUrlException:
                raise McPostsBWTwitterQueryException(
                    "Unable to parse tweet url %s. Make sure the Brandwatch query "
                    "only includes Twitter as a source." % mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.debug("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                publish_date = dateutil.parser.parse(
                    mt['tweet']['created_at']).isoformat()

                post = {
                    'post_id': str(mt['tweet_id']),
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': publish_date,
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
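    # _fetch_posts_from_api_single_page() is not shown in this listing. Assuming
    # the Brandwatch endpoint returns JSON with a 'results' list and an optional
    # 'nextCursor' token (which is how the loop above consumes it), a
    # hypothetical sketch of the helper might look like this; BW_API_URL, the
    # parameter names, and the client attribute are all assumptions:
    def _fetch_posts_from_api_single_page(self, query: str, start_date: datetime,
                                          end_date: datetime,
                                          next_cursor: Optional[str]) -> dict:
        """Fetch a single page of results from the Brandwatch API."""
        params = {
            'queryId': query,
            'startDate': start_date.strftime('%Y-%m-%d'),
            'endDate': end_date.strftime('%Y-%m-%d'),
        }
        if next_cursor is not None:
            params['cursor'] = next_cursor

        response = self._ua.get(BW_API_URL + '?' + urlencode(params))  # hypothetical client attribute

        if not response.is_success():
            raise McPostsBWTwitterDataException(
                "error fetching posts: " + response.decoded_content())

        return dict(decode_json(response.decoded_content()))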
Example #3
    def fetch_posts_from_api(
        self,
        query: str,
        start_date: datetime,
        end_date: datetime,
        sample: Optional[int] = None,
        page_size: Optional[int] = None,
    ) -> list:
        """Fetch day of tweets from crimson hexagon and twitter."""
        decoded_content = self._get_content_from_api(query, start_date,
                                                     end_date)

        assert sample is None, "Sampling is not implemented."
        assert page_size is None, "Page size limiting is not supported."

        data = dict(decode_json(decoded_content))

        if data.get('status') != 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.debug("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                publish_date = dateutil.parser.parse(
                    mt['tweet']['created_at']).isoformat()

                post = {
                    'post_id': str(mt['tweet_id']),
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': publish_date,
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
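# A hypothetical invocation of the fetcher above. The class name
# CrimsonHexagonTwitterFetcher and the monitor ID are assumptions made for
# illustration; the query is a Crimson Hexagon monitor ID passed as a string:
import datetime

fetcher = CrimsonHexagonTwitterFetcher()
posts = fetcher.fetch_posts_from_api(
    query='12345',  # hypothetical monitor ID
    start_date=datetime.datetime(2020, 1, 1),
    end_date=datetime.datetime(2020, 1, 2),
)
for post in posts:
    print(post['post_id'], post['publish_date'], post['author'])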
Example #4
    def fetch_posts_from_api(
        self,
        query: str,
        start_date: datetime,
        end_date: datetime,
        sample: Optional[int] = None,
        page_size: Optional[int] = None,
    ) -> list:
        """Fetch day of tweets."""

        if page_size is None:
            page_size = 5000

        meta_tweets = []
        next_cursor = None
        while True:
            data = self._fetch_posts_from_api_single_page(
                query=query,
                start_date=start_date,
                end_date=end_date,
                next_cursor=next_cursor,
                page_size=page_size,
            )

            # validate the response before touching data['results'], otherwise a
            # bad response would raise a KeyError instead of a useful exception
            if 'results' not in data:
                raise McPostsBWTwitterDataException("Unknown response status: " + str(data))

            meta_tweets = meta_tweets + data['results']
            log.debug(f"Sample: {sample}; meta_tweets: {len(meta_tweets)}")

            if 'nextCursor' not in data or (sample is not None and len(meta_tweets) >= sample):
                break
            else:
                next_cursor = data['nextCursor']

        for mt in meta_tweets:
            try:
                mt['tweet_id'] = get_tweet_id_from_url(mt['url'])
            except McTwitterUrlException:
                raise McPostsBWTwitterQueryException(
                    "Unable to parse tweet url %s. Make sure the Brandwatch query "
                    "only includes Twitter as a source." % mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.debug("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                publish_date = dateutil.parser.parse(mt['tweet']['created_at']).isoformat()

                post = {
                    'post_id': str(mt['tweet_id']),
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': publish_date,
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
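# add_tweets_to_meta_tweets() is shared by every fetcher in this listing but is
# not shown. Judging from how the callers use it, it attaches the full tweet
# object to each meta_tweet under the 'tweet' key. A rough sketch, assuming a
# hypothetical fetch_100_tweets() helper that wraps Twitter's statuses/lookup
# endpoint (which accepts at most 100 IDs per request):
def add_tweets_to_meta_tweets(meta_tweets: list) -> None:
    """Attach full tweet objects to meta_tweets, matched by tweet_id."""
    for i in range(0, len(meta_tweets), 100):
        chunk = meta_tweets[i:i + 100]
        tweets = fetch_100_tweets([mt['tweet_id'] for mt in chunk])  # hypothetical helper
        tweets_by_id = {t['id']: t for t in tweets}
        for mt in chunk:
            if mt['tweet_id'] in tweets_by_id:
                mt['tweet'] = tweets_by_id[mt['tweet_id']]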
Example #5
    def fetch_posts(self, query: str, start_date: datetime,
                    end_date: datetime) -> list:
        """Fetch day of tweets from crimson hexagon"""
        ch_monitor_id = int(query)

        log.debug("crimson_hexagon_twitter.fetch_posts")

        ua = UserAgent()
        ua.set_max_size(100 * 1024 * 1024)
        ua.set_timeout(90)
        ua.set_timing([1, 2, 4, 8, 16, 32, 64, 128, 256, 512])

        config = TopicsMineConfig()
        api_key = config.crimson_hexagon_api_key()

        end_date = end_date + datetime.timedelta(days=1)

        start_arg = start_date.strftime('%Y-%m-%d')
        end_arg = end_date.strftime('%Y-%m-%d')

        url = (
            "https://api.crimsonhexagon.com/api/monitor/posts?auth=%s&id=%d&start=%s&end=%s&extendLimit=true"
            % (api_key, ch_monitor_id, start_arg, end_arg))

        log.debug("crimson hexagon url: " + url)

        response = ua.get(url)

        if not response.is_success():
            raise McPostsCHTwitterDataException("error fetching posts: " +
                                                response.decoded_content())

        decoded_content = response.decoded_content()

        data = dict(decode_json(decoded_content))

        if data.get('status') != 'success':
            raise McPostsCHTwitterDataException("Unknown response status: " +
                                                str(data))

        meta_tweets = data['posts']

        for mt in meta_tweets:
            mt['tweet_id'] = get_tweet_id_from_url(mt['url'])

        add_tweets_to_meta_tweets(meta_tweets)

        posts = []
        for mt in meta_tweets:
            log.warning("mt: %d" % mt['tweet_id'])
            if 'tweet' in mt:
                post = {
                    'post_id': str(mt['tweet_id']),
                    'data': mt,
                    'content': mt['tweet']['text'],
                    'publish_date': mt['tweet']['created_at'],
                    'author': mt['tweet']['user']['screen_name'],
                    'channel': mt['tweet']['user']['screen_name'],
                    'url': mt['url']
                }

                posts.append(post)

        return posts
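# The UserAgent setup in Examples 1 and 5 retries failed requests with
# exponential backoff: set_timing([1, 2, 4, ..., 512]) is understood here as the
# delays, in seconds, between successive retries. A rough standard-library +
# requests equivalent (an assumption; the real UserAgent class does more) could be:
import time
import requests

def get_with_backoff(url: str, timeout: int = 90,
                     delays=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512)) -> requests.Response:
    """GET a URL, retrying on failure with exponentially increasing delays."""
    for delay in delays:
        try:
            response = requests.get(url, timeout=timeout)
            if response.ok:
                return response
        except requests.RequestException:
            pass
        time.sleep(delay)
    # final attempt after exhausting the retry schedule; raise on failure
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response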