Code Example #1
File: api_scraper.py Project: medialab/minet
def payload_tweets_iter(payload):
    tweet_index = payload['globalObjects']['tweets']
    user_index = payload['globalObjects']['users']

    for instruction in payload['timeline']['instructions']:
        if 'addEntries' in instruction:
            entries = instruction['addEntries']['entries']
        elif 'replaceEntry' in instruction:
            entries = [instruction['replaceEntry']['entry']]
        else:
            continue

        for entry in entries:
            entry_id = entry['entryId']

            # Filtering tweets
            if (not entry_id.startswith('sq-I-t-')
                    and not entry_id.startswith('tweet-')):
                continue

            tweet_meta = getpath(entry,
                                 ['content', 'item', 'content', 'tweet'])

            if tweet_meta is None:
                tweet_meta = getpath(
                    entry,
                    ['content', 'item', 'content', 'tombstone', 'tweet'])

            # Parsing error?
            if tweet_meta is None:
                raise TwitterPublicAPIParsingError

            # Skipping ads
            if 'promotedMetadata' in tweet_meta:
                continue

            tweet = process_single_tweet(tweet_meta['id'], tweet_index,
                                         user_index)

            # Additional metadata
            meta = None

            if tweet is not None:

                if 'forwardPivot' in tweet_meta:
                    pivot = tweet_meta['forwardPivot']

                    meta = {
                        'intervention_text': getpath(pivot, ['text', 'text']),
                        'intervention_type': pivot.get('displayType'),
                        'intervention_url': getpath(pivot,
                                                    ['landingUrl', 'url'])
                    }

                yield tweet, meta
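
Note: the two getpath calls above implement a fallback lookup: the regular
tweet path is tried first, then the tombstone path used for withheld or
deleted tweets. Since getpath accepts a default as its third argument (see
the tests in Code Example #13), the fallback could be condensed into a
single expression; a minimal sketch, not minet's actual code:

    # Hypothetical condensation; unlike the original two-step version,
    # the tombstone lookup is evaluated eagerly.
    tweet_meta = getpath(
        entry,
        ['content', 'item', 'content', 'tweet'],
        getpath(entry, ['content', 'item', 'content', 'tombstone', 'tweet'])
    )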
Code Example #2
        def generator():
            starting_url = forge_comments_url(self.key, video_id)

            queue = deque([(False, video_id, starting_url)])

            while len(queue) != 0:
                is_reply, item_id, url = queue.popleft()

                result = self.request_json(url)

                for item in result['items']:
                    comment_id = item['id']
                    replies = getpath(item, ['replies', 'comments'], [])
                    total_reply_count = getpath(item,
                                                ['snippet', 'totalReplyCount'],
                                                0)

                    if not raw:
                        if is_reply:
                            item = format_reply(item, video_id=video_id)
                        else:
                            item = format_comment(item)

                    yield item

                    if is_reply:
                        continue

                    # Getting replies
                    if not full_replies or len(replies) >= total_reply_count:
                        for reply in replies:
                            if not raw:
                                reply = format_reply(reply)

                            yield reply
                    elif total_reply_count > 0:
                        replies_url = forge_replies_url(self.key, comment_id)

                        queue.append((True, comment_id, replies_url))

                if len(result['items']) == 0:
                    break

                # Next page
                token = result.get('nextPageToken')

                if token is not None:
                    forge = forge_replies_url if is_reply else forge_comments_url

                    next_url = forge(self.key, item_id, token=token)

                    queue.append((is_reply, item_id, next_url))
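
Note: this generator closes over raw and full_replies from its enclosing
method (not shown in this excerpt). Threads are walked breadth-first with a
deque: top-level comment pages enter the queue as (False, video_id, url)
triples, and whenever a thread embeds fewer replies than its
totalReplyCount, a dedicated replies page is queued as
(True, comment_id, url) and paginated the same way.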
Code Example #3
File: summary.py Project: zanachka/minet
def crowdtangle_summary(pool,
                        link,
                        token=None,
                        start_date=None,
                        with_top_posts=False,
                        sort_by=CROWDTANGLE_SUMMARY_DEFAULT_SORT_TYPE,
                        raw=False,
                        platforms=None):

    if token is None:
        raise CrowdTangleMissingTokenError

    if not isinstance(start_date, str):
        raise TypeError(
            'minet.crowdtangle.summary: expecting a `start_date` kwarg.')

    if sort_by not in CROWDTANGLE_SUMMARY_SORT_TYPES:
        raise TypeError('minet.crowdtangle.summary: unknown `sort_by`.')

    # Fetching
    api_url = url_forge(link, token, start_date, sort_by, platforms,
                        with_top_posts)

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    stats = getpath(data, ['result', 'summary', 'facebook'])
    posts = getpath(data, ['result', 'posts']) if with_top_posts else None

    if stats is not None and not raw:
        stats = format_summary(stats)

    if not with_top_posts:
        return stats

    if not raw:
        posts = [format_post(post, link=link) for post in posts]

    return stats, posts
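
Note that the return shape depends on with_top_posts: either stats alone,
or a (stats, posts) tuple. A hypothetical call, assuming pool is a
urllib3-style connection pool and token is a valid CrowdTangle API token:

    stats = crowdtangle_summary(pool, link, token=token,
                                start_date='2021-01-01')
    stats, posts = crowdtangle_summary(pool, link, token=token,
                                       start_date='2021-01-01',
                                       with_top_posts=True)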
Code Example #4
def crowdtangle_post(pool, post_id, token=None, raw=False):

    if token is None:
        raise CrowdTangleMissingTokenError

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    post = getpath(data, ['result', 'posts', 0])

    if post is None:
        return

    if not raw:
        return format_post(post)

    return post
Code Example #5
def crowdtangle_lists(pool, token=None, raw=False):

    if token is None:
        raise CrowdTangleMissingTokenError

    # Fetching
    api_url = URL_TEMPLATE % token

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    lists = getpath(data, ['result', 'lists'])

    if not raw:
        return [format_list(l) for l in lists]

    return lists
Code Example #6
def format_comment(item):
    meta = item['snippet']
    snippet = getpath(item, ['snippet', 'topLevelComment', 'snippet'])

    row = YouTubeComment(
        meta['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        getpath(snippet, ['authorChannelId', 'value']),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        int(meta['totalReplyCount']),
        None
    )

    return row
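
YouTubeComment itself is not shown in this listing; judging from the
positional construction here and in format_reply (Code Example #10), it is
likely a namedtuple-like record. A hypothetical declaration, with field
names guessed from the arguments:

    from collections import namedtuple

    # Hypothetical; the real field names live in minet's source.
    YouTubeComment = namedtuple('YouTubeComment', [
        'video_id', 'comment_id', 'author_name', 'author_channel_id',
        'text', 'like_count', 'published_at', 'updated_at',
        'reply_count', 'parent_comment_id'
    ])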
Code Example #7
File: argparse.py Project: zanachka/minet
    def resolve(self, config):

        # Attempting to resolve env variable
        env_var = rc_key_to_env_var(self.key)
        env_value = os.environ.get(env_var, '').strip()

        if env_value:
            return self.type(env_value)

        return getpath(config, self.key, self.default)
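
Note: getpath treats self.key as a path into the nested config mapping,
with self.default as the fallback, so the precedence is environment
variable, then config file, then default. A toy illustration of the final
lookup (hypothetical config and key):

    config = {'youtube': {'key': 'abc123'}}

    getpath(config, ['youtube', 'key'], None)    # -> 'abc123'
    getpath(config, ['twitter', 'token'], None)  # -> None, the default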
Code Example #8
    def search_hashtag(self, name):
        name = name.lstrip('#')
        cursor = None

        while True:
            url = forge_hashtag_search_url(name, cursor=cursor)
            print(url, cursor)

            data = self.request_json(url)

            data = getpath(data, ['data', 'hashtag', 'edge_hashtag_to_media'])
            edges = data.get('edges')

            for edge in edges:
                yield edge['node']['shortcode']

            print('Found %i posts' % len(edges))

            has_next_page = getpath(data, ['page_info', 'has_next_page'])

            if not has_next_page:
                break

            cursor = getpath(data, ['page_info', 'end_cursor'])
Code Example #9
def crowdtangle_post(request, post_id, token=None, raw=False):

    if token is None:
        raise CrowdTangleMissingTokenError

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)
    data = request(api_url)
    post = getpath(data, ['posts', 0])

    if post is None:
        return

    if not raw:
        return format_post(post)

    return post
Code Example #10
def format_reply(item, video_id=None):
    snippet = item['snippet']

    row = YouTubeComment(
        video_id if video_id is not None else snippet['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        getpath(snippet, ['authorChannelId', 'value']),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        None,
        snippet['parentId']
    )

    return row
Code Example #11
def collect_top_reactions(data):
    edges = getpath(data, ['top_reactions', 'edges'])

    if edges is None:
        return

    index = {}

    for edge in edges:
        emotion = FACEBOOK_REACTION_KEYS.get(edge['node']['key'])

        if emotion is None:
            print_err('Found unknown emotion %s' % edge)
            continue

        index[emotion] = edge['reaction_count'] or 0

    return index
Code Example #12
    def request_json(self, url):
        err, response, data = request_json(url, pool=self.pool)

        if err:
            raise err

        if response.status == 403:
            sleep_time = seconds_to_midnight_pacific_time() + 10

            if callable(self.before_sleep):
                self.before_sleep(sleep_time)

            time.sleep(sleep_time)

            return self.request_json(url)

        if response.status >= 400:
            if data is not None and 'API key not valid' in getpath(
                    data, ['error', 'message'], ''):
                raise YouTubeInvalidAPIKeyError

            raise YouTubeInvalidAPICall(url, response.status, data)

        return data
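
A 403 is treated here as quota exhaustion: the method sleeps until shortly
after midnight Pacific time, when the YouTube API's daily quota resets,
then retries by calling itself. The recursion only deepens by one level per
reset, but the same logic can be written iteratively; a sketch, not minet's
code:

    def request_json(self, url):
        while True:
            err, response, data = request_json(url, pool=self.pool)

            if err:
                raise err

            # Quota exhausted: sleep until the reset, then retry the URL
            if response.status == 403:
                sleep_time = seconds_to_midnight_pacific_time() + 10

                if callable(self.before_sleep):
                    self.before_sleep(sleep_time)

                time.sleep(sleep_time)
                continue

            if response.status >= 400:
                if data is not None and 'API key not valid' in getpath(
                        data, ['error', 'message'], ''):
                    raise YouTubeInvalidAPIKeyError

                raise YouTubeInvalidAPICall(url, response.status, data)

            return data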
Code Example #13
    def test_getpath(self):
        with pytest.raises(TypeError):
            getpath(NESTED_OBJECT, 'test')

        assert getpath(NESTED_OBJECT, ['a', 'd', 'e']) == 5
        assert getpath(NESTED_OBJECT, ['a', 'd', 'e'], items=None) is None
        assert getpath(NESTED_OBJECT, ['a', 'c']) is None
        assert getpath(NESTED_OBJECT, ['a', 'c'], 67) == 67
        assert getpath(NESTED_OBJECT, ['a', 'b', 1]) == 45
        assert getpath(NESTED_OBJECT, ['a', 'b', -1, 'f', -1]) == 3
        assert getpath(NESTED_OBJECT, ['a', 'b', 0, 'c']) == 4
        assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 'numbers', 1]) is None
        assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 'numbers', 1],
                       attributes=True) == 5
        assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 3],
                       attributes=True) is None
        assert getpath(NESTED_OBJECT, ['a', 'd', 'g', 'recursion', 'numbers'],
                       attributes=True) == [4, 5, 6]
        assert getpath(NESTED_OBJECT, 'a.d.e', split_char='.') == 5
        assert getpath(NESTED_OBJECT, 'a§d§e', split_char='§') == 5
        assert getpath(NESTED_OBJECT,
                       'a.b.1',
                       split_char='.',
                       parse_indices=True) == 45
        assert getpath(NESTED_OBJECT,
                       'a.b.-1.f.-1',
                       split_char='.',
                       parse_indices=True) == 3

        assert getpath([[1, 2]], [3, 4, 17]) is None
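
These tests double as documentation for getpath's behavior: list paths walk
dicts and sequences (including negative indices), string paths require a
split_char, items/attributes toggle item vs. attribute access, and any
failure along the path yields the default. A minimal sketch consistent with
the assertions above (not minet's actual implementation):

    def getpath(target, path, default=None, items=True, attributes=False,
                split_char=None, parse_indices=False):
        # String paths are only accepted when a split_char is provided
        if isinstance(path, str):
            if split_char is None:
                raise TypeError
            path = path.split(split_char)

        for step in path:
            # Optionally coerce numeric string steps into list indices
            if parse_indices:
                try:
                    step = int(step)
                except (TypeError, ValueError):
                    pass

            # Item access: dict keys and (possibly negative) list indices
            if items:
                try:
                    target = target[step]
                    continue
                except (TypeError, KeyError, IndexError):
                    pass

            # Attribute access, when enabled
            if attributes and isinstance(step, str):
                try:
                    target = getattr(target, step)
                    continue
                except AttributeError:
                    pass

            return default

        return target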
Code Example #14
    def fetch_facebook_page_stats(url):
        err, response = request(url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html
                or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = getpath(data, [
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result',
            'data', 'feedback'
        ])

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' %
                      (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one(
                'h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(
                timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get(
                    'aria-label')
            except Exception:
                pass

            try:
                data['scraped']['text'] = soup.select_one(
                    '[data-testid="post_message"]').get_text()
            except Exception:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data
Code Example #15
File: api_scraper.py Project: medialab/minet
    def request_search(self, query, cursor=None, refs=None, dump=False):
        params = forge_search_params(query, cursor=cursor)
        url = '%s?%s' % (TWITTER_PUBLIC_SEARCH_ENDPOINT, params)

        headers = {
            'Authorization': TWITTER_PUBLIC_API_AUTH_HEADER,
            'X-Guest-Token': self.guest_token,
            'Cookie': self.cookie,
            'Accept-Language': 'en'
        }

        err, response, data = self.request_json(url, headers=headers)

        if err:
            raise err

        if response.status == 429:
            self.reset()
            raise TwitterPublicAPIRateLimitError

        if response.status >= 400:
            error = getpath(data, ['errors', 0])

            if (error is not None and response.status == 400
                    and error.get('code') == 47):
                raise TwitterPublicAPIBadRequest

            if error is not None and error.get('code') == 130:
                raise TwitterPublicAPIOverCapacityError

            raise TwitterPublicAPIInvalidResponseError

        cursor = extract_cursor_from_payload(data)
        tweets = []

        if dump:
            return data

        for tweet, meta in payload_tweets_iter(data):
            result = normalize_tweet(
                tweet,
                extract_referenced_tweets=refs is not None,
                collection_source='scraping'
            )

            if refs is not None:
                for is_first, extracted_tweet in with_is_first(result):

                    # Casting to int64 to save up memory
                    id_int64 = int(extracted_tweet['id'])

                    if id_int64 in refs:
                        continue

                    if is_first:
                        tweets.append((extracted_tweet, meta))
                    else:
                        tweets.append((extracted_tweet, None))

                    refs.add(id_int64)
            else:
                tweets.append((result, meta))

        return cursor, tweets
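
request_search returns the next cursor together with the page of
(tweet, meta) pairs; passing a refs set makes the scraper also emit
referenced tweets (quotes, retweets), deduplicated across pages by their
int64 id. A hypothetical pagination loop driving it (scraper stands in for
an instance of this class, process for a consumer):

    cursor = None
    refs = set()

    while True:
        cursor, tweets = scraper.request_search('some query', cursor=cursor,
                                                refs=refs)

        for tweet, meta in tweets:
            process(tweet, meta)  # hypothetical consumer

        if cursor is None or not tweets:
            break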