Ejemplo n.º 1
0
def scrape_post_id(post_url):
    """
    Scrape a post's identifier from the mobile version of its Facebook page.

    The id is read from the JSON `data-ft` attribute of the first matching
    element under `#m_story_permalink_view`. When no such element exists, the
    function falls back to the first link whose href starts with `/photo.php`
    and reads the id from that link's query string.

    Args:
        post_url: Url of the post. It is passed through
            `convert_facebook_url_to_mobile` before being requested.

    Returns:
        A '<owner>_<post>' string, or None when the required pieces cannot
        be found in the page.

    Raises:
        The error reported by `request_text`, if any.
    """
    post_mobile_url = convert_facebook_url_to_mobile(post_url)

    err, _, html = request_text(post_mobile_url)

    if err:
        raise err

    soup = BeautifulSoup(html, 'lxml')

    root_element = soup.select_one('#m_story_permalink_view [data-ft]')

    if root_element is None:

        # Is this a photo post?
        next_link = soup.select_one('[href^="/photo.php"]')

        if next_link is None:
            return

        href = next_link.get('href')

        if not href:
            return

        link = urljoin(FACEBOOK_URL, href)
        query = dict(parse_qsl(urlsplit(link).query))

        # Both parameters are required. A photo link lacking one of them
        # (or carrying it with a blank value, which `parse_qsl` drops) must
        # yield None like every other "not found" path, not a KeyError.
        owner_id = query.get('id')
        photo_id = query.get('fbid')

        if owner_id is None or photo_id is None:
            return

        return '%s_%s' % (owner_id, photo_id)

    data = root_element.get('data-ft')

    if data is None:
        return

    try:
        data = json.loads(data)
    except json.JSONDecodeError:
        return

    # Valid JSON that is not an object (e.g. a list or a string) cannot
    # hold the keys read below.
    if not isinstance(data, dict):
        return

    content_owner_id_new = data.get('content_owner_id_new') or data.get(
        'page_id')
    mf_story_key = data.get('mf_story_key')

    if content_owner_id_new is None or mf_story_key is None:
        return

    return '%s_%s' % (content_owner_id_new, mf_story_key)
Ejemplo n.º 2
0
    def __call__(self, url, detailed=False, per_call=False, format='raw'):
        """
        Generator yielding the members scraped from the given url.

        The url is forced to https and passed through
        `convert_facebook_url_to_mobile`. The resulting page is handed to
        `scrape_members_link` to find the first members page, then every
        members page is requested in turn until `scrape_members` reports
        no next link.

        `detailed` and `per_call` are accepted but not read by this body.

        Raises:
            TypeError: if `format` is not in FACEBOOK_OUTPUT_FORMATS. As
                this is a generator, it is raised on first iteration.
        """
        if format not in FACEBOOK_OUTPUT_FORMATS:
            raise TypeError('minet.facebook.scrape_comments: unkown `format`.')

        # Reformatting url to hit mobile website
        mobile_url = convert_facebook_url_to_mobile(
            force_protocol(url, 'https'))

        landing_html = self.request_page(mobile_url)
        page_link = scrape_members_link(landing_html)

        while True:
            if page_link is None:
                return

            page_html = self.request_page(page_link)
            page_link, batch = scrape_members(page_html)

            yield from batch
Ejemplo n.º 3
0
def facebook_comments_action(namespace):
    """
    Scrape the comments of the url held by `namespace.url` and write them
    as CSV rows to the output designated by `namespace.output`.

    Pages are processed in FIFO order from a queue seeded with the mobile
    version of the url. Each scraped page may enqueue reply pages as well as
    its own next page. A progress bar reports the number of comments written,
    urls requested, replies seen and the current queue size.

    Args:
        namespace: Object exposing at least `url` and `output`. It is also
            handed to `grab_facebook_cookie`.

    Raises:
        The error reported by `request`, if any page cannot be fetched.
    """

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    # NOTE(review): the file is not closed here since `open_output_file`
    # may return a shared stream such as stdout -- confirm before adding
    # a close.
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(desc='Scraping comments',
                       dynamic_ncols=True,
                       unit=' comments')

    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    # The bar must be closed even when a request or the scraping fails,
    # otherwise it is left dangling on the terminal.
    try:
        while url_queue:
            current_url, in_reply_to = url_queue.popleft()

            html = request_page(current_url)
            data = scrape_comments(html, in_reply_to)

            url_count += 1

            for reply_url, commented_id in data['replies']:
                url_queue.append((reply_url, commented_id))

            if data['next'] is not None:
                url_queue.append((data['next'], in_reply_to))

            for comment in data['comments']:
                loading_bar.update()
                writer.writerow(format_csv_row(comment))

                if in_reply_to is not None:
                    replies_count += 1

            loading_bar.set_postfix(urls=url_count,
                                    replies=replies_count,
                                    q=len(url_queue))

            # Don't be too greedy (no need to wait once nothing is left)
            if url_queue:
                time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)
    finally:
        loading_bar.close()
Ejemplo n.º 4
0
def convert_url_to_mobile(url):
    """
    Force the given url to https, then return the result of passing it
    through `convert_facebook_url_to_mobile`.
    """
    return convert_facebook_url_to_mobile(force_protocol(url, 'https'))
Ejemplo n.º 5
0
    def test_convert_facebook_url_to_mobile(self):
        """
        Check `convert_facebook_url_to_mobile` against every
        (input, expected) pair of MOBILE_TESTS, and check that it raises
        on a twitter.com url.
        """
        for source_url, mobile_url in MOBILE_TESTS:
            converted = convert_facebook_url_to_mobile(source_url)
            assert converted == mobile_url

        # A url from another website must make the conversion raise
        with pytest.raises(Exception):
            convert_facebook_url_to_mobile('http://twitter.com')
Ejemplo n.º 6
0
    def __call__(self, url, detailed=False, per_call=False, format='raw'):
        """
        Generator scraping the comments reachable from the given url,
        following both pagination and reply links.

        Pages are processed in FIFO order from a queue seeded with the
        mobile version of the url.

        Args:
            url: Url to scrape. It is forced to https and passed through
                `convert_facebook_url_to_mobile`.
            detailed: In per-call mode, yield `(details, comments)` tuples
                where `details` holds the running `calls`, `replies` and
                `queue_size` counters. Ignored when `per_call` is False.
            per_call: If True, yield one list of comments per requested
                page instead of yielding comments one by one.
            format: One of FACEBOOK_OUTPUT_FORMATS. With 'csv_row', each
                comment goes through `format_comment` before being yielded.

        Yields:
            Single comments, or, when `per_call` is True, one list of
            comments (or a `(details, comments)` tuple) per requested page.

        Raises:
            TypeError: if `format` is not in FACEBOOK_OUTPUT_FORMATS. As
                this is a generator, it is raised on first iteration.

        Note:
            If `scrape_comments` raises a TypeError, a message is printed
            to stderr and the process exits with status 1.
        """

        if format not in FACEBOOK_OUTPUT_FORMATS:
            raise TypeError('minet.facebook.scrape_comments: unkown `format`.')

        # Reformatting url to hit mobile website
        url = force_protocol(url, 'https')
        url = convert_facebook_url_to_mobile(url)

        url_queue = deque([(url, None, None)])

        calls = 0
        replies = 0

        while url_queue:
            current_url, direction, in_reply_to = url_queue.popleft()

            html = self.request_page(current_url)

            try:
                data = scrape_comments(html, direction, in_reply_to)
            except TypeError:
                print('Could not process comment in %s' % current_url,
                      file=sys.stderr)
                sys.exit(1)

            calls += 1

            for reply_url, commented_id in data['replies']:
                url_queue.append((reply_url, None, commented_id))

            if data['next'] is not None:
                url_queue.append(
                    (data['next'], data['direction'], in_reply_to))

            comments = []

            for comment in data['comments']:
                if in_reply_to is not None:
                    replies += 1

                if format == 'csv_row':
                    comment = format_comment(comment)

                if per_call:
                    comments.append(comment)
                else:
                    yield comment

            # Batches only exist in per-call mode. Without this guard, the
            # stream of single comments would be interleaved with a stray
            # empty batch after every page.
            if per_call:
                if detailed:
                    details = {
                        'calls': calls,
                        'replies': replies,
                        'queue_size': len(url_queue)
                    }

                    yield details, comments
                else:
                    yield comments