コード例 #1
0
ファイル: mobile_scraper.py プロジェクト: medialab/minet
    def post_author(self, url):
        """Return the FacebookUser who authored the post at `url`.

        Returns None when no author element can be found in the page.
        Raises FacebookInvalidTargetError when the url does not point to
        a commentable facebook post, and TypeError on an unexpected
        parse result.
        """
        if not has_facebook_comments(url):
            raise FacebookInvalidTargetError

        # The mobile website is far easier to scrape
        mobile_url = convert_url_to_mobile(url)

        soup = BeautifulSoup(self.request_page(mobile_url), 'lxml')
        author_link = soup.select_one('[data-ft] h3 a[href]')

        if author_link is None:
            return None

        label = author_link.get_text().strip()
        parsed = parse_facebook_url(author_link.get('href'), allow_relative_urls=True)

        # A handle-only parse carries no numeric id
        if isinstance(parsed, ParsedFacebookHandle):
            user_id = None
        elif isinstance(parsed, ParsedFacebookUser):
            user_id = parsed.id
        else:
            raise TypeError

        return FacebookUser(
            label,
            user_id,
            parsed.handle,
            parsed.url
        )
コード例 #2
0
def extract_facebook_addendum(url):
    """Return a [type, id, full_id, handle, url] report row for `url`.

    Returns None when the url cannot be parsed as a facebook url and
    raises TypeError for an unknown parse result type.
    """
    parsed = parse_facebook_url(url)

    if parsed is None:
        return None

    # Ordered (type, row builder) dispatch — order mirrors the original
    # isinstance chain
    formatters = [
        (FacebookPost, lambda p: ['post', p.id, p.full_id or '', '', p.url]),
        (FacebookHandle, lambda p: ['handle', '', '', p.handle, p.url]),
        (FacebookUser, lambda p: ['user', p.id or '', '', p.handle or '', p.url]),
        (FacebookGroup, lambda p: ['group', p.id or '', '', p.handle or '', p.url]),
        (FacebookPhoto, lambda p: ['photo', p.id, '', '', p.url]),
        (FacebookVideo, lambda p: ['video', p.id, '', '', p.url])
    ]

    for parse_type, formatter in formatters:
        if isinstance(parsed, parse_type):
            return formatter(parsed)

    raise TypeError('unknown facebook parse result type!')
コード例 #3
0
ファイル: mobile_scraper.py プロジェクト: medialab/minet
    def posts(self, url):
        """Lazily yield the posts of the facebook group at `url`.

        Raises FacebookInvalidTargetError when the url does not point to
        a facebook group.
        """
        parsed = parse_facebook_url(url)

        if not isinstance(parsed, ParsedFacebookGroup):
            raise FacebookInvalidTargetError

        start_url = convert_url_to_mobile(parsed.url)

        def iterate():
            next_url = start_url

            # Follow pagination until it ends or a page yields nothing
            while next_url is not None:
                html = self.request_page(next_url)
                pagination_url, batch = scrape_posts(html)

                yield from batch

                if not batch:
                    break

                next_url = pagination_url

        return iterate()
コード例 #4
0
ファイル: post_id_from_url.py プロジェクト: paulgirard/minet
def post_id_from_url(post_url):
    """Return the full post id for `post_url`, or None for non-posts.

    Falls back to scraping the post page when the url alone does not
    carry the full id.
    """
    parsed = parse_facebook_url(post_url)

    if not isinstance(parsed, FacebookPost):
        return None

    return parsed.full_id if parsed.full_id is not None else scrape_post_id(post_url)
コード例 #5
0
    def test_parse_facebook_url(self):
        """Check parse_facebook_url against the fixtures and full_id cases."""
        for url, expected in PARSE_TESTS:
            assert parse_facebook_url(url, allow_relative_urls=True) == expected

        # (url, expected full_id) — None means the full id cannot be
        # derived from the url alone
        full_id_cases = [
            ('https://www.facebook.com/groups/277506326438568/permalink/319815378874329',
             '277506326438568_319815378874329'),
            ('https://www.facebook.com/permalink.php?story_fbid=1354978971282622&id=598338556946671',
             '598338556946671_1354978971282622'),
            ('https://www.facebook.com/meilleurdesmondesoff/posts/1810737099256795',
             None),
            ('https://www.facebook.com/108082977404530/posts/195887261957434',
             '108082977404530_195887261957434'),
            ('https://www.facebook.com/groups/US4MF/permalink/787216138752904/',
             None)
        ]

        for url, expected_full_id in full_id_cases:
            assert parse_facebook_url(url).full_id == expected_full_id
コード例 #6
0
def facebook_url_parse_action(namespace):
    """CLI action enriching a CSV file with parsed facebook url data.

    Reads urls from `namespace.column` and writes one output row per
    input row, appending the REPORT_HEADERS columns
    (type, id, handle, url) when the url can be parsed.
    """
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' lines',
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):

        loading_bar.update()

        parsed = parse_facebook_url(url.strip())

        if parsed is None:
            enricher.writerow(row)
            continue

        if isinstance(parsed, FacebookPost):
            enricher.writerow(
                row,
                ['post', parsed.id, '', parsed.url]
            )

        elif isinstance(parsed, FacebookHandle):
            enricher.writerow(
                row,
                ['handle', '', parsed.handle, parsed.url]
            )

        elif isinstance(parsed, FacebookUser):
            enricher.writerow(
                row,
                ['user', parsed.id or '', parsed.handle or '', parsed.url]
            )

        else:
            # BUG FIX: urls parsing to any other result type (group,
            # photo, video...) used to be silently dropped from the
            # output, desynchronizing it from the input file. Keep the
            # row with empty addendum columns instead.
            enricher.writerow(row)
コード例 #7
0
ファイル: members.py プロジェクト: paulgirard/minet
def scrape_members(html):
    """Scrape a mobile facebook group members page.

    Returns a (next_link, members) tuple where `next_link` is the url
    of the next pagination page (or None) and `members` is a list of
    member dicts.
    """
    soup = BeautifulSoup(html, 'lxml')

    members = []

    for root in soup.select('table[id^="member_"]'):
        title = root.select_one('h3 > a')
        user = parse_facebook_url(resolve_relative_url(title.get('href')))

        headings = root.select('h3')

        is_admin = False
        formatted_joined = None
        joined_date = None

        # A second heading carries the admin flag and the join date
        if len(headings) > 1:
            extra_heading = headings[1]

            is_admin = 'Admin' in extra_heading.get_text()

            formatted_joined = extra_heading.select_one('abbr').get_text().strip()
            joined_date = parse_formatted_date(formatted_joined)

        members.append({
            'user_id': getattr(user, 'id', ''),
            'user_handle': getattr(user, 'handle', ''),
            'user_url': getattr(user, 'url', ''),
            'user_label': title.get_text().strip(),
            'admin': is_admin,
            'formatted_joined': formatted_joined,
            'joined': joined_date.isoformat() if joined_date else ''
        })

    pagination_link = soup.select_one('a[href^="/browse/group/members/?"]')
    next_link = resolve_relative_url(pagination_link.get('href')) if pagination_link else None

    return next_link, members
コード例 #8
0
def post_id_from_url(post_url):
    """Return the full `<owner>_<post>` id for `post_url`, or None.

    Tries, in order: the full id embedded in the url itself, resolving
    the owning page or group from its handle, and finally scraping the
    post page.
    """
    parsed = parse_facebook_url(post_url)

    if not isinstance(parsed, FacebookPost):
        return None

    # Best case: the url already carries the full id
    if parsed.full_id is not None:
        return parsed.full_id

    # Otherwise, attempt to resolve the owner's numeric id from the
    # relevant handle (page handle first, then group handle)
    if parsed.parent_handle is not None:
        owner_id = page_id_from_handle(parsed.parent_handle)
    elif parsed.group_handle is not None:
        owner_id = group_id_from_handle(parsed.group_handle)
    else:
        owner_id = None

    if owner_id is not None:
        return '%s_%s' % (owner_id, parsed.id)

    # Last resort: scrape the post page itself
    return scrape_post_id(post_url)
コード例 #9
0
ファイル: comments.py プロジェクト: AleksiKnuutila/minet-fork
def scrape_comments(html, in_reply_to=None):
    """Scrape a mobile facebook comments page into structured data.

    Parameters
    ----------
    html : str
        Raw html of a mobile facebook comments (or replies) page.
    in_reply_to : str, optional
        Id of the comment whose replies this page lists; that comment
        is skipped if encountered again in the markup.

    Returns
    -------
    dict
        Keys: 'post_id' (id of the commented post, or None), 'comments'
        (list of comment dicts), 'next' (absolute url of the next
        pagination page, or None), 'replies' (list of
        (replies_url, parent_comment_id) tuples left to fetch) and
        'in_reply_to' (echo of the argument).
    """
    soup = BeautifulSoup(html, 'lxml')

    data = {
        'post_id': None,
        'comments': [],
        'next': None,
        'replies': [],
        'in_reply_to': in_reply_to
    }

    # Only keep elements whose id matches the expected comment pattern
    valid_items = (item for item in soup.select('[id]')
                   if VALID_ID_RE.match(item.get('id')))

    for item in valid_items:
        item_id = item.get('id')

        if item_id is None:
            continue

        # Pagination marker: record the next page url and stop scanning
        if item_id.startswith('see_next'):
            next_link = item.select_one('a')
            data['next'] = urljoin(BASE_URL, next_link.get('href'))
            break

        # Skipping comment if same as commented
        if item_id == in_reply_to:
            continue

        user_link = item.select_one('h3 > a')

        # TODO: this should be fixed. Truncated comments are not correctly handled
        if not user_link:
            continue

        user_label = user_link.get_text().strip()
        user_href = user_link.get('href')
        user = parse_facebook_url(urljoin(BASE_URL, user_href))

        # TODO: link to comment
        content_element = item.select_one('h3 + div')
        comment_text = content_element.get_text().strip()
        comment_html = str(content_element)
        formatted_date = item.select_one('abbr').get_text().strip()
        parsed_date = parse_formatted_date(formatted_date)

        # The "like_<id>" element's id carries the commented post's id
        post_id = item.select_one('[id^="like_"]').get('id').split('_')[1]

        # TODO: this is baaaad
        data['post_id'] = post_id

        reactions_item = item.select_one('[aria-label*=" reaction"]')
        reactions = '0'

        if reactions_item is not None:
            reactions = reactions_item.get_text().strip()

        replies_items = item.select('a[href^="/comment/replies"]')
        replies = '0'

        if len(replies_items) > 0:
            replies_item = replies_items[-1]

            if replies_item is not None:
                replies_text = replies_item.get_text()

                # A bare "Reply" label means the comment has no replies;
                # otherwise the count sits between "·" and " repl"
                if replies_text != 'Reply':
                    replies = replies_text.split('·')[-1].split(
                        ' repl')[0].strip()

                    replies_url = replies_item.get('href')
                    data['replies'].append((urljoin(BASE_URL,
                                                    replies_url), item_id))

        data['comments'].append({
            'post_id':
            post_id,
            'comment_id':
            item_id,
            'user_id':
            getattr(user, 'id', ''),
            'user_handle':
            getattr(user, 'handle', ''),
            'user_url':
            getattr(user, 'url', ''),
            'user_label':
            user_label,
            'comment_text':
            comment_text,
            'comment_html':
            comment_html,
            'formatted_date':
            formatted_date,
            'date':
            parsed_date.isoformat() if parsed_date else '',
            'reactions':
            reactions,
            'replies':
            replies,
            'in_reply_to':
            in_reply_to
        })

    return data
コード例 #10
0
ファイル: mobile_scraper.py プロジェクト: medialab/minet
def extract_user_information_from_link(element):
    """Return a (label, parsed user) pair from a user anchor element."""
    label = element.get_text().strip()
    parsed_user = parse_facebook_url(resolve_relative_url(element.get('href')))

    return label, parsed_user
コード例 #11
0
def post_id_from_url(post_url):
    """Return the full `<owner>_<story>` id for `post_url`, or None.

    When the url alone does not carry the full id, the mobile version
    of the post page is fetched and the ids are extracted from its
    markup (either the photo link's query string or the `data-ft`
    payload).

    Raises the error reported by `request_text` on network failure.
    """
    parsed = parse_facebook_url(post_url)

    if not isinstance(parsed, FacebookPost):
        return None

    if parsed.full_id is not None:
        return parsed.full_id

    post_mobile_url = convert_facebook_url_to_mobile(post_url)

    err, response, html = request_text(FACEBOOK_DEFAULT_POOL, post_mobile_url)

    if err:
        raise err

    soup = BeautifulSoup(html, 'lxml')

    root_element = soup.select_one('#m_story_permalink_view [data-ft]')

    if root_element is None:

        # Is this a photo post?
        next_link = soup.select_one('[href^="/photo.php"]')

        if next_link is None:
            return None

        href = next_link.get('href')

        if not href:
            return None

        link = urljoin(FACEBOOK_URL, href)
        query = urlsplit(link).query

        if not query:
            return None

        query = dict(parse_qsl(query))

        # BUG FIX: indexing `query['id']` / `query['fbid']` raised
        # KeyError when the photo link's query string lacked either key,
        # whereas every other failure path in this function returns None
        owner_id = query.get('id')
        photo_id = query.get('fbid')

        if owner_id is None or photo_id is None:
            return None

        return '%s_%s' % (owner_id, photo_id)

    data = root_element.get('data-ft')

    if data is None:
        return None

    try:
        data = json.loads(data)
    except json.JSONDecodeError:
        return None

    content_owner_id = data.get('content_owner_id_new') or data.get('page_id')
    story_key = data.get('mf_story_key')

    if content_owner_id is None or story_key is None:
        return None

    return '%s_%s' % (content_owner_id, story_key)
コード例 #12
0
ファイル: facebook_test.py プロジェクト: paubre/ural
    def test_parse_facebook_url(self):
        """Every (url, expected) fixture pair must parse to the target."""
        for url, expected in PARSE_TESTS:
            assert parse_facebook_url(url, allow_relative_urls=True) == expected
コード例 #13
0
ファイル: comments.py プロジェクト: paulgirard/minet
def scrape_comments(html, direction=None, in_reply_to=None):
    """Scrape a mobile facebook comments page into structured data.

    Parameters
    ----------
    html : str
        Raw html of a mobile facebook comments (or replies) page.
    direction : str, optional
        Pagination direction already chosen ('forward' or 'backward').
        When None, it is inferred from whichever pagination link the
        page exposes.
    in_reply_to : str, optional
        Id of the comment whose replies this page lists.

    Returns
    -------
    dict
        Keys: 'direction', 'post_id', 'comments' (list of comment
        dicts), 'next' (absolute url of the next page, or None),
        'replies' (list of (replies_url, parent_comment_id) tuples left
        to fetch) and 'in_reply_to'.
    """
    soup = BeautifulSoup(html, 'lxml')

    data = {
        'direction': direction,
        'post_id': None,
        'comments': [],
        'next': None,
        'replies': [],
        'in_reply_to': in_reply_to
    }

    # Pagination links differ between a top-level comments page and a
    # replies page
    if not in_reply_to:
        if direction is None or direction == 'forward':
            next_link = soup.select_one('[id^="see_next_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'forward'

        if direction is None or direction == 'backward':
            next_link = soup.select_one('[id^="see_prev_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'
    else:
        if direction is None or direction == 'backward':
            next_link = soup.select_one(
                '[id^="comment_replies_more_1"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'

    # Elements whose id matches the comment id pattern, excluding the
    # "more replies" pagination container
    valid_items = (
        item for item in soup.select('[id]')
        if VALID_ID_RE.match(item.get('id'))
        and not item.parent.get('id', '').startswith('comment_replies_more'))

    for item in valid_items:
        item_id = item.get('id')

        # Skipping comment if same as commented
        if item_id == in_reply_to:
            continue

        user_link = item.select_one('h3 > a')

        # NOTE: this is a raise bomb
        if not user_link:
            raise TypeError

        user_label = user_link.get_text().strip()
        user_href = user_link.get('href')
        user = parse_facebook_url(resolve_relative_url(user_href))

        # TODO: link to comment
        # Comment content is every div sibling before the "like_" element
        content_elements_candidates = item.select_one('h3').find_next_siblings(
            'div')
        content_elements = []
        content_elements_html = []

        for el in content_elements_candidates:
            if el.select_one('[id^=like_]'):
                break

            content_elements_html.append(el)

            if el.get_text().strip():
                content_elements.append(el)

        comment_text = '\n'.join(el.get_text().strip()
                                 for el in content_elements)
        comment_html = ''.join(str(el) for el in content_elements_html)

        formatted_date = item.select_one('abbr').get_text().strip()
        parsed_date = parse_formatted_date(formatted_date)

        # The "like_<id>" element's id carries the commented post's id
        post_id = item.select_one('[id^="like_"]').get('id').split('_')[1]

        # NOTE: this could be better (we already know this beforehand)
        data['post_id'] = post_id

        reactions_item = item.select_one('[href^="/ufi/reaction/"]')
        reactions = '0'

        if reactions_item is not None:
            reactions = reactions_item.get_text().strip()

        replies_items = item.select('a[href^="/comment/replies"]')
        replies = '0'

        if len(replies_items) > 0:
            replies_item = replies_items[-1]

            if replies_item is not None:
                replies_text = replies_item.get_text()

                if replies_text != 'Reply':

                    # Label is either "See all N replies" or "... · N repl..."
                    if 'See all' in replies_text:
                        replies = replies_text.split('See all')[-1].split(
                            ' replies')[0].strip()
                    else:
                        replies = replies_text.split('·')[-1].split(
                            ' repl')[0].strip()

                    replies_url = replies_item.get('href')
                    data['replies'].append(
                        (resolve_relative_url(replies_url), item_id))

        data['comments'].append({
            'post_id':
            post_id,
            'comment_id':
            item_id,
            'user_id':
            getattr(user, 'id', ''),
            'user_handle':
            getattr(user, 'handle', ''),
            'user_url':
            getattr(user, 'url', ''),
            'user_label':
            user_label,
            'comment_text':
            comment_text,
            'comment_html':
            comment_html,
            'formatted_date':
            formatted_date,
            'date':
            parsed_date.isoformat() if parsed_date else '',
            'reactions':
            reactions,
            'replies':
            replies,
            'in_reply_to':
            in_reply_to
        })

    return data
コード例 #14
0
ファイル: mobile_scraper.py プロジェクト: lebelgique/minet
def scrape_comments(html, direction=None, in_reply_to=None):
    """Scrape a mobile facebook comments page into structured data.

    Parameters
    ----------
    html : str
        Raw html of a mobile facebook comments (or replies) page.
    direction : str, optional
        Pagination direction already chosen ('forward' or 'backward').
        When None, it is inferred from whichever pagination link the
        page exposes.
    in_reply_to : str, optional
        Id of the comment whose replies this page lists.

    Returns
    -------
    dict
        Keys: 'direction', 'post_id', 'comments' (list of
        FacebookComment), 'next' (absolute url of the next page, or
        None), 'replies' (list of (replies_url, parent_comment_id)
        tuples left to fetch) and 'in_reply_to'.
    """
    soup = BeautifulSoup(html, 'lxml')

    data = {
        'direction': direction,
        'post_id': None,
        'comments': [],
        'next': None,
        'replies': [],
        'in_reply_to': in_reply_to
    }

    # Detecting if we are in a video pagelet
    video_pagelet = soup.select_one('#mobile_injected_video_feed_pagelet')

    if video_pagelet is not None:
        # Video pages host their comments elsewhere: hand back the url
        # of the actual comments page and let the caller follow it
        actual_comments_link = video_pagelet.select_one(
            'a[href^="/story.php?"]')

        if actual_comments_link:
            data['next'] = resolve_relative_url(
                actual_comments_link.get('href'))

        return data

    # Pagination links differ between a top-level comments page and a
    # replies page
    if not in_reply_to:
        if direction is None or direction == 'forward':
            next_link = soup.select_one('[id^="see_next_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'forward'

        if direction is None or direction == 'backward':
            next_link = soup.select_one('[id^="see_prev_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'
    else:
        if direction is None or direction == 'backward':
            next_link = soup.select_one(
                '[id^="comment_replies_more_1"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'

    # Elements whose id matches the comment id pattern and that contain
    # a user link, excluding the "more replies" pagination container
    valid_items = (
        item for item in soup.select('[id]:has(h3 > a)')
        if VALID_ID_RE.match(item.get('id'))
        and not item.parent.get('id', '').startswith('comment_replies_more'))

    for item in valid_items:
        item_id = item.get('id')

        # Skipping comment if same as commented
        if item_id == in_reply_to:
            continue

        user_link = item.select_one('h3 > a')
        user_label = user_link.get_text().strip()
        user_href = user_link.get('href')
        user = parse_facebook_url(resolve_relative_url(user_href))

        # TODO: link to comment
        # Comment content is every div sibling before the "like_" element
        content_elements_candidates = item.select_one('h3').find_next_siblings(
            'div')
        content_elements = []
        content_elements_html = []

        for el in content_elements_candidates:
            if el.select_one('[id^=like_]'):
                break

            content_elements_html.append(el)

            if el.get_text().strip():
                content_elements.append(el)

        comment_text = get_display_text(content_elements)
        comment_html = ''.join(str(el) for el in content_elements_html)

        formatted_date = item.select_one('abbr').get_text().strip()
        parsed_date = parse_date(formatted_date)

        post_id_item = item.select_one('[id^="like_"]')

        if post_id_item is None:
            raise TypeError

        # The "like_<id>" element's id carries the commented post's id
        post_id = item.select_one('[id^="like_"]').get('id').split('_')[1]

        # NOTE: this could be better (we already know this beforehand)
        data['post_id'] = post_id

        reactions_item = item.select_one('[href^="/ufi/reaction/"]')
        reactions = '0'

        if reactions_item is not None:
            reactions = reactions_item.get_text().strip()

        replies_items = item.select('a[href^="/comment/replies"]')
        replies = '0'

        if len(replies_items) > 0:
            replies_item = replies_items[-1]

            if replies_item is not None:
                replies_text = replies_item.get_text()

                if replies_text != 'Reply':

                    # Label is either "See all N replies" or "... · N repl..."
                    if 'See all' in replies_text:
                        replies = replies_text.split('See all')[-1].split(
                            ' replies')[0].strip()
                    else:
                        replies = replies_text.split('·')[-1].split(
                            ' repl')[0].strip()

                    replies_url = replies_item.get('href')
                    data['replies'].append(
                        (resolve_relative_url(replies_url), item_id))

        data['comments'].append(
            FacebookComment(post_id=post_id,
                            id=item_id,
                            user_id=getattr(user, 'id', ''),
                            user_handle=getattr(user, 'handle', ''),
                            user_url=getattr(user, 'url', ''),
                            user_label=user_label,
                            text=comment_text,
                            html=comment_html,
                            formatted_date=formatted_date,
                            date=parsed_date,
                            reactions=reactions,
                            replies=replies,
                            in_reply_to=in_reply_to))

    return data