def post_author(self, url):
    """Return the author of the post at `url` as a FacebookUser, or None.

    Raises FacebookInvalidTargetError if the url does not point to a page
    with facebook comments, and TypeError on an unexpected parse result.
    """
    if not has_facebook_comments(url):
        raise FacebookInvalidTargetError

    # Reformatting url to hit mobile website
    mobile_url = convert_url_to_mobile(url)

    html = self.request_page(mobile_url)
    soup = BeautifulSoup(html, 'lxml')

    author_link = soup.select_one('[data-ft] h3 a[href]')

    if author_link is None:
        return None

    parsed = parse_facebook_url(author_link.get('href'), allow_relative_urls=True)
    author_label = author_link.get_text().strip()

    if isinstance(parsed, ParsedFacebookHandle):
        return FacebookUser(author_label, None, parsed.handle, parsed.url)

    if isinstance(parsed, ParsedFacebookUser):
        return FacebookUser(author_label, parsed.id, parsed.handle, parsed.url)

    raise TypeError
def extract_facebook_addendum(url):
    """Map a facebook url to a flat csv addendum.

    Returns a `[type, id, full_id, handle, url]` list, None when the url
    cannot be parsed, and raises TypeError on an unknown parse result.
    """
    result = parse_facebook_url(url)

    if result is None:
        return None

    if isinstance(result, FacebookPost):
        return ['post', result.id, result.full_id or '', '', result.url]

    if isinstance(result, FacebookHandle):
        return ['handle', '', '', result.handle, result.url]

    if isinstance(result, FacebookUser):
        return ['user', result.id or '', '', result.handle or '', result.url]

    if isinstance(result, FacebookGroup):
        return ['group', result.id or '', '', result.handle or '', result.url]

    if isinstance(result, FacebookPhoto):
        return ['photo', result.id, '', '', result.url]

    if isinstance(result, FacebookVideo):
        return ['video', result.id, '', '', result.url]

    raise TypeError('unknown facebook parse result type!')
def posts(self, url):
    """Lazily yield the posts of the facebook group at `url`.

    Raises FacebookInvalidTargetError if `url` is not a group url.
    """
    parsed = parse_facebook_url(url)

    if not isinstance(parsed, ParsedFacebookGroup):
        raise FacebookInvalidTargetError

    start_url = convert_url_to_mobile(parsed.url)

    def generator():
        current_url = start_url

        while True:
            html = self.request_page(current_url)

            next_url, batch = scrape_posts(html)

            yield from batch

            # Stop when pagination is exhausted or the page yielded nothing.
            if next_url is None or not batch:
                break

            current_url = next_url

    return generator()
def post_id_from_url(post_url):
    """Return the full id of the facebook post at `post_url`, or None.

    Falls back to scraping the post page when the url alone does not
    carry the full id.
    """
    parsed = parse_facebook_url(post_url)

    if not isinstance(parsed, FacebookPost):
        return None

    if parsed.full_id is not None:
        return parsed.full_id

    # Last resort: recover the full id from the page itself.
    return scrape_post_id(post_url)
def test_parse_facebook_url(self):
    """Exercise parse_facebook_url over the fixture table plus full-id cases."""
    for candidate, expected in PARSE_TESTS:
        assert parse_facebook_url(candidate, allow_relative_urls=True) == expected

    # Group permalink: full id is group id joined with the story id.
    parsed = parse_facebook_url(
        'https://www.facebook.com/groups/277506326438568/permalink/319815378874329'
    )
    assert parsed.full_id == '277506326438568_319815378874329'

    # permalink.php style: full id is page id joined with story_fbid.
    parsed = parse_facebook_url(
        'https://www.facebook.com/permalink.php?story_fbid=1354978971282622&id=598338556946671'
    )
    assert parsed.full_id == '598338556946671_1354978971282622'

    # A handle-based post url cannot yield a full id on its own.
    parsed = parse_facebook_url(
        'https://www.facebook.com/meilleurdesmondesoff/posts/1810737099256795'
    )
    assert parsed.full_id is None

    # Numeric-parent post url carries both parts of the full id.
    parsed = parse_facebook_url(
        'https://www.facebook.com/108082977404530/posts/195887261957434')
    assert parsed.full_id == '108082977404530_195887261957434'

    # Handle-based group permalink cannot yield a full id either.
    parsed = parse_facebook_url(
        'https://www.facebook.com/groups/US4MF/permalink/787216138752904/')
    assert parsed.full_id is None
def facebook_url_parse_action(namespace):
    """CLI action: enrich a csv file by parsing the facebook urls of a column.

    For each input row, appends REPORT_HEADERS columns describing the parsed
    facebook url ([type, id, handle, url]); the columns are left empty when
    the url cannot be parsed or is of a type this report does not cover.
    """
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' lines',
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url_data = url.strip()

        parsed = parse_facebook_url(url_data)

        if parsed is None:
            enricher.writerow(row)

            # Fix: without this `continue` we fell through to the isinstance
            # checks below with parsed=None (harmless by luck, but confusing).
            continue

        if isinstance(parsed, FacebookPost):
            enricher.writerow(
                row,
                ['post', parsed.id, '', parsed.url]
            )
        elif isinstance(parsed, FacebookHandle):
            enricher.writerow(
                row,
                ['handle', '', parsed.handle, parsed.url]
            )
        elif isinstance(parsed, FacebookUser):
            enricher.writerow(
                row,
                ['user', parsed.id or '', parsed.handle or '', parsed.url]
            )
        else:
            # Fix: rows whose parse result is another type (group, photo,
            # video...) were silently dropped, misaligning the output csv
            # with the input. Write them through with empty addendum columns.
            enricher.writerow(row)
def scrape_members(html):
    """Scrape one page of a facebook group's member list.

    Returns a `(next_link, members)` tuple where `next_link` is the resolved
    url of the next page (or None when pagination is exhausted) and
    `members` is a list of dicts describing each member.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Each member lives in a table whose id starts with "member_".
    member_roots = soup.select('table[id^="member_"]')

    members = []

    for m_root in member_roots:
        title = m_root.select_one('h3 > a')
        user = parse_facebook_url(resolve_relative_url(title.get('href')))

        titles = m_root.select('h3')

        admin = False
        joined = None
        parsed_joined = None

        # A second title line, when present, carries the admin badge and the
        # join date. NOTE(review): join date is extracted for any second
        # title, admin or not — confirm against live markup.
        if len(titles) > 1:
            second_title = titles[1]

            if 'Admin' in second_title.get_text():
                admin = True

            joined = second_title.select_one('abbr').get_text().strip()
            parsed_joined = parse_formatted_date(joined)

        member = {
            'user_id': getattr(user, 'id', ''),
            'user_handle': getattr(user, 'handle', ''),
            'user_url': getattr(user, 'url', ''),
            'user_label': title.get_text().strip(),
            'admin': admin,
            'formatted_joined': joined,
            # ISO format when the date could be parsed, empty string otherwise.
            'joined': parsed_joined.isoformat() if parsed_joined else ''
        }

        members.append(member)

    next_link = soup.select_one('a[href^="/browse/group/members/?"]')
    next_link = resolve_relative_url(
        next_link.get('href')) if next_link else None

    return next_link, members
def post_id_from_url(post_url):
    """Return the full `{parent_id}_{post_id}` id of a facebook post url.

    Tries, in order: the id carried by the url itself, resolving the parent
    page or group handle, and finally scraping the post page.
    """
    parsed = parse_facebook_url(post_url)

    if not isinstance(parsed, FacebookPost):
        return None

    if parsed.full_id is not None:
        return parsed.full_id

    # Cheap path: resolve the parent id from a known handle.
    if parsed.parent_handle is not None:
        parent_id = page_id_from_handle(parsed.parent_handle)

        if parent_id is not None:
            return '%s_%s' % (parent_id, parsed.id)

    elif parsed.group_handle is not None:
        group_id = group_id_from_handle(parsed.group_handle)

        if group_id is not None:
            return '%s_%s' % (group_id, parsed.id)

    # Last resort: scrape the post page itself.
    return scrape_post_id(post_url)
def scrape_comments(html, in_reply_to=None):
    """Scrape facebook mobile comments from an html page.

    Returns a dict holding the inferred `post_id`, the scraped `comments`,
    the `next` pagination url (absolute), the reply-thread urls still to
    fetch in `replies`, and the `in_reply_to` comment id pass-through.
    """
    soup = BeautifulSoup(html, 'lxml')

    data = {
        'post_id': None,
        'comments': [],
        'next': None,
        'replies': [],
        'in_reply_to': in_reply_to
    }

    # Only elements whose id matches the expected comment-id pattern.
    valid_items = (item for item in soup.select('[id]')
                   if VALID_ID_RE.match(item.get('id')))

    for item in valid_items:
        item_id = item.get('id')

        if item_id is None:
            continue

        # Pagination marker: record the absolute next-page url and stop.
        if item_id.startswith('see_next'):
            next_link = item.select_one('a')
            data['next'] = urljoin(BASE_URL, next_link.get('href'))
            break

        # Skipping comment if same as commented
        if item_id == in_reply_to:
            continue

        user_link = item.select_one('h3 > a')

        # TODO: this should be fixed. Truncated comments are not correctly handled
        if not user_link:
            continue

        user_label = user_link.get_text().strip()
        user_href = user_link.get('href')
        user = parse_facebook_url(urljoin(BASE_URL, user_href))

        # TODO: link to comment
        content_element = item.select_one('h3 + div')
        comment_text = content_element.get_text().strip()
        comment_html = str(content_element)

        formatted_date = item.select_one('abbr').get_text().strip()
        parsed_date = parse_formatted_date(formatted_date)

        # The post id is embedded in the like-link's element id.
        post_id = item.select_one('[id^="like_"]').get('id').split('_')[1]

        # TODO: this is baaaad
        data['post_id'] = post_id

        reactions_item = item.select_one('[aria-label*=" reaction"]')
        reactions = '0'

        if reactions_item is not None:
            reactions = reactions_item.get_text().strip()

        replies_items = item.select('a[href^="/comment/replies"]')
        replies = '0'

        if len(replies_items) > 0:
            replies_item = replies_items[-1]

            if replies_item is not None:
                replies_text = replies_item.get_text()

                # The link text carries the reply count unless it is the
                # bare "Reply" action label.
                if replies_text != 'Reply':
                    replies = replies_text.split('·')[-1].split(
                        ' repl')[0].strip()

                # NOTE(review): the reply-thread url is queued regardless of
                # the link text — confirm this is intended for "Reply" links.
                replies_url = replies_item.get('href')
                data['replies'].append((urljoin(BASE_URL, replies_url), item_id))

        data['comments'].append({
            'post_id': post_id,
            'comment_id': item_id,
            'user_id': getattr(user, 'id', ''),
            'user_handle': getattr(user, 'handle', ''),
            'user_url': getattr(user, 'url', ''),
            'user_label': user_label,
            'comment_text': comment_text,
            'comment_html': comment_html,
            'formatted_date': formatted_date,
            'date': parsed_date.isoformat() if parsed_date else '',
            'reactions': reactions,
            'replies': replies,
            'in_reply_to': in_reply_to
        })

    return data
def extract_user_information_from_link(element):
    """Return `(label, parsed_user)` from an anchor element linking to a user."""
    label = element.get_text().strip()
    href = element.get('href')
    parsed_user = parse_facebook_url(resolve_relative_url(href))

    return label, parsed_user
def post_id_from_url(post_url):
    """Return the full `{owner_id}_{post_id}` id of a facebook post url.

    Tries the id carried by the url itself, then fetches the mobile version
    of the page and extracts the id from the story permalink's `data-ft`
    payload, falling back to the photo-post link. Returns None whenever the
    id cannot be recovered; re-raises network errors from `request_text`.
    """
    parsed = parse_facebook_url(post_url)

    if not isinstance(parsed, FacebookPost):
        return None

    if parsed.full_id is not None:
        return parsed.full_id

    post_mobile_url = convert_facebook_url_to_mobile(post_url)

    err, response, html = request_text(FACEBOOK_DEFAULT_POOL, post_mobile_url)

    if err:
        raise err

    soup = BeautifulSoup(html, 'lxml')

    root_element = soup.select_one('#m_story_permalink_view [data-ft]')

    if root_element is None:

        # Is this a photo post?
        next_link = soup.select_one('[href^="/photo.php"]')

        if next_link is None:
            return None

        href = next_link.get('href')

        if not href:
            return None

        link = urljoin(FACEBOOK_URL, href)
        query = urlsplit(link).query

        if not query:
            return None

        query = dict(parse_qsl(query))

        # Fix: guard against missing query keys instead of raising KeyError —
        # every other failure path in this function returns None.
        owner_id = query.get('id')
        photo_id = query.get('fbid')

        if owner_id is None or photo_id is None:
            return None

        return '%s_%s' % (owner_id, photo_id)

    data = root_element.get('data-ft')

    if data is None:
        return None

    try:
        data = json.loads(data)
    except json.JSONDecodeError:
        return None

    # Newer payloads use `content_owner_id_new`; older ones only `page_id`.
    content_owner_id_new = data.get('content_owner_id_new') or data.get(
        'page_id')
    mf_story_key = data.get('mf_story_key')

    if content_owner_id_new is None or mf_story_key is None:
        return None

    return '%s_%s' % (content_owner_id_new, mf_story_key)
def test_parse_facebook_url(self):
    """Every (url, expected) pair in PARSE_TESTS must round-trip the parser."""
    for candidate, expected in PARSE_TESTS:
        parsed = parse_facebook_url(candidate, allow_relative_urls=True)
        assert parsed == expected
def scrape_comments(html, direction=None, in_reply_to=None):
    """Scrape facebook mobile comments from an html page.

    `direction` ('forward'/'backward'/None) controls which pagination link
    is followed; `in_reply_to` is set when scraping a reply thread. Returns
    a dict with the chosen `direction`, the inferred `post_id`, the scraped
    `comments`, the resolved `next` pagination url and the reply-thread
    urls still to fetch in `replies`.
    """
    soup = BeautifulSoup(html, 'lxml')

    data = {
        'direction': direction,
        'post_id': None,
        'comments': [],
        'next': None,
        'replies': [],
        'in_reply_to': in_reply_to
    }

    # Top-level comment pagination uses see_next/see_prev anchors; reply
    # threads use the comment_replies_more anchor instead.
    if not in_reply_to:
        if direction is None or direction == 'forward':
            next_link = soup.select_one('[id^="see_next_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'forward'

        if direction is None or direction == 'backward':
            next_link = soup.select_one('[id^="see_prev_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'
    else:
        if direction is None or direction == 'backward':
            next_link = soup.select_one(
                '[id^="comment_replies_more_1"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'

    # Only elements whose id matches the comment-id pattern, excluding the
    # "more replies" pagination containers.
    valid_items = (
        item
        for item in soup.select('[id]')
        if VALID_ID_RE.match(item.get('id')) and
        not item.parent.get('id', '').startswith('comment_replies_more'))

    for item in valid_items:
        item_id = item.get('id')

        # Skipping comment if same as commented
        if item_id == in_reply_to:
            continue

        user_link = item.select_one('h3 > a')

        # NOTE: this is a raise bomb
        if not user_link:
            raise TypeError

        user_label = user_link.get_text().strip()
        user_href = user_link.get('href')
        user = parse_facebook_url(resolve_relative_url(user_href))

        # TODO: link to comment
        # Comment content is the run of sibling divs up to the like-link.
        content_elements_candidates = item.select_one('h3').find_next_siblings(
            'div')
        content_elements = []
        content_elements_html = []

        for el in content_elements_candidates:
            if el.select_one('[id^=like_]'):
                break

            content_elements_html.append(el)

            if el.get_text().strip():
                content_elements.append(el)

        comment_text = '\n'.join(el.get_text().strip()
                                 for el in content_elements)
        comment_html = ''.join(str(el) for el in content_elements_html)

        formatted_date = item.select_one('abbr').get_text().strip()
        parsed_date = parse_formatted_date(formatted_date)

        # The post id is embedded in the like-link's element id.
        post_id = item.select_one('[id^="like_"]').get('id').split('_')[1]

        # NOTE: this could be better (we already know this beforehand)
        data['post_id'] = post_id

        reactions_item = item.select_one('[href^="/ufi/reaction/"]')
        reactions = '0'

        if reactions_item is not None:
            reactions = reactions_item.get_text().strip()

        replies_items = item.select('a[href^="/comment/replies"]')
        replies = '0'

        if len(replies_items) > 0:
            replies_item = replies_items[-1]

            if replies_item is not None:
                replies_text = replies_item.get_text()

                # The link text carries the reply count unless it is the
                # bare "Reply" action label.
                if replies_text != 'Reply':
                    if 'See all' in replies_text:
                        replies = replies_text.split('See all')[-1].split(
                            ' replies')[0].strip()
                    else:
                        replies = replies_text.split('·')[-1].split(
                            ' repl')[0].strip()

                # NOTE(review): the reply-thread url is queued regardless of
                # the link text — confirm this is intended for "Reply" links.
                replies_url = replies_item.get('href')
                data['replies'].append(
                    (resolve_relative_url(replies_url), item_id))

        data['comments'].append({
            'post_id': post_id,
            'comment_id': item_id,
            'user_id': getattr(user, 'id', ''),
            'user_handle': getattr(user, 'handle', ''),
            'user_url': getattr(user, 'url', ''),
            'user_label': user_label,
            'comment_text': comment_text,
            'comment_html': comment_html,
            'formatted_date': formatted_date,
            'date': parsed_date.isoformat() if parsed_date else '',
            'reactions': reactions,
            'replies': replies,
            'in_reply_to': in_reply_to
        })

    return data
def scrape_comments(html, direction=None, in_reply_to=None):
    """Scrape facebook mobile comments from an html page.

    `direction` ('forward'/'backward'/None) controls which pagination link
    is followed; `in_reply_to` is set when scraping a reply thread. Returns
    a dict with the chosen `direction`, the inferred `post_id`, the scraped
    `comments` (FacebookComment instances), the resolved `next` pagination
    url and the reply-thread urls still to fetch in `replies`. Raises
    TypeError when expected markup (user link's like anchor) is missing.
    """
    soup = BeautifulSoup(html, 'lxml')

    data = {
        'direction': direction,
        'post_id': None,
        'comments': [],
        'next': None,
        'replies': [],
        'in_reply_to': in_reply_to
    }

    # Detecting if we are in a video pagelet: redirect to the actual story
    # page instead of scraping comments here.
    video_pagelet = soup.select_one('#mobile_injected_video_feed_pagelet')

    if video_pagelet is not None:
        actual_comments_link = video_pagelet.select_one(
            'a[href^="/story.php?"]')

        if actual_comments_link:
            data['next'] = resolve_relative_url(
                actual_comments_link.get('href'))

        return data

    # Top-level comment pagination uses see_next/see_prev anchors; reply
    # threads use the comment_replies_more anchor instead.
    if not in_reply_to:
        if direction is None or direction == 'forward':
            next_link = soup.select_one('[id^="see_next_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'forward'

        if direction is None or direction == 'backward':
            next_link = soup.select_one('[id^="see_prev_"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'
    else:
        if direction is None or direction == 'backward':
            next_link = soup.select_one(
                '[id^="comment_replies_more_1"] > a[href]')

            if next_link:
                data['next'] = resolve_relative_url(next_link.get('href'))

                if direction is None:
                    data['direction'] = 'backward'

    # Only elements that look like comments (matching id + a user link),
    # excluding the "more replies" pagination containers.
    valid_items = (
        item
        for item in soup.select('[id]:has(h3 > a)')
        if VALID_ID_RE.match(item.get('id')) and
        not item.parent.get('id', '').startswith('comment_replies_more'))

    for item in valid_items:
        item_id = item.get('id')

        # Skipping comment if same as commented
        if item_id == in_reply_to:
            continue

        user_link = item.select_one('h3 > a')
        user_label = user_link.get_text().strip()
        user_href = user_link.get('href')
        user = parse_facebook_url(resolve_relative_url(user_href))

        # TODO: link to comment
        # Comment content is the run of sibling divs up to the like-link.
        content_elements_candidates = item.select_one('h3').find_next_siblings(
            'div')
        content_elements = []
        content_elements_html = []

        for el in content_elements_candidates:
            if el.select_one('[id^=like_]'):
                break

            content_elements_html.append(el)

            if el.get_text().strip():
                content_elements.append(el)

        comment_text = get_display_text(content_elements)
        comment_html = ''.join(str(el) for el in content_elements_html)

        formatted_date = item.select_one('abbr').get_text().strip()
        parsed_date = parse_date(formatted_date)

        post_id_item = item.select_one('[id^="like_"]')

        if post_id_item is None:
            raise TypeError

        # Fix: reuse the element we just queried and null-checked instead of
        # re-running the selector. The post id is embedded in its element id.
        post_id = post_id_item.get('id').split('_')[1]

        # NOTE: this could be better (we already know this beforehand)
        data['post_id'] = post_id

        reactions_item = item.select_one('[href^="/ufi/reaction/"]')
        reactions = '0'

        if reactions_item is not None:
            reactions = reactions_item.get_text().strip()

        replies_items = item.select('a[href^="/comment/replies"]')
        replies = '0'

        if len(replies_items) > 0:
            replies_item = replies_items[-1]

            if replies_item is not None:
                replies_text = replies_item.get_text()

                # The link text carries the reply count unless it is the
                # bare "Reply" action label.
                if replies_text != 'Reply':
                    if 'See all' in replies_text:
                        replies = replies_text.split('See all')[-1].split(
                            ' replies')[0].strip()
                    else:
                        replies = replies_text.split('·')[-1].split(
                            ' repl')[0].strip()

                replies_url = replies_item.get('href')
                data['replies'].append(
                    (resolve_relative_url(replies_url), item_id))

        data['comments'].append(
            FacebookComment(post_id=post_id,
                            id=item_id,
                            user_id=getattr(user, 'id', ''),
                            user_handle=getattr(user, 'handle', ''),
                            user_url=getattr(user, 'url', ''),
                            user_label=user_label,
                            text=comment_text,
                            html=comment_html,
                            formatted_date=formatted_date,
                            date=parsed_date,
                            reactions=reactions,
                            replies=replies,
                            in_reply_to=in_reply_to))

    return data