def scrape_post_id(post_url):
    """Resolve the `<owner>_<story>` identifier of a Facebook post.

    The post is requested through its mobile url. The identifier is read
    from the JSON stored in the `data-ft` attribute found under
    `#m_story_permalink_view` or, when no such element exists, from the
    query string of the first link pointing to `/photo.php`.

    Args:
        post_url: url of the post, converted with
            `convert_facebook_url_to_mobile` before being requested.

    Returns:
        The identifier as a string, or None when the page does not expose
        the pieces required to build it.

    Raises:
        Whatever error `request_text` reports for the request.
    """
    post_mobile_url = convert_facebook_url_to_mobile(post_url)

    err, _, html = request_text(post_mobile_url)

    if err:
        raise err

    soup = BeautifulSoup(html, 'lxml')

    root_element = soup.select_one('#m_story_permalink_view [data-ft]')

    if root_element is None:

        # Is this a photo post?
        next_link = soup.select_one('[href^="/photo.php"]')

        if next_link is None:
            return None

        href = next_link.get('href')

        if not href:
            return None

        link = urljoin(FACEBOOK_URL, href)
        query = urlsplit(link).query

        if not query:
            return None

        query = dict(parse_qsl(query))

        # A photo link lacking either parameter cannot be turned into an id:
        # report "not found" like every other failure path instead of
        # raising KeyError.
        owner_id = query.get('id')
        photo_id = query.get('fbid')

        if owner_id is None or photo_id is None:
            return None

        return '%s_%s' % (owner_id, photo_id)

    data = root_element.get('data-ft')

    if data is None:
        return None

    try:
        data = json.loads(data)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object (list, string, number...) has no keys
    # to read from.
    if not isinstance(data, dict):
        return None

    content_owner_id_new = (
        data.get('content_owner_id_new') or data.get('page_id')
    )
    mf_story_key = data.get('mf_story_key')

    if content_owner_id_new is None or mf_story_key is None:
        return None

    return '%s_%s' % (content_owner_id_new, mf_story_key)
def __call__(self, url, detailed=False, per_call=False, format='raw'):
    """Lazily yield every item scraped from the paginated members listing.

    The url is forced to https and converted to its mobile form, the
    resulting page is searched for a members link with
    `scrape_members_link`, then each listing page is fetched through
    `self.request_page` until `scrape_members` stops returning a next link.

    `detailed` and `per_call` are accepted but not read by this method.

    Because this is a generator, the `format` check runs when iteration
    starts, not when the method is called.

    Raises:
        TypeError: if `format` is not in `FACEBOOK_OUTPUT_FORMATS`.
    """
    if format not in FACEBOOK_OUTPUT_FORMATS:
        raise TypeError('minet.facebook.scrape_comments: unkown `format`.')

    # Always hit the mobile website, over https
    mobile_url = convert_facebook_url_to_mobile(force_protocol(url, 'https'))

    landing_html = self.request_page(mobile_url)
    page_link = scrape_members_link(landing_html)

    # Each listing page gives its own items plus the link to the next page
    while page_link is not None:
        page_html = self.request_page(page_link)
        page_link, batch = scrape_members(page_html)

        yield from batch
def facebook_comments_action(namespace):
    """Scrape the comments reachable from `namespace.url` and write them as CSV.

    Pages are fetched one at a time from a FIFO queue seeded with the
    mobile version of the url. Each scraped page may enqueue reply pages
    and a next page. Every comment is written as one CSV row while a
    progress bar reports the number of urls fetched, replies seen and
    urls still queued.

    Args:
        namespace: object providing `url` and `output`; it is also handed
            to `grab_facebook_cookie`.

    Raises:
        Whatever error `request` reports for a page.
    """

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    # NOTE(review): output_file is never closed here. Whether closing it is
    # safe depends on what open_output_file returns (it may be a shared
    # stream) -- confirm before adding a close.
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    # Queue items are (url, id of the comment being replied to or None)
    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    # The bar must be closed even when a request or the scraping fails,
    # hence the try/finally.
    try:
        while url_queue:
            current_url, in_reply_to = url_queue.popleft()

            html = request_page(current_url)
            data = scrape_comments(html, in_reply_to)

            url_count += 1

            for reply_url, commented_id in data['replies']:
                url_queue.append((reply_url, commented_id))

            if data['next'] is not None:
                url_queue.append((data['next'], in_reply_to))

            for comment in data['comments']:
                loading_bar.update()
                writer.writerow(format_csv_row(comment))

                if in_reply_to is not None:
                    replies_count += 1

            loading_bar.set_postfix(
                urls=url_count,
                replies=replies_count,
                q=len(url_queue)
            )

            # Don't be too greedy
            time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)
    finally:
        loading_bar.close()
def convert_url_to_mobile(url):
    """Force `url` to https, then convert it with `convert_facebook_url_to_mobile`."""
    return convert_facebook_url_to_mobile(force_protocol(url, 'https'))
def test_convert_facebook_url_to_mobile(self):
    """Check the conversion against MOBILE_TESTS and on a non-Facebook url."""
    for source_url, mobile_url in MOBILE_TESTS:
        converted = convert_facebook_url_to_mobile(source_url)

        assert converted == mobile_url

    # A url from another website must make the conversion raise
    with pytest.raises(Exception):
        convert_facebook_url_to_mobile('http://twitter.com')
def __call__(self, url, detailed=False, per_call=False, format='raw'):
    """Lazily scrape the comments reachable from `url`, replies included.

    Pages are fetched through `self.request_page` from a FIFO queue seeded
    with the mobile version of `url`. Each scraped page may enqueue reply
    pages and a next page.

    Because this is a generator, the `format` check runs when iteration
    starts, not when the method is called.

    Args:
        url: url to scrape, forced to https and converted to its mobile
            form.
        detailed: only meaningful with `per_call`. When true, each batch is
            yielded as `(details, comments)`, where `details` holds the
            running `calls` and `replies` counters and the current
            `queue_size`.
        per_call: when true, yield one list of comments per fetched page
            (possibly empty). When false, yield comments one by one.
        format: must belong to `FACEBOOK_OUTPUT_FORMATS`. With `'csv_row'`
            each comment is passed through `format_comment` first.

    Yields:
        Single comments, lists of comments, or `(details, comments)`
        tuples, depending on `per_call` and `detailed`.

    Raises:
        TypeError: if `format` is not in `FACEBOOK_OUTPUT_FORMATS`.
    """
    if format not in FACEBOOK_OUTPUT_FORMATS:
        raise TypeError('minet.facebook.scrape_comments: unkown `format`.')

    # Reformatting url to hit mobile website
    url = force_protocol(url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Queue items are (url, pagination direction, id of the replied comment)
    url_queue = deque([(url, None, None)])

    calls = 0
    replies = 0

    while url_queue:
        current_url, direction, in_reply_to = url_queue.popleft()

        html = self.request_page(current_url)

        try:
            data = scrape_comments(html, direction, in_reply_to)
        except TypeError:
            # NOTE(review): exiting the whole process from here is kept
            # as-is since callers may rely on it; raising would be
            # friendlier to library users.
            print(
                'Could not process comment in %s' % current_url,
                file=sys.stderr
            )
            sys.exit(1)

        calls += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, None, commented_id))

        if data['next'] is not None:
            url_queue.append(
                (data['next'], data['direction'], in_reply_to)
            )

        comments = []

        for comment in data['comments']:
            if in_reply_to is not None:
                replies += 1

            if format == 'csv_row':
                comment = format_comment(comment)

            if per_call:
                comments.append(comment)
            else:
                yield comment

        # In streaming mode comments were already yielded one by one:
        # emitting the (always empty) batch as well would mix stray lists
        # or tuples into the stream of comments.
        if not per_call:
            continue

        if detailed:
            details = {
                'calls': calls,
                'replies': replies,
                'queue_size': len(url_queue)
            }

            yield details, comments
        else:
            yield comments