def test_basics(self):
    for url, output in TESTS:
        assert force_protocol(url) == output

    assert force_protocol('http://lemonde.fr?utm_hp_ref=test', 'ftp') == 'ftp://lemonde.fr?utm_hp_ref=test'
    assert force_protocol('ftp://lemonde.fr?utm_hp_ref=test', 'http://') == 'http://lemonde.fr?utm_hp_ref=test'

def __call__(self, url, detailed=False, per_call=False, format='raw'):
    if format not in FACEBOOK_OUTPUT_FORMATS:
        raise TypeError('minet.facebook.scrape_comments: unknown `format`.')

    # Reformatting url to hit the mobile website
    url = force_protocol(url, 'https')
    url = convert_facebook_url_to_mobile(url)

    html = self.request_page(url)
    members_link = scrape_members_link(html)

    while members_link is not None:
        html = self.request_page(members_link)
        next_link, members = scrape_members(html)

        yield from members

        members_link = next_link

def facebook_comments_action(namespace):

    # Reformatting url to hit the mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    while len(url_queue) != 0:
        current_url, in_reply_to = url_queue.popleft()

        html = request_page(current_url)
        data = scrape_comments(html, in_reply_to)

        url_count += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], in_reply_to))

        for comment in data['comments']:
            loading_bar.update()
            writer.writerow(format_csv_row(comment))

            if in_reply_to is not None:
                replies_count += 1

        loading_bar.set_postfix(
            urls=url_count,
            replies=replies_count,
            q=len(url_queue)
        )

        # Don't be too greedy
        time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)

    loading_bar.close()

def convert_url_to_mobile(url):
    url = force_protocol(url, 'https')
    return convert_facebook_url_to_mobile(url)

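# A minimal usage sketch (not part of the original sources): assuming ural's
# force_protocol adds or replaces the url's scheme and minet's
# convert_facebook_url_to_mobile swaps the host for Facebook's mobile
# subdomain, the helper above should behave roughly like this:
#
#   convert_url_to_mobile('facebook.com/groups/123?ref=bookmarks')
#   # expected shape: 'https://m.facebook.com/groups/123?ref=bookmarks'
#   # (output shape assumed, not verified against the libraries)
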
def __call__(self, url, detailed=False, per_call=False, format='raw'):
    if format not in FACEBOOK_OUTPUT_FORMATS:
        raise TypeError('minet.facebook.scrape_comments: unknown `format`.')

    # Reformatting url to hit the mobile website
    url = force_protocol(url, 'https')
    url = convert_facebook_url_to_mobile(url)

    url_queue = deque([(url, None, None)])

    calls = 0
    replies = 0

    while len(url_queue) != 0:
        current_url, direction, in_reply_to = url_queue.popleft()

        html = self.request_page(current_url)

        try:
            data = scrape_comments(html, direction, in_reply_to)
        except TypeError:
            # with open('./dump.html', 'w') as f:
            #     f.write(html)
            print('Could not process comment in %s' % current_url, file=sys.stderr)
            sys.exit(1)

        calls += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, None, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], data['direction'], in_reply_to))

        comments = []

        for comment in data['comments']:
            if in_reply_to is not None:
                replies += 1

            if format == 'csv_row':
                comment = format_comment(comment)

            if per_call:
                comments.append(comment)
            else:
                yield comment

        if detailed:
            details = {
                'calls': calls,
                'replies': replies,
                'queue_size': len(url_queue)
            }

            yield details, comments
        else:
            yield comments
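
# A hedged usage sketch for the scraper above (the class name and constructor
# are hypothetical; only the __call__ signature comes from the source). Since
# __call__ is a generator, comments are consumed lazily:
#
#   scraper = FacebookCommentScraper(cookie)  # hypothetical constructor
#
#   for comment in scraper(url, format='csv_row'):
#       writer.writerow(comment)
#
#   # with per_call=True and detailed=True, each yield is a
#   # (details, comments) pair instead of a single comment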