def url_extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select.split(',') if namespace.select else None
    )

    extract = EXTRACTORS[getattr(namespace, 'from')]

    loading_bar = tqdm(
        desc='Extracting',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, content in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if namespace.base_url is not None:
                url = urljoin(namespace.base_url, url)

            enricher.writerow(row, [url])

    output_file.close()
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = tqdm(
        desc='Extracting content',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' docs'
    )

    files = create_report_iterator(namespace, enricher, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, [report_error(error)] + PADDING)
                continue

            if result is None:
                enricher.writerow(row, ['no-content'] + PADDING)
                continue

            enricher.writerow(row, result)

    output_file.close()
def google_sheets_action(namespace):
    output_file = open_output_file(namespace.output, flag='w')

    try:
        data = export_google_sheets_as_csv(
            namespace.url,
            cookie=namespace.cookie,
            authuser=namespace.authuser
        )
    except GoogleSheetsInvalidTargetError:
        die('Could not extract a valid google sheets id from provided argument!')
    except BrowserCookieError:
        die('Could not extract cookie from %s!' % namespace.cookie)
    except GoogleSheetsMissingCookieError:
        die('Did not find a relevant cookie!')
    except GoogleSheetsInvalidContentTypeError:
        die('Could not export spreadsheet as CSV!')
    except GoogleSheetsNotFoundError:
        die('Could not find spreadsheet (404)!')
    except GoogleSheetsUnauthorizedError:
        die('You don\'t have access to this spreadsheet. Did you forget to set --cookie?')
    except GoogleSheetsMaxAttemptsExceeded:
        die('Maximum number of attempts exceeded! You can still set --authuser if you are logged into several google accounts at once.')

    output_file.write(data)
    output_file.close()
def youtube_action(namespace):
    output_file = open_output_file(namespace.output, flag='w')

    if namespace.yt_action == 'videos':
        check_key(namespace)

        from minet.cli.youtube.videos import videos_action

        videos_action(namespace, output_file)

    elif namespace.yt_action == 'comments':
        check_key(namespace)

        from minet.cli.youtube.comments import comments_action

        comments_action(namespace, output_file)

    elif namespace.yt_action == 'captions':
        from minet.cli.youtube.captions import captions_action

        captions_action(namespace, output_file)

    elif namespace.yt_action == 'search':
        check_key(namespace)

        from minet.cli.youtube.search import search_action

        search_action(namespace, output_file)

    if namespace.output is not None:
        output_file.close()
def twitter_action(namespace):

    # Credentials are required to be able to access the API
    if not namespace.api_key or \
       not namespace.api_secret_key or \
       not namespace.access_token or \
       not namespace.access_token_secret:
        die([
            'Full credentials are required to access Twitter API.',
            'You can provide them using various CLI arguments:',
            ' --api-key',
            ' --api-secret-key',
            ' --access-token',
            ' --access-token-secret'
        ])

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w'
    )

    if namespace.tw_action == 'friends':
        from minet.cli.twitter.friends import twitter_friends_action

        twitter_friends_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
def url_join_action(namespace):
    right_reader = casanova.reader(namespace.file2)
    left_reader = casanova.reader(namespace.file1, namespace.output)

    output_file = open_output_file(namespace.output)
    output_writer = csv.writer(output_file)

    left_headers = left_reader.fieldnames
    left_indices = None

    if namespace.select is not None:
        selected = namespace.select.split(',')
        left_headers = [h for h in left_headers if h in selected]
        left_indices = collect_column_indices(left_reader.pos, left_headers)

    empty = [''] * len(left_headers)

    output_writer.writerow(right_reader.fieldnames + left_headers)

    loading_bar = tqdm(
        desc='Indexing left file',
        dynamic_ncols=True,
        unit=' lines'
    )

    # First step is to index left file
    trie = NormalizedLRUTrie(strip_trailing_slash=True)

    for row, url in left_reader.cells(namespace.column1, with_rows=True):
        url = url.strip()

        if left_indices is not None:
            row = [row[i] for i in left_indices]

        trie.set(url, row)
        loading_bar.update()

    loading_bar.close()

    loading_bar = tqdm(
        desc='Matching right file',
        dynamic_ncols=True,
        unit=' lines'
    )

    for row, url in right_reader.cells(namespace.column2, with_rows=True):
        url = url.strip()

        match = None

        if url:
            match = trie.match(url)

        loading_bar.update()

        if match is None:
            output_writer.writerow(row + empty)
            continue

        row.extend(match)
        output_writer.writerow(row)

    output_file.close()
def twitter_action(namespace):
    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w'
    )

    if namespace.tw_action == 'scrape':
        from minet.cli.twitter.scrape import twitter_scrape_action

        twitter_scrape_action(namespace, output_file)

    else:
        check_credentials(namespace)

        if namespace.tw_action == 'friends':
            from minet.cli.twitter.friends import twitter_friends_action

            twitter_friends_action(namespace, output_file)

        elif namespace.tw_action == 'followers':
            from minet.cli.twitter.followers import twitter_followers_action

            twitter_followers_action(namespace, output_file)

        elif namespace.tw_action == 'users':
            from minet.cli.twitter.users import twitter_users_action

            twitter_users_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookCommentScraper(namespace.cookie)
    except FacebookInvalidCookieError:
        if namespace.cookie in ['firefox', 'chrome']:
            die('Could not extract cookies from %s.' % namespace.cookie)

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not is_facebook_post_url(url):
            loading_bar.close()
            die('Given url (line %i) is not a Facebook post url: %s' % (i + 1, url))

        batches = scraper(url, per_call=True, detailed=True, format='csv_row')

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment)

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()
def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    headers = REPORT_HEADERS

    if namespace.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif namespace.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=headers,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if namespace.facebook:
                addendum = extract_facebook_addendum(url)
            elif namespace.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(namespace, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)

    output_file.close()
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()
def crowdtangle_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            "A token is needed to be able to access CrowdTangle's API.",
            "You can provide one using the `--token` argument.",
        ])

    output_file = open_output_file(
        namespace.output,
        flag="a+" if getattr(namespace, "resume", False) else "w"
    )

    if namespace.ct_action == "posts":
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == "posts-by-id":
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == "lists":
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == "leaderboard":
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == "search":
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == "summary":
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    elif namespace.ct_action == "links":
        from minet.cli.crowdtangle.links import crowdtangle_links_action

        crowdtangle_links_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
def search_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True

        while next_page:
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)

            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)

            next_page, data_l = get_data(result)

            for data in data_l:
                if limit is not None:
                    if limit == 0:
                        return True
                    else:
                        limit -= 1
                        enricher.writerow(row, data)
                else:
                    enricher.writerow(row, data)
def scrape_action(namespace):
    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = load_definition(namespace.scraper)
    except TypeError:
        die(['Unknown scraper format.', 'Expecting a JSON or YAML file.'])
    except:
        die('Invalid scraper file.')

    if namespace.format == 'csv':
        output_headers = headers_from_definition(scraper)
        output_writer = csv.DictWriter(output_file, fieldnames=output_headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    loading_bar = tqdm(
        desc='Scraping pages',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' pages'
    )

    loading_bar.set_postfix(p=namespace.processes)

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, scraper)
    else:
        reader = casanova.reader(namespace.report)
        files = create_report_iterator(namespace, reader, scraper, loading_bar)

    with Pool(namespace.processes) as pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if not isinstance(items, list):
                items = [items]

            for item in items:
                if not isinstance(item, dict):
                    item = {'value': item}

                output_writer.writerow(item)

    output_file.close()
def crowdtangle_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access CrowdTangle\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w'
    )

    if namespace.ct_action == 'posts':
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == 'posts-by-id':
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == 'lists':
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == 'leaderboard':
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == 'search':
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == 'summary':
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
def facebook_url_likes_action(namespace):
    output_file = open_output_file(namespace.output)

    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    loading_bar = tqdm(
        desc='Retrieving likes',
        dynamic_ncols=True,
        unit=' urls',
        total=namespace.total
    )

    http = create_pool()

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        err, html = make_request(http, url)

        if err is not None:
            loading_bar.close()
            die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.close()
            die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
def youtube_action(namespace):
    output_file = open_output_file(namespace.output, flag='w')

    if namespace.yt_action == 'url-parse':
        from minet.cli.youtube.url_parse import url_parse_action

        url_parse_action(namespace, output_file)

    elif namespace.yt_action == 'videos':
        from minet.cli.youtube.videos import videos_action

        videos_action(namespace, output_file)

    elif namespace.yt_action == 'comments':
        from minet.cli.youtube.comments import comments_action

        comments_action(namespace, output_file)

    if namespace.output is not None:
        output_file.close()
def facebook_url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' lines',
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        url_data = url.strip()

        parsed = parse_facebook_url(url_data)

        if parsed is None:
            enricher.writerow(row)
            continue

        if isinstance(parsed, FacebookPost):
            enricher.writerow(
                row,
                ['post', parsed.id, '', parsed.url]
            )

        elif isinstance(parsed, FacebookHandle):
            enricher.writerow(
                row,
                ['handle', '', parsed.handle, parsed.url]
            )

        elif isinstance(parsed, FacebookUser):
            enricher.writerow(
                row,
                ['user', parsed.id or '', parsed.handle or '', parsed.url]
            )
def twitter_action(namespace):
    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w'
    )

    if getattr(namespace, 'resume', False) and not namespace.output:
        die('Cannot --resume if -o/--output is not set!')

    if namespace.tw_action == 'scrape':
        from minet.cli.twitter.scrape import twitter_scrape_action

        twitter_scrape_action(namespace, output_file)

    else:
        check_credentials(namespace)

        if namespace.tw_action == 'friends':
            from minet.cli.twitter.friends import twitter_friends_action

            twitter_friends_action(namespace, output_file)

        elif namespace.tw_action == 'followers':
            from minet.cli.twitter.followers import twitter_followers_action

            twitter_followers_action(namespace, output_file)

        elif namespace.tw_action == 'users':
            from minet.cli.twitter.users import twitter_users_action

            twitter_users_action(namespace, output_file)

        elif namespace.tw_action == 'user-tweets':
            from minet.cli.twitter.user_tweets import twitter_user_tweets_action

            twitter_user_tweets_action(namespace, output_file)

        else:
            raise TypeError('unknown tw_action "%s"' % namespace.tw_action)

    # Cleanup
    if namespace.output is not None:
        output_file.close()
def mediacloud_action(namespace):

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access Mediacloud\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(namespace.output)

    if namespace.mc_action == 'topic':
        from minet.cli.mediacloud.topic import mediacloud_topic_action

        mediacloud_topic_action(namespace, output_file)

    elif namespace.mc_action == 'search':
        from minet.cli.mediacloud.search import mediacloud_search_action

        mediacloud_search_action(namespace, output_file)

    output_file.close()
def extract_action(namespace):
    input_headers, pos, reader = custom_reader(
        namespace.report,
        ('status', 'filename', 'encoding')
    )

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h) for h in selected_fields] if selected_fields else None

    output_headers = (list(input_headers) if not selected_pos else [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    output_file = open_output_file(namespace.output)
    output_writer = csv.writer(output_file)
    output_writer.writerow(output_headers)

    loading_bar = tqdm(
        desc='Extracting content',
        total=namespace.total,
        dynamic_ncols=True,
        unit=' docs'
    )

    namespace.report.close()
    namespace.report = open(namespace.report.name)

    files = create_report_iterator(namespace, loading_bar=loading_bar)

    with Pool(namespace.processes) as pool:
        for error, line, content in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                message = report_error(error)
                line.extend([message, ''])
                output_writer.writerow(line)
                continue

            line.extend(['', content])
            output_writer.writerow(line)

    output_file.close()
def comments_action(namespace, output_file):
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True
    all_data = []

    while next_page:
        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
def url_parse_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Parsing',
        dynamic_ncols=True,
        unit=' rows',
        total=namespace.total
    )

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        loading_bar.update()

        if namespace.separator:
            urls = url.split(namespace.separator)
        else:
            urls = [url]

        for url in urls:
            if not is_url(url, allow_spaces_in_path=True):
                enricher.writerow(row)
                continue

            enricher.writerow(row, [
                normalize_url(
                    url,
                    strip_protocol=namespace.strip_protocol,
                    strip_trailing_slash=True
                ),
                get_domain_name(url),
                get_hostname(url),
                get_normalized_hostname(url)
            ])

    output_file.close()
def cookies_action(namespace):
    output_file = open_output_file(namespace.output)

    if namespace.csv:
        output_writer = csv.writer(output_file)

    try:
        jar = getattr(browser_cookie3, namespace.browser)()
    except browser_cookie3.BrowserCookieError:
        die('Could not extract cookies from %s!' % namespace.browser)

    if namespace.url is not None:
        resolver = CookieResolver(jar)

        cookie = resolver(namespace.url)

        if cookie is not None:
            if namespace.csv:
                output_writer.writerow(MORSEL_CSV_HEADER)

                parsed = SimpleCookie(cookie)

                for morsel in parsed.values():
                    output_writer.writerow(format_morsel_for_csv(morsel))
            else:
                print(cookie, file=output_file)
        else:
            die('Could not find relevant cookie for %s in %s!' % (namespace.url, namespace.browser))
    else:
        if namespace.csv:
            output_writer.writerow(COOKIE_CSV_HEADER)

            for cookie in jar:
                output_writer.writerow(format_cookie_for_csv(cookie))
        else:
            write_jar_as_text_mozilla(jar, output_file)
def comments_action(namespace, output_file):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    def make_requests(current_url, http=http):
        return (request_json(http, current_url), current_url)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):
        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if yt_id:
                url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue

        url_queue = deque([url])

        while len(url_queue) != 0:
            couche = []

            with ThreadPoolExecutor(max_workers=25) as executor:
                time.sleep(0.01)
                couche = executor.map(make_requests, url_queue)

            url_queue = deque()

            for resp in couche:
                ((err, response, result), current_url) = resp

                if err:
                    error_file.write('{} for {}'.format(err, current_url))
                    continue
                elif response.status == 403 and result.get('error').get('errors')[0].get('reason') == 'commentsDisabled':
                    error_file.write('Comments are disabled for {}'.format(current_url))
                    continue
                elif response.status == 403:
                    error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                    time.sleep(seconds_to_midnight_pacific_time())
                    continue
                elif response.status >= 400:
                    error_file.write('Error {} for {}'.format(response.status, current_url))
                    continue

                kind = result.get('kind', None)
                next_page = result.get('nextPageToken', None)

                if next_page:
                    url_next = current_url + '&pageToken=' + next_page
                    url_queue.append(url_next)

                if kind == 'youtube#commentThreadListResponse':

                    # Handling comments pagination
                    items = result.get('items', None)

                    for item in items:
                        snippet = item['snippet']
                        replies = item.get('replies')

                        if replies:

                            # Checking whether youtube's API sends a subset of the replies or not
                            if snippet['totalReplyCount'] != len(replies['comments']) and namespace.full:

                                # If we want the replies and those are not all given by the API,
                                # we add the URL specific to the topLevelComment to the queue,
                                # and we deal with that topLevelComment
                                new_url = URL_PARENTID_TEMPLATE % {
                                    'id': snippet['topLevelComment']['id'],
                                    'key': namespace.key
                                }

                                url_queue.append(new_url)

                                data = get_data_full(snippet, True)
                                enricher.writerow(row, data)
                            else:
                                dataTop = get_data_full(snippet, True)
                                enricher.writerow(row, dataTop)

                                for rep in replies['comments']:
                                    enricher.writerow(row, get_data_full(rep, False))
                        else:

                            # If there is no 'replies' key, the comment we fetched
                            # is only a topLevelComment
                            top_comment = get_data_full(snippet, True)
                            enricher.writerow(row, top_comment)
                else:

                    # Handling a commentList: nothing to see here, we deal with comments one by one
                    items = result.get('items', None)

                    for item in items:
                        data = get_data_full(item, False)
                        enricher.writerow(row, data)
def facebook_comments_action(namespace):

    # Reformatting url to hit mobile website
    url = force_protocol(namespace.url, 'https')
    url = convert_facebook_url_to_mobile(url)

    # Grabbing cookie
    cookie = grab_facebook_cookie(namespace)

    # Handling output
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    http = create_pool()

    def request_page(target):
        error, result = request(http, target, cookie=cookie)

        if error is not None:
            raise error

        return result.data.decode('utf-8')

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    url_queue = deque([(url, None)])

    url_count = 0
    replies_count = 0

    while len(url_queue) != 0:
        current_url, in_reply_to = url_queue.popleft()

        html = request_page(current_url)
        data = scrape_comments(html, in_reply_to)

        url_count += 1

        for reply_url, commented_id in data['replies']:
            url_queue.append((reply_url, commented_id))

        if data['next'] is not None:
            url_queue.append((data['next'], in_reply_to))

        for comment in data['comments']:
            loading_bar.update()
            writer.writerow(format_csv_row(comment))

            if in_reply_to is not None:
                replies_count += 1

        loading_bar.set_postfix(
            urls=url_count,
            replies=replies_count,
            q=len(url_queue)
        )

        # Don't be too greedy
        time.sleep(FACEBOOK_MOBILE_DEFAULT_THROTTLE)

    loading_bar.close()
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    single_url = namespace.file is sys.stdin and is_url(namespace.column)

    if single_url:
        edit_namespace_with_csv_io(namespace, 'url')

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    # Resume listener
    listener = None
    resuming_reader_loading = None
    skipped = 0

    if resuming:
        resuming_reader_loading = tqdm(
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        def listener(event, row):
            nonlocal skipped

            if event == 'resume.output':
                resuming_reader_loading.update()

            if event == 'resume.input':
                skipped += 1
                loading_bar.set_postfix(skipped=skipped)
                loading_bar.update()

    # Enricher
    enricher = casanova.threadsafe_enricher(
        namespace.file,
        output_file,
        resumable=resuming,
        auto_resume=False,
        add=OUTPUT_ADDITIONAL_HEADERS + (['raw_contents'] if namespace.contents_in_report else []),
        keep=namespace.select,
        listener=listener
    )

    if namespace.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % namespace.column
        ])

    url_pos = enricher.pos[namespace.column]

    filename_pos = None

    if namespace.filename is not None:
        if namespace.filename not in enricher.pos:
            die([
                'Could not find the "%s" column containing the filenames in the given CSV file.' % namespace.filename
            ])

        filename_pos = enricher.pos[namespace.filename]

    indexed_input_headers = {h: i for i, h in enumerate(enricher.fieldnames)}

    if resuming:
        enricher.resume()
        resuming_reader_loading.close()

    # Loading bar
    total = namespace.total

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, row, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ]

        if namespace.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    fetch_kwargs = {
        'threads': namespace.threads,
        'throttle': namespace.throttle,
        'domain_parallelism': namespace.domain_parallelism
    }

    if namespace.timeout is not None:
        fetch_kwargs['timeout'] = namespace.timeout

    multithreaded_iterator = multithreaded_fetch(
        enricher,
        key=url_key,
        request_args=request_args,
        **fetch_kwargs
    )

    for result in multithreaded_iterator:
        index, row = result.item

        if not result.url:
            write_output(index, row)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=row[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, row)
                        )
                    else:
                        filename = row[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(encoding if encoding is not None else 'utf-8', errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                index,
                row,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(index, row, error=error_code)

    # Closing files
    output_file.close()
def scrape_action(namespace):
    output_file = open_output_file(namespace.output)

    # Parsing scraper definition
    try:
        scraper = Scraper(namespace.scraper, strain=namespace.strain)
    except DefinitionInvalidFormatError:
        die(['Unknown scraper format!', 'It should be a JSON or YAML file.'])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:', file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' % colored(namespace.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if namespace.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and namespace.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(
        desc='Scraping pages',
        total=namespace.total,
        unit='page'
    )

    proc_args = (namespace.format, namespace.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if namespace.glob is not None:
        files = create_glob_iterator(namespace, proc_args)
    else:
        reader = casanova.reader(namespace.report)

        try:
            files = create_report_iterator(
                namespace,
                reader,
                args=proc_args,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % namespace.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if namespace.format == 'csv':
        output_writer = csv.DictWriter(output_file, fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(output_file)

    pool = LazyPool(
        namespace.processes,
        initializer=init_process,
        initargs=(scraper.definition, namespace.strain)
    )

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError, ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error), end='')

                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)

    loading_bar.close()
    output_file.close()
def facebook_post_stats_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    http = create_pool()

    def fetch_facebook_page_stats(url):
        err, response = request(http, url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die(['Rate limit reached!', 'Last url: %s' % url])

        if (CURRENT_AVAILABILITY_DISCLAIMER in html or AVAILABILITY_DISCLAIMER in html):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())

        data = nested_get([
            'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox',
            'result', 'data', 'feedback'
        ], data)

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data

    # Loading bar
    loading_bar = tqdm(
        desc='Fetching post stats',
        dynamic_ncols=True,
        unit=' posts',
        total=namespace.total
    )

    for row, post_url in enricher.cells(namespace.column, with_rows=True):
        loading_bar.update()

        if (not post_url or not is_facebook_post_url(post_url) or not is_facebook_url(post_url)):
            enricher.writerow(row, format_err('not-facebook-post'))
            continue

        err, data = fetch_facebook_page_stats(post_url)

        if err:
            enricher.writerow(row, format_err(err))
        else:
            enricher.writerow(row, format(data))

        # Throttling
        sleep_with_entropy(FACEBOOK_WEB_DEFAULT_THROTTLE, 5.0)
def fetch_action(namespace):

    # Are we resuming
    resuming = namespace.resume

    if resuming and not namespace.output:
        die(['Cannot --resume without specifying -o/--output.'])

    # Do we need to fetch only a single url?
    if namespace.file is sys.stdin and is_url(namespace.column):
        namespace.file = StringIO('url\n%s' % namespace.column)
        namespace.column = 'url'

        # If we are hitting a single url we enable contents_in_report
        if namespace.contents_in_report is None:
            namespace.contents_in_report = True

    input_headers, pos, reader = custom_reader(namespace.file, namespace.column)

    filename_pos = input_headers.index(namespace.filename) if namespace.filename else None
    indexed_input_headers = {h: p for p, h in enumerate(input_headers)}

    selected_fields = namespace.select.split(',') if namespace.select else None
    selected_pos = [input_headers.index(h) for h in selected_fields] if selected_fields else None

    # HTTP method
    http_method = namespace.method

    # Cookie grabber
    get_cookie = None
    if namespace.grab_cookies:
        get_cookie = grab_cookies(namespace.grab_cookies)

    # Global headers
    global_headers = None
    if namespace.headers:
        global_headers = {}

        for header in namespace.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Reading output
    output_headers = (list(input_headers) if not selected_pos else [input_headers[i] for i in selected_pos])
    output_headers += OUTPUT_ADDITIONAL_HEADERS

    if namespace.contents_in_report:
        output_headers.append('raw_content')

    flag = 'w'
    if namespace.output is not None and resuming and isfile(namespace.output):
        flag = 'r+'

    output_file = open_output_file(namespace.output, flag=flag)

    output_writer = csv.writer(output_file)

    if not resuming:
        output_writer.writerow(output_headers)
    else:

        # Reading report to know what need to be done
        _, rpos, resuming_reader = custom_reader(output_file, 'line')

        resuming_reader_loading = tqdm(
            resuming_reader,
            desc='Resuming',
            dynamic_ncols=True,
            unit=' lines'
        )

        already_done = ContiguousRangeSet()

        for line in resuming_reader_loading:
            index = line[rpos]

            already_done.add(int(index))

    # Loading bar
    total = namespace.total

    if total is not None and resuming:
        total -= len(already_done)

    loading_bar = tqdm(
        desc='Fetching pages',
        total=total,
        dynamic_ncols=True,
        unit=' urls'
    )

    def url_key(item):
        line = item[1]
        url = line[pos].strip()

        if not url:
            return

        # Url templating
        if namespace.url_template:
            return namespace.url_template.format(value=url)

        return url

    def request_args(url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {'method': http_method, 'cookie': cookie, 'headers': headers}

    def write_output(index, line, resolved=None, status=None, error=None,
                     filename=None, encoding=None, data=None):
        if selected_pos:
            line = [line[p] for p in selected_pos]

        line.extend([
            index,
            resolved or '',
            status or '',
            error or '',
            filename or '',
            encoding or ''
        ])

        if namespace.contents_in_report:
            line.append(data or '')

        output_writer.writerow(line)

    errors = 0
    status_codes = Counter()

    target_iterator = enumerate(reader)

    if resuming:
        target_iterator = (pair for pair in target_iterator if not already_done.stateful_contains(pair[0]))

    multithreaded_iterator = multithreaded_fetch(
        target_iterator,
        key=url_key,
        request_args=request_args,
        threads=namespace.threads,
        throttle=namespace.throttle
    )

    for result in multithreaded_iterator:
        line_index, line = result.item

        if not result.url:
            write_output(line_index, line)

            loading_bar.update()
            continue

        response = result.response
        data = response.data if response is not None else None

        content_write_flag = 'wb'

        # Updating stats
        if result.error is not None:
            errors += 1
        else:
            if response.status >= 400:
                status_codes[response.status] += 1

        postfix = {'errors': errors}

        for code, count in status_codes.most_common(1):
            postfix[str(code)] = count

        loading_bar.set_postfix(**postfix)
        loading_bar.update()

        # No error
        if result.error is None:

            filename = None

            # Building filename
            if data:
                if filename_pos is not None or namespace.filename_template:
                    if namespace.filename_template:
                        filename = CUSTOM_FORMATTER.format(
                            namespace.filename_template,
                            value=line[filename_pos] if filename_pos is not None else None,
                            ext=result.meta['ext'],
                            line=LazyLineDict(indexed_input_headers, line)
                        )
                    else:
                        filename = line[filename_pos] + result.meta['ext']
                else:
                    # NOTE: it would be nice to have an id that can be sorted by time
                    filename = str(uuid4()) + result.meta['ext']

            # Standardize encoding?
            encoding = result.meta['encoding']

            if data and namespace.standardize_encoding or namespace.contents_in_report:
                if encoding is None or encoding != 'utf-8' or namespace.contents_in_report:
                    data = data.decode(encoding if encoding is not None else 'utf-8', errors='replace')
                    encoding = 'utf-8'
                    content_write_flag = 'w'

            # Writing file on disk
            if data and not namespace.contents_in_report:

                if namespace.compress:
                    filename += '.gz'

                resource_path = join(namespace.output_dir, filename)
                resource_dir = dirname(resource_path)

                os.makedirs(resource_dir, exist_ok=True)

                with open(resource_path, content_write_flag) as f:

                    # TODO: what if standardize_encoding + compress?
                    f.write(gzip.compress(data) if namespace.compress else data)

            # Reporting in output
            resolved_url = response.geturl()

            write_output(
                line_index,
                line,
                resolved=resolved_url if resolved_url != result.url else None,
                status=response.status,
                filename=filename,
                encoding=encoding,
                data=data
            )

        # Handling potential errors
        else:
            error_code = report_error(result.error)

            write_output(line_index, line, error=error_code)

    # Closing files
    if namespace.output is not None:
        output_file.close()
def facebook_comments_action(namespace):

    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'post_url')

    try:
        scraper = FacebookMobileScraper(namespace.cookie, throttle=namespace.throttle)
    except FacebookInvalidCookieError:
        if namespace.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % namespace.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook post comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Scraping comments',
        dynamic_ncols=True,
        unit=' comments'
    )

    for i, (row, url) in enumerate(enricher.cells(namespace.column, with_rows=True)):
        if not has_facebook_comments(url):
            tqdm.write(
                'Given url (line %i) probably cannot have Facebook comments: %s' % (i + 1, url),
                file=sys.stderr
            )
            continue

        batches = scraper.comments(url, per_call=True, detailed=True)

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.set_postfix(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i + 1
            )

    loading_bar.close()