def videos_action(namespace, output_file):
    """Fetch YouTube video metadata for each row of the input CSV.

    Video ids are gathered in batches of 50 (the YouTube API maximum per
    call) and enriched rows are written back through the casanova enricher.
    Rows whose cell holds neither a video id nor a YouTube url are written
    back without enrichment.
    """
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    def rows_with_videos_id():
        # Yield each row together with the video id found in the target
        # column (raw id or extracted from a YouTube url), or None when
        # no id could be recognized.
        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    for chunk in chunks_iter(rows_with_videos_id(), 50):
        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}

        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            # Daily API quota exhausted: wait for the midnight Pacific
            # time reset, then retry the same chunk.
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        # Ids the API did not return any data for
        # (removed dead `not_available = []` that was immediately overwritten,
        # the unused `column` alias and the unused `line_empty` list)
        not_available = set(all_ids).difference(set(data))

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])
def twitter_action(namespace):
    """Dispatch the `twitter` subcommand and manage its output file."""
    credentials = (
        namespace.api_key,
        namespace.api_secret_key,
        namespace.access_token,
        namespace.access_token_secret
    )

    # The API cannot be queried without the full set of credentials
    if not all(credentials):
        die([
            'Full credentials are required to access Twitter API.',
            'You can provide them using various CLI arguments:',
            ' --api-key',
            ' --api-secret-key',
            ' --access-token',
            ' --access-token-secret'
        ])

    resuming = getattr(namespace, 'resume', False)

    # Append when resuming so previously written rows are preserved
    output_file = open_output_file(
        namespace.output,
        flag='a+' if resuming else 'w'
    )

    if namespace.tw_action == 'friends':
        # Lazy import to keep CLI startup fast
        from minet.cli.twitter.friends import twitter_friends_action

        twitter_friends_action(namespace, output_file)

    # Cleanup: only close when an actual file was opened (not stdout)
    if namespace.output is not None:
        output_file.close()
def crowdtangle_posts_by_id_action(namespace, output_file):
    # Retrieve CrowdTangle data for Facebook posts identified by url,
    # writing one enriched row per input row.
    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    already_done = 0

    def listener(event, row):
        # casanova resume listener: count rows already present in the
        # output so the loading bar can start from the right position
        nonlocal already_done

        if event == 'resume.input':
            already_done += 1

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CROWDTANGLE_POST_CSV_HEADERS,
        resumable=namespace.resume,
        listener=listener
    )

    loading_bar = tqdm(
        desc='Retrieving posts',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' posts'
    )
    loading_bar.update(already_done)
    loading_bar_context = LoadingBarContext(loading_bar)

    try:
        for row, url in enricher.cells(namespace.column, with_rows=True):
            with loading_bar_context:
                url = url.strip()

                # Rows without a usable Facebook post url are written
                # back untouched (no enrichment columns)
                if not url:
                    enricher.writerow(row)
                    continue

                url = ensure_protocol(url)

                if not is_facebook_post_url(url):
                    enricher.writerow(row)
                    continue

                post_id = facebook.post_id_from_url(url)

                if post_id is None:
                    enricher.writerow(row)
                    continue

                post = client.post(post_id, format='csv_row')
                enricher.writerow(row, post)
    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
def cookies_action(cli_args):
    """Extract cookies from a browser and emit them as text or CSV."""
    output_writer = None

    if cli_args.csv:
        output_writer = csv.writer(cli_args.output)

    try:
        jar = getattr(browser_cookie3, cli_args.browser)()
    except browser_cookie3.BrowserCookieError:
        die('Could not extract cookies from %s!' % cli_args.browser)

    # Single-url mode: resolve the cookie relevant for the given url
    if cli_args.url is not None:
        resolver = CookieResolver(jar)
        cookie = resolver(cli_args.url)

        if cookie is None:
            die('Could not find relevant cookie for %s in %s!' % (cli_args.url, cli_args.browser))

        if not cli_args.csv:
            print(cookie, file=cli_args.output)
            return

        output_writer.writerow(MORSEL_CSV_HEADER)

        parsed = SimpleCookie(cookie)

        for morsel in parsed.values():
            output_writer.writerow(format_morsel_for_csv(morsel))

        return

    # Whole-jar mode: dump every cookie found in the browser
    if cli_args.csv:
        output_writer.writerow(COOKIE_CSV_HEADER)

        for cookie in jar:
            output_writer.writerow(format_cookie_for_csv(cookie))
    else:
        write_jar_as_text_mozilla(jar, cli_args.output)
def check_key(namespace):
    # A key is required to use the API
    if not namespace.key:
        die([
            'A key is required to access YouTube API.',
            'You can provide it using the --key argument.'
        ])
def check_key(cli_args):
    # A key is required to use the API
    if not cli_args.key:
        die([
            'A key is required to access YouTube API.',
            'You can provide it using the --key argument.'
        ])
def facebook_posts_action(cli_args):
    # Scrape posts from the Facebook groups listed in the input CSV.
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    # Source languages for which the translation warning was already shown
    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            # Warn once per source language when Facebook served a
            # translation, since the original text may be incomplete
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())
def check_dragnet():
    """Die with install instructions when `dragnet` is not importable.

    Fix: the original bare `except:` also caught KeyboardInterrupt and
    SystemExit; only an ImportError should trigger the message.
    """
    try:
        import dragnet
    except ImportError:
        die([
            'The `dragnet` library is not installed. The `extract` command won\'t work.',
            'To install it correctly, run the following commands in order:',
            '',
            ' pip install lxml numpy Cython',
            ' pip install dragnet'
        ])
def crowdtangle_summary_action(namespace, output_file):
    """Fetch CrowdTangle share summaries for each url of the input CSV.

    Top posts can optionally be written to a secondary CSV file when
    `--posts` is given.
    """
    if not namespace.start_date:
        die('Missing --start-date!')

    # A single raw url can be given instead of a CSV file
    if is_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'url')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select.split(',') if namespace.select else None,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if namespace.posts is not None:
        posts_writer = csv.writer(namespace.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = tqdm(
        desc='Collecting data',
        dynamic_ncols=True,
        total=namespace.total,
        unit=' urls'
    )

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    for row, url in enricher.cells(namespace.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=namespace.start_date,
                with_top_posts=namespace.posts is not None,
                sort_by=namespace.sort_by,
                format='csv_row',
                platforms=namespace.platforms
            )
        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])
        # Fix: removed a pointless `except Exception as err: raise err`
        # clause that only re-raised and obscured the traceback intent.

        if namespace.posts is not None:
            # summary() returns a (stats, posts) pair in this mode
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow([url] + post)

        enricher.writerow(row, stats)
        loading_bar.update()
def main():
    # CLI entry point: parse arguments, resolve config/file dependencies
    # and dispatch to the selected command module.

    # Building parser
    parser, subparser_index = build_parser(MINET_COMMANDS)

    # Parsing arguments and triggering commands
    cli_args = parser.parse_args()

    action = subparser_index.get(cli_args.action)

    if action is not None:

        # Loading config
        config = get_rcfile(cli_args.rcfile)

        # Resolving namespace dependencies
        try:
            to_close = resolve_arg_dependencies(cli_args, config)
        except OSError as e:
            parser.error('Could not open output file (-o/--output): %s' % str(e))
        except NotResumable:
            parser.error(
                'Cannot --resume without knowing where the output will be written (use -o/--output)'
            )

        # Lazy loading module for faster startup
        m = importlib.import_module(action['command']['package'])
        fn = getattr(m, action['command']['action'])

        # Ensure every opened buffer is closed whatever happens
        with ExitStack() as stack:
            for buffer in to_close:
                stack.callback(buffer.close)

            try:
                fn(cli_args)
            except InvalidArgumentsError as e:
                parser.error(e.message)
                # NOTE(review): parser.error() already raises SystemExit,
                # so this exit is most likely dead code — confirm intent
                sys.exit(1)

    elif cli_args.action == 'help':
        if len(cli_args.subcommand) == 0:
            parser.print_help()
            return

        target = get_subparser(subparser_index, cli_args.subcommand)

        if target is None:
            die('Unknow command "%s"' % ' '.join(cli_args.subcommand))
        else:
            target.print_help()

    else:
        parser.print_help()
def videos_action(namespace, output_file):
    """Fetch YouTube video metadata for each line of the input CSV.

    The enricher yields pre-chunked lines; ids the API does not know
    about produce a report line holding only the id.
    """
    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):
        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ",".join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}

        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            # Daily API quota exhausted: wait for the midnight Pacific
            # time reset, then retry the same chunk.
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        # Ids the API returned no data for
        # (removed the dead `line_empty = []` init and the `id_available`
        # temporary; unpack chunk items directly in the loop header)
        not_available = set(all_ids).difference(set(data))

        loading_bar.update(len(chunk))

        for video_id, line in chunk:
            if video_id is None:
                enricher.write_empty(line)
            elif video_id in not_available:
                # Keep the id but pad the remaining report columns
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)
            else:
                enricher.write(line, data[video_id])
def crowdtangle_summary_action(cli_args):
    """Collect CrowdTangle share statistics for every url of the input file."""
    if not cli_args.start_date:
        die('Missing --start-date!')

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    # Optional secondary output receiving the top posts per url
    posts_writer = None

    if cli_args.posts is not None:
        posts_writer = csv.writer(cli_args.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = LoadingBar(
        desc='Collecting data',
        total=cli_args.total,
        unit='url'
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    wants_posts = cli_args.posts is not None

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=cli_args.start_date,
                with_top_posts=wants_posts,
                sort_by=cli_args.sort_by,
                platforms=cli_args.platforms
            )
        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if wants_posts:
            # summary() returns a (stats, posts) pair in this mode
            stats, posts = stats

            if posts is not None:
                posts_writer.writerows(post.as_csv_row() for post in posts)

        enricher.writerow(row, stats.as_csv_row() if stats is not None else None)
        loading_bar.update()
def check_credentials(namespace):
    """Die unless the full set of Twitter API credentials was provided."""
    credentials = (
        namespace.api_key,
        namespace.api_secret_key,
        namespace.access_token,
        namespace.access_token_secret
    )

    # Partial credentials are useless: the API rejects them anyway
    if not all(credentials):
        die([
            'Full credentials are required to access Twitter API.',
            'You can provide them using various CLI arguments:',
            ' --api-key',
            ' --api-secret-key',
            ' --access-token',
            ' --access-token-secret'
        ])
def crowdtangle_action(namespace):
    """Dispatch the `ct` subcommand and manage the shared output file."""
    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            "A token is needed to be able to access CrowdTangle's API.",
            "You can provide one using the `--token` argument.",
        ])

    # Append when resuming so previously written rows are preserved
    output_file = open_output_file(
        namespace.output,
        flag="a+" if getattr(namespace, "resume", False) else "w")

    # Subcommand modules are imported lazily to keep CLI startup fast
    if namespace.ct_action == "posts":
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == "posts-by-id":
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == "lists":
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == "leaderboard":
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == "search":
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == "summary":
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    elif namespace.ct_action == "links":
        # Fix: this import was missing the `minet.cli.` package prefix
        # (cf. every other branch), crashing the `links` action with
        # ModuleNotFoundError at dispatch time.
        from minet.cli.crowdtangle.links import crowdtangle_links_action

        crowdtangle_links_action(namespace, output_file)

    # Cleanup: only close when writing to an actual file (not stdout)
    if namespace.output is not None:
        output_file.close()
def search_action(namespace, output_file):
    """Search YouTube videos matching each keyword of the input CSV.

    Results are paginated; `--limit` caps the total number of written
    rows across all keywords.

    NOTE(review): the `output_file` parameter is immediately shadowed by
    reopening `namespace.output` — confirm whether callers rely on the
    passed-in handle before changing this.
    """
    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True

        while next_page:
            # First call uses the bare url; later calls add the page token
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)

            if err:
                die(err)
            elif response.status == 403:
                # Daily quota exhausted: wait for the Pacific-time reset
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)

            next_page, data_l = get_data(result)

            for data in data_l:
                # Fix: replaced the `is not(None)` anti-idiom and merged
                # the two duplicated writerow branches into one.
                if limit is not None:
                    if limit == 0:
                        return True

                    limit -= 1

                enricher.writerow(row, data)
def main():
    # CLI entry point: parse arguments, bootstrap config values and
    # dispatch to the selected command module.

    # Building parser
    parser, subparser_index = build_parser(MINET_COMMANDS)

    # Parsing arguments and triggering commands
    args = parser.parse_args()

    action = subparser_index.get(args.action)

    if action is not None:

        # Loading config
        config = get_rcfile(args.rcfile)

        # Bootstrapping config: resolve lazily-wrapped values against
        # the user's rcfile
        for name in vars(args):
            value = getattr(args, name)

            if isinstance(value, WrappedConfigValue):
                setattr(args, name, value.resolve(config))

        # Need to check something? (e.g. a required dependency)
        if 'before' in action['command']:
            action['command']['before']()

        # Lazy loading module for faster startup
        m = importlib.import_module(action['command']['package'])
        fn = getattr(m, action['command']['action'])

        fn(args)

    elif args.action == 'help':
        if len(args.subcommand) == 0:
            parser.print_help()
            return

        target = get_subparser(subparser_index, args.subcommand)

        if target is None:
            die('Unknow command "%s"' % ' '.join(args.subcommand))
        else:
            target.print_help()

    else:
        parser.print_help()
def crowdtangle_lists_action(namespace, output_file):
    """Dump the lists accessible to the given CrowdTangle token as CSV."""
    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    writer = csv.writer(output_file)
    writer.writerow(CROWDTANGLE_LIST_CSV_HEADERS)

    try:
        for line in client.lists(format='csv_row'):
            writer.writerow(line)
    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
def scrape_action(namespace): output_file = open_output_file(namespace.output) # Parsing scraper definition try: scraper = load_definition(namespace.scraper) except TypeError: die(['Unknown scraper format.', 'Expecting a JSON or YAML file.']) except: die('Invalid scraper file.') if namespace.format == 'csv': output_headers = headers_from_definition(scraper) output_writer = csv.DictWriter(output_file, fieldnames=output_headers) output_writer.writeheader() else: output_writer = ndjson.writer(output_file) loading_bar = tqdm(desc='Scraping pages', total=namespace.total, dynamic_ncols=True, unit=' pages') loading_bar.set_postfix(p=namespace.processes) if namespace.glob is not None: files = create_glob_iterator(namespace, scraper) else: reader = casanova.reader(namespace.report) files = create_report_iterator(namespace, reader, scraper, loading_bar) with Pool(namespace.processes) as pool: for error, items in pool.imap_unordered(worker, files): loading_bar.update() if not isinstance(items, list): items = [items] for item in items: if not isinstance(item, dict): item = {'value': item} output_writer.writerow(item) output_file.close()
def crowdtangle_action(namespace):
    # Dispatch the `ct` subcommand and manage the shared output file.

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access CrowdTangle\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    # Append when resuming so previously written rows are preserved
    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    # Subcommand modules are imported lazily to keep CLI startup fast
    if namespace.ct_action == 'posts':
        from minet.cli.crowdtangle.posts import crowdtangle_posts_action

        crowdtangle_posts_action(namespace, output_file)

    elif namespace.ct_action == 'posts-by-id':
        from minet.cli.crowdtangle.posts_by_id import crowdtangle_posts_by_id_action

        crowdtangle_posts_by_id_action(namespace, output_file)

    elif namespace.ct_action == 'lists':
        from minet.cli.crowdtangle.lists import crowdtangle_lists_action

        crowdtangle_lists_action(namespace, output_file)

    elif namespace.ct_action == 'leaderboard':
        from minet.cli.crowdtangle.leaderboard import crowdtangle_leaderboard_action

        crowdtangle_leaderboard_action(namespace, output_file)

    elif namespace.ct_action == 'search':
        from minet.cli.crowdtangle.search import crowdtangle_search_action

        crowdtangle_search_action(namespace, output_file)

    elif namespace.ct_action == 'summary':
        from minet.cli.crowdtangle.summary import crowdtangle_summary_action

        crowdtangle_summary_action(namespace, output_file)

    # Cleanup: only close when writing to an actual file (not stdout)
    if namespace.output is not None:
        output_file.close()
def facebook_comments_action(cli_args):
    # Scrape every comment (batch by batch) under the Facebook resources
    # listed in the input CSV.
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(desc='Scraping comments', unit='comment')

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        try:
            # per_call yields comments batch by batch; detailed adds
            # per-call statistics alongside each batch
            batches = scraper.comments(url, per_call=True, detailed=True)
        except FacebookInvalidTargetError:
            loading_bar.print(
                'Given url (line %i) is probably not a Facebook resource having comments: %s' % (i, url))
            continue

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.update_stats(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i
            )
def mediacloud_medias_action(namespace, output_file):
    """Fetch Mediacloud media metadata (and optionally their feeds).

    When `--feeds` is given, a `feeds` count column is appended to the
    enriched output and the feeds themselves go to a separate CSV file.
    """
    added_headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]

    feeds_file = None
    feeds_writer = None

    if namespace.feeds:
        added_headers.append('feeds')
        feeds_file = open(namespace.feeds, 'w', encoding='utf-8')
        feeds_writer = csv.writer(feeds_file)
        feeds_writer.writerow(MEDIACLOUD_FEED_CSV_HEADER)

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=added_headers
    )

    loading_bar = tqdm(
        desc='Fetching medias',
        dynamic_ncols=True,
        unit=' medias',
        total=namespace.total
    )

    client = MediacloudAPIClient(namespace.token)

    for row, media_id in enricher.cells(namespace.column, with_rows=True):
        try:
            result = client.media(media_id, format='csv_row')

            if namespace.feeds:
                feeds = client.feeds(media_id, format='csv_row')

                # First column (the id) is already present in the row
                enricher.writerow(row, result[1:] + [len(feeds)])

                for feed in feeds:
                    feeds_writer.writerow(feed)
            else:
                enricher.writerow(row, result[1:])
        except MediacloudServerError as e:
            loading_bar.close()
            die([
                'Aborted due to a mediacloud server error:',
                e.server_error
            ])

        loading_bar.update()

    # Fix: only close the feeds file when it was actually opened —
    # the unconditional close crashed with AttributeError (None.close())
    # whenever --feeds was not given.
    if feeds_file is not None:
        feeds_file.close()
def crowdtangle_lists_action(cli_args):
    """Dump the lists accessible to the given CrowdTangle token as CSV."""
    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    writer = csv.writer(cli_args.output)
    writer.writerow(CROWDTANGLE_LIST_CSV_HEADERS)

    try:
        for line in client.lists():
            writer.writerow(line)
    except CrowdTangleInvalidTokenError:
        die([
            'Your API token is invalid.',
            'Check that you indicated a valid one using the `--token` argument.'
        ])
def twitter_action(namespace):
    """Dispatch the `twitter` subcommand and manage its output file."""
    # Fix: validate --resume BEFORE opening the output file; previously
    # the file was opened in 'a+' mode first and only then the invalid
    # invocation was rejected.
    if getattr(namespace, 'resume', False) and not namespace.output:
        die('Cannot --resume if -o/--output is not set!')

    output_file = open_output_file(
        namespace.output,
        flag='a+' if getattr(namespace, 'resume', False) else 'w')

    if namespace.tw_action == 'scrape':
        from minet.cli.twitter.scrape import twitter_scrape_action

        twitter_scrape_action(namespace, output_file)
    else:
        # Every API-backed action requires the full set of credentials
        check_credentials(namespace)

        if namespace.tw_action == 'friends':
            from minet.cli.twitter.friends import twitter_friends_action

            twitter_friends_action(namespace, output_file)

        elif namespace.tw_action == 'followers':
            from minet.cli.twitter.followers import twitter_followers_action

            twitter_followers_action(namespace, output_file)

        elif namespace.tw_action == 'users':
            from minet.cli.twitter.users import twitter_users_action

            twitter_users_action(namespace, output_file)

        elif namespace.tw_action == 'user-tweets':
            from minet.cli.twitter.user_tweets import twitter_user_tweets_action

            twitter_user_tweets_action(namespace, output_file)

        else:
            # Fix: corrected the 'unkown' typo in the error message
            raise TypeError('unknown tw_action "%s"' % namespace.tw_action)

    # Cleanup: only close when writing to an actual file (not stdout)
    if namespace.output is not None:
        output_file.close()
def mediacloud_action(namespace):
    # Dispatch the `mc` subcommand and manage the shared output file.

    # A token is needed to be able to access the API
    if not namespace.token:
        die([
            'A token is needed to be able to access Mediacloud\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    output_file = open_output_file(namespace.output)

    # Subcommand modules are imported lazily to keep CLI startup fast
    if namespace.mc_action == 'topic':
        from minet.cli.mediacloud.topic import mediacloud_topic_action

        mediacloud_topic_action(namespace, output_file)

    elif namespace.mc_action == 'search':
        from minet.cli.mediacloud.search import mediacloud_search_action

        mediacloud_search_action(namespace, output_file)

    output_file.close()
def mediacloud_action(cli_args):
    """Dispatch the `mc` subcommand to the relevant action module."""
    # A token is needed to be able to access the API
    if not cli_args.token:
        die([
            'A token is needed to be able to access Mediacloud\'s API.',
            'You can provide one using the `--token` argument.'
        ])

    # Fix: made this a single elif chain — the 'topic' branch was a
    # standalone `if`, inconsistently re-evaluated after 'medias' ran.
    # Subcommand modules are imported lazily to keep CLI startup fast.
    if cli_args.mc_action == 'medias':
        from minet.cli.mediacloud.medias import mediacloud_medias_action

        mediacloud_medias_action(cli_args)

    elif cli_args.mc_action == 'topic':
        from minet.cli.mediacloud.topic import mediacloud_topic_action

        mediacloud_topic_action(cli_args)

    elif cli_args.mc_action == 'search':
        from minet.cli.mediacloud.search import mediacloud_search_action

        mediacloud_search_action(cli_args)
def grab_facebook_cookie(namespace):
    """Return a usable Facebook cookie, from a browser or given directly."""
    # The cookie can either name a supported browser to extract from,
    # or be the cookie string itself
    if namespace.cookie in ('firefox', 'chrome'):
        get_cookie_for_url = grab_cookies(namespace.cookie)

        if get_cookie_for_url is None:
            die('Could not extract cookies from %s.' % namespace.cookie)

        cookie = get_cookie_for_url(FACEBOOK_URL + '/')
    else:
        cookie = namespace.cookie.strip()

    if not cookie:
        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to access Facebook pages.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    return fix_cookie(cookie)
def facebook_url_likes_action(cli_args):
    """Scrape the Facebook like-button count for each url of the input CSV."""
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, target in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        target = target.strip()

        # Rows without a syntactically valid url pass through untouched
        if not target or not is_url(target, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(target)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % target)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % target)

        enricher.writerow(row, scraped)
def mediacloud_search_action(namespace, output_file):
    """Stream Mediacloud stories matching a query into a CSV file."""
    writer = csv.writer(output_file)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudClient(namespace.token)

    kwargs = {
        'collections': namespace.collections
    }

    loading_bar = tqdm(
        desc='Searching stories',
        dynamic_ncols=True,
        unit=' stories'
    )

    try:
        # Pre-compute the total so the bar can display real progress,
        # unless the user chose to skip the extra count call
        if not namespace.skip_count:
            loading_bar.total = client.count(namespace.query, **kwargs)

        for story in client.search(namespace.query, format='csv_row', **kwargs):
            writer.writerow(story)
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.close()
        die([
            'Aborted due to a mediacloud server error:',
            e.server_error
        ])
def comments_action(namespace, output_file):
    """Fetch every YouTube comment of a given video id, page by page.

    NOTE(review): the `output_file` parameter is immediately shadowed by
    reopening `namespace.output` — confirm whether callers rely on the
    passed-in handle before changing this.
    """
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    # Fix: removed the dead `all_data = []` accumulator that was never
    # written to nor read.
    while next_page:
        # First call uses the bare url; later calls add the page token
        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            # Daily API quota exhausted: wait for the midnight Pacific
            # time reset, then retry the same page.
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)
def facebook_post_authors_action(cli_args):
    """Retrieve the author of each Facebook post url in the input CSV."""
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher streaming input rows to the enriched output
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_USER_CSV_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Finding authors',
        unit='post'
    )

    for line_no, (row, post_url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.update()

        try:
            author = scraper.post_author(post_url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group post: %s' % (line_no, post_url))
            continue

        enricher.writerow(row, author.as_csv_row() if author is not None else None)