def comments_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(
        'Retrieving comments',
        unit='comment',
        stats={'videos': 0}
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar(
        'Retrieving comments',
        unit='comment',
        stats={'videos': 0}
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()

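# Both variants above follow the same casanova enricher pattern: stream one
# column of the input CSV, compute new values for each row, and write the
# original row plus the added columns to the output. A minimal, self-contained
# sketch of that pattern; the file names and the `char_count` column are
# illustrative assumptions, not part of minet.

import casanova

with open('videos.csv') as input_file, open('enriched.csv', 'w') as output_file:
    enricher = casanova.enricher(input_file, output_file, add=['char_count'])

    # With `with_rows=True`, `cells` yields the full row alongside the value
    # of the requested column, mirroring the loops in the actions above.
    for row, video in enricher.cells('video', with_rows=True):
        enricher.writerow(row, [len(video)])
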
def facebook_posts_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook groups.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_POST_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Scraping posts',
        unit='post'
    )

    translated_langs = set()

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.inc('groups')

        try:
            posts = scraper.posts(url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group: %s' % (i, url))
            continue

        for post in posts:
            if post.translated_text and post.translated_from not in translated_langs:
                translated_langs.add(post.translated_from)
                lines = [
                    'Found text translated from %s!' % post.translated_from,
                    'Since it means original text may not be entirely retrieved you might want',
                    'to edit your Facebook language settings to add "%s" to' % post.translated_from,
                    'the "Languages you don\'t want to be offered translations for" list here:',
                    'https://www.facebook.com/settings/?tab=language'
                ]

                for line in lines:
                    loading_bar.print(line)

                loading_bar.print()

            loading_bar.update()
            enricher.writerow(row, post.as_csv_row())

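# The translation warning above fires only once per language thanks to the
# `translated_langs` seen-set. The same dedup pattern in isolation, with
# purely illustrative names:

seen_langs = set()

def warn_once(lang):
    # Print the warning only the first time a given language shows up.
    if lang in seen_langs:
        return

    seen_langs.add(lang)
    print('Found text translated from %s!' % lang)

warn_once('it')
warn_once('it')  # silent the second time
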
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')
        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' % prettyprint_seconds(retry_state.idle_for)
            )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(cli_args.query_template, value=query)

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()
                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')

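# The `before_sleep` callback above has the shape of tenacity's RetryCallState
# protocol: `outcome` is a future holding the raised exception and `idle_for`
# is the upcoming wait in seconds. Assuming `scraper.search` retries through
# tenacity, here is a similar callback exercised directly against tenacity's
# Retrying iterator; the flaky call below is fake and purely illustrative.

from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, wait_fixed

def before_sleep(retry_state):
    exc = retry_state.outcome.exception()
    print('Retrying in %s seconds after %r' % (retry_state.idle_for, exc))

flaky_results = iter([ValueError('boom'), 'ok'])

for attempt in Retrying(
    retry=retry_if_exception_type(ValueError),
    stop=stop_after_attempt(3),
    wait=wait_fixed(1),
    before_sleep=before_sleep
):
    with attempt:
        result = next(flaky_results)

        if isinstance(result, Exception):
            raise result

print(result)  # -> 'ok' after one retry
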
def action(cli_args):
    enricher = casanova.batch_enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=csv_headers
    )

    loading_bar = LoadingBar(
        desc='Retrieving ids',
        unit=method_name[:-1],
        stats={'users': 0}
    )

    # TODO: this is temp debug
    def listener(event, data):
        loading_bar.print(event)
        loading_bar.print(repr(data))

    wrapper = TwitterWrapper(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key,
        listener=listener
    )

    resuming_state = None

    if cli_args.resume:
        resuming_state = cli_args.output.pop_state()

    for row, user in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update_stats(user=user)

        all_ids = []
        next_cursor = -1
        result = None

        if resuming_state is not None and resuming_state.last_cursor:
            next_cursor = int(resuming_state.last_cursor)

        if cli_args.ids:
            wrapper_kwargs = {'user_id': user}
        else:
            wrapper_kwargs = {'screen_name': user}

        while next_cursor != 0:
            wrapper_kwargs['cursor'] = next_cursor

            skip_in_output = None

            if resuming_state:
                skip_in_output = resuming_state.values_to_skip
                resuming_state = None

            try:
                result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)
            except TwitterHTTPError as e:

                # The user does not exist
                loading_bar.inc('users_not_found')
                break

            if result is not None:
                all_ids = result.get('ids', [])
                next_cursor = result.get('next_cursor', 0)

                loading_bar.update(len(all_ids))

                batch = []

                for user_id in all_ids:
                    if skip_in_output and user_id in skip_in_output:
                        continue

                    batch.append([user_id])

                enricher.writebatch(row, batch, next_cursor or None)
            else:
                next_cursor = 0

        loading_bar.inc('users')

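# The while-loop above implements Twitter-style cursoring: every page of
# results carries a `next_cursor`, and a cursor of 0 means the listing is
# exhausted. The bare pattern against a fake paginated endpoint (the data
# and `fetch_page` are illustrative assumptions):

PAGES = {
    -1: {'ids': [1, 2, 3], 'next_cursor': 42},
    42: {'ids': [4, 5], 'next_cursor': 0}
}

def fetch_page(cursor):
    return PAGES[cursor]

all_ids = []
next_cursor = -1

while next_cursor != 0:
    result = fetch_page(next_cursor)
    all_ids.extend(result.get('ids', []))
    next_cursor = result.get('next_cursor', 0)

print(all_ids)  # -> [1, 2, 3, 4, 5]
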
def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die([
            'Unknown scraper format!',
            'It should be a JSON or YAML file.'
        ])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:', file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' % colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(
        desc='Scraping pages',
        total=cli_args.total,
        unit='page'
    )

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(
                cli_args,
                reader,
                worker_args=worker_args,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output, fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(
        cli_args.processes,
        initializer=init_process,
        initargs=(scraper.definition, cli_args.strain)
    )

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError, ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error), end='')

                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)

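# `LazyPool` is consumed like a standard multiprocessing pool: workers return
# an (error, items) pair so the parent process can branch on failures without
# an exception tearing the pool down. The same pattern with the stdlib pool;
# the squaring worker is purely illustrative.

from multiprocessing import Pool

def worker(n):
    # Report the exception as a value instead of letting it propagate,
    # following the (error, items) convention of the scrape action above.
    try:
        return None, n * n
    except Exception as e:
        return e, None

if __name__ == '__main__':
    with Pool(4) as pool:
        for error, result in pool.imap_unordered(worker, range(10)):
            if error is not None:
                continue

            print(result)
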
def twitter_user_tweets_action(namespace, output_file):
    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=TWEET_FIELDS
    )

    loading_bar = LoadingBar(
        'Retrieving tweets',
        total=namespace.total,
        unit='tweet'
    )

    for row, user in enricher.cells(namespace.column, with_rows=True):
        max_id = None
        loading_bar.update_stats(user=user)

        while True:
            if namespace.ids:
                kwargs = {'user_id': user}
            else:
                kwargs = {'screen_name': user}

            kwargs['include_rts'] = not namespace.exclude_retweets
            kwargs['count'] = TWITTER_API_MAX_STATUSES_COUNT
            kwargs['tweet_mode'] = 'extended'

            if max_id is not None:
                kwargs['max_id'] = max_id

            loading_bar.inc('calls')

            try:
                tweets = wrapper.call(['statuses', 'user_timeline'], **kwargs)
            except TwitterHTTPError as e:
                loading_bar.inc('errors')

                if e.e.code == 404:
                    loading_bar.print('Could not find user "%s"' % user)
                else:
                    loading_bar.print('An error happened when attempting to retrieve tweets from "%s"' % user)

                break

            if not tweets:
                break

            loading_bar.update(len(tweets))

            max_id = min(int(tweet['id_str']) for tweet in tweets) - 1

            for tweet in tweets:
                tweet = normalize_tweet(tweet, collection_source='api')
                addendum = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, addendum)

        loading_bar.inc('done')

    loading_bar.close()

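# The inner while-loop pages through a user timeline backwards: each call
# returns the newest tweets at or below `max_id`, and the next call resumes
# one id below the oldest tweet received. The bare pattern against fake data
# (`TWEETS` and `fetch_timeline` are illustrative assumptions):

TWEETS = [{'id_str': str(i)} for i in range(100, 0, -10)]

def fetch_timeline(max_id=None):
    page = [t for t in TWEETS if max_id is None or int(t['id_str']) <= max_id]
    return page[:3]  # each page is capped, like `count` above

max_id = None
collected = []

while True:
    tweets = fetch_timeline(max_id)

    if not tweets:
        break

    collected.extend(tweets)

    # Resume just below the oldest id seen to avoid re-fetching it.
    max_id = min(int(t['id_str']) for t in tweets) - 1

print(len(collected))  # -> 10, the whole timeline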