def captions_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(
        'Retrieving captions',
        unit='video'
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        result = get_video_captions(video, langs=cli_args.lang)

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))
def url_extract_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=REPORT_HEADERS,
        keep=cli_args.select
    )

    extract = EXTRACTORS[getattr(cli_args, 'from')]

    loading_bar = LoadingBar(
        desc='Extracting',
        unit='row',
        total=cli_args.total
    )

    for row, content in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        content = content.strip()

        if not content:
            continue

        for url in extract(content):
            if cli_args.base_url is not None:
                url = urljoin(cli_args.base_url, url)

            enricher.writerow(row, [url])
def url_join_action(cli_args):
    left_reader = casanova.reader(cli_args.file1)
    left_headers = left_reader.fieldnames
    left_idx = None

    if cli_args.select:
        left_idx = left_reader.pos.collect(cli_args.select)
        left_headers = list(cli_args.select)

    # Applying column prefix now
    left_headers = [cli_args.match_column_prefix + h for h in left_headers]

    right_enricher = casanova.enricher(
        cli_args.file2,
        cli_args.output,
        add=left_headers
    )

    loading_bar = LoadingBar(desc='Indexing left file', unit='line')

    # First step is to index the left file
    trie = NormalizedLRUTrie()

    for row, cell in left_reader.cells(cli_args.column1, with_rows=True):
        loading_bar.update()

        if left_idx is not None:
            row = [row[i] for i in left_idx]

        urls = [cell]

        if cli_args.separator is not None:
            urls = cell.split(cli_args.separator)

        for url in urls:
            url = url.strip()

            # NOTE: should we filter invalid urls here?
            if url:
                trie.set(url, row)

    loading_bar.close()

    loading_bar = LoadingBar(desc='Matching right file', unit='line')

    for row, url in right_enricher.cells(cli_args.column2, with_rows=True):
        loading_bar.update()

        url = url.strip()
        match = None

        # NOTE: should we filter invalid urls here?
        if url:
            match = trie.match(url)

        if match is None:
            right_enricher.writerow(row)
            continue

        right_enricher.writerow(row, match)
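
# NOTE (illustration only, not part of minet): url_join_action above matches
# right-file urls against the left-file index with longest-prefix semantics,
# which NormalizedLRUTrie presumably provides. A minimal sketch of that idea,
# assuming a plain dict of normalized url prefixes stands in for the trie:

def naive_longest_prefix_match(index, url):
    # Walk the url upward, dropping one path segment at a time, and return
    # the payload of the longest prefix present in the index, if any.
    url = url.strip().rstrip('/')

    while url:
        if url in index:
            return index[url]

        url, _, _ = url.rpartition('/')

    return None

# Usage: naive_longest_prefix_match({'example.com/blog': ['B']},
# 'example.com/blog/post-1') returns ['B'].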
def twitter_users_action(cli_args):
    client = TwitterAPIClient(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=USER_FIELDS
    )

    loading_bar = LoadingBar(
        desc='Retrieving users',
        total=cli_args.total,
        unit='user'
    )

    # Twitter's users/lookup endpoint accepts at most 100 users per call
    for chunk in as_chunks(100, enricher.cells(cli_args.column, with_rows=True)):
        users = ','.join(row[1].lstrip('@') for row in chunk)

        if cli_args.ids:
            client_args = {'user_id': users}
            key = 'id'
        else:
            client_args = {'screen_name': users}
            key = 'screen_name'

        try:
            result = client.call(['users', 'lookup'], **client_args)
        except TwitterHTTPError as e:
            if e.e.code == 404:
                for row, user in chunk:
                    enricher.writerow(row)
            else:
                raise e

            continue

        indexed_result = {}

        for user in result:
            user = normalize_user(user)
            user_row = format_user_as_csv_row(user)
            indexed_result[user[key]] = user_row

        for row, user in chunk:
            user_row = indexed_result.get(user.lstrip('@'))
            enricher.writerow(row, user_row)

        loading_bar.update(len(chunk))
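
# NOTE (illustration only, not part of minet): the batching above relies on
# as_chunks(size, iterable). Its exact implementation is not shown here; a
# minimal sketch of the semantics inferred from its usage:

from itertools import islice

def as_chunks_sketch(size, iterable):
    # Yield successive lists of at most `size` items drawn from `iterable`.
    iterator = iter(iterable)

    while True:
        chunk = list(islice(iterator, size))

        if not chunk:
            return

        yield chunk

# Usage: list(as_chunks_sketch(2, range(5))) == [[0, 1], [2, 3], [4]]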
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, TwitterPublicAPIRateLimitError):
            loading_bar.inc('tokens')
        else:
            loading_bar.inc('failures')
            loading_bar.print(
                'Failed to call Twitter search. Will retry in %s' %
                prettyprint_seconds(retry_state.idle_for)
            )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            before_sleep=before_sleep,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
def crowdtangle_summary_action(cli_args):
    if not cli_args.start_date:
        die('Missing --start-date!')

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=CROWDTANGLE_SUMMARY_CSV_HEADERS
    )

    posts_writer = None

    if cli_args.posts is not None:
        posts_writer = csv.writer(cli_args.posts)
        posts_writer.writerow(CROWDTANGLE_POST_CSV_HEADERS_WITH_LINK)

    loading_bar = LoadingBar(
        desc='Collecting data',
        total=cli_args.total,
        unit='url'
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        url = url.strip()

        try:
            stats = client.summary(
                url,
                start_date=cli_args.start_date,
                with_top_posts=cli_args.posts is not None,
                sort_by=cli_args.sort_by,
                platforms=cli_args.platforms
            )
        except CrowdTangleInvalidTokenError:
            die([
                'Your API token is invalid.',
                'Check that you indicated a valid one using the `--token` argument.'
            ])

        if cli_args.posts is not None:
            stats, posts = stats

            if posts is not None:
                for post in posts:
                    posts_writer.writerow(post.as_csv_row())

        enricher.writerow(row, stats.as_csv_row() if stats is not None else None)

        loading_bar.update()
def extract_action(namespace):
    output_file = open_output_file(namespace.output)

    enricher = casanova.enricher(
        namespace.report,
        output_file,
        keep=namespace.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=namespace.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row):
        loading_bar.update()
        enricher.writerow(row, format_error(reason))

    try:
        files = create_report_iterator(
            namespace,
            enricher,
            on_irrelevant_row=on_irrelevant_row
        )
    except NotADirectoryError:
        loading_bar.die([
            'Could not find the "%s" directory!' % namespace.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    pool = LazyPool(namespace.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-content'))
                continue

            enricher.writerow(row, result)

    loading_bar.close()
    output_file.close()
def mediacloud_search_action(namespace, output_file):
    writer = csv.writer(output_file)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(namespace.token)

    kwargs = {
        'collections': namespace.collections,
        'medias': namespace.medias,
        'publish_day': namespace.publish_day,
        'publish_month': namespace.publish_month,
        'publish_year': namespace.publish_year
    }

    loading_bar = LoadingBar(
        'Searching stories',
        unit='story',
        unit_plural='stories'
    )

    try:
        if not namespace.skip_count:
            count = client.count(namespace.query, **kwargs)
            loading_bar.update_total(count)

        iterator = client.search(namespace.query, format='csv_row', **kwargs)

        for story in iterator:
            writer.writerow(story)
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.die([
            'Aborted due to a mediacloud server error:',
            e.server_error
        ])
def comments_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar(
        'Retrieving comments',
        unit='comment',
        stats={'videos': 0}
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(namespace.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')

    loading_bar.close()
def mediacloud_search_action(cli_args):
    writer = csv.writer(cli_args.output)
    writer.writerow(MEDIACLOUD_STORIES_CSV_HEADER)

    client = MediacloudAPIClient(cli_args.token)

    kwargs = {
        'collections': cli_args.collections,
        'medias': cli_args.medias,
        'publish_day': cli_args.publish_day,
        'publish_month': cli_args.publish_month,
        'publish_year': cli_args.publish_year,
        'filter_query': cli_args.filter_query
    }

    loading_bar = LoadingBar(
        'Searching stories',
        unit='story',
        unit_plural='stories'
    )

    try:
        if not cli_args.skip_count:
            count = client.count(cli_args.query, **kwargs)
            loading_bar.update_total(count)

        iterator = client.search(cli_args.query, **kwargs)

        for story in iterator:
            writer.writerow(story.as_csv_row())
            loading_bar.update()

    except MediacloudServerError as e:
        loading_bar.die([
            'Aborted due to a mediacloud server error:',
            e.server_error
        ])
def extract_action(cli_args):
    if cli_args.glob is None and cli_args.input_dir is None:
        cli_args.input_dir = DEFAULT_CONTENT_FOLDER

    input_data = cli_args.report

    if cli_args.glob is not None:
        input_data = dummy_csv_file_from_glob(cli_args.glob, cli_args.input_dir)

    enricher = casanova.enricher(
        input_data,
        cli_args.output,
        keep=cli_args.select,
        add=OUTPUT_ADDITIONAL_HEADERS
    )

    loading_bar = LoadingBar(
        desc='Extracting content',
        total=cli_args.total,
        unit='doc'
    )

    def on_irrelevant_row(reason, row, i):
        loading_bar.update()
        loading_bar.print('Row n°{n} could not be processed: {reason}'.format(n=i + 1, reason=reason))
        enricher.writerow(row, format_error(reason))

    if (
        cli_args.glob is None and
        'raw_contents' not in enricher.headers and
        not isdir(cli_args.input_dir)
    ):
        loading_bar.die([
            'Could not find the "%s" directory!' % cli_args.input_dir,
            'Did you forget to specify it with -i/--input-dir?'
        ])

    files = create_report_iterator(
        cli_args,
        enricher,
        on_irrelevant_row=on_irrelevant_row
    )

    pool = LazyPool(cli_args.processes)

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, row, result in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                enricher.writerow(row, format_error(report_error(error)))
                continue

            if result is None:
                enricher.writerow(row, format_error('no-result'))
                continue

            enricher.writerow(row, result)
def mediacloud_topic_action(cli_args):
    writer = csv.writer(cli_args.output)
    writer.writerow(MEDIACLOUD_TOPIC_STORIES_CSV_HEADERS)

    loading_bar = LoadingBar(
        desc='Fetching stories',
        unit='story',
        unit_plural='stories'
    )

    client = MediacloudAPIClient(cli_args.token)

    iterator = client.topic_stories(
        cli_args.topic_id,
        media_id=cli_args.media_id,
        from_media_id=cli_args.from_media_id
    )

    for story in iterator:
        writer.writerow(story.as_csv_row())
        loading_bar.update()
def twitter_scrape_action(cli_args):
    scraper = TwitterAPIScraper()

    # Stats
    loading_bar = LoadingBar(
        'Collecting tweets',
        total=cli_args.limit,
        unit='tweet',
        stats={'tokens': 1, 'queries': 0}
    )

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=TWEET_FIELDS + ADDITIONAL_TWEET_FIELDS,
        keep=cli_args.select
    )

    for row, query in enricher.cells(cli_args.query, with_rows=True):

        # Templating?
        if cli_args.query_template is not None:
            query = CUSTOM_FORMATTER.format(
                cli_args.query_template,
                value=query
            )

        loading_bar.print('Searching for "%s"' % query)
        loading_bar.inc('queries')

        iterator = scraper.search(
            query,
            limit=cli_args.limit,
            include_referenced_tweets=cli_args.include_refs,
            with_meta=True
        )

        try:
            for tweet, meta in iterator:
                loading_bar.update()

                tweet_row = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, tweet_row + format_meta_row(meta))
        except TwitterPublicAPIOverCapacityError:
            loading_bar.die('Got an "Over Capacity" error. Shutting down...')
def url_parse_action(cli_args):
    headers = REPORT_HEADERS

    if cli_args.facebook:
        headers = FACEBOOK_REPORT_HEADERS
    elif cli_args.youtube:
        headers = YOUTUBE_REPORT_HEADERS

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=headers,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(desc='Parsing', unit='row', total=cli_args.total)

    for row, cell in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        if cli_args.separator:
            urls = cell.split(cli_args.separator)
        else:
            urls = [cell]

        for url in urls:
            url = url.strip()

            if not is_url(url, allow_spaces_in_path=True, require_protocol=False):
                enricher.writerow(row)
                continue

            if cli_args.facebook:
                addendum = extract_facebook_addendum(url)
            elif cli_args.youtube:
                addendum = extract_youtube_addendum(url)
            else:
                addendum = extract_standard_addendum(cli_args, url)

            if addendum is None:
                enricher.writerow(row)
                continue

            enricher.writerow(row, addendum)
def comments_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_COMMENT_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar(
        'Retrieving comments',
        unit='comment',
        stats={'videos': 0}
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, video in enricher.cells(cli_args.column, with_rows=True):
        generator = client.comments(video)

        for comment in generator:
            loading_bar.update()
            enricher.writerow(row, comment.as_csv_row())

        loading_bar.inc('videos')
def search_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
        keep=cli_args.select
    )

    loading_bar = LoadingBar('Searching videos', unit='video')

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        cli_args.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, query in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=cli_args.order)

        if cli_args.limit:
            searcher = islice(searcher, cli_args.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())
def videos_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_VIDEO_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar(
        'Retrieving videos',
        unit='video',
        total=namespace.total
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    iterator = enricher.cells(namespace.column, with_rows=True)

    for (row, _), video in client.videos(iterator, key=itemgetter(1)):
        loading_bar.update()
        enricher.writerow(row, video.as_csv_row() if video else None)

    loading_bar.close()
def captions_action(namespace, output_file):

    # Handling output
    single_video = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_video:
        edit_namespace_with_csv_io(namespace, 'video')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_CAPTIONS_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar('Retrieving captions', unit='video')

    for row, video in enricher.cells(namespace.column, with_rows=True):
        result = get_video_captions(video, langs=namespace.lang)
        loading_bar.update()

        if result is None:
            continue

        track, lines = result

        prefix = [track.lang, '1' if track.generated else '']

        for line in lines:
            enricher.writerow(row, prefix + list(line))

    loading_bar.close()
def crawl_action(cli_args, defer):

    # Loading crawler definition
    queue_path = join(cli_args.output_dir, 'queue')

    if cli_args.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(cli_args.output_dir, exist_ok=True)

    jobs_output_path = join(cli_args.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(
        jobs_output_path,
        JOBS_HEADERS,
        resume=cli_args.resume
    )
    defer(jobs_output.close)

    # Creating crawler
    crawler = Crawler(
        cli_args.crawler,
        throttle=cli_args.throttle,
        queue_path=queue_path
    )

    reporter_pool = ScraperReporterPool(
        crawler,
        cli_args.output_dir,
        resume=cli_args.resume
    )
    defer(reporter_pool.close)

    # Loading bar
    loading_bar = LoadingBar(desc='Crawling', unit='page')

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.update_stats(
            queued=state.jobs_queued,
            doing=state.jobs_doing + 1,
            spider=result.job.spider
        )
        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)
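
# NOTE (illustration only, not part of minet): crawl_action receives a `defer`
# callable used to register cleanup handlers (jobs_output.close,
# reporter_pool.close). A minimal sketch of how a hypothetical caller could
# provide such a callable, assuming contextlib.ExitStack semantics:

from contextlib import ExitStack

def run_with_defer(action, cli_args):
    # Callbacks registered through `defer` run when the stack unwinds, in
    # reverse registration order, even if the action raises.
    with ExitStack() as stack:
        action(cli_args, defer=stack.callback)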
def facebook_comments_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_COMMENT_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(desc='Scraping comments', unit='comment')

    for i, (row, url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        try:
            batches = scraper.comments(url, per_call=True, detailed=True)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook resource having comments: %s' % (i, url))
            continue

        for details, batch in batches:
            for comment in batch:
                enricher.writerow(row, comment.as_csv_row())

            loading_bar.update(len(batch))
            loading_bar.update_stats(
                calls=details['calls'],
                replies=details['replies'],
                q=details['queue_size'],
                posts=i
            )
def search_action(namespace, output_file):

    # Handling output
    single_query = namespace.file is sys.stdin and sys.stdin.isatty()

    if single_query:
        edit_namespace_with_csv_io(namespace, 'query')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=YOUTUBE_VIDEO_SNIPPET_CSV_HEADERS,
        keep=namespace.select
    )

    loading_bar = LoadingBar(
        'Searching videos',
        unit='video'
    )

    def before_sleep_until_midnight(seconds):
        loading_bar.print('API limits reached. Will now wait until midnight Pacific time!')

    client = YouTubeAPIClient(
        namespace.key,
        before_sleep_until_midnight=before_sleep_until_midnight
    )

    for row, query in enricher.cells(namespace.column, with_rows=True):
        loading_bar.print('Searching for "%s"' % query)

        searcher = client.search(query, order=namespace.order)

        if namespace.limit:
            searcher = islice(searcher, namespace.limit)

        for video in searcher:
            loading_bar.update()
            enricher.writerow(row, video.as_csv_row())

    loading_bar.close()
def facebook_url_likes_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=REPORT_HEADERS,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES
    )

    if cli_args.column not in enricher.pos:
        die([
            'Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column
        ])

    loading_bar = LoadingBar(
        desc='Retrieving likes',
        unit='url',
        total=enricher.total
    )

    for row, url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        url = url.strip()

        if not url or not is_url(url, require_protocol=False):
            enricher.writerow(row)
            continue

        err, html = make_request(url)

        if err is not None:
            loading_bar.die('An error occurred while fetching like button for this url: %s' % url)

        scraped = scrape(html)

        if scraped is None:
            loading_bar.die('Could not extract Facebook likes from this url\'s like button: %s' % url)

        enricher.writerow(row, scraped)
def mediacloud_medias_action(cli_args):
    added_headers = MEDIACLOUD_MEDIA_CSV_HEADER[1:]

    feeds_writer = None

    if cli_args.feeds:
        added_headers.append('feeds')
        feeds_writer = csv.writer(cli_args.feeds)
        feeds_writer.writerow(MEDIACLOUD_FEED_CSV_HEADER)

    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=added_headers
    )

    loading_bar = LoadingBar(
        desc='Fetching medias',
        unit='media',
        total=cli_args.total
    )

    client = MediacloudAPIClient(cli_args.token)

    for row, media_id in enricher.cells(cli_args.column, with_rows=True):
        try:
            result = client.media(media_id)
            result = result.as_csv_row()[1:]

            if cli_args.feeds:
                feeds = client.feeds(media_id)

                enricher.writerow(row, result + [len(feeds)])

                for feed in feeds:
                    feeds_writer.writerow(feed.as_csv_row())
            else:
                enricher.writerow(row, result)
        except MediacloudServerError as e:
            loading_bar.die([
                'Aborted due to a mediacloud server error:',
                e.server_error
            ])

        loading_bar.update()
def facebook_post_authors_action(cli_args):
    try:
        scraper = FacebookMobileScraper(cli_args.cookie, throttle=cli_args.throttle)
    except FacebookInvalidCookieError:
        if cli_args.cookie in COOKIE_BROWSERS:
            die([
                'Could not extract relevant cookie from "%s".' % cli_args.cookie
            ])

        die([
            'Relevant cookie not found.',
            'A Facebook authentication cookie is necessary to be able to scrape Facebook comments.',
            'Use the --cookie flag to choose a browser from which to extract the cookie or give your cookie directly.'
        ])

    # Enricher
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=FACEBOOK_USER_CSV_HEADERS
    )

    # Loading bar
    loading_bar = LoadingBar(
        desc='Finding authors',
        unit='post'
    )

    for i, (row, post_url) in enumerate(enricher.cells(cli_args.column, with_rows=True), 1):
        loading_bar.update()

        try:
            author = scraper.post_author(post_url)
        except FacebookInvalidTargetError:
            loading_bar.print('Given url (line %i) is probably not a Facebook group post: %s' % (i, post_url))
            continue

        enricher.writerow(row, author.as_csv_row() if author is not None else None)
def twitter_user_tweets_action(namespace, output_file):
    wrapper = TwitterWrapper(
        namespace.access_token,
        namespace.access_token_secret,
        namespace.api_key,
        namespace.api_secret_key
    )

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=TWEET_FIELDS
    )

    loading_bar = LoadingBar(
        'Retrieving tweets',
        total=namespace.total,
        unit='tweet'
    )

    for row, user in enricher.cells(namespace.column, with_rows=True):
        max_id = None

        loading_bar.update_stats(user=user)

        while True:
            if namespace.ids:
                kwargs = {'user_id': user}
            else:
                kwargs = {'screen_name': user}

            kwargs['include_rts'] = not namespace.exclude_retweets
            kwargs['count'] = TWITTER_API_MAX_STATUSES_COUNT
            kwargs['tweet_mode'] = 'extended'

            if max_id is not None:
                kwargs['max_id'] = max_id

            loading_bar.inc('calls')

            try:
                tweets = wrapper.call(['statuses', 'user_timeline'], **kwargs)
            except TwitterHTTPError as e:
                loading_bar.inc('errors')

                if e.e.code == 404:
                    loading_bar.print('Could not find user "%s"' % user)
                else:
                    loading_bar.print('An error happened when attempting to retrieve tweets from "%s"' % user)

                break

            if not tweets:
                break

            loading_bar.update(len(tweets))

            max_id = min(int(tweet['id_str']) for tweet in tweets) - 1

            for tweet in tweets:
                tweet = normalize_tweet(tweet, collection_source='api')
                addendum = format_tweet_as_csv_row(tweet)
                enricher.writerow(row, addendum)

        loading_bar.inc('done')

    loading_bar.close()
def hyphe_dump_action(cli_args):

    # Paths
    output_dir = 'hyphe_corpus_%s' % cli_args.corpus

    if cli_args.output_dir is not None:
        output_dir = cli_args.output_dir

    os.makedirs(output_dir, exist_ok=True)

    webentities_output_path = join(output_dir, 'webentities.csv')
    pages_output_path = join(output_dir, 'pages.csv')

    if cli_args.body:
        body_output_dir = join(output_dir, 'content')
        os.makedirs(body_output_dir, exist_ok=True)

    client = HypheAPIClient(cli_args.url)
    corpus = client.corpus(cli_args.corpus, password=cli_args.password)

    try:
        corpus.ensure_is_started()
    except HypheCorpusAuthenticationError:
        die([
            'Wrong password for the "%s" corpus!' % cli_args.corpus,
            'Don\'t forget to provide a password for this corpus using --password'
        ])

    # Then we gather some handy statistics
    counts = corpus.count(statuses=cli_args.statuses)

    # Then we fetch webentities
    webentities_file = open(webentities_output_path, 'w', encoding='utf-8')
    webentities_writer = csv.writer(webentities_file)
    webentities_writer.writerow(WEBENTITY_CSV_HEADERS)

    loading_bar = LoadingBar(
        desc='Paginating web entities',
        unit='webentity',
        unit_plural='webentities',
        total=counts['webentities']
    )

    webentities = {}

    for webentity in corpus.webentities(statuses=cli_args.statuses):
        loading_bar.update()
        webentities[webentity['id']] = webentity
        webentities_writer.writerow(format_webentity_for_csv(webentity))

    webentities_file.close()
    loading_bar.close()

    # Finally we paginate pages
    pages_file = open(pages_output_path, 'w', encoding='utf-8')
    pages_writer = csv.writer(pages_file)
    pages_writer.writerow(PAGE_CSV_HEADERS + (ADDITIONAL_PAGE_HEADERS if cli_args.body else []))

    loading_bar = LoadingBar(
        desc='Fetching pages',
        unit='page',
        total=counts['pages']
    )

    for webentity in webentities.values():
        for page in corpus.webentity_pages(webentity['id'], include_body=cli_args.body):
            loading_bar.update()

            filename = None

            if cli_args.body and 'body' in page:
                filename = format_page_filename(webentity, page)
                filepath = join(body_output_dir, filename)
                os.makedirs(dirname(filepath), exist_ok=True)

                # Page bodies are shipped base64-encoded and zlib-compressed;
                # we recompress them as gzip before writing to disk
                with open(filepath, 'wb') as f:
                    binary = base64.b64decode(page['body'])
                    binary = zlib.decompress(binary)
                    binary = gzip.compress(binary)
                    f.write(binary)

            pages_writer.writerow(format_page_for_csv(webentity, page, filename=filename))
def scrape_action(cli_args):

    # Parsing scraper definition
    try:
        scraper = Scraper(cli_args.scraper, strain=cli_args.strain)
    except DefinitionInvalidFormatError:
        die([
            'Unknown scraper format!',
            'It should be a JSON or YAML file.'
        ])
    except FileNotFoundError:
        die('Could not find scraper file!')
    except InvalidScraperError as error:
        print('Your scraper is invalid! You need to fix the following errors:', file=sys.stderr)
        print(file=sys.stderr)
        sys.stderr.write(report_scraper_validation_errors(error.validation_errors))
        die()
    except CSSSelectorTooComplex:
        die([
            'Your strainer\'s CSS selector %s is too complex.' % colored(cli_args.strain, 'blue'),
            'You cannot use relations to create a strainer.',
            'Try to simplify the selector you passed to --strain.'
        ])

    if cli_args.validate:
        print('Your scraper is valid.', file=sys.stderr)
        sys.exit(0)

    if scraper.headers is None and cli_args.format == 'csv':
        die([
            'Your scraper does not yield tabular data.',
            'Try changing it or setting --format to "jsonl".'
        ])

    loading_bar = LoadingBar(
        desc='Scraping pages',
        total=cli_args.total,
        unit='page'
    )

    worker_args = (cli_args.format, cli_args.separator)

    def on_irrelevant_row(reason, row):
        loading_bar.update()

    if cli_args.glob is not None:
        files = create_glob_iterator(cli_args, worker_args)
    else:
        reader = casanova.reader(cli_args.report)

        try:
            files = create_report_iterator(
                cli_args,
                reader,
                worker_args=worker_args,
                on_irrelevant_row=on_irrelevant_row
            )
        except NotADirectoryError:
            loading_bar.die([
                'Could not find the "%s" directory!' % cli_args.input_dir,
                'Did you forget to specify it with -i/--input-dir?'
            ])

    if cli_args.format == 'csv':
        output_writer = csv.DictWriter(cli_args.output, fieldnames=scraper.headers)
        output_writer.writeheader()
    else:
        output_writer = ndjson.writer(cli_args.output)

    pool = LazyPool(
        cli_args.processes,
        initializer=init_process,
        initargs=(scraper.definition, cli_args.strain)
    )

    loading_bar.update_stats(p=pool.processes)

    with pool:
        for error, items in pool.imap_unordered(worker, files):
            loading_bar.update()

            if error is not None:
                if isinstance(error, (ScraperEvalError, ScraperEvalTypeError, ScraperEvalNoneError)):
                    loading_bar.print(report_scraper_evaluation_error(error), end='')

                loading_bar.inc('errors')
                continue

            for item in items:
                output_writer.writerow(item)
def action(cli_args):
    enricher = casanova.batch_enricher(
        cli_args.file,
        cli_args.output,
        keep=cli_args.select,
        add=csv_headers
    )

    loading_bar = LoadingBar(
        desc='Retrieving ids',
        unit=method_name[:-1],
        stats={'users': 0}
    )

    # TODO: this is temp debug
    def listener(event, data):
        loading_bar.print(event)
        loading_bar.print(repr(data))

    wrapper = TwitterWrapper(
        cli_args.access_token,
        cli_args.access_token_secret,
        cli_args.api_key,
        cli_args.api_secret_key,
        listener=listener
    )

    resuming_state = None

    if cli_args.resume:
        resuming_state = cli_args.output.pop_state()

    for row, user in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update_stats(user=user)

        all_ids = []
        next_cursor = -1
        result = None

        if resuming_state is not None and resuming_state.last_cursor:
            next_cursor = int(resuming_state.last_cursor)

        if cli_args.ids:
            wrapper_kwargs = {'user_id': user}
        else:
            wrapper_kwargs = {'screen_name': user}

        while next_cursor != 0:
            wrapper_kwargs['cursor'] = next_cursor

            skip_in_output = None

            if resuming_state:
                skip_in_output = resuming_state.values_to_skip
                resuming_state = None

            try:
                result = wrapper.call([method_name, 'ids'], **wrapper_kwargs)
            except TwitterHTTPError as e:

                # The user does not exist
                loading_bar.inc('users_not_found')
                break

            if result is not None:
                all_ids = result.get('ids', [])
                next_cursor = result.get('next_cursor', 0)

                loading_bar.update(len(all_ids))

                batch = []

                for user_id in all_ids:
                    if skip_in_output and user_id in skip_in_output:
                        continue

                    batch.append([user_id])

                enricher.writebatch(row, batch, next_cursor or None)
            else:
                next_cursor = 0

        loading_bar.inc('users')
def fetch_action(cli_args, resolve=False, defer=None):

    # If we are hitting a single url we enable contents_in_report by default
    if not resolve and isinstance(cli_args.file, StringIO) and cli_args.contents_in_report is None:
        cli_args.contents_in_report = True

    if not resolve and cli_args.contents_in_report and cli_args.compress:
        raise InvalidArgumentsError('Cannot both --compress and output --contents-in-report!')

    # HTTP method
    http_method = cli_args.method

    # Cookie grabber
    get_cookie = None
    if cli_args.grab_cookies:
        get_cookie = grab_cookies(cli_args.grab_cookies)

    # Global headers
    global_headers = None
    if cli_args.headers:
        global_headers = {}

        for header in cli_args.headers:
            k, v = parse_http_header(header)
            global_headers[k] = v

    # Resume listener
    skipped_rows = 0
    resuming_reader_loading = None

    if cli_args.resume and cli_args.output.can_resume():
        resuming_reader_loading = LoadingBar(
            desc='Resuming',
            unit='line'
        )

        def output_read_listener(event, row):
            nonlocal skipped_rows

            if event != 'output.row':
                return

            skipped_rows += 1
            resuming_reader_loading.update()

        cli_args.output.listener = output_read_listener

    if resolve:
        additional_headers = RESOLVE_ADDITIONAL_HEADERS
    else:
        additional_headers = FETCH_ADDITIONAL_HEADERS

        if cli_args.contents_in_report:
            additional_headers = additional_headers + ['raw_contents']

    # Enricher
    multiplex = None
    if cli_args.separator is not None:
        multiplex = (cli_args.column, cli_args.separator)

    enricher = casanova.threadsafe_enricher(
        cli_args.file,
        cli_args.output,
        add=additional_headers,
        keep=cli_args.select,
        total=cli_args.total,
        prebuffer_bytes=DEFAULT_PREBUFFER_BYTES,
        multiplex=multiplex
    )

    if resuming_reader_loading is not None:
        resuming_reader_loading.close()

    if cli_args.column not in enricher.headers:
        raise InvalidArgumentsError('Could not find the "%s" column containing the urls in the given CSV file.' % cli_args.column)

    url_pos = enricher.headers[cli_args.column]

    filename_pos = None

    if not resolve and cli_args.filename is not None:
        if cli_args.filename not in enricher.headers:
            raise InvalidArgumentsError('Could not find the "%s" column containing the filenames in the given CSV file.' % cli_args.filename)

        filename_pos = enricher.headers[cli_args.filename]

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching pages',
        total=enricher.total,
        unit='url',
        initial=skipped_rows
    )

    # NOTE: it could be dangerous with multithreaded execution, not to close it ourselves
    defer(loading_bar.close)

    def update_loading_bar(result):
        nonlocal errors

        if result.error is not None:
            errors += 1
        else:
            if resolve:
                status = result.stack[-1].status
            else:
                status = result.response.status

            if status >= 400:
                status_codes[status] += 1

        stats = {'errors': errors}

        for code, count in status_codes.most_common(1):
            stats[str(code)] = count

        loading_bar.update_stats(**stats)
        loading_bar.update()

    only_shortened = getattr(cli_args, 'only_shortened', False)

    def url_key(item):
        url = item[1][url_pos].strip()

        if not url:
            return

        if only_shortened and not is_shortened_url(url):
            return

        # Url templating
        if cli_args.url_template:
            return cli_args.url_template.format(value=url)

        return url

    def request_args(domain, url, item):
        cookie = None

        # Cookie
        if get_cookie:
            cookie = get_cookie(url)

        # Headers
        headers = None

        if global_headers:
            headers = global_headers

        return {
            'method': http_method,
            'cookie': cookie,
            'headers': headers
        }

    # Worker callback internals
    filename_builder = None
    files_writer = None

    if not resolve:
        try:
            filename_builder = FilenameBuilder(
                folder_strategy=cli_args.folder_strategy,
                template=cli_args.filename_template
            )
        except TypeError:
            die([
                'Invalid "%s" --folder-strategy!' % cli_args.folder_strategy,
                'Check the list at the end of the command help:',
                '  $ minet fetch -h'
            ])

        files_writer = ThreadSafeFilesWriter(cli_args.output_dir)

    def worker_callback(result):

        # NOTE: at this point the callback is only fired on success
        row = result.item[1]
        response = result.response
        meta = result.meta

        if cli_args.keep_failed_contents and response.status != 200:
            return

        # First we need to build a filename
        filename_cell = row[filename_pos] if filename_pos else None

        formatter_kwargs = {}

        if cli_args.filename_template and 'line' in cli_args.filename_template:
            formatter_kwargs['line'] = enricher.wrap(row)

        try:
            filename = filename_builder(
                result.resolved,
                filename=filename_cell,
                ext=meta.get('ext'),
                formatter_kwargs=formatter_kwargs,
                compressed=cli_args.compress
            )
        except FilenameFormattingError as e:
            result.error = e
            return

        meta['filename'] = filename

        # Decoding the response data?
        is_text = meta.get('is_text', False)
        original_encoding = meta.get('encoding', 'utf-8')

        data = response.data
        binary = True

        if is_text and (cli_args.standardize_encoding or cli_args.contents_in_report):
            data = data.decode(original_encoding, errors='replace')
            binary = False

            if cli_args.contents_in_report:
                meta['decoded_contents'] = data

        # Writing the file?
        # TODO: specify what should happen when contents are empty (e.g. POST queries)
        if data and not cli_args.contents_in_report:
            files_writer.write(
                filename,
                data,
                binary=binary,
                compress=cli_args.compress
            )

    def write_fetch_output(index, row, resolved=None, status=None, error=None,
                           filename=None, encoding=None, mimetype=None, data=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            filename or '',
            mimetype or '',
            encoding or ''
        ]

        if cli_args.contents_in_report:
            addendum.append(data or '')

        enricher.writerow(index, row, addendum)

    def write_resolve_output(index, row, resolved=None, status=None, error=None,
                             redirects=None, chain=None):
        addendum = [
            resolved or '',
            status or '',
            error or '',
            redirects or '',
            chain or ''
        ]

        enricher.writerow(index, row, addendum)

    errors = 0
    status_codes = Counter()

    common_kwargs = {
        'key': url_key,
        'insecure': cli_args.insecure,
        'threads': cli_args.threads,
        'throttle': cli_args.throttle,
        'domain_parallelism': cli_args.domain_parallelism,
        'max_redirects': cli_args.max_redirects,
        'wait': False,
        'daemonic': True
    }

    if cli_args.timeout is not None:
        common_kwargs['timeout'] = cli_args.timeout

    # Normal fetch
    if not resolve:
        multithreaded_iterator = multithreaded_fetch(
            enricher,
            request_args=request_args,
            callback=worker_callback,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:
                write_fetch_output(index, row)
                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:
                meta = result.meta

                # Final url target
                resolved_url = result.resolved

                if resolved_url == result.url:
                    resolved_url = None

                # Reporting in output
                write_fetch_output(
                    index,
                    row,
                    resolved=resolved_url,
                    status=result.response.status,
                    filename=meta.get('filename'),
                    encoding=meta.get('encoding'),
                    mimetype=meta.get('mimetype'),
                    data=meta.get('decoded_contents')
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                resolved = None

                if isinstance(result.error, InvalidURLError):
                    resolved = result.error.url

                if isinstance(result.error, FilenameFormattingError):
                    loading_bar.print(report_filename_formatting_error(result.error))

                write_fetch_output(
                    index,
                    row,
                    error=error_code,
                    resolved=resolved
                )

    # Resolve
    else:
        multithreaded_iterator = multithreaded_resolve(
            enricher,
            resolve_args=request_args,
            follow_meta_refresh=cli_args.follow_meta_refresh,
            follow_js_relocation=cli_args.follow_js_relocation,
            infer_redirection=cli_args.infer_redirection,
            **common_kwargs
        )

        for result in multithreaded_iterator:
            index, row = result.item

            if not result.url:
                write_resolve_output(index, row)
                loading_bar.update()
                continue

            # Updating stats
            update_loading_bar(result)

            # No error
            if result.error is None:

                # Reporting in output
                last = result.stack[-1]

                write_resolve_output(
                    index,
                    row,
                    resolved=last.url,
                    status=last.status,
                    redirects=len(result.stack) - 1,
                    chain='|'.join(step.type for step in result.stack)
                )

            # Handling potential errors
            else:
                error_code = report_error(result.error)

                write_resolve_output(
                    index,
                    row,
                    error=error_code,
                    redirects=(len(result.stack) - 1) if result.stack else None,
                    chain='|'.join(step.type for step in result.stack) if result.stack else None
                )
def facebook_post_stats_action(cli_args):
    enricher = casanova.enricher(
        cli_args.file,
        cli_args.output,
        add=REPORT_HEADERS,
        keep=cli_args.select
    )

    def fetch_facebook_page_stats(url):
        err, response = request(url, cookie='locale=en_US')

        if err:
            return 'http-error', None

        if response.status == 404:
            return 'not-found', None

        if response.status >= 400:
            return 'http-error', None

        html = response.data

        if CAPTCHA in html:
            die([
                'Rate limit reached!',
                'Last url: %s' % url
            ])

        if (
            CURRENT_AVAILABILITY_DISCLAIMER in html or
            AVAILABILITY_DISCLAIMER in html
        ):
            return 'unavailable', None

        if LOGIN_DISCLAIMER in html:
            return 'private-or-unavailable', None

        # TODO: integrate into ural
        bpost_id = url.rsplit('/', 1)[-1].encode()

        # Extracting metadata
        meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

        match = meta_extractor.search(html)

        if match is None:
            return 'extraction-failed', None

        data = json5.loads(match.group(1).decode())
        data = getpath(data, [
            'jsmods', 'pre_display_requires', 0, 3, 1,
            '__bbox', 'result', 'data', 'feedback'
        ])

        if data is None:
            return 'extraction-failed', None

        # TODO: remove, this is here as a test
        # TODO: try to find a post where comments are disabled
        if get_count(data['seen_by_count']):
            print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

        if 'political_figure_data' in data and data['political_figure_data']:
            print_err('Found political_figure_data:')
            print_err(data['political_figure_data'])

        if get_count(data['reaction_count']) != get_count(data['reactors']):
            print_err('Found different reactions/reactors for %s' % url)

        # Extracting data from hidden html
        hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
        match = hidden_html_extractor.search(html)

        if match is not None:
            hidden_html = match.group(1).decode()
            soup = BeautifulSoup(hidden_html, 'lxml')

            # Sometimes fetching a post behaves weirdly
            if soup.select_one('h5 a') is None:
                return 'extraction-failed', None

            data['scraped'] = {}

            timestamp_elem = soup.select_one('[data-utime]')
            timestamp = int(timestamp_elem.get('data-utime'))

            data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
            data['scraped']['timestamp'] = timestamp
            data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

            # TODO: use a context manager
            try:
                data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
            except:
                pass

            try:
                data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
            except:
                pass

            # try:
            #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
            # except:
            #     pass

        return None, data

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching post stats',
        unit='post',
        total=cli_args.total
    )

    for row, post_url in enricher.cells(cli_args.column, with_rows=True):
        loading_bar.update()

        if not post_url or not is_facebook_post_url(post_url):
            enricher.writerow(row, format_err('not-facebook-post'))
            continue

        err, data = fetch_facebook_page_stats(post_url)

        if err:
            enricher.writerow(row, format_err(err))
        else:
            enricher.writerow(row, format(data))

        # Throttling
        sleep_with_entropy(FACEBOOK_WEB_DEFAULT_THROTTLE, 5.0)